commit dcd5581f9574003cc3e1e1cd23bd6797c35559d2
mesa.git: src/vulkan/anv_compiler.cpp
1 /*
2 * Copyright © 2015 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 */
23
24 #include <sys/stat.h>
25 #include <unistd.h>
26 #include <fcntl.h>
27
28 #include "anv_private.h"
29 #include "anv_nir.h"
30
31 #include <brw_context.h>
32 #include <brw_wm.h> /* brw_new_shader_program is here */
33 #include <brw_nir.h>
34
35 #include <brw_vs.h>
36 #include <brw_gs.h>
37 #include <brw_cs.h>
38
39 #include <mesa/main/shaderobj.h>
40 #include <mesa/main/fbobject.h>
41 #include <mesa/main/context.h>
42 #include <mesa/program/program.h>
43 #include <glsl/program.h>
44
45 /* XXX: We need this to keep symbols in nir.h from conflicting with the
46 * generated GEN command packing headers. We need to fix *both* to not
47 * define something as generic as LOAD.
48 */
49 #undef LOAD
50
51 #include <glsl/nir/nir_spirv.h>
52
53 #define SPIR_V_MAGIC_NUMBER 0x07230203
54
55 static void
56 fail_if(int cond, const char *format, ...)
57 {
58 va_list args;
59
60 if (!cond)
61 return;
62
63 va_start(args, format);
64 vfprintf(stderr, format, args);
65 va_end(args);
66
67 exit(1);
68 }
69
70 static VkResult
71 set_binding_table_layout(struct brw_stage_prog_data *prog_data,
72 struct anv_pipeline *pipeline, uint32_t stage)
73 {
74 uint32_t bias, count, k, *map;
75 struct anv_pipeline_layout *layout = pipeline->layout;
76
77 /* Having no layout is valid for shaders that don't bind any resources. */
78 if (pipeline->layout == NULL)
79 return VK_SUCCESS;
80
81 if (stage == VK_SHADER_STAGE_FRAGMENT)
82 bias = MAX_RTS;
83 else
84 bias = 0;
85
86 count = layout->stage[stage].surface_count;
87 prog_data->map_entries =
88 (uint32_t *) malloc(count * sizeof(prog_data->map_entries[0]));
89 if (prog_data->map_entries == NULL)
90 return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
91
92 k = bias;
93 map = prog_data->map_entries;
94 for (uint32_t i = 0; i < layout->num_sets; i++) {
95 prog_data->bind_map[i].index = map;
96 for (uint32_t j = 0; j < layout->set[i].layout->stage[stage].surface_count; j++)
97 *map++ = k++;
98
99 prog_data->bind_map[i].index_count =
100 layout->set[i].layout->stage[stage].surface_count;
101 }
102
103 return VK_SUCCESS;
104 }
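/* Illustrative sketch of the map this builds (hypothetical layout, not from
 * the source): for a fragment stage with two descriptor sets holding 3 and 2
 * surfaces, bias = MAX_RTS and the flattened table becomes
 *
 *    map_entries        = { bias+0, bias+1, bias+2, bias+3, bias+4 }
 *    bind_map[0].index  = &map_entries[0], index_count = 3
 *    bind_map[1].index  = &map_entries[3], index_count = 2
 *
 * i.e. each set's surfaces land in one contiguous run of binding-table slots,
 * offset past the color attachments for fragment shaders and starting at 0
 * for every other stage.
 */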
105
106 static uint32_t
107 upload_kernel(struct anv_pipeline *pipeline, const void *data, size_t size)
108 {
109 struct anv_state state =
110 anv_state_stream_alloc(&pipeline->program_stream, size, 64);
111
112 assert(size < pipeline->program_stream.block_pool->block_size);
113
114 memcpy(state.map, data, size);
115
116 return state.offset;
117 }
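/* The 64 passed as the alignment above matches the hardware expectation that
 * kernel start pointers be 64-byte aligned (our reading of why this constant
 * was chosen; the allocator itself just honors whatever alignment it is
 * given).
 */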
118
119 static void
120 create_params_array(struct anv_pipeline *pipeline,
121 struct gl_shader *shader,
122 struct brw_stage_prog_data *prog_data)
123 {
124 VkShaderStage stage = anv_vk_shader_stage_for_mesa_stage(shader->Stage);
125 unsigned num_params = 0;
126
127 if (shader->num_uniform_components) {
128 /* If the shader uses any push constants at all, we'll just give
129 * them the maximum possible number
130 */
131 num_params += MAX_PUSH_CONSTANTS_SIZE / sizeof(float);
132 }
133
134 if (pipeline->layout && pipeline->layout->stage[stage].has_dynamic_offsets)
135 num_params += MAX_DYNAMIC_BUFFERS;
136
137 if (num_params == 0)
138 return;
139
140 prog_data->param = (const gl_constant_value **)
141 anv_device_alloc(pipeline->device,
142 num_params * sizeof(gl_constant_value *),
143 8, VK_SYSTEM_ALLOC_TYPE_INTERNAL_SHADER);
144
145 /* We now set the param values to be offsets into an
146 * anv_push_constants structure. Since the compiler doesn't
147 * actually dereference any of the gl_constant_value pointers in the
148 * params array, it doesn't really matter what we put here.
149 */
150 struct anv_push_constants *null_data = NULL;
151 for (unsigned i = 0; i < num_params; i++)
152 prog_data->param[i] =
153 (const gl_constant_value *)&null_data->client_data[i * sizeof(float)];
154 }
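/* Worked example (assuming MAX_PUSH_CONSTANTS_SIZE == 128, so 32 float
 * params): the loop above makes param[0] the "address" of
 * ((struct anv_push_constants *)NULL)->client_data[0], param[1] that of
 * client_data[4], and so on.  The pointers are never dereferenced; they are
 * simply a compact way of smuggling push-constant byte offsets through the
 * brw param array.
 */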
155
156 static void
157 brw_vs_populate_key(struct brw_context *brw,
158 struct brw_vertex_program *vp,
159 struct brw_vs_prog_key *key)
160 {
161 struct gl_context *ctx = &brw->ctx;
162 /* BRW_NEW_VERTEX_PROGRAM */
163 struct gl_program *prog = (struct gl_program *) vp;
164
165 memset(key, 0, sizeof(*key));
166
167 /* Just upload the program verbatim for now. Always send it all
168 * the inputs it asks for, whether they are varying or not.
169 */
170 key->base.program_string_id = vp->id;
171 brw_setup_vue_key_clip_info(brw, &key->base,
172 vp->program.Base.UsesClipDistanceOut);
173
174 /* _NEW_POLYGON */
175 if (brw->gen < 6) {
176 key->copy_edgeflag = (ctx->Polygon.FrontMode != GL_FILL ||
177 ctx->Polygon.BackMode != GL_FILL);
178 }
179
180 if (prog->OutputsWritten & (VARYING_BIT_COL0 | VARYING_BIT_COL1 |
181 VARYING_BIT_BFC0 | VARYING_BIT_BFC1)) {
182 /* _NEW_LIGHT | _NEW_BUFFERS */
183 key->clamp_vertex_color = ctx->Light._ClampVertexColor;
184 }
185
186 /* _NEW_POINT */
187 if (brw->gen < 6 && ctx->Point.PointSprite) {
188 for (int i = 0; i < 8; i++) {
189 if (ctx->Point.CoordReplace[i])
190 key->point_coord_replace |= (1 << i);
191 }
192 }
193
194 /* _NEW_TEXTURE */
195 brw_populate_sampler_prog_key_data(ctx, prog, brw->vs.base.sampler_count,
196 &key->base.tex);
197 }
198
199 static bool
200 really_do_vs_prog(struct brw_context *brw,
201 struct gl_shader_program *prog,
202 struct brw_vertex_program *vp,
203 struct brw_vs_prog_key *key, struct anv_pipeline *pipeline)
204 {
205 GLuint program_size;
206 const GLuint *program;
207 struct brw_vs_prog_data *prog_data = &pipeline->vs_prog_data;
208 void *mem_ctx;
209 struct gl_shader *vs = NULL;
210
211 if (prog)
212 vs = prog->_LinkedShaders[MESA_SHADER_VERTEX];
213
214 memset(prog_data, 0, sizeof(*prog_data));
215
216 mem_ctx = ralloc_context(NULL);
217
218 create_params_array(pipeline, vs, &prog_data->base.base);
219 anv_nir_apply_dynamic_offsets(pipeline, vs->Program->nir,
220 &prog_data->base.base);
221
222 GLbitfield64 outputs_written = vp->program.Base.OutputsWritten;
223 prog_data->inputs_read = vp->program.Base.InputsRead;
224
225 if (key->copy_edgeflag) {
226 outputs_written |= BITFIELD64_BIT(VARYING_SLOT_EDGE);
227 prog_data->inputs_read |= VERT_BIT_EDGEFLAG;
228 }
229
230 if (brw->gen < 6) {
231 /* Put dummy slots into the VUE for the SF to put the replaced
232 * point sprite coords in. We shouldn't need these dummy slots,
233 * which take up precious URB space, but it would mean that the SF
234 * doesn't get nice aligned pairs of input coords into output
235 * coords, which would be a pain to handle.
236 */
237 for (int i = 0; i < 8; i++) {
238 if (key->point_coord_replace & (1 << i))
239 outputs_written |= BITFIELD64_BIT(VARYING_SLOT_TEX0 + i);
240 }
241
242 /* if back colors are written, allocate slots for front colors too */
243 if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_BFC0))
244 outputs_written |= BITFIELD64_BIT(VARYING_SLOT_COL0);
245 if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_BFC1))
246 outputs_written |= BITFIELD64_BIT(VARYING_SLOT_COL1);
247 }
248
249 /* In order for legacy clipping to work, we need to populate the clip
250 * distance varying slots whenever clipping is enabled, even if the vertex
251 * shader doesn't write to gl_ClipDistance.
252 */
253 if (key->base.userclip_active) {
254 outputs_written |= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0);
255 outputs_written |= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1);
256 }
257
258 brw_compute_vue_map(brw->intelScreen->devinfo,
259 &prog_data->base.vue_map, outputs_written);
260
261 set_binding_table_layout(&prog_data->base.base, pipeline,
262 VK_SHADER_STAGE_VERTEX);
263
264 /* Emit GEN4 code.
265 */
266 program = brw_vs_emit(brw, mem_ctx, key, prog_data, &vp->program,
267 prog, &program_size);
268 if (program == NULL) {
269 ralloc_free(mem_ctx);
270 return false;
271 }
272
273 const uint32_t offset = upload_kernel(pipeline, program, program_size);
274 if (prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8) {
275 pipeline->vs_simd8 = offset;
276 pipeline->vs_vec4 = NO_KERNEL;
277 } else {
278 pipeline->vs_simd8 = NO_KERNEL;
279 pipeline->vs_vec4 = offset;
280 }
281
282 ralloc_free(mem_ctx);
283
284 return true;
285 }
286
287 void brw_wm_populate_key(struct brw_context *brw,
288 struct brw_fragment_program *fp,
289 struct brw_wm_prog_key *key)
290 {
291 struct gl_context *ctx = &brw->ctx;
292 struct gl_program *prog = (struct gl_program *) brw->fragment_program;
293 GLuint lookup = 0;
294 GLuint line_aa;
295 bool program_uses_dfdy = fp->program.UsesDFdy;
296 struct gl_framebuffer draw_buffer;
297 bool multisample_fbo;
298
299 memset(key, 0, sizeof(*key));
300
301 for (int i = 0; i < MAX_SAMPLERS; i++) {
302 /* Assume color sampler, no swizzling. */
303 key->tex.swizzles[i] = SWIZZLE_XYZW;
304 }
305
306 /* A non-zero framebuffer name indicates that the framebuffer was created by
307 * the user rather than the window system. */
308 draw_buffer.Name = 1;
309 draw_buffer.Visual.samples = 1;
310 draw_buffer._NumColorDrawBuffers = 1;
312 draw_buffer.Width = 400;
313 draw_buffer.Height = 400;
314 ctx->DrawBuffer = &draw_buffer;
315
316 multisample_fbo = ctx->DrawBuffer->Visual.samples > 1;
317
318 /* Build the index for table lookup
319 */
320 if (brw->gen < 6) {
321 /* _NEW_COLOR */
322 if (fp->program.UsesKill || ctx->Color.AlphaEnabled)
323 lookup |= IZ_PS_KILL_ALPHATEST_BIT;
324
325 if (fp->program.Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
326 lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
327
328 /* _NEW_DEPTH */
329 if (ctx->Depth.Test)
330 lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
331
332 if (ctx->Depth.Test && ctx->Depth.Mask) /* ?? */
333 lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
334
335 /* _NEW_STENCIL | _NEW_BUFFERS */
336 if (ctx->Stencil._Enabled) {
337 lookup |= IZ_STENCIL_TEST_ENABLE_BIT;
338
339 if (ctx->Stencil.WriteMask[0] ||
340 ctx->Stencil.WriteMask[ctx->Stencil._BackFace])
341 lookup |= IZ_STENCIL_WRITE_ENABLE_BIT;
342 }
343 key->iz_lookup = lookup;
344 }
345
346 line_aa = AA_NEVER;
347
348 /* _NEW_LINE, _NEW_POLYGON, BRW_NEW_REDUCED_PRIMITIVE */
349 if (ctx->Line.SmoothFlag) {
350 if (brw->reduced_primitive == GL_LINES) {
351 line_aa = AA_ALWAYS;
352 }
353 else if (brw->reduced_primitive == GL_TRIANGLES) {
354 if (ctx->Polygon.FrontMode == GL_LINE) {
355 line_aa = AA_SOMETIMES;
356
357 if (ctx->Polygon.BackMode == GL_LINE ||
358 (ctx->Polygon.CullFlag &&
359 ctx->Polygon.CullFaceMode == GL_BACK))
360 line_aa = AA_ALWAYS;
361 }
362 else if (ctx->Polygon.BackMode == GL_LINE) {
363 line_aa = AA_SOMETIMES;
364
365 if ((ctx->Polygon.CullFlag &&
366 ctx->Polygon.CullFaceMode == GL_FRONT))
367 line_aa = AA_ALWAYS;
368 }
369 }
370 }
371
372 key->line_aa = line_aa;
373
374 /* _NEW_HINT */
375 key->high_quality_derivatives =
376 ctx->Hint.FragmentShaderDerivative == GL_NICEST;
377
378 if (brw->gen < 6)
379 key->stats_wm = brw->stats_wm;
380
381 /* _NEW_LIGHT */
382 key->flat_shade = (ctx->Light.ShadeModel == GL_FLAT);
383
384 /* _NEW_FRAG_CLAMP | _NEW_BUFFERS */
385 key->clamp_fragment_color = ctx->Color._ClampFragmentColor;
386
387 /* _NEW_TEXTURE */
388 brw_populate_sampler_prog_key_data(ctx, prog, brw->wm.base.sampler_count,
389 &key->tex);
390
391 /* _NEW_BUFFERS */
392 /*
393 * Include the draw buffer origin and height so that we can calculate
394 * fragment position values relative to the bottom left of the drawable,
395 * from the incoming screen origin relative position we get as part of our
396 * payload.
397 *
398 * This is only needed for the WM_WPOSXY opcode when the fragment program
399 * uses the gl_FragCoord input.
400 *
401 * We could avoid recompiling by including this as a constant referenced by
402 * our program, but if we were to do that it would also be nice to handle
403 * getting that constant updated at batchbuffer submit time (when we
404 * hold the lock and know where the buffer really is) rather than at emit
405 * time when we don't hold the lock and are just guessing. We could also
406 * just avoid using this as key data if the program doesn't use
407 * fragment.position.
408 *
409 * For DRI2 the origin_x/y will always be (0,0) but we still need the
410 * drawable height in order to invert the Y axis.
411 */
412 if (fp->program.Base.InputsRead & VARYING_BIT_POS) {
413 key->drawable_height = ctx->DrawBuffer->Height;
414 }
415
416 if ((fp->program.Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
417 key->render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
418 }
419
420 /* _NEW_BUFFERS */
421 key->nr_color_regions = ctx->DrawBuffer->_NumColorDrawBuffers;
422
423 /* _NEW_MULTISAMPLE, _NEW_COLOR, _NEW_BUFFERS */
424 key->replicate_alpha = ctx->DrawBuffer->_NumColorDrawBuffers > 1 &&
425 (ctx->Multisample.SampleAlphaToCoverage || ctx->Color.AlphaEnabled);
426
427 /* _NEW_BUFFERS _NEW_MULTISAMPLE */
428 /* Ignore sample qualifier while computing this flag. */
429 key->persample_shading =
430 _mesa_get_min_invocations_per_fragment(ctx, &fp->program, true) > 1;
431 if (key->persample_shading)
432 key->persample_2x = ctx->DrawBuffer->Visual.samples == 2;
433
434 key->compute_pos_offset =
435 _mesa_get_min_invocations_per_fragment(ctx, &fp->program, false) > 1 &&
436 fp->program.Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_POS;
437
438 key->compute_sample_id =
439 multisample_fbo &&
440 ctx->Multisample.Enabled &&
441 (fp->program.Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_ID);
442
443 /* BRW_NEW_VUE_MAP_GEOM_OUT */
444 if (brw->gen < 6 || _mesa_bitcount_64(fp->program.Base.InputsRead &
445 BRW_FS_VARYING_INPUT_MASK) > 16)
446 key->input_slots_valid = brw->vue_map_geom_out.slots_valid;
447
448
449 /* _NEW_COLOR | _NEW_BUFFERS */
450 /* Pre-gen6, the hardware alpha test used each render target's own alpha
451 * for the test, rather than render target 0's alpha as GL requires.
452 * Fix that by building the alpha test into the shader; the fixed-function
453 * alpha test is then left disabled.
454 */
455 if (brw->gen < 6 && ctx->DrawBuffer->_NumColorDrawBuffers > 1 && ctx->Color.AlphaEnabled) {
456 key->alpha_test_func = ctx->Color.AlphaFunc;
457 key->alpha_test_ref = ctx->Color.AlphaRef;
458 }
459
460 /* The unique fragment program ID */
461 key->program_string_id = fp->id;
462
463 ctx->DrawBuffer = NULL;
464 }
465
466 static uint8_t
467 computed_depth_mode(struct gl_fragment_program *fp)
468 {
469 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
470 switch (fp->FragDepthLayout) {
471 case FRAG_DEPTH_LAYOUT_NONE:
472 case FRAG_DEPTH_LAYOUT_ANY:
473 return BRW_PSCDEPTH_ON;
474 case FRAG_DEPTH_LAYOUT_GREATER:
475 return BRW_PSCDEPTH_ON_GE;
476 case FRAG_DEPTH_LAYOUT_LESS:
477 return BRW_PSCDEPTH_ON_LE;
478 case FRAG_DEPTH_LAYOUT_UNCHANGED:
479 return BRW_PSCDEPTH_OFF;
480 }
481 }
482 return BRW_PSCDEPTH_OFF;
483 }
484
485 static bool
486 really_do_wm_prog(struct brw_context *brw,
487 struct gl_shader_program *prog,
488 struct brw_fragment_program *fp,
489 struct brw_wm_prog_key *key, struct anv_pipeline *pipeline)
490 {
491 void *mem_ctx = ralloc_context(NULL);
492 struct brw_wm_prog_data *prog_data = &pipeline->wm_prog_data;
493 struct gl_shader *fs = NULL;
494 unsigned int program_size;
495 const uint32_t *program;
496
497 if (prog)
498 fs = prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
499
500 memset(prog_data, 0, sizeof(*prog_data));
501
502 /* key->alpha_test_func means simulating alpha testing via discards,
503 * so the shader definitely kills pixels.
504 */
505 prog_data->uses_kill = fp->program.UsesKill || key->alpha_test_func;
506
507 prog_data->computed_depth_mode = computed_depth_mode(&fp->program);
508
509 create_params_array(pipeline, fs, &prog_data->base);
510 anv_nir_apply_dynamic_offsets(pipeline, fs->Program->nir, &prog_data->base);
511
512 prog_data->barycentric_interp_modes =
513 brw_compute_barycentric_interp_modes(brw, key->flat_shade,
514 key->persample_shading,
515 &fp->program);
516
517 set_binding_table_layout(&prog_data->base, pipeline,
518 VK_SHADER_STAGE_FRAGMENT);
519 /* This needs to come after shader time and pull constant entries, but we
520 * don't have those set up now, so just put it after the layout entries.
521 */
522 prog_data->binding_table.render_target_start = 0;
523
524 program = brw_wm_fs_emit(brw, mem_ctx, key, prog_data,
525 &fp->program, prog, &program_size);
526 if (program == NULL) {
527 ralloc_free(mem_ctx);
528 return false;
529 }
530
531 uint32_t offset = upload_kernel(pipeline, program, program_size);
532
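/* Assumption about the brw backend: brw_wm_fs_emit can emit a SIMD8 and a
 * SIMD16 variant back to back in a single blob, with prog_offset_16 holding
 * the byte offset of the SIMD16 code.  The two kernel pointers below
 * therefore share the single upload done above and differ only by that
 * offset.
 */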
533 if (prog_data->no_8)
534 pipeline->ps_simd8 = NO_KERNEL;
535 else
536 pipeline->ps_simd8 = offset;
537
538 if (prog_data->no_8 || prog_data->prog_offset_16) {
539 pipeline->ps_simd16 = offset + prog_data->prog_offset_16;
540 } else {
541 pipeline->ps_simd16 = NO_KERNEL;
542 }
543
544 ralloc_free(mem_ctx);
545
546 return true;
547 }
548
549 static void
550 brw_gs_populate_key(struct brw_context *brw,
551 struct anv_pipeline *pipeline,
552 struct brw_geometry_program *gp,
553 struct brw_gs_prog_key *key)
554 {
555 struct gl_context *ctx = &brw->ctx;
556 struct brw_stage_state *stage_state = &brw->gs.base;
557 struct gl_program *prog = &gp->program.Base;
558
559 memset(key, 0, sizeof(*key));
560
561 key->base.program_string_id = gp->id;
562 brw_setup_vue_key_clip_info(brw, &key->base,
563 gp->program.Base.UsesClipDistanceOut);
564
565 /* _NEW_TEXTURE */
566 brw_populate_sampler_prog_key_data(ctx, prog, stage_state->sampler_count,
567 &key->base.tex);
568
569 struct brw_vs_prog_data *prog_data = &pipeline->vs_prog_data;
570
571 /* BRW_NEW_VUE_MAP_VS */
572 key->input_varyings = prog_data->base.vue_map.slots_valid;
573 }
574
575 static bool
576 really_do_gs_prog(struct brw_context *brw,
577 struct gl_shader_program *prog,
578 struct brw_geometry_program *gp,
579 struct brw_gs_prog_key *key, struct anv_pipeline *pipeline)
580 {
581 struct brw_gs_compile_output output;
582
583 /* FIXME: We pass the bind map to the compile in the output struct. Need
584 * something better. */
585 set_binding_table_layout(&output.prog_data.base.base,
586 pipeline, VK_SHADER_STAGE_GEOMETRY);
587
588 brw_compile_gs_prog(brw, prog, gp, key, &output);
589
590 pipeline->gs_vec4 = upload_kernel(pipeline, output.program, output.program_size);
591 pipeline->gs_vertex_count = gp->program.VerticesIn;
592
593 ralloc_free(output.mem_ctx);
594
595 return true;
596 }
597
598 static bool
599 brw_codegen_cs_prog(struct brw_context *brw,
600 struct gl_shader_program *prog,
601 struct brw_compute_program *cp,
602 struct brw_cs_prog_key *key, struct anv_pipeline *pipeline)
603 {
604 const GLuint *program;
605 void *mem_ctx = ralloc_context(NULL);
606 GLuint program_size;
607 struct brw_cs_prog_data *prog_data = &pipeline->cs_prog_data;
608
609 struct gl_shader *cs = prog->_LinkedShaders[MESA_SHADER_COMPUTE];
610 assert(cs);
611
612 memset(prog_data, 0, sizeof(*prog_data));
613
614 set_binding_table_layout(&prog_data->base, pipeline, VK_SHADER_STAGE_COMPUTE);
615
616 create_params_array(pipeline, cs, &prog_data->base);
617 anv_nir_apply_dynamic_offsets(pipeline, cs->Program->nir, &prog_data->base);
618
619 program = brw_cs_emit(brw, mem_ctx, key, prog_data,
620 &cp->program, prog, &program_size);
621 if (program == NULL) {
622 ralloc_free(mem_ctx);
623 return false;
624 }
625
626 if (unlikely(INTEL_DEBUG & DEBUG_CS))
627 fprintf(stderr, "\n");
628
629 pipeline->cs_simd = upload_kernel(pipeline, program, program_size);
630
631 ralloc_free(mem_ctx);
632
633 return true;
634 }
635
636 static void
637 brw_cs_populate_key(struct brw_context *brw,
638 struct brw_compute_program *bcp, struct brw_cs_prog_key *key)
639 {
640 memset(key, 0, sizeof(*key));
641
642 /* The unique compute program ID */
643 key->program_string_id = bcp->id;
644 }
645
646 struct anv_compiler {
647 struct anv_device *device;
648 struct intel_screen *screen;
649 struct brw_context *brw;
650 struct gl_pipeline_object pipeline;
651 };
652
653 extern "C" {
654
655 struct anv_compiler *
656 anv_compiler_create(struct anv_device *device)
657 {
658 const struct brw_device_info *devinfo = &device->info;
659 struct anv_compiler *compiler;
660 struct gl_context *ctx;
661
662 compiler = rzalloc(NULL, struct anv_compiler);
663 if (compiler == NULL)
664 return NULL;
665
666 compiler->screen = rzalloc(compiler, struct intel_screen);
667 if (compiler->screen == NULL)
668 goto fail;
669
670 compiler->brw = rzalloc(compiler, struct brw_context);
671 if (compiler->brw == NULL)
672 goto fail;
673
674 compiler->device = device;
675
676 compiler->brw->gen = devinfo->gen;
677 compiler->brw->is_g4x = devinfo->is_g4x;
678 compiler->brw->is_baytrail = devinfo->is_baytrail;
679 compiler->brw->is_haswell = devinfo->is_haswell;
680 compiler->brw->is_cherryview = devinfo->is_cherryview;
681
682 /* We need this at least for CS, which will check brw->max_cs_threads
683 * against the work group size. */
684 compiler->brw->max_vs_threads = devinfo->max_vs_threads;
685 compiler->brw->max_hs_threads = devinfo->max_hs_threads;
686 compiler->brw->max_ds_threads = devinfo->max_ds_threads;
687 compiler->brw->max_gs_threads = devinfo->max_gs_threads;
688 compiler->brw->max_wm_threads = devinfo->max_wm_threads;
689 compiler->brw->max_cs_threads = devinfo->max_cs_threads;
690 compiler->brw->urb.size = devinfo->urb.size;
691 compiler->brw->urb.min_vs_entries = devinfo->urb.min_vs_entries;
692 compiler->brw->urb.max_vs_entries = devinfo->urb.max_vs_entries;
693 compiler->brw->urb.max_hs_entries = devinfo->urb.max_hs_entries;
694 compiler->brw->urb.max_ds_entries = devinfo->urb.max_ds_entries;
695 compiler->brw->urb.max_gs_entries = devinfo->urb.max_gs_entries;
696
697 compiler->brw->intelScreen = compiler->screen;
698 compiler->screen->devinfo = &device->info;
699
700 brw_process_intel_debug_variable(compiler->screen);
701
702 compiler->screen->compiler = brw_compiler_create(compiler, &device->info);
703
704 ctx = &compiler->brw->ctx;
705 _mesa_init_shader_object_functions(&ctx->Driver);
706
707 /* brw_select_clip_planes() needs this for bogus reasons. */
708 ctx->_Shader = &compiler->pipeline;
709
710 return compiler;
711
712 fail:
713 ralloc_free(compiler);
714 return NULL;
715 }
716
717 void
718 anv_compiler_destroy(struct anv_compiler *compiler)
719 {
720 _mesa_free_errors_data(&compiler->brw->ctx);
721 ralloc_free(compiler);
722 }
723
724 /* From gen7_urb.c */
725
726 /* FIXME: Add to struct intel_device_info */
727
728 static const int gen8_push_size = 32 * 1024;
729
730 static void
731 gen7_compute_urb_partition(struct anv_pipeline *pipeline)
732 {
733 const struct brw_device_info *devinfo = &pipeline->device->info;
734 bool vs_present = pipeline->vs_simd8 != NO_KERNEL;
735 unsigned vs_size = vs_present ? pipeline->vs_prog_data.base.urb_entry_size : 1;
736 unsigned vs_entry_size_bytes = vs_size * 64;
737 bool gs_present = pipeline->gs_vec4 != NO_KERNEL;
738 unsigned gs_size = gs_present ? pipeline->gs_prog_data.base.urb_entry_size : 1;
739 unsigned gs_entry_size_bytes = gs_size * 64;
740
741 /* From p35 of the Ivy Bridge PRM (section 1.7.1: 3DSTATE_URB_GS):
742 *
743 * VS Number of URB Entries must be divisible by 8 if the VS URB Entry
744 * Allocation Size is less than 9 512-bit URB entries.
745 *
746 * Similar text exists for GS.
747 */
748 unsigned vs_granularity = (vs_size < 9) ? 8 : 1;
749 unsigned gs_granularity = (gs_size < 9) ? 8 : 1;
750
751 /* URB allocations must be done in 8k chunks. */
752 unsigned chunk_size_bytes = 8192;
753
754 /* Determine the size of the URB in chunks. */
755 unsigned urb_chunks = devinfo->urb.size * 1024 / chunk_size_bytes;
756
757 /* Reserve space for push constants */
758 unsigned push_constant_bytes = gen8_push_size;
759 unsigned push_constant_chunks =
760 push_constant_bytes / chunk_size_bytes;
761
762 /* Initially, assign each stage the minimum amount of URB space it needs,
763 * and make a note of how much additional space it "wants" (the amount of
764 * additional space it could actually make use of).
765 */
766
767 /* VS has a lower limit on the number of URB entries */
768 unsigned vs_chunks =
769 ALIGN(devinfo->urb.min_vs_entries * vs_entry_size_bytes,
770 chunk_size_bytes) / chunk_size_bytes;
771 unsigned vs_wants =
772 ALIGN(devinfo->urb.max_vs_entries * vs_entry_size_bytes,
773 chunk_size_bytes) / chunk_size_bytes - vs_chunks;
774
775 unsigned gs_chunks = 0;
776 unsigned gs_wants = 0;
777 if (gs_present) {
778 /* There are two constraints on the minimum amount of URB space we can
779 * allocate:
780 *
781 * (1) We need room for at least 2 URB entries, since we always operate
782 * the GS in DUAL_OBJECT mode.
783 *
784 * (2) We can't allocate fewer than gs_granularity entries.
785 */
786 gs_chunks = ALIGN(MAX2(gs_granularity, 2) * gs_entry_size_bytes,
787 chunk_size_bytes) / chunk_size_bytes;
788 gs_wants =
789 ALIGN(devinfo->urb.max_gs_entries * gs_entry_size_bytes,
790 chunk_size_bytes) / chunk_size_bytes - gs_chunks;
791 }
792
793 /* There should always be enough URB space to satisfy the minimum
794 * requirements of each stage.
795 */
796 unsigned total_needs = push_constant_chunks + vs_chunks + gs_chunks;
797 assert(total_needs <= urb_chunks);
798
799 /* Mete out remaining space (if any) in proportion to "wants". */
800 unsigned total_wants = vs_wants + gs_wants;
801 unsigned remaining_space = urb_chunks - total_needs;
802 if (remaining_space > total_wants)
803 remaining_space = total_wants;
804 if (remaining_space > 0) {
805 unsigned vs_additional = (unsigned)
806 round(vs_wants * (((double) remaining_space) / total_wants));
807 vs_chunks += vs_additional;
808 remaining_space -= vs_additional;
809 gs_chunks += remaining_space;
810 }
811
812 /* Sanity check that we haven't over-allocated. */
813 assert(push_constant_chunks + vs_chunks + gs_chunks <= urb_chunks);
814
815 /* Finally, compute the number of entries that can fit in the space
816 * allocated to each stage.
817 */
818 unsigned nr_vs_entries = vs_chunks * chunk_size_bytes / vs_entry_size_bytes;
819 unsigned nr_gs_entries = gs_chunks * chunk_size_bytes / gs_entry_size_bytes;
820
821 /* Since we rounded up when computing *_wants, this may be slightly more
822 * than the maximum allowed amount, so correct for that.
823 */
824 nr_vs_entries = MIN2(nr_vs_entries, devinfo->urb.max_vs_entries);
825 nr_gs_entries = MIN2(nr_gs_entries, devinfo->urb.max_gs_entries);
826
827 /* Ensure that we program a multiple of the granularity. */
828 nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, vs_granularity);
829 nr_gs_entries = ROUND_DOWN_TO(nr_gs_entries, gs_granularity);
830
831 /* Finally, sanity check to make sure we have at least the minimum number
832 * of entries needed for each stage.
833 */
834 assert(nr_vs_entries >= devinfo->urb.min_vs_entries);
835 if (gs_present)
836 assert(nr_gs_entries >= 2);
837
838 /* Lay out the URB in the following order:
839 * - push constants
840 * - VS
841 * - GS
842 */
843 pipeline->urb.vs_start = push_constant_chunks;
844 pipeline->urb.vs_size = vs_size;
845 pipeline->urb.nr_vs_entries = nr_vs_entries;
846
847 pipeline->urb.gs_start = push_constant_chunks + vs_chunks;
848 pipeline->urb.gs_size = gs_size;
849 pipeline->urb.nr_gs_entries = nr_gs_entries;
850 }
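/* Worked example with made-up numbers (not taken from any real device):
 * assume urb.size = 192 KB, vs_size = 2 (128 bytes per VS entry),
 * min_vs_entries = 64, max_vs_entries = 640 and no GS.  Then:
 *
 *    urb_chunks           = 192 * 1024 / 8192                 = 24
 *    push_constant_chunks = 32768 / 8192                      = 4
 *    vs_chunks (minimum)  = ALIGN(64 * 128, 8192) / 8192      = 1
 *    vs_wants             = ALIGN(640 * 128, 8192) / 8192 - 1 = 9
 *
 * The 19 leftover chunks get clamped to the 9 the VS wants, giving
 * vs_chunks = 10 and nr_vs_entries = 10 * 8192 / 128 = 640, which is within
 * max_vs_entries and already a multiple of the granularity of 8.
 */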
851
852 static const struct {
853 uint32_t token;
854 gl_shader_stage stage;
855 const char *name;
856 } stage_info[] = {
857 { GL_VERTEX_SHADER, MESA_SHADER_VERTEX, "vertex" },
858 { GL_TESS_CONTROL_SHADER, (gl_shader_stage)-1, "tess control" },
859 { GL_TESS_EVALUATION_SHADER, (gl_shader_stage)-1, "tess evaluation" },
860 { GL_GEOMETRY_SHADER, MESA_SHADER_GEOMETRY, "geometry" },
861 { GL_FRAGMENT_SHADER, MESA_SHADER_FRAGMENT, "fragment" },
862 { GL_COMPUTE_SHADER, MESA_SHADER_COMPUTE, "compute" },
863 };
864
865 struct spirv_header {
866 uint32_t magic;
867 uint32_t version;
868 uint32_t gen_magic;
869 };
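/* For reference: the SPIR-V binary header is five words long: magic number
 * (0x07230203), version, generator magic, id bound and a reserved schema
 * word.  Only the first three are mirrored in the struct above, and only the
 * magic is actually checked (against SPIR_V_MAGIC_NUMBER) below.
 */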
870
871 static void
872 setup_nir_io(struct gl_shader *mesa_shader,
873 nir_shader *shader)
874 {
875 struct gl_program *prog = mesa_shader->Program;
876 foreach_list_typed(nir_variable, var, node, &shader->inputs) {
877 prog->InputsRead |= BITFIELD64_BIT(var->data.location);
878 if (shader->stage == MESA_SHADER_FRAGMENT) {
879 struct gl_fragment_program *fprog = (struct gl_fragment_program *)prog;
880
881 fprog->InterpQualifier[var->data.location] =
882 (glsl_interp_qualifier)var->data.interpolation;
883 if (var->data.centroid)
884 fprog->IsCentroid |= BITFIELD64_BIT(var->data.location);
885 if (var->data.sample)
886 fprog->IsSample |= BITFIELD64_BIT(var->data.location);
887 }
888 }
889
890 foreach_list_typed(nir_variable, var, node, &shader->outputs) {
891 prog->OutputsWritten |= BITFIELD64_BIT(var->data.location);
892 }
893
894 mesa_shader->num_uniform_components = shader->num_uniforms;
895 }
896
897 static void
898 anv_compile_shader_spirv(struct anv_compiler *compiler,
899 struct gl_shader_program *program,
900 struct anv_pipeline *pipeline, uint32_t stage)
901 {
902 struct brw_context *brw = compiler->brw;
903 struct anv_shader *shader = pipeline->shaders[stage];
904 struct gl_shader *mesa_shader;
905 int name = 0;
906 uint32_t *spirv;
907
908 mesa_shader = brw_new_shader(&brw->ctx, name, stage_info[stage].token);
909 fail_if(mesa_shader == NULL,
910 "failed to create %s shader\n", stage_info[stage].name);
911
912 #define CREATE_PROGRAM(stage) \
913 _mesa_init_##stage##_program(&brw->ctx, &ralloc(mesa_shader, struct brw_##stage##_program)->program, 0, 0)
914
915 bool is_scalar;
916 struct gl_program *prog;
917 switch (stage) {
918 case VK_SHADER_STAGE_VERTEX:
919 prog = CREATE_PROGRAM(vertex);
920 is_scalar = compiler->screen->compiler->scalar_vs;
921 break;
922 case VK_SHADER_STAGE_GEOMETRY:
923 prog = CREATE_PROGRAM(geometry);
924 is_scalar = false;
925 break;
926 case VK_SHADER_STAGE_FRAGMENT:
927 prog = CREATE_PROGRAM(fragment);
928 is_scalar = true;
929 break;
930 case VK_SHADER_STAGE_COMPUTE:
931 prog = CREATE_PROGRAM(compute);
932 is_scalar = true;
933 break;
934 default:
935 unreachable("Unsupported shader stage");
936 }
937 _mesa_reference_program(&brw->ctx, &mesa_shader->Program, prog);
938
939 mesa_shader->Program->Parameters =
940 rzalloc(mesa_shader, struct gl_program_parameter_list);
941
942 mesa_shader->Type = stage_info[stage].token;
943 mesa_shader->Stage = stage_info[stage].stage;
944
945 struct gl_shader_compiler_options *glsl_options =
946 &compiler->screen->compiler->glsl_compiler_options[stage_info[stage].stage];
947
948 spirv = (uint32_t *) shader->module->data;
949 assert(spirv[0] == SPIR_V_MAGIC_NUMBER);
950 assert(shader->module->size % 4 == 0);
951
952 mesa_shader->Program->nir =
953    spirv_to_nir(spirv, shader->module->size / 4,
954                 stage_info[stage].stage, glsl_options->NirOptions);
955 fail_if(mesa_shader->Program->nir == NULL,
956         "failed to translate SPIR-V to NIR\n");
957 nir_validate_shader(mesa_shader->Program->nir);
958
959 brw_process_nir(mesa_shader->Program->nir,
960                 compiler->screen->devinfo,
961                 NULL, mesa_shader->Stage, is_scalar);
962
963 setup_nir_io(mesa_shader, mesa_shader->Program->nir);
964
965
966 _mesa_reference_shader(&brw->ctx, &program->Shaders[program->NumShaders],
967 mesa_shader);
968 program->NumShaders++;
969 }
970
971 static void
972 add_compiled_stage(struct anv_pipeline *pipeline, uint32_t stage,
973 struct brw_stage_prog_data *prog_data)
974 {
975 struct brw_device_info *devinfo = &pipeline->device->info;
976 uint32_t max_threads[] = {
977 [VK_SHADER_STAGE_VERTEX] = devinfo->max_vs_threads,
978 [VK_SHADER_STAGE_TESS_CONTROL] = 0,
979 [VK_SHADER_STAGE_TESS_EVALUATION] = 0,
980 [VK_SHADER_STAGE_GEOMETRY] = devinfo->max_gs_threads,
981 [VK_SHADER_STAGE_FRAGMENT] = devinfo->max_wm_threads,
982 [VK_SHADER_STAGE_COMPUTE] = devinfo->max_cs_threads,
983 };
984
985 pipeline->prog_data[stage] = prog_data;
986 pipeline->active_stages |= 1 << stage;
987 pipeline->scratch_start[stage] = pipeline->total_scratch;
988 pipeline->total_scratch =
989 align_u32(pipeline->total_scratch, 1024) +
990 prog_data->total_scratch * max_threads[stage];
991 }
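/* Scratch accounting sketch (hypothetical numbers): if the VS reports
 * total_scratch = 2048 bytes per thread and max_vs_threads were 280, the
 * vertex stage gets scratch_start = 0 and pipeline->total_scratch becomes
 * align(0, 1024) + 2048 * 280 = 573440 bytes.  The next compiled stage
 * records that running total as its scratch_start, and the total is rounded
 * up to 1 KB before that stage's own per-thread slice is added.
 */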
992
993 int
994 anv_compiler_run(struct anv_compiler *compiler, struct anv_pipeline *pipeline)
995 {
996 struct gl_shader_program *program;
997 int name = 0;
998 struct brw_context *brw = compiler->brw;
999
1000 pipeline->writes_point_size = false;
1001
1002 /* When we free the pipeline, we detect stages based on the NULL status
1003 * of various prog_data pointers. Make them NULL by default.
1004 */
1005 memset(pipeline->prog_data, 0, sizeof(pipeline->prog_data));
1006 memset(pipeline->scratch_start, 0, sizeof(pipeline->scratch_start));
1007
1008 brw->use_rep_send = pipeline->use_repclear;
1009 brw->no_simd8 = pipeline->use_repclear;
1010
1011 program = brw->ctx.Driver.NewShaderProgram(name);
1012 fail_if(program == NULL, "failed to create program\n");
1013 program->Shaders = (struct gl_shader **)
1014    calloc(VK_SHADER_STAGE_NUM, sizeof(struct gl_shader *));
1015 fail_if(program->Shaders == NULL, "failed to allocate shader array\n");
1016
1017 for (unsigned i = 0; i < VK_SHADER_STAGE_NUM; i++) {
1018 if (pipeline->shaders[i])
1019 anv_compile_shader_spirv(compiler, program, pipeline, i);
1020 }
1021
1022 for (unsigned i = 0; i < program->NumShaders; i++) {
1023 struct gl_shader *shader = program->Shaders[i];
1024 program->_LinkedShaders[shader->Stage] = shader;
1025 }
1026
1027 bool success;
1028 pipeline->active_stages = 0;
1029 pipeline->total_scratch = 0;
1030
1031 if (pipeline->shaders[VK_SHADER_STAGE_VERTEX]) {
1032 struct brw_vs_prog_key vs_key;
1033 struct gl_vertex_program *vp = (struct gl_vertex_program *)
1034 program->_LinkedShaders[MESA_SHADER_VERTEX]->Program;
1035 struct brw_vertex_program *bvp = brw_vertex_program(vp);
1036
1037 brw_vs_populate_key(brw, bvp, &vs_key);
1038
1039 success = really_do_vs_prog(brw, program, bvp, &vs_key, pipeline);
1040 fail_if(!success, "do_vs_prog failed\n");
1041 add_compiled_stage(pipeline, VK_SHADER_STAGE_VERTEX,
1042 &pipeline->vs_prog_data.base.base);
1043
1044 if (vp->Base.OutputsWritten & BITFIELD64_BIT(VARYING_SLOT_PSIZ))
1045 pipeline->writes_point_size = true;
1046 } else {
1047 memset(&pipeline->vs_prog_data, 0, sizeof(pipeline->vs_prog_data));
1048 pipeline->vs_simd8 = NO_KERNEL;
1049 pipeline->vs_vec4 = NO_KERNEL;
1050 }
1051
1052
1053 if (pipeline->shaders[VK_SHADER_STAGE_GEOMETRY]) {
1054 struct brw_gs_prog_key gs_key;
1055 struct gl_geometry_program *gp = (struct gl_geometry_program *)
1056 program->_LinkedShaders[MESA_SHADER_GEOMETRY]->Program;
1057 struct brw_geometry_program *bgp = brw_geometry_program(gp);
1058
1059 brw_gs_populate_key(brw, pipeline, bgp, &gs_key);
1060
1061 success = really_do_gs_prog(brw, program, bgp, &gs_key, pipeline);
1062 fail_if(!success, "do_gs_prog failed\n");
1063 add_compiled_stage(pipeline, VK_SHADER_STAGE_GEOMETRY,
1064 &pipeline->gs_prog_data.base.base);
1065
1066 if (gp->Base.OutputsWritten & BITFIELD64_BIT(VARYING_SLOT_PSIZ))
1067 pipeline->writes_point_size = true;
1068 } else {
1069 pipeline->gs_vec4 = NO_KERNEL;
1070 }
1071
1072 if (pipeline->shaders[VK_SHADER_STAGE_FRAGMENT]) {
1073 struct brw_wm_prog_key wm_key;
1074 struct gl_fragment_program *fp = (struct gl_fragment_program *)
1075 program->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
1076 struct brw_fragment_program *bfp = brw_fragment_program(fp);
1077
1078 brw_wm_populate_key(brw, bfp, &wm_key);
1079
1080 success = really_do_wm_prog(brw, program, bfp, &wm_key, pipeline);
1081 fail_if(!success, "do_wm_prog failed\n");
1082 add_compiled_stage(pipeline, VK_SHADER_STAGE_FRAGMENT,
1083 &pipeline->wm_prog_data.base);
1084 }
1085
1086 if (pipeline->shaders[VK_SHADER_STAGE_COMPUTE]) {
1087 struct brw_cs_prog_key cs_key;
1088 struct gl_compute_program *cp = (struct gl_compute_program *)
1089 program->_LinkedShaders[MESA_SHADER_COMPUTE]->Program;
1090 struct brw_compute_program *bcp = brw_compute_program(cp);
1091
1092 brw_cs_populate_key(brw, bcp, &cs_key);
1093
1094 success = brw_codegen_cs_prog(brw, program, bcp, &cs_key, pipeline);
1095 fail_if(!success, "brw_codegen_cs_prog failed\n");
1096 add_compiled_stage(pipeline, VK_SHADER_STAGE_COMPUTE,
1097 &pipeline->cs_prog_data.base);
1098 }
1099
1100 brw->ctx.Driver.DeleteShaderProgram(&brw->ctx, program);
1101
1102 struct anv_device *device = compiler->device;
1103 while (device->scratch_block_pool.bo.size < pipeline->total_scratch)
1104 anv_block_pool_alloc(&device->scratch_block_pool);
1105
1106 gen7_compute_urb_partition(pipeline);
1107
1108 return 0;
1109 }
1110
1111 /* This badly named function frees the struct anv_pipeline data that the compiler
1112 * allocates. Currently just the prog_data structs.
1113 */
1114 void
1115 anv_compiler_free(struct anv_pipeline *pipeline)
1116 {
1117 for (uint32_t stage = 0; stage < VK_SHADER_STAGE_NUM; stage++) {
1118 if (pipeline->prog_data[stage]) {
1119 free(pipeline->prog_data[stage]->map_entries);
1120 /* We only ever set up the params array because we don't do
1121 * non-UBO pull constants
1122 */
1123 anv_device_free(pipeline->device, pipeline->prog_data[stage]->param);
1124 }
1125 }
1126 }
1127
1128 }