Merge ../mesa into vulkan
[mesa.git] / src / vulkan / anv_compiler.cpp
1 /*
2 * Copyright © 2015 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 */
23
24 #include <sys/stat.h>
25 #include <unistd.h>
26 #include <fcntl.h>
27
28 #include "anv_private.h"
29 #include "anv_nir.h"
30
31 #include <brw_context.h>
32 #include <brw_wm.h> /* brw_new_shader_program is here */
33 #include <brw_nir.h>
34
35 #include <brw_vs.h>
36 #include <brw_gs.h>
37 #include <brw_cs.h>
38 #include "brw_vec4_gs_visitor.h"
39
40 #include <mesa/main/shaderobj.h>
41 #include <mesa/main/fbobject.h>
42 #include <mesa/main/context.h>
43 #include <mesa/program/program.h>
44 #include <glsl/program.h>
45
46 /* XXX: We need this to keep symbols in nir.h from conflicting with the
47 * generated GEN command packing headers. We need to fix *both* to not
48 * define something as generic as LOAD.
49 */
50 #undef LOAD
51
52 #include <glsl/nir/nir_spirv.h>
53
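/* First word of every SPIR-V binary; used below to sanity-check modules that
 * are handed to us as SPIR-V rather than as a prebuilt NIR shader. */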
54 #define SPIR_V_MAGIC_NUMBER 0x07230203
55
56 static void
57 fail_if(int cond, const char *format, ...)
58 {
59 va_list args;
60
61 if (!cond)
62 return;
63
64 va_start(args, format);
65 vfprintf(stderr, format, args);
66 va_end(args);
67
68 exit(1);
69 }
70
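/* Builds the flattened binding table map for one stage: each descriptor set's
 * surfaces get consecutive binding table indices, biased by MAX_RTS for the
 * fragment stage so that render targets occupy the first slots. Illustrative
 * example (made-up counts): two sets with 3 and 2 fragment surfaces map to
 * indices MAX_RTS..MAX_RTS+2 and MAX_RTS+3..MAX_RTS+4. */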
71 static VkResult
72 set_binding_table_layout(struct brw_stage_prog_data *prog_data,
73 struct anv_pipeline *pipeline, uint32_t stage)
74 {
75 uint32_t bias, count, k, *map;
76 struct anv_pipeline_layout *layout = pipeline->layout;
77
78 /* A null pipeline layout is valid for shaders that don't bind any resources. */
79 if (pipeline->layout == NULL)
80 return VK_SUCCESS;
81
82 if (stage == VK_SHADER_STAGE_FRAGMENT)
83 bias = MAX_RTS;
84 else
85 bias = 0;
86
87 count = layout->stage[stage].surface_count;
88 prog_data->map_entries =
89 (uint32_t *) malloc(count * sizeof(prog_data->map_entries[0]));
90 if (prog_data->map_entries == NULL)
91 return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
92
93 k = bias;
94 map = prog_data->map_entries;
95 for (uint32_t i = 0; i < layout->num_sets; i++) {
96 prog_data->bind_map[i].index = map;
97 for (uint32_t j = 0; j < layout->set[i].layout->stage[stage].surface_count; j++)
98 *map++ = k++;
99
100 prog_data->bind_map[i].index_count =
101 layout->set[i].layout->stage[stage].surface_count;
102 }
103
104 return VK_SUCCESS;
105 }
106
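/* Copies a compiled kernel into the pipeline's program stream (64-byte
 * aligned) and returns its offset within that stream. */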
107 static uint32_t
108 upload_kernel(struct anv_pipeline *pipeline, const void *data, size_t size)
109 {
110 struct anv_state state =
111 anv_state_stream_alloc(&pipeline->program_stream, size, 64);
112
113 assert(size < pipeline->program_stream.block_pool->block_size);
114
115 memcpy(state.map, data, size);
116
117 return state.offset;
118 }
119
120 static void
121 create_params_array(struct anv_pipeline *pipeline,
122 struct gl_shader *shader,
123 struct brw_stage_prog_data *prog_data)
124 {
125 VkShaderStage stage = anv_vk_shader_stage_for_mesa_stage(shader->Stage);
126 unsigned num_params = 0;
127
128 if (shader->num_uniform_components) {
129 /* If the shader uses any push constants at all, just give it the
130 * maximum possible number of parameters.
131 */
132 num_params += MAX_PUSH_CONSTANTS_SIZE / sizeof(float);
133 }
134
135 if (pipeline->layout && pipeline->layout->stage[stage].has_dynamic_offsets)
136 num_params += MAX_DYNAMIC_BUFFERS;
137
138 if (num_params == 0)
139 return;
140
141 prog_data->param = (const gl_constant_value **)
142 anv_device_alloc(pipeline->device,
143 num_params * sizeof(gl_constant_value *),
144 8, VK_SYSTEM_ALLOC_TYPE_INTERNAL_SHADER);
145
146 /* We now set the param values to be offsets into an
147 * anv_push_constants structure. Since the compiler doesn't
148 * actually dereference any of the gl_constant_value pointers in the
149 * params array, it doesn't really matter what we put here.
150 */
151 struct anv_push_constants *null_data = NULL;
152 for (unsigned i = 0; i < num_params; i++)
153 prog_data->param[i] =
154 (const gl_constant_value *)&null_data->client_data[i * sizeof(float)];
155 }
156
157 /**
158 * Return a bitfield where bit n is set if barycentric interpolation mode n
159 * (see enum brw_wm_barycentric_interp_mode) is needed by the fragment shader.
160 */
161 unsigned
162 brw_compute_barycentric_interp_modes(const struct brw_device_info *devinfo,
163 bool shade_model_flat,
164 bool persample_shading,
165 nir_shader *shader)
166 {
167 unsigned barycentric_interp_modes = 0;
168
169 nir_foreach_variable(var, &shader->inputs) {
170 enum glsl_interp_qualifier interp_qualifier =
171 (enum glsl_interp_qualifier) var->data.interpolation;
172 bool is_centroid = var->data.centroid && !persample_shading;
173 bool is_sample = var->data.sample || persample_shading;
174 bool is_gl_Color = (var->data.location == VARYING_SLOT_COL0) ||
175 (var->data.location == VARYING_SLOT_COL1);
176
177 /* Ignore WPOS and FACE, because they don't require interpolation. */
178 if (var->data.location == VARYING_SLOT_POS ||
179 var->data.location == VARYING_SLOT_FACE)
180 continue;
181
182 /* Determine the set (or sets) of barycentric coordinates needed to
183 * interpolate this variable. Note that when
184 * devinfo->needs_unlit_centroid_workaround is set, centroid interpolation
185 * uses PIXEL interpolation for unlit pixels and CENTROID interpolation
186 * for lit pixels, so we need both sets of barycentric coordinates.
187 */
188 if (interp_qualifier == INTERP_QUALIFIER_NOPERSPECTIVE) {
189 if (is_centroid) {
190 barycentric_interp_modes |=
191 1 << BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
192 } else if (is_sample) {
193 barycentric_interp_modes |=
194 1 << BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
195 }
196 if ((!is_centroid && !is_sample) ||
197 devinfo->needs_unlit_centroid_workaround) {
198 barycentric_interp_modes |=
199 1 << BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
200 }
201 } else if (interp_qualifier == INTERP_QUALIFIER_SMOOTH ||
202 (!(shade_model_flat && is_gl_Color) &&
203 interp_qualifier == INTERP_QUALIFIER_NONE)) {
204 if (is_centroid) {
205 barycentric_interp_modes |=
206 1 << BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
207 } else if (is_sample) {
208 barycentric_interp_modes |=
209 1 << BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
210 }
211 if ((!is_centroid && !is_sample) ||
212 devinfo->needs_unlit_centroid_workaround) {
213 barycentric_interp_modes |=
214 1 << BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
215 }
216 }
217 }
218
219 return barycentric_interp_modes;
220 }
221
222 static void
223 brw_vs_populate_key(struct brw_context *brw,
224 struct brw_vertex_program *vp,
225 struct brw_vs_prog_key *key)
226 {
227 struct gl_context *ctx = &brw->ctx;
228 /* BRW_NEW_VERTEX_PROGRAM */
229 struct gl_program *prog = (struct gl_program *) vp;
230
231 memset(key, 0, sizeof(*key));
232
233 /* Just upload the program verbatim for now. Always send it all
234 * the inputs it asks for, whether they are varying or not.
235 */
236 key->program_string_id = vp->id;
237
238 /* _NEW_POLYGON */
239 if (brw->gen < 6) {
240 key->copy_edgeflag = (ctx->Polygon.FrontMode != GL_FILL ||
241 ctx->Polygon.BackMode != GL_FILL);
242 }
243
244 if (prog->OutputsWritten & (VARYING_BIT_COL0 | VARYING_BIT_COL1 |
245 VARYING_BIT_BFC0 | VARYING_BIT_BFC1)) {
246 /* _NEW_LIGHT | _NEW_BUFFERS */
247 key->clamp_vertex_color = ctx->Light._ClampVertexColor;
248 }
249
250 /* _NEW_POINT */
251 if (brw->gen < 6 && ctx->Point.PointSprite) {
252 for (int i = 0; i < 8; i++) {
253 if (ctx->Point.CoordReplace[i])
254 key->point_coord_replace |= (1 << i);
255 }
256 }
257 }
258
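/* Compiles the vertex shader with the brw backend and uploads the kernel;
 * depending on the dispatch mode the backend picks, it is recorded as the
 * SIMD8 or the vec4 VS kernel. */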
259 static bool
260 really_do_vs_prog(struct brw_context *brw,
261 struct gl_shader_program *prog,
262 struct brw_vertex_program *vp,
263 struct brw_vs_prog_key *key, struct anv_pipeline *pipeline)
264 {
265 GLuint program_size;
266 const GLuint *program;
267 struct brw_vs_prog_data *prog_data = &pipeline->vs_prog_data;
268 void *mem_ctx;
269 struct gl_shader *vs = NULL;
270
271 if (prog)
272 vs = prog->_LinkedShaders[MESA_SHADER_VERTEX];
273
274 memset(prog_data, 0, sizeof(*prog_data));
275
276 mem_ctx = ralloc_context(NULL);
277
278 create_params_array(pipeline, vs, &prog_data->base.base);
279 anv_nir_apply_dynamic_offsets(pipeline, vs->Program->nir,
280 &prog_data->base.base);
281
282 GLbitfield64 outputs_written = vp->program.Base.OutputsWritten;
283 prog_data->inputs_read = vp->program.Base.InputsRead;
284
285 if (key->copy_edgeflag) {
286 outputs_written |= BITFIELD64_BIT(VARYING_SLOT_EDGE);
287 prog_data->inputs_read |= VERT_BIT_EDGEFLAG;
288 }
289
290 if (brw->gen < 6) {
291 /* Put dummy slots into the VUE for the SF to put the replaced
292 * point sprite coords in. We shouldn't need these dummy slots,
293 * which take up precious URB space, but it would mean that the SF
294 * doesn't get nice aligned pairs of input coords into output
295 * coords, which would be a pain to handle.
296 */
297 for (int i = 0; i < 8; i++) {
298 if (key->point_coord_replace & (1 << i))
299 outputs_written |= BITFIELD64_BIT(VARYING_SLOT_TEX0 + i);
300 }
301
302 /* if back colors are written, allocate slots for front colors too */
303 if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_BFC0))
304 outputs_written |= BITFIELD64_BIT(VARYING_SLOT_COL0);
305 if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_BFC1))
306 outputs_written |= BITFIELD64_BIT(VARYING_SLOT_COL1);
307 }
308
309 /* In order for legacy clipping to work, we need to populate the clip
310 * distance varying slots whenever clipping is enabled, even if the vertex
311 * shader doesn't write to gl_ClipDistance.
312 */
313 if (key->nr_userclip_plane_consts) {
314 outputs_written |= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0);
315 outputs_written |= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1);
316 }
317
318 brw_compute_vue_map(brw->intelScreen->devinfo,
319 &prog_data->base.vue_map, outputs_written,
320 prog ? prog->SeparateShader : false);
321
322 set_binding_table_layout(&prog_data->base.base, pipeline,
323 VK_SHADER_STAGE_VERTEX);
324
325 /* Emit the shader code.
326 */
327 program = brw_vs_emit(brw, mem_ctx, key, prog_data, &vp->program,
328 prog, -1, &program_size);
329 if (program == NULL) {
330 ralloc_free(mem_ctx);
331 return false;
332 }
333
334 const uint32_t offset = upload_kernel(pipeline, program, program_size);
335 if (prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8) {
336 pipeline->vs_simd8 = offset;
337 pipeline->vs_vec4 = NO_KERNEL;
338 } else {
339 pipeline->vs_simd8 = NO_KERNEL;
340 pipeline->vs_vec4 = offset;
341 }
342
343 ralloc_free(mem_ctx);
344
345 return true;
346 }
347
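/* Populates the FS compile key. Since there is no real GL framebuffer in
 * Vulkan, a dummy user FBO is temporarily installed on the context so the
 * inherited GL key-population logic below has something to look at. */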
348 void brw_wm_populate_key(struct brw_context *brw,
349 struct brw_fragment_program *fp,
350 struct brw_wm_prog_key *key)
351 {
352 struct gl_context *ctx = &brw->ctx;
353 GLuint lookup = 0;
354 GLuint line_aa;
355 bool program_uses_dfdy = fp->program.UsesDFdy;
356 struct gl_framebuffer draw_buffer;
357 bool multisample_fbo;
358
359 memset(key, 0, sizeof(*key));
360
361 for (int i = 0; i < MAX_SAMPLERS; i++) {
362 /* Assume color sampler, no swizzling. */
363 key->tex.swizzles[i] = SWIZZLE_XYZW;
364 }
365
366 /* A non-zero framebuffer name indicates that the framebuffer was created by
367 * the user rather than the window system. */
368 draw_buffer.Name = 1;
369 draw_buffer.Visual.samples = 1;
370 draw_buffer._NumColorDrawBuffers = 1;
372 draw_buffer.Width = 400;
373 draw_buffer.Height = 400;
374 ctx->DrawBuffer = &draw_buffer;
375
376 multisample_fbo = ctx->DrawBuffer->Visual.samples > 1;
377
378 /* Build the index for table lookup
379 */
380 if (brw->gen < 6) {
381 /* _NEW_COLOR */
382 if (fp->program.UsesKill || ctx->Color.AlphaEnabled)
383 lookup |= IZ_PS_KILL_ALPHATEST_BIT;
384
385 if (fp->program.Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
386 lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
387
388 /* _NEW_DEPTH */
389 if (ctx->Depth.Test)
390 lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
391
392 if (ctx->Depth.Test && ctx->Depth.Mask) /* ?? */
393 lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
394
395 /* _NEW_STENCIL | _NEW_BUFFERS */
396 if (ctx->Stencil._Enabled) {
397 lookup |= IZ_STENCIL_TEST_ENABLE_BIT;
398
399 if (ctx->Stencil.WriteMask[0] ||
400 ctx->Stencil.WriteMask[ctx->Stencil._BackFace])
401 lookup |= IZ_STENCIL_WRITE_ENABLE_BIT;
402 }
403 key->iz_lookup = lookup;
404 }
405
406 line_aa = AA_NEVER;
407
408 /* _NEW_LINE, _NEW_POLYGON, BRW_NEW_REDUCED_PRIMITIVE */
409 if (ctx->Line.SmoothFlag) {
410 if (brw->reduced_primitive == GL_LINES) {
411 line_aa = AA_ALWAYS;
412 }
413 else if (brw->reduced_primitive == GL_TRIANGLES) {
414 if (ctx->Polygon.FrontMode == GL_LINE) {
415 line_aa = AA_SOMETIMES;
416
417 if (ctx->Polygon.BackMode == GL_LINE ||
418 (ctx->Polygon.CullFlag &&
419 ctx->Polygon.CullFaceMode == GL_BACK))
420 line_aa = AA_ALWAYS;
421 }
422 else if (ctx->Polygon.BackMode == GL_LINE) {
423 line_aa = AA_SOMETIMES;
424
425 if ((ctx->Polygon.CullFlag &&
426 ctx->Polygon.CullFaceMode == GL_FRONT))
427 line_aa = AA_ALWAYS;
428 }
429 }
430 }
431
432 key->line_aa = line_aa;
433
434 /* _NEW_HINT */
435 key->high_quality_derivatives =
436 ctx->Hint.FragmentShaderDerivative == GL_NICEST;
437
438 if (brw->gen < 6)
439 key->stats_wm = brw->stats_wm;
440
441 /* _NEW_LIGHT */
442 key->flat_shade = (ctx->Light.ShadeModel == GL_FLAT);
443
444 /* _NEW_FRAG_CLAMP | _NEW_BUFFERS */
445 key->clamp_fragment_color = ctx->Color._ClampFragmentColor;
446
447 /* _NEW_BUFFERS */
448 /*
449 * Include the draw buffer origin and height so that we can calculate
450 * fragment position values relative to the bottom left of the drawable,
451 * from the incoming screen origin relative position we get as part of our
452 * payload.
453 *
454 * This is only needed for the WM_WPOSXY opcode when the fragment program
455 * uses the gl_FragCoord input.
456 *
457 * We could avoid recompiling by including this as a constant referenced by
458 * our program, but if we were to do that it would also be nice to handle
459 * getting that constant updated at batchbuffer submit time (when we
460 * hold the lock and know where the buffer really is) rather than at emit
461 * time when we don't hold the lock and are just guessing. We could also
462 * just avoid using this as key data if the program doesn't use
463 * fragment.position.
464 *
465 * For DRI2 the origin_x/y will always be (0,0) but we still need the
466 * drawable height in order to invert the Y axis.
467 */
468 if (fp->program.Base.InputsRead & VARYING_BIT_POS) {
469 key->drawable_height = ctx->DrawBuffer->Height;
470 }
471
472 if ((fp->program.Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
473 key->render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
474 }
475
476 /* _NEW_BUFFERS */
477 key->nr_color_regions = ctx->DrawBuffer->_NumColorDrawBuffers;
478
479 /* _NEW_MULTISAMPLE, _NEW_COLOR, _NEW_BUFFERS */
480 key->replicate_alpha = ctx->DrawBuffer->_NumColorDrawBuffers > 1 &&
481 (ctx->Multisample.SampleAlphaToCoverage || ctx->Color.AlphaEnabled);
482
483 /* _NEW_BUFFERS _NEW_MULTISAMPLE */
484 /* Ignore sample qualifier while computing this flag. */
485 key->persample_shading =
486 _mesa_get_min_invocations_per_fragment(ctx, &fp->program, true) > 1;
487 if (key->persample_shading)
488 key->persample_2x = ctx->DrawBuffer->Visual.samples == 2;
489
490 key->compute_pos_offset =
491 _mesa_get_min_invocations_per_fragment(ctx, &fp->program, false) > 1 &&
492 fp->program.Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_POS;
493
494 key->compute_sample_id =
495 multisample_fbo &&
496 ctx->Multisample.Enabled &&
497 (fp->program.Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_ID);
498
499 /* BRW_NEW_VUE_MAP_GEOM_OUT */
500 if (brw->gen < 6 || _mesa_bitcount_64(fp->program.Base.InputsRead &
501 BRW_FS_VARYING_INPUT_MASK) > 16)
502 key->input_slots_valid = brw->vue_map_geom_out.slots_valid;
503
504
505 /* _NEW_COLOR | _NEW_BUFFERS */
506 /* Pre-gen6, the hardware alpha test always used each render
507 * target's alpha to do alpha test, as opposed to render target 0's alpha
508 * like GL requires. Fix that by building the alpha test into the
509 * shader, and we'll skip enabling the fixed function alpha test.
510 */
511 if (brw->gen < 6 && ctx->DrawBuffer->_NumColorDrawBuffers > 1 && ctx->Color.AlphaEnabled) {
512 key->alpha_test_func = ctx->Color.AlphaFunc;
513 key->alpha_test_ref = ctx->Color.AlphaRef;
514 }
515
516 /* The unique fragment program ID */
517 key->program_string_id = fp->id;
518
519 ctx->DrawBuffer = NULL;
520 }
521
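/* Maps the fragment shader's gl_FragDepth layout qualifier onto the
 * hardware's PS computed-depth mode. */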
522 static uint8_t
523 computed_depth_mode(struct gl_fragment_program *fp)
524 {
525 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
526 switch (fp->FragDepthLayout) {
527 case FRAG_DEPTH_LAYOUT_NONE:
528 case FRAG_DEPTH_LAYOUT_ANY:
529 return BRW_PSCDEPTH_ON;
530 case FRAG_DEPTH_LAYOUT_GREATER:
531 return BRW_PSCDEPTH_ON_GE;
532 case FRAG_DEPTH_LAYOUT_LESS:
533 return BRW_PSCDEPTH_ON_LE;
534 case FRAG_DEPTH_LAYOUT_UNCHANGED:
535 return BRW_PSCDEPTH_OFF;
536 }
537 }
538 return BRW_PSCDEPTH_OFF;
539 }
540
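/* Compiles the fragment shader with the brw backend and uploads the result;
 * the SIMD8 and SIMD16 variants share one upload and are addressed via
 * prog_offset_16. */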
541 static bool
542 really_do_wm_prog(struct brw_context *brw,
543 struct gl_shader_program *prog,
544 struct brw_fragment_program *fp,
545 struct brw_wm_prog_key *key, struct anv_pipeline *pipeline)
546 {
547 void *mem_ctx = ralloc_context(NULL);
548 struct brw_wm_prog_data *prog_data = &pipeline->wm_prog_data;
549 struct gl_shader *fs = NULL;
550 unsigned int program_size;
551 const uint32_t *program;
552
553 if (prog)
554 fs = prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
555
556 memset(prog_data, 0, sizeof(*prog_data));
557
558 /* key->alpha_test_func means simulating alpha testing via discards,
559 * so the shader definitely kills pixels.
560 */
561 prog_data->uses_kill = fp->program.UsesKill || key->alpha_test_func;
562
563 prog_data->computed_depth_mode = computed_depth_mode(&fp->program);
564
565 create_params_array(pipeline, fs, &prog_data->base);
566 anv_nir_apply_dynamic_offsets(pipeline, fs->Program->nir, &prog_data->base);
567
568 prog_data->barycentric_interp_modes =
569 brw_compute_barycentric_interp_modes(brw->intelScreen->devinfo,
570 key->flat_shade,
571 key->persample_shading,
572 fp->program.Base.nir);
573
574 set_binding_table_layout(&prog_data->base, pipeline,
575 VK_SHADER_STAGE_FRAGMENT);
576 /* This needs to come after shader time and pull constant entries, but we
577 * don't have those set up now, so just put it after the layout entries.
578 */
579 prog_data->binding_table.render_target_start = 0;
580
581 program = brw_wm_fs_emit(brw, mem_ctx, key, prog_data,
582 &fp->program, prog, -1, -1, &program_size);
583 if (program == NULL) {
584 ralloc_free(mem_ctx);
585 return false;
586 }
587
588 uint32_t offset = upload_kernel(pipeline, program, program_size);
589
590 if (prog_data->no_8)
591 pipeline->ps_simd8 = NO_KERNEL;
592 else
593 pipeline->ps_simd8 = offset;
594
595 if (prog_data->no_8 || prog_data->prog_offset_16) {
596 pipeline->ps_simd16 = offset + prog_data->prog_offset_16;
597 } else {
598 pipeline->ps_simd16 = NO_KERNEL;
599 }
600
601 ralloc_free(mem_ctx);
602
603 return true;
604 }
605
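/* Compiles the geometry shader: sets up the control-data and URB entry layout
 * per the PRM restrictions quoted below, then emits and uploads the vec4 GS
 * kernel. */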
606 bool
607 anv_codegen_gs_prog(struct brw_context *brw,
608 struct gl_shader_program *prog,
609 struct brw_geometry_program *gp,
610 struct brw_gs_prog_key *key,
611 struct anv_pipeline *pipeline)
612 {
613 struct brw_gs_compile c;
614
615 memset(&c, 0, sizeof(c));
616 c.key = *key;
617 c.gp = gp;
618
619 c.prog_data.include_primitive_id =
620 (gp->program.Base.InputsRead & VARYING_BIT_PRIMITIVE_ID) != 0;
621
622 c.prog_data.invocations = gp->program.Invocations;
623
624 set_binding_table_layout(&c.prog_data.base.base,
625 pipeline, VK_SHADER_STAGE_GEOMETRY);
626
627 /* Allocate the references to the uniforms that will end up in the
628 * prog_data associated with the compiled program, and which will be freed
629 * by the state cache.
630 *
631 * Note: param_count needs to be num_uniform_components * 4, since we add
632 * padding around uniform values below vec4 size, so the worst case is that
633 * every uniform is a float which gets padded to the size of a vec4.
634 */
635 struct gl_shader *gs = prog->_LinkedShaders[MESA_SHADER_GEOMETRY];
636 int param_count = gp->program.Base.nir->num_uniforms * 4;
637
638 c.prog_data.base.base.param =
639 rzalloc_array(NULL, const gl_constant_value *, param_count);
640 c.prog_data.base.base.pull_param =
641 rzalloc_array(NULL, const gl_constant_value *, param_count);
642 c.prog_data.base.base.image_param =
643 rzalloc_array(NULL, struct brw_image_param, gs->NumImages);
644 c.prog_data.base.base.nr_params = param_count;
645 c.prog_data.base.base.nr_image_params = gs->NumImages;
646
647 brw_nir_setup_glsl_uniforms(gp->program.Base.nir, prog, &gp->program.Base,
648 &c.prog_data.base.base, false);
649
650 if (brw->gen >= 8) {
651 c.prog_data.static_vertex_count = !gp->program.Base.nir ? -1 :
652 nir_gs_count_vertices(gp->program.Base.nir);
653 }
654
655 if (brw->gen >= 7) {
656 if (gp->program.OutputType == GL_POINTS) {
657 /* When the output type is points, the geometry shader may output data
658 * to multiple streams, and EndPrimitive() has no effect. So we
659 * configure the hardware to interpret the control data as stream ID.
660 */
661 c.prog_data.control_data_format = GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID;
662
663 /* We only have to emit control bits if we are using streams */
664 if (prog->Geom.UsesStreams)
665 c.control_data_bits_per_vertex = 2;
666 else
667 c.control_data_bits_per_vertex = 0;
668 } else {
669 /* When the output type is triangle_strip or line_strip, EndPrimitive()
670 * may be used to terminate the current strip and start a new one
671 * (similar to primitive restart), and outputting data to multiple
672 * streams is not supported. So we configure the hardware to interpret
673 * the control data as EndPrimitive information (a.k.a. "cut bits").
674 */
675 c.prog_data.control_data_format = GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT;
676
677 /* We only need to output control data if the shader actually calls
678 * EndPrimitive().
679 */
680 c.control_data_bits_per_vertex = gp->program.UsesEndPrimitive ? 1 : 0;
681 }
682 } else {
683 /* There are no control data bits in gen6. */
684 c.control_data_bits_per_vertex = 0;
685
686 /* If it is using transform feedback, enable it */
687 if (prog->TransformFeedback.NumVarying)
688 c.prog_data.gen6_xfb_enabled = true;
689 else
690 c.prog_data.gen6_xfb_enabled = false;
691 }
692 c.control_data_header_size_bits =
693 gp->program.VerticesOut * c.control_data_bits_per_vertex;
694
695 /* 1 HWORD = 32 bytes = 256 bits */
696 c.prog_data.control_data_header_size_hwords =
697 ALIGN(c.control_data_header_size_bits, 256) / 256;
698
699 GLbitfield64 outputs_written = gp->program.Base.OutputsWritten;
700
701 brw_compute_vue_map(brw->intelScreen->devinfo,
702 &c.prog_data.base.vue_map, outputs_written,
703 prog ? prog->SeparateShader : false);
704
705 /* Compute the output vertex size.
706 *
707 * From the Ivy Bridge PRM, Vol2 Part1 7.2.1.1 STATE_GS - Output Vertex
708 * Size (p168):
709 *
710 * [0,62] indicating [1,63] 16B units
711 *
712 * Specifies the size of each vertex stored in the GS output entry
713 * (following any Control Header data) as a number of 128-bit units
714 * (minus one).
715 *
716 * Programming Restrictions: The vertex size must be programmed as a
717 * multiple of 32B units with the following exception: Rendering is
718 * disabled (as per SOL stage state) and the vertex size output by the
719 * GS thread is 16B.
720 *
721 * If rendering is enabled (as per SOL state) the vertex size must be
722 * programmed as a multiple of 32B units. In other words, the only time
723 * software can program a vertex size with an odd number of 16B units
724 * is when rendering is disabled.
725 *
726 * Note: B=bytes in the above text.
727 *
728 * It doesn't seem worth the extra trouble to optimize the case where the
729 * vertex size is 16B (especially since this would require special-casing
730 * the GEN assembly that writes to the URB). So we just set the vertex
731 * size to a multiple of 32B (2 vec4's) in all cases.
732 *
733 * The maximum output vertex size is 62*16 = 992 bytes (31 hwords). We
734 * budget that as follows:
735 *
736 * 512 bytes for varyings (a varying component is 4 bytes and
737 * gl_MaxGeometryOutputComponents = 128)
738 * 16 bytes overhead for VARYING_SLOT_PSIZ (each varying slot is 16
739 * bytes)
740 * 16 bytes overhead for gl_Position (we allocate it a slot in the VUE
741 * even if it's not used)
742 * 32 bytes overhead for gl_ClipDistance (we allocate it 2 VUE slots
743 * whenever clip planes are enabled, even if the shader doesn't
744 * write to gl_ClipDistance)
745 * 16 bytes overhead since the VUE size must be a multiple of 32 bytes
746 * (see below)--this causes up to 1 VUE slot to be wasted
747 * 400 bytes available for varying packing overhead
748 *
749 * Worst-case varying packing overhead is 3/4 of a varying slot (12 bytes)
750 * per interpolation type, so this is plenty.
751 *
752 */
753 unsigned output_vertex_size_bytes = c.prog_data.base.vue_map.num_slots * 16;
754 assert(brw->gen == 6 ||
755 output_vertex_size_bytes <= GEN7_MAX_GS_OUTPUT_VERTEX_SIZE_BYTES);
756 c.prog_data.output_vertex_size_hwords =
757 ALIGN(output_vertex_size_bytes, 32) / 32;
758
759 /* Compute URB entry size. The maximum allowed URB entry size is 32k.
760 * That divides up as follows:
761 *
762 * 64 bytes for the control data header (cut indices or StreamID bits)
763 * 4096 bytes for varyings (a varying component is 4 bytes and
764 * gl_MaxGeometryTotalOutputComponents = 1024)
765 * 4096 bytes overhead for VARYING_SLOT_PSIZ (each varying slot is 16
766 * bytes/vertex and gl_MaxGeometryOutputVertices is 256)
767 * 4096 bytes overhead for gl_Position (we allocate it a slot in the VUE
768 * even if it's not used)
769 * 8192 bytes overhead for gl_ClipDistance (we allocate it 2 VUE slots
770 * whenever clip planes are enabled, even if the shader doesn't
771 * write to gl_ClipDistance)
772 * 4096 bytes overhead since the VUE size must be a multiple of 32
773 * bytes (see above)--this causes up to 1 VUE slot to be wasted
774 * 8128 bytes available for varying packing overhead
775 *
776 * Worst-case varying packing overhead is 3/4 of a varying slot per
777 * interpolation type, which works out to 3072 bytes, so this would allow
778 * us to accommodate 2 interpolation types without any danger of running
779 * out of URB space.
780 *
781 * In practice, the risk of running out of URB space is very small, since
782 * the above figures are all worst-case, and most of them scale with the
783 * number of output vertices. So we'll just calculate the amount of space
784 * we need, and if it's too large, fail to compile.
785 *
786 * The above is for gen7+ where we have a single URB entry that will hold
787 * all the output. In gen6, we will have to allocate URB entries for every
788 * vertex we emit, so our URB entries only need to be large enough to hold
789 * a single vertex. Also, gen6 does not have a control data header.
790 */
791 unsigned output_size_bytes;
792 if (brw->gen >= 7) {
793 output_size_bytes =
794 c.prog_data.output_vertex_size_hwords * 32 * gp->program.VerticesOut;
795 output_size_bytes += 32 * c.prog_data.control_data_header_size_hwords;
796 } else {
797 output_size_bytes = c.prog_data.output_vertex_size_hwords * 32;
798 }
799
800 /* Broadwell stores "Vertex Count" as a full 8 DWord (32 byte) URB output,
801 * which comes before the control header.
802 */
803 if (brw->gen >= 8)
804 output_size_bytes += 32;
805
806 assert(output_size_bytes >= 1);
807 int max_output_size_bytes = GEN7_MAX_GS_URB_ENTRY_SIZE_BYTES;
808 if (brw->gen == 6)
809 max_output_size_bytes = GEN6_MAX_GS_URB_ENTRY_SIZE_BYTES;
810 if (output_size_bytes > max_output_size_bytes)
811 return false;
812
813
814 /* URB entry sizes are stored as a multiple of 64 bytes in gen7+ and
815 * a multiple of 128 bytes in gen6.
816 */
817 if (brw->gen >= 7)
818 c.prog_data.base.urb_entry_size = ALIGN(output_size_bytes, 64) / 64;
819 else
820 c.prog_data.base.urb_entry_size = ALIGN(output_size_bytes, 128) / 128;
821
822 /* FIXME: Need to pull this from nir shader. */
823 c.prog_data.output_topology = _3DPRIM_TRISTRIP;
824
825 /* The GLSL linker will have already matched up GS inputs and the outputs
826 * of prior stages. The driver does extend VS outputs in some cases, but
827 * only for legacy OpenGL or Gen4-5 hardware, neither of which offer
828 * geometry shader support. So we can safely ignore that.
829 *
830 * For SSO pipelines, we use a fixed VUE map layout based on variable
831 * locations, so we can rely on rendezvous-by-location making this work.
832 *
833 * However, we need to ignore VARYING_SLOT_PRIMITIVE_ID, as it's not
834 * written by previous stages and shows up via payload magic.
835 */
836 GLbitfield64 inputs_read =
837 gp->program.Base.InputsRead & ~VARYING_BIT_PRIMITIVE_ID;
838 brw_compute_vue_map(brw->intelScreen->devinfo,
839 &c.input_vue_map, inputs_read,
840 prog->SeparateShader);
841
842 /* GS inputs are read from the VUE 256 bits (2 vec4's) at a time, so we
843 * need to program a URB read length of ceiling(num_slots / 2).
844 */
845 c.prog_data.base.urb_read_length = (c.input_vue_map.num_slots + 1) / 2;
846
847 void *mem_ctx = ralloc_context(NULL);
848 unsigned program_size;
849 const unsigned *program =
850 brw_gs_emit(brw, prog, &c, mem_ctx, -1, &program_size);
851 if (program == NULL) {
852 ralloc_free(mem_ctx);
853 return false;
854 }
855
856 pipeline->gs_vec4 = upload_kernel(pipeline, program, program_size);
857 pipeline->gs_vertex_count = gp->program.VerticesIn;
858
859 ralloc_free(mem_ctx);
860
861 return true;
862 }
863
864 static bool
865 brw_codegen_cs_prog(struct brw_context *brw,
866 struct gl_shader_program *prog,
867 struct brw_compute_program *cp,
868 struct brw_cs_prog_key *key, struct anv_pipeline *pipeline)
869 {
870 const GLuint *program;
871 void *mem_ctx = ralloc_context(NULL);
872 GLuint program_size;
873 struct brw_cs_prog_data *prog_data = &pipeline->cs_prog_data;
874
875 struct gl_shader *cs = prog->_LinkedShaders[MESA_SHADER_COMPUTE];
876 assert(cs);
877
878 memset(prog_data, 0, sizeof(*prog_data));
879
880 set_binding_table_layout(&prog_data->base, pipeline, VK_SHADER_STAGE_COMPUTE);
881
882 create_params_array(pipeline, cs, &prog_data->base);
883 anv_nir_apply_dynamic_offsets(pipeline, cs->Program->nir, &prog_data->base);
884
885 program = brw_cs_emit(brw, mem_ctx, key, prog_data,
886 &cp->program, prog, -1, &program_size);
887 if (program == NULL) {
888 ralloc_free(mem_ctx);
889 return false;
890 }
891
892 if (unlikely(INTEL_DEBUG & DEBUG_CS))
893 fprintf(stderr, "\n");
894
895 pipeline->cs_simd = upload_kernel(pipeline, program, program_size);
896
897 ralloc_free(mem_ctx);
898
899 return true;
900 }
901
902 static void
903 brw_cs_populate_key(struct brw_context *brw,
904 struct brw_compute_program *bcp, struct brw_cs_prog_key *key)
905 {
906 memset(key, 0, sizeof(*key));
907
908 /* The unique compute program ID */
909 key->program_string_id = bcp->id;
910 }
911
912 struct anv_compiler {
913 struct anv_device *device;
914 struct intel_screen *screen;
915 struct brw_context *brw;
916 struct gl_pipeline_object pipeline;
917 };
918
919 extern "C" {
920
921 struct anv_compiler *
922 anv_compiler_create(struct anv_device *device)
923 {
924 const struct brw_device_info *devinfo = &device->info;
925 struct anv_compiler *compiler;
926 struct gl_context *ctx;
927
928 compiler = rzalloc(NULL, struct anv_compiler);
929 if (compiler == NULL)
930 return NULL;
931
932 compiler->screen = rzalloc(compiler, struct intel_screen);
933 if (compiler->screen == NULL)
934 goto fail;
935
936 compiler->brw = rzalloc(compiler, struct brw_context);
937 if (compiler->brw == NULL)
938 goto fail;
939
940 compiler->device = device;
941
942 compiler->brw->gen = devinfo->gen;
943 compiler->brw->is_g4x = devinfo->is_g4x;
944 compiler->brw->is_baytrail = devinfo->is_baytrail;
945 compiler->brw->is_haswell = devinfo->is_haswell;
946 compiler->brw->is_cherryview = devinfo->is_cherryview;
947
948 /* We need this at least for CS, which will check brw->max_cs_threads
949 * against the work group size. */
950 compiler->brw->max_vs_threads = devinfo->max_vs_threads;
951 compiler->brw->max_hs_threads = devinfo->max_hs_threads;
952 compiler->brw->max_ds_threads = devinfo->max_ds_threads;
953 compiler->brw->max_gs_threads = devinfo->max_gs_threads;
954 compiler->brw->max_wm_threads = devinfo->max_wm_threads;
955 compiler->brw->max_cs_threads = devinfo->max_cs_threads;
956 compiler->brw->urb.size = devinfo->urb.size;
957 compiler->brw->urb.min_vs_entries = devinfo->urb.min_vs_entries;
958 compiler->brw->urb.max_vs_entries = devinfo->urb.max_vs_entries;
959 compiler->brw->urb.max_hs_entries = devinfo->urb.max_hs_entries;
960 compiler->brw->urb.max_ds_entries = devinfo->urb.max_ds_entries;
961 compiler->brw->urb.max_gs_entries = devinfo->urb.max_gs_entries;
962
963 compiler->brw->intelScreen = compiler->screen;
964 compiler->screen->devinfo = &device->info;
965
966 brw_process_intel_debug_variable();
967
968 compiler->screen->compiler = brw_compiler_create(compiler, &device->info);
969
970 ctx = &compiler->brw->ctx;
971 _mesa_init_shader_object_functions(&ctx->Driver);
972
973 /* brw_select_clip_planes() needs this for bogus reasons. */
974 ctx->_Shader = &compiler->pipeline;
975
976 return compiler;
977
978 fail:
979 ralloc_free(compiler);
980 return NULL;
981 }
982
983 void
984 anv_compiler_destroy(struct anv_compiler *compiler)
985 {
986 _mesa_free_errors_data(&compiler->brw->ctx);
987 ralloc_free(compiler);
988 }
989
990 /* From gen7_urb.c */
991
992 /* FIXME: Add to struct intel_device_info */
993
994 static const int gen8_push_size = 32 * 1024;
995
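/* Splits the URB between push constants, VS entries and GS entries.
 * Illustrative example (made-up URB size): a 256 kB URB in 8 kB chunks gives
 * 32 chunks; 32 kB of push constants takes 4, the VS and GS minimums are
 * carved out next, and any leftover chunks are handed out in proportion to
 * how much extra space each stage could actually use. */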
996 static void
997 gen7_compute_urb_partition(struct anv_pipeline *pipeline)
998 {
999 const struct brw_device_info *devinfo = &pipeline->device->info;
1000 bool vs_present = pipeline->vs_simd8 != NO_KERNEL;
1001 unsigned vs_size = vs_present ? pipeline->vs_prog_data.base.urb_entry_size : 1;
1002 unsigned vs_entry_size_bytes = vs_size * 64;
1003 bool gs_present = pipeline->gs_vec4 != NO_KERNEL;
1004 unsigned gs_size = gs_present ? pipeline->gs_prog_data.base.urb_entry_size : 1;
1005 unsigned gs_entry_size_bytes = gs_size * 64;
1006
1007 /* From p35 of the Ivy Bridge PRM (section 1.7.1: 3DSTATE_URB_GS):
1008 *
1009 * VS Number of URB Entries must be divisible by 8 if the VS URB Entry
1010 * Allocation Size is less than 9 512-bit URB entries.
1011 *
1012 * Similar text exists for GS.
1013 */
1014 unsigned vs_granularity = (vs_size < 9) ? 8 : 1;
1015 unsigned gs_granularity = (gs_size < 9) ? 8 : 1;
1016
1017 /* URB allocations must be done in 8k chunks. */
1018 unsigned chunk_size_bytes = 8192;
1019
1020 /* Determine the size of the URB in chunks. */
1021 unsigned urb_chunks = devinfo->urb.size * 1024 / chunk_size_bytes;
1022
1023 /* Reserve space for push constants */
1024 unsigned push_constant_bytes = gen8_push_size;
1025 unsigned push_constant_chunks =
1026 push_constant_bytes / chunk_size_bytes;
1027
1028 /* Initially, assign each stage the minimum amount of URB space it needs,
1029 * and make a note of how much additional space it "wants" (the amount of
1030 * additional space it could actually make use of).
1031 */
1032
1033 /* VS has a lower limit on the number of URB entries */
1034 unsigned vs_chunks =
1035 ALIGN(devinfo->urb.min_vs_entries * vs_entry_size_bytes,
1036 chunk_size_bytes) / chunk_size_bytes;
1037 unsigned vs_wants =
1038 ALIGN(devinfo->urb.max_vs_entries * vs_entry_size_bytes,
1039 chunk_size_bytes) / chunk_size_bytes - vs_chunks;
1040
1041 unsigned gs_chunks = 0;
1042 unsigned gs_wants = 0;
1043 if (gs_present) {
1044 /* There are two constraints on the minimum amount of URB space we can
1045 * allocate:
1046 *
1047 * (1) We need room for at least 2 URB entries, since we always operate
1048 * the GS in DUAL_OBJECT mode.
1049 *
1050 * (2) We can't allocate fewer entries than gs_granularity.
1051 */
1052 gs_chunks = ALIGN(MAX2(gs_granularity, 2) * gs_entry_size_bytes,
1053 chunk_size_bytes) / chunk_size_bytes;
1054 gs_wants =
1055 ALIGN(devinfo->urb.max_gs_entries * gs_entry_size_bytes,
1056 chunk_size_bytes) / chunk_size_bytes - gs_chunks;
1057 }
1058
1059 /* There should always be enough URB space to satisfy the minimum
1060 * requirements of each stage.
1061 */
1062 unsigned total_needs = push_constant_chunks + vs_chunks + gs_chunks;
1063 assert(total_needs <= urb_chunks);
1064
1065 /* Mete out remaining space (if any) in proportion to "wants". */
1066 unsigned total_wants = vs_wants + gs_wants;
1067 unsigned remaining_space = urb_chunks - total_needs;
1068 if (remaining_space > total_wants)
1069 remaining_space = total_wants;
1070 if (remaining_space > 0) {
1071 unsigned vs_additional = (unsigned)
1072 round(vs_wants * (((double) remaining_space) / total_wants));
1073 vs_chunks += vs_additional;
1074 remaining_space -= vs_additional;
1075 gs_chunks += remaining_space;
1076 }
1077
1078 /* Sanity check that we haven't over-allocated. */
1079 assert(push_constant_chunks + vs_chunks + gs_chunks <= urb_chunks);
1080
1081 /* Finally, compute the number of entries that can fit in the space
1082 * allocated to each stage.
1083 */
1084 unsigned nr_vs_entries = vs_chunks * chunk_size_bytes / vs_entry_size_bytes;
1085 unsigned nr_gs_entries = gs_chunks * chunk_size_bytes / gs_entry_size_bytes;
1086
1087 /* Since we rounded up when computing *_wants, this may be slightly more
1088 * than the maximum allowed amount, so correct for that.
1089 */
1090 nr_vs_entries = MIN2(nr_vs_entries, devinfo->urb.max_vs_entries);
1091 nr_gs_entries = MIN2(nr_gs_entries, devinfo->urb.max_gs_entries);
1092
1093 /* Ensure that we program a multiple of the granularity. */
1094 nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, vs_granularity);
1095 nr_gs_entries = ROUND_DOWN_TO(nr_gs_entries, gs_granularity);
1096
1097 /* Finally, sanity check to make sure we have at least the minimum number
1098 * of entries needed for each stage.
1099 */
1100 assert(nr_vs_entries >= devinfo->urb.min_vs_entries);
1101 if (gs_present)
1102 assert(nr_gs_entries >= 2);
1103
1104 /* Lay out the URB in the following order:
1105 * - push constants
1106 * - VS
1107 * - GS
1108 */
1109 pipeline->urb.vs_start = push_constant_chunks;
1110 pipeline->urb.vs_size = vs_size;
1111 pipeline->urb.nr_vs_entries = nr_vs_entries;
1112
1113 pipeline->urb.gs_start = push_constant_chunks + vs_chunks;
1114 pipeline->urb.gs_size = gs_size;
1115 pipeline->urb.nr_gs_entries = nr_gs_entries;
1116 }
1117
1118 static const struct {
1119 uint32_t token;
1120 gl_shader_stage stage;
1121 const char *name;
1122 } stage_info[] = {
1123 { GL_VERTEX_SHADER, MESA_SHADER_VERTEX, "vertex" },
1124 { GL_TESS_CONTROL_SHADER, (gl_shader_stage)-1, "tess control" },
1125 { GL_TESS_EVALUATION_SHADER, (gl_shader_stage)-1, "tess evaluation" },
1126 { GL_GEOMETRY_SHADER, MESA_SHADER_GEOMETRY, "geometry" },
1127 { GL_FRAGMENT_SHADER, MESA_SHADER_FRAGMENT, "fragment" },
1128 { GL_COMPUTE_SHADER, MESA_SHADER_COMPUTE, "compute" },
1129 };
1130
1131 struct spirv_header {
1132 uint32_t magic;
1133 uint32_t version;
1134 uint32_t gen_magic;
1135 };
1136
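/* Mirrors the NIR shader's input/output variables into the gl_program
 * bitfields (InputsRead/OutputsWritten) and, for fragment shaders, the
 * per-location interpolation qualifiers, so the legacy brw key-population
 * code above can consume them. */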
1137 static void
1138 setup_nir_io(struct gl_shader *mesa_shader,
1139 nir_shader *shader)
1140 {
1141 struct gl_program *prog = mesa_shader->Program;
1142 foreach_list_typed(nir_variable, var, node, &shader->inputs) {
1143 prog->InputsRead |= BITFIELD64_BIT(var->data.location);
1144 if (shader->stage == MESA_SHADER_FRAGMENT) {
1145 struct gl_fragment_program *fprog = (struct gl_fragment_program *)prog;
1146
1147 fprog->InterpQualifier[var->data.location] =
1148 (glsl_interp_qualifier)var->data.interpolation;
1149 if (var->data.centroid)
1150 fprog->IsCentroid |= BITFIELD64_BIT(var->data.location);
1151 if (var->data.sample)
1152 fprog->IsSample |= BITFIELD64_BIT(var->data.location);
1153 }
1154 }
1155
1156 foreach_list_typed(nir_variable, var, node, &shader->outputs) {
1157 prog->OutputsWritten |= BITFIELD64_BIT(var->data.location);
1158 }
1159
1160 shader->info.inputs_read = prog->InputsRead;
1161 shader->info.outputs_written = prog->OutputsWritten;
1162
1163 mesa_shader->num_uniform_components = shader->num_uniforms;
1164 }
1165
1166 static void
1167 anv_compile_shader_spirv(struct anv_compiler *compiler,
1168 struct gl_shader_program *program,
1169 struct anv_pipeline *pipeline, uint32_t stage)
1170 {
1171 struct brw_context *brw = compiler->brw;
1172 struct anv_shader *shader = pipeline->shaders[stage];
1173 struct gl_shader *mesa_shader;
1174 int name = 0;
1175
1176 mesa_shader = brw_new_shader(&brw->ctx, name, stage_info[stage].token);
1177 fail_if(mesa_shader == NULL,
1178 "failed to create %s shader\n", stage_info[stage].name);
1179
1180 #define CREATE_PROGRAM(stage) \
1181 _mesa_init_##stage##_program(&brw->ctx, &ralloc(mesa_shader, struct brw_##stage##_program)->program, 0, 0)
1182
1183 bool is_scalar;
1184 struct gl_program *prog;
1185 switch (stage) {
1186 case VK_SHADER_STAGE_VERTEX:
1187 prog = CREATE_PROGRAM(vertex);
1188 is_scalar = compiler->screen->compiler->scalar_vs;
1189 break;
1190 case VK_SHADER_STAGE_GEOMETRY:
1191 prog = CREATE_PROGRAM(geometry);
1192 is_scalar = false;
1193 break;
1194 case VK_SHADER_STAGE_FRAGMENT:
1195 prog = CREATE_PROGRAM(fragment);
1196 is_scalar = true;
1197 break;
1198 case VK_SHADER_STAGE_COMPUTE:
1199 prog = CREATE_PROGRAM(compute);
1200 is_scalar = true;
1201 break;
1202 default:
1203 unreachable("Unsupported shader stage");
1204 }
1205 _mesa_reference_program(&brw->ctx, &mesa_shader->Program, prog);
1206
1207 mesa_shader->Program->Parameters =
1208 rzalloc(mesa_shader, struct gl_program_parameter_list);
1209
1210 mesa_shader->Type = stage_info[stage].token;
1211 mesa_shader->Stage = stage_info[stage].stage;
1212
1213 struct gl_shader_compiler_options *glsl_options =
1214 &compiler->screen->compiler->glsl_compiler_options[stage_info[stage].stage];
1215
1216 if (shader->module->nir) {
1217 /* Some things, such as our meta clear/blit code, will give us a NIR
1218 * shader directly. In that case, we ignore the SPIR-V entirely and
1219 * just use the NIR shader. */
1220 mesa_shader->Program->nir = shader->module->nir;
1221 mesa_shader->Program->nir->options = glsl_options->NirOptions;
1222 } else {
1223 uint32_t *spirv = (uint32_t *) shader->module->data;
1224 assert(spirv[0] == SPIR_V_MAGIC_NUMBER);
1225 assert(shader->module->size % 4 == 0);
1226
1227 mesa_shader->Program->nir =
1228 spirv_to_nir(spirv, shader->module->size / 4,
1229 stage_info[stage].stage, glsl_options->NirOptions);
1230 }
1231 fail_if(mesa_shader->Program->nir == NULL,
1232 "failed to translate SPIR-V to NIR\n");
1233
1234 nir_validate_shader(mesa_shader->Program->nir);
1235
1236 brw_process_nir(mesa_shader->Program->nir,
1237 compiler->screen->devinfo,
1238 NULL, mesa_shader->Stage, is_scalar);
1239
1240 setup_nir_io(mesa_shader, mesa_shader->Program->nir);
1241
1242 _mesa_reference_shader(&brw->ctx, &program->Shaders[program->NumShaders],
1243 mesa_shader);
1244 program->NumShaders++;
1245 }
1246
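/* Records a compiled stage on the pipeline: stores its prog_data, marks the
 * stage active, and reserves scratch space sized by the stage's maximum
 * thread count. */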
1247 static void
1248 add_compiled_stage(struct anv_pipeline *pipeline, uint32_t stage,
1249 struct brw_stage_prog_data *prog_data)
1250 {
1251 struct brw_device_info *devinfo = &pipeline->device->info;
1252 uint32_t max_threads[] = {
1253 [VK_SHADER_STAGE_VERTEX] = devinfo->max_vs_threads,
1254 [VK_SHADER_STAGE_TESS_CONTROL] = 0,
1255 [VK_SHADER_STAGE_TESS_EVALUATION] = 0,
1256 [VK_SHADER_STAGE_GEOMETRY] = devinfo->max_gs_threads,
1257 [VK_SHADER_STAGE_FRAGMENT] = devinfo->max_wm_threads,
1258 [VK_SHADER_STAGE_COMPUTE] = devinfo->max_cs_threads,
1259 };
1260
1261 pipeline->prog_data[stage] = prog_data;
1262 pipeline->active_stages |= 1 << stage;
1263 pipeline->scratch_start[stage] = pipeline->total_scratch;
1264 pipeline->total_scratch =
1265 align_u32(pipeline->total_scratch, 1024) +
1266 prog_data->total_scratch * max_threads[stage];
1267 }
1268
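/* Top-level compile entry point: wraps each anv_shader in Mesa gl_shader /
 * gl_program objects, runs the per-stage brw backends, and records the
 * kernels, prog_data, scratch requirements and URB partition on the
 * pipeline. */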
1269 int
1270 anv_compiler_run(struct anv_compiler *compiler, struct anv_pipeline *pipeline)
1271 {
1272 struct gl_shader_program *program;
1273 int name = 0;
1274 struct brw_context *brw = compiler->brw;
1275
1276 pipeline->writes_point_size = false;
1277
1278 /* When we free the pipeline, we detect stages based on the NULL status
1279 * of various prog_data pointers. Make them NULL by default.
1280 */
1281 memset(pipeline->prog_data, 0, sizeof(pipeline->prog_data));
1282 memset(pipeline->scratch_start, 0, sizeof(pipeline->scratch_start));
1283
1284 brw->use_rep_send = pipeline->use_repclear;
1285 brw->no_simd8 = pipeline->use_repclear;
1286
1287 program = _mesa_new_shader_program(name);
1288 program->Shaders = (struct gl_shader **)
1289 calloc(VK_SHADER_STAGE_NUM, sizeof(struct gl_shader *));
1290 fail_if(program == NULL || program->Shaders == NULL,
1291 "failed to create program\n");
1292
1293 for (unsigned i = 0; i < VK_SHADER_STAGE_NUM; i++) {
1294 if (pipeline->shaders[i])
1295 anv_compile_shader_spirv(compiler, program, pipeline, i);
1296 }
1297
1298 for (unsigned i = 0; i < program->NumShaders; i++) {
1299 struct gl_shader *shader = program->Shaders[i];
1300 program->_LinkedShaders[shader->Stage] = shader;
1301 }
1302
1303 bool success;
1304 pipeline->active_stages = 0;
1305 pipeline->total_scratch = 0;
1306
1307 if (pipeline->shaders[VK_SHADER_STAGE_VERTEX]) {
1308 struct brw_vs_prog_key vs_key;
1309 struct gl_vertex_program *vp = (struct gl_vertex_program *)
1310 program->_LinkedShaders[MESA_SHADER_VERTEX]->Program;
1311 struct brw_vertex_program *bvp = brw_vertex_program(vp);
1312
1313 brw_vs_populate_key(brw, bvp, &vs_key);
1314
1315 success = really_do_vs_prog(brw, program, bvp, &vs_key, pipeline);
1316 fail_if(!success, "do_vs_prog failed\n");
1317 add_compiled_stage(pipeline, VK_SHADER_STAGE_VERTEX,
1318 &pipeline->vs_prog_data.base.base);
1319
1320 if (vp->Base.OutputsWritten & VARYING_BIT_PSIZ)
1321 pipeline->writes_point_size = true;
1322 } else {
1323 memset(&pipeline->vs_prog_data, 0, sizeof(pipeline->vs_prog_data));
1324 pipeline->vs_simd8 = NO_KERNEL;
1325 pipeline->vs_vec4 = NO_KERNEL;
1326 }
1327
1328
1329 if (pipeline->shaders[VK_SHADER_STAGE_GEOMETRY]) {
1330 struct brw_gs_prog_key gs_key;
1331 struct gl_geometry_program *gp = (struct gl_geometry_program *)
1332 program->_LinkedShaders[MESA_SHADER_GEOMETRY]->Program;
1333 struct brw_geometry_program *bgp = brw_geometry_program(gp);
1334
1335 success = anv_codegen_gs_prog(brw, program, bgp, &gs_key, pipeline);
1336 fail_if(!success, "do_gs_prog failed\n");
1337 add_compiled_stage(pipeline, VK_SHADER_STAGE_GEOMETRY,
1338 &pipeline->gs_prog_data.base.base);
1339
1340 if (gp->Base.OutputsWritten & VARYING_BIT_PSIZ)
1341 pipeline->writes_point_size = true;
1342 } else {
1343 pipeline->gs_vec4 = NO_KERNEL;
1344 }
1345
1346 if (pipeline->shaders[VK_SHADER_STAGE_FRAGMENT]) {
1347 struct brw_wm_prog_key wm_key;
1348 struct gl_fragment_program *fp = (struct gl_fragment_program *)
1349 program->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
1350 struct brw_fragment_program *bfp = brw_fragment_program(fp);
1351
1352 brw_wm_populate_key(brw, bfp, &wm_key);
1353
1354 success = really_do_wm_prog(brw, program, bfp, &wm_key, pipeline);
1355 fail_if(!success, "do_wm_prog failed\n");
1356 add_compiled_stage(pipeline, VK_SHADER_STAGE_FRAGMENT,
1357 &pipeline->wm_prog_data.base);
1358 }
1359
1360 if (pipeline->shaders[VK_SHADER_STAGE_COMPUTE]) {
1361 struct brw_cs_prog_key cs_key;
1362 struct gl_compute_program *cp = (struct gl_compute_program *)
1363 program->_LinkedShaders[MESA_SHADER_COMPUTE]->Program;
1364 struct brw_compute_program *bcp = brw_compute_program(cp);
1365
1366 brw_cs_populate_key(brw, bcp, &cs_key);
1367
1368 success = brw_codegen_cs_prog(brw, program, bcp, &cs_key, pipeline);
1369 fail_if(!success, "brw_codegen_cs_prog failed\n");
1370 add_compiled_stage(pipeline, VK_SHADER_STAGE_COMPUTE,
1371 &pipeline->cs_prog_data.base);
1372 }
1373
1374 _mesa_delete_shader_program(&brw->ctx, program);
1375
1376 struct anv_device *device = compiler->device;
1377 while (device->scratch_block_pool.bo.size < pipeline->total_scratch)
1378 anv_block_pool_alloc(&device->scratch_block_pool);
1379
1380 gen7_compute_urb_partition(pipeline);
1381
1382 return 0;
1383 }
1384
1385 /* This badly named function frees the struct anv_pipeline data that the compiler
1386 * allocates. Currently just the prog_data structs.
1387 */
1388 void
1389 anv_compiler_free(struct anv_pipeline *pipeline)
1390 {
1391 for (uint32_t stage = 0; stage < VK_SHADER_STAGE_NUM; stage++) {
1392 if (pipeline->prog_data[stage]) {
1393 free(pipeline->prog_data[stage]->map_entries);
1394 /* We only ever set up the params array because we don't do
1395 * non-UBO pull constants
1396 */
1397 anv_device_free(pipeline->device, pipeline->prog_data[stage]->param);
1398 }
1399 }
1400 }
1401
1402 }