2 * Copyright © 2015 Intel Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
28 #include "anv_private.h"
31 #include <brw_context.h>
32 #include <brw_wm.h> /* brw_new_shader_program is here */
38 #include "brw_vec4_gs_visitor.h"
40 #include <mesa/main/shaderobj.h>
41 #include <mesa/main/fbobject.h>
42 #include <mesa/main/context.h>
43 #include <mesa/program/program.h>
44 #include <glsl/program.h>
46 /* XXX: We need this to keep symbols in nir.h from conflicting with the
47 * generated GEN command packing headers. We need to fix *both* to not
48 * define something as generic as LOAD.
52 #include <glsl/nir/nir_spirv.h>
/* First 32-bit word of every SPIR-V module; used to tell SPIR-V binaries
 * apart from GLSL source handed to vkCreateShaderModule/vkCreateShader. */
#define SPIR_V_MAGIC_NUMBER 0x07230203
/* Print a printf-style message to stderr and terminate the process, but
 * only when `cond` is non-zero.  Used for unrecoverable compile errors.
 *
 * Fix: the visible fragment was missing the va_list declaration, the
 * early return, va_end() and the exit() call; restored here so the
 * varargs usage is well-formed (va_start must be paired with va_end).
 */
static void
fail_if(int cond, const char *format, ...)
{
   va_list args;

   if (!cond)
      return;

   va_start(args, format);
   vfprintf(stderr, format, args);
   va_end(args);

   exit(1);
}
72 set_binding_table_layout(struct brw_stage_prog_data
*prog_data
,
73 struct anv_pipeline
*pipeline
, uint32_t stage
)
75 uint32_t bias
, count
, k
, *map
;
76 struct anv_pipeline_layout
*layout
= pipeline
->layout
;
78 /* No layout is valid for shaders that don't bind any resources. */
79 if (pipeline
->layout
== NULL
)
82 if (stage
== VK_SHADER_STAGE_FRAGMENT
)
87 count
= layout
->stage
[stage
].surface_count
;
88 prog_data
->map_entries
=
89 (uint32_t *) malloc(count
* sizeof(prog_data
->map_entries
[0]));
90 if (prog_data
->map_entries
== NULL
)
91 return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY
);
94 map
= prog_data
->map_entries
;
95 for (uint32_t i
= 0; i
< layout
->num_sets
; i
++) {
96 prog_data
->bind_map
[i
].index
= map
;
97 for (uint32_t j
= 0; j
< layout
->set
[i
].layout
->stage
[stage
].surface_count
; j
++)
100 prog_data
->bind_map
[i
].index_count
=
101 layout
->set
[i
].layout
->stage
[stage
].surface_count
;
108 upload_kernel(struct anv_pipeline
*pipeline
, const void *data
, size_t size
)
110 struct anv_state state
=
111 anv_state_stream_alloc(&pipeline
->program_stream
, size
, 64);
113 assert(size
< pipeline
->program_stream
.block_pool
->block_size
);
115 memcpy(state
.map
, data
, size
);
121 create_params_array(struct anv_pipeline
*pipeline
,
122 struct gl_shader
*shader
,
123 struct brw_stage_prog_data
*prog_data
)
125 VkShaderStage stage
= anv_vk_shader_stage_for_mesa_stage(shader
->Stage
);
126 unsigned num_params
= 0;
128 if (shader
->num_uniform_components
) {
129 /* If the shader uses any push constants at all, we'll just give
130 * them the maximum possible number
132 num_params
+= MAX_PUSH_CONSTANTS_SIZE
/ sizeof(float);
135 if (pipeline
->layout
&& pipeline
->layout
->stage
[stage
].has_dynamic_offsets
)
136 num_params
+= MAX_DYNAMIC_BUFFERS
;
141 prog_data
->param
= (const gl_constant_value
**)
142 anv_device_alloc(pipeline
->device
,
143 num_params
* sizeof(gl_constant_value
*),
144 8, VK_SYSTEM_ALLOC_TYPE_INTERNAL_SHADER
);
146 /* We now set the param values to be offsets into a
147 * anv_push_constant_data structure. Since the compiler doesn't
148 * actually dereference any of the gl_constant_value pointers in the
149 * params array, it doesn't really matter what we put here.
151 struct anv_push_constants
*null_data
= NULL
;
152 for (unsigned i
= 0; i
< num_params
; i
++)
153 prog_data
->param
[i
] =
154 (const gl_constant_value
*)&null_data
->client_data
[i
* sizeof(float)];
158 * Return a bitfield where bit n is set if barycentric interpolation mode n
159 * (see enum brw_wm_barycentric_interp_mode) is needed by the fragment shader.
162 brw_compute_barycentric_interp_modes(const struct brw_device_info
*devinfo
,
163 bool shade_model_flat
,
164 bool persample_shading
,
167 unsigned barycentric_interp_modes
= 0;
169 nir_foreach_variable(var
, &shader
->inputs
) {
170 enum glsl_interp_qualifier interp_qualifier
=
171 (enum glsl_interp_qualifier
) var
->data
.interpolation
;
172 bool is_centroid
= var
->data
.centroid
&& !persample_shading
;
173 bool is_sample
= var
->data
.sample
|| persample_shading
;
174 bool is_gl_Color
= (var
->data
.location
== VARYING_SLOT_COL0
) ||
175 (var
->data
.location
== VARYING_SLOT_COL1
);
177 /* Ignore WPOS and FACE, because they don't require interpolation. */
178 if (var
->data
.location
== VARYING_SLOT_POS
||
179 var
->data
.location
== VARYING_SLOT_FACE
)
182 /* Determine the set (or sets) of barycentric coordinates needed to
183 * interpolate this variable. Note that when
184 * brw->needs_unlit_centroid_workaround is set, centroid interpolation
185 * uses PIXEL interpolation for unlit pixels and CENTROID interpolation
186 * for lit pixels, so we need both sets of barycentric coordinates.
188 if (interp_qualifier
== INTERP_QUALIFIER_NOPERSPECTIVE
) {
190 barycentric_interp_modes
|=
191 1 << BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC
;
192 } else if (is_sample
) {
193 barycentric_interp_modes
|=
194 1 << BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC
;
196 if ((!is_centroid
&& !is_sample
) ||
197 devinfo
->needs_unlit_centroid_workaround
) {
198 barycentric_interp_modes
|=
199 1 << BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC
;
201 } else if (interp_qualifier
== INTERP_QUALIFIER_SMOOTH
||
202 (!(shade_model_flat
&& is_gl_Color
) &&
203 interp_qualifier
== INTERP_QUALIFIER_NONE
)) {
205 barycentric_interp_modes
|=
206 1 << BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC
;
207 } else if (is_sample
) {
208 barycentric_interp_modes
|=
209 1 << BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC
;
211 if ((!is_centroid
&& !is_sample
) ||
212 devinfo
->needs_unlit_centroid_workaround
) {
213 barycentric_interp_modes
|=
214 1 << BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC
;
219 return barycentric_interp_modes
;
223 brw_vs_populate_key(struct brw_context
*brw
,
224 struct brw_vertex_program
*vp
,
225 struct brw_vs_prog_key
*key
)
227 struct gl_context
*ctx
= &brw
->ctx
;
228 /* BRW_NEW_VERTEX_PROGRAM */
229 struct gl_program
*prog
= (struct gl_program
*) vp
;
231 memset(key
, 0, sizeof(*key
));
233 /* Just upload the program verbatim for now. Always send it all
234 * the inputs it asks for, whether they are varying or not.
236 key
->program_string_id
= vp
->id
;
240 key
->copy_edgeflag
= (ctx
->Polygon
.FrontMode
!= GL_FILL
||
241 ctx
->Polygon
.BackMode
!= GL_FILL
);
244 if (prog
->OutputsWritten
& (VARYING_BIT_COL0
| VARYING_BIT_COL1
|
245 VARYING_BIT_BFC0
| VARYING_BIT_BFC1
)) {
246 /* _NEW_LIGHT | _NEW_BUFFERS */
247 key
->clamp_vertex_color
= ctx
->Light
._ClampVertexColor
;
251 if (brw
->gen
< 6 && ctx
->Point
.PointSprite
) {
252 for (int i
= 0; i
< 8; i
++) {
253 if (ctx
->Point
.CoordReplace
[i
])
254 key
->point_coord_replace
|= (1 << i
);
260 really_do_vs_prog(struct brw_context
*brw
,
261 struct gl_shader_program
*prog
,
262 struct brw_vertex_program
*vp
,
263 struct brw_vs_prog_key
*key
, struct anv_pipeline
*pipeline
)
266 const GLuint
*program
;
267 struct brw_vs_prog_data
*prog_data
= &pipeline
->vs_prog_data
;
269 struct gl_shader
*vs
= NULL
;
272 vs
= prog
->_LinkedShaders
[MESA_SHADER_VERTEX
];
274 memset(prog_data
, 0, sizeof(*prog_data
));
276 mem_ctx
= ralloc_context(NULL
);
278 create_params_array(pipeline
, vs
, &prog_data
->base
.base
);
279 anv_nir_apply_dynamic_offsets(pipeline
, vs
->Program
->nir
,
280 &prog_data
->base
.base
);
282 GLbitfield64 outputs_written
= vp
->program
.Base
.OutputsWritten
;
283 prog_data
->inputs_read
= vp
->program
.Base
.InputsRead
;
285 if (key
->copy_edgeflag
) {
286 outputs_written
|= BITFIELD64_BIT(VARYING_SLOT_EDGE
);
287 prog_data
->inputs_read
|= VERT_BIT_EDGEFLAG
;
291 /* Put dummy slots into the VUE for the SF to put the replaced
292 * point sprite coords in. We shouldn't need these dummy slots,
293 * which take up precious URB space, but it would mean that the SF
294 * doesn't get nice aligned pairs of input coords into output
295 * coords, which would be a pain to handle.
297 for (int i
= 0; i
< 8; i
++) {
298 if (key
->point_coord_replace
& (1 << i
))
299 outputs_written
|= BITFIELD64_BIT(VARYING_SLOT_TEX0
+ i
);
302 /* if back colors are written, allocate slots for front colors too */
303 if (outputs_written
& BITFIELD64_BIT(VARYING_SLOT_BFC0
))
304 outputs_written
|= BITFIELD64_BIT(VARYING_SLOT_COL0
);
305 if (outputs_written
& BITFIELD64_BIT(VARYING_SLOT_BFC1
))
306 outputs_written
|= BITFIELD64_BIT(VARYING_SLOT_COL1
);
309 /* In order for legacy clipping to work, we need to populate the clip
310 * distance varying slots whenever clipping is enabled, even if the vertex
311 * shader doesn't write to gl_ClipDistance.
313 if (key
->nr_userclip_plane_consts
) {
314 outputs_written
|= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0
);
315 outputs_written
|= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1
);
318 brw_compute_vue_map(brw
->intelScreen
->devinfo
,
319 &prog_data
->base
.vue_map
, outputs_written
,
320 prog
? prog
->SeparateShader
: false);
322 set_binding_table_layout(&prog_data
->base
.base
, pipeline
,
323 VK_SHADER_STAGE_VERTEX
);
327 program
= brw_vs_emit(brw
, mem_ctx
, key
, prog_data
, &vp
->program
,
328 prog
, -1, &program_size
);
329 if (program
== NULL
) {
330 ralloc_free(mem_ctx
);
334 const uint32_t offset
= upload_kernel(pipeline
, program
, program_size
);
335 if (prog_data
->base
.dispatch_mode
== DISPATCH_MODE_SIMD8
) {
336 pipeline
->vs_simd8
= offset
;
337 pipeline
->vs_vec4
= NO_KERNEL
;
339 pipeline
->vs_simd8
= NO_KERNEL
;
340 pipeline
->vs_vec4
= offset
;
343 ralloc_free(mem_ctx
);
348 void brw_wm_populate_key(struct brw_context
*brw
,
349 struct brw_fragment_program
*fp
,
350 struct brw_wm_prog_key
*key
)
352 struct gl_context
*ctx
= &brw
->ctx
;
355 bool program_uses_dfdy
= fp
->program
.UsesDFdy
;
356 struct gl_framebuffer draw_buffer
;
357 bool multisample_fbo
;
359 memset(key
, 0, sizeof(*key
));
361 for (int i
= 0; i
< MAX_SAMPLERS
; i
++) {
362 /* Assume color sampler, no swizzling. */
363 key
->tex
.swizzles
[i
] = SWIZZLE_XYZW
;
366 /* A non-zero framebuffer name indicates that the framebuffer was created by
367 * the user rather than the window system. */
368 draw_buffer
.Name
= 1;
369 draw_buffer
.Visual
.samples
= 1;
370 draw_buffer
._NumColorDrawBuffers
= 1;
371 draw_buffer
._NumColorDrawBuffers
= 1;
372 draw_buffer
.Width
= 400;
373 draw_buffer
.Height
= 400;
374 ctx
->DrawBuffer
= &draw_buffer
;
376 multisample_fbo
= ctx
->DrawBuffer
->Visual
.samples
> 1;
378 /* Build the index for table lookup
382 if (fp
->program
.UsesKill
|| ctx
->Color
.AlphaEnabled
)
383 lookup
|= IZ_PS_KILL_ALPHATEST_BIT
;
385 if (fp
->program
.Base
.OutputsWritten
& BITFIELD64_BIT(FRAG_RESULT_DEPTH
))
386 lookup
|= IZ_PS_COMPUTES_DEPTH_BIT
;
390 lookup
|= IZ_DEPTH_TEST_ENABLE_BIT
;
392 if (ctx
->Depth
.Test
&& ctx
->Depth
.Mask
) /* ?? */
393 lookup
|= IZ_DEPTH_WRITE_ENABLE_BIT
;
395 /* _NEW_STENCIL | _NEW_BUFFERS */
396 if (ctx
->Stencil
._Enabled
) {
397 lookup
|= IZ_STENCIL_TEST_ENABLE_BIT
;
399 if (ctx
->Stencil
.WriteMask
[0] ||
400 ctx
->Stencil
.WriteMask
[ctx
->Stencil
._BackFace
])
401 lookup
|= IZ_STENCIL_WRITE_ENABLE_BIT
;
403 key
->iz_lookup
= lookup
;
408 /* _NEW_LINE, _NEW_POLYGON, BRW_NEW_REDUCED_PRIMITIVE */
409 if (ctx
->Line
.SmoothFlag
) {
410 if (brw
->reduced_primitive
== GL_LINES
) {
413 else if (brw
->reduced_primitive
== GL_TRIANGLES
) {
414 if (ctx
->Polygon
.FrontMode
== GL_LINE
) {
415 line_aa
= AA_SOMETIMES
;
417 if (ctx
->Polygon
.BackMode
== GL_LINE
||
418 (ctx
->Polygon
.CullFlag
&&
419 ctx
->Polygon
.CullFaceMode
== GL_BACK
))
422 else if (ctx
->Polygon
.BackMode
== GL_LINE
) {
423 line_aa
= AA_SOMETIMES
;
425 if ((ctx
->Polygon
.CullFlag
&&
426 ctx
->Polygon
.CullFaceMode
== GL_FRONT
))
432 key
->line_aa
= line_aa
;
435 key
->high_quality_derivatives
=
436 ctx
->Hint
.FragmentShaderDerivative
== GL_NICEST
;
439 key
->stats_wm
= brw
->stats_wm
;
442 key
->flat_shade
= (ctx
->Light
.ShadeModel
== GL_FLAT
);
444 /* _NEW_FRAG_CLAMP | _NEW_BUFFERS */
445 key
->clamp_fragment_color
= ctx
->Color
._ClampFragmentColor
;
449 * Include the draw buffer origin and height so that we can calculate
450 * fragment position values relative to the bottom left of the drawable,
451 * from the incoming screen origin relative position we get as part of our
454 * This is only needed for the WM_WPOSXY opcode when the fragment program
455 * uses the gl_FragCoord input.
457 * We could avoid recompiling by including this as a constant referenced by
458 * our program, but if we were to do that it would also be nice to handle
459 * getting that constant updated at batchbuffer submit time (when we
460 * hold the lock and know where the buffer really is) rather than at emit
461 * time when we don't hold the lock and are just guessing. We could also
462 * just avoid using this as key data if the program doesn't use
465 * For DRI2 the origin_x/y will always be (0,0) but we still need the
466 * drawable height in order to invert the Y axis.
468 if (fp
->program
.Base
.InputsRead
& VARYING_BIT_POS
) {
469 key
->drawable_height
= ctx
->DrawBuffer
->Height
;
472 if ((fp
->program
.Base
.InputsRead
& VARYING_BIT_POS
) || program_uses_dfdy
) {
473 key
->render_to_fbo
= _mesa_is_user_fbo(ctx
->DrawBuffer
);
477 key
->nr_color_regions
= ctx
->DrawBuffer
->_NumColorDrawBuffers
;
479 /* _NEW_MULTISAMPLE, _NEW_COLOR, _NEW_BUFFERS */
480 key
->replicate_alpha
= ctx
->DrawBuffer
->_NumColorDrawBuffers
> 1 &&
481 (ctx
->Multisample
.SampleAlphaToCoverage
|| ctx
->Color
.AlphaEnabled
);
483 /* _NEW_BUFFERS _NEW_MULTISAMPLE */
484 /* Ignore sample qualifier while computing this flag. */
485 key
->persample_shading
=
486 _mesa_get_min_invocations_per_fragment(ctx
, &fp
->program
, true) > 1;
487 if (key
->persample_shading
)
488 key
->persample_2x
= ctx
->DrawBuffer
->Visual
.samples
== 2;
490 key
->compute_pos_offset
=
491 _mesa_get_min_invocations_per_fragment(ctx
, &fp
->program
, false) > 1 &&
492 fp
->program
.Base
.SystemValuesRead
& SYSTEM_BIT_SAMPLE_POS
;
494 key
->compute_sample_id
=
496 ctx
->Multisample
.Enabled
&&
497 (fp
->program
.Base
.SystemValuesRead
& SYSTEM_BIT_SAMPLE_ID
);
499 /* BRW_NEW_VUE_MAP_GEOM_OUT */
500 if (brw
->gen
< 6 || _mesa_bitcount_64(fp
->program
.Base
.InputsRead
&
501 BRW_FS_VARYING_INPUT_MASK
) > 16)
502 key
->input_slots_valid
= brw
->vue_map_geom_out
.slots_valid
;
505 /* _NEW_COLOR | _NEW_BUFFERS */
506 /* Pre-gen6, the hardware alpha test always used each render
507 * target's alpha to do alpha test, as opposed to render target 0's alpha
508 * like GL requires. Fix that by building the alpha test into the
509 * shader, and we'll skip enabling the fixed function alpha test.
511 if (brw
->gen
< 6 && ctx
->DrawBuffer
->_NumColorDrawBuffers
> 1 && ctx
->Color
.AlphaEnabled
) {
512 key
->alpha_test_func
= ctx
->Color
.AlphaFunc
;
513 key
->alpha_test_ref
= ctx
->Color
.AlphaRef
;
516 /* The unique fragment program ID */
517 key
->program_string_id
= fp
->id
;
519 ctx
->DrawBuffer
= NULL
;
523 computed_depth_mode(struct gl_fragment_program
*fp
)
525 if (fp
->Base
.OutputsWritten
& BITFIELD64_BIT(FRAG_RESULT_DEPTH
)) {
526 switch (fp
->FragDepthLayout
) {
527 case FRAG_DEPTH_LAYOUT_NONE
:
528 case FRAG_DEPTH_LAYOUT_ANY
:
529 return BRW_PSCDEPTH_ON
;
530 case FRAG_DEPTH_LAYOUT_GREATER
:
531 return BRW_PSCDEPTH_ON_GE
;
532 case FRAG_DEPTH_LAYOUT_LESS
:
533 return BRW_PSCDEPTH_ON_LE
;
534 case FRAG_DEPTH_LAYOUT_UNCHANGED
:
535 return BRW_PSCDEPTH_OFF
;
538 return BRW_PSCDEPTH_OFF
;
542 really_do_wm_prog(struct brw_context
*brw
,
543 struct gl_shader_program
*prog
,
544 struct brw_fragment_program
*fp
,
545 struct brw_wm_prog_key
*key
, struct anv_pipeline
*pipeline
)
547 void *mem_ctx
= ralloc_context(NULL
);
548 struct brw_wm_prog_data
*prog_data
= &pipeline
->wm_prog_data
;
549 struct gl_shader
*fs
= NULL
;
550 unsigned int program_size
;
551 const uint32_t *program
;
554 fs
= prog
->_LinkedShaders
[MESA_SHADER_FRAGMENT
];
556 memset(prog_data
, 0, sizeof(*prog_data
));
558 /* key->alpha_test_func means simulating alpha testing via discards,
559 * so the shader definitely kills pixels.
561 prog_data
->uses_kill
= fp
->program
.UsesKill
|| key
->alpha_test_func
;
563 prog_data
->computed_depth_mode
= computed_depth_mode(&fp
->program
);
565 create_params_array(pipeline
, fs
, &prog_data
->base
);
566 anv_nir_apply_dynamic_offsets(pipeline
, fs
->Program
->nir
, &prog_data
->base
);
568 prog_data
->barycentric_interp_modes
=
569 brw_compute_barycentric_interp_modes(brw
->intelScreen
->devinfo
,
571 key
->persample_shading
,
572 fp
->program
.Base
.nir
);
574 set_binding_table_layout(&prog_data
->base
, pipeline
,
575 VK_SHADER_STAGE_FRAGMENT
);
576 /* This needs to come after shader time and pull constant entries, but we
577 * don't have those set up now, so just put it after the layout entries.
579 prog_data
->binding_table
.render_target_start
= 0;
581 program
= brw_wm_fs_emit(brw
, mem_ctx
, key
, prog_data
,
582 &fp
->program
, prog
, -1, -1, &program_size
);
583 if (program
== NULL
) {
584 ralloc_free(mem_ctx
);
588 uint32_t offset
= upload_kernel(pipeline
, program
, program_size
);
591 pipeline
->ps_simd8
= NO_KERNEL
;
593 pipeline
->ps_simd8
= offset
;
595 if (prog_data
->no_8
|| prog_data
->prog_offset_16
) {
596 pipeline
->ps_simd16
= offset
+ prog_data
->prog_offset_16
;
598 pipeline
->ps_simd16
= NO_KERNEL
;
601 ralloc_free(mem_ctx
);
607 anv_codegen_gs_prog(struct brw_context
*brw
,
608 struct gl_shader_program
*prog
,
609 struct brw_geometry_program
*gp
,
610 struct brw_gs_prog_key
*key
,
611 struct anv_pipeline
*pipeline
)
613 struct brw_gs_compile c
;
615 memset(&c
, 0, sizeof(c
));
619 c
.prog_data
.include_primitive_id
=
620 (gp
->program
.Base
.InputsRead
& VARYING_BIT_PRIMITIVE_ID
) != 0;
622 c
.prog_data
.invocations
= gp
->program
.Invocations
;
624 set_binding_table_layout(&c
.prog_data
.base
.base
,
625 pipeline
, VK_SHADER_STAGE_GEOMETRY
);
627 /* Allocate the references to the uniforms that will end up in the
628 * prog_data associated with the compiled program, and which will be freed
629 * by the state cache.
631 * Note: param_count needs to be num_uniform_components * 4, since we add
632 * padding around uniform values below vec4 size, so the worst case is that
633 * every uniform is a float which gets padded to the size of a vec4.
635 struct gl_shader
*gs
= prog
->_LinkedShaders
[MESA_SHADER_GEOMETRY
];
636 int param_count
= gp
->program
.Base
.nir
->num_uniforms
* 4;
638 c
.prog_data
.base
.base
.param
=
639 rzalloc_array(NULL
, const gl_constant_value
*, param_count
);
640 c
.prog_data
.base
.base
.pull_param
=
641 rzalloc_array(NULL
, const gl_constant_value
*, param_count
);
642 c
.prog_data
.base
.base
.image_param
=
643 rzalloc_array(NULL
, struct brw_image_param
, gs
->NumImages
);
644 c
.prog_data
.base
.base
.nr_params
= param_count
;
645 c
.prog_data
.base
.base
.nr_image_params
= gs
->NumImages
;
647 brw_nir_setup_glsl_uniforms(gp
->program
.Base
.nir
, prog
, &gp
->program
.Base
,
648 &c
.prog_data
.base
.base
, false);
651 c
.prog_data
.static_vertex_count
= !gp
->program
.Base
.nir
? -1 :
652 nir_gs_count_vertices(gp
->program
.Base
.nir
);
656 if (gp
->program
.OutputType
== GL_POINTS
) {
657 /* When the output type is points, the geometry shader may output data
658 * to multiple streams, and EndPrimitive() has no effect. So we
659 * configure the hardware to interpret the control data as stream ID.
661 c
.prog_data
.control_data_format
= GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID
;
663 /* We only have to emit control bits if we are using streams */
664 if (prog
->Geom
.UsesStreams
)
665 c
.control_data_bits_per_vertex
= 2;
667 c
.control_data_bits_per_vertex
= 0;
669 /* When the output type is triangle_strip or line_strip, EndPrimitive()
670 * may be used to terminate the current strip and start a new one
671 * (similar to primitive restart), and outputting data to multiple
672 * streams is not supported. So we configure the hardware to interpret
673 * the control data as EndPrimitive information (a.k.a. "cut bits").
675 c
.prog_data
.control_data_format
= GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT
;
677 /* We only need to output control data if the shader actually calls
680 c
.control_data_bits_per_vertex
= gp
->program
.UsesEndPrimitive
? 1 : 0;
683 /* There are no control data bits in gen6. */
684 c
.control_data_bits_per_vertex
= 0;
686 /* If it is using transform feedback, enable it */
687 if (prog
->TransformFeedback
.NumVarying
)
688 c
.prog_data
.gen6_xfb_enabled
= true;
690 c
.prog_data
.gen6_xfb_enabled
= false;
692 c
.control_data_header_size_bits
=
693 gp
->program
.VerticesOut
* c
.control_data_bits_per_vertex
;
695 /* 1 HWORD = 32 bytes = 256 bits */
696 c
.prog_data
.control_data_header_size_hwords
=
697 ALIGN(c
.control_data_header_size_bits
, 256) / 256;
699 GLbitfield64 outputs_written
= gp
->program
.Base
.OutputsWritten
;
701 brw_compute_vue_map(brw
->intelScreen
->devinfo
,
702 &c
.prog_data
.base
.vue_map
, outputs_written
,
703 prog
? prog
->SeparateShader
: false);
705 /* Compute the output vertex size.
707 * From the Ivy Bridge PRM, Vol2 Part1 7.2.1.1 STATE_GS - Output Vertex
710 * [0,62] indicating [1,63] 16B units
712 * Specifies the size of each vertex stored in the GS output entry
713 * (following any Control Header data) as a number of 128-bit units
716 * Programming Restrictions: The vertex size must be programmed as a
717 * multiple of 32B units with the following exception: Rendering is
718 * disabled (as per SOL stage state) and the vertex size output by the
721 * If rendering is enabled (as per SOL state) the vertex size must be
722 * programmed as a multiple of 32B units. In other words, the only time
723 * software can program a vertex size with an odd number of 16B units
724 * is when rendering is disabled.
726 * Note: B=bytes in the above text.
728 * It doesn't seem worth the extra trouble to optimize the case where the
729 * vertex size is 16B (especially since this would require special-casing
730 * the GEN assembly that writes to the URB). So we just set the vertex
731 * size to a multiple of 32B (2 vec4's) in all cases.
733 * The maximum output vertex size is 62*16 = 992 bytes (31 hwords). We
734 * budget that as follows:
736 * 512 bytes for varyings (a varying component is 4 bytes and
737 * gl_MaxGeometryOutputComponents = 128)
738 * 16 bytes overhead for VARYING_SLOT_PSIZ (each varying slot is 16
740 * 16 bytes overhead for gl_Position (we allocate it a slot in the VUE
741 * even if it's not used)
742 * 32 bytes overhead for gl_ClipDistance (we allocate it 2 VUE slots
743 * whenever clip planes are enabled, even if the shader doesn't
744 * write to gl_ClipDistance)
745 * 16 bytes overhead since the VUE size must be a multiple of 32 bytes
746 * (see below)--this causes up to 1 VUE slot to be wasted
747 * 400 bytes available for varying packing overhead
749 * Worst-case varying packing overhead is 3/4 of a varying slot (12 bytes)
750 * per interpolation type, so this is plenty.
753 unsigned output_vertex_size_bytes
= c
.prog_data
.base
.vue_map
.num_slots
* 16;
754 assert(brw
->gen
== 6 ||
755 output_vertex_size_bytes
<= GEN7_MAX_GS_OUTPUT_VERTEX_SIZE_BYTES
);
756 c
.prog_data
.output_vertex_size_hwords
=
757 ALIGN(output_vertex_size_bytes
, 32) / 32;
759 /* Compute URB entry size. The maximum allowed URB entry size is 32k.
760 * That divides up as follows:
762 * 64 bytes for the control data header (cut indices or StreamID bits)
763 * 4096 bytes for varyings (a varying component is 4 bytes and
764 * gl_MaxGeometryTotalOutputComponents = 1024)
765 * 4096 bytes overhead for VARYING_SLOT_PSIZ (each varying slot is 16
766 * bytes/vertex and gl_MaxGeometryOutputVertices is 256)
767 * 4096 bytes overhead for gl_Position (we allocate it a slot in the VUE
768 * even if it's not used)
769 * 8192 bytes overhead for gl_ClipDistance (we allocate it 2 VUE slots
770 * whenever clip planes are enabled, even if the shader doesn't
771 * write to gl_ClipDistance)
772 * 4096 bytes overhead since the VUE size must be a multiple of 32
773 * bytes (see above)--this causes up to 1 VUE slot to be wasted
774 * 8128 bytes available for varying packing overhead
776 * Worst-case varying packing overhead is 3/4 of a varying slot per
777 * interpolation type, which works out to 3072 bytes, so this would allow
778 * us to accommodate 2 interpolation types without any danger of running
781 * In practice, the risk of running out of URB space is very small, since
782 * the above figures are all worst-case, and most of them scale with the
783 * number of output vertices. So we'll just calculate the amount of space
784 * we need, and if it's too large, fail to compile.
786 * The above is for gen7+ where we have a single URB entry that will hold
787 * all the output. In gen6, we will have to allocate URB entries for every
788 * vertex we emit, so our URB entries only need to be large enough to hold
789 * a single vertex. Also, gen6 does not have a control data header.
791 unsigned output_size_bytes
;
794 c
.prog_data
.output_vertex_size_hwords
* 32 * gp
->program
.VerticesOut
;
795 output_size_bytes
+= 32 * c
.prog_data
.control_data_header_size_hwords
;
797 output_size_bytes
= c
.prog_data
.output_vertex_size_hwords
* 32;
800 /* Broadwell stores "Vertex Count" as a full 8 DWord (32 byte) URB output,
801 * which comes before the control header.
804 output_size_bytes
+= 32;
806 assert(output_size_bytes
>= 1);
807 int max_output_size_bytes
= GEN7_MAX_GS_URB_ENTRY_SIZE_BYTES
;
809 max_output_size_bytes
= GEN6_MAX_GS_URB_ENTRY_SIZE_BYTES
;
810 if (output_size_bytes
> max_output_size_bytes
)
814 /* URB entry sizes are stored as a multiple of 64 bytes in gen7+ and
815 * a multiple of 128 bytes in gen6.
818 c
.prog_data
.base
.urb_entry_size
= ALIGN(output_size_bytes
, 64) / 64;
820 c
.prog_data
.base
.urb_entry_size
= ALIGN(output_size_bytes
, 128) / 128;
822 /* FIXME: Need to pull this from nir shader. */
823 c
.prog_data
.output_topology
= _3DPRIM_TRISTRIP
;
825 /* The GLSL linker will have already matched up GS inputs and the outputs
826 * of prior stages. The driver does extend VS outputs in some cases, but
827 * only for legacy OpenGL or Gen4-5 hardware, neither of which offer
828 * geometry shader support. So we can safely ignore that.
830 * For SSO pipelines, we use a fixed VUE map layout based on variable
831 * locations, so we can rely on rendezvous-by-location making this work.
833 * However, we need to ignore VARYING_SLOT_PRIMITIVE_ID, as it's not
834 * written by previous stages and shows up via payload magic.
836 GLbitfield64 inputs_read
=
837 gp
->program
.Base
.InputsRead
& ~VARYING_BIT_PRIMITIVE_ID
;
838 brw_compute_vue_map(brw
->intelScreen
->devinfo
,
839 &c
.input_vue_map
, inputs_read
,
840 prog
->SeparateShader
);
842 /* GS inputs are read from the VUE 256 bits (2 vec4's) at a time, so we
843 * need to program a URB read length of ceiling(num_slots / 2).
845 c
.prog_data
.base
.urb_read_length
= (c
.input_vue_map
.num_slots
+ 1) / 2;
847 void *mem_ctx
= ralloc_context(NULL
);
848 unsigned program_size
;
849 const unsigned *program
=
850 brw_gs_emit(brw
, prog
, &c
, mem_ctx
, -1, &program_size
);
851 if (program
== NULL
) {
852 ralloc_free(mem_ctx
);
856 pipeline
->gs_vec4
= upload_kernel(pipeline
, program
, program_size
);
857 pipeline
->gs_vertex_count
= gp
->program
.VerticesIn
;
859 ralloc_free(mem_ctx
);
865 brw_codegen_cs_prog(struct brw_context
*brw
,
866 struct gl_shader_program
*prog
,
867 struct brw_compute_program
*cp
,
868 struct brw_cs_prog_key
*key
, struct anv_pipeline
*pipeline
)
870 const GLuint
*program
;
871 void *mem_ctx
= ralloc_context(NULL
);
873 struct brw_cs_prog_data
*prog_data
= &pipeline
->cs_prog_data
;
875 struct gl_shader
*cs
= prog
->_LinkedShaders
[MESA_SHADER_COMPUTE
];
878 memset(prog_data
, 0, sizeof(*prog_data
));
880 set_binding_table_layout(&prog_data
->base
, pipeline
, VK_SHADER_STAGE_COMPUTE
);
882 create_params_array(pipeline
, cs
, &prog_data
->base
);
883 anv_nir_apply_dynamic_offsets(pipeline
, cs
->Program
->nir
, &prog_data
->base
);
885 program
= brw_cs_emit(brw
, mem_ctx
, key
, prog_data
,
886 &cp
->program
, prog
, -1, &program_size
);
887 if (program
== NULL
) {
888 ralloc_free(mem_ctx
);
892 if (unlikely(INTEL_DEBUG
& DEBUG_CS
))
893 fprintf(stderr
, "\n");
895 pipeline
->cs_simd
= upload_kernel(pipeline
, program
, program_size
);
897 ralloc_free(mem_ctx
);
903 brw_cs_populate_key(struct brw_context
*brw
,
904 struct brw_compute_program
*bcp
, struct brw_cs_prog_key
*key
)
906 memset(key
, 0, sizeof(*key
));
908 /* The unique compute program ID */
909 key
->program_string_id
= bcp
->id
;
912 struct anv_compiler
{
913 struct anv_device
*device
;
914 struct intel_screen
*screen
;
915 struct brw_context
*brw
;
916 struct gl_pipeline_object pipeline
;
921 struct anv_compiler
*
922 anv_compiler_create(struct anv_device
*device
)
924 const struct brw_device_info
*devinfo
= &device
->info
;
925 struct anv_compiler
*compiler
;
926 struct gl_context
*ctx
;
928 compiler
= rzalloc(NULL
, struct anv_compiler
);
929 if (compiler
== NULL
)
932 compiler
->screen
= rzalloc(compiler
, struct intel_screen
);
933 if (compiler
->screen
== NULL
)
936 compiler
->brw
= rzalloc(compiler
, struct brw_context
);
937 if (compiler
->brw
== NULL
)
940 compiler
->device
= device
;
942 compiler
->brw
->gen
= devinfo
->gen
;
943 compiler
->brw
->is_g4x
= devinfo
->is_g4x
;
944 compiler
->brw
->is_baytrail
= devinfo
->is_baytrail
;
945 compiler
->brw
->is_haswell
= devinfo
->is_haswell
;
946 compiler
->brw
->is_cherryview
= devinfo
->is_cherryview
;
948 /* We need this at least for CS, which will check brw->max_cs_threads
949 * against the work group size. */
950 compiler
->brw
->max_vs_threads
= devinfo
->max_vs_threads
;
951 compiler
->brw
->max_hs_threads
= devinfo
->max_hs_threads
;
952 compiler
->brw
->max_ds_threads
= devinfo
->max_ds_threads
;
953 compiler
->brw
->max_gs_threads
= devinfo
->max_gs_threads
;
954 compiler
->brw
->max_wm_threads
= devinfo
->max_wm_threads
;
955 compiler
->brw
->max_cs_threads
= devinfo
->max_cs_threads
;
956 compiler
->brw
->urb
.size
= devinfo
->urb
.size
;
957 compiler
->brw
->urb
.min_vs_entries
= devinfo
->urb
.min_vs_entries
;
958 compiler
->brw
->urb
.max_vs_entries
= devinfo
->urb
.max_vs_entries
;
959 compiler
->brw
->urb
.max_hs_entries
= devinfo
->urb
.max_hs_entries
;
960 compiler
->brw
->urb
.max_ds_entries
= devinfo
->urb
.max_ds_entries
;
961 compiler
->brw
->urb
.max_gs_entries
= devinfo
->urb
.max_gs_entries
;
963 compiler
->brw
->intelScreen
= compiler
->screen
;
964 compiler
->screen
->devinfo
= &device
->info
;
966 brw_process_intel_debug_variable();
968 compiler
->screen
->compiler
= brw_compiler_create(compiler
, &device
->info
);
970 ctx
= &compiler
->brw
->ctx
;
971 _mesa_init_shader_object_functions(&ctx
->Driver
);
973 /* brw_select_clip_planes() needs this for bogus reasons. */
974 ctx
->_Shader
= &compiler
->pipeline
;
979 ralloc_free(compiler
);
984 anv_compiler_destroy(struct anv_compiler
*compiler
)
986 _mesa_free_errors_data(&compiler
->brw
->ctx
);
987 ralloc_free(compiler
);
/* From gen7_urb.c */

/* FIXME: Add to struct intel_device_info */
/* URB space reserved for push constants on gen8 (32 KiB). */
static const int gen8_push_size = 32 * 1024;
/* Partition the hardware URB between push constants, VS and GS for this
 * pipeline, writing the result into pipeline->urb.
 *
 * Strategy: give each stage its minimum allocation, then distribute any
 * remaining space in proportion to how much extra each stage could use
 * ("wants"), and finally convert chunk counts back into entry counts.
 *
 * NOTE(review): recovered from a damaged source dump — the `unsigned
 * vs_wants =` declaration, the `if (gs_present)` wrapper around the GS
 * allocation, and the `if (gs_present)` guard on the final GS assert were
 * reconstructed; confirm against upstream gen7_urb.c.
 */
static void
gen7_compute_urb_partition(struct anv_pipeline *pipeline)
{
   const struct brw_device_info *devinfo = &pipeline->device->info;
   bool vs_present = pipeline->vs_simd8 != NO_KERNEL;
   unsigned vs_size = vs_present ?
      pipeline->vs_prog_data.base.urb_entry_size : 1;
   unsigned vs_entry_size_bytes = vs_size * 64;   /* entry size is in 64B units */
   bool gs_present = pipeline->gs_vec4 != NO_KERNEL;
   unsigned gs_size = gs_present ?
      pipeline->gs_prog_data.base.urb_entry_size : 1;
   unsigned gs_entry_size_bytes = gs_size * 64;

   /* From p35 of the Ivy Bridge PRM (section 1.7.1: 3DSTATE_URB_GS):
    *
    *     VS Number of URB Entries must be divisible by 8 if the VS URB Entry
    *     Allocation Size is less than 9 512-bit URB entries.
    *
    * Similar text exists for GS.
    */
   unsigned vs_granularity = (vs_size < 9) ? 8 : 1;
   unsigned gs_granularity = (gs_size < 9) ? 8 : 1;

   /* URB allocations must be done in 8k chunks. */
   unsigned chunk_size_bytes = 8192;

   /* Determine the size of the URB in chunks. */
   unsigned urb_chunks = devinfo->urb.size * 1024 / chunk_size_bytes;

   /* Reserve space for push constants */
   unsigned push_constant_bytes = gen8_push_size;
   unsigned push_constant_chunks =
      push_constant_bytes / chunk_size_bytes;

   /* Initially, assign each stage the minimum amount of URB space it needs,
    * and make a note of how much additional space it "wants" (the amount of
    * additional space it could actually make use of).
    */

   /* VS has a lower limit on the number of URB entries */
   unsigned vs_chunks =
      ALIGN(devinfo->urb.min_vs_entries * vs_entry_size_bytes,
            chunk_size_bytes) / chunk_size_bytes;
   unsigned vs_wants =
      ALIGN(devinfo->urb.max_vs_entries * vs_entry_size_bytes,
            chunk_size_bytes) / chunk_size_bytes - vs_chunks;

   unsigned gs_chunks = 0;
   unsigned gs_wants = 0;
   if (gs_present) {
      /* There are two constraints on the minimum amount of URB space we can
       * allocate:
       *
       * (1) We need room for at least 2 URB entries, since we always operate
       * the GS in DUAL_OBJECT mode.
       *
       * (2) We can't allocate less than nr_gs_entries_granularity.
       */
      gs_chunks = ALIGN(MAX2(gs_granularity, 2) * gs_entry_size_bytes,
                        chunk_size_bytes) / chunk_size_bytes;
      gs_wants =
         ALIGN(devinfo->urb.max_gs_entries * gs_entry_size_bytes,
               chunk_size_bytes) / chunk_size_bytes - gs_chunks;
   }

   /* There should always be enough URB space to satisfy the minimum
    * requirements of each stage.
    */
   unsigned total_needs = push_constant_chunks + vs_chunks + gs_chunks;
   assert(total_needs <= urb_chunks);

   /* Mete out remaining space (if any) in proportion to "wants". */
   unsigned total_wants = vs_wants + gs_wants;
   unsigned remaining_space = urb_chunks - total_needs;
   if (remaining_space > total_wants)
      remaining_space = total_wants;
   if (remaining_space > 0) {
      /* VS gets its proportional share first; GS takes whatever is left,
       * so no chunk is wasted to rounding. */
      unsigned vs_additional = (unsigned)
         round(vs_wants * (((double) remaining_space) / total_wants));
      vs_chunks += vs_additional;
      remaining_space -= vs_additional;
      gs_chunks += remaining_space;
   }

   /* Sanity check that we haven't over-allocated. */
   assert(push_constant_chunks + vs_chunks + gs_chunks <= urb_chunks);

   /* Finally, compute the number of entries that can fit in the space
    * allocated to each stage.
    */
   unsigned nr_vs_entries = vs_chunks * chunk_size_bytes / vs_entry_size_bytes;
   unsigned nr_gs_entries = gs_chunks * chunk_size_bytes / gs_entry_size_bytes;

   /* Since we rounded up when computing *_wants, this may be slightly more
    * than the maximum allowed amount, so correct for that.
    */
   nr_vs_entries = MIN2(nr_vs_entries, devinfo->urb.max_vs_entries);
   nr_gs_entries = MIN2(nr_gs_entries, devinfo->urb.max_gs_entries);

   /* Ensure that we program a multiple of the granularity. */
   nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, vs_granularity);
   nr_gs_entries = ROUND_DOWN_TO(nr_gs_entries, gs_granularity);

   /* Finally, sanity check to make sure we have at least the minimum number
    * of entries needed for each stage.
    */
   assert(nr_vs_entries >= devinfo->urb.min_vs_entries);
   if (gs_present)
      assert(nr_gs_entries >= 2);

   /* Lay out the URB in the following order:
    * - push constants
    * - VS
    * - GS
    */
   pipeline->urb.vs_start = push_constant_chunks;
   pipeline->urb.vs_size = vs_size;
   pipeline->urb.nr_vs_entries = nr_vs_entries;

   pipeline->urb.gs_start = push_constant_chunks + vs_chunks;
   pipeline->urb.gs_size = gs_size;
   pipeline->urb.nr_gs_entries = nr_gs_entries;
}
/* Per-Vulkan-stage translation table, indexed by VK_SHADER_STAGE_*:
 * the GL shader-object token passed to brw_new_shader(), the Mesa
 * gl_shader_stage, and a human-readable name used in error messages.
 * The tessellation stages have no Mesa stage here, hence the -1
 * placeholders (they are never compiled by this file).
 *
 * NOTE(review): the field-declaration lines were lost in the source dump;
 * `token`/`stage`/`name` are grounded by the uses stage_info[stage].token /
 * .stage / .name elsewhere in this file, but the `GLenum` spelling of the
 * first field should be confirmed.
 */
static const struct {
   GLenum token;
   gl_shader_stage stage;
   const char *name;
} stage_info[] = {
   { GL_VERTEX_SHADER,          MESA_SHADER_VERTEX,    "vertex" },
   { GL_TESS_CONTROL_SHADER,    (gl_shader_stage)-1,   "tess control" },
   { GL_TESS_EVALUATION_SHADER, (gl_shader_stage)-1,   "tess evaluation" },
   { GL_GEOMETRY_SHADER,        MESA_SHADER_GEOMETRY,  "geometry" },
   { GL_FRAGMENT_SHADER,        MESA_SHADER_FRAGMENT,  "fragment" },
   { GL_COMPUTE_SHADER,         MESA_SHADER_COMPUTE,   "compute" },
};
1131 struct spirv_header
{
/* Mirror the NIR shader's input/output variables into the classic Mesa
 * gl_program bitfields (InputsRead / OutputsWritten) that the brw_*
 * backend compilers consume, and copy the resulting masks back into the
 * NIR-side shader->info.
 *
 * NOTE(review): the second-parameter line was lost in the source dump;
 * `nir_shader *shader` is inferred from the body's uses of shader->inputs,
 * shader->outputs, shader->stage, shader->info and shader->num_uniforms —
 * confirm against upstream.
 */
static void
setup_nir_io(struct gl_shader *mesa_shader,
             nir_shader *shader)
{
   struct gl_program *prog = mesa_shader->Program;

   foreach_list_typed(nir_variable, var, node, &shader->inputs) {
      prog->InputsRead |= BITFIELD64_BIT(var->data.location);
      if (shader->stage == MESA_SHADER_FRAGMENT) {
         /* Fragment inputs additionally carry interpolation qualifiers and
          * centroid/sample flags, recorded per-location. */
         struct gl_fragment_program *fprog = (struct gl_fragment_program *)prog;

         fprog->InterpQualifier[var->data.location] =
            (glsl_interp_qualifier)var->data.interpolation;
         if (var->data.centroid)
            fprog->IsCentroid |= BITFIELD64_BIT(var->data.location);
         if (var->data.sample)
            fprog->IsSample |= BITFIELD64_BIT(var->data.location);
      }
   }

   foreach_list_typed(nir_variable, var, node, &shader->outputs) {
      prog->OutputsWritten |= BITFIELD64_BIT(var->data.location);
   }

   /* Keep the NIR-side info in sync so later NIR passes see the same masks. */
   shader->info.inputs_read = prog->InputsRead;
   shader->info.outputs_written = prog->OutputsWritten;

   mesa_shader->num_uniform_components = shader->num_uniforms;
}
1167 anv_compile_shader_spirv(struct anv_compiler
*compiler
,
1168 struct gl_shader_program
*program
,
1169 struct anv_pipeline
*pipeline
, uint32_t stage
)
1171 struct brw_context
*brw
= compiler
->brw
;
1172 struct anv_shader
*shader
= pipeline
->shaders
[stage
];
1173 struct gl_shader
*mesa_shader
;
1176 mesa_shader
= brw_new_shader(&brw
->ctx
, name
, stage_info
[stage
].token
);
1177 fail_if(mesa_shader
== NULL
,
1178 "failed to create %s shader\n", stage_info
[stage
].name
);
1180 #define CREATE_PROGRAM(stage) \
1181 _mesa_init_##stage##_program(&brw->ctx, &ralloc(mesa_shader, struct brw_##stage##_program)->program, 0, 0)
1184 struct gl_program
*prog
;
1186 case VK_SHADER_STAGE_VERTEX
:
1187 prog
= CREATE_PROGRAM(vertex
);
1188 is_scalar
= compiler
->screen
->compiler
->scalar_vs
;
1190 case VK_SHADER_STAGE_GEOMETRY
:
1191 prog
= CREATE_PROGRAM(geometry
);
1194 case VK_SHADER_STAGE_FRAGMENT
:
1195 prog
= CREATE_PROGRAM(fragment
);
1198 case VK_SHADER_STAGE_COMPUTE
:
1199 prog
= CREATE_PROGRAM(compute
);
1203 unreachable("Unsupported shader stage");
1205 _mesa_reference_program(&brw
->ctx
, &mesa_shader
->Program
, prog
);
1207 mesa_shader
->Program
->Parameters
=
1208 rzalloc(mesa_shader
, struct gl_program_parameter_list
);
1210 mesa_shader
->Type
= stage_info
[stage
].token
;
1211 mesa_shader
->Stage
= stage_info
[stage
].stage
;
1213 struct gl_shader_compiler_options
*glsl_options
=
1214 &compiler
->screen
->compiler
->glsl_compiler_options
[stage_info
[stage
].stage
];
1216 if (shader
->module
->nir
) {
1217 /* Some things such as our meta clear/blit code will give us a NIR
1218 * shader directly. In that case, we just ignore the SPIR-V entirely
1219 * and just use the NIR shader */
1220 mesa_shader
->Program
->nir
= shader
->module
->nir
;
1221 mesa_shader
->Program
->nir
->options
= glsl_options
->NirOptions
;
1223 uint32_t *spirv
= (uint32_t *) shader
->module
->data
;
1224 assert(spirv
[0] == SPIR_V_MAGIC_NUMBER
);
1225 assert(shader
->module
->size
% 4 == 0);
1227 mesa_shader
->Program
->nir
=
1228 spirv_to_nir(spirv
, shader
->module
->size
/ 4,
1229 stage_info
[stage
].stage
, glsl_options
->NirOptions
);
1231 nir_validate_shader(mesa_shader
->Program
->nir
);
1233 brw_process_nir(mesa_shader
->Program
->nir
,
1234 compiler
->screen
->devinfo
,
1235 NULL
, mesa_shader
->Stage
, is_scalar
);
1237 setup_nir_io(mesa_shader
, mesa_shader
->Program
->nir
);
1239 fail_if(mesa_shader
->Program
->nir
== NULL
,
1240 "failed to translate SPIR-V to NIR\n");
1242 _mesa_reference_shader(&brw
->ctx
, &program
->Shaders
[program
->NumShaders
],
1244 program
->NumShaders
++;
/* Record a freshly-compiled stage on the pipeline: stash its prog_data
 * pointer, mark the stage active, and carve out its slice of the pipeline's
 * scratch space.
 */
static void
add_compiled_stage(struct anv_pipeline *pipeline, uint32_t stage,
                   struct brw_stage_prog_data *prog_data)
{
   struct brw_device_info *devinfo = &pipeline->device->info;
   /* Per-stage HW thread maxima: total scratch is per-thread usage times the
    * maximum number of threads the stage can have in flight.  Tessellation
    * entries are 0 (no Mesa stage mapping in this file — see stage_info). */
   uint32_t max_threads[] = {
      [VK_SHADER_STAGE_VERTEX]          = devinfo->max_vs_threads,
      [VK_SHADER_STAGE_TESS_CONTROL]    = 0,
      [VK_SHADER_STAGE_TESS_EVALUATION] = 0,
      [VK_SHADER_STAGE_GEOMETRY]        = devinfo->max_gs_threads,
      [VK_SHADER_STAGE_FRAGMENT]        = devinfo->max_wm_threads,
      [VK_SHADER_STAGE_COMPUTE]         = devinfo->max_cs_threads,
   };

   pipeline->prog_data[stage] = prog_data;
   pipeline->active_stages |= 1 << stage;
   /* NOTE(review): scratch_start records the *unaligned* running total while
    * the new total is aligned up to 1024 first — so this stage's start may
    * sit below the aligned boundary its predecessor's size was rounded to.
    * Looks like scratch_start should use the aligned value; confirm. */
   pipeline->scratch_start[stage] = pipeline->total_scratch;
   pipeline->total_scratch =
      align_u32(pipeline->total_scratch, 1024) +
      prog_data->total_scratch * max_threads[stage];
}
/* Compile every shader stage attached to `pipeline`: translate each module
 * to NIR, run the per-stage brw backend compilers, record prog_data and
 * scratch layout via add_compiled_stage(), grow the device scratch pool to
 * fit, and finally compute the URB partition.
 *
 * NOTE(review): recovered from a damaged source dump — the declarations of
 * `name`/`success`, the return type, and the final `return VK_SUCCESS;`
 * were reconstructed; confirm against upstream.
 */
int
anv_compiler_run(struct anv_compiler *compiler, struct anv_pipeline *pipeline)
{
   struct gl_shader_program *program;
   int name = 0;   /* dummy GL name for the program object */
   struct brw_context *brw = compiler->brw;
   bool success;

   pipeline->writes_point_size = false;

   /* When we free the pipeline, we detect stages based on the NULL status
    * of various prog_data pointers.  Make them NULL by default.
    */
   memset(pipeline->prog_data, 0, sizeof(pipeline->prog_data));
   memset(pipeline->scratch_start, 0, sizeof(pipeline->scratch_start));

   /* Propagate the pipeline's repclear setting into the fake brw context
    * before the backend compilers read it. */
   brw->use_rep_send = pipeline->use_repclear;
   brw->no_simd8 = pipeline->use_repclear;

   program = _mesa_new_shader_program(name);
   program->Shaders = (struct gl_shader **)
      calloc(VK_SHADER_STAGE_NUM, sizeof(struct gl_shader *));
   fail_if(program == NULL || program->Shaders == NULL,
           "failed to create program\n");

   /* First pass: translate each present stage's module to NIR and attach it
    * to the scaffolding gl_shader_program. */
   for (unsigned i = 0; i < VK_SHADER_STAGE_NUM; i++) {
      if (pipeline->shaders[i])
         anv_compile_shader_spirv(compiler, program, pipeline, i);
   }

   /* Index the compiled shaders by Mesa stage, as the backends expect. */
   for (unsigned i = 0; i < program->NumShaders; i++) {
      struct gl_shader *shader = program->Shaders[i];
      program->_LinkedShaders[shader->Stage] = shader;
   }

   pipeline->active_stages = 0;
   pipeline->total_scratch = 0;

   if (pipeline->shaders[VK_SHADER_STAGE_VERTEX]) {
      struct brw_vs_prog_key vs_key;
      struct gl_vertex_program *vp = (struct gl_vertex_program *)
         program->_LinkedShaders[MESA_SHADER_VERTEX]->Program;
      struct brw_vertex_program *bvp = brw_vertex_program(vp);

      brw_vs_populate_key(brw, bvp, &vs_key);

      success = really_do_vs_prog(brw, program, bvp, &vs_key, pipeline);
      /* NOTE(review): message says "do_wm_prog" on the VS path — looks like
       * a copy/paste slip; should probably read "do_vs_prog failed". */
      fail_if(!success, "do_wm_prog failed\n");
      add_compiled_stage(pipeline, VK_SHADER_STAGE_VERTEX,
                         &pipeline->vs_prog_data.base.base);

      if (vp->Base.OutputsWritten & VARYING_SLOT_PSIZ)
         pipeline->writes_point_size = true;
   } else {
      /* No VS: zero its prog_data and mark both kernels absent so the rest
       * of the driver (e.g. gen7_compute_urb_partition) sees a consistent
       * "not present" state. */
      memset(&pipeline->vs_prog_data, 0, sizeof(pipeline->vs_prog_data));
      pipeline->vs_simd8 = NO_KERNEL;
      pipeline->vs_vec4 = NO_KERNEL;
   }

   if (pipeline->shaders[VK_SHADER_STAGE_GEOMETRY]) {
      struct brw_gs_prog_key gs_key;
      struct gl_geometry_program *gp = (struct gl_geometry_program *)
         program->_LinkedShaders[MESA_SHADER_GEOMETRY]->Program;
      struct brw_geometry_program *bgp = brw_geometry_program(gp);

      /* NOTE(review): unlike the VS/WM/CS paths there is no *_populate_key
       * call here, so gs_key is passed uninitialized — confirm intent. */
      success = anv_codegen_gs_prog(brw, program, bgp, &gs_key, pipeline);
      fail_if(!success, "do_gs_prog failed\n");
      add_compiled_stage(pipeline, VK_SHADER_STAGE_GEOMETRY,
                         &pipeline->gs_prog_data.base.base);

      if (gp->Base.OutputsWritten & VARYING_SLOT_PSIZ)
         pipeline->writes_point_size = true;
   } else {
      pipeline->gs_vec4 = NO_KERNEL;
   }

   if (pipeline->shaders[VK_SHADER_STAGE_FRAGMENT]) {
      struct brw_wm_prog_key wm_key;
      struct gl_fragment_program *fp = (struct gl_fragment_program *)
         program->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
      struct brw_fragment_program *bfp = brw_fragment_program(fp);

      brw_wm_populate_key(brw, bfp, &wm_key);

      success = really_do_wm_prog(brw, program, bfp, &wm_key, pipeline);
      fail_if(!success, "do_wm_prog failed\n");
      add_compiled_stage(pipeline, VK_SHADER_STAGE_FRAGMENT,
                         &pipeline->wm_prog_data.base);
   }

   if (pipeline->shaders[VK_SHADER_STAGE_COMPUTE]) {
      struct brw_cs_prog_key cs_key;
      struct gl_compute_program *cp = (struct gl_compute_program *)
         program->_LinkedShaders[MESA_SHADER_COMPUTE]->Program;
      struct brw_compute_program *bcp = brw_compute_program(cp);

      brw_cs_populate_key(brw, bcp, &cs_key);

      success = brw_codegen_cs_prog(brw, program, bcp, &cs_key, pipeline);
      fail_if(!success, "brw_codegen_cs_prog failed\n");
      add_compiled_stage(pipeline, VK_SHADER_STAGE_COMPUTE,
                         &pipeline->cs_prog_data.base);
   }

   /* The gl_shader_program was only scaffolding for the backend compilers;
    * the pipeline keeps its own copies of the results. */
   _mesa_delete_shader_program(&brw->ctx, program);

   /* Grow the device's scratch pool until it covers this pipeline's total
    * scratch requirement (accumulated by add_compiled_stage above). */
   struct anv_device *device = compiler->device;
   while (device->scratch_block_pool.bo.size < pipeline->total_scratch)
      anv_block_pool_alloc(&device->scratch_block_pool);

   gen7_compute_urb_partition(pipeline);

   return VK_SUCCESS;   /* reconstructed tail — confirm against upstream */
}
1385 /* This badly named function frees the struct anv_pipeline data that the compiler
1386 * allocates. Currently just the prog_data structs.
1389 anv_compiler_free(struct anv_pipeline
*pipeline
)
1391 for (uint32_t stage
= 0; stage
< VK_SHADER_STAGE_NUM
; stage
++) {
1392 if (pipeline
->prog_data
[stage
]) {
1393 free(pipeline
->prog_data
[stage
]->map_entries
);
1394 /* We only ever set up the params array because we don't do
1395 * non-UBO pull constants
1397 anv_device_free(pipeline
->device
, pipeline
->prog_data
[stage
]->param
);