/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */
28 #include "anv_private.h"
31 #include <brw_context.h>
32 #include <brw_wm.h> /* brw_new_shader_program is here */
38 #include "brw_vec4_gs_visitor.h"
39 #include <brw_compiler.h>
41 #include <mesa/main/shaderobj.h>
42 #include <mesa/main/fbobject.h>
43 #include <mesa/main/context.h>
44 #include <mesa/program/program.h>
45 #include <glsl/program.h>
47 /* XXX: We need this to keep symbols in nir.h from conflicting with the
48 * generated GEN command packing headers. We need to fix *both* to not
49 * define something as generic as LOAD.
53 #include <glsl/nir/nir_spirv.h>
55 #define SPIR_V_MAGIC_NUMBER 0x07230203
/* Print a formatted message to stderr and abort the process, but only when
 * `cond` is non-zero; a zero condition is a no-op.  Used for unrecoverable
 * compiler-setup errors.
 */
static void
fail_if(int cond, const char *format, ...)
{
   va_list args;

   if (!cond)
      return;

   va_start(args, format);
   vfprintf(stderr, format, args);
   va_end(args);

   exit(1);
}
73 set_binding_table_layout(struct brw_stage_prog_data
*prog_data
,
74 struct anv_pipeline
*pipeline
, uint32_t stage
)
77 if (stage
== VK_SHADER_STAGE_FRAGMENT
)
82 prog_data
->binding_table
.size_bytes
= 0;
83 prog_data
->binding_table
.texture_start
= bias
;
84 prog_data
->binding_table
.ubo_start
= bias
;
85 prog_data
->binding_table
.image_start
= bias
;
91 upload_kernel(struct anv_pipeline
*pipeline
, const void *data
, size_t size
)
93 struct anv_state state
=
94 anv_state_stream_alloc(&pipeline
->program_stream
, size
, 64);
96 assert(size
< pipeline
->program_stream
.block_pool
->block_size
);
98 memcpy(state
.map
, data
, size
);
104 create_params_array(struct anv_pipeline
*pipeline
,
105 struct gl_shader
*shader
,
106 struct brw_stage_prog_data
*prog_data
)
108 VkShaderStage stage
= anv_vk_shader_stage_for_mesa_stage(shader
->Stage
);
109 unsigned num_params
= 0;
111 if (shader
->num_uniform_components
) {
112 /* If the shader uses any push constants at all, we'll just give
113 * them the maximum possible number
115 num_params
+= MAX_PUSH_CONSTANTS_SIZE
/ sizeof(float);
118 if (pipeline
->layout
&& pipeline
->layout
->stage
[stage
].has_dynamic_offsets
)
119 num_params
+= MAX_DYNAMIC_BUFFERS
;
124 prog_data
->param
= (const gl_constant_value
**)
125 anv_device_alloc(pipeline
->device
,
126 num_params
* sizeof(gl_constant_value
*),
127 8, VK_SYSTEM_ALLOC_TYPE_INTERNAL_SHADER
);
129 /* We now set the param values to be offsets into a
130 * anv_push_constant_data structure. Since the compiler doesn't
131 * actually dereference any of the gl_constant_value pointers in the
132 * params array, it doesn't really matter what we put here.
134 struct anv_push_constants
*null_data
= NULL
;
135 for (unsigned i
= 0; i
< num_params
; i
++)
136 prog_data
->param
[i
] =
137 (const gl_constant_value
*)&null_data
->client_data
[i
* sizeof(float)];
141 * Return a bitfield where bit n is set if barycentric interpolation mode n
142 * (see enum brw_wm_barycentric_interp_mode) is needed by the fragment shader.
145 brw_compute_barycentric_interp_modes(const struct brw_device_info
*devinfo
,
146 bool shade_model_flat
,
147 bool persample_shading
,
150 unsigned barycentric_interp_modes
= 0;
152 nir_foreach_variable(var
, &shader
->inputs
) {
153 enum glsl_interp_qualifier interp_qualifier
=
154 (enum glsl_interp_qualifier
) var
->data
.interpolation
;
155 bool is_centroid
= var
->data
.centroid
&& !persample_shading
;
156 bool is_sample
= var
->data
.sample
|| persample_shading
;
157 bool is_gl_Color
= (var
->data
.location
== VARYING_SLOT_COL0
) ||
158 (var
->data
.location
== VARYING_SLOT_COL1
);
160 /* Ignore WPOS and FACE, because they don't require interpolation. */
161 if (var
->data
.location
== VARYING_SLOT_POS
||
162 var
->data
.location
== VARYING_SLOT_FACE
)
165 /* Determine the set (or sets) of barycentric coordinates needed to
166 * interpolate this variable. Note that when
167 * brw->needs_unlit_centroid_workaround is set, centroid interpolation
168 * uses PIXEL interpolation for unlit pixels and CENTROID interpolation
169 * for lit pixels, so we need both sets of barycentric coordinates.
171 if (interp_qualifier
== INTERP_QUALIFIER_NOPERSPECTIVE
) {
173 barycentric_interp_modes
|=
174 1 << BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC
;
175 } else if (is_sample
) {
176 barycentric_interp_modes
|=
177 1 << BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC
;
179 if ((!is_centroid
&& !is_sample
) ||
180 devinfo
->needs_unlit_centroid_workaround
) {
181 barycentric_interp_modes
|=
182 1 << BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC
;
184 } else if (interp_qualifier
== INTERP_QUALIFIER_SMOOTH
||
185 (!(shade_model_flat
&& is_gl_Color
) &&
186 interp_qualifier
== INTERP_QUALIFIER_NONE
)) {
188 barycentric_interp_modes
|=
189 1 << BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC
;
190 } else if (is_sample
) {
191 barycentric_interp_modes
|=
192 1 << BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC
;
194 if ((!is_centroid
&& !is_sample
) ||
195 devinfo
->needs_unlit_centroid_workaround
) {
196 barycentric_interp_modes
|=
197 1 << BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC
;
202 return barycentric_interp_modes
;
206 brw_vs_populate_key(struct brw_context
*brw
,
207 struct brw_vertex_program
*vp
,
208 struct brw_vs_prog_key
*key
)
210 struct gl_context
*ctx
= &brw
->ctx
;
211 /* BRW_NEW_VERTEX_PROGRAM */
212 struct gl_program
*prog
= (struct gl_program
*) vp
;
214 memset(key
, 0, sizeof(*key
));
216 /* Just upload the program verbatim for now. Always send it all
217 * the inputs it asks for, whether they are varying or not.
219 key
->program_string_id
= vp
->id
;
223 key
->copy_edgeflag
= (ctx
->Polygon
.FrontMode
!= GL_FILL
||
224 ctx
->Polygon
.BackMode
!= GL_FILL
);
227 if (prog
->OutputsWritten
& (VARYING_BIT_COL0
| VARYING_BIT_COL1
|
228 VARYING_BIT_BFC0
| VARYING_BIT_BFC1
)) {
229 /* _NEW_LIGHT | _NEW_BUFFERS */
230 key
->clamp_vertex_color
= ctx
->Light
._ClampVertexColor
;
234 if (brw
->gen
< 6 && ctx
->Point
.PointSprite
) {
235 for (int i
= 0; i
< 8; i
++) {
236 if (ctx
->Point
.CoordReplace
[i
])
237 key
->point_coord_replace
|= (1 << i
);
243 really_do_vs_prog(struct brw_context
*brw
,
244 struct gl_shader_program
*prog
,
245 struct brw_vertex_program
*vp
,
246 struct brw_vs_prog_key
*key
, struct anv_pipeline
*pipeline
)
249 const GLuint
*program
;
250 struct brw_vs_prog_data
*prog_data
= &pipeline
->vs_prog_data
;
252 struct gl_shader
*vs
= NULL
;
255 vs
= prog
->_LinkedShaders
[MESA_SHADER_VERTEX
];
257 memset(prog_data
, 0, sizeof(*prog_data
));
259 mem_ctx
= ralloc_context(NULL
);
261 create_params_array(pipeline
, vs
, &prog_data
->base
.base
);
262 anv_nir_apply_dynamic_offsets(pipeline
, vs
->Program
->nir
,
263 &prog_data
->base
.base
);
264 anv_nir_apply_pipeline_layout(vs
->Program
->nir
, pipeline
->layout
);
266 GLbitfield64 outputs_written
= vp
->program
.Base
.OutputsWritten
;
267 prog_data
->inputs_read
= vp
->program
.Base
.InputsRead
;
269 if (key
->copy_edgeflag
) {
270 outputs_written
|= BITFIELD64_BIT(VARYING_SLOT_EDGE
);
271 prog_data
->inputs_read
|= VERT_BIT_EDGEFLAG
;
275 /* Put dummy slots into the VUE for the SF to put the replaced
276 * point sprite coords in. We shouldn't need these dummy slots,
277 * which take up precious URB space, but it would mean that the SF
278 * doesn't get nice aligned pairs of input coords into output
279 * coords, which would be a pain to handle.
281 for (int i
= 0; i
< 8; i
++) {
282 if (key
->point_coord_replace
& (1 << i
))
283 outputs_written
|= BITFIELD64_BIT(VARYING_SLOT_TEX0
+ i
);
286 /* if back colors are written, allocate slots for front colors too */
287 if (outputs_written
& BITFIELD64_BIT(VARYING_SLOT_BFC0
))
288 outputs_written
|= BITFIELD64_BIT(VARYING_SLOT_COL0
);
289 if (outputs_written
& BITFIELD64_BIT(VARYING_SLOT_BFC1
))
290 outputs_written
|= BITFIELD64_BIT(VARYING_SLOT_COL1
);
293 /* In order for legacy clipping to work, we need to populate the clip
294 * distance varying slots whenever clipping is enabled, even if the vertex
295 * shader doesn't write to gl_ClipDistance.
297 if (key
->nr_userclip_plane_consts
) {
298 outputs_written
|= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0
);
299 outputs_written
|= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1
);
302 brw_compute_vue_map(brw
->intelScreen
->devinfo
,
303 &prog_data
->base
.vue_map
, outputs_written
,
304 prog
? prog
->SeparateShader
: false);
306 set_binding_table_layout(&prog_data
->base
.base
, pipeline
,
307 VK_SHADER_STAGE_VERTEX
);
311 program
= brw_compile_vs(brw
->intelScreen
->compiler
, brw
, mem_ctx
,
312 key
, prog_data
, vs
->Program
->nir
, NULL
, false, -1,
313 &program_size
, NULL
);
314 if (program
== NULL
) {
315 ralloc_free(mem_ctx
);
319 const uint32_t offset
= upload_kernel(pipeline
, program
, program_size
);
320 if (prog_data
->base
.dispatch_mode
== DISPATCH_MODE_SIMD8
) {
321 pipeline
->vs_simd8
= offset
;
322 pipeline
->vs_vec4
= NO_KERNEL
;
324 pipeline
->vs_simd8
= NO_KERNEL
;
325 pipeline
->vs_vec4
= offset
;
328 ralloc_free(mem_ctx
);
333 void brw_wm_populate_key(struct brw_context
*brw
,
334 struct brw_fragment_program
*fp
,
335 struct brw_wm_prog_key
*key
)
337 struct gl_context
*ctx
= &brw
->ctx
;
340 bool program_uses_dfdy
= fp
->program
.UsesDFdy
;
341 struct gl_framebuffer draw_buffer
;
342 bool multisample_fbo
;
344 memset(key
, 0, sizeof(*key
));
346 for (int i
= 0; i
< MAX_SAMPLERS
; i
++) {
347 /* Assume color sampler, no swizzling. */
348 key
->tex
.swizzles
[i
] = SWIZZLE_XYZW
;
351 /* A non-zero framebuffer name indicates that the framebuffer was created by
352 * the user rather than the window system. */
353 draw_buffer
.Name
= 1;
354 draw_buffer
.Visual
.samples
= 1;
355 draw_buffer
._NumColorDrawBuffers
= 1;
356 draw_buffer
._NumColorDrawBuffers
= 1;
357 draw_buffer
.Width
= 400;
358 draw_buffer
.Height
= 400;
359 ctx
->DrawBuffer
= &draw_buffer
;
361 multisample_fbo
= ctx
->DrawBuffer
->Visual
.samples
> 1;
363 /* Build the index for table lookup
367 if (fp
->program
.UsesKill
|| ctx
->Color
.AlphaEnabled
)
368 lookup
|= IZ_PS_KILL_ALPHATEST_BIT
;
370 if (fp
->program
.Base
.OutputsWritten
& BITFIELD64_BIT(FRAG_RESULT_DEPTH
))
371 lookup
|= IZ_PS_COMPUTES_DEPTH_BIT
;
375 lookup
|= IZ_DEPTH_TEST_ENABLE_BIT
;
377 if (ctx
->Depth
.Test
&& ctx
->Depth
.Mask
) /* ?? */
378 lookup
|= IZ_DEPTH_WRITE_ENABLE_BIT
;
380 /* _NEW_STENCIL | _NEW_BUFFERS */
381 if (ctx
->Stencil
._Enabled
) {
382 lookup
|= IZ_STENCIL_TEST_ENABLE_BIT
;
384 if (ctx
->Stencil
.WriteMask
[0] ||
385 ctx
->Stencil
.WriteMask
[ctx
->Stencil
._BackFace
])
386 lookup
|= IZ_STENCIL_WRITE_ENABLE_BIT
;
388 key
->iz_lookup
= lookup
;
393 /* _NEW_LINE, _NEW_POLYGON, BRW_NEW_REDUCED_PRIMITIVE */
394 if (ctx
->Line
.SmoothFlag
) {
395 if (brw
->reduced_primitive
== GL_LINES
) {
398 else if (brw
->reduced_primitive
== GL_TRIANGLES
) {
399 if (ctx
->Polygon
.FrontMode
== GL_LINE
) {
400 line_aa
= AA_SOMETIMES
;
402 if (ctx
->Polygon
.BackMode
== GL_LINE
||
403 (ctx
->Polygon
.CullFlag
&&
404 ctx
->Polygon
.CullFaceMode
== GL_BACK
))
407 else if (ctx
->Polygon
.BackMode
== GL_LINE
) {
408 line_aa
= AA_SOMETIMES
;
410 if ((ctx
->Polygon
.CullFlag
&&
411 ctx
->Polygon
.CullFaceMode
== GL_FRONT
))
417 key
->line_aa
= line_aa
;
420 key
->high_quality_derivatives
=
421 ctx
->Hint
.FragmentShaderDerivative
== GL_NICEST
;
424 key
->stats_wm
= brw
->stats_wm
;
427 key
->flat_shade
= (ctx
->Light
.ShadeModel
== GL_FLAT
);
429 /* _NEW_FRAG_CLAMP | _NEW_BUFFERS */
430 key
->clamp_fragment_color
= ctx
->Color
._ClampFragmentColor
;
434 * Include the draw buffer origin and height so that we can calculate
435 * fragment position values relative to the bottom left of the drawable,
436 * from the incoming screen origin relative position we get as part of our
439 * This is only needed for the WM_WPOSXY opcode when the fragment program
440 * uses the gl_FragCoord input.
442 * We could avoid recompiling by including this as a constant referenced by
443 * our program, but if we were to do that it would also be nice to handle
444 * getting that constant updated at batchbuffer submit time (when we
445 * hold the lock and know where the buffer really is) rather than at emit
446 * time when we don't hold the lock and are just guessing. We could also
447 * just avoid using this as key data if the program doesn't use
450 * For DRI2 the origin_x/y will always be (0,0) but we still need the
451 * drawable height in order to invert the Y axis.
453 if (fp
->program
.Base
.InputsRead
& VARYING_BIT_POS
) {
454 key
->drawable_height
= ctx
->DrawBuffer
->Height
;
457 if ((fp
->program
.Base
.InputsRead
& VARYING_BIT_POS
) || program_uses_dfdy
) {
458 key
->render_to_fbo
= _mesa_is_user_fbo(ctx
->DrawBuffer
);
462 key
->nr_color_regions
= ctx
->DrawBuffer
->_NumColorDrawBuffers
;
464 /* _NEW_MULTISAMPLE, _NEW_COLOR, _NEW_BUFFERS */
465 key
->replicate_alpha
= ctx
->DrawBuffer
->_NumColorDrawBuffers
> 1 &&
466 (ctx
->Multisample
.SampleAlphaToCoverage
|| ctx
->Color
.AlphaEnabled
);
468 /* _NEW_BUFFERS _NEW_MULTISAMPLE */
469 /* Ignore sample qualifier while computing this flag. */
470 key
->persample_shading
=
471 _mesa_get_min_invocations_per_fragment(ctx
, &fp
->program
, true) > 1;
472 if (key
->persample_shading
)
473 key
->persample_2x
= ctx
->DrawBuffer
->Visual
.samples
== 2;
475 key
->compute_pos_offset
=
476 _mesa_get_min_invocations_per_fragment(ctx
, &fp
->program
, false) > 1 &&
477 fp
->program
.Base
.SystemValuesRead
& SYSTEM_BIT_SAMPLE_POS
;
479 key
->compute_sample_id
=
481 ctx
->Multisample
.Enabled
&&
482 (fp
->program
.Base
.SystemValuesRead
& SYSTEM_BIT_SAMPLE_ID
);
484 /* BRW_NEW_VUE_MAP_GEOM_OUT */
485 if (brw
->gen
< 6 || _mesa_bitcount_64(fp
->program
.Base
.InputsRead
&
486 BRW_FS_VARYING_INPUT_MASK
) > 16)
487 key
->input_slots_valid
= brw
->vue_map_geom_out
.slots_valid
;
490 /* _NEW_COLOR | _NEW_BUFFERS */
491 /* Pre-gen6, the hardware alpha test always used each render
492 * target's alpha to do alpha test, as opposed to render target 0's alpha
493 * like GL requires. Fix that by building the alpha test into the
494 * shader, and we'll skip enabling the fixed function alpha test.
496 if (brw
->gen
< 6 && ctx
->DrawBuffer
->_NumColorDrawBuffers
> 1 && ctx
->Color
.AlphaEnabled
) {
497 key
->alpha_test_func
= ctx
->Color
.AlphaFunc
;
498 key
->alpha_test_ref
= ctx
->Color
.AlphaRef
;
501 /* The unique fragment program ID */
502 key
->program_string_id
= fp
->id
;
504 ctx
->DrawBuffer
= NULL
;
508 computed_depth_mode(struct gl_fragment_program
*fp
)
510 if (fp
->Base
.OutputsWritten
& BITFIELD64_BIT(FRAG_RESULT_DEPTH
)) {
511 switch (fp
->FragDepthLayout
) {
512 case FRAG_DEPTH_LAYOUT_NONE
:
513 case FRAG_DEPTH_LAYOUT_ANY
:
514 return BRW_PSCDEPTH_ON
;
515 case FRAG_DEPTH_LAYOUT_GREATER
:
516 return BRW_PSCDEPTH_ON_GE
;
517 case FRAG_DEPTH_LAYOUT_LESS
:
518 return BRW_PSCDEPTH_ON_LE
;
519 case FRAG_DEPTH_LAYOUT_UNCHANGED
:
520 return BRW_PSCDEPTH_OFF
;
523 return BRW_PSCDEPTH_OFF
;
527 really_do_wm_prog(struct brw_context
*brw
,
528 struct gl_shader_program
*prog
,
529 struct brw_fragment_program
*fp
,
530 struct brw_wm_prog_key
*key
, struct anv_pipeline
*pipeline
)
532 void *mem_ctx
= ralloc_context(NULL
);
533 struct brw_wm_prog_data
*prog_data
= &pipeline
->wm_prog_data
;
534 struct gl_shader
*fs
= NULL
;
535 unsigned int program_size
;
536 const uint32_t *program
;
539 fs
= prog
->_LinkedShaders
[MESA_SHADER_FRAGMENT
];
541 memset(prog_data
, 0, sizeof(*prog_data
));
543 /* key->alpha_test_func means simulating alpha testing via discards,
544 * so the shader definitely kills pixels.
546 prog_data
->uses_kill
= fp
->program
.UsesKill
|| key
->alpha_test_func
;
548 prog_data
->computed_depth_mode
= computed_depth_mode(&fp
->program
);
550 create_params_array(pipeline
, fs
, &prog_data
->base
);
551 anv_nir_apply_dynamic_offsets(pipeline
, fs
->Program
->nir
, &prog_data
->base
);
552 anv_nir_apply_pipeline_layout(fs
->Program
->nir
, pipeline
->layout
);
554 prog_data
->barycentric_interp_modes
=
555 brw_compute_barycentric_interp_modes(brw
->intelScreen
->devinfo
,
557 key
->persample_shading
,
558 fp
->program
.Base
.nir
);
560 set_binding_table_layout(&prog_data
->base
, pipeline
,
561 VK_SHADER_STAGE_FRAGMENT
);
562 /* This needs to come after shader time and pull constant entries, but we
563 * don't have those set up now, so just put it after the layout entries.
565 prog_data
->binding_table
.render_target_start
= 0;
567 program
= brw_compile_fs(brw
->intelScreen
->compiler
, brw
, mem_ctx
, key
,
568 prog_data
, fp
->program
.Base
.nir
, fs
->Program
,
569 -1, -1, brw
->use_rep_send
, &program_size
, NULL
);
570 if (program
== NULL
) {
571 ralloc_free(mem_ctx
);
575 uint32_t offset
= upload_kernel(pipeline
, program
, program_size
);
578 pipeline
->ps_simd8
= NO_KERNEL
;
580 pipeline
->ps_simd8
= offset
;
582 if (prog_data
->no_8
|| prog_data
->prog_offset_16
) {
583 pipeline
->ps_simd16
= offset
+ prog_data
->prog_offset_16
;
585 pipeline
->ps_simd16
= NO_KERNEL
;
588 ralloc_free(mem_ctx
);
594 anv_codegen_gs_prog(struct brw_context
*brw
,
595 struct gl_shader_program
*prog
,
596 struct brw_geometry_program
*gp
,
597 struct brw_gs_prog_key
*key
,
598 struct anv_pipeline
*pipeline
)
600 struct brw_gs_compile c
;
602 memset(&c
, 0, sizeof(c
));
606 c
.prog_data
.include_primitive_id
=
607 (gp
->program
.Base
.InputsRead
& VARYING_BIT_PRIMITIVE_ID
) != 0;
609 c
.prog_data
.invocations
= gp
->program
.Invocations
;
611 set_binding_table_layout(&c
.prog_data
.base
.base
,
612 pipeline
, VK_SHADER_STAGE_GEOMETRY
);
614 /* Allocate the references to the uniforms that will end up in the
615 * prog_data associated with the compiled program, and which will be freed
616 * by the state cache.
618 * Note: param_count needs to be num_uniform_components * 4, since we add
619 * padding around uniform values below vec4 size, so the worst case is that
620 * every uniform is a float which gets padded to the size of a vec4.
622 struct gl_shader
*gs
= prog
->_LinkedShaders
[MESA_SHADER_GEOMETRY
];
623 int param_count
= gp
->program
.Base
.nir
->num_uniforms
* 4;
625 c
.prog_data
.base
.base
.param
=
626 rzalloc_array(NULL
, const gl_constant_value
*, param_count
);
627 c
.prog_data
.base
.base
.pull_param
=
628 rzalloc_array(NULL
, const gl_constant_value
*, param_count
);
629 c
.prog_data
.base
.base
.image_param
=
630 rzalloc_array(NULL
, struct brw_image_param
, gs
->NumImages
);
631 c
.prog_data
.base
.base
.nr_params
= param_count
;
632 c
.prog_data
.base
.base
.nr_image_params
= gs
->NumImages
;
634 brw_nir_setup_glsl_uniforms(gp
->program
.Base
.nir
, prog
, &gp
->program
.Base
,
635 &c
.prog_data
.base
.base
, false);
638 c
.prog_data
.static_vertex_count
= !gp
->program
.Base
.nir
? -1 :
639 nir_gs_count_vertices(gp
->program
.Base
.nir
);
643 if (gp
->program
.OutputType
== GL_POINTS
) {
644 /* When the output type is points, the geometry shader may output data
645 * to multiple streams, and EndPrimitive() has no effect. So we
646 * configure the hardware to interpret the control data as stream ID.
648 c
.prog_data
.control_data_format
= GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID
;
650 /* We only have to emit control bits if we are using streams */
651 if (prog
->Geom
.UsesStreams
)
652 c
.control_data_bits_per_vertex
= 2;
654 c
.control_data_bits_per_vertex
= 0;
656 /* When the output type is triangle_strip or line_strip, EndPrimitive()
657 * may be used to terminate the current strip and start a new one
658 * (similar to primitive restart), and outputting data to multiple
659 * streams is not supported. So we configure the hardware to interpret
660 * the control data as EndPrimitive information (a.k.a. "cut bits").
662 c
.prog_data
.control_data_format
= GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT
;
664 /* We only need to output control data if the shader actually calls
667 c
.control_data_bits_per_vertex
= gp
->program
.UsesEndPrimitive
? 1 : 0;
670 /* There are no control data bits in gen6. */
671 c
.control_data_bits_per_vertex
= 0;
673 /* If it is using transform feedback, enable it */
674 if (prog
->TransformFeedback
.NumVarying
)
675 c
.prog_data
.gen6_xfb_enabled
= true;
677 c
.prog_data
.gen6_xfb_enabled
= false;
679 c
.control_data_header_size_bits
=
680 gp
->program
.VerticesOut
* c
.control_data_bits_per_vertex
;
682 /* 1 HWORD = 32 bytes = 256 bits */
683 c
.prog_data
.control_data_header_size_hwords
=
684 ALIGN(c
.control_data_header_size_bits
, 256) / 256;
686 GLbitfield64 outputs_written
= gp
->program
.Base
.OutputsWritten
;
688 brw_compute_vue_map(brw
->intelScreen
->devinfo
,
689 &c
.prog_data
.base
.vue_map
, outputs_written
,
690 prog
? prog
->SeparateShader
: false);
692 /* Compute the output vertex size.
694 * From the Ivy Bridge PRM, Vol2 Part1 7.2.1.1 STATE_GS - Output Vertex
697 * [0,62] indicating [1,63] 16B units
699 * Specifies the size of each vertex stored in the GS output entry
700 * (following any Control Header data) as a number of 128-bit units
703 * Programming Restrictions: The vertex size must be programmed as a
704 * multiple of 32B units with the following exception: Rendering is
705 * disabled (as per SOL stage state) and the vertex size output by the
708 * If rendering is enabled (as per SOL state) the vertex size must be
709 * programmed as a multiple of 32B units. In other words, the only time
710 * software can program a vertex size with an odd number of 16B units
711 * is when rendering is disabled.
713 * Note: B=bytes in the above text.
715 * It doesn't seem worth the extra trouble to optimize the case where the
716 * vertex size is 16B (especially since this would require special-casing
717 * the GEN assembly that writes to the URB). So we just set the vertex
718 * size to a multiple of 32B (2 vec4's) in all cases.
720 * The maximum output vertex size is 62*16 = 992 bytes (31 hwords). We
721 * budget that as follows:
723 * 512 bytes for varyings (a varying component is 4 bytes and
724 * gl_MaxGeometryOutputComponents = 128)
725 * 16 bytes overhead for VARYING_SLOT_PSIZ (each varying slot is 16
727 * 16 bytes overhead for gl_Position (we allocate it a slot in the VUE
728 * even if it's not used)
729 * 32 bytes overhead for gl_ClipDistance (we allocate it 2 VUE slots
730 * whenever clip planes are enabled, even if the shader doesn't
731 * write to gl_ClipDistance)
732 * 16 bytes overhead since the VUE size must be a multiple of 32 bytes
733 * (see below)--this causes up to 1 VUE slot to be wasted
734 * 400 bytes available for varying packing overhead
736 * Worst-case varying packing overhead is 3/4 of a varying slot (12 bytes)
737 * per interpolation type, so this is plenty.
740 unsigned output_vertex_size_bytes
= c
.prog_data
.base
.vue_map
.num_slots
* 16;
741 assert(brw
->gen
== 6 ||
742 output_vertex_size_bytes
<= GEN7_MAX_GS_OUTPUT_VERTEX_SIZE_BYTES
);
743 c
.prog_data
.output_vertex_size_hwords
=
744 ALIGN(output_vertex_size_bytes
, 32) / 32;
746 /* Compute URB entry size. The maximum allowed URB entry size is 32k.
747 * That divides up as follows:
749 * 64 bytes for the control data header (cut indices or StreamID bits)
750 * 4096 bytes for varyings (a varying component is 4 bytes and
751 * gl_MaxGeometryTotalOutputComponents = 1024)
752 * 4096 bytes overhead for VARYING_SLOT_PSIZ (each varying slot is 16
753 * bytes/vertex and gl_MaxGeometryOutputVertices is 256)
754 * 4096 bytes overhead for gl_Position (we allocate it a slot in the VUE
755 * even if it's not used)
756 * 8192 bytes overhead for gl_ClipDistance (we allocate it 2 VUE slots
757 * whenever clip planes are enabled, even if the shader doesn't
758 * write to gl_ClipDistance)
759 * 4096 bytes overhead since the VUE size must be a multiple of 32
760 * bytes (see above)--this causes up to 1 VUE slot to be wasted
761 * 8128 bytes available for varying packing overhead
763 * Worst-case varying packing overhead is 3/4 of a varying slot per
764 * interpolation type, which works out to 3072 bytes, so this would allow
765 * us to accommodate 2 interpolation types without any danger of running
768 * In practice, the risk of running out of URB space is very small, since
769 * the above figures are all worst-case, and most of them scale with the
770 * number of output vertices. So we'll just calculate the amount of space
771 * we need, and if it's too large, fail to compile.
773 * The above is for gen7+ where we have a single URB entry that will hold
774 * all the output. In gen6, we will have to allocate URB entries for every
775 * vertex we emit, so our URB entries only need to be large enough to hold
776 * a single vertex. Also, gen6 does not have a control data header.
778 unsigned output_size_bytes
;
781 c
.prog_data
.output_vertex_size_hwords
* 32 * gp
->program
.VerticesOut
;
782 output_size_bytes
+= 32 * c
.prog_data
.control_data_header_size_hwords
;
784 output_size_bytes
= c
.prog_data
.output_vertex_size_hwords
* 32;
787 /* Broadwell stores "Vertex Count" as a full 8 DWord (32 byte) URB output,
788 * which comes before the control header.
791 output_size_bytes
+= 32;
793 assert(output_size_bytes
>= 1);
794 int max_output_size_bytes
= GEN7_MAX_GS_URB_ENTRY_SIZE_BYTES
;
796 max_output_size_bytes
= GEN6_MAX_GS_URB_ENTRY_SIZE_BYTES
;
797 if (output_size_bytes
> max_output_size_bytes
)
801 /* URB entry sizes are stored as a multiple of 64 bytes in gen7+ and
802 * a multiple of 128 bytes in gen6.
805 c
.prog_data
.base
.urb_entry_size
= ALIGN(output_size_bytes
, 64) / 64;
807 c
.prog_data
.base
.urb_entry_size
= ALIGN(output_size_bytes
, 128) / 128;
809 /* FIXME: Need to pull this from nir shader. */
810 c
.prog_data
.output_topology
= _3DPRIM_TRISTRIP
;
812 /* The GLSL linker will have already matched up GS inputs and the outputs
813 * of prior stages. The driver does extend VS outputs in some cases, but
814 * only for legacy OpenGL or Gen4-5 hardware, neither of which offer
815 * geometry shader support. So we can safely ignore that.
817 * For SSO pipelines, we use a fixed VUE map layout based on variable
818 * locations, so we can rely on rendezvous-by-location making this work.
820 * However, we need to ignore VARYING_SLOT_PRIMITIVE_ID, as it's not
821 * written by previous stages and shows up via payload magic.
823 GLbitfield64 inputs_read
=
824 gp
->program
.Base
.InputsRead
& ~VARYING_BIT_PRIMITIVE_ID
;
825 brw_compute_vue_map(brw
->intelScreen
->devinfo
,
826 &c
.input_vue_map
, inputs_read
,
827 prog
->SeparateShader
);
829 /* GS inputs are read from the VUE 256 bits (2 vec4's) at a time, so we
830 * need to program a URB read length of ceiling(num_slots / 2).
832 c
.prog_data
.base
.urb_read_length
= (c
.input_vue_map
.num_slots
+ 1) / 2;
834 void *mem_ctx
= ralloc_context(NULL
);
835 unsigned program_size
;
836 const unsigned *program
=
837 brw_compile_gs(brw
->intelScreen
->compiler
, brw
, &c
, gp
->program
.Base
.nir
,
838 prog
, mem_ctx
, -1, &program_size
, NULL
);
839 if (program
== NULL
) {
840 ralloc_free(mem_ctx
);
844 pipeline
->gs_vec4
= upload_kernel(pipeline
, program
, program_size
);
845 pipeline
->gs_vertex_count
= gp
->program
.VerticesIn
;
847 ralloc_free(mem_ctx
);
853 brw_codegen_cs_prog(struct brw_context
*brw
,
854 struct gl_shader_program
*prog
,
855 struct brw_compute_program
*cp
,
856 struct brw_cs_prog_key
*key
, struct anv_pipeline
*pipeline
)
858 const GLuint
*program
;
859 void *mem_ctx
= ralloc_context(NULL
);
861 struct brw_cs_prog_data
*prog_data
= &pipeline
->cs_prog_data
;
863 struct gl_shader
*cs
= prog
->_LinkedShaders
[MESA_SHADER_COMPUTE
];
866 memset(prog_data
, 0, sizeof(*prog_data
));
868 set_binding_table_layout(&prog_data
->base
, pipeline
, VK_SHADER_STAGE_COMPUTE
);
870 create_params_array(pipeline
, cs
, &prog_data
->base
);
871 anv_nir_apply_dynamic_offsets(pipeline
, cs
->Program
->nir
, &prog_data
->base
);
872 anv_nir_apply_pipeline_layout(cs
->Program
->nir
, pipeline
->layout
);
874 program
= brw_compile_cs(brw
->intelScreen
->compiler
, brw
, mem_ctx
, key
,
875 prog_data
, cs
->Program
->nir
, -1,
876 &program_size
, NULL
);
877 if (program
== NULL
) {
878 ralloc_free(mem_ctx
);
882 if (unlikely(INTEL_DEBUG
& DEBUG_CS
))
883 fprintf(stderr
, "\n");
885 pipeline
->cs_simd
= upload_kernel(pipeline
, program
, program_size
);
887 ralloc_free(mem_ctx
);
893 brw_cs_populate_key(struct brw_context
*brw
,
894 struct brw_compute_program
*bcp
, struct brw_cs_prog_key
*key
)
896 memset(key
, 0, sizeof(*key
));
898 /* The unique compute program ID */
899 key
->program_string_id
= bcp
->id
;
902 struct anv_compiler
{
903 struct anv_device
*device
;
904 struct intel_screen
*screen
;
905 struct brw_context
*brw
;
906 struct gl_pipeline_object pipeline
;
911 struct anv_compiler
*
912 anv_compiler_create(struct anv_device
*device
)
914 const struct brw_device_info
*devinfo
= &device
->info
;
915 struct anv_compiler
*compiler
;
916 struct gl_context
*ctx
;
918 compiler
= rzalloc(NULL
, struct anv_compiler
);
919 if (compiler
== NULL
)
922 compiler
->screen
= rzalloc(compiler
, struct intel_screen
);
923 if (compiler
->screen
== NULL
)
926 compiler
->brw
= rzalloc(compiler
, struct brw_context
);
927 if (compiler
->brw
== NULL
)
930 compiler
->device
= device
;
932 compiler
->brw
->gen
= devinfo
->gen
;
933 compiler
->brw
->is_g4x
= devinfo
->is_g4x
;
934 compiler
->brw
->is_baytrail
= devinfo
->is_baytrail
;
935 compiler
->brw
->is_haswell
= devinfo
->is_haswell
;
936 compiler
->brw
->is_cherryview
= devinfo
->is_cherryview
;
938 /* We need this at least for CS, which will check brw->max_cs_threads
939 * against the work group size. */
940 compiler
->brw
->max_vs_threads
= devinfo
->max_vs_threads
;
941 compiler
->brw
->max_hs_threads
= devinfo
->max_hs_threads
;
942 compiler
->brw
->max_ds_threads
= devinfo
->max_ds_threads
;
943 compiler
->brw
->max_gs_threads
= devinfo
->max_gs_threads
;
944 compiler
->brw
->max_wm_threads
= devinfo
->max_wm_threads
;
945 compiler
->brw
->max_cs_threads
= devinfo
->max_cs_threads
;
946 compiler
->brw
->urb
.size
= devinfo
->urb
.size
;
947 compiler
->brw
->urb
.min_vs_entries
= devinfo
->urb
.min_vs_entries
;
948 compiler
->brw
->urb
.max_vs_entries
= devinfo
->urb
.max_vs_entries
;
949 compiler
->brw
->urb
.max_hs_entries
= devinfo
->urb
.max_hs_entries
;
950 compiler
->brw
->urb
.max_ds_entries
= devinfo
->urb
.max_ds_entries
;
951 compiler
->brw
->urb
.max_gs_entries
= devinfo
->urb
.max_gs_entries
;
953 compiler
->brw
->intelScreen
= compiler
->screen
;
954 compiler
->screen
->devinfo
= &device
->info
;
956 brw_process_intel_debug_variable();
958 compiler
->screen
->compiler
= brw_compiler_create(compiler
, &device
->info
);
960 ctx
= &compiler
->brw
->ctx
;
961 _mesa_init_shader_object_functions(&ctx
->Driver
);
963 /* brw_select_clip_planes() needs this for bogus reasons. */
964 ctx
->_Shader
= &compiler
->pipeline
;
969 ralloc_free(compiler
);
974 anv_compiler_destroy(struct anv_compiler
*compiler
)
976 _mesa_free_errors_data(&compiler
->brw
->ctx
);
977 ralloc_free(compiler
);
/* From gen7_urb.c */

/* URB space reserved for push constants on gen8 (32 KB).
 * FIXME: Add to struct intel_device_info */
static const int gen8_push_size = 32 * 1024;
987 gen7_compute_urb_partition(struct anv_pipeline
*pipeline
)
989 const struct brw_device_info
*devinfo
= &pipeline
->device
->info
;
990 bool vs_present
= pipeline
->vs_simd8
!= NO_KERNEL
;
991 unsigned vs_size
= vs_present
? pipeline
->vs_prog_data
.base
.urb_entry_size
: 1;
992 unsigned vs_entry_size_bytes
= vs_size
* 64;
993 bool gs_present
= pipeline
->gs_vec4
!= NO_KERNEL
;
994 unsigned gs_size
= gs_present
? pipeline
->gs_prog_data
.base
.urb_entry_size
: 1;
995 unsigned gs_entry_size_bytes
= gs_size
* 64;
997 /* From p35 of the Ivy Bridge PRM (section 1.7.1: 3DSTATE_URB_GS):
999 * VS Number of URB Entries must be divisible by 8 if the VS URB Entry
1000 * Allocation Size is less than 9 512-bit URB entries.
1002 * Similar text exists for GS.
1004 unsigned vs_granularity
= (vs_size
< 9) ? 8 : 1;
1005 unsigned gs_granularity
= (gs_size
< 9) ? 8 : 1;
1007 /* URB allocations must be done in 8k chunks. */
1008 unsigned chunk_size_bytes
= 8192;
1010 /* Determine the size of the URB in chunks. */
1011 unsigned urb_chunks
= devinfo
->urb
.size
* 1024 / chunk_size_bytes
;
1013 /* Reserve space for push constants */
1014 unsigned push_constant_bytes
= gen8_push_size
;
1015 unsigned push_constant_chunks
=
1016 push_constant_bytes
/ chunk_size_bytes
;
1018 /* Initially, assign each stage the minimum amount of URB space it needs,
1019 * and make a note of how much additional space it "wants" (the amount of
1020 * additional space it could actually make use of).
1023 /* VS has a lower limit on the number of URB entries */
1024 unsigned vs_chunks
=
1025 ALIGN(devinfo
->urb
.min_vs_entries
* vs_entry_size_bytes
,
1026 chunk_size_bytes
) / chunk_size_bytes
;
1028 ALIGN(devinfo
->urb
.max_vs_entries
* vs_entry_size_bytes
,
1029 chunk_size_bytes
) / chunk_size_bytes
- vs_chunks
;
1031 unsigned gs_chunks
= 0;
1032 unsigned gs_wants
= 0;
1034 /* There are two constraints on the minimum amount of URB space we can
1037 * (1) We need room for at least 2 URB entries, since we always operate
1038 * the GS in DUAL_OBJECT mode.
1040 * (2) We can't allocate less than nr_gs_entries_granularity.
1042 gs_chunks
= ALIGN(MAX2(gs_granularity
, 2) * gs_entry_size_bytes
,
1043 chunk_size_bytes
) / chunk_size_bytes
;
1045 ALIGN(devinfo
->urb
.max_gs_entries
* gs_entry_size_bytes
,
1046 chunk_size_bytes
) / chunk_size_bytes
- gs_chunks
;
1049 /* There should always be enough URB space to satisfy the minimum
1050 * requirements of each stage.
1052 unsigned total_needs
= push_constant_chunks
+ vs_chunks
+ gs_chunks
;
1053 assert(total_needs
<= urb_chunks
);
1055 /* Mete out remaining space (if any) in proportion to "wants". */
1056 unsigned total_wants
= vs_wants
+ gs_wants
;
1057 unsigned remaining_space
= urb_chunks
- total_needs
;
1058 if (remaining_space
> total_wants
)
1059 remaining_space
= total_wants
;
1060 if (remaining_space
> 0) {
1061 unsigned vs_additional
= (unsigned)
1062 round(vs_wants
* (((double) remaining_space
) / total_wants
));
1063 vs_chunks
+= vs_additional
;
1064 remaining_space
-= vs_additional
;
1065 gs_chunks
+= remaining_space
;
1068 /* Sanity check that we haven't over-allocated. */
1069 assert(push_constant_chunks
+ vs_chunks
+ gs_chunks
<= urb_chunks
);
1071 /* Finally, compute the number of entries that can fit in the space
1072 * allocated to each stage.
1074 unsigned nr_vs_entries
= vs_chunks
* chunk_size_bytes
/ vs_entry_size_bytes
;
1075 unsigned nr_gs_entries
= gs_chunks
* chunk_size_bytes
/ gs_entry_size_bytes
;
1077 /* Since we rounded up when computing *_wants, this may be slightly more
1078 * than the maximum allowed amount, so correct for that.
1080 nr_vs_entries
= MIN2(nr_vs_entries
, devinfo
->urb
.max_vs_entries
);
1081 nr_gs_entries
= MIN2(nr_gs_entries
, devinfo
->urb
.max_gs_entries
);
1083 /* Ensure that we program a multiple of the granularity. */
1084 nr_vs_entries
= ROUND_DOWN_TO(nr_vs_entries
, vs_granularity
);
1085 nr_gs_entries
= ROUND_DOWN_TO(nr_gs_entries
, gs_granularity
);
1087 /* Finally, sanity check to make sure we have at least the minimum number
1088 * of entries needed for each stage.
1090 assert(nr_vs_entries
>= devinfo
->urb
.min_vs_entries
);
1092 assert(nr_gs_entries
>= 2);
1094 /* Lay out the URB in the following order:
1099 pipeline
->urb
.vs_start
= push_constant_chunks
;
1100 pipeline
->urb
.vs_size
= vs_size
;
1101 pipeline
->urb
.nr_vs_entries
= nr_vs_entries
;
1103 pipeline
->urb
.gs_start
= push_constant_chunks
+ vs_chunks
;
1104 pipeline
->urb
.gs_size
= gs_size
;
1105 pipeline
->urb
.nr_gs_entries
= nr_gs_entries
;
1108 static const struct {
1110 gl_shader_stage stage
;
1113 { GL_VERTEX_SHADER
, MESA_SHADER_VERTEX
, "vertex" },
1114 { GL_TESS_CONTROL_SHADER
, (gl_shader_stage
)-1,"tess control" },
1115 { GL_TESS_EVALUATION_SHADER
, (gl_shader_stage
)-1, "tess evaluation" },
1116 { GL_GEOMETRY_SHADER
, MESA_SHADER_GEOMETRY
, "geometry" },
1117 { GL_FRAGMENT_SHADER
, MESA_SHADER_FRAGMENT
, "fragment" },
1118 { GL_COMPUTE_SHADER
, MESA_SHADER_COMPUTE
, "compute" },
/* Leading words of a SPIR-V binary module.
 * NOTE(review): member list reconstructed from the SPIR-V header layout
 * (magic, version, generator magic) — confirm against the original file.
 */
struct spirv_header {
   uint32_t magic;
   uint32_t version;
   uint32_t gen_magic;
};
1128 setup_nir_io(struct gl_shader
*mesa_shader
,
1131 struct gl_program
*prog
= mesa_shader
->Program
;
1132 foreach_list_typed(nir_variable
, var
, node
, &shader
->inputs
) {
1133 prog
->InputsRead
|= BITFIELD64_BIT(var
->data
.location
);
1134 if (shader
->stage
== MESA_SHADER_FRAGMENT
) {
1135 struct gl_fragment_program
*fprog
= (struct gl_fragment_program
*)prog
;
1137 fprog
->InterpQualifier
[var
->data
.location
] =
1138 (glsl_interp_qualifier
)var
->data
.interpolation
;
1139 if (var
->data
.centroid
)
1140 fprog
->IsCentroid
|= BITFIELD64_BIT(var
->data
.location
);
1141 if (var
->data
.sample
)
1142 fprog
->IsSample
|= BITFIELD64_BIT(var
->data
.location
);
1146 foreach_list_typed(nir_variable
, var
, node
, &shader
->outputs
) {
1147 prog
->OutputsWritten
|= BITFIELD64_BIT(var
->data
.location
);
1150 shader
->info
.system_values_read
= 0;
1151 foreach_list_typed(nir_variable
, var
, node
, &shader
->system_values
) {
1152 shader
->info
.system_values_read
|= BITFIELD64_BIT(var
->data
.location
);
1155 shader
->info
.inputs_read
= prog
->InputsRead
;
1156 shader
->info
.outputs_written
= prog
->OutputsWritten
;
1160 anv_compile_shader_spirv(struct anv_compiler
*compiler
,
1161 struct gl_shader_program
*program
,
1162 struct anv_pipeline
*pipeline
, uint32_t stage
)
1164 struct brw_context
*brw
= compiler
->brw
;
1165 struct anv_shader
*shader
= pipeline
->shaders
[stage
];
1166 struct gl_shader
*mesa_shader
;
1169 mesa_shader
= brw_new_shader(&brw
->ctx
, name
, stage_info
[stage
].token
);
1170 fail_if(mesa_shader
== NULL
,
1171 "failed to create %s shader\n", stage_info
[stage
].name
);
1173 #define CREATE_PROGRAM(stage) \
1174 &ralloc(mesa_shader, struct brw_##stage##_program)->program.Base
1177 struct gl_program
*prog
;
1179 case VK_SHADER_STAGE_VERTEX
:
1180 prog
= CREATE_PROGRAM(vertex
);
1181 is_scalar
= compiler
->screen
->compiler
->scalar_vs
;
1183 case VK_SHADER_STAGE_GEOMETRY
:
1184 prog
= CREATE_PROGRAM(geometry
);
1187 case VK_SHADER_STAGE_FRAGMENT
:
1188 prog
= CREATE_PROGRAM(fragment
);
1191 case VK_SHADER_STAGE_COMPUTE
:
1192 prog
= CREATE_PROGRAM(compute
);
1196 unreachable("Unsupported shader stage");
1198 _mesa_init_gl_program(prog
, 0, 0);
1199 _mesa_reference_program(&brw
->ctx
, &mesa_shader
->Program
, prog
);
1201 mesa_shader
->Program
->Parameters
=
1202 rzalloc(mesa_shader
, struct gl_program_parameter_list
);
1204 mesa_shader
->Type
= stage_info
[stage
].token
;
1205 mesa_shader
->Stage
= stage_info
[stage
].stage
;
1207 struct gl_shader_compiler_options
*glsl_options
=
1208 &compiler
->screen
->compiler
->glsl_compiler_options
[stage_info
[stage
].stage
];
1210 if (shader
->module
->nir
) {
1211 /* Some things such as our meta clear/blit code will give us a NIR
1212 * shader directly. In that case, we just ignore the SPIR-V entirely
1213 * and just use the NIR shader */
1214 mesa_shader
->Program
->nir
= shader
->module
->nir
;
1215 mesa_shader
->Program
->nir
->options
= glsl_options
->NirOptions
;
1217 uint32_t *spirv
= (uint32_t *) shader
->module
->data
;
1218 assert(spirv
[0] == SPIR_V_MAGIC_NUMBER
);
1219 assert(shader
->module
->size
% 4 == 0);
1221 mesa_shader
->Program
->nir
=
1222 spirv_to_nir(spirv
, shader
->module
->size
/ 4,
1223 stage_info
[stage
].stage
, glsl_options
->NirOptions
);
1225 nir_validate_shader(mesa_shader
->Program
->nir
);
1227 setup_nir_io(mesa_shader
, mesa_shader
->Program
->nir
);
1229 brw_process_nir(mesa_shader
->Program
->nir
,
1230 compiler
->screen
->devinfo
,
1231 NULL
, mesa_shader
->Stage
, is_scalar
);
1233 mesa_shader
->num_uniform_components
=
1234 mesa_shader
->Program
->nir
->num_uniforms
;
1236 fail_if(mesa_shader
->Program
->nir
== NULL
,
1237 "failed to translate SPIR-V to NIR\n");
1239 _mesa_reference_shader(&brw
->ctx
, &program
->Shaders
[program
->NumShaders
],
1241 program
->NumShaders
++;
1245 add_compiled_stage(struct anv_pipeline
*pipeline
, uint32_t stage
,
1246 struct brw_stage_prog_data
*prog_data
)
1248 struct brw_device_info
*devinfo
= &pipeline
->device
->info
;
1249 uint32_t max_threads
[] = {
1250 [VK_SHADER_STAGE_VERTEX
] = devinfo
->max_vs_threads
,
1251 [VK_SHADER_STAGE_TESS_CONTROL
] = 0,
1252 [VK_SHADER_STAGE_TESS_EVALUATION
] = 0,
1253 [VK_SHADER_STAGE_GEOMETRY
] = devinfo
->max_gs_threads
,
1254 [VK_SHADER_STAGE_FRAGMENT
] = devinfo
->max_wm_threads
,
1255 [VK_SHADER_STAGE_COMPUTE
] = devinfo
->max_cs_threads
,
1258 pipeline
->prog_data
[stage
] = prog_data
;
1259 pipeline
->active_stages
|= 1 << stage
;
1260 pipeline
->scratch_start
[stage
] = pipeline
->total_scratch
;
1261 pipeline
->total_scratch
=
1262 align_u32(pipeline
->total_scratch
, 1024) +
1263 prog_data
->total_scratch
* max_threads
[stage
];
1267 anv_compiler_run(struct anv_compiler
*compiler
, struct anv_pipeline
*pipeline
)
1269 struct gl_shader_program
*program
;
1271 struct brw_context
*brw
= compiler
->brw
;
1273 pipeline
->writes_point_size
= false;
1275 /* When we free the pipeline, we detect stages based on the NULL status
1276 * of various prog_data pointers. Make them NULL by default.
1278 memset(pipeline
->prog_data
, 0, sizeof(pipeline
->prog_data
));
1279 memset(pipeline
->scratch_start
, 0, sizeof(pipeline
->scratch_start
));
1281 brw
->use_rep_send
= pipeline
->use_repclear
;
1282 brw
->no_simd8
= pipeline
->use_repclear
;
1284 program
= _mesa_new_shader_program(name
);
1285 program
->Shaders
= (struct gl_shader
**)
1286 calloc(VK_SHADER_STAGE_NUM
, sizeof(struct gl_shader
*));
1287 fail_if(program
== NULL
|| program
->Shaders
== NULL
,
1288 "failed to create program\n");
1290 for (unsigned i
= 0; i
< VK_SHADER_STAGE_NUM
; i
++) {
1291 if (pipeline
->shaders
[i
])
1292 anv_compile_shader_spirv(compiler
, program
, pipeline
, i
);
1295 for (unsigned i
= 0; i
< program
->NumShaders
; i
++) {
1296 struct gl_shader
*shader
= program
->Shaders
[i
];
1297 program
->_LinkedShaders
[shader
->Stage
] = shader
;
1301 pipeline
->active_stages
= 0;
1302 pipeline
->total_scratch
= 0;
1304 if (pipeline
->shaders
[VK_SHADER_STAGE_VERTEX
]) {
1305 struct brw_vs_prog_key vs_key
;
1306 struct gl_vertex_program
*vp
= (struct gl_vertex_program
*)
1307 program
->_LinkedShaders
[MESA_SHADER_VERTEX
]->Program
;
1308 struct brw_vertex_program
*bvp
= brw_vertex_program(vp
);
1310 brw_vs_populate_key(brw
, bvp
, &vs_key
);
1312 success
= really_do_vs_prog(brw
, program
, bvp
, &vs_key
, pipeline
);
1313 fail_if(!success
, "do_wm_prog failed\n");
1314 add_compiled_stage(pipeline
, VK_SHADER_STAGE_VERTEX
,
1315 &pipeline
->vs_prog_data
.base
.base
);
1317 if (vp
->Base
.OutputsWritten
& VARYING_SLOT_PSIZ
)
1318 pipeline
->writes_point_size
= true;
1320 memset(&pipeline
->vs_prog_data
, 0, sizeof(pipeline
->vs_prog_data
));
1321 pipeline
->vs_simd8
= NO_KERNEL
;
1322 pipeline
->vs_vec4
= NO_KERNEL
;
1326 if (pipeline
->shaders
[VK_SHADER_STAGE_GEOMETRY
]) {
1327 struct brw_gs_prog_key gs_key
;
1328 struct gl_geometry_program
*gp
= (struct gl_geometry_program
*)
1329 program
->_LinkedShaders
[MESA_SHADER_GEOMETRY
]->Program
;
1330 struct brw_geometry_program
*bgp
= brw_geometry_program(gp
);
1332 success
= anv_codegen_gs_prog(brw
, program
, bgp
, &gs_key
, pipeline
);
1333 fail_if(!success
, "do_gs_prog failed\n");
1334 add_compiled_stage(pipeline
, VK_SHADER_STAGE_GEOMETRY
,
1335 &pipeline
->gs_prog_data
.base
.base
);
1337 if (gp
->Base
.OutputsWritten
& VARYING_SLOT_PSIZ
)
1338 pipeline
->writes_point_size
= true;
1340 pipeline
->gs_vec4
= NO_KERNEL
;
1343 if (pipeline
->shaders
[VK_SHADER_STAGE_FRAGMENT
]) {
1344 struct brw_wm_prog_key wm_key
;
1345 struct gl_fragment_program
*fp
= (struct gl_fragment_program
*)
1346 program
->_LinkedShaders
[MESA_SHADER_FRAGMENT
]->Program
;
1347 struct brw_fragment_program
*bfp
= brw_fragment_program(fp
);
1349 brw_wm_populate_key(brw
, bfp
, &wm_key
);
1351 success
= really_do_wm_prog(brw
, program
, bfp
, &wm_key
, pipeline
);
1352 fail_if(!success
, "do_wm_prog failed\n");
1353 add_compiled_stage(pipeline
, VK_SHADER_STAGE_FRAGMENT
,
1354 &pipeline
->wm_prog_data
.base
);
1357 if (pipeline
->shaders
[VK_SHADER_STAGE_COMPUTE
]) {
1358 struct brw_cs_prog_key cs_key
;
1359 struct gl_compute_program
*cp
= (struct gl_compute_program
*)
1360 program
->_LinkedShaders
[MESA_SHADER_COMPUTE
]->Program
;
1361 struct brw_compute_program
*bcp
= brw_compute_program(cp
);
1363 brw_cs_populate_key(brw
, bcp
, &cs_key
);
1365 success
= brw_codegen_cs_prog(brw
, program
, bcp
, &cs_key
, pipeline
);
1366 fail_if(!success
, "brw_codegen_cs_prog failed\n");
1367 add_compiled_stage(pipeline
, VK_SHADER_STAGE_COMPUTE
,
1368 &pipeline
->cs_prog_data
.base
);
1371 _mesa_delete_shader_program(&brw
->ctx
, program
);
1373 struct anv_device
*device
= compiler
->device
;
1374 while (device
->scratch_block_pool
.bo
.size
< pipeline
->total_scratch
)
1375 anv_block_pool_alloc(&device
->scratch_block_pool
);
1377 gen7_compute_urb_partition(pipeline
);
/* This badly-named function frees the struct anv_pipeline data that the
 * compiler allocates.  Currently just the prog_data structs.
 */
1386 anv_compiler_free(struct anv_pipeline
*pipeline
)
1388 for (uint32_t stage
= 0; stage
< VK_SHADER_STAGE_NUM
; stage
++) {
1389 if (pipeline
->prog_data
[stage
]) {
1390 /* We only ever set up the params array because we don't do
1391 * non-UBO pull constants
1393 anv_device_free(pipeline
->device
, pipeline
->prog_data
[stage
]->param
);