2 * Copyright © 2017 Intel Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
26 #include "common/gen_device_info.h"
27 #include "genxml/gen_macros.h"
29 #include "brw_context.h"
31 #include "brw_defines.h"
33 #include "brw_state.h"
37 #include "intel_batchbuffer.h"
38 #include "intel_buffer_objects.h"
39 #include "intel_fbo.h"
41 #include "main/enums.h"
42 #include "main/fbobject.h"
43 #include "main/framebuffer.h"
44 #include "main/glformats.h"
45 #include "main/stencil.h"
46 #include "main/transformfeedback.h"
47 #include "main/viewport.h"
50 emit_dwords(struct brw_context
*brw
, unsigned n
)
52 intel_batchbuffer_begin(brw
, n
, RENDER_RING
);
53 uint32_t *map
= brw
->batch
.map_next
;
54 brw
->batch
.map_next
+= n
;
55 intel_batchbuffer_advance(brw
);
/* A buffer-object-relative address, used as the genxml `__gen_address_type`.
 * `bo == NULL` means `offset` is an absolute value rather than a relocation
 * target (see __gen_combine_address / emit_reloc below).
 */
struct brw_address {
   struct brw_bo *bo;           /* target BO, or NULL for a plain offset */
   uint32_t read_domains;       /* I915_GEM_DOMAIN_* read domains */
   uint32_t write_domain;       /* I915_GEM_DOMAIN_* write domain (0 = read-only) */
   uint32_t offset;             /* byte offset within the BO */
};
67 emit_reloc(struct brw_context
*brw
,
68 void *location
, struct brw_address address
, uint32_t delta
)
70 uint32_t offset
= (char *) location
- (char *) brw
->batch
.map
;
72 return brw_emit_reloc(&brw
->batch
, offset
, address
.bo
,
73 address
.offset
+ delta
,
75 address
.write_domain
);
78 #define __gen_address_type struct brw_address
79 #define __gen_user_data struct brw_context
82 __gen_combine_address(struct brw_context
*brw
, void *location
,
83 struct brw_address address
, uint32_t delta
)
85 if (address
.bo
== NULL
) {
86 return address
.offset
+ delta
;
88 return emit_reloc(brw
, location
, address
, delta
);
92 static inline struct brw_address
93 render_bo(struct brw_bo
*bo
, uint32_t offset
)
95 return (struct brw_address
) {
98 .read_domains
= I915_GEM_DOMAIN_RENDER
,
99 .write_domain
= I915_GEM_DOMAIN_RENDER
,
103 static inline struct brw_address
104 instruction_bo(struct brw_bo
*bo
, uint32_t offset
)
106 return (struct brw_address
) {
109 .read_domains
= I915_GEM_DOMAIN_INSTRUCTION
,
110 .write_domain
= I915_GEM_DOMAIN_INSTRUCTION
,
114 #include "genxml/genX_pack.h"
/* Helpers that splice a command struct name into its genxml-generated
 * length/header/pack symbols.
 */
#define _brw_cmd_length(cmd) cmd ## _length
#define _brw_cmd_length_bias(cmd) cmd ## _length_bias
#define _brw_cmd_header(cmd) cmd ## _header
#define _brw_cmd_pack(cmd) cmd ## _pack

/* Emit a fixed-length command: declares `name` initialized with the command
 * header, runs the attached statement-body once to fill in the fields, then
 * packs it into freshly reserved batch dwords.
 */
#define brw_batch_emit(brw, cmd, name)                  \
   for (struct cmd name = { _brw_cmd_header(cmd) },     \
        *_dst = emit_dwords(brw, _brw_cmd_length(cmd)); \
        __builtin_expect(_dst != NULL, 1);              \
        _brw_cmd_pack(cmd)(brw, (void *)_dst, &name),   \
        _dst = NULL)

/* Emit a variable-length command of `n` dwords; extra designated initializers
 * may be supplied via __VA_ARGS__.  Evaluates to a pointer just past the
 * packed header so the caller can fill in the trailing payload.
 */
#define brw_batch_emitn(brw, cmd, n, ...) ({           \
      uint32_t *_dw = emit_dwords(brw, n);             \
      struct cmd template = {                          \
         _brw_cmd_header(cmd),                         \
         .DWordLength = n - _brw_cmd_length_bias(cmd), \
         __VA_ARGS__                                   \
      };                                               \
      _brw_cmd_pack(cmd)(brw, _dw, &template);         \
      _dw + 1; /* Array starts at dw[1] */             \
   })

/* Like brw_batch_emit, but packs into the dynamic state buffer (via
 * brw_state_batch) instead of the command batch; `offset` receives the
 * state-buffer offset of the packed structure.
 */
#define brw_state_emit(brw, cmd, align, offset, name)              \
   for (struct cmd name = { 0, },                                  \
        *_dst = brw_state_batch(brw, _brw_cmd_length(cmd) * 4,     \
                                align, offset);                    \
        __builtin_expect(_dst != NULL, 1);                         \
        _brw_cmd_pack(cmd)(brw, (void *)_dst, &name),              \
        _dst = NULL)
149 * Determine the appropriate attribute override value to store into the
150 * 3DSTATE_SF structure for a given fragment shader attribute. The attribute
151 * override value contains two pieces of information: the location of the
152 * attribute in the VUE (relative to urb_entry_read_offset, see below), and a
153 * flag indicating whether to "swizzle" the attribute based on the direction
154 * the triangle is facing.
156 * If an attribute is "swizzled", then the given VUE location is used for
157 * front-facing triangles, and the VUE location that immediately follows is
158 * used for back-facing triangles. We use this to implement the mapping from
159 * gl_FrontColor/gl_BackColor to gl_Color.
161 * urb_entry_read_offset is the offset into the VUE at which the SF unit is
162 * being instructed to begin reading attribute data. It can be set to a
163 * nonzero value to prevent the SF unit from wasting time reading elements of
164 * the VUE that are not needed by the fragment shader. It is measured in
165 * 256-bit increments.
168 genX(get_attr_override
)(struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL
) *attr
,
169 const struct brw_vue_map
*vue_map
,
170 int urb_entry_read_offset
, int fs_attr
,
171 bool two_side_color
, uint32_t *max_source_attr
)
173 /* Find the VUE slot for this attribute. */
174 int slot
= vue_map
->varying_to_slot
[fs_attr
];
176 /* Viewport and Layer are stored in the VUE header. We need to override
177 * them to zero if earlier stages didn't write them, as GL requires that
178 * they read back as zero when not explicitly set.
180 if (fs_attr
== VARYING_SLOT_VIEWPORT
|| fs_attr
== VARYING_SLOT_LAYER
) {
181 attr
->ComponentOverrideX
= true;
182 attr
->ComponentOverrideW
= true;
183 attr
->ConstantSource
= CONST_0000
;
185 if (!(vue_map
->slots_valid
& VARYING_BIT_LAYER
))
186 attr
->ComponentOverrideY
= true;
187 if (!(vue_map
->slots_valid
& VARYING_BIT_VIEWPORT
))
188 attr
->ComponentOverrideZ
= true;
193 /* If there was only a back color written but not front, use back
194 * as the color instead of undefined
196 if (slot
== -1 && fs_attr
== VARYING_SLOT_COL0
)
197 slot
= vue_map
->varying_to_slot
[VARYING_SLOT_BFC0
];
198 if (slot
== -1 && fs_attr
== VARYING_SLOT_COL1
)
199 slot
= vue_map
->varying_to_slot
[VARYING_SLOT_BFC1
];
202 /* This attribute does not exist in the VUE--that means that the vertex
203 * shader did not write to it. This means that either:
205 * (a) This attribute is a texture coordinate, and it is going to be
206 * replaced with point coordinates (as a consequence of a call to
207 * glTexEnvi(GL_POINT_SPRITE, GL_COORD_REPLACE, GL_TRUE)), so the
208 * hardware will ignore whatever attribute override we supply.
210 * (b) This attribute is read by the fragment shader but not written by
211 * the vertex shader, so its value is undefined. Therefore the
212 * attribute override we supply doesn't matter.
214 * (c) This attribute is gl_PrimitiveID, and it wasn't written by the
215 * previous shader stage.
217 * Note that we don't have to worry about the cases where the attribute
218 * is gl_PointCoord or is undergoing point sprite coordinate
219 * replacement, because in those cases, this function isn't called.
221 * In case (c), we need to program the attribute overrides so that the
222 * primitive ID will be stored in this slot. In every other case, the
223 * attribute override we supply doesn't matter. So just go ahead and
224 * program primitive ID in every case.
226 attr
->ComponentOverrideW
= true;
227 attr
->ComponentOverrideX
= true;
228 attr
->ComponentOverrideY
= true;
229 attr
->ComponentOverrideZ
= true;
230 attr
->ConstantSource
= PRIM_ID
;
234 /* Compute the location of the attribute relative to urb_entry_read_offset.
235 * Each increment of urb_entry_read_offset represents a 256-bit value, so
236 * it counts for two 128-bit VUE slots.
238 int source_attr
= slot
- 2 * urb_entry_read_offset
;
239 assert(source_attr
>= 0 && source_attr
< 32);
241 /* If we are doing two-sided color, and the VUE slot following this one
242 * represents a back-facing color, then we need to instruct the SF unit to
243 * do back-facing swizzling.
245 bool swizzling
= two_side_color
&&
246 ((vue_map
->slot_to_varying
[slot
] == VARYING_SLOT_COL0
&&
247 vue_map
->slot_to_varying
[slot
+1] == VARYING_SLOT_BFC0
) ||
248 (vue_map
->slot_to_varying
[slot
] == VARYING_SLOT_COL1
&&
249 vue_map
->slot_to_varying
[slot
+1] == VARYING_SLOT_BFC1
));
251 /* Update max_source_attr. If swizzling, the SF will read this slot + 1. */
252 if (*max_source_attr
< source_attr
+ swizzling
)
253 *max_source_attr
= source_attr
+ swizzling
;
255 attr
->SourceAttribute
= source_attr
;
257 attr
->SwizzleSelect
= INPUTATTR_FACING
;
262 genX(calculate_attr_overrides
)(const struct brw_context
*brw
,
263 struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL
) *attr_overrides
,
264 uint32_t *point_sprite_enables
,
265 uint32_t *urb_entry_read_length
,
266 uint32_t *urb_entry_read_offset
)
268 const struct gl_context
*ctx
= &brw
->ctx
;
271 const struct gl_point_attrib
*point
= &ctx
->Point
;
273 /* BRW_NEW_FS_PROG_DATA */
274 const struct brw_wm_prog_data
*wm_prog_data
=
275 brw_wm_prog_data(brw
->wm
.base
.prog_data
);
276 uint32_t max_source_attr
= 0;
278 *point_sprite_enables
= 0;
280 /* BRW_NEW_FRAGMENT_PROGRAM
282 * If the fragment shader reads VARYING_SLOT_LAYER, then we need to pass in
283 * the full vertex header. Otherwise, we can program the SF to start
284 * reading at an offset of 1 (2 varying slots) to skip unnecessary data:
285 * - VARYING_SLOT_PSIZ and BRW_VARYING_SLOT_NDC on gen4-5
286 * - VARYING_SLOT_{PSIZ,LAYER} and VARYING_SLOT_POS on gen6+
289 bool fs_needs_vue_header
= brw
->fragment_program
->info
.inputs_read
&
290 (VARYING_BIT_LAYER
| VARYING_BIT_VIEWPORT
);
292 *urb_entry_read_offset
= fs_needs_vue_header
? 0 : 1;
294 /* From the Ivybridge PRM, Vol 2 Part 1, 3DSTATE_SBE,
295 * description of dw10 Point Sprite Texture Coordinate Enable:
297 * "This field must be programmed to zero when non-point primitives
300 * The SandyBridge PRM doesn't explicitly say that point sprite enables
301 * must be programmed to zero when rendering non-point primitives, but
302 * the IvyBridge PRM does, and if we don't, we get garbage.
304 * This is not required on Haswell, as the hardware ignores this state
305 * when drawing non-points -- although we do still need to be careful to
306 * correctly set the attr overrides.
309 * BRW_NEW_PRIMITIVE | BRW_NEW_GS_PROG_DATA | BRW_NEW_TES_PROG_DATA
311 bool drawing_points
= brw_is_drawing_points(brw
);
313 for (int attr
= 0; attr
< VARYING_SLOT_MAX
; attr
++) {
314 int input_index
= wm_prog_data
->urb_setup
[attr
];
320 bool point_sprite
= false;
321 if (drawing_points
) {
322 if (point
->PointSprite
&&
323 (attr
>= VARYING_SLOT_TEX0
&& attr
<= VARYING_SLOT_TEX7
) &&
324 (point
->CoordReplace
& (1u << (attr
- VARYING_SLOT_TEX0
)))) {
328 if (attr
== VARYING_SLOT_PNTC
)
332 *point_sprite_enables
|= (1 << input_index
);
335 /* BRW_NEW_VUE_MAP_GEOM_OUT | _NEW_LIGHT | _NEW_PROGRAM */
336 struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL
) attribute
= { 0 };
339 genX(get_attr_override
)(&attribute
,
340 &brw
->vue_map_geom_out
,
341 *urb_entry_read_offset
, attr
,
342 brw
->ctx
.VertexProgram
._TwoSideEnabled
,
346 /* The hardware can only do the overrides on 16 overrides at a
347 * time, and the other up to 16 have to be lined up so that the
348 * input index = the output index. We'll need to do some
349 * tweaking to make sure that's the case.
351 if (input_index
< 16)
352 attr_overrides
[input_index
] = attribute
;
354 assert(attribute
.SourceAttribute
== input_index
);
357 /* From the Sandy Bridge PRM, Volume 2, Part 1, documentation for
358 * 3DSTATE_SF DWord 1 bits 15:11, "Vertex URB Entry Read Length":
360 * "This field should be set to the minimum length required to read the
361 * maximum source attribute. The maximum source attribute is indicated
362 * by the maximum value of the enabled Attribute # Source Attribute if
363 * Attribute Swizzle Enable is set, Number of Output Attributes-1 if
365 * read_length = ceiling((max_source_attr + 1) / 2)
367 * [errata] Corruption/Hang possible if length programmed larger than
370 * Similar text exists for Ivy Bridge.
372 *urb_entry_read_length
= DIV_ROUND_UP(max_source_attr
+ 1, 2);
375 /* ---------------------------------------------------------------------- */
378 genX(upload_depth_stencil_state
)(struct brw_context
*brw
)
380 struct gl_context
*ctx
= &brw
->ctx
;
383 struct intel_renderbuffer
*depth_irb
=
384 intel_get_renderbuffer(ctx
->DrawBuffer
, BUFFER_DEPTH
);
387 struct gl_depthbuffer_attrib
*depth
= &ctx
->Depth
;
390 struct gl_stencil_attrib
*stencil
= &ctx
->Stencil
;
391 const int b
= stencil
->_BackFace
;
394 brw_batch_emit(brw
, GENX(3DSTATE_WM_DEPTH_STENCIL
), wmds
) {
397 brw_state_emit(brw
, GENX(DEPTH_STENCIL_STATE
), 64, &ds_offset
, wmds
) {
399 if (depth
->Test
&& depth_irb
) {
400 wmds
.DepthTestEnable
= true;
401 wmds
.DepthBufferWriteEnable
= brw_depth_writes_enabled(brw
);
402 wmds
.DepthTestFunction
= intel_translate_compare_func(depth
->Func
);
405 if (stencil
->_Enabled
) {
406 wmds
.StencilTestEnable
= true;
407 wmds
.StencilWriteMask
= stencil
->WriteMask
[0] & 0xff;
408 wmds
.StencilTestMask
= stencil
->ValueMask
[0] & 0xff;
410 wmds
.StencilTestFunction
=
411 intel_translate_compare_func(stencil
->Function
[0]);
413 intel_translate_stencil_op(stencil
->FailFunc
[0]);
414 wmds
.StencilPassDepthPassOp
=
415 intel_translate_stencil_op(stencil
->ZPassFunc
[0]);
416 wmds
.StencilPassDepthFailOp
=
417 intel_translate_stencil_op(stencil
->ZFailFunc
[0]);
419 wmds
.StencilBufferWriteEnable
= stencil
->_WriteEnabled
;
421 if (stencil
->_TestTwoSide
) {
422 wmds
.DoubleSidedStencilEnable
= true;
423 wmds
.BackfaceStencilWriteMask
= stencil
->WriteMask
[b
] & 0xff;
424 wmds
.BackfaceStencilTestMask
= stencil
->ValueMask
[b
] & 0xff;
426 wmds
.BackfaceStencilTestFunction
=
427 intel_translate_compare_func(stencil
->Function
[b
]);
428 wmds
.BackfaceStencilFailOp
=
429 intel_translate_stencil_op(stencil
->FailFunc
[b
]);
430 wmds
.BackfaceStencilPassDepthPassOp
=
431 intel_translate_stencil_op(stencil
->ZPassFunc
[b
]);
432 wmds
.BackfaceStencilPassDepthFailOp
=
433 intel_translate_stencil_op(stencil
->ZFailFunc
[b
]);
437 wmds
.StencilReferenceValue
= _mesa_get_stencil_ref(ctx
, 0);
438 wmds
.BackfaceStencilReferenceValue
= _mesa_get_stencil_ref(ctx
, b
);
444 brw_batch_emit(brw
, GENX(3DSTATE_CC_STATE_POINTERS
), ptr
) {
445 ptr
.PointertoDEPTH_STENCIL_STATE
= ds_offset
;
446 ptr
.DEPTH_STENCIL_STATEChange
= true;
449 brw_batch_emit(brw
, GENX(3DSTATE_DEPTH_STENCIL_STATE_POINTERS
), ptr
) {
450 ptr
.PointertoDEPTH_STENCIL_STATE
= ds_offset
;
455 static const struct brw_tracked_state
genX(depth_stencil_state
) = {
457 .mesa
= _NEW_BUFFERS
|
460 .brw
= BRW_NEW_BLORP
|
461 (GEN_GEN
>= 8 ? BRW_NEW_CONTEXT
463 BRW_NEW_STATE_BASE_ADDRESS
),
465 .emit
= genX(upload_depth_stencil_state
),
468 /* ---------------------------------------------------------------------- */
471 genX(upload_clip_state
)(struct brw_context
*brw
)
473 struct gl_context
*ctx
= &brw
->ctx
;
476 struct gl_framebuffer
*fb
= ctx
->DrawBuffer
;
478 /* BRW_NEW_FS_PROG_DATA */
479 struct brw_wm_prog_data
*wm_prog_data
=
480 brw_wm_prog_data(brw
->wm
.base
.prog_data
);
482 brw_batch_emit(brw
, GENX(3DSTATE_CLIP
), clip
) {
483 clip
.StatisticsEnable
= !brw
->meta_in_progress
;
485 if (wm_prog_data
->barycentric_interp_modes
&
486 BRW_BARYCENTRIC_NONPERSPECTIVE_BITS
)
487 clip
.NonPerspectiveBarycentricEnable
= true;
490 clip
.EarlyCullEnable
= true;
494 clip
.FrontWinding
= ctx
->Polygon
._FrontBit
== _mesa_is_user_fbo(fb
);
496 if (ctx
->Polygon
.CullFlag
) {
497 switch (ctx
->Polygon
.CullFaceMode
) {
499 clip
.CullMode
= CULLMODE_FRONT
;
502 clip
.CullMode
= CULLMODE_BACK
;
504 case GL_FRONT_AND_BACK
:
505 clip
.CullMode
= CULLMODE_BOTH
;
508 unreachable("Should not get here: invalid CullFlag");
511 clip
.CullMode
= CULLMODE_NONE
;
516 clip
.UserClipDistanceCullTestEnableBitmask
=
517 brw_vue_prog_data(brw
->vs
.base
.prog_data
)->cull_distance_mask
;
519 clip
.ViewportZClipTestEnable
= !ctx
->Transform
.DepthClamp
;
523 if (ctx
->Light
.ProvokingVertex
== GL_FIRST_VERTEX_CONVENTION
) {
524 clip
.TriangleStripListProvokingVertexSelect
= 0;
525 clip
.TriangleFanProvokingVertexSelect
= 1;
526 clip
.LineStripListProvokingVertexSelect
= 0;
528 clip
.TriangleStripListProvokingVertexSelect
= 2;
529 clip
.TriangleFanProvokingVertexSelect
= 2;
530 clip
.LineStripListProvokingVertexSelect
= 1;
534 clip
.UserClipDistanceClipTestEnableBitmask
=
535 ctx
->Transform
.ClipPlanesEnabled
;
538 clip
.ForceUserClipDistanceClipTestEnableBitmask
= true;
541 if (ctx
->Transform
.ClipDepthMode
== GL_ZERO_TO_ONE
)
542 clip
.APIMode
= APIMODE_D3D
;
544 clip
.APIMode
= APIMODE_OGL
;
546 clip
.GuardbandClipTestEnable
= true;
548 /* BRW_NEW_VIEWPORT_COUNT */
549 const unsigned viewport_count
= brw
->clip
.viewport_count
;
551 if (ctx
->RasterDiscard
) {
552 clip
.ClipMode
= CLIPMODE_REJECT_ALL
;
554 perf_debug("Rasterizer discard is currently implemented via the "
555 "clipper; having the GS not write primitives would "
556 "likely be faster.\n");
559 clip
.ClipMode
= CLIPMODE_NORMAL
;
562 clip
.ClipEnable
= brw
->primitive
!= _3DPRIM_RECTLIST
;
565 * BRW_NEW_GEOMETRY_PROGRAM | BRW_NEW_TES_PROG_DATA | BRW_NEW_PRIMITIVE
567 if (!brw_is_drawing_points(brw
) && !brw_is_drawing_lines(brw
))
568 clip
.ViewportXYClipTestEnable
= true;
570 clip
.MinimumPointWidth
= 0.125;
571 clip
.MaximumPointWidth
= 255.875;
572 clip
.MaximumVPIndex
= viewport_count
- 1;
573 if (_mesa_geometric_layers(fb
) == 0)
574 clip
.ForceZeroRTAIndexEnable
= true;
578 static const struct brw_tracked_state
genX(clip_state
) = {
580 .mesa
= _NEW_BUFFERS
|
584 .brw
= BRW_NEW_BLORP
|
586 BRW_NEW_FS_PROG_DATA
|
587 BRW_NEW_GS_PROG_DATA
|
588 BRW_NEW_VS_PROG_DATA
|
589 BRW_NEW_META_IN_PROGRESS
|
591 BRW_NEW_RASTERIZER_DISCARD
|
592 BRW_NEW_TES_PROG_DATA
|
593 BRW_NEW_VIEWPORT_COUNT
,
595 .emit
= genX(upload_clip_state
),
598 /* ---------------------------------------------------------------------- */
601 genX(upload_sf
)(struct brw_context
*brw
)
603 struct gl_context
*ctx
= &brw
->ctx
;
608 bool render_to_fbo
= _mesa_is_user_fbo(ctx
->DrawBuffer
);
609 const bool multisampled_fbo
= _mesa_geometric_samples(ctx
->DrawBuffer
) > 1;
612 brw_batch_emit(brw
, GENX(3DSTATE_SF
), sf
) {
613 sf
.StatisticsEnable
= true;
614 sf
.ViewportTransformEnable
= brw
->sf
.viewport_transform_enable
;
618 sf
.DepthBufferSurfaceFormat
= brw_depthbuffer_format(brw
);
623 sf
.FrontWinding
= ctx
->Polygon
._FrontBit
== render_to_fbo
;
624 sf
.GlobalDepthOffsetEnableSolid
= ctx
->Polygon
.OffsetFill
;
625 sf
.GlobalDepthOffsetEnableWireframe
= ctx
->Polygon
.OffsetLine
;
626 sf
.GlobalDepthOffsetEnablePoint
= ctx
->Polygon
.OffsetPoint
;
628 switch (ctx
->Polygon
.FrontMode
) {
630 sf
.FrontFaceFillMode
= FILL_MODE_SOLID
;
633 sf
.FrontFaceFillMode
= FILL_MODE_WIREFRAME
;
636 sf
.FrontFaceFillMode
= FILL_MODE_POINT
;
639 unreachable("not reached");
642 switch (ctx
->Polygon
.BackMode
) {
644 sf
.BackFaceFillMode
= FILL_MODE_SOLID
;
647 sf
.BackFaceFillMode
= FILL_MODE_WIREFRAME
;
650 sf
.BackFaceFillMode
= FILL_MODE_POINT
;
653 unreachable("not reached");
656 sf
.ScissorRectangleEnable
= true;
658 if (ctx
->Polygon
.CullFlag
) {
659 switch (ctx
->Polygon
.CullFaceMode
) {
661 sf
.CullMode
= CULLMODE_FRONT
;
664 sf
.CullMode
= CULLMODE_BACK
;
666 case GL_FRONT_AND_BACK
:
667 sf
.CullMode
= CULLMODE_BOTH
;
670 unreachable("not reached");
673 sf
.CullMode
= CULLMODE_NONE
;
677 sf
.LineStippleEnable
= ctx
->Line
.StippleFlag
;
680 if (multisampled_fbo
&& ctx
->Multisample
.Enabled
)
681 sf
.MultisampleRasterizationMode
= MSRASTMODE_ON_PATTERN
;
683 sf
.GlobalDepthOffsetConstant
= ctx
->Polygon
.OffsetUnits
* 2;
684 sf
.GlobalDepthOffsetScale
= ctx
->Polygon
.OffsetFactor
;
685 sf
.GlobalDepthOffsetClamp
= ctx
->Polygon
.OffsetClamp
;
689 sf
.LineWidth
= brw_get_line_width_float(brw
);
691 if (ctx
->Line
.SmoothFlag
) {
692 sf
.LineEndCapAntialiasingRegionWidth
= _10pixels
;
694 sf
.AntiAliasingEnable
= true;
698 /* _NEW_POINT - Clamp to ARB_point_parameters user limits */
699 point_size
= CLAMP(ctx
->Point
.Size
, ctx
->Point
.MinSize
, ctx
->Point
.MaxSize
);
700 /* Clamp to the hardware limits */
701 sf
.PointWidth
= CLAMP(point_size
, 0.125f
, 255.875f
);
703 /* _NEW_PROGRAM | _NEW_POINT, BRW_NEW_VUE_MAP_GEOM_OUT */
704 if (use_state_point_size(brw
))
705 sf
.PointWidthSource
= State
;
708 /* _NEW_POINT | _NEW_MULTISAMPLE */
709 if ((ctx
->Point
.SmoothFlag
|| _mesa_is_multisample_enabled(ctx
)) &&
710 !ctx
->Point
.PointSprite
)
711 sf
.SmoothPointEnable
= true;
714 sf
.AALineDistanceMode
= AALINEDISTANCE_TRUE
;
717 if (ctx
->Light
.ProvokingVertex
!= GL_FIRST_VERTEX_CONVENTION
) {
718 sf
.TriangleStripListProvokingVertexSelect
= 2;
719 sf
.TriangleFanProvokingVertexSelect
= 2;
720 sf
.LineStripListProvokingVertexSelect
= 1;
722 sf
.TriangleFanProvokingVertexSelect
= 1;
726 /* BRW_NEW_FS_PROG_DATA */
727 const struct brw_wm_prog_data
*wm_prog_data
=
728 brw_wm_prog_data(brw
->wm
.base
.prog_data
);
730 sf
.AttributeSwizzleEnable
= true;
731 sf
.NumberofSFOutputAttributes
= wm_prog_data
->num_varying_inputs
;
734 * Window coordinates in an FBO are inverted, which means point
735 * sprite origin must be inverted, too.
737 if ((ctx
->Point
.SpriteOrigin
== GL_LOWER_LEFT
) != render_to_fbo
) {
738 sf
.PointSpriteTextureCoordinateOrigin
= LOWERLEFT
;
740 sf
.PointSpriteTextureCoordinateOrigin
= UPPERLEFT
;
743 /* BRW_NEW_VUE_MAP_GEOM_OUT | BRW_NEW_FRAGMENT_PROGRAM |
744 * _NEW_POINT | _NEW_LIGHT | _NEW_PROGRAM | BRW_NEW_FS_PROG_DATA
746 uint32_t urb_entry_read_length
;
747 uint32_t urb_entry_read_offset
;
748 uint32_t point_sprite_enables
;
749 genX(calculate_attr_overrides
)(brw
, sf
.Attribute
, &point_sprite_enables
,
750 &urb_entry_read_length
,
751 &urb_entry_read_offset
);
752 sf
.VertexURBEntryReadLength
= urb_entry_read_length
;
753 sf
.VertexURBEntryReadOffset
= urb_entry_read_offset
;
754 sf
.PointSpriteTextureCoordinateEnable
= point_sprite_enables
;
755 sf
.ConstantInterpolationEnable
= wm_prog_data
->flat_inputs
;
760 static const struct brw_tracked_state
genX(sf_state
) = {
767 (GEN_GEN
<= 7 ? _NEW_BUFFERS
| _NEW_POLYGON
: 0),
768 .brw
= BRW_NEW_BLORP
|
770 BRW_NEW_VUE_MAP_GEOM_OUT
|
771 (GEN_GEN
<= 7 ? BRW_NEW_GS_PROG_DATA
|
773 BRW_NEW_TES_PROG_DATA
775 (GEN_GEN
== 6 ? BRW_NEW_FS_PROG_DATA
|
776 BRW_NEW_FRAGMENT_PROGRAM
779 .emit
= genX(upload_sf
),
782 /* ---------------------------------------------------------------------- */
785 genX(upload_wm
)(struct brw_context
*brw
)
787 struct gl_context
*ctx
= &brw
->ctx
;
789 /* BRW_NEW_FS_PROG_DATA */
790 const struct brw_wm_prog_data
*wm_prog_data
=
791 brw_wm_prog_data(brw
->wm
.base
.prog_data
);
793 UNUSED
bool writes_depth
=
794 wm_prog_data
->computed_depth_mode
!= BRW_PSCDEPTH_OFF
;
797 const struct brw_stage_state
*stage_state
= &brw
->wm
.base
;
798 const struct gen_device_info
*devinfo
= &brw
->screen
->devinfo
;
800 /* We can't fold this into gen6_upload_wm_push_constants(), because
801 * according to the SNB PRM, vol 2 part 1 section 7.2.2
802 * (3DSTATE_CONSTANT_PS [DevSNB]):
804 * "[DevSNB]: This packet must be followed by WM_STATE."
806 brw_batch_emit(brw
, GENX(3DSTATE_CONSTANT_PS
), wmcp
) {
807 if (wm_prog_data
->base
.nr_params
!= 0) {
808 wmcp
.Buffer0Valid
= true;
809 /* Pointer to the WM constant buffer. Covered by the set of
810 * state flags from gen6_upload_wm_push_constants.
812 wmcp
.PointertoPSConstantBuffer0
= stage_state
->push_const_offset
;
813 wmcp
.PSConstantBuffer0ReadLength
= stage_state
->push_const_size
- 1;
818 brw_batch_emit(brw
, GENX(3DSTATE_WM
), wm
) {
819 wm
.StatisticsEnable
= true;
820 wm
.LineAntialiasingRegionWidth
= _10pixels
;
821 wm
.LineEndCapAntialiasingRegionWidth
= _05pixels
;
824 if (wm_prog_data
->base
.use_alt_mode
)
825 wm
.FloatingPointMode
= Alternate
;
827 wm
.SamplerCount
= DIV_ROUND_UP(stage_state
->sampler_count
, 4);
828 wm
.BindingTableEntryCount
= wm_prog_data
->base
.binding_table
.size_bytes
/ 4;
829 wm
.MaximumNumberofThreads
= devinfo
->max_wm_threads
- 1;
830 wm
._8PixelDispatchEnable
= wm_prog_data
->dispatch_8
;
831 wm
._16PixelDispatchEnable
= wm_prog_data
->dispatch_16
;
832 wm
.DispatchGRFStartRegisterForConstantSetupData0
=
833 wm_prog_data
->base
.dispatch_grf_start_reg
;
834 wm
.DispatchGRFStartRegisterForConstantSetupData2
=
835 wm_prog_data
->dispatch_grf_start_reg_2
;
836 wm
.KernelStartPointer0
= stage_state
->prog_offset
;
837 wm
.KernelStartPointer2
= stage_state
->prog_offset
+
838 wm_prog_data
->prog_offset_2
;
839 wm
.DualSourceBlendEnable
=
840 wm_prog_data
->dual_src_blend
&& (ctx
->Color
.BlendEnabled
& 1) &&
841 ctx
->Color
.Blend
[0]._UsesDualSrc
;
842 wm
.oMaskPresenttoRenderTarget
= wm_prog_data
->uses_omask
;
843 wm
.NumberofSFOutputAttributes
= wm_prog_data
->num_varying_inputs
;
845 /* From the SNB PRM, volume 2 part 1, page 281:
846 * "If the PS kernel does not need the Position XY Offsets
847 * to compute a Position XY value, then this field should be
848 * programmed to POSOFFSET_NONE."
850 * "SW Recommendation: If the PS kernel needs the Position Offsets
851 * to compute a Position XY value, this field should match Position
852 * ZW Interpolation Mode to ensure a consistent position.xyzw
854 * We only require XY sample offsets. So, this recommendation doesn't
855 * look useful at the moment. We might need this in future.
857 if (wm_prog_data
->uses_pos_offset
)
858 wm
.PositionXYOffsetSelect
= POSOFFSET_SAMPLE
;
860 wm
.PositionXYOffsetSelect
= POSOFFSET_NONE
;
862 if (wm_prog_data
->base
.total_scratch
) {
863 wm
.ScratchSpaceBasePointer
=
864 render_bo(stage_state
->scratch_bo
,
865 ffs(stage_state
->per_thread_scratch
) - 11);
868 wm
.PixelShaderComputedDepth
= writes_depth
;
871 wm
.PointRasterizationRule
= RASTRULE_UPPER_RIGHT
;
874 wm
.LineStippleEnable
= ctx
->Line
.StippleFlag
;
877 wm
.PolygonStippleEnable
= ctx
->Polygon
.StippleFlag
;
878 wm
.BarycentricInterpolationMode
= wm_prog_data
->barycentric_interp_modes
;
882 const bool multisampled_fbo
= _mesa_geometric_samples(ctx
->DrawBuffer
) > 1;
884 wm
.PixelShaderUsesSourceDepth
= wm_prog_data
->uses_src_depth
;
885 wm
.PixelShaderUsesSourceW
= wm_prog_data
->uses_src_w
;
886 if (wm_prog_data
->uses_kill
||
887 _mesa_is_alpha_test_enabled(ctx
) ||
888 _mesa_is_alpha_to_coverage_enabled(ctx
) ||
889 wm_prog_data
->uses_omask
) {
890 wm
.PixelShaderKillsPixel
= true;
893 /* _NEW_BUFFERS | _NEW_COLOR */
894 if (brw_color_buffer_write_enabled(brw
) || writes_depth
||
895 wm_prog_data
->has_side_effects
|| wm
.PixelShaderKillsPixel
) {
896 wm
.ThreadDispatchEnable
= true;
898 if (multisampled_fbo
) {
899 /* _NEW_MULTISAMPLE */
900 if (ctx
->Multisample
.Enabled
)
901 wm
.MultisampleRasterizationMode
= MSRASTMODE_ON_PATTERN
;
903 wm
.MultisampleRasterizationMode
= MSRASTMODE_OFF_PIXEL
;
905 if (wm_prog_data
->persample_dispatch
)
906 wm
.MultisampleDispatchMode
= MSDISPMODE_PERSAMPLE
;
908 wm
.MultisampleDispatchMode
= MSDISPMODE_PERPIXEL
;
910 wm
.MultisampleRasterizationMode
= MSRASTMODE_OFF_PIXEL
;
911 wm
.MultisampleDispatchMode
= MSDISPMODE_PERSAMPLE
;
915 wm
.PixelShaderComputedDepthMode
= wm_prog_data
->computed_depth_mode
;
916 wm
.PixelShaderUsesInputCoverageMask
= wm_prog_data
->uses_sample_mask
;
919 /* The "UAV access enable" bits are unnecessary on HSW because they only
920 * seem to have an effect on the HW-assisted coherency mechanism which we
921 * don't need, and the rasterization-related UAV_ONLY flag and the
922 * DISPATCH_ENABLE bit can be set independently from it.
923 * C.f. gen8_upload_ps_extra().
925 * BRW_NEW_FRAGMENT_PROGRAM | BRW_NEW_FS_PROG_DATA | _NEW_BUFFERS |
929 if (!(brw_color_buffer_write_enabled(brw
) || writes_depth
) &&
930 wm_prog_data
->has_side_effects
)
936 /* BRW_NEW_FS_PROG_DATA */
937 if (wm_prog_data
->early_fragment_tests
)
938 wm
.EarlyDepthStencilControl
= EDSC_PREPS
;
939 else if (wm_prog_data
->has_side_effects
)
940 wm
.EarlyDepthStencilControl
= EDSC_PSEXEC
;
945 static const struct brw_tracked_state
genX(wm_state
) = {
949 (GEN_GEN
< 8 ? _NEW_BUFFERS
|
953 (GEN_GEN
< 7 ? _NEW_PROGRAM_CONSTANTS
: 0),
954 .brw
= BRW_NEW_BLORP
|
955 BRW_NEW_FS_PROG_DATA
|
956 (GEN_GEN
< 7 ? BRW_NEW_PUSH_CONSTANT_ALLOCATION
|
960 .emit
= genX(upload_wm
),
963 /* ---------------------------------------------------------------------- */
/* Fill in the thread-dispatch fields common to 3DSTATE_VS/HS/DS/GS.  Expects
 * `stage_state`, `stage_prog_data` and `vue_prog_data` locals in the caller.
 */
#define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix)                        \
   pkt.KernelStartPointer = stage_state->prog_offset;                   \
   pkt.SamplerCount =                                                   \
      DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4);        \
   pkt.BindingTableEntryCount =                                         \
      stage_prog_data->binding_table.size_bytes / 4;                    \
   pkt.FloatingPointMode = stage_prog_data->use_alt_mode;               \
                                                                        \
   if (stage_prog_data->total_scratch) {                                \
      pkt.ScratchSpaceBasePointer =                                     \
         render_bo(stage_state->scratch_bo, 0);                         \
      pkt.PerThreadScratchSpace =                                       \
         ffs(stage_state->per_thread_scratch) - 11;                     \
   }                                                                    \
                                                                        \
   pkt.DispatchGRFStartRegisterForURBData =                             \
      stage_prog_data->dispatch_grf_start_reg;                          \
   pkt.prefix##URBEntryReadLength = vue_prog_data->urb_read_length;     \
   pkt.prefix##URBEntryReadOffset = 0;                                  \
                                                                        \
   pkt.StatisticsEnable = true;                                         \
   pkt.Enable = true;
990 genX(upload_vs_state
)(struct brw_context
*brw
)
992 const struct gen_device_info
*devinfo
= &brw
->screen
->devinfo
;
993 const struct brw_stage_state
*stage_state
= &brw
->vs
.base
;
995 /* BRW_NEW_VS_PROG_DATA */
996 const struct brw_vue_prog_data
*vue_prog_data
=
997 brw_vue_prog_data(brw
->vs
.base
.prog_data
);
998 const struct brw_stage_prog_data
*stage_prog_data
= &vue_prog_data
->base
;
1000 assert(vue_prog_data
->dispatch_mode
== DISPATCH_MODE_SIMD8
||
1001 vue_prog_data
->dispatch_mode
== DISPATCH_MODE_4X2_DUAL_OBJECT
);
1003 /* From the BSpec, 3D Pipeline > Geometry > Vertex Shader > State,
1004 * 3DSTATE_VS, Dword 5.0 "VS Function Enable":
1006 * [DevSNB] A pipeline flush must be programmed prior to a 3DSTATE_VS
1007 * command that causes the VS Function Enable to toggle. Pipeline
1008 * flush can be executed by sending a PIPE_CONTROL command with CS
1009 * stall bit set and a post sync operation.
1011 * We've already done such a flush at the start of state upload, so we
1012 * don't need to do another one here.
1016 brw_batch_emit(brw
, GENX(3DSTATE_CONSTANT_VS
), cvs
) {
1017 if (stage_state
->push_const_size
!= 0) {
1018 cvs
.Buffer0Valid
= true;
1019 cvs
.PointertoVSConstantBuffer0
= stage_state
->push_const_offset
;
1020 cvs
.VSConstantBuffer0ReadLength
= stage_state
->push_const_size
- 1;
1025 if (GEN_GEN
== 7 && devinfo
->is_ivybridge
)
1026 gen7_emit_vs_workaround_flush(brw
);
1028 brw_batch_emit(brw
, GENX(3DSTATE_VS
), vs
) {
1029 INIT_THREAD_DISPATCH_FIELDS(vs
, Vertex
);
1031 vs
.MaximumNumberofThreads
= devinfo
->max_vs_threads
- 1;
1034 vs
.SIMD8DispatchEnable
=
1035 vue_prog_data
->dispatch_mode
== DISPATCH_MODE_SIMD8
;
1037 vs
.UserClipDistanceCullTestEnableBitmask
=
1038 vue_prog_data
->cull_distance_mask
;
1043 /* Based on my reading of the simulator, the VS constants don't get
1044 * pulled into the VS FF unit until an appropriate pipeline flush
1045 * happens, and instead the 3DSTATE_CONSTANT_VS packet just adds
1046 * references to them into a little FIFO. The flushes are common,
1047 * but don't reliably happen between this and a 3DPRIMITIVE, causing
1048 * the primitive to use the wrong constants. Then the FIFO
1049 * containing the constant setup gets added to again on the next
1050 * constants change, and eventually when a flush does happen the
1051 * unit is overwhelmed by constant changes and dies.
1053 * To avoid this, send a PIPE_CONTROL down the line that will
1054 * update the unit immediately loading the constants. The flush
1055 * type bits here were those set by the STATE_BASE_ADDRESS whose
1056 * move in a82a43e8d99e1715dd11c9c091b5ab734079b6a6 triggered the
1057 * bug reports that led to this workaround, and may be more than
1058 * what is strictly required to avoid the issue.
1060 brw_emit_pipe_control_flush(brw
,
1061 PIPE_CONTROL_DEPTH_STALL
|
1062 PIPE_CONTROL_INSTRUCTION_INVALIDATE
|
1063 PIPE_CONTROL_STATE_CACHE_INVALIDATE
);
/* Atom descriptor for VS state upload: lists the Mesa and brw dirty bits
 * that trigger genX(upload_vs_state).
 *
 * NOTE(review): the usual ".dirty = { ... }" wrapper braces and the
 * trailing ": 0)" of the GEN_GEN < 7 ternary appear to have been lost in
 * extraction — confirm against the original file.
 */
static const struct brw_tracked_state genX(vs_state) = {
   /* Gen < 7 pushes constants from here, so it also depends on Mesa's
    * program-constants and transform state.
    */
   .mesa = (GEN_GEN < 7 ? (_NEW_PROGRAM_CONSTANTS | _NEW_TRANSFORM) : 0),
   .brw = BRW_NEW_BATCH |
          BRW_NEW_VS_PROG_DATA |
          (GEN_GEN < 7 ? BRW_NEW_PUSH_CONSTANT_ALLOCATION |
                         BRW_NEW_VERTEX_PROGRAM
   .emit = genX(upload_vs_state),
1081 /* ---------------------------------------------------------------------- */
1084 brw_calculate_guardband_size(const struct gen_device_info
*devinfo
,
1085 uint32_t fb_width
, uint32_t fb_height
,
1086 float m00
, float m11
, float m30
, float m31
,
1087 float *xmin
, float *xmax
,
1088 float *ymin
, float *ymax
)
1090 /* According to the "Vertex X,Y Clamping and Quantization" section of the
1091 * Strips and Fans documentation:
1093 * "The vertex X and Y screen-space coordinates are also /clamped/ to the
1094 * fixed-point "guardband" range supported by the rasterization hardware"
1098 * "In almost all circumstances, if an object’s vertices are actually
1099 * modified by this clamping (i.e., had X or Y coordinates outside of
1100 * the guardband extent the rendered object will not match the intended
1101 * result. Therefore software should take steps to ensure that this does
1102 * not happen - e.g., by clipping objects such that they do not exceed
1103 * these limits after the Drawing Rectangle is applied."
1105 * I believe the fundamental restriction is that the rasterizer (in
1106 * the SF/WM stages) have a limit on the number of pixels that can be
1107 * rasterized. We need to ensure any coordinates beyond the rasterizer
1108 * limit are handled by the clipper. So effectively that limit becomes
1109 * the clipper's guardband size.
1111 * It goes on to say:
1113 * "In addition, in order to be correctly rendered, objects must have a
1114 * screenspace bounding box not exceeding 8K in the X or Y direction.
1115 * This additional restriction must also be comprehended by software,
1116 * i.e., enforced by use of clipping."
1118 * This makes no sense. Gen7+ hardware supports 16K render targets,
1119 * and you definitely need to be able to draw polygons that fill the
1120 * surface. Our assumption is that the rasterizer was limited to 8K
1121 * on Sandybridge, which only supports 8K surfaces, and it was actually
1122 * increased to 16K on Ivybridge and later.
1124 * So, limit the guardband to 16K on Gen7+ and 8K on Sandybridge.
1126 const float gb_size
= devinfo
->gen
>= 7 ? 16384.0f
: 8192.0f
;
1128 if (m00
!= 0 && m11
!= 0) {
1129 /* First, we compute the screen-space render area */
1130 const float ss_ra_xmin
= MIN3( 0, m30
+ m00
, m30
- m00
);
1131 const float ss_ra_xmax
= MAX3( fb_width
, m30
+ m00
, m30
- m00
);
1132 const float ss_ra_ymin
= MIN3( 0, m31
+ m11
, m31
- m11
);
1133 const float ss_ra_ymax
= MAX3(fb_height
, m31
+ m11
, m31
- m11
);
1135 /* We want the guardband to be centered on that */
1136 const float ss_gb_xmin
= (ss_ra_xmin
+ ss_ra_xmax
) / 2 - gb_size
;
1137 const float ss_gb_xmax
= (ss_ra_xmin
+ ss_ra_xmax
) / 2 + gb_size
;
1138 const float ss_gb_ymin
= (ss_ra_ymin
+ ss_ra_ymax
) / 2 - gb_size
;
1139 const float ss_gb_ymax
= (ss_ra_ymin
+ ss_ra_ymax
) / 2 + gb_size
;
1141 /* Now we need it in native device coordinates */
1142 const float ndc_gb_xmin
= (ss_gb_xmin
- m30
) / m00
;
1143 const float ndc_gb_xmax
= (ss_gb_xmax
- m30
) / m00
;
1144 const float ndc_gb_ymin
= (ss_gb_ymin
- m31
) / m11
;
1145 const float ndc_gb_ymax
= (ss_gb_ymax
- m31
) / m11
;
1147 /* Thanks to Y-flipping and ORIGIN_UPPER_LEFT, the Y coordinates may be
1148 * flipped upside-down. X should be fine though.
1150 assert(ndc_gb_xmin
<= ndc_gb_xmax
);
1151 *xmin
= ndc_gb_xmin
;
1152 *xmax
= ndc_gb_xmax
;
1153 *ymin
= MIN2(ndc_gb_ymin
, ndc_gb_ymax
);
1154 *ymax
= MAX2(ndc_gb_ymin
, ndc_gb_ymax
);
1156 /* The viewport scales to 0, so nothing will be rendered. */
/* Upload the SF/CLIP viewport state (viewport transform matrix, guardband,
 * and screen-space viewport extents) for each active viewport, then point
 * the hardware at the new state.
 *
 * NOTE(review): this block lost its "static void" return-type line, its
 * opening brace, and several #if GEN_GEN guards in extraction.  In
 * particular the two conflicting declarations of `sfv` below were
 * presumably separated by #if GEN_GEN >= 7 / #else (combined SF_CLIP
 * viewport vs. separate SF and CLIP viewports), and the if/else bodies for
 * y_scale/y_bias and the FBO viewport flip have lost lines — confirm
 * against the original file before compiling.
 */
genX(upload_sf_clip_viewport)(struct brw_context *brw)
   struct gl_context *ctx = &brw->ctx;
   float y_scale, y_bias;
   const struct gen_device_info *devinfo = &brw->screen->devinfo;

   /* BRW_NEW_VIEWPORT_COUNT */
   const unsigned viewport_count = brw->clip.viewport_count;

   /* Window-system framebuffers are rendered Y-flipped relative to FBOs. */
   const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
   const uint32_t fb_width = (float)_mesa_geometric_width(ctx->DrawBuffer);
   const uint32_t fb_height = (float)_mesa_geometric_height(ctx->DrawBuffer);

   /* Gen7+ path (presumably): one combined SF_CLIP_VIEWPORT structure,
    * 16 dwords per viewport, 64-byte aligned.
    */
   struct GENX(SF_CLIP_VIEWPORT) sfv;
   uint32_t sf_clip_vp_offset;
   uint32_t *sf_clip_map = brw_state_batch(brw, 16 * 4 * viewport_count,
                                           64, &sf_clip_vp_offset);
   /* Older path (presumably): separate SF and CLIP viewport structures. */
   struct GENX(SF_VIEWPORT) sfv;
   struct GENX(CLIP_VIEWPORT) clv;
   uint32_t *sf_map = brw_state_batch(brw, 8 * 4 * viewport_count,
                                      32, &brw->sf.vp_offset);
   uint32_t *clip_map = brw_state_batch(brw, 4 * 4 * viewport_count,
                                        32, &brw->clip.vp_offset);

   /* NOTE(review): the render_to_fbo branch bodies setting y_scale (and the
    * else arm) appear truncated here.
    */
   if (render_to_fbo) {
      y_bias = (float)fb_height;

   for (unsigned i = 0; i < brw->clip.viewport_count; i++) {
      /* _NEW_VIEWPORT: Guardband Clipping */
      float scale[3], translate[3], gb_xmin, gb_xmax, gb_ymin, gb_ymax;
      _mesa_get_viewport_xform(ctx, i, scale, translate);

      /* Viewport transform matrix, with Y flipped/biased as computed above. */
      sfv.ViewportMatrixElementm00 = scale[0];
      sfv.ViewportMatrixElementm11 = scale[1] * y_scale,
      sfv.ViewportMatrixElementm22 = scale[2],
      sfv.ViewportMatrixElementm30 = translate[0],
      sfv.ViewportMatrixElementm31 = translate[1] * y_scale + y_bias,
      sfv.ViewportMatrixElementm32 = translate[2],
      brw_calculate_guardband_size(devinfo, fb_width, fb_height,
                                   sfv.ViewportMatrixElementm00,
                                   sfv.ViewportMatrixElementm11,
                                   sfv.ViewportMatrixElementm30,
                                   sfv.ViewportMatrixElementm31,
                                   &gb_xmin, &gb_xmax, &gb_ymin, &gb_ymax);

      /* Guardband extents go into the CLIP viewport structure. */
      clv.XMinClipGuardband = gb_xmin;
      clv.XMaxClipGuardband = gb_xmax;
      clv.YMinClipGuardband = gb_ymin;
      clv.YMaxClipGuardband = gb_ymax;

      /* _NEW_VIEWPORT | _NEW_BUFFERS: Screen Space Viewport
       * The hardware will take the intersection of the drawing rectangle,
       * scissor rectangle, and the viewport extents.  We don't need to be
       * smart, and can therefore just program the viewport extents.
       */
      const float viewport_Xmax =
         ctx->ViewportArray[i].X + ctx->ViewportArray[i].Width;
      const float viewport_Ymax =
         ctx->ViewportArray[i].Y + ctx->ViewportArray[i].Height;

      /* NOTE(review): the "} else {" separating the FBO and window-system
       * cases appears to have been dropped below.
       */
      if (render_to_fbo) {
         sfv.XMinViewPort = ctx->ViewportArray[i].X;
         sfv.XMaxViewPort = viewport_Xmax - 1;
         sfv.YMinViewPort = ctx->ViewportArray[i].Y;
         sfv.YMaxViewPort = viewport_Ymax - 1;
         /* Window system: flip Y relative to the framebuffer height. */
         sfv.XMinViewPort = ctx->ViewportArray[i].X;
         sfv.XMaxViewPort = viewport_Xmax - 1;
         sfv.YMinViewPort = fb_height - viewport_Ymax;
         sfv.YMaxViewPort = fb_height - ctx->ViewportArray[i].Y - 1;

      /* Pack the state; presumably gen-gated between combined and split
       * viewport structures (guards stripped).
       */
      GENX(SF_CLIP_VIEWPORT_pack)(NULL, sf_clip_map, &sfv);
      GENX(SF_VIEWPORT_pack)(NULL, sf_map, &sfv);
      GENX(CLIP_VIEWPORT_pack)(NULL, clip_map, &clv);

   /* Point the hardware at the freshly-written SF_CLIP viewport state. */
   brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), ptr) {
      ptr.SFClipViewportPointer = sf_clip_vp_offset;

   brw->ctx.NewDriverState |= BRW_NEW_SF_VP | BRW_NEW_CLIP_VP;
/* Atom descriptor: re-upload SF/CLIP viewport state when the framebuffer,
 * batch, or viewport count changes.
 *
 * NOTE(review): the ".dirty = { ... }" wrapper and some OR'd dirty bits
 * appear truncated — confirm against the original file.
 */
static const struct brw_tracked_state genX(sf_clip_viewport) = {
   .mesa = _NEW_BUFFERS |
   .brw = BRW_NEW_BATCH |
          BRW_NEW_VIEWPORT_COUNT,
   .emit = genX(upload_sf_clip_viewport),
/* Upload 3DSTATE_CONSTANT_GS and 3DSTATE_GS for the current geometry
 * program (or the disabled/fixed-function-transform-feedback cases), and
 * record whether the GS is now enabled.
 *
 * NOTE(review): this block lost its "static void" return-type line, its
 * opening/closing braces, and most #if GEN_GEN / #else / #endif guards in
 * extraction — several runs of assignments below were presumably gen-gated.
 * Confirm against the original file before compiling.
 */
genX(upload_gs_state)(struct brw_context *brw)
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   const struct brw_stage_state *stage_state = &brw->gs.base;
   /* BRW_NEW_GEOMETRY_PROGRAM */
   bool active = brw->geometry_program;

   /* BRW_NEW_GS_PROG_DATA */
   struct brw_stage_prog_data *stage_prog_data = stage_state->prog_data;
   const struct brw_vue_prog_data *vue_prog_data =
      brw_vue_prog_data(stage_prog_data);
   const struct brw_gs_prog_data *gs_prog_data =
      brw_gs_prog_data(stage_prog_data);

   /* Push-constant buffer for the GS; only valid when a GS is active and
    * actually has push constants.
    */
   brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_GS), cgs) {
      if (active && stage_state->push_const_size != 0) {
         cgs.Buffer0Valid = true;
         cgs.PointertoGSConstantBuffer0 = stage_state->push_const_offset;
         cgs.GSConstantBuffer0ReadLength = stage_state->push_const_size - 1;

#if GEN_GEN == 7 && !GEN_IS_HASWELL
   /* From Graphics BSpec: 3D-Media-GPGPU Engine > 3D Pipeline Stages >
    * Geometry > Geometry Shader > State:
    *
    * "Note: Because of corruption in IVB:GT2, software needs to flush the
    *  whole fixed function pipeline when the GS enable changes value"
    *
    * The hardware architects have clarified that in this context "flush the
    * whole fixed function pipeline" means to emit a PIPE_CONTROL with the "CS
    * stall" bit set (the full workaround comment appears truncated here).
    */
   if (brw->gt == 2 && brw->gs.enabled != active)
      gen7_emit_cs_stall_flush(brw);

   /* Active-GS path (the gating "if (active)" appears to have been lost). */
   brw_batch_emit(brw, GENX(3DSTATE_GS), gs) {
      INIT_THREAD_DISPATCH_FIELDS(gs, Vertex);

      gs.OutputVertexSize = gs_prog_data->output_vertex_size_hwords * 2 - 1;
      gs.OutputTopology = gs_prog_data->output_topology;
      gs.ControlDataHeaderSize =
         gs_prog_data->control_data_header_size_hwords;

      gs.InstanceControl = gs_prog_data->invocations - 1;
      gs.DispatchMode = vue_prog_data->dispatch_mode;

      gs.IncludePrimitiveID = gs_prog_data->include_primitive_id;

      gs.ControlDataFormat = gs_prog_data->control_data_format;

      /* Note: the meaning of the GEN7_GS_REORDER_TRAILING bit changes between
       * Ivy Bridge and Haswell.
       *
       * On Ivy Bridge, setting this bit causes the vertices of a triangle
       * strip to be delivered to the geometry shader in an order that does
       * not strictly follow the OpenGL spec, but preserves triangle
       * orientation.  For example, if the vertices are (1, 2, 3, 4, 5), then
       * the geometry shader sees triangles:
       *
       * (1, 2, 3), (2, 4, 3), (3, 4, 5)
       *
       * (Clearing the bit is even worse, because it fails to preserve
       * orientation.)
       *
       * Triangle strips with adjacency always ordered in a way that preserves
       * triangle orientation but does not strictly follow the OpenGL spec,
       * regardless of the setting of this bit.
       *
       * On Haswell, both triangle strips and triangle strips with adjacency
       * are always ordered in a way that preserves triangle orientation.
       * Setting this bit causes the ordering to strictly follow the OpenGL
       * spec.
       *
       * So in either case we want to set the bit.  Unfortunately on Ivy
       * Bridge this will get the order close to correct but not perfect.
       */
      gs.ReorderMode = TRAILING;
      gs.MaximumNumberofThreads =
         GEN_GEN == 8 ? (devinfo->max_gs_threads / 2 - 1)
                      : (devinfo->max_gs_threads - 1);

      /* Presumably gen6-only fields below (guards stripped). */
      gs.SOStatisticsEnable = true;
      gs.RenderingEnabled = 1;
      if (brw->geometry_program->info.has_transform_feedback_varyings)
         gs.SVBIPayloadEnable = true;

      /* GEN6_GS_SPF_MODE and GEN6_GS_VECTOR_MASK_ENABLE are enabled as it
       * was previously done for gen6.
       *
       * TODO: test with both disabled to see if the HW is behaving
       * as expected, like in gen7.
       */
      gs.SingleProgramFlow = true;
      gs.VectorMaskEnable = true;

      gs.ExpectedVertexCount = gs_prog_data->vertices_in;

      if (gs_prog_data->static_vertex_count != -1) {
         gs.StaticOutput = true;
         gs.StaticOutputVertexCount = gs_prog_data->static_vertex_count;
      gs.IncludeVertexHandles = vue_prog_data->include_vue_handles;

      gs.UserClipDistanceCullTestEnableBitmask =
         vue_prog_data->cull_distance_mask;

      const int urb_entry_write_offset = 1;
      const uint32_t urb_entry_output_length =
         DIV_ROUND_UP(vue_prog_data->vue_map.num_slots, 2) -
         urb_entry_write_offset;

      gs.VertexURBEntryOutputReadOffset = urb_entry_write_offset;
      gs.VertexURBEntryOutputLength = MAX2(urb_entry_output_length, 1);
   } else if (brw->ff_gs.prog_active) {
      /* In gen6, transform feedback for the VS stage is done with an ad-hoc GS
       * program.  This function provides the needed 3DSTATE_GS for this.
       */
      upload_gs_state_for_tf(brw);
   /* No GS active: emit a mostly-disabled 3DSTATE_GS. */
   brw_batch_emit(brw, GENX(3DSTATE_GS), gs) {
      gs.StatisticsEnable = true;
      gs.RenderingEnabled = true;
      gs.DispatchGRFStartRegisterForURBData = 1;
      gs.IncludeVertexHandles = true;

   /* Remembered for the IVB:GT2 enable-toggle flush above. */
   brw->gs.enabled = active;
/* Atom descriptor for GS state upload.
 *
 * NOTE(review): the ".dirty = { ... }" wrapper and the closing ": 0)" of
 * the GEN_GEN < 7 ternary appear truncated — confirm against the original.
 */
static const struct brw_tracked_state genX(gs_state) = {
   .mesa = (GEN_GEN < 7 ? _NEW_PROGRAM_CONSTANTS : 0),
   .brw = BRW_NEW_BATCH |
          BRW_NEW_GEOMETRY_PROGRAM |
          BRW_NEW_GS_PROG_DATA |
          (GEN_GEN < 7 ? BRW_NEW_FF_GS_PROG_DATA |
                         BRW_NEW_PUSH_CONSTANT_ALLOCATION
   .emit = genX(upload_gs_state),
1454 /* ---------------------------------------------------------------------- */
/* Shorthand wrappers mapping GL blend factors/equations to hardware values
 * via the existing brw translation helpers.
 */
#define blend_factor(x) brw_translate_blend_factor(x)
#define blend_eqn(x) brw_translate_blend_equation(x)
/* Build and upload BLEND_STATE (plus one BLEND_STATE_ENTRY per draw
 * buffer) and point the hardware at it.
 *
 * NOTE(review): this block lost its "static void" return-type line, the
 * declaration of `size`, several closing braces, and gen-gating
 * preprocessor lines in extraction (e.g. the two per-buffer loops below
 * were presumably alternatives gated on GEN_GEN, as were the two
 * state-pointer packets at the end).  Confirm against the original file.
 */
genX(upload_blend_state)(struct brw_context *brw)
   struct gl_context *ctx = &brw->ctx;

   /* We need at least one BLEND_STATE written, because we might do
    * thread dispatch even if _NumColorDrawBuffers is 0 (for example
    * for computed depth or alpha test), which will do an FB write
    * with render target 0, which will reference BLEND_STATE[0] for
    * alpha test enable.
    */
   int nr_draw_buffers = ctx->DrawBuffer->_NumColorDrawBuffers;
   if (nr_draw_buffers == 0 && ctx->Color.AlphaEnabled)
      nr_draw_buffers = 1;

   /* NOTE(review): `size` is used here without a visible declaration —
    * its declaration line was presumably dropped.
    */
   size = GENX(BLEND_STATE_ENTRY_length) * 4 * nr_draw_buffers;
   size += GENX(BLEND_STATE_length) * 4;

   uint32_t *blend_map;
   blend_map = brw_state_batch(brw, size, 64, &brw->cc.blend_state_offset);

   struct GENX(BLEND_STATE) blend = { 0 };

   /* First pass: shared (non-per-target) blend state. */
   for (int i = 0; i < nr_draw_buffers; i++) {
      struct GENX(BLEND_STATE_ENTRY) entry = { 0 };
      /* OpenGL specification 3.3 (page 196), section 4.1.3 says:
       * "If drawbuffer zero is not NONE and the buffer it references has an
       *  integer format, the SAMPLE_ALPHA_TO_COVERAGE and SAMPLE_ALPHA_TO_ONE
       *  operations are skipped."
       */
      if (!(ctx->DrawBuffer->_IntegerBuffers & 0x1)) {
         /* _NEW_MULTISAMPLE */
         if (_mesa_is_multisample_enabled(ctx)) {
            if (ctx->Multisample.SampleAlphaToCoverage) {
               blend.AlphaToCoverageEnable = true;
               blend.AlphaToCoverageDitherEnable = GEN_GEN >= 7;
            if (ctx->Multisample.SampleAlphaToOne)
               blend.AlphaToOneEnable = true;

         /* _NEW_COLOR (presumably): global alpha test state. */
         if (ctx->Color.AlphaEnabled) {
            blend.AlphaTestEnable = true;
            blend.AlphaTestFunction =
               intel_translate_compare_func(ctx->Color.AlphaFunc);

         if (ctx->Color.DitherFlag) {
            blend.ColorDitherEnable = true;

   /* Second pass: per-render-target blend entries. */
   for (int i = 0; i < nr_draw_buffers; i++) {
      struct GENX(BLEND_STATE_ENTRY) entry = { 0 };
      struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];

      /* Used for implementing the following bit of GL_EXT_texture_integer:
       * "Per-fragment operations that require floating-point color
       *  components, including multisample alpha operations, alpha test,
       *  blending, and dithering, have no effect when the corresponding
       *  colors are written to an integer color buffer."
       */
      bool integer = ctx->DrawBuffer->_IntegerBuffers & (0x1 << i);

      /* _NEW_COLOR: logic op takes priority over blending. */
      if (ctx->Color.ColorLogicOpEnabled) {
         GLenum rb_type = rb ? _mesa_get_format_datatype(rb->Format)
                             : GL_UNSIGNED_NORMALIZED;
         /* NOTE(review): the format-string tail of this WARN_ONCE appears
          * truncated.
          */
         WARN_ONCE(ctx->Color.LogicOp != GL_COPY &&
                   rb_type != GL_UNSIGNED_NORMALIZED &&
                   rb_type != GL_FLOAT, "Ignoring %s logic op on %s "
                   _mesa_enum_to_string(ctx->Color.LogicOp),
                   _mesa_enum_to_string(rb_type));
         if (GEN_GEN >= 8 || rb_type == GL_UNSIGNED_NORMALIZED) {
            entry.LogicOpEnable = true;
            entry.LogicOpFunction =
               intel_translate_logic_op(ctx->Color.LogicOp);
      } else if (ctx->Color.BlendEnabled & (1 << i) && !integer &&
                 !ctx->Color._AdvancedBlendMode) {
         GLenum eqRGB = ctx->Color.Blend[i].EquationRGB;
         GLenum eqA = ctx->Color.Blend[i].EquationA;
         GLenum srcRGB = ctx->Color.Blend[i].SrcRGB;
         GLenum dstRGB = ctx->Color.Blend[i].DstRGB;
         GLenum srcA = ctx->Color.Blend[i].SrcA;
         GLenum dstA = ctx->Color.Blend[i].DstA;

         /* MIN/MAX ignore the factors; force them to GL_ONE. */
         if (eqRGB == GL_MIN || eqRGB == GL_MAX)
            srcRGB = dstRGB = GL_ONE;

         if (eqA == GL_MIN || eqA == GL_MAX)
            srcA = dstA = GL_ONE;

         /* Due to hardware limitations, the destination may have information
          * in an alpha channel even when the format specifies no alpha
          * channel.  In order to avoid getting any incorrect blending due to
          * that alpha channel, coerce the blend factors to values that will
          * not read the alpha channel, but will instead use the correct
          * implicit value for alpha.
          */
         if (rb && !_mesa_base_format_has_channel(rb->_BaseFormat,
                                                  GL_TEXTURE_ALPHA_TYPE)) {
            srcRGB = brw_fix_xRGB_alpha(srcRGB);
            srcA = brw_fix_xRGB_alpha(srcA);
            dstRGB = brw_fix_xRGB_alpha(dstRGB);
            dstA = brw_fix_xRGB_alpha(dstA);

         entry.ColorBufferBlendEnable = true;
         entry.DestinationBlendFactor = blend_factor(dstRGB);
         entry.SourceBlendFactor = blend_factor(srcRGB);
         entry.DestinationAlphaBlendFactor = blend_factor(dstA);
         entry.SourceAlphaBlendFactor = blend_factor(srcA);
         entry.ColorBlendFunction = blend_eqn(eqRGB);
         entry.AlphaBlendFunction = blend_eqn(eqA);

         if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB)
            blend.IndependentAlphaBlendEnable = true;

      /* See section 8.1.6 "Pre-Blend Color Clamping" of the
       * SandyBridge PRM Volume 2 Part 1 for HW requirements.
       *
       * We do our ARB_color_buffer_float CLAMP_FRAGMENT_COLOR
       * clamping in the fragment shader.  For its clamping of
       * blending, the spec says:
       *
       * "RESOLVED: For fixed-point color buffers, the inputs and
       *  the result of the blending equation are clamped.  For
       *  floating-point color buffers, no clamping occurs."
       *
       * So, generally, we want clamping to the render target's range.
       * And, good news, the hardware tables for both pre- and
       * post-blend color clamping are either ignored, or any are
       * allowed, or clamping is required but RT range clamping is a
       * valid option.
       */
      entry.PreBlendColorClampEnable = true;
      entry.PostBlendColorClampEnable = true;
      entry.ColorClampRange = COLORCLAMP_RTFORMAT;

      entry.WriteDisableRed = !ctx->Color.ColorMask[i][0];
      entry.WriteDisableGreen = !ctx->Color.ColorMask[i][1];
      entry.WriteDisableBlue = !ctx->Color.ColorMask[i][2];
      entry.WriteDisableAlpha = !ctx->Color.ColorMask[i][3];

      /* From the BLEND_STATE docs, DWord 0, Bit 29 (AlphaToOne Enable):
       * "If Dual Source Blending is enabled, this bit must be disabled."
       */
      WARN_ONCE(ctx->Color.Blend[i]._UsesDualSrc &&
                _mesa_is_multisample_enabled(ctx) &&
                ctx->Multisample.SampleAlphaToOne,
                "HW workaround: disabling alpha to one with dual src "
      if (ctx->Color.Blend[i]._UsesDualSrc)
         blend.AlphaToOneEnable = false;

      /* Presumably gen-gated pack variants: entries offset past the
       * BLEND_STATE header vs. packed flat (guards stripped).
       */
      GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[1 + i * 2], &entry);
      GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[i * 2], &entry);

   GENX(BLEND_STATE_pack)(NULL, blend_map, &blend);

   /* Point the hardware at the new blend state; presumably the two packet
    * forms below were gen-gated alternatives (guards stripped).
    */
   brw_batch_emit(brw, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
      ptr.PointertoBLEND_STATE = brw->cc.blend_state_offset;
      ptr.BLEND_STATEChange = true;

   brw_batch_emit(brw, GENX(3DSTATE_BLEND_STATE_POINTERS), ptr) {
      ptr.BlendStatePointer = brw->cc.blend_state_offset;
      ptr.BlendStatePointerValid = true;
/* Atom descriptor for blend state upload.
 *
 * NOTE(review): the ".dirty = { ... }" wrapper and some OR'd dirty bits
 * appear truncated — confirm against the original file.
 */
static const struct brw_tracked_state genX(blend_state) = {
   .mesa = _NEW_BUFFERS |
   .brw = BRW_NEW_BATCH |
          BRW_NEW_STATE_BASE_ADDRESS,
   .emit = genX(upload_blend_state),
1671 /* ---------------------------------------------------------------------- */
/* Upload 3DSTATE_SBE (setup-backend / attribute interpolation state), and
 * on gens that need it, 3DSTATE_SBE_SWIZ with the attribute overrides.
 *
 * NOTE(review): this block lost its "static void" return-type line, its
 * braces, and #if GEN_GEN guards in extraction — the local
 * attr_overrides array vs. the "#define attr_overrides sbe.Attribute"
 * alias below were presumably alternatives, and the call to
 * calculate_attr_overrides appears to be missing its attr_overrides
 * argument line.  Confirm against the original file.
 */
genX(upload_sbe)(struct brw_context *brw)
   struct gl_context *ctx = &brw->ctx;
   /* BRW_NEW_FS_PROG_DATA */
   const struct brw_wm_prog_data *wm_prog_data =
      brw_wm_prog_data(brw->wm.base.prog_data);
   struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attr_overrides[16] = { { 0 } };
#define attr_overrides sbe.Attribute
   uint32_t urb_entry_read_length;
   uint32_t urb_entry_read_offset;
   uint32_t point_sprite_enables;

   brw_batch_emit(brw, GENX(3DSTATE_SBE), sbe) {
      sbe.AttributeSwizzleEnable = true;
      sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;

      /* _NEW_BUFFERS (presumably) */
      bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);

      /* Window coordinates in an FBO are inverted, which means point
       * sprite origin must be inverted.
       * NOTE(review): the "else" between the two origin assignments
       * appears to have been dropped.
       */
      if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) != render_to_fbo)
         sbe.PointSpriteTextureCoordinateOrigin = LOWERLEFT;
         sbe.PointSpriteTextureCoordinateOrigin = UPPERLEFT;

      /* _NEW_POINT | _NEW_LIGHT | _NEW_PROGRAM,
       * BRW_NEW_FS_PROG_DATA | BRW_NEW_FRAGMENT_PROGRAM |
       * BRW_NEW_GS_PROG_DATA | BRW_NEW_PRIMITIVE | BRW_NEW_TES_PROG_DATA |
       * BRW_NEW_VUE_MAP_GEOM_OUT
       */
      genX(calculate_attr_overrides)(brw,
                                     &point_sprite_enables,
                                     &urb_entry_read_length,
                                     &urb_entry_read_offset);

      /* Typically, the URB entry read length and offset should be programmed
       * in 3DSTATE_VS and 3DSTATE_GS; SBE inherits it from the last active
       * stage which produces geometry.  However, we don't know the proper
       * value until we call calculate_attr_overrides().
       *
       * To fit with our existing code, we override the inherited values and
       * specify it here directly, as we did on previous generations.
       */
      sbe.VertexURBEntryReadLength = urb_entry_read_length;
      sbe.VertexURBEntryReadOffset = urb_entry_read_offset;
      sbe.PointSpriteTextureCoordinateEnable = point_sprite_enables;
      sbe.ConstantInterpolationEnable = wm_prog_data->flat_inputs;

      /* Presumably Gen8+-only (guards stripped). */
      sbe.ForceVertexURBEntryReadLength = true;
      sbe.ForceVertexURBEntryReadOffset = true;

      /* prepare the active component dwords */
      int input_index = 0;
      for (int attr = 0; attr < VARYING_SLOT_MAX; attr++) {
         if (!(brw->fragment_program->info.inputs_read &
               BITFIELD64_BIT(attr))) {

         assert(input_index < 32);

         sbe.AttributeActiveComponentFormat[input_index] = ACTIVE_COMPONENT_XYZW;

   /* Presumably gen-gated: separate swizzle packet on older gens. */
   brw_batch_emit(brw, GENX(3DSTATE_SBE_SWIZ), sbes) {
      for (int i = 0; i < 16; i++)
         sbes.Attribute[i] = attr_overrides[i];
#undef attr_overrides
/* Atom descriptor for SBE state upload.
 *
 * NOTE(review): the ".dirty = { ... }" wrapper, some OR'd Mesa bits, and
 * the closing ": 0)" of the GEN_GEN == 7 ternary appear truncated.
 */
static const struct brw_tracked_state genX(sbe_state) = {
   .mesa = _NEW_BUFFERS |
   .brw = BRW_NEW_BLORP |
          BRW_NEW_FRAGMENT_PROGRAM |
          BRW_NEW_FS_PROG_DATA |
          BRW_NEW_GS_PROG_DATA |
          BRW_NEW_TES_PROG_DATA |
          BRW_NEW_VUE_MAP_GEOM_OUT |
          (GEN_GEN == 7 ? BRW_NEW_PRIMITIVE
   .emit = genX(upload_sbe),
1783 /* ---------------------------------------------------------------------- */
/**
 * Outputs the 3DSTATE_SO_DECL_LIST command.
 *
 * The data output is a series of 64-bit entries containing a SO_DECL per
 * stream.  We only have one stream of rendering coming out of the GS unit, so
 * we only emit stream 0 (low 16 bits) SO_DECLs.
 *
 * NOTE(review): this block lost its "static void" return-type line, the
 * declarations of `max_decls` and `dw`, and several closing braces in
 * extraction — confirm against the original file.
 */
genX(upload_3dstate_so_decl_list)(struct brw_context *brw,
                                  const struct brw_vue_map *vue_map)
   struct gl_context *ctx = &brw->ctx;
   /* BRW_NEW_TRANSFORM_FEEDBACK */
   struct gl_transform_feedback_object *xfb_obj =
      ctx->TransformFeedback.CurrentObject;
   const struct gl_transform_feedback_info *linked_xfb_info =
      xfb_obj->program->sh.LinkedTransformFeedback;
   struct GENX(SO_DECL) so_decl[MAX_VERTEX_STREAMS][128];
   int buffer_mask[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
   int next_offset[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
   int decls[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};

   STATIC_ASSERT(ARRAY_SIZE(so_decl[0]) >= MAX_PROGRAM_OUTPUTS);

   memset(so_decl, 0, sizeof(so_decl));

   /* Construct the list of SO_DECLs to be emitted.  The formatting of the
    * command feels strange -- each dword pair contains a SO_DECL per stream.
    */
   for (unsigned i = 0; i < linked_xfb_info->NumOutputs; i++) {
      int buffer = linked_xfb_info->Outputs[i].OutputBuffer;
      struct GENX(SO_DECL) decl = {0};
      int varying = linked_xfb_info->Outputs[i].OutputRegister;
      const unsigned components = linked_xfb_info->Outputs[i].NumComponents;
      unsigned component_mask = (1 << components) - 1;
      unsigned stream_id = linked_xfb_info->Outputs[i].StreamId;
      unsigned decl_buffer_slot = buffer;
      assert(stream_id < MAX_VERTEX_STREAMS);

      /* gl_PointSize is stored in VARYING_SLOT_PSIZ.w
       * gl_Layer is stored in VARYING_SLOT_PSIZ.y
       * gl_ViewportIndex is stored in VARYING_SLOT_PSIZ.z
       * NOTE(review): the final "else" before the ComponentOffset shift
       * appears to have been dropped.
       */
      if (varying == VARYING_SLOT_PSIZ) {
         assert(components == 1);
         component_mask <<= 3;
      } else if (varying == VARYING_SLOT_LAYER) {
         assert(components == 1);
         component_mask <<= 1;
      } else if (varying == VARYING_SLOT_VIEWPORT) {
         assert(components == 1);
         component_mask <<= 2;
         component_mask <<= linked_xfb_info->Outputs[i].ComponentOffset;

      buffer_mask[stream_id] |= 1 << buffer;

      decl.OutputBufferSlot = decl_buffer_slot;
      if (varying == VARYING_SLOT_LAYER || varying == VARYING_SLOT_VIEWPORT) {
         decl.RegisterIndex = vue_map->varying_to_slot[VARYING_SLOT_PSIZ];
         /* Ordinary varyings (presumably an "else" arm, brace dropped). */
         assert(vue_map->varying_to_slot[varying] >= 0);
         decl.RegisterIndex = vue_map->varying_to_slot[varying];
      decl.ComponentMask = component_mask;

      /* Mesa doesn't store entries for gl_SkipComponents in the Outputs[]
       * array.  Instead, it simply increments DstOffset for the following
       * input by the number of components that should be skipped.
       *
       * Our hardware is unusual in that it requires us to program SO_DECLs
       * for fake "hole" components, rather than simply taking the offset
       * for each real varying.  Each hole can have size 1, 2, 3, or 4; we
       * program as many size = 4 holes as we can, then a final hole to
       * accommodate the final 1, 2, or 3 remaining.
       */
      int skip_components =
         linked_xfb_info->Outputs[i].DstOffset - next_offset[buffer];

      next_offset[buffer] += skip_components;

      while (skip_components >= 4) {
         struct GENX(SO_DECL) *d = &so_decl[stream_id][decls[stream_id]++];
         d->OutputBufferSlot = decl_buffer_slot;
         d->ComponentMask = 0xf;
         skip_components -= 4;

      if (skip_components > 0) {
         struct GENX(SO_DECL) *d = &so_decl[stream_id][decls[stream_id]++];
         d->OutputBufferSlot = decl_buffer_slot;
         d->ComponentMask = (1 << skip_components) - 1;

      assert(linked_xfb_info->Outputs[i].DstOffset == next_offset[buffer]);

      next_offset[buffer] += components;

      so_decl[stream_id][decls[stream_id]++] = decl;

      if (decls[stream_id] > max_decls)
         max_decls = decls[stream_id];

   /* Emit the variable-length SO_DECL_LIST: 3 header dwords plus one dword
    * pair per decl.
    */
   dw = brw_batch_emitn(brw, GENX(3DSTATE_SO_DECL_LIST), 3 + 2 * max_decls,
                        .StreamtoBufferSelects0 = buffer_mask[0],
                        .StreamtoBufferSelects1 = buffer_mask[1],
                        .StreamtoBufferSelects2 = buffer_mask[2],
                        .StreamtoBufferSelects3 = buffer_mask[3],
                        .NumEntries0 = decls[0],
                        .NumEntries1 = decls[1],
                        .NumEntries2 = decls[2],
                        .NumEntries3 = decls[3]);

   for (int i = 0; i < max_decls; i++) {
      GENX(SO_DECL_ENTRY_pack)(
         brw, dw + 2 + i * 2,
         &(struct GENX(SO_DECL_ENTRY)) {
            .Stream0Decl = so_decl[0][i],
            .Stream1Decl = so_decl[1][i],
            .Stream2Decl = so_decl[2][i],
            .Stream3Decl = so_decl[3][i],
/* Emit 3DSTATE_SO_BUFFER for each of the four stream-output buffer slots,
 * binding the transform-feedback buffer objects (or disabling empty slots).
 *
 * NOTE(review): this block lost its "static void" return-type line,
 * braces, and the gen-gating preprocessor lines — the two
 * 3DSTATE_SO_BUFFER emit blocks below were presumably the "slot unbound"
 * and "slot bound" paths (with an intervening early-continue), and the
 * assignment of `bo` from intel_bufferobj_buffer() lost its left-hand
 * side.  Confirm against the original file.
 */
genX(upload_3dstate_so_buffers)(struct brw_context *brw)
   struct gl_context *ctx = &brw->ctx;
   /* BRW_NEW_TRANSFORM_FEEDBACK */
   struct gl_transform_feedback_object *xfb_obj =
      ctx->TransformFeedback.CurrentObject;
   const struct gl_transform_feedback_info *linked_xfb_info =
      xfb_obj->program->sh.LinkedTransformFeedback;
   struct brw_transform_feedback_object *brw_obj =
      (struct brw_transform_feedback_object *) xfb_obj;
   /* Presumably Gen8+ only: MOCS for the SO buffer (guard stripped). */
   uint32_t mocs_wb = brw->gen >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;

   /* Set up the up to 4 output buffers.  These are the ranges defined in the
    * gl_transform_feedback_object.
    */
   for (int i = 0; i < 4; i++) {
      struct intel_buffer_object *bufferobj =
         intel_buffer_object(xfb_obj->Buffers[i]);

      /* Unbound slot: emit a disabled SO_BUFFER (gating "if" presumably
       * dropped).
       */
      brw_batch_emit(brw, GENX(3DSTATE_SO_BUFFER), sob) {
         sob.SOBufferIndex = i;

      uint32_t start = xfb_obj->Offset[i];
      assert(start % 4 == 0);
      uint32_t end = ALIGN(start + xfb_obj->Size[i], 4);
      /* NOTE(review): result presumably assigned to a `struct brw_bo *bo`
       * whose declaration line was dropped.
       */
      intel_bufferobj_buffer(brw, bufferobj, start, end - start);
      assert(end <= bo->size);

      brw_batch_emit(brw, GENX(3DSTATE_SO_BUFFER), sob) {
         sob.SOBufferIndex = i;
         /* Pre-Gen8 form: explicit base/end addresses and pitch. */
         sob.SurfaceBaseAddress = render_bo(bo, start);
         sob.SurfacePitch = linked_xfb_info->Buffers[i].Stride * 4;
         sob.SurfaceEndAddress = render_bo(bo, end);
         /* Gen8+ form: enable bits, MOCS, size, and an offset BO the
          * hardware reads/writes the stream offset through.
          */
         sob.SOBufferEnable = true;
         sob.StreamOffsetWriteEnable = true;
         sob.StreamOutputBufferOffsetAddressEnable = true;
         sob.SOBufferMOCS = mocs_wb;

         sob.SurfaceSize = MAX2(xfb_obj->Size[i] / 4, 1) - 1;
         sob.StreamOutputBufferOffsetAddress =
            instruction_bo(brw_obj->offset_bo, i * sizeof(uint32_t));

         if (brw_obj->zero_offsets) {
            /* Zero out the offset and write that to offset_bo */
            sob.StreamOffset = 0;
            /* Use offset_bo as the "Stream Offset." (else arm; brace
             * presumably dropped)
             */
            sob.StreamOffset = 0xFFFFFFFF;

   /* Offsets have been (re)written; don't zero them again next time. */
   brw_obj->zero_offsets = false;
1986 query_active(struct gl_query_object
*q
)
1988 return q
&& q
->Active
;
/* Emit 3DSTATE_STREAMOUT, enabling the SOL unit and programming per-stream
 * buffer enables, pitches, and vertex read ranges when transform feedback
 * is active.
 *
 * NOTE(review): this block lost its "static void" return-type line,
 * braces, the gating "if (active)" around the body, and the gen-gating
 * preprocessor lines — several field groups below were presumably
 * GEN-conditional.  Confirm against the original file.
 */
genX(upload_3dstate_streamout)(struct brw_context *brw, bool active,
                               const struct brw_vue_map *vue_map)
   struct gl_context *ctx = &brw->ctx;
   /* BRW_NEW_TRANSFORM_FEEDBACK */
   struct gl_transform_feedback_object *xfb_obj =
      ctx->TransformFeedback.CurrentObject;

   brw_batch_emit(brw, GENX(3DSTATE_STREAMOUT), sos) {
      /* We always read the whole vertex (see comment further below). */
      int urb_entry_read_offset = 0;
      int urb_entry_read_length = (vue_map->num_slots + 1) / 2 -
         urb_entry_read_offset;

      sos.SOFunctionEnable = true;
      sos.SOStatisticsEnable = true;

      /* BRW_NEW_RASTERIZER_DISCARD */
      if (ctx->RasterDiscard) {
         if (!query_active(ctx->Query.PrimitivesGenerated[0])) {
            sos.RenderingDisable = true;
            perf_debug("Rasterizer discard with a GL_PRIMITIVES_GENERATED "
                       "query active relies on the clipper.");

      /* _NEW_LIGHT (presumably): provoking-vertex convention. */
      if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION)
         sos.ReorderMode = TRAILING;

      /* _NEW_TRANSFORM_FEEDBACK (presumably): which slots are bound. */
      sos.SOBufferEnable0 = xfb_obj->Buffers[0] != NULL;
      sos.SOBufferEnable1 = xfb_obj->Buffers[1] != NULL;
      sos.SOBufferEnable2 = xfb_obj->Buffers[2] != NULL;
      sos.SOBufferEnable3 = xfb_obj->Buffers[3] != NULL;

      const struct gl_transform_feedback_info *linked_xfb_info =
         xfb_obj->program->sh.LinkedTransformFeedback;
      /* Set buffer pitches; 0 means unbound. */
      if (xfb_obj->Buffers[0])
         sos.Buffer0SurfacePitch = linked_xfb_info->Buffers[0].Stride * 4;
      if (xfb_obj->Buffers[1])
         sos.Buffer1SurfacePitch = linked_xfb_info->Buffers[1].Stride * 4;
      if (xfb_obj->Buffers[2])
         sos.Buffer2SurfacePitch = linked_xfb_info->Buffers[2].Stride * 4;
      if (xfb_obj->Buffers[3])
         sos.Buffer3SurfacePitch = linked_xfb_info->Buffers[3].Stride * 4;

      /* We always read the whole vertex.  This could be reduced at some
       * point by reading less and offsetting the register index in the
       * SO_DECLs (comment truncated in extraction).
       */
      sos.Stream0VertexReadOffset = urb_entry_read_offset;
      sos.Stream0VertexReadLength = urb_entry_read_length - 1;
      sos.Stream1VertexReadOffset = urb_entry_read_offset;
      sos.Stream1VertexReadLength = urb_entry_read_length - 1;
      sos.Stream2VertexReadOffset = urb_entry_read_offset;
      sos.Stream2VertexReadLength = urb_entry_read_length - 1;
      sos.Stream3VertexReadOffset = urb_entry_read_offset;
      sos.Stream3VertexReadLength = urb_entry_read_length - 1;
/* Top-level streamout (SOL) state upload: when transform feedback is
 * active, program the SO buffers and decl list, then 3DSTATE_STREAMOUT.
 *
 * NOTE(review): this block lost its "static void" return-type line,
 * braces, and presumably an "if (active) {" gating the two nonpipelined
 * uploads below.  Confirm against the original file.
 */
genX(upload_sol)(struct brw_context *brw)
   struct gl_context *ctx = &brw->ctx;
   /* BRW_NEW_TRANSFORM_FEEDBACK */
   bool active = _mesa_is_xfb_active_and_unpaused(ctx);

   genX(upload_3dstate_so_buffers)(brw);

   /* BRW_NEW_VUE_MAP_GEOM_OUT */
   genX(upload_3dstate_so_decl_list)(brw, &brw->vue_map_geom_out);

   /* Finally, set up the SOL stage.  This command must always follow updates to
    * the nonpipelined SOL state (3DSTATE_SO_BUFFER, 3DSTATE_SO_DECL_LIST) or
    * MMIO register updates (current performed by the kernel at each batch
    * emit).
    */
   genX(upload_3dstate_streamout)(brw, active, &brw->vue_map_geom_out);
/* Atom descriptor for streamout (SOL) state upload.
 *
 * NOTE(review): the ".dirty = { ... }" wrapper and the .mesa dirty bits
 * appear truncated — confirm against the original file.
 */
static const struct brw_tracked_state genX(sol_state) = {
   .brw = BRW_NEW_BATCH |
          BRW_NEW_RASTERIZER_DISCARD |
          BRW_NEW_VUE_MAP_GEOM_OUT |
          BRW_NEW_TRANSFORM_FEEDBACK,
   .emit = genX(upload_sol),
2092 /* ---------------------------------------------------------------------- */
2095 genX(upload_ps
)(struct brw_context
*brw
)
2097 UNUSED
const struct gl_context
*ctx
= &brw
->ctx
;
2098 UNUSED
const struct gen_device_info
*devinfo
= &brw
->screen
->devinfo
;
2100 /* BRW_NEW_FS_PROG_DATA */
2101 const struct brw_wm_prog_data
*prog_data
=
2102 brw_wm_prog_data(brw
->wm
.base
.prog_data
);
2103 const struct brw_stage_state
*stage_state
= &brw
->wm
.base
;
2108 brw_batch_emit(brw
, GENX(3DSTATE_PS
), ps
) {
2109 /* Initialize the execution mask with VMask. Otherwise, derivatives are
2110 * incorrect for subspans where some of the pixels are unlit. We believe
2111 * the bit just didn't take effect in previous generations.
2113 ps
.VectorMaskEnable
= GEN_GEN
>= 8;
2116 DIV_ROUND_UP(CLAMP(stage_state
->sampler_count
, 0, 16), 4);
2118 /* BRW_NEW_FS_PROG_DATA */
2119 ps
.BindingTableEntryCount
= prog_data
->base
.binding_table
.size_bytes
/ 4;
2121 if (prog_data
->base
.use_alt_mode
)
2122 ps
.FloatingPointMode
= Alternate
;
2124 /* Haswell requires the sample mask to be set in this packet as well as
2125 * in 3DSTATE_SAMPLE_MASK; the values should match.
2128 /* _NEW_BUFFERS, _NEW_MULTISAMPLE */
2130 ps
.SampleMask
= gen6_determine_sample_mask(brw
);
2133 /* 3DSTATE_PS expects the number of threads per PSD, which is always 64;
2134 * it implicitly scales for different GT levels (which have some # of
2137 * In Gen8 the format is U8-2 whereas in Gen9 it is U8-1.
2140 ps
.MaximumNumberofThreadsPerPSD
= 64 - 1;
2142 ps
.MaximumNumberofThreadsPerPSD
= 64 - 2;
2144 ps
.MaximumNumberofThreads
= devinfo
->max_wm_threads
- 1;
2147 if (prog_data
->base
.nr_params
> 0)
2148 ps
.PushConstantEnable
= true;
2151 /* From the IVB PRM, volume 2 part 1, page 287:
2152 * "This bit is inserted in the PS payload header and made available to
2153 * the DataPort (either via the message header or via header bypass) to
2154 * indicate that oMask data (one or two phases) is included in Render
2155 * Target Write messages. If present, the oMask data is used to mask off
2158 ps
.oMaskPresenttoRenderTarget
= prog_data
->uses_omask
;
2160 /* The hardware wedges if you have this bit set but don't turn on any
2161 * dual source blend factors.
2163 * BRW_NEW_FS_PROG_DATA | _NEW_COLOR
2165 ps
.DualSourceBlendEnable
= prog_data
->dual_src_blend
&&
2166 (ctx
->Color
.BlendEnabled
& 1) &&
2167 ctx
->Color
.Blend
[0]._UsesDualSrc
;
2169 /* BRW_NEW_FS_PROG_DATA */
2170 ps
.AttributeEnable
= (prog_data
->num_varying_inputs
!= 0);
2173 /* From the documentation for this packet:
2174 * "If the PS kernel does not need the Position XY Offsets to
2175 * compute a Position Value, then this field should be programmed
2176 * to POSOFFSET_NONE."
2178 * "SW Recommendation: If the PS kernel needs the Position Offsets
2179 * to compute a Position XY value, this field should match Position
2180 * ZW Interpolation Mode to ensure a consistent position.xyzw
2183 * We only require XY sample offsets. So, this recommendation doesn't
2184 * look useful at the moment. We might need this in future.
2186 if (prog_data
->uses_pos_offset
)
2187 ps
.PositionXYOffsetSelect
= POSOFFSET_SAMPLE
;
2189 ps
.PositionXYOffsetSelect
= POSOFFSET_NONE
;
2191 ps
.RenderTargetFastClearEnable
= brw
->wm
.fast_clear_op
;
2192 ps
._8PixelDispatchEnable
= prog_data
->dispatch_8
;
2193 ps
._16PixelDispatchEnable
= prog_data
->dispatch_16
;
2194 ps
.DispatchGRFStartRegisterForConstantSetupData0
=
2195 prog_data
->base
.dispatch_grf_start_reg
;
2196 ps
.DispatchGRFStartRegisterForConstantSetupData2
=
2197 prog_data
->dispatch_grf_start_reg_2
;
2199 ps
.KernelStartPointer0
= stage_state
->prog_offset
;
2200 ps
.KernelStartPointer2
= stage_state
->prog_offset
+
2201 prog_data
->prog_offset_2
;
2203 if (prog_data
->base
.total_scratch
) {
2204 ps
.ScratchSpaceBasePointer
=
2205 render_bo(stage_state
->scratch_bo
,
2206 ffs(stage_state
->per_thread_scratch
) - 11);
2211 static const struct brw_tracked_state
genX(ps_state
) = {
2213 .mesa
= _NEW_MULTISAMPLE
|
2214 (GEN_GEN
< 8 ? _NEW_BUFFERS
|
2217 .brw
= BRW_NEW_BATCH
|
2219 BRW_NEW_FS_PROG_DATA
,
2221 .emit
= genX(upload_ps
),
2224 /* ---------------------------------------------------------------------- */
2227 genX(upload_hs_state
)(struct brw_context
*brw
)
2229 const struct gen_device_info
*devinfo
= &brw
->screen
->devinfo
;
2230 struct brw_stage_state
*stage_state
= &brw
->tcs
.base
;
2231 struct brw_stage_prog_data
*stage_prog_data
= stage_state
->prog_data
;
2232 const struct brw_vue_prog_data
*vue_prog_data
=
2233 brw_vue_prog_data(stage_prog_data
);
2235 /* BRW_NEW_TES_PROG_DATA */
2236 struct brw_tcs_prog_data
*tcs_prog_data
=
2237 brw_tcs_prog_data(stage_prog_data
);
2239 if (!tcs_prog_data
) {
2240 brw_batch_emit(brw
, GENX(3DSTATE_HS
), hs
);
2242 brw_batch_emit(brw
, GENX(3DSTATE_HS
), hs
) {
2243 INIT_THREAD_DISPATCH_FIELDS(hs
, Vertex
);
2245 hs
.InstanceCount
= tcs_prog_data
->instances
- 1;
2246 hs
.IncludeVertexHandles
= true;
2248 hs
.MaximumNumberofThreads
= devinfo
->max_tcs_threads
- 1;
2253 static const struct brw_tracked_state
genX(hs_state
) = {
2256 .brw
= BRW_NEW_BATCH
|
2258 BRW_NEW_TCS_PROG_DATA
|
2259 BRW_NEW_TESS_PROGRAMS
,
2261 .emit
= genX(upload_hs_state
),
2265 genX(upload_ds_state
)(struct brw_context
*brw
)
2267 const struct gen_device_info
*devinfo
= &brw
->screen
->devinfo
;
2268 const struct brw_stage_state
*stage_state
= &brw
->tes
.base
;
2269 struct brw_stage_prog_data
*stage_prog_data
= stage_state
->prog_data
;
2271 /* BRW_NEW_TES_PROG_DATA */
2272 const struct brw_tes_prog_data
*tes_prog_data
=
2273 brw_tes_prog_data(stage_prog_data
);
2274 const struct brw_vue_prog_data
*vue_prog_data
=
2275 brw_vue_prog_data(stage_prog_data
);
2277 if (!tes_prog_data
) {
2278 brw_batch_emit(brw
, GENX(3DSTATE_DS
), ds
);
2280 brw_batch_emit(brw
, GENX(3DSTATE_DS
), ds
) {
2281 INIT_THREAD_DISPATCH_FIELDS(ds
, Patch
);
2283 ds
.MaximumNumberofThreads
= devinfo
->max_tes_threads
- 1;
2284 ds
.ComputeWCoordinateEnable
=
2285 tes_prog_data
->domain
== BRW_TESS_DOMAIN_TRI
;
2288 if (vue_prog_data
->dispatch_mode
== DISPATCH_MODE_SIMD8
)
2289 ds
.DispatchMode
= DISPATCH_MODE_SIMD8_SINGLE_PATCH
;
2290 ds
.UserClipDistanceCullTestEnableBitmask
=
2291 vue_prog_data
->cull_distance_mask
;
2297 static const struct brw_tracked_state
genX(ds_state
) = {
2300 .brw
= BRW_NEW_BATCH
|
2302 BRW_NEW_TESS_PROGRAMS
|
2303 BRW_NEW_TES_PROG_DATA
,
2305 .emit
= genX(upload_ds_state
),
2310 /* ---------------------------------------------------------------------- */
2314 genX(upload_raster
)(struct brw_context
*brw
)
2316 struct gl_context
*ctx
= &brw
->ctx
;
2319 bool render_to_fbo
= _mesa_is_user_fbo(ctx
->DrawBuffer
);
2322 struct gl_polygon_attrib
*polygon
= &ctx
->Polygon
;
2325 struct gl_point_attrib
*point
= &ctx
->Point
;
2327 brw_batch_emit(brw
, GENX(3DSTATE_RASTER
), raster
) {
2328 if (polygon
->_FrontBit
== render_to_fbo
)
2329 raster
.FrontWinding
= CounterClockwise
;
2331 if (polygon
->CullFlag
) {
2332 switch (polygon
->CullFaceMode
) {
2334 raster
.CullMode
= CULLMODE_FRONT
;
2337 raster
.CullMode
= CULLMODE_BACK
;
2339 case GL_FRONT_AND_BACK
:
2340 raster
.CullMode
= CULLMODE_BOTH
;
2343 unreachable("not reached");
2346 raster
.CullMode
= CULLMODE_NONE
;
2349 point
->SmoothFlag
= raster
.SmoothPointEnable
;
2351 raster
.DXMultisampleRasterizationEnable
=
2352 _mesa_is_multisample_enabled(ctx
);
2354 raster
.GlobalDepthOffsetEnableSolid
= polygon
->OffsetFill
;
2355 raster
.GlobalDepthOffsetEnableWireframe
= polygon
->OffsetLine
;
2356 raster
.GlobalDepthOffsetEnablePoint
= polygon
->OffsetPoint
;
2358 switch (polygon
->FrontMode
) {
2360 raster
.FrontFaceFillMode
= FILL_MODE_SOLID
;
2363 raster
.FrontFaceFillMode
= FILL_MODE_WIREFRAME
;
2366 raster
.FrontFaceFillMode
= FILL_MODE_POINT
;
2369 unreachable("not reached");
2372 switch (polygon
->BackMode
) {
2374 raster
.BackFaceFillMode
= FILL_MODE_SOLID
;
2377 raster
.BackFaceFillMode
= FILL_MODE_WIREFRAME
;
2380 raster
.BackFaceFillMode
= FILL_MODE_POINT
;
2383 unreachable("not reached");
2387 raster
.AntialiasingEnable
= ctx
->Line
.SmoothFlag
;
2390 raster
.ScissorRectangleEnable
= ctx
->Scissor
.EnableFlags
;
2392 /* _NEW_TRANSFORM */
2393 if (!ctx
->Transform
.DepthClamp
) {
2395 raster
.ViewportZFarClipTestEnable
= true;
2396 raster
.ViewportZNearClipTestEnable
= true;
2398 raster
.ViewportZClipTestEnable
= true;
2402 /* BRW_NEW_CONSERVATIVE_RASTERIZATION */
2404 raster
.ConservativeRasterizationEnable
=
2405 ctx
->IntelConservativeRasterization
;
2408 raster
.GlobalDepthOffsetClamp
= polygon
->OffsetClamp
;
2409 raster
.GlobalDepthOffsetScale
= polygon
->OffsetFactor
;
2411 raster
.GlobalDepthOffsetConstant
= polygon
->OffsetUnits
* 2;
2415 static const struct brw_tracked_state
genX(raster_state
) = {
2417 .mesa
= _NEW_BUFFERS
|
2424 .brw
= BRW_NEW_BLORP
|
2426 BRW_NEW_CONSERVATIVE_RASTERIZATION
,
2428 .emit
= genX(upload_raster
),
2431 /* ---------------------------------------------------------------------- */
2434 genX(upload_ps_extra
)(struct brw_context
*brw
)
2436 UNUSED
struct gl_context
*ctx
= &brw
->ctx
;
2438 const struct brw_wm_prog_data
*prog_data
=
2439 brw_wm_prog_data(brw
->wm
.base
.prog_data
);
2441 brw_batch_emit(brw
, GENX(3DSTATE_PS_EXTRA
), psx
) {
2442 psx
.PixelShaderValid
= true;
2443 psx
.PixelShaderComputedDepthMode
= prog_data
->computed_depth_mode
;
2444 psx
.PixelShaderKillsPixel
= prog_data
->uses_kill
;
2445 psx
.AttributeEnable
= prog_data
->num_varying_inputs
!= 0;
2446 psx
.PixelShaderUsesSourceDepth
= prog_data
->uses_src_depth
;
2447 psx
.PixelShaderUsesSourceW
= prog_data
->uses_src_w
;
2448 psx
.PixelShaderIsPerSample
= prog_data
->persample_dispatch
;
2450 /* _NEW_MULTISAMPLE | BRW_NEW_CONSERVATIVE_RASTERIZATION */
2451 if (prog_data
->uses_sample_mask
) {
2453 if (prog_data
->post_depth_coverage
)
2454 psx
.InputCoverageMaskState
= ICMS_DEPTH_COVERAGE
;
2455 else if (prog_data
->inner_coverage
&& ctx
->IntelConservativeRasterization
)
2456 psx
.InputCoverageMaskState
= ICMS_INNER_CONSERVATIVE
;
2458 psx
.InputCoverageMaskState
= ICMS_NORMAL
;
2460 psx
.PixelShaderUsesInputCoverageMask
= true;
2464 psx
.oMaskPresenttoRenderTarget
= prog_data
->uses_omask
;
2466 psx
.PixelShaderPullsBary
= prog_data
->pulls_bary
;
2467 psx
.PixelShaderComputesStencil
= prog_data
->computed_stencil
;
2470 /* The stricter cross-primitive coherency guarantees that the hardware
2471 * gives us with the "Accesses UAV" bit set for at least one shader stage
2472 * and the "UAV coherency required" bit set on the 3DPRIMITIVE command
2473 * are redundant within the current image, atomic counter and SSBO GL
2474 * APIs, which all have very loose ordering and coherency requirements
2475 * and generally rely on the application to insert explicit barriers when
2476 * a shader invocation is expected to see the memory writes performed by
2477 * the invocations of some previous primitive. Regardless of the value
2478 * of "UAV coherency required", the "Accesses UAV" bits will implicitly
2479 * cause an in most cases useless DC flush when the lowermost stage with
2480 * the bit set finishes execution.
2482 * It would be nice to disable it, but in some cases we can't because on
2483 * Gen8+ it also has an influence on rasterization via the PS UAV-only
2484 * signal (which could be set independently from the coherency mechanism
2485 * in the 3DSTATE_WM command on Gen7), and because in some cases it will
2486 * determine whether the hardware skips execution of the fragment shader
2487 * or not via the ThreadDispatchEnable signal. However if we know that
2488 * GEN8_PS_BLEND_HAS_WRITEABLE_RT is going to be set and
2489 * GEN8_PSX_PIXEL_SHADER_NO_RT_WRITE is not set it shouldn't make any
2490 * difference so we may just disable it here.
2492 * Gen8 hardware tries to compute ThreadDispatchEnable for us but doesn't
2493 * take into account KillPixels when no depth or stencil writes are
2494 * enabled. In order for occlusion queries to work correctly with no
2495 * attachments, we need to force-enable here.
2497 * BRW_NEW_FS_PROG_DATA | BRW_NEW_FRAGMENT_PROGRAM | _NEW_BUFFERS |
2500 if ((prog_data
->has_side_effects
|| prog_data
->uses_kill
) &&
2501 !brw_color_buffer_write_enabled(brw
))
2502 psx
.PixelShaderHasUAV
= true;
2506 const struct brw_tracked_state
genX(ps_extra
) = {
2508 .mesa
= _NEW_BUFFERS
| _NEW_COLOR
,
2509 .brw
= BRW_NEW_BLORP
|
2511 BRW_NEW_FRAGMENT_PROGRAM
|
2512 BRW_NEW_FS_PROG_DATA
|
2513 BRW_NEW_CONSERVATIVE_RASTERIZATION
,
2515 .emit
= genX(upload_ps_extra
),
2518 /* ---------------------------------------------------------------------- */
2521 genX(upload_ps_blend
)(struct brw_context
*brw
)
2523 struct gl_context
*ctx
= &brw
->ctx
;
2526 struct gl_renderbuffer
*rb
= ctx
->DrawBuffer
->_ColorDrawBuffers
[0];
2527 const bool buffer0_is_integer
= ctx
->DrawBuffer
->_IntegerBuffers
& 0x1;
2530 struct gl_colorbuffer_attrib
*color
= &ctx
->Color
;
2532 brw_batch_emit(brw
, GENX(3DSTATE_PS_BLEND
), pb
) {
2533 /* BRW_NEW_FRAGMENT_PROGRAM | _NEW_BUFFERS | _NEW_COLOR */
2534 pb
.HasWriteableRT
= brw_color_buffer_write_enabled(brw
);
2536 if (!buffer0_is_integer
) {
2537 /* _NEW_MULTISAMPLE */
2538 pb
.AlphaToCoverageEnable
=
2539 _mesa_is_multisample_enabled(ctx
) &&
2540 ctx
->Multisample
.SampleAlphaToCoverage
;
2542 pb
.AlphaTestEnable
= color
->AlphaEnabled
;
2545 /* Used for implementing the following bit of GL_EXT_texture_integer:
2546 * "Per-fragment operations that require floating-point color
2547 * components, including multisample alpha operations, alpha test,
2548 * blending, and dithering, have no effect when the corresponding
2549 * colors are written to an integer color buffer."
2551 * The OpenGL specification 3.3 (page 196), section 4.1.3 says:
2552 * "If drawbuffer zero is not NONE and the buffer it references has an
2553 * integer format, the SAMPLE_ALPHA_TO_COVERAGE and SAMPLE_ALPHA_TO_ONE
2554 * operations are skipped."
2556 if (rb
&& !buffer0_is_integer
&& (color
->BlendEnabled
& 1)) {
2557 GLenum eqRGB
= color
->Blend
[0].EquationRGB
;
2558 GLenum eqA
= color
->Blend
[0].EquationA
;
2559 GLenum srcRGB
= color
->Blend
[0].SrcRGB
;
2560 GLenum dstRGB
= color
->Blend
[0].DstRGB
;
2561 GLenum srcA
= color
->Blend
[0].SrcA
;
2562 GLenum dstA
= color
->Blend
[0].DstA
;
2564 if (eqRGB
== GL_MIN
|| eqRGB
== GL_MAX
)
2565 srcRGB
= dstRGB
= GL_ONE
;
2567 if (eqA
== GL_MIN
|| eqA
== GL_MAX
)
2568 srcA
= dstA
= GL_ONE
;
2570 /* Due to hardware limitations, the destination may have information
2571 * in an alpha channel even when the format specifies no alpha
2572 * channel. In order to avoid getting any incorrect blending due to
2573 * that alpha channel, coerce the blend factors to values that will
2574 * not read the alpha channel, but will instead use the correct
2575 * implicit value for alpha.
2577 if (!_mesa_base_format_has_channel(rb
->_BaseFormat
,
2578 GL_TEXTURE_ALPHA_TYPE
)) {
2579 srcRGB
= brw_fix_xRGB_alpha(srcRGB
);
2580 srcA
= brw_fix_xRGB_alpha(srcA
);
2581 dstRGB
= brw_fix_xRGB_alpha(dstRGB
);
2582 dstA
= brw_fix_xRGB_alpha(dstA
);
2585 pb
.ColorBufferBlendEnable
= true;
2586 pb
.SourceAlphaBlendFactor
= brw_translate_blend_factor(srcA
);
2587 pb
.DestinationAlphaBlendFactor
= brw_translate_blend_factor(dstA
);
2588 pb
.SourceBlendFactor
= brw_translate_blend_factor(srcRGB
);
2589 pb
.DestinationBlendFactor
= brw_translate_blend_factor(dstRGB
);
2591 pb
.IndependentAlphaBlendEnable
=
2592 srcA
!= srcRGB
|| dstA
!= dstRGB
|| eqA
!= eqRGB
;
2597 static const struct brw_tracked_state
genX(ps_blend
) = {
2599 .mesa
= _NEW_BUFFERS
|
2602 .brw
= BRW_NEW_BLORP
|
2604 BRW_NEW_FRAGMENT_PROGRAM
,
2606 .emit
= genX(upload_ps_blend
)
2610 /* ---------------------------------------------------------------------- */
2613 genX(init_atoms
)(struct brw_context
*brw
)
2616 static const struct brw_tracked_state
*render_atoms
[] =
2618 /* Once all the programs are done, we know how large urb entry
2619 * sizes need to be and can decide if we need to change the urb
2623 &brw_recalculate_urb_fence
,
2628 /* Surface state setup. Must come before the VS/WM unit. The binding
2629 * table upload must be last.
2631 &brw_vs_pull_constants
,
2632 &brw_wm_pull_constants
,
2633 &brw_renderbuffer_surfaces
,
2634 &brw_renderbuffer_read_surfaces
,
2635 &brw_texture_surfaces
,
2636 &brw_vs_binding_table
,
2637 &brw_wm_binding_table
,
2642 /* These set up state for brw_psp_urb_cbs */
2646 &brw_vs_unit
, /* always required, enabled or not */
2652 &brw_invariant_state
,
2654 &brw_binding_table_pointers
,
2655 &brw_blend_constant_color
,
2659 &brw_polygon_stipple
,
2660 &brw_polygon_stipple_offset
,
2667 &brw_indices
, /* must come before brw_vertices */
2671 &brw_constant_buffer
2674 static const struct brw_tracked_state
*render_atoms
[] =
2676 &genX(sf_clip_viewport
),
2678 /* Command packets: */
2681 &gen6_viewport_state
, /* must do after *_vp stages */
2684 &genX(blend_state
), /* must do before cc unit */
2685 &gen6_color_calc_state
, /* must do before cc unit */
2686 &gen6_depth_stencil_state
, /* must do before cc unit */
2688 &gen6_vs_push_constants
, /* Before vs_state */
2689 &gen6_gs_push_constants
, /* Before gs_state */
2690 &gen6_wm_push_constants
, /* Before wm_state */
2692 /* Surface state setup. Must come before the VS/WM unit. The binding
2693 * table upload must be last.
2695 &brw_vs_pull_constants
,
2696 &brw_vs_ubo_surfaces
,
2697 &brw_gs_pull_constants
,
2698 &brw_gs_ubo_surfaces
,
2699 &brw_wm_pull_constants
,
2700 &brw_wm_ubo_surfaces
,
2701 &gen6_renderbuffer_surfaces
,
2702 &brw_renderbuffer_read_surfaces
,
2703 &brw_texture_surfaces
,
2705 &brw_vs_binding_table
,
2706 &gen6_gs_binding_table
,
2707 &brw_wm_binding_table
,
2712 &gen6_sampler_state
,
2713 &gen6_multisample_state
,
2721 &gen6_scissor_state
,
2723 &gen6_binding_table_pointers
,
2727 &brw_polygon_stipple
,
2728 &brw_polygon_stipple_offset
,
2734 &brw_indices
, /* must come before brw_vertices */
2739 static const struct brw_tracked_state
*render_atoms
[] =
2741 /* Command packets: */
2744 &genX(sf_clip_viewport
),
2747 &gen7_push_constant_space
,
2749 &genX(blend_state
), /* must do before cc unit */
2750 &gen6_color_calc_state
, /* must do before cc unit */
2751 &genX(depth_stencil_state
), /* must do before cc unit */
2753 &brw_vs_image_surfaces
, /* Before vs push/pull constants and binding table */
2754 &brw_tcs_image_surfaces
, /* Before tcs push/pull constants and binding table */
2755 &brw_tes_image_surfaces
, /* Before tes push/pull constants and binding table */
2756 &brw_gs_image_surfaces
, /* Before gs push/pull constants and binding table */
2757 &brw_wm_image_surfaces
, /* Before wm push/pull constants and binding table */
2759 &gen6_vs_push_constants
, /* Before vs_state */
2760 &gen7_tcs_push_constants
,
2761 &gen7_tes_push_constants
,
2762 &gen6_gs_push_constants
, /* Before gs_state */
2763 &gen6_wm_push_constants
, /* Before wm_surfaces and constant_buffer */
2765 /* Surface state setup. Must come before the VS/WM unit. The binding
2766 * table upload must be last.
2768 &brw_vs_pull_constants
,
2769 &brw_vs_ubo_surfaces
,
2770 &brw_vs_abo_surfaces
,
2771 &brw_tcs_pull_constants
,
2772 &brw_tcs_ubo_surfaces
,
2773 &brw_tcs_abo_surfaces
,
2774 &brw_tes_pull_constants
,
2775 &brw_tes_ubo_surfaces
,
2776 &brw_tes_abo_surfaces
,
2777 &brw_gs_pull_constants
,
2778 &brw_gs_ubo_surfaces
,
2779 &brw_gs_abo_surfaces
,
2780 &brw_wm_pull_constants
,
2781 &brw_wm_ubo_surfaces
,
2782 &brw_wm_abo_surfaces
,
2783 &gen6_renderbuffer_surfaces
,
2784 &brw_renderbuffer_read_surfaces
,
2785 &brw_texture_surfaces
,
2786 &brw_vs_binding_table
,
2787 &brw_tcs_binding_table
,
2788 &brw_tes_binding_table
,
2789 &brw_gs_binding_table
,
2790 &brw_wm_binding_table
,
2797 &gen6_multisample_state
,
2811 &gen6_scissor_state
,
2815 &brw_polygon_stipple
,
2816 &brw_polygon_stipple_offset
,
2822 &brw_indices
, /* must come before brw_vertices */
2829 static const struct brw_tracked_state
*render_atoms
[] =
2832 &genX(sf_clip_viewport
),
2835 &gen7_push_constant_space
,
2838 &gen6_color_calc_state
,
2840 &brw_vs_image_surfaces
, /* Before vs push/pull constants and binding table */
2841 &brw_tcs_image_surfaces
, /* Before tcs push/pull constants and binding table */
2842 &brw_tes_image_surfaces
, /* Before tes push/pull constants and binding table */
2843 &brw_gs_image_surfaces
, /* Before gs push/pull constants and binding table */
2844 &brw_wm_image_surfaces
, /* Before wm push/pull constants and binding table */
2846 &gen6_vs_push_constants
, /* Before vs_state */
2847 &gen7_tcs_push_constants
,
2848 &gen7_tes_push_constants
,
2849 &gen6_gs_push_constants
, /* Before gs_state */
2850 &gen6_wm_push_constants
, /* Before wm_surfaces and constant_buffer */
2852 /* Surface state setup. Must come before the VS/WM unit. The binding
2853 * table upload must be last.
2855 &brw_vs_pull_constants
,
2856 &brw_vs_ubo_surfaces
,
2857 &brw_vs_abo_surfaces
,
2858 &brw_tcs_pull_constants
,
2859 &brw_tcs_ubo_surfaces
,
2860 &brw_tcs_abo_surfaces
,
2861 &brw_tes_pull_constants
,
2862 &brw_tes_ubo_surfaces
,
2863 &brw_tes_abo_surfaces
,
2864 &brw_gs_pull_constants
,
2865 &brw_gs_ubo_surfaces
,
2866 &brw_gs_abo_surfaces
,
2867 &brw_wm_pull_constants
,
2868 &brw_wm_ubo_surfaces
,
2869 &brw_wm_abo_surfaces
,
2870 &gen6_renderbuffer_surfaces
,
2871 &brw_renderbuffer_read_surfaces
,
2872 &brw_texture_surfaces
,
2873 &brw_vs_binding_table
,
2874 &brw_tcs_binding_table
,
2875 &brw_tes_binding_table
,
2876 &brw_gs_binding_table
,
2877 &brw_wm_binding_table
,
2884 &gen8_multisample_state
,
2893 &genX(raster_state
),
2899 &genX(depth_stencil_state
),
2902 &gen6_scissor_state
,
2906 &brw_polygon_stipple
,
2907 &brw_polygon_stipple_offset
,
2924 STATIC_ASSERT(ARRAY_SIZE(render_atoms
) <= ARRAY_SIZE(brw
->render_atoms
));
2925 brw_copy_pipeline_atoms(brw
, BRW_RENDER_PIPELINE
,
2926 render_atoms
, ARRAY_SIZE(render_atoms
));
2929 static const struct brw_tracked_state
*compute_atoms
[] =
2932 &brw_cs_image_surfaces
,
2933 &gen7_cs_push_constants
,
2934 &brw_cs_pull_constants
,
2935 &brw_cs_ubo_surfaces
,
2936 &brw_cs_abo_surfaces
,
2937 &brw_cs_texture_surfaces
,
2938 &brw_cs_work_groups_surface
,
2943 STATIC_ASSERT(ARRAY_SIZE(compute_atoms
) <= ARRAY_SIZE(brw
->compute_atoms
));
2944 brw_copy_pipeline_atoms(brw
, BRW_COMPUTE_PIPELINE
,
2945 compute_atoms
, ARRAY_SIZE(compute_atoms
));