i965: Port gen6+ 3DSTATE_VS to genxml.
[mesa.git] / src / mesa / drivers / dri / i965 / genX_state_upload.c
1 /*
2 * Copyright © 2017 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include <assert.h>
25
26 #include "common/gen_device_info.h"
27 #include "genxml/gen_macros.h"
28
29 #include "brw_context.h"
30 #if GEN_GEN == 6
31 #include "brw_defines.h"
32 #endif
33 #include "brw_state.h"
34 #include "brw_wm.h"
35 #include "brw_util.h"
36
37 #include "intel_batchbuffer.h"
38 #include "intel_buffer_objects.h"
39 #include "intel_fbo.h"
40
41 #include "main/fbobject.h"
42 #include "main/framebuffer.h"
43 #include "main/stencil.h"
44 #include "main/transformfeedback.h"
45
46 UNUSED static void *
47 emit_dwords(struct brw_context *brw, unsigned n)
48 {
49 intel_batchbuffer_begin(brw, n, RENDER_RING);
50 uint32_t *map = brw->batch.map_next;
51 brw->batch.map_next += n;
52 intel_batchbuffer_advance(brw);
53 return map;
54 }
55
56 struct brw_address {
57 struct brw_bo *bo;
58 uint32_t read_domains;
59 uint32_t write_domain;
60 uint32_t offset;
61 };
62
63 static uint64_t
64 emit_reloc(struct brw_context *brw,
65 void *location, struct brw_address address, uint32_t delta)
66 {
67 uint32_t offset = (char *) location - (char *) brw->batch.map;
68
69 return brw_emit_reloc(&brw->batch, offset, address.bo,
70 address.offset + delta,
71 address.read_domains,
72 address.write_domain);
73 }
74
75 #define __gen_address_type struct brw_address
76 #define __gen_user_data struct brw_context
77
78 static uint64_t
79 __gen_combine_address(struct brw_context *brw, void *location,
80 struct brw_address address, uint32_t delta)
81 {
82 if (address.bo == NULL) {
83 return address.offset + delta;
84 } else {
85 return emit_reloc(brw, location, address, delta);
86 }
87 }
88
89 static inline struct brw_address
90 render_bo(struct brw_bo *bo, uint32_t offset)
91 {
92 return (struct brw_address) {
93 .bo = bo,
94 .offset = offset,
95 .read_domains = I915_GEM_DOMAIN_RENDER,
96 .write_domain = I915_GEM_DOMAIN_RENDER,
97 };
98 }
99
100 static inline struct brw_address
101 instruction_bo(struct brw_bo *bo, uint32_t offset)
102 {
103 return (struct brw_address) {
104 .bo = bo,
105 .offset = offset,
106 .read_domains = I915_GEM_DOMAIN_INSTRUCTION,
107 .write_domain = I915_GEM_DOMAIN_INSTRUCTION,
108 };
109 }
110
111 #include "genxml/genX_pack.h"
112
113 #define _brw_cmd_length(cmd) cmd ## _length
114 #define _brw_cmd_length_bias(cmd) cmd ## _length_bias
115 #define _brw_cmd_header(cmd) cmd ## _header
116 #define _brw_cmd_pack(cmd) cmd ## _pack
117
118 #define brw_batch_emit(brw, cmd, name) \
119 for (struct cmd name = { _brw_cmd_header(cmd) }, \
120 *_dst = emit_dwords(brw, _brw_cmd_length(cmd)); \
121 __builtin_expect(_dst != NULL, 1); \
122 _brw_cmd_pack(cmd)(brw, (void *)_dst, &name), \
123 _dst = NULL)
124
125 #define brw_batch_emitn(brw, cmd, n, ...) ({ \
126 uint32_t *_dw = emit_dwords(brw, n); \
127 struct cmd template = { \
128 _brw_cmd_header(cmd), \
129 .DWordLength = n - _brw_cmd_length_bias(cmd), \
130 __VA_ARGS__ \
131 }; \
132 _brw_cmd_pack(cmd)(brw, _dw, &template); \
133 _dw + 1; /* Array starts at dw[1] */ \
134 })
135
136 #define brw_state_emit(brw, cmd, align, offset, name) \
137 for (struct cmd name = { 0, }, \
138 *_dst = brw_state_batch(brw, _brw_cmd_length(cmd) * 4, \
139 align, offset); \
140 __builtin_expect(_dst != NULL, 1); \
141 _brw_cmd_pack(cmd)(brw, (void *)_dst, &name), \
142 _dst = NULL)
143
144 #if GEN_GEN >= 6
145 /**
146 * Determine the appropriate attribute override value to store into the
147 * 3DSTATE_SF structure for a given fragment shader attribute. The attribute
148 * override value contains two pieces of information: the location of the
149 * attribute in the VUE (relative to urb_entry_read_offset, see below), and a
150 * flag indicating whether to "swizzle" the attribute based on the direction
151 * the triangle is facing.
152 *
153 * If an attribute is "swizzled", then the given VUE location is used for
154 * front-facing triangles, and the VUE location that immediately follows is
155 * used for back-facing triangles. We use this to implement the mapping from
156 * gl_FrontColor/gl_BackColor to gl_Color.
157 *
158 * urb_entry_read_offset is the offset into the VUE at which the SF unit is
159 * being instructed to begin reading attribute data. It can be set to a
160 * nonzero value to prevent the SF unit from wasting time reading elements of
161 * the VUE that are not needed by the fragment shader. It is measured in
162 * 256-bit increments.
163 */
164 static void
165 genX(get_attr_override)(struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr,
166 const struct brw_vue_map *vue_map,
167 int urb_entry_read_offset, int fs_attr,
168 bool two_side_color, uint32_t *max_source_attr)
169 {
170 /* Find the VUE slot for this attribute. */
171 int slot = vue_map->varying_to_slot[fs_attr];
172
173 /* Viewport and Layer are stored in the VUE header. We need to override
174 * them to zero if earlier stages didn't write them, as GL requires that
175 * they read back as zero when not explicitly set.
176 */
177 if (fs_attr == VARYING_SLOT_VIEWPORT || fs_attr == VARYING_SLOT_LAYER) {
178 attr->ComponentOverrideX = true;
179 attr->ComponentOverrideW = true;
180 attr->ConstantSource = CONST_0000;
181
182 if (!(vue_map->slots_valid & VARYING_BIT_LAYER))
183 attr->ComponentOverrideY = true;
184 if (!(vue_map->slots_valid & VARYING_BIT_VIEWPORT))
185 attr->ComponentOverrideZ = true;
186
187 return;
188 }
189
190 /* If there was only a back color written but not front, use back
191 * as the color instead of undefined
192 */
193 if (slot == -1 && fs_attr == VARYING_SLOT_COL0)
194 slot = vue_map->varying_to_slot[VARYING_SLOT_BFC0];
195 if (slot == -1 && fs_attr == VARYING_SLOT_COL1)
196 slot = vue_map->varying_to_slot[VARYING_SLOT_BFC1];
197
198 if (slot == -1) {
199 /* This attribute does not exist in the VUE--that means that the vertex
200 * shader did not write to it. This means that either:
201 *
202 * (a) This attribute is a texture coordinate, and it is going to be
203 * replaced with point coordinates (as a consequence of a call to
204 * glTexEnvi(GL_POINT_SPRITE, GL_COORD_REPLACE, GL_TRUE)), so the
205 * hardware will ignore whatever attribute override we supply.
206 *
207 * (b) This attribute is read by the fragment shader but not written by
208 * the vertex shader, so its value is undefined. Therefore the
209 * attribute override we supply doesn't matter.
210 *
211 * (c) This attribute is gl_PrimitiveID, and it wasn't written by the
212 * previous shader stage.
213 *
214 * Note that we don't have to worry about the cases where the attribute
215 * is gl_PointCoord or is undergoing point sprite coordinate
216 * replacement, because in those cases, this function isn't called.
217 *
218 * In case (c), we need to program the attribute overrides so that the
219 * primitive ID will be stored in this slot. In every other case, the
220 * attribute override we supply doesn't matter. So just go ahead and
221 * program primitive ID in every case.
222 */
223 attr->ComponentOverrideW = true;
224 attr->ComponentOverrideX = true;
225 attr->ComponentOverrideY = true;
226 attr->ComponentOverrideZ = true;
227 attr->ConstantSource = PRIM_ID;
228 return;
229 }
230
231 /* Compute the location of the attribute relative to urb_entry_read_offset.
232 * Each increment of urb_entry_read_offset represents a 256-bit value, so
233 * it counts for two 128-bit VUE slots.
234 */
235 int source_attr = slot - 2 * urb_entry_read_offset;
236 assert(source_attr >= 0 && source_attr < 32);
237
238 /* If we are doing two-sided color, and the VUE slot following this one
239 * represents a back-facing color, then we need to instruct the SF unit to
240 * do back-facing swizzling.
241 */
242 bool swizzling = two_side_color &&
243 ((vue_map->slot_to_varying[slot] == VARYING_SLOT_COL0 &&
244 vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC0) ||
245 (vue_map->slot_to_varying[slot] == VARYING_SLOT_COL1 &&
246 vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC1));
247
248 /* Update max_source_attr. If swizzling, the SF will read this slot + 1. */
249 if (*max_source_attr < source_attr + swizzling)
250 *max_source_attr = source_attr + swizzling;
251
252 attr->SourceAttribute = source_attr;
253 if (swizzling)
254 attr->SwizzleSelect = INPUTATTR_FACING;
255 }
256
257
258 static void
259 genX(calculate_attr_overrides)(const struct brw_context *brw,
260 struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr_overrides,
261 uint32_t *point_sprite_enables,
262 uint32_t *urb_entry_read_length,
263 uint32_t *urb_entry_read_offset)
264 {
265 const struct gl_context *ctx = &brw->ctx;
266
267 /* _NEW_POINT */
268 const struct gl_point_attrib *point = &ctx->Point;
269
270 /* BRW_NEW_FS_PROG_DATA */
271 const struct brw_wm_prog_data *wm_prog_data =
272 brw_wm_prog_data(brw->wm.base.prog_data);
273 uint32_t max_source_attr = 0;
274
275 *point_sprite_enables = 0;
276
277 /* BRW_NEW_FRAGMENT_PROGRAM
278 *
279 * If the fragment shader reads VARYING_SLOT_LAYER, then we need to pass in
280 * the full vertex header. Otherwise, we can program the SF to start
281 * reading at an offset of 1 (2 varying slots) to skip unnecessary data:
282 * - VARYING_SLOT_PSIZ and BRW_VARYING_SLOT_NDC on gen4-5
283 * - VARYING_SLOT_{PSIZ,LAYER} and VARYING_SLOT_POS on gen6+
284 */
285
286 bool fs_needs_vue_header = brw->fragment_program->info.inputs_read &
287 (VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT);
288
289 *urb_entry_read_offset = fs_needs_vue_header ? 0 : 1;
290
291 /* From the Ivybridge PRM, Vol 2 Part 1, 3DSTATE_SBE,
292 * description of dw10 Point Sprite Texture Coordinate Enable:
293 *
294 * "This field must be programmed to zero when non-point primitives
295 * are rendered."
296 *
297 * The SandyBridge PRM doesn't explicitly say that point sprite enables
298 * must be programmed to zero when rendering non-point primitives, but
299 * the IvyBridge PRM does, and if we don't, we get garbage.
300 *
301 * This is not required on Haswell, as the hardware ignores this state
302 * when drawing non-points -- although we do still need to be careful to
303 * correctly set the attr overrides.
304 *
305 * _NEW_POLYGON
306 * BRW_NEW_PRIMITIVE | BRW_NEW_GS_PROG_DATA | BRW_NEW_TES_PROG_DATA
307 */
308 bool drawing_points = brw_is_drawing_points(brw);
309
310 for (int attr = 0; attr < VARYING_SLOT_MAX; attr++) {
311 int input_index = wm_prog_data->urb_setup[attr];
312
313 if (input_index < 0)
314 continue;
315
316 /* _NEW_POINT */
317 bool point_sprite = false;
318 if (drawing_points) {
319 if (point->PointSprite &&
320 (attr >= VARYING_SLOT_TEX0 && attr <= VARYING_SLOT_TEX7) &&
321 (point->CoordReplace & (1u << (attr - VARYING_SLOT_TEX0)))) {
322 point_sprite = true;
323 }
324
325 if (attr == VARYING_SLOT_PNTC)
326 point_sprite = true;
327
328 if (point_sprite)
329 *point_sprite_enables |= (1 << input_index);
330 }
331
332 /* BRW_NEW_VUE_MAP_GEOM_OUT | _NEW_LIGHT | _NEW_PROGRAM */
333 struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attribute = { 0 };
334
335 if (!point_sprite) {
336 genX(get_attr_override)(&attribute,
337 &brw->vue_map_geom_out,
338 *urb_entry_read_offset, attr,
339 brw->ctx.VertexProgram._TwoSideEnabled,
340 &max_source_attr);
341 }
342
343 /* The hardware can only do the overrides on 16 overrides at a
344 * time, and the other up to 16 have to be lined up so that the
345 * input index = the output index. We'll need to do some
346 * tweaking to make sure that's the case.
347 */
348 if (input_index < 16)
349 attr_overrides[input_index] = attribute;
350 else
351 assert(attribute.SourceAttribute == input_index);
352 }
353
354 /* From the Sandy Bridge PRM, Volume 2, Part 1, documentation for
355 * 3DSTATE_SF DWord 1 bits 15:11, "Vertex URB Entry Read Length":
356 *
357 * "This field should be set to the minimum length required to read the
358 * maximum source attribute. The maximum source attribute is indicated
359 * by the maximum value of the enabled Attribute # Source Attribute if
360 * Attribute Swizzle Enable is set, Number of Output Attributes-1 if
361 * enable is not set.
362 * read_length = ceiling((max_source_attr + 1) / 2)
363 *
364 * [errata] Corruption/Hang possible if length programmed larger than
365 * recommended"
366 *
367 * Similar text exists for Ivy Bridge.
368 */
369 *urb_entry_read_length = DIV_ROUND_UP(max_source_attr + 1, 2);
370 }
371
372 /* ---------------------------------------------------------------------- */
373
374 static void
375 genX(upload_depth_stencil_state)(struct brw_context *brw)
376 {
377 struct gl_context *ctx = &brw->ctx;
378
379 /* _NEW_BUFFERS */
380 struct intel_renderbuffer *depth_irb =
381 intel_get_renderbuffer(ctx->DrawBuffer, BUFFER_DEPTH);
382
383 /* _NEW_DEPTH */
384 struct gl_depthbuffer_attrib *depth = &ctx->Depth;
385
386 /* _NEW_STENCIL */
387 struct gl_stencil_attrib *stencil = &ctx->Stencil;
388 const int b = stencil->_BackFace;
389
390 #if GEN_GEN >= 8
391 brw_batch_emit(brw, GENX(3DSTATE_WM_DEPTH_STENCIL), wmds) {
392 #else
393 uint32_t ds_offset;
394 brw_state_emit(brw, GENX(DEPTH_STENCIL_STATE), 64, &ds_offset, wmds) {
395 #endif
396 if (depth->Test && depth_irb) {
397 wmds.DepthTestEnable = true;
398 wmds.DepthBufferWriteEnable = brw_depth_writes_enabled(brw);
399 wmds.DepthTestFunction = intel_translate_compare_func(depth->Func);
400 }
401
402 if (stencil->_Enabled) {
403 wmds.StencilTestEnable = true;
404 wmds.StencilWriteMask = stencil->WriteMask[0] & 0xff;
405 wmds.StencilTestMask = stencil->ValueMask[0] & 0xff;
406
407 wmds.StencilTestFunction =
408 intel_translate_compare_func(stencil->Function[0]);
409 wmds.StencilFailOp =
410 intel_translate_stencil_op(stencil->FailFunc[0]);
411 wmds.StencilPassDepthPassOp =
412 intel_translate_stencil_op(stencil->ZPassFunc[0]);
413 wmds.StencilPassDepthFailOp =
414 intel_translate_stencil_op(stencil->ZFailFunc[0]);
415
416 wmds.StencilBufferWriteEnable = stencil->_WriteEnabled;
417
418 if (stencil->_TestTwoSide) {
419 wmds.DoubleSidedStencilEnable = true;
420 wmds.BackfaceStencilWriteMask = stencil->WriteMask[b] & 0xff;
421 wmds.BackfaceStencilTestMask = stencil->ValueMask[b] & 0xff;
422
423 wmds.BackfaceStencilTestFunction =
424 intel_translate_compare_func(stencil->Function[b]);
425 wmds.BackfaceStencilFailOp =
426 intel_translate_stencil_op(stencil->FailFunc[b]);
427 wmds.BackfaceStencilPassDepthPassOp =
428 intel_translate_stencil_op(stencil->ZPassFunc[b]);
429 wmds.BackfaceStencilPassDepthFailOp =
430 intel_translate_stencil_op(stencil->ZFailFunc[b]);
431 }
432
433 #if GEN_GEN >= 9
434 wmds.StencilReferenceValue = _mesa_get_stencil_ref(ctx, 0);
435 wmds.BackfaceStencilReferenceValue = _mesa_get_stencil_ref(ctx, b);
436 #endif
437 }
438 }
439
440 #if GEN_GEN == 6
441 brw_batch_emit(brw, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
442 ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
443 ptr.DEPTH_STENCIL_STATEChange = true;
444 }
445 #elif GEN_GEN == 7
446 brw_batch_emit(brw, GENX(3DSTATE_DEPTH_STENCIL_STATE_POINTERS), ptr) {
447 ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
448 }
449 #endif
450 }
451
452 static const struct brw_tracked_state genX(depth_stencil_state) = {
453 .dirty = {
454 .mesa = _NEW_BUFFERS |
455 _NEW_DEPTH |
456 _NEW_STENCIL,
457 .brw = BRW_NEW_BLORP |
458 (GEN_GEN >= 8 ? BRW_NEW_CONTEXT
459 : BRW_NEW_BATCH |
460 BRW_NEW_STATE_BASE_ADDRESS),
461 },
462 .emit = genX(upload_depth_stencil_state),
463 };
464
465 /* ---------------------------------------------------------------------- */
466
467 static void
468 genX(upload_clip_state)(struct brw_context *brw)
469 {
470 struct gl_context *ctx = &brw->ctx;
471
472 /* _NEW_BUFFERS */
473 struct gl_framebuffer *fb = ctx->DrawBuffer;
474
475 /* BRW_NEW_FS_PROG_DATA */
476 struct brw_wm_prog_data *wm_prog_data =
477 brw_wm_prog_data(brw->wm.base.prog_data);
478
479 brw_batch_emit(brw, GENX(3DSTATE_CLIP), clip) {
480 clip.StatisticsEnable = !brw->meta_in_progress;
481
482 if (wm_prog_data->barycentric_interp_modes &
483 BRW_BARYCENTRIC_NONPERSPECTIVE_BITS)
484 clip.NonPerspectiveBarycentricEnable = true;
485
486 #if GEN_GEN >= 7
487 clip.EarlyCullEnable = true;
488 #endif
489
490 #if GEN_GEN == 7
491 clip.FrontWinding = ctx->Polygon._FrontBit == _mesa_is_user_fbo(fb);
492
493 if (ctx->Polygon.CullFlag) {
494 switch (ctx->Polygon.CullFaceMode) {
495 case GL_FRONT:
496 clip.CullMode = CULLMODE_FRONT;
497 break;
498 case GL_BACK:
499 clip.CullMode = CULLMODE_BACK;
500 break;
501 case GL_FRONT_AND_BACK:
502 clip.CullMode = CULLMODE_BOTH;
503 break;
504 default:
505 unreachable("Should not get here: invalid CullFlag");
506 }
507 } else {
508 clip.CullMode = CULLMODE_NONE;
509 }
510 #endif
511
512 #if GEN_GEN < 8
513 clip.UserClipDistanceCullTestEnableBitmask =
514 brw_vue_prog_data(brw->vs.base.prog_data)->cull_distance_mask;
515
516 clip.ViewportZClipTestEnable = !ctx->Transform.DepthClamp;
517 #endif
518
519 /* _NEW_LIGHT */
520 if (ctx->Light.ProvokingVertex == GL_FIRST_VERTEX_CONVENTION) {
521 clip.TriangleStripListProvokingVertexSelect = 0;
522 clip.TriangleFanProvokingVertexSelect = 1;
523 clip.LineStripListProvokingVertexSelect = 0;
524 } else {
525 clip.TriangleStripListProvokingVertexSelect = 2;
526 clip.TriangleFanProvokingVertexSelect = 2;
527 clip.LineStripListProvokingVertexSelect = 1;
528 }
529
530 /* _NEW_TRANSFORM */
531 clip.UserClipDistanceClipTestEnableBitmask =
532 ctx->Transform.ClipPlanesEnabled;
533
534 #if GEN_GEN >= 8
535 clip.ForceUserClipDistanceClipTestEnableBitmask = true;
536 #endif
537
538 if (ctx->Transform.ClipDepthMode == GL_ZERO_TO_ONE)
539 clip.APIMode = APIMODE_D3D;
540 else
541 clip.APIMode = APIMODE_OGL;
542
543 clip.GuardbandClipTestEnable = true;
544
545 /* BRW_NEW_VIEWPORT_COUNT */
546 const unsigned viewport_count = brw->clip.viewport_count;
547
548 if (ctx->RasterDiscard) {
549 clip.ClipMode = CLIPMODE_REJECT_ALL;
550 #if GEN_GEN == 6
551 perf_debug("Rasterizer discard is currently implemented via the "
552 "clipper; having the GS not write primitives would "
553 "likely be faster.\n");
554 #endif
555 } else {
556 clip.ClipMode = CLIPMODE_NORMAL;
557 }
558
559 clip.ClipEnable = brw->primitive != _3DPRIM_RECTLIST;
560
561 /* _NEW_POLYGON,
562 * BRW_NEW_GEOMETRY_PROGRAM | BRW_NEW_TES_PROG_DATA | BRW_NEW_PRIMITIVE
563 */
564 if (!brw_is_drawing_points(brw) && !brw_is_drawing_lines(brw))
565 clip.ViewportXYClipTestEnable = true;
566
567 clip.MinimumPointWidth = 0.125;
568 clip.MaximumPointWidth = 255.875;
569 clip.MaximumVPIndex = viewport_count - 1;
570 if (_mesa_geometric_layers(fb) == 0)
571 clip.ForceZeroRTAIndexEnable = true;
572 }
573 }
574
575 static const struct brw_tracked_state genX(clip_state) = {
576 .dirty = {
577 .mesa = _NEW_BUFFERS |
578 _NEW_LIGHT |
579 _NEW_POLYGON |
580 _NEW_TRANSFORM,
581 .brw = BRW_NEW_BLORP |
582 BRW_NEW_CONTEXT |
583 BRW_NEW_FS_PROG_DATA |
584 BRW_NEW_GS_PROG_DATA |
585 BRW_NEW_VS_PROG_DATA |
586 BRW_NEW_META_IN_PROGRESS |
587 BRW_NEW_PRIMITIVE |
588 BRW_NEW_RASTERIZER_DISCARD |
589 BRW_NEW_TES_PROG_DATA |
590 BRW_NEW_VIEWPORT_COUNT,
591 },
592 .emit = genX(upload_clip_state),
593 };
594
595 /* ---------------------------------------------------------------------- */
596
597 static void
598 genX(upload_sf)(struct brw_context *brw)
599 {
600 struct gl_context *ctx = &brw->ctx;
601 float point_size;
602
603 #if GEN_GEN <= 7
604 /* _NEW_BUFFERS */
605 bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
606 const bool multisampled_fbo = _mesa_geometric_samples(ctx->DrawBuffer) > 1;
607 #endif
608
609 brw_batch_emit(brw, GENX(3DSTATE_SF), sf) {
610 sf.StatisticsEnable = true;
611 sf.ViewportTransformEnable = brw->sf.viewport_transform_enable;
612
613 #if GEN_GEN == 7
614 /* _NEW_BUFFERS */
615 sf.DepthBufferSurfaceFormat = brw_depthbuffer_format(brw);
616 #endif
617
618 #if GEN_GEN <= 7
619 /* _NEW_POLYGON */
620 sf.FrontWinding = ctx->Polygon._FrontBit == render_to_fbo;
621 sf.GlobalDepthOffsetEnableSolid = ctx->Polygon.OffsetFill;
622 sf.GlobalDepthOffsetEnableWireframe = ctx->Polygon.OffsetLine;
623 sf.GlobalDepthOffsetEnablePoint = ctx->Polygon.OffsetPoint;
624
625 switch (ctx->Polygon.FrontMode) {
626 case GL_FILL:
627 sf.FrontFaceFillMode = FILL_MODE_SOLID;
628 break;
629 case GL_LINE:
630 sf.FrontFaceFillMode = FILL_MODE_WIREFRAME;
631 break;
632 case GL_POINT:
633 sf.FrontFaceFillMode = FILL_MODE_POINT;
634 break;
635 default:
636 unreachable("not reached");
637 }
638
639 switch (ctx->Polygon.BackMode) {
640 case GL_FILL:
641 sf.BackFaceFillMode = FILL_MODE_SOLID;
642 break;
643 case GL_LINE:
644 sf.BackFaceFillMode = FILL_MODE_WIREFRAME;
645 break;
646 case GL_POINT:
647 sf.BackFaceFillMode = FILL_MODE_POINT;
648 break;
649 default:
650 unreachable("not reached");
651 }
652
653 sf.ScissorRectangleEnable = true;
654
655 if (ctx->Polygon.CullFlag) {
656 switch (ctx->Polygon.CullFaceMode) {
657 case GL_FRONT:
658 sf.CullMode = CULLMODE_FRONT;
659 break;
660 case GL_BACK:
661 sf.CullMode = CULLMODE_BACK;
662 break;
663 case GL_FRONT_AND_BACK:
664 sf.CullMode = CULLMODE_BOTH;
665 break;
666 default:
667 unreachable("not reached");
668 }
669 } else {
670 sf.CullMode = CULLMODE_NONE;
671 }
672
673 #if GEN_IS_HASWELL
674 sf.LineStippleEnable = ctx->Line.StippleFlag;
675 #endif
676
677 if (multisampled_fbo && ctx->Multisample.Enabled)
678 sf.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
679
680 sf.GlobalDepthOffsetConstant = ctx->Polygon.OffsetUnits * 2;
681 sf.GlobalDepthOffsetScale = ctx->Polygon.OffsetFactor;
682 sf.GlobalDepthOffsetClamp = ctx->Polygon.OffsetClamp;
683 #endif
684
685 /* _NEW_LINE */
686 sf.LineWidth = brw_get_line_width_float(brw);
687
688 if (ctx->Line.SmoothFlag) {
689 sf.LineEndCapAntialiasingRegionWidth = _10pixels;
690 #if GEN_GEN <= 7
691 sf.AntiAliasingEnable = true;
692 #endif
693 }
694
695 /* _NEW_POINT - Clamp to ARB_point_parameters user limits */
696 point_size = CLAMP(ctx->Point.Size, ctx->Point.MinSize, ctx->Point.MaxSize);
697 /* Clamp to the hardware limits */
698 sf.PointWidth = CLAMP(point_size, 0.125f, 255.875f);
699
700 /* _NEW_PROGRAM | _NEW_POINT, BRW_NEW_VUE_MAP_GEOM_OUT */
701 if (use_state_point_size(brw))
702 sf.PointWidthSource = State;
703
704 #if GEN_GEN >= 8
705 /* _NEW_POINT | _NEW_MULTISAMPLE */
706 if ((ctx->Point.SmoothFlag || _mesa_is_multisample_enabled(ctx)) &&
707 !ctx->Point.PointSprite)
708 sf.SmoothPointEnable = true;
709 #endif
710
711 sf.AALineDistanceMode = AALINEDISTANCE_TRUE;
712
713 /* _NEW_LIGHT */
714 if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION) {
715 sf.TriangleStripListProvokingVertexSelect = 2;
716 sf.TriangleFanProvokingVertexSelect = 2;
717 sf.LineStripListProvokingVertexSelect = 1;
718 } else {
719 sf.TriangleFanProvokingVertexSelect = 1;
720 }
721
722 #if GEN_GEN == 6
723 /* BRW_NEW_FS_PROG_DATA */
724 const struct brw_wm_prog_data *wm_prog_data =
725 brw_wm_prog_data(brw->wm.base.prog_data);
726
727 sf.AttributeSwizzleEnable = true;
728 sf.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
729
730 /*
731 * Window coordinates in an FBO are inverted, which means point
732 * sprite origin must be inverted, too.
733 */
734 if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) != render_to_fbo) {
735 sf.PointSpriteTextureCoordinateOrigin = LOWERLEFT;
736 } else {
737 sf.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
738 }
739
740 /* BRW_NEW_VUE_MAP_GEOM_OUT | BRW_NEW_FRAGMENT_PROGRAM |
741 * _NEW_POINT | _NEW_LIGHT | _NEW_PROGRAM | BRW_NEW_FS_PROG_DATA
742 */
743 uint32_t urb_entry_read_length;
744 uint32_t urb_entry_read_offset;
745 uint32_t point_sprite_enables;
746 genX(calculate_attr_overrides)(brw, sf.Attribute, &point_sprite_enables,
747 &urb_entry_read_length,
748 &urb_entry_read_offset);
749 sf.VertexURBEntryReadLength = urb_entry_read_length;
750 sf.VertexURBEntryReadOffset = urb_entry_read_offset;
751 sf.PointSpriteTextureCoordinateEnable = point_sprite_enables;
752 sf.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
753 #endif
754 }
755 }
756
757 static const struct brw_tracked_state genX(sf_state) = {
758 .dirty = {
759 .mesa = _NEW_LIGHT |
760 _NEW_LINE |
761 _NEW_MULTISAMPLE |
762 _NEW_POINT |
763 _NEW_PROGRAM |
764 (GEN_GEN <= 7 ? _NEW_BUFFERS | _NEW_POLYGON : 0),
765 .brw = BRW_NEW_BLORP |
766 BRW_NEW_CONTEXT |
767 BRW_NEW_VUE_MAP_GEOM_OUT |
768 (GEN_GEN <= 7 ? BRW_NEW_GS_PROG_DATA |
769 BRW_NEW_PRIMITIVE |
770 BRW_NEW_TES_PROG_DATA
771 : 0) |
772 (GEN_GEN == 6 ? BRW_NEW_FS_PROG_DATA |
773 BRW_NEW_FRAGMENT_PROGRAM
774 : 0),
775 },
776 .emit = genX(upload_sf),
777 };
778
779 /* ---------------------------------------------------------------------- */
780
781 static void
782 genX(upload_wm)(struct brw_context *brw)
783 {
784 struct gl_context *ctx = &brw->ctx;
785
786 /* BRW_NEW_FS_PROG_DATA */
787 const struct brw_wm_prog_data *wm_prog_data =
788 brw_wm_prog_data(brw->wm.base.prog_data);
789
790 UNUSED bool writes_depth =
791 wm_prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF;
792
793 #if GEN_GEN < 7
794 const struct brw_stage_state *stage_state = &brw->wm.base;
795 const struct gen_device_info *devinfo = &brw->screen->devinfo;
796
797 /* We can't fold this into gen6_upload_wm_push_constants(), because
798 * according to the SNB PRM, vol 2 part 1 section 7.2.2
799 * (3DSTATE_CONSTANT_PS [DevSNB]):
800 *
801 * "[DevSNB]: This packet must be followed by WM_STATE."
802 */
803 brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_PS), wmcp) {
804 if (wm_prog_data->base.nr_params != 0) {
805 wmcp.Buffer0Valid = true;
806 /* Pointer to the WM constant buffer. Covered by the set of
807 * state flags from gen6_upload_wm_push_constants.
808 */
809 wmcp.PointertoPSConstantBuffer0 = stage_state->push_const_offset;
810 wmcp.PSConstantBuffer0ReadLength = stage_state->push_const_size - 1;
811 }
812 }
813 #endif
814
815 brw_batch_emit(brw, GENX(3DSTATE_WM), wm) {
816 wm.StatisticsEnable = true;
817 wm.LineAntialiasingRegionWidth = _10pixels;
818 wm.LineEndCapAntialiasingRegionWidth = _05pixels;
819
820 #if GEN_GEN < 7
821 if (wm_prog_data->base.use_alt_mode)
822 wm.FloatingPointMode = Alternate;
823
824 wm.SamplerCount = DIV_ROUND_UP(stage_state->sampler_count, 4);
825 wm.BindingTableEntryCount = wm_prog_data->base.binding_table.size_bytes / 4;
826 wm.MaximumNumberofThreads = devinfo->max_wm_threads - 1;
827 wm._8PixelDispatchEnable = wm_prog_data->dispatch_8;
828 wm._16PixelDispatchEnable = wm_prog_data->dispatch_16;
829 wm.DispatchGRFStartRegisterForConstantSetupData0 =
830 wm_prog_data->base.dispatch_grf_start_reg;
831 wm.DispatchGRFStartRegisterForConstantSetupData2 =
832 wm_prog_data->dispatch_grf_start_reg_2;
833 wm.KernelStartPointer0 = stage_state->prog_offset;
834 wm.KernelStartPointer2 = stage_state->prog_offset +
835 wm_prog_data->prog_offset_2;
836 wm.DualSourceBlendEnable =
837 wm_prog_data->dual_src_blend && (ctx->Color.BlendEnabled & 1) &&
838 ctx->Color.Blend[0]._UsesDualSrc;
839 wm.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
840 wm.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
841
842 /* From the SNB PRM, volume 2 part 1, page 281:
843 * "If the PS kernel does not need the Position XY Offsets
844 * to compute a Position XY value, then this field should be
845 * programmed to POSOFFSET_NONE."
846 *
847 * "SW Recommendation: If the PS kernel needs the Position Offsets
848 * to compute a Position XY value, this field should match Position
849 * ZW Interpolation Mode to ensure a consistent position.xyzw
850 * computation."
851 * We only require XY sample offsets. So, this recommendation doesn't
852 * look useful at the moment. We might need this in future.
853 */
854 if (wm_prog_data->uses_pos_offset)
855 wm.PositionXYOffsetSelect = POSOFFSET_SAMPLE;
856 else
857 wm.PositionXYOffsetSelect = POSOFFSET_NONE;
858
859 if (wm_prog_data->base.total_scratch) {
860 wm.ScratchSpaceBasePointer =
861 render_bo(stage_state->scratch_bo,
862 ffs(stage_state->per_thread_scratch) - 11);
863 }
864
865 wm.PixelShaderComputedDepth = writes_depth;
866 #endif
867
868 wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
869
870 /* _NEW_LINE */
871 wm.LineStippleEnable = ctx->Line.StippleFlag;
872
873 /* _NEW_POLYGON */
874 wm.PolygonStippleEnable = ctx->Polygon.StippleFlag;
875 wm.BarycentricInterpolationMode = wm_prog_data->barycentric_interp_modes;
876
877 #if GEN_GEN < 8
878 /* _NEW_BUFFERS */
879 const bool multisampled_fbo = _mesa_geometric_samples(ctx->DrawBuffer) > 1;
880
881 wm.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
882 wm.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;
883 if (wm_prog_data->uses_kill ||
884 _mesa_is_alpha_test_enabled(ctx) ||
885 _mesa_is_alpha_to_coverage_enabled(ctx) ||
886 wm_prog_data->uses_omask) {
887 wm.PixelShaderKillsPixel = true;
888 }
889
890 /* _NEW_BUFFERS | _NEW_COLOR */
891 if (brw_color_buffer_write_enabled(brw) || writes_depth ||
892 wm_prog_data->has_side_effects || wm.PixelShaderKillsPixel) {
893 wm.ThreadDispatchEnable = true;
894 }
895 if (multisampled_fbo) {
896 /* _NEW_MULTISAMPLE */
897 if (ctx->Multisample.Enabled)
898 wm.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
899 else
900 wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
901
902 if (wm_prog_data->persample_dispatch)
903 wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
904 else
905 wm.MultisampleDispatchMode = MSDISPMODE_PERPIXEL;
906 } else {
907 wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
908 wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
909 }
910
911 #if GEN_GEN >= 7
912 wm.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
913 wm.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask;
914 #endif
915
916 /* The "UAV access enable" bits are unnecessary on HSW because they only
917 * seem to have an effect on the HW-assisted coherency mechanism which we
918 * don't need, and the rasterization-related UAV_ONLY flag and the
919 * DISPATCH_ENABLE bit can be set independently from it.
920 * C.f. gen8_upload_ps_extra().
921 *
922 * BRW_NEW_FRAGMENT_PROGRAM | BRW_NEW_FS_PROG_DATA | _NEW_BUFFERS |
923 * _NEW_COLOR
924 */
925 #if GEN_IS_HASWELL
926 if (!(brw_color_buffer_write_enabled(brw) || writes_depth) &&
927 wm_prog_data->has_side_effects)
928 wm.PSUAVonly = ON;
929 #endif
930 #endif
931
932 #if GEN_GEN >= 7
933 /* BRW_NEW_FS_PROG_DATA */
934 if (wm_prog_data->early_fragment_tests)
935 wm.EarlyDepthStencilControl = EDSC_PREPS;
936 else if (wm_prog_data->has_side_effects)
937 wm.EarlyDepthStencilControl = EDSC_PSEXEC;
938 #endif
939 }
940 }
941
942 static const struct brw_tracked_state genX(wm_state) = {
943 .dirty = {
944 .mesa = _NEW_LINE |
945 _NEW_POLYGON |
946 (GEN_GEN < 8 ? _NEW_BUFFERS |
947 _NEW_COLOR |
948 _NEW_MULTISAMPLE :
949 0) |
950 (GEN_GEN < 7 ? _NEW_PROGRAM_CONSTANTS : 0),
951 .brw = BRW_NEW_BLORP |
952 BRW_NEW_FS_PROG_DATA |
953 (GEN_GEN < 7 ? BRW_NEW_PUSH_CONSTANT_ALLOCATION |
954 BRW_NEW_BATCH
955 : BRW_NEW_CONTEXT),
956 },
957 .emit = genX(upload_wm),
958 };
959
960 /* ---------------------------------------------------------------------- */
961
962 #define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix) \
963 pkt.KernelStartPointer = stage_state->prog_offset; \
964 pkt.SamplerCount = \
965 DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4); \
966 pkt.BindingTableEntryCount = \
967 stage_prog_data->binding_table.size_bytes / 4; \
968 pkt.FloatingPointMode = stage_prog_data->use_alt_mode; \
969 \
970 if (stage_prog_data->total_scratch) { \
971 pkt.ScratchSpaceBasePointer = \
972 render_bo(stage_state->scratch_bo, 0); \
973 pkt.PerThreadScratchSpace = \
974 ffs(stage_state->per_thread_scratch) - 11; \
975 } \
976 \
977 pkt.DispatchGRFStartRegisterForURBData = \
978 stage_prog_data->dispatch_grf_start_reg; \
979 pkt.prefix##URBEntryReadLength = vue_prog_data->urb_read_length; \
980 pkt.prefix##URBEntryReadOffset = 0; \
981 \
982 pkt.StatisticsEnable = true; \
983 pkt.Enable = true;
984
985
986 static void
987 genX(upload_vs_state)(struct brw_context *brw)
988 {
989 const struct gen_device_info *devinfo = &brw->screen->devinfo;
990 const struct brw_stage_state *stage_state = &brw->vs.base;
991
992 /* BRW_NEW_VS_PROG_DATA */
993 const struct brw_vue_prog_data *vue_prog_data =
994 brw_vue_prog_data(brw->vs.base.prog_data);
995 const struct brw_stage_prog_data *stage_prog_data = &vue_prog_data->base;
996
997 assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8 ||
998 vue_prog_data->dispatch_mode == DISPATCH_MODE_4X2_DUAL_OBJECT);
999
1000 /* From the BSpec, 3D Pipeline > Geometry > Vertex Shader > State,
1001 * 3DSTATE_VS, Dword 5.0 "VS Function Enable":
1002 *
1003 * [DevSNB] A pipeline flush must be programmed prior to a 3DSTATE_VS
1004 * command that causes the VS Function Enable to toggle. Pipeline
1005 * flush can be executed by sending a PIPE_CONTROL command with CS
1006 * stall bit set and a post sync operation.
1007 *
1008 * We've already done such a flush at the start of state upload, so we
1009 * don't need to do another one here.
1010 */
1011
1012 #if GEN_GEN < 7
1013 brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_VS), cvs) {
1014 if (stage_state->push_const_size != 0) {
1015 cvs.Buffer0Valid = true;
1016 cvs.PointertoVSConstantBuffer0 = stage_state->push_const_offset;
1017 cvs.VSConstantBuffer0ReadLength = stage_state->push_const_size - 1;
1018 }
1019 }
1020 #endif
1021
1022 if (GEN_GEN == 7 && devinfo->is_ivybridge)
1023 gen7_emit_vs_workaround_flush(brw);
1024
1025 brw_batch_emit(brw, GENX(3DSTATE_VS), vs) {
1026 INIT_THREAD_DISPATCH_FIELDS(vs, Vertex);
1027
1028 vs.MaximumNumberofThreads = devinfo->max_vs_threads - 1;
1029
1030 #if GEN_GEN >= 8
1031 vs.SIMD8DispatchEnable =
1032 vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8;
1033
1034 vs.UserClipDistanceCullTestEnableBitmask =
1035 vue_prog_data->cull_distance_mask;
1036 #endif
1037 }
1038
1039 #if GEN_GEN < 7
1040 /* Based on my reading of the simulator, the VS constants don't get
1041 * pulled into the VS FF unit until an appropriate pipeline flush
1042 * happens, and instead the 3DSTATE_CONSTANT_VS packet just adds
1043 * references to them into a little FIFO. The flushes are common,
1044 * but don't reliably happen between this and a 3DPRIMITIVE, causing
1045 * the primitive to use the wrong constants. Then the FIFO
1046 * containing the constant setup gets added to again on the next
1047 * constants change, and eventually when a flush does happen the
1048 * unit is overwhelmed by constant changes and dies.
1049 *
1050 * To avoid this, send a PIPE_CONTROL down the line that will
1051 * update the unit immediately loading the constants. The flush
1052 * type bits here were those set by the STATE_BASE_ADDRESS whose
1053 * move in a82a43e8d99e1715dd11c9c091b5ab734079b6a6 triggered the
1054 * bug reports that led to this workaround, and may be more than
1055 * what is strictly required to avoid the issue.
1056 */
1057 brw_emit_pipe_control_flush(brw,
1058 PIPE_CONTROL_DEPTH_STALL |
1059 PIPE_CONTROL_INSTRUCTION_INVALIDATE |
1060 PIPE_CONTROL_STATE_CACHE_INVALIDATE);
1061 #endif
1062 }
1063
1064 static const struct brw_tracked_state genX(vs_state) = {
1065 .dirty = {
1066 .mesa = (GEN_GEN < 7 ? (_NEW_PROGRAM_CONSTANTS | _NEW_TRANSFORM) : 0),
1067 .brw = BRW_NEW_BATCH |
1068 BRW_NEW_BLORP |
1069 BRW_NEW_CONTEXT |
1070 BRW_NEW_VS_PROG_DATA |
1071 (GEN_GEN < 7 ? BRW_NEW_PUSH_CONSTANT_ALLOCATION |
1072 BRW_NEW_VERTEX_PROGRAM
1073 : 0),
1074 },
1075 .emit = genX(upload_vs_state),
1076 };
1077
1078 #endif
1079
1080 /* ---------------------------------------------------------------------- */
1081
1082 #if GEN_GEN >= 7
1083 static void
1084 genX(upload_sbe)(struct brw_context *brw)
1085 {
1086 struct gl_context *ctx = &brw->ctx;
1087 /* BRW_NEW_FS_PROG_DATA */
1088 const struct brw_wm_prog_data *wm_prog_data =
1089 brw_wm_prog_data(brw->wm.base.prog_data);
1090 #if GEN_GEN >= 8
1091 struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attr_overrides[16] = { { 0 } };
1092 #else
1093 #define attr_overrides sbe.Attribute
1094 #endif
1095 uint32_t urb_entry_read_length;
1096 uint32_t urb_entry_read_offset;
1097 uint32_t point_sprite_enables;
1098
1099 brw_batch_emit(brw, GENX(3DSTATE_SBE), sbe) {
1100 sbe.AttributeSwizzleEnable = true;
1101 sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
1102
1103 /* _NEW_BUFFERS */
1104 bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
1105
1106 /* _NEW_POINT
1107 *
1108 * Window coordinates in an FBO are inverted, which means point
1109 * sprite origin must be inverted.
1110 */
1111 if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) != render_to_fbo)
1112 sbe.PointSpriteTextureCoordinateOrigin = LOWERLEFT;
1113 else
1114 sbe.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
1115
1116 /* _NEW_POINT | _NEW_LIGHT | _NEW_PROGRAM,
1117 * BRW_NEW_FS_PROG_DATA | BRW_NEW_FRAGMENT_PROGRAM |
1118 * BRW_NEW_GS_PROG_DATA | BRW_NEW_PRIMITIVE | BRW_NEW_TES_PROG_DATA |
1119 * BRW_NEW_VUE_MAP_GEOM_OUT
1120 */
1121 genX(calculate_attr_overrides)(brw,
1122 attr_overrides,
1123 &point_sprite_enables,
1124 &urb_entry_read_length,
1125 &urb_entry_read_offset);
1126
1127 /* Typically, the URB entry read length and offset should be programmed
1128 * in 3DSTATE_VS and 3DSTATE_GS; SBE inherits it from the last active
1129 * stage which produces geometry. However, we don't know the proper
1130 * value until we call calculate_attr_overrides().
1131 *
1132 * To fit with our existing code, we override the inherited values and
1133 * specify it here directly, as we did on previous generations.
1134 */
1135 sbe.VertexURBEntryReadLength = urb_entry_read_length;
1136 sbe.VertexURBEntryReadOffset = urb_entry_read_offset;
1137 sbe.PointSpriteTextureCoordinateEnable = point_sprite_enables;
1138 sbe.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
1139
1140 #if GEN_GEN >= 8
1141 sbe.ForceVertexURBEntryReadLength = true;
1142 sbe.ForceVertexURBEntryReadOffset = true;
1143 #endif
1144
1145 #if GEN_GEN >= 9
1146 /* prepare the active component dwords */
1147 int input_index = 0;
1148 for (int attr = 0; attr < VARYING_SLOT_MAX; attr++) {
1149 if (!(brw->fragment_program->info.inputs_read &
1150 BITFIELD64_BIT(attr))) {
1151 continue;
1152 }
1153
1154 assert(input_index < 32);
1155
1156 sbe.AttributeActiveComponentFormat[input_index] = ACTIVE_COMPONENT_XYZW;
1157 ++input_index;
1158 }
1159 #endif
1160 }
1161
1162 #if GEN_GEN >= 8
1163 brw_batch_emit(brw, GENX(3DSTATE_SBE_SWIZ), sbes) {
1164 for (int i = 0; i < 16; i++)
1165 sbes.Attribute[i] = attr_overrides[i];
1166 }
1167 #endif
1168
1169 #undef attr_overrides
1170 }
1171
1172 static const struct brw_tracked_state genX(sbe_state) = {
1173 .dirty = {
1174 .mesa = _NEW_BUFFERS |
1175 _NEW_LIGHT |
1176 _NEW_POINT |
1177 _NEW_POLYGON |
1178 _NEW_PROGRAM,
1179 .brw = BRW_NEW_BLORP |
1180 BRW_NEW_CONTEXT |
1181 BRW_NEW_FRAGMENT_PROGRAM |
1182 BRW_NEW_FS_PROG_DATA |
1183 BRW_NEW_GS_PROG_DATA |
1184 BRW_NEW_TES_PROG_DATA |
1185 BRW_NEW_VUE_MAP_GEOM_OUT |
1186 (GEN_GEN == 7 ? BRW_NEW_PRIMITIVE
1187 : 0),
1188 },
1189 .emit = genX(upload_sbe),
1190 };
1191
1192 /* ---------------------------------------------------------------------- */
1193
1194 /**
1195 * Outputs the 3DSTATE_SO_DECL_LIST command.
1196 *
1197 * The data output is a series of 64-bit entries containing a SO_DECL per
1198 * stream. We only have one stream of rendering coming out of the GS unit, so
1199 * we only emit stream 0 (low 16 bits) SO_DECLs.
1200 */
1201 static void
1202 genX(upload_3dstate_so_decl_list)(struct brw_context *brw,
1203 const struct brw_vue_map *vue_map)
1204 {
1205 struct gl_context *ctx = &brw->ctx;
1206 /* BRW_NEW_TRANSFORM_FEEDBACK */
1207 struct gl_transform_feedback_object *xfb_obj =
1208 ctx->TransformFeedback.CurrentObject;
1209 const struct gl_transform_feedback_info *linked_xfb_info =
1210 xfb_obj->program->sh.LinkedTransformFeedback;
1211 struct GENX(SO_DECL) so_decl[MAX_VERTEX_STREAMS][128];
1212 int buffer_mask[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
1213 int next_offset[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
1214 int decls[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
1215 int max_decls = 0;
1216 STATIC_ASSERT(ARRAY_SIZE(so_decl[0]) >= MAX_PROGRAM_OUTPUTS);
1217
1218 memset(so_decl, 0, sizeof(so_decl));
1219
1220 /* Construct the list of SO_DECLs to be emitted. The formatting of the
1221 * command feels strange -- each dword pair contains a SO_DECL per stream.
1222 */
1223 for (unsigned i = 0; i < linked_xfb_info->NumOutputs; i++) {
1224 int buffer = linked_xfb_info->Outputs[i].OutputBuffer;
1225 struct GENX(SO_DECL) decl = {0};
1226 int varying = linked_xfb_info->Outputs[i].OutputRegister;
1227 const unsigned components = linked_xfb_info->Outputs[i].NumComponents;
1228 unsigned component_mask = (1 << components) - 1;
1229 unsigned stream_id = linked_xfb_info->Outputs[i].StreamId;
1230 unsigned decl_buffer_slot = buffer;
1231 assert(stream_id < MAX_VERTEX_STREAMS);
1232
1233 /* gl_PointSize is stored in VARYING_SLOT_PSIZ.w
1234 * gl_Layer is stored in VARYING_SLOT_PSIZ.y
1235 * gl_ViewportIndex is stored in VARYING_SLOT_PSIZ.z
1236 */
1237 if (varying == VARYING_SLOT_PSIZ) {
1238 assert(components == 1);
1239 component_mask <<= 3;
1240 } else if (varying == VARYING_SLOT_LAYER) {
1241 assert(components == 1);
1242 component_mask <<= 1;
1243 } else if (varying == VARYING_SLOT_VIEWPORT) {
1244 assert(components == 1);
1245 component_mask <<= 2;
1246 } else {
1247 component_mask <<= linked_xfb_info->Outputs[i].ComponentOffset;
1248 }
1249
1250 buffer_mask[stream_id] |= 1 << buffer;
1251
1252 decl.OutputBufferSlot = decl_buffer_slot;
1253 if (varying == VARYING_SLOT_LAYER || varying == VARYING_SLOT_VIEWPORT) {
1254 decl.RegisterIndex = vue_map->varying_to_slot[VARYING_SLOT_PSIZ];
1255 } else {
1256 assert(vue_map->varying_to_slot[varying] >= 0);
1257 decl.RegisterIndex = vue_map->varying_to_slot[varying];
1258 }
1259 decl.ComponentMask = component_mask;
1260
1261 /* Mesa doesn't store entries for gl_SkipComponents in the Outputs[]
1262 * array. Instead, it simply increments DstOffset for the following
1263 * input by the number of components that should be skipped.
1264 *
1265 * Our hardware is unusual in that it requires us to program SO_DECLs
1266 * for fake "hole" components, rather than simply taking the offset
1267 * for each real varying. Each hole can have size 1, 2, 3, or 4; we
1268 * program as many size = 4 holes as we can, then a final hole to
1269 * accommodate the final 1, 2, or 3 remaining.
1270 */
1271 int skip_components =
1272 linked_xfb_info->Outputs[i].DstOffset - next_offset[buffer];
1273
1274 next_offset[buffer] += skip_components;
1275
1276 while (skip_components >= 4) {
1277 struct GENX(SO_DECL) *d = &so_decl[stream_id][decls[stream_id]++];
1278 d->HoleFlag = 1;
1279 d->OutputBufferSlot = decl_buffer_slot;
1280 d->ComponentMask = 0xf;
1281 skip_components -= 4;
1282 }
1283
1284 if (skip_components > 0) {
1285 struct GENX(SO_DECL) *d = &so_decl[stream_id][decls[stream_id]++];
1286 d->HoleFlag = 1;
1287 d->OutputBufferSlot = decl_buffer_slot;
1288 d->ComponentMask = (1 << skip_components) - 1;
1289 }
1290
1291 assert(linked_xfb_info->Outputs[i].DstOffset == next_offset[buffer]);
1292
1293 next_offset[buffer] += components;
1294
1295 so_decl[stream_id][decls[stream_id]++] = decl;
1296
1297 if (decls[stream_id] > max_decls)
1298 max_decls = decls[stream_id];
1299 }
1300
1301 uint32_t *dw;
1302 dw = brw_batch_emitn(brw, GENX(3DSTATE_SO_DECL_LIST), 3 + 2 * max_decls,
1303 .StreamtoBufferSelects0 = buffer_mask[0],
1304 .StreamtoBufferSelects1 = buffer_mask[1],
1305 .StreamtoBufferSelects2 = buffer_mask[2],
1306 .StreamtoBufferSelects3 = buffer_mask[3],
1307 .NumEntries0 = decls[0],
1308 .NumEntries1 = decls[1],
1309 .NumEntries2 = decls[2],
1310 .NumEntries3 = decls[3]);
1311
1312 for (int i = 0; i < max_decls; i++) {
1313 GENX(SO_DECL_ENTRY_pack)(
1314 brw, dw + 2 + i * 2,
1315 &(struct GENX(SO_DECL_ENTRY)) {
1316 .Stream0Decl = so_decl[0][i],
1317 .Stream1Decl = so_decl[1][i],
1318 .Stream2Decl = so_decl[2][i],
1319 .Stream3Decl = so_decl[3][i],
1320 });
1321 }
1322 }
1323
1324 static void
1325 genX(upload_3dstate_so_buffers)(struct brw_context *brw)
1326 {
1327 struct gl_context *ctx = &brw->ctx;
1328 /* BRW_NEW_TRANSFORM_FEEDBACK */
1329 struct gl_transform_feedback_object *xfb_obj =
1330 ctx->TransformFeedback.CurrentObject;
1331 #if GEN_GEN < 8
1332 const struct gl_transform_feedback_info *linked_xfb_info =
1333 xfb_obj->program->sh.LinkedTransformFeedback;
1334 #else
1335 struct brw_transform_feedback_object *brw_obj =
1336 (struct brw_transform_feedback_object *) xfb_obj;
1337 uint32_t mocs_wb = brw->gen >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
1338 #endif
1339
1340 /* Set up the up to 4 output buffers. These are the ranges defined in the
1341 * gl_transform_feedback_object.
1342 */
1343 for (int i = 0; i < 4; i++) {
1344 struct intel_buffer_object *bufferobj =
1345 intel_buffer_object(xfb_obj->Buffers[i]);
1346
1347 if (!bufferobj) {
1348 brw_batch_emit(brw, GENX(3DSTATE_SO_BUFFER), sob) {
1349 sob.SOBufferIndex = i;
1350 }
1351 continue;
1352 }
1353
1354 uint32_t start = xfb_obj->Offset[i];
1355 assert(start % 4 == 0);
1356 uint32_t end = ALIGN(start + xfb_obj->Size[i], 4);
1357 struct brw_bo *bo =
1358 intel_bufferobj_buffer(brw, bufferobj, start, end - start);
1359 assert(end <= bo->size);
1360
1361 brw_batch_emit(brw, GENX(3DSTATE_SO_BUFFER), sob) {
1362 sob.SOBufferIndex = i;
1363
1364 sob.SurfaceBaseAddress = render_bo(bo, start);
1365 #if GEN_GEN < 8
1366 sob.SurfacePitch = linked_xfb_info->Buffers[i].Stride * 4;
1367 sob.SurfaceEndAddress = render_bo(bo, end);
1368 #else
1369 sob.SOBufferEnable = true;
1370 sob.StreamOffsetWriteEnable = true;
1371 sob.StreamOutputBufferOffsetAddressEnable = true;
1372 sob.SOBufferMOCS = mocs_wb;
1373
1374 sob.SurfaceSize = MAX2(xfb_obj->Size[i] / 4, 1) - 1;
1375 sob.StreamOutputBufferOffsetAddress =
1376 instruction_bo(brw_obj->offset_bo, i * sizeof(uint32_t));
1377
1378 if (brw_obj->zero_offsets) {
1379 /* Zero out the offset and write that to offset_bo */
1380 sob.StreamOffset = 0;
1381 } else {
1382 /* Use offset_bo as the "Stream Offset." */
1383 sob.StreamOffset = 0xFFFFFFFF;
1384 }
1385 #endif
1386 }
1387 }
1388
1389 #if GEN_GEN >= 8
1390 brw_obj->zero_offsets = false;
1391 #endif
1392 }
1393
1394 static inline bool
1395 query_active(struct gl_query_object *q)
1396 {
1397 return q && q->Active;
1398 }
1399
1400 static void
1401 genX(upload_3dstate_streamout)(struct brw_context *brw, bool active,
1402 const struct brw_vue_map *vue_map)
1403 {
1404 struct gl_context *ctx = &brw->ctx;
1405 /* BRW_NEW_TRANSFORM_FEEDBACK */
1406 struct gl_transform_feedback_object *xfb_obj =
1407 ctx->TransformFeedback.CurrentObject;
1408
1409 brw_batch_emit(brw, GENX(3DSTATE_STREAMOUT), sos) {
1410 if (active) {
1411 int urb_entry_read_offset = 0;
1412 int urb_entry_read_length = (vue_map->num_slots + 1) / 2 -
1413 urb_entry_read_offset;
1414
1415 sos.SOFunctionEnable = true;
1416 sos.SOStatisticsEnable = true;
1417
1418 /* BRW_NEW_RASTERIZER_DISCARD */
1419 if (ctx->RasterDiscard) {
1420 if (!query_active(ctx->Query.PrimitivesGenerated[0])) {
1421 sos.RenderingDisable = true;
1422 } else {
1423 perf_debug("Rasterizer discard with a GL_PRIMITIVES_GENERATED "
1424 "query active relies on the clipper.");
1425 }
1426 }
1427
1428 /* _NEW_LIGHT */
1429 if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION)
1430 sos.ReorderMode = TRAILING;
1431
1432 #if GEN_GEN < 8
1433 sos.SOBufferEnable0 = xfb_obj->Buffers[0] != NULL;
1434 sos.SOBufferEnable1 = xfb_obj->Buffers[1] != NULL;
1435 sos.SOBufferEnable2 = xfb_obj->Buffers[2] != NULL;
1436 sos.SOBufferEnable3 = xfb_obj->Buffers[3] != NULL;
1437 #else
1438 const struct gl_transform_feedback_info *linked_xfb_info =
1439 xfb_obj->program->sh.LinkedTransformFeedback;
1440 /* Set buffer pitches; 0 means unbound. */
1441 if (xfb_obj->Buffers[0])
1442 sos.Buffer0SurfacePitch = linked_xfb_info->Buffers[0].Stride * 4;
1443 if (xfb_obj->Buffers[1])
1444 sos.Buffer1SurfacePitch = linked_xfb_info->Buffers[1].Stride * 4;
1445 if (xfb_obj->Buffers[2])
1446 sos.Buffer2SurfacePitch = linked_xfb_info->Buffers[2].Stride * 4;
1447 if (xfb_obj->Buffers[3])
1448 sos.Buffer3SurfacePitch = linked_xfb_info->Buffers[3].Stride * 4;
1449 #endif
1450
1451 /* We always read the whole vertex. This could be reduced at some
1452 * point by reading less and offsetting the register index in the
1453 * SO_DECLs.
1454 */
1455 sos.Stream0VertexReadOffset = urb_entry_read_offset;
1456 sos.Stream0VertexReadLength = urb_entry_read_length - 1;
1457 sos.Stream1VertexReadOffset = urb_entry_read_offset;
1458 sos.Stream1VertexReadLength = urb_entry_read_length - 1;
1459 sos.Stream2VertexReadOffset = urb_entry_read_offset;
1460 sos.Stream2VertexReadLength = urb_entry_read_length - 1;
1461 sos.Stream3VertexReadOffset = urb_entry_read_offset;
1462 sos.Stream3VertexReadLength = urb_entry_read_length - 1;
1463 }
1464 }
1465 }
1466
1467 static void
1468 genX(upload_sol)(struct brw_context *brw)
1469 {
1470 struct gl_context *ctx = &brw->ctx;
1471 /* BRW_NEW_TRANSFORM_FEEDBACK */
1472 bool active = _mesa_is_xfb_active_and_unpaused(ctx);
1473
1474 if (active) {
1475 genX(upload_3dstate_so_buffers)(brw);
1476
1477 /* BRW_NEW_VUE_MAP_GEOM_OUT */
1478 genX(upload_3dstate_so_decl_list)(brw, &brw->vue_map_geom_out);
1479 }
1480
1481 /* Finally, set up the SOL stage. This command must always follow updates to
1482 * the nonpipelined SOL state (3DSTATE_SO_BUFFER, 3DSTATE_SO_DECL_LIST) or
1483 * MMIO register updates (current performed by the kernel at each batch
1484 * emit).
1485 */
1486 genX(upload_3dstate_streamout)(brw, active, &brw->vue_map_geom_out);
1487 }
1488
1489 static const struct brw_tracked_state genX(sol_state) = {
1490 .dirty = {
1491 .mesa = _NEW_LIGHT,
1492 .brw = BRW_NEW_BATCH |
1493 BRW_NEW_BLORP |
1494 BRW_NEW_RASTERIZER_DISCARD |
1495 BRW_NEW_VUE_MAP_GEOM_OUT |
1496 BRW_NEW_TRANSFORM_FEEDBACK,
1497 },
1498 .emit = genX(upload_sol),
1499 };
1500
1501 /* ---------------------------------------------------------------------- */
1502
1503 static void
1504 genX(upload_ps)(struct brw_context *brw)
1505 {
1506 UNUSED const struct gl_context *ctx = &brw->ctx;
1507 UNUSED const struct gen_device_info *devinfo = &brw->screen->devinfo;
1508
1509 /* BRW_NEW_FS_PROG_DATA */
1510 const struct brw_wm_prog_data *prog_data =
1511 brw_wm_prog_data(brw->wm.base.prog_data);
1512 const struct brw_stage_state *stage_state = &brw->wm.base;
1513
1514 #if GEN_GEN < 8
1515 #endif
1516
1517 brw_batch_emit(brw, GENX(3DSTATE_PS), ps) {
1518 /* Initialize the execution mask with VMask. Otherwise, derivatives are
1519 * incorrect for subspans where some of the pixels are unlit. We believe
1520 * the bit just didn't take effect in previous generations.
1521 */
1522 ps.VectorMaskEnable = GEN_GEN >= 8;
1523
1524 ps.SamplerCount =
1525 DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4);
1526
1527 /* BRW_NEW_FS_PROG_DATA */
1528 ps.BindingTableEntryCount = prog_data->base.binding_table.size_bytes / 4;
1529
1530 if (prog_data->base.use_alt_mode)
1531 ps.FloatingPointMode = Alternate;
1532
1533 /* Haswell requires the sample mask to be set in this packet as well as
1534 * in 3DSTATE_SAMPLE_MASK; the values should match.
1535 */
1536
1537 /* _NEW_BUFFERS, _NEW_MULTISAMPLE */
1538 #if GEN_IS_HASWELL
1539 ps.SampleMask = gen6_determine_sample_mask(brw);
1540 #endif
1541
1542 /* 3DSTATE_PS expects the number of threads per PSD, which is always 64;
1543 * it implicitly scales for different GT levels (which have some # of
1544 * PSDs).
1545 *
1546 * In Gen8 the format is U8-2 whereas in Gen9 it is U8-1.
1547 */
1548 #if GEN_GEN >= 9
1549 ps.MaximumNumberofThreadsPerPSD = 64 - 1;
1550 #elif GEN_GEN >= 8
1551 ps.MaximumNumberofThreadsPerPSD = 64 - 2;
1552 #else
1553 ps.MaximumNumberofThreads = devinfo->max_wm_threads - 1;
1554 #endif
1555
1556 if (prog_data->base.nr_params > 0)
1557 ps.PushConstantEnable = true;
1558
1559 #if GEN_GEN < 8
1560 /* From the IVB PRM, volume 2 part 1, page 287:
1561 * "This bit is inserted in the PS payload header and made available to
1562 * the DataPort (either via the message header or via header bypass) to
1563 * indicate that oMask data (one or two phases) is included in Render
1564 * Target Write messages. If present, the oMask data is used to mask off
1565 * samples."
1566 */
1567 ps.oMaskPresenttoRenderTarget = prog_data->uses_omask;
1568
1569 /* The hardware wedges if you have this bit set but don't turn on any
1570 * dual source blend factors.
1571 *
1572 * BRW_NEW_FS_PROG_DATA | _NEW_COLOR
1573 */
1574 ps.DualSourceBlendEnable = prog_data->dual_src_blend &&
1575 (ctx->Color.BlendEnabled & 1) &&
1576 ctx->Color.Blend[0]._UsesDualSrc;
1577
1578 /* BRW_NEW_FS_PROG_DATA */
1579 ps.AttributeEnable = (prog_data->num_varying_inputs != 0);
1580 #endif
1581
1582 /* From the documentation for this packet:
1583 * "If the PS kernel does not need the Position XY Offsets to
1584 * compute a Position Value, then this field should be programmed
1585 * to POSOFFSET_NONE."
1586 *
1587 * "SW Recommendation: If the PS kernel needs the Position Offsets
1588 * to compute a Position XY value, this field should match Position
1589 * ZW Interpolation Mode to ensure a consistent position.xyzw
1590 * computation."
1591 *
1592 * We only require XY sample offsets. So, this recommendation doesn't
1593 * look useful at the moment. We might need this in future.
1594 */
1595 if (prog_data->uses_pos_offset)
1596 ps.PositionXYOffsetSelect = POSOFFSET_SAMPLE;
1597 else
1598 ps.PositionXYOffsetSelect = POSOFFSET_NONE;
1599
1600 ps.RenderTargetFastClearEnable = brw->wm.fast_clear_op;
1601 ps._8PixelDispatchEnable = prog_data->dispatch_8;
1602 ps._16PixelDispatchEnable = prog_data->dispatch_16;
1603 ps.DispatchGRFStartRegisterForConstantSetupData0 =
1604 prog_data->base.dispatch_grf_start_reg;
1605 ps.DispatchGRFStartRegisterForConstantSetupData2 =
1606 prog_data->dispatch_grf_start_reg_2;
1607
1608 ps.KernelStartPointer0 = stage_state->prog_offset;
1609 ps.KernelStartPointer2 = stage_state->prog_offset +
1610 prog_data->prog_offset_2;
1611
1612 if (prog_data->base.total_scratch) {
1613 ps.ScratchSpaceBasePointer =
1614 render_bo(stage_state->scratch_bo,
1615 ffs(stage_state->per_thread_scratch) - 11);
1616 }
1617 }
1618 }
1619
1620 static const struct brw_tracked_state genX(ps_state) = {
1621 .dirty = {
1622 .mesa = _NEW_MULTISAMPLE |
1623 (GEN_GEN < 8 ? _NEW_BUFFERS |
1624 _NEW_COLOR
1625 : 0),
1626 .brw = BRW_NEW_BATCH |
1627 BRW_NEW_BLORP |
1628 BRW_NEW_FS_PROG_DATA,
1629 },
1630 .emit = genX(upload_ps),
1631 };
1632
1633 #endif
1634
1635 /* ---------------------------------------------------------------------- */
1636
1637 #if GEN_GEN >= 8
1638 static void
1639 genX(upload_raster)(struct brw_context *brw)
1640 {
1641 struct gl_context *ctx = &brw->ctx;
1642
1643 /* _NEW_BUFFERS */
1644 bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
1645
1646 /* _NEW_POLYGON */
1647 struct gl_polygon_attrib *polygon = &ctx->Polygon;
1648
1649 /* _NEW_POINT */
1650 struct gl_point_attrib *point = &ctx->Point;
1651
1652 brw_batch_emit(brw, GENX(3DSTATE_RASTER), raster) {
1653 if (polygon->_FrontBit == render_to_fbo)
1654 raster.FrontWinding = CounterClockwise;
1655
1656 if (polygon->CullFlag) {
1657 switch (polygon->CullFaceMode) {
1658 case GL_FRONT:
1659 raster.CullMode = CULLMODE_FRONT;
1660 break;
1661 case GL_BACK:
1662 raster.CullMode = CULLMODE_BACK;
1663 break;
1664 case GL_FRONT_AND_BACK:
1665 raster.CullMode = CULLMODE_BOTH;
1666 break;
1667 default:
1668 unreachable("not reached");
1669 }
1670 } else {
1671 raster.CullMode = CULLMODE_NONE;
1672 }
1673
1674 point->SmoothFlag = raster.SmoothPointEnable;
1675
1676 raster.DXMultisampleRasterizationEnable =
1677 _mesa_is_multisample_enabled(ctx);
1678
1679 raster.GlobalDepthOffsetEnableSolid = polygon->OffsetFill;
1680 raster.GlobalDepthOffsetEnableWireframe = polygon->OffsetLine;
1681 raster.GlobalDepthOffsetEnablePoint = polygon->OffsetPoint;
1682
1683 switch (polygon->FrontMode) {
1684 case GL_FILL:
1685 raster.FrontFaceFillMode = FILL_MODE_SOLID;
1686 break;
1687 case GL_LINE:
1688 raster.FrontFaceFillMode = FILL_MODE_WIREFRAME;
1689 break;
1690 case GL_POINT:
1691 raster.FrontFaceFillMode = FILL_MODE_POINT;
1692 break;
1693 default:
1694 unreachable("not reached");
1695 }
1696
1697 switch (polygon->BackMode) {
1698 case GL_FILL:
1699 raster.BackFaceFillMode = FILL_MODE_SOLID;
1700 break;
1701 case GL_LINE:
1702 raster.BackFaceFillMode = FILL_MODE_WIREFRAME;
1703 break;
1704 case GL_POINT:
1705 raster.BackFaceFillMode = FILL_MODE_POINT;
1706 break;
1707 default:
1708 unreachable("not reached");
1709 }
1710
1711 /* _NEW_LINE */
1712 raster.AntialiasingEnable = ctx->Line.SmoothFlag;
1713
1714 /* _NEW_SCISSOR */
1715 raster.ScissorRectangleEnable = ctx->Scissor.EnableFlags;
1716
1717 /* _NEW_TRANSFORM */
1718 if (!ctx->Transform.DepthClamp) {
1719 #if GEN_GEN >= 9
1720 raster.ViewportZFarClipTestEnable = true;
1721 raster.ViewportZNearClipTestEnable = true;
1722 #else
1723 raster.ViewportZClipTestEnable = true;
1724 #endif
1725 }
1726
1727 /* BRW_NEW_CONSERVATIVE_RASTERIZATION */
1728 #if GEN_GEN >= 9
1729 raster.ConservativeRasterizationEnable =
1730 ctx->IntelConservativeRasterization;
1731 #endif
1732
1733 raster.GlobalDepthOffsetClamp = polygon->OffsetClamp;
1734 raster.GlobalDepthOffsetScale = polygon->OffsetFactor;
1735
1736 raster.GlobalDepthOffsetConstant = polygon->OffsetUnits * 2;
1737 }
1738 }
1739
1740 static const struct brw_tracked_state genX(raster_state) = {
1741 .dirty = {
1742 .mesa = _NEW_BUFFERS |
1743 _NEW_LINE |
1744 _NEW_MULTISAMPLE |
1745 _NEW_POINT |
1746 _NEW_POLYGON |
1747 _NEW_SCISSOR |
1748 _NEW_TRANSFORM,
1749 .brw = BRW_NEW_BLORP |
1750 BRW_NEW_CONTEXT |
1751 BRW_NEW_CONSERVATIVE_RASTERIZATION,
1752 },
1753 .emit = genX(upload_raster),
1754 };
1755
1756 /* ---------------------------------------------------------------------- */
1757
1758 static void
1759 genX(upload_ps_extra)(struct brw_context *brw)
1760 {
1761 UNUSED struct gl_context *ctx = &brw->ctx;
1762
1763 const struct brw_wm_prog_data *prog_data =
1764 brw_wm_prog_data(brw->wm.base.prog_data);
1765
1766 brw_batch_emit(brw, GENX(3DSTATE_PS_EXTRA), psx) {
1767 psx.PixelShaderValid = true;
1768 psx.PixelShaderComputedDepthMode = prog_data->computed_depth_mode;
1769 psx.PixelShaderKillsPixel = prog_data->uses_kill;
1770 psx.AttributeEnable = prog_data->num_varying_inputs != 0;
1771 psx.PixelShaderUsesSourceDepth = prog_data->uses_src_depth;
1772 psx.PixelShaderUsesSourceW = prog_data->uses_src_w;
1773 psx.PixelShaderIsPerSample = prog_data->persample_dispatch;
1774
1775 /* _NEW_MULTISAMPLE | BRW_NEW_CONSERVATIVE_RASTERIZATION */
1776 if (prog_data->uses_sample_mask) {
1777 #if GEN_GEN >= 9
1778 if (prog_data->post_depth_coverage)
1779 psx.InputCoverageMaskState = ICMS_DEPTH_COVERAGE;
1780 else if (prog_data->inner_coverage && ctx->IntelConservativeRasterization)
1781 psx.InputCoverageMaskState = ICMS_INNER_CONSERVATIVE;
1782 else
1783 psx.InputCoverageMaskState = ICMS_NORMAL;
1784 #else
1785 psx.PixelShaderUsesInputCoverageMask = true;
1786 #endif
1787 }
1788
1789 psx.oMaskPresenttoRenderTarget = prog_data->uses_omask;
1790 #if GEN_GEN >= 9
1791 psx.PixelShaderPullsBary = prog_data->pulls_bary;
1792 psx.PixelShaderComputesStencil = prog_data->computed_stencil;
1793 #endif
1794
1795 /* The stricter cross-primitive coherency guarantees that the hardware
1796 * gives us with the "Accesses UAV" bit set for at least one shader stage
1797 * and the "UAV coherency required" bit set on the 3DPRIMITIVE command
1798 * are redundant within the current image, atomic counter and SSBO GL
1799 * APIs, which all have very loose ordering and coherency requirements
1800 * and generally rely on the application to insert explicit barriers when
1801 * a shader invocation is expected to see the memory writes performed by
1802 * the invocations of some previous primitive. Regardless of the value
1803 * of "UAV coherency required", the "Accesses UAV" bits will implicitly
1804 * cause an in most cases useless DC flush when the lowermost stage with
1805 * the bit set finishes execution.
1806 *
1807 * It would be nice to disable it, but in some cases we can't because on
1808 * Gen8+ it also has an influence on rasterization via the PS UAV-only
1809 * signal (which could be set independently from the coherency mechanism
1810 * in the 3DSTATE_WM command on Gen7), and because in some cases it will
1811 * determine whether the hardware skips execution of the fragment shader
1812 * or not via the ThreadDispatchEnable signal. However if we know that
1813 * GEN8_PS_BLEND_HAS_WRITEABLE_RT is going to be set and
1814 * GEN8_PSX_PIXEL_SHADER_NO_RT_WRITE is not set it shouldn't make any
1815 * difference so we may just disable it here.
1816 *
1817 * Gen8 hardware tries to compute ThreadDispatchEnable for us but doesn't
1818 * take into account KillPixels when no depth or stencil writes are
1819 * enabled. In order for occlusion queries to work correctly with no
1820 * attachments, we need to force-enable here.
1821 *
1822 * BRW_NEW_FS_PROG_DATA | BRW_NEW_FRAGMENT_PROGRAM | _NEW_BUFFERS |
1823 * _NEW_COLOR
1824 */
1825 if ((prog_data->has_side_effects || prog_data->uses_kill) &&
1826 !brw_color_buffer_write_enabled(brw))
1827 psx.PixelShaderHasUAV = true;
1828 }
1829 }
1830
1831 const struct brw_tracked_state genX(ps_extra) = {
1832 .dirty = {
1833 .mesa = _NEW_BUFFERS | _NEW_COLOR,
1834 .brw = BRW_NEW_BLORP |
1835 BRW_NEW_CONTEXT |
1836 BRW_NEW_FRAGMENT_PROGRAM |
1837 BRW_NEW_FS_PROG_DATA |
1838 BRW_NEW_CONSERVATIVE_RASTERIZATION,
1839 },
1840 .emit = genX(upload_ps_extra),
1841 };
1842 #endif
1843
1844 /* ---------------------------------------------------------------------- */
1845
1846 void
1847 genX(init_atoms)(struct brw_context *brw)
1848 {
1849 #if GEN_GEN < 6
1850 static const struct brw_tracked_state *render_atoms[] =
1851 {
1852 /* Once all the programs are done, we know how large urb entry
1853 * sizes need to be and can decide if we need to change the urb
1854 * layout.
1855 */
1856 &brw_curbe_offsets,
1857 &brw_recalculate_urb_fence,
1858
1859 &brw_cc_vp,
1860 &brw_cc_unit,
1861
1862 /* Surface state setup. Must come before the VS/WM unit. The binding
1863 * table upload must be last.
1864 */
1865 &brw_vs_pull_constants,
1866 &brw_wm_pull_constants,
1867 &brw_renderbuffer_surfaces,
1868 &brw_renderbuffer_read_surfaces,
1869 &brw_texture_surfaces,
1870 &brw_vs_binding_table,
1871 &brw_wm_binding_table,
1872
1873 &brw_fs_samplers,
1874 &brw_vs_samplers,
1875
1876 /* These set up state for brw_psp_urb_cbs */
1877 &brw_wm_unit,
1878 &brw_sf_vp,
1879 &brw_sf_unit,
1880 &brw_vs_unit, /* always required, enabled or not */
1881 &brw_clip_unit,
1882 &brw_gs_unit,
1883
1884 /* Command packets:
1885 */
1886 &brw_invariant_state,
1887
1888 &brw_binding_table_pointers,
1889 &brw_blend_constant_color,
1890
1891 &brw_depthbuffer,
1892
1893 &brw_polygon_stipple,
1894 &brw_polygon_stipple_offset,
1895
1896 &brw_line_stipple,
1897
1898 &brw_psp_urb_cbs,
1899
1900 &brw_drawing_rect,
1901 &brw_indices, /* must come before brw_vertices */
1902 &brw_index_buffer,
1903 &brw_vertices,
1904
1905 &brw_constant_buffer
1906 };
1907 #elif GEN_GEN == 6
1908 static const struct brw_tracked_state *render_atoms[] =
1909 {
1910 &gen6_sf_and_clip_viewports,
1911
1912 /* Command packets: */
1913
1914 &brw_cc_vp,
1915 &gen6_viewport_state, /* must do after *_vp stages */
1916
1917 &gen6_urb,
1918 &gen6_blend_state, /* must do before cc unit */
1919 &gen6_color_calc_state, /* must do before cc unit */
1920 &gen6_depth_stencil_state, /* must do before cc unit */
1921
1922 &gen6_vs_push_constants, /* Before vs_state */
1923 &gen6_gs_push_constants, /* Before gs_state */
1924 &gen6_wm_push_constants, /* Before wm_state */
1925
1926 /* Surface state setup. Must come before the VS/WM unit. The binding
1927 * table upload must be last.
1928 */
1929 &brw_vs_pull_constants,
1930 &brw_vs_ubo_surfaces,
1931 &brw_gs_pull_constants,
1932 &brw_gs_ubo_surfaces,
1933 &brw_wm_pull_constants,
1934 &brw_wm_ubo_surfaces,
1935 &gen6_renderbuffer_surfaces,
1936 &brw_renderbuffer_read_surfaces,
1937 &brw_texture_surfaces,
1938 &gen6_sol_surface,
1939 &brw_vs_binding_table,
1940 &gen6_gs_binding_table,
1941 &brw_wm_binding_table,
1942
1943 &brw_fs_samplers,
1944 &brw_vs_samplers,
1945 &brw_gs_samplers,
1946 &gen6_sampler_state,
1947 &gen6_multisample_state,
1948
1949 &genX(vs_state),
1950 &gen6_gs_state,
1951 &genX(clip_state),
1952 &genX(sf_state),
1953 &genX(wm_state),
1954
1955 &gen6_scissor_state,
1956
1957 &gen6_binding_table_pointers,
1958
1959 &brw_depthbuffer,
1960
1961 &brw_polygon_stipple,
1962 &brw_polygon_stipple_offset,
1963
1964 &brw_line_stipple,
1965
1966 &brw_drawing_rect,
1967
1968 &brw_indices, /* must come before brw_vertices */
1969 &brw_index_buffer,
1970 &brw_vertices,
1971 };
1972 #elif GEN_GEN == 7
1973 static const struct brw_tracked_state *render_atoms[] =
1974 {
1975 /* Command packets: */
1976
1977 &brw_cc_vp,
1978 &gen7_sf_clip_viewport,
1979
1980 &gen7_l3_state,
1981 &gen7_push_constant_space,
1982 &gen7_urb,
1983 &gen6_blend_state, /* must do before cc unit */
1984 &gen6_color_calc_state, /* must do before cc unit */
1985 &genX(depth_stencil_state), /* must do before cc unit */
1986
1987 &brw_vs_image_surfaces, /* Before vs push/pull constants and binding table */
1988 &brw_tcs_image_surfaces, /* Before tcs push/pull constants and binding table */
1989 &brw_tes_image_surfaces, /* Before tes push/pull constants and binding table */
1990 &brw_gs_image_surfaces, /* Before gs push/pull constants and binding table */
1991 &brw_wm_image_surfaces, /* Before wm push/pull constants and binding table */
1992
1993 &gen6_vs_push_constants, /* Before vs_state */
1994 &gen7_tcs_push_constants,
1995 &gen7_tes_push_constants,
1996 &gen6_gs_push_constants, /* Before gs_state */
1997 &gen6_wm_push_constants, /* Before wm_surfaces and constant_buffer */
1998
1999 /* Surface state setup. Must come before the VS/WM unit. The binding
2000 * table upload must be last.
2001 */
2002 &brw_vs_pull_constants,
2003 &brw_vs_ubo_surfaces,
2004 &brw_vs_abo_surfaces,
2005 &brw_tcs_pull_constants,
2006 &brw_tcs_ubo_surfaces,
2007 &brw_tcs_abo_surfaces,
2008 &brw_tes_pull_constants,
2009 &brw_tes_ubo_surfaces,
2010 &brw_tes_abo_surfaces,
2011 &brw_gs_pull_constants,
2012 &brw_gs_ubo_surfaces,
2013 &brw_gs_abo_surfaces,
2014 &brw_wm_pull_constants,
2015 &brw_wm_ubo_surfaces,
2016 &brw_wm_abo_surfaces,
2017 &gen6_renderbuffer_surfaces,
2018 &brw_renderbuffer_read_surfaces,
2019 &brw_texture_surfaces,
2020 &brw_vs_binding_table,
2021 &brw_tcs_binding_table,
2022 &brw_tes_binding_table,
2023 &brw_gs_binding_table,
2024 &brw_wm_binding_table,
2025
2026 &brw_fs_samplers,
2027 &brw_vs_samplers,
2028 &brw_tcs_samplers,
2029 &brw_tes_samplers,
2030 &brw_gs_samplers,
2031 &gen6_multisample_state,
2032
2033 &genX(vs_state),
2034 &gen7_hs_state,
2035 &gen7_te_state,
2036 &gen7_ds_state,
2037 &gen7_gs_state,
2038 &genX(sol_state),
2039 &genX(clip_state),
2040 &genX(sbe_state),
2041 &genX(sf_state),
2042 &genX(wm_state),
2043 &genX(ps_state),
2044
2045 &gen6_scissor_state,
2046
2047 &gen7_depthbuffer,
2048
2049 &brw_polygon_stipple,
2050 &brw_polygon_stipple_offset,
2051
2052 &brw_line_stipple,
2053
2054 &brw_drawing_rect,
2055
2056 &brw_indices, /* must come before brw_vertices */
2057 &brw_index_buffer,
2058 &brw_vertices,
2059
2060 &haswell_cut_index,
2061 };
2062 #elif GEN_GEN >= 8
2063 static const struct brw_tracked_state *render_atoms[] =
2064 {
2065 &brw_cc_vp,
2066 &gen8_sf_clip_viewport,
2067
2068 &gen7_l3_state,
2069 &gen7_push_constant_space,
2070 &gen7_urb,
2071 &gen8_blend_state,
2072 &gen6_color_calc_state,
2073
2074 &brw_vs_image_surfaces, /* Before vs push/pull constants and binding table */
2075 &brw_tcs_image_surfaces, /* Before tcs push/pull constants and binding table */
2076 &brw_tes_image_surfaces, /* Before tes push/pull constants and binding table */
2077 &brw_gs_image_surfaces, /* Before gs push/pull constants and binding table */
2078 &brw_wm_image_surfaces, /* Before wm push/pull constants and binding table */
2079
2080 &gen6_vs_push_constants, /* Before vs_state */
2081 &gen7_tcs_push_constants,
2082 &gen7_tes_push_constants,
2083 &gen6_gs_push_constants, /* Before gs_state */
2084 &gen6_wm_push_constants, /* Before wm_surfaces and constant_buffer */
2085
2086 /* Surface state setup. Must come before the VS/WM unit. The binding
2087 * table upload must be last.
2088 */
2089 &brw_vs_pull_constants,
2090 &brw_vs_ubo_surfaces,
2091 &brw_vs_abo_surfaces,
2092 &brw_tcs_pull_constants,
2093 &brw_tcs_ubo_surfaces,
2094 &brw_tcs_abo_surfaces,
2095 &brw_tes_pull_constants,
2096 &brw_tes_ubo_surfaces,
2097 &brw_tes_abo_surfaces,
2098 &brw_gs_pull_constants,
2099 &brw_gs_ubo_surfaces,
2100 &brw_gs_abo_surfaces,
2101 &brw_wm_pull_constants,
2102 &brw_wm_ubo_surfaces,
2103 &brw_wm_abo_surfaces,
2104 &gen6_renderbuffer_surfaces,
2105 &brw_renderbuffer_read_surfaces,
2106 &brw_texture_surfaces,
2107 &brw_vs_binding_table,
2108 &brw_tcs_binding_table,
2109 &brw_tes_binding_table,
2110 &brw_gs_binding_table,
2111 &brw_wm_binding_table,
2112
2113 &brw_fs_samplers,
2114 &brw_vs_samplers,
2115 &brw_tcs_samplers,
2116 &brw_tes_samplers,
2117 &brw_gs_samplers,
2118 &gen8_multisample_state,
2119
2120 &genX(vs_state),
2121 &gen8_hs_state,
2122 &gen7_te_state,
2123 &gen8_ds_state,
2124 &gen8_gs_state,
2125 &genX(sol_state),
2126 &genX(clip_state),
2127 &genX(raster_state),
2128 &genX(sbe_state),
2129 &genX(sf_state),
2130 &gen8_ps_blend,
2131 &genX(ps_extra),
2132 &genX(ps_state),
2133 &genX(depth_stencil_state),
2134 &genX(wm_state),
2135
2136 &gen6_scissor_state,
2137
2138 &gen7_depthbuffer,
2139
2140 &brw_polygon_stipple,
2141 &brw_polygon_stipple_offset,
2142
2143 &brw_line_stipple,
2144
2145 &brw_drawing_rect,
2146
2147 &gen8_vf_topology,
2148
2149 &brw_indices,
2150 &gen8_index_buffer,
2151 &gen8_vertices,
2152
2153 &haswell_cut_index,
2154 &gen8_pma_fix,
2155 };
2156 #endif
2157
2158 STATIC_ASSERT(ARRAY_SIZE(render_atoms) <= ARRAY_SIZE(brw->render_atoms));
2159 brw_copy_pipeline_atoms(brw, BRW_RENDER_PIPELINE,
2160 render_atoms, ARRAY_SIZE(render_atoms));
2161
2162 #if GEN_GEN >= 7
2163 static const struct brw_tracked_state *compute_atoms[] =
2164 {
2165 &gen7_l3_state,
2166 &brw_cs_image_surfaces,
2167 &gen7_cs_push_constants,
2168 &brw_cs_pull_constants,
2169 &brw_cs_ubo_surfaces,
2170 &brw_cs_abo_surfaces,
2171 &brw_cs_texture_surfaces,
2172 &brw_cs_work_groups_surface,
2173 &brw_cs_samplers,
2174 &brw_cs_state,
2175 };
2176
2177 STATIC_ASSERT(ARRAY_SIZE(compute_atoms) <= ARRAY_SIZE(brw->compute_atoms));
2178 brw_copy_pipeline_atoms(brw, BRW_COMPUTE_PIPELINE,
2179 compute_atoms, ARRAY_SIZE(compute_atoms));
2180 #endif
2181 }