mesa: rename logging functions to reflect that they format strings
[mesa.git] src/mesa/drivers/dri/i965/genX_state_upload.c
/*
 * Copyright © 2017 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <assert.h>

#include "dev/gen_device_info.h"
#include "common/gen_sample_positions.h"
#include "genxml/gen_macros.h"

#include "main/bufferobj.h"
#include "main/context.h"
#include "main/enums.h"
#include "main/macros.h"
#include "main/state.h"

#include "genX_boilerplate.h"

#include "brw_context.h"
#include "brw_draw.h"
#include "brw_multisample_state.h"
#include "brw_state.h"
#include "brw_wm.h"
#include "brw_util.h"

#include "intel_batchbuffer.h"
#include "intel_buffer_objects.h"
#include "intel_fbo.h"

#include "main/enums.h"
#include "main/fbobject.h"
#include "main/framebuffer.h"
#include "main/glformats.h"
#include "main/samplerobj.h"
#include "main/shaderapi.h"
#include "main/stencil.h"
#include "main/transformfeedback.h"
#include "main/varray.h"
#include "main/viewport.h"
#include "util/half_float.h"

#if GEN_GEN == 4
static struct brw_address
KSP(struct brw_context *brw, uint32_t offset)
{
   return ro_bo(brw->cache.bo, offset);
}
#else
static uint32_t
KSP(UNUSED struct brw_context *brw, uint32_t offset)
{
   return offset;
}
#endif
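
/* Editor's note (not part of the original source): on Gen4 the kernel start
 * pointer is a full graphics address, so KSP() returns a relocation into the
 * program cache BO; on Gen5+ it is just a byte offset into that cache, which
 * later state setup programs as the base for instructions. The two variants
 * above hide that difference from the per-stage code below.
 */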

#if GEN_GEN >= 7
MAYBE_UNUSED static void
emit_lrm(struct brw_context *brw, uint32_t reg, struct brw_address addr)
{
   brw_batch_emit(brw, GENX(MI_LOAD_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = reg;
      lrm.MemoryAddress = addr;
   }
}
#endif

MAYBE_UNUSED static void
emit_lri(struct brw_context *brw, uint32_t reg, uint32_t imm)
{
   brw_batch_emit(brw, GENX(MI_LOAD_REGISTER_IMM), lri) {
      lri.RegisterOffset = reg;
      lri.DataDWord = imm;
   }
}

#if GEN_IS_HASWELL || GEN_GEN >= 8
MAYBE_UNUSED static void
emit_lrr(struct brw_context *brw, uint32_t dst, uint32_t src)
{
   brw_batch_emit(brw, GENX(MI_LOAD_REGISTER_REG), lrr) {
      lrr.SourceRegisterAddress = src;
      lrr.DestinationRegisterAddress = dst;
   }
}
#endif
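
/* Editor's illustration (not part of the original source): these helpers wrap
 * the MI_LOAD_REGISTER_* commands, which let the command streamer write an
 * MMIO register from an immediate, from memory, or from another register.
 * A hypothetical use, with 0x2540 standing in for some register offset:
 *
 *    emit_lri(brw, 0x2540, 0);               // immediate -> register
 *    emit_lrm(brw, 0x2540, ro_bo(bo, 0));    // memory -> register (Gen7+)
 *    emit_lrr(brw, 0x2540, 0x2544);          // register -> register (HSW/Gen8+)
 */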

/**
 * Polygon stipple packet
 */
static void
genX(upload_polygon_stipple)(struct brw_context *brw)
{
   struct gl_context *ctx = &brw->ctx;

   /* _NEW_POLYGON */
   if (!ctx->Polygon.StippleFlag)
      return;

   brw_batch_emit(brw, GENX(3DSTATE_POLY_STIPPLE_PATTERN), poly) {
      /* Polygon stipple is provided in OpenGL order, i.e. bottom
       * row first. If we're rendering to a window (i.e. the
       * default frame buffer object, 0), then we need to invert
       * it to match our pixel layout. But if we're rendering
       * to a FBO (i.e. any named frame buffer object), we *don't*
       * need to invert - we already match the layout.
       */
      if (ctx->DrawBuffer->FlipY) {
         for (unsigned i = 0; i < 32; i++)
            poly.PatternRow[i] = ctx->PolygonStipple[31 - i]; /* invert */
      } else {
         for (unsigned i = 0; i < 32; i++)
            poly.PatternRow[i] = ctx->PolygonStipple[i];
      }
   }
}

static const struct brw_tracked_state genX(polygon_stipple) = {
   .dirty = {
      .mesa = _NEW_POLYGON |
              _NEW_POLYGONSTIPPLE,
      .brw = BRW_NEW_CONTEXT,
   },
   .emit = genX(upload_polygon_stipple),
};

/**
 * Polygon stipple offset packet
 */
static void
genX(upload_polygon_stipple_offset)(struct brw_context *brw)
{
   struct gl_context *ctx = &brw->ctx;

   /* _NEW_POLYGON */
   if (!ctx->Polygon.StippleFlag)
      return;

   brw_batch_emit(brw, GENX(3DSTATE_POLY_STIPPLE_OFFSET), poly) {
      /* _NEW_BUFFERS
       *
       * If we're drawing to a system window we have to invert the Y axis
       * in order to match the OpenGL pixel coordinate system, and our
       * offset must be matched to the window position. If we're drawing
       * to a user-created FBO then our native pixel coordinate system
       * works just fine, and there's no window system to worry about.
       */
      if (ctx->DrawBuffer->FlipY) {
         poly.PolygonStippleYOffset =
            (32 - (_mesa_geometric_height(ctx->DrawBuffer) & 31)) & 31;
      }
   }
}
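
/* Worked example (editor's note, not in the original source): for a window of
 * height 600, _mesa_geometric_height() & 31 = 600 % 32 = 24, so the offset
 * programmed above is (32 - 24) & 31 = 8. That shifts the pattern so stipple
 * row 0 still lines up with the bottom row of the Y-flipped window even when
 * the window height is not a multiple of 32.
 */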

static const struct brw_tracked_state genX(polygon_stipple_offset) = {
   .dirty = {
      .mesa = _NEW_BUFFERS |
              _NEW_POLYGON,
      .brw = BRW_NEW_CONTEXT,
   },
   .emit = genX(upload_polygon_stipple_offset),
};

/**
 * Line stipple packet
 */
static void
genX(upload_line_stipple)(struct brw_context *brw)
{
   struct gl_context *ctx = &brw->ctx;

   if (!ctx->Line.StippleFlag)
      return;

   brw_batch_emit(brw, GENX(3DSTATE_LINE_STIPPLE), line) {
      line.LineStipplePattern = ctx->Line.StipplePattern;

      line.LineStippleInverseRepeatCount = 1.0f / ctx->Line.StippleFactor;
      line.LineStippleRepeatCount = ctx->Line.StippleFactor;
   }
}

static const struct brw_tracked_state genX(line_stipple) = {
   .dirty = {
      .mesa = _NEW_LINE,
      .brw = BRW_NEW_CONTEXT,
   },
   .emit = genX(upload_line_stipple),
};

/* Constant single cliprect for framebuffer object or DRI2 drawing */
static void
genX(upload_drawing_rect)(struct brw_context *brw)
{
   struct gl_context *ctx = &brw->ctx;
   const struct gl_framebuffer *fb = ctx->DrawBuffer;
   const unsigned int fb_width = _mesa_geometric_width(fb);
   const unsigned int fb_height = _mesa_geometric_height(fb);

   brw_batch_emit(brw, GENX(3DSTATE_DRAWING_RECTANGLE), rect) {
      rect.ClippedDrawingRectangleXMax = fb_width - 1;
      rect.ClippedDrawingRectangleYMax = fb_height - 1;
   }
}

static const struct brw_tracked_state genX(drawing_rect) = {
   .dirty = {
      .mesa = _NEW_BUFFERS,
      .brw = BRW_NEW_BLORP |
             BRW_NEW_CONTEXT,
   },
   .emit = genX(upload_drawing_rect),
};

static uint32_t *
genX(emit_vertex_buffer_state)(struct brw_context *brw,
                               uint32_t *dw,
                               unsigned buffer_nr,
                               struct brw_bo *bo,
                               unsigned start_offset,
                               MAYBE_UNUSED unsigned end_offset,
                               unsigned stride,
                               MAYBE_UNUSED unsigned step_rate)
{
   struct GENX(VERTEX_BUFFER_STATE) buf_state = {
      .VertexBufferIndex = buffer_nr,
      .BufferPitch = stride,

      /* The VF cache designers apparently cut corners, and made the cache
       * only consider the bottom 32 bits of memory addresses. If you happen
       * to have two vertex buffers which get placed exactly 4 GiB apart and
       * use them in back-to-back draw calls, you can get collisions. To work
       * around this problem, we restrict vertex buffers to the low 32 bits of
       * the address space.
       */
      .BufferStartingAddress = ro_32_bo(bo, start_offset),
#if GEN_GEN >= 8
      .BufferSize = end_offset - start_offset,
#endif

#if GEN_GEN >= 7
      .AddressModifyEnable = true,
#endif

#if GEN_GEN < 8
      .BufferAccessType = step_rate ? INSTANCEDATA : VERTEXDATA,
      .InstanceDataStepRate = step_rate,
#if GEN_GEN >= 5
      .EndAddress = ro_bo(bo, end_offset - 1),
#endif
#endif

#if GEN_GEN == 11
      .MOCS = ICL_MOCS_WB,
#elif GEN_GEN == 10
      .MOCS = CNL_MOCS_WB,
#elif GEN_GEN == 9
      .MOCS = SKL_MOCS_WB,
#elif GEN_GEN == 8
      .MOCS = BDW_MOCS_WB,
#elif GEN_GEN == 7
      .MOCS = GEN7_MOCS_L3,
#endif
   };

   GENX(VERTEX_BUFFER_STATE_pack)(brw, dw, &buf_state);
   return dw + GENX(VERTEX_BUFFER_STATE_length);
}

UNUSED static bool
is_passthru_format(uint32_t format)
{
   switch (format) {
   case ISL_FORMAT_R64_PASSTHRU:
   case ISL_FORMAT_R64G64_PASSTHRU:
   case ISL_FORMAT_R64G64B64_PASSTHRU:
   case ISL_FORMAT_R64G64B64A64_PASSTHRU:
      return true;
   default:
      return false;
   }
}

UNUSED static int
uploads_needed(uint32_t format,
               bool is_dual_slot)
{
   if (!is_passthru_format(format))
      return 1;

   if (is_dual_slot)
      return 2;

   switch (format) {
   case ISL_FORMAT_R64_PASSTHRU:
   case ISL_FORMAT_R64G64_PASSTHRU:
      return 1;
   case ISL_FORMAT_R64G64B64_PASSTHRU:
   case ISL_FORMAT_R64G64B64A64_PASSTHRU:
      return 2;
   default:
      unreachable("not reached");
   }
}

/*
 * Returns the format that we are finally going to use when uploading a
 * vertex element. It only changes for the *64*_PASSTHRU formats, which on
 * gen < 8 need to be split into two *32*_FLOAT formats.
 *
 * @upload says which upload we are in. Valid values are [0,1].
 */
static uint32_t
downsize_format_if_needed(uint32_t format,
                          int upload)
{
   assert(upload == 0 || upload == 1);

   if (!is_passthru_format(format))
      return format;

   /* ISL_FORMAT_R64_PASSTHRU and ISL_FORMAT_R64G64_PASSTHRU with upload == 1
    * mean that we have been forced to do 2 uploads for a size <= 2. This
    * happens with gen < 8 and dvec3 or dvec4 vertex shader input
    * variables. In those cases, we return ISL_FORMAT_R32_FLOAT as a way of
    * flagging that we want to fill this second forced upload with zeroes.
    */
   switch (format) {
   case ISL_FORMAT_R64_PASSTHRU:
      return upload == 0 ? ISL_FORMAT_R32G32_FLOAT
                         : ISL_FORMAT_R32_FLOAT;
   case ISL_FORMAT_R64G64_PASSTHRU:
      return upload == 0 ? ISL_FORMAT_R32G32B32A32_FLOAT
                         : ISL_FORMAT_R32_FLOAT;
   case ISL_FORMAT_R64G64B64_PASSTHRU:
      return upload == 0 ? ISL_FORMAT_R32G32B32A32_FLOAT
                         : ISL_FORMAT_R32G32_FLOAT;
   case ISL_FORMAT_R64G64B64A64_PASSTHRU:
      return ISL_FORMAT_R32G32B32A32_FLOAT;
   default:
      unreachable("not reached");
   }
}

/*
 * Returns the number of components associated with a format that is used on
 * a 64 to 32 format split. See downsize_format_if_needed().
 */
static int
upload_format_size(uint32_t upload_format)
{
   switch (upload_format) {
   case ISL_FORMAT_R32_FLOAT:

      /* downsize_format_if_needed() has returned this one in order to flag
       * that we are performing a second upload which we want to have filled
       * with zeroes. This happens with gen < 8, a size <= 2, and dvec3 or
       * dvec4 vertex shader input variables.
       */

      return 0;
   case ISL_FORMAT_R32G32_FLOAT:
      return 2;
   case ISL_FORMAT_R32G32B32A32_FLOAT:
      return 4;
   default:
      unreachable("not reached");
   }
}
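
/* Editor's illustration (not part of the original source): on gen < 8, a
 * dvec3 input is read as ISL_FORMAT_R64G64B64_PASSTHRU, so uploads_needed()
 * reports 2 and the element is emitted twice:
 *
 *    upload 0: downsize_format_if_needed() -> R32G32B32A32_FLOAT
 *              (the x/y doubles, reinterpreted as four 32-bit words)
 *    upload 1: downsize_format_if_needed() -> R32G32_FLOAT
 *              (the z double as two 32-bit words, read 16 bytes in)
 *
 * upload_format_size() then yields 4 and 2 component writes respectively,
 * covering the 3 doubles (6 dwords) of the attribute.
 */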

static UNUSED uint16_t
pinned_bo_high_bits(struct brw_bo *bo)
{
   return (bo->kflags & EXEC_OBJECT_PINNED) ? bo->gtt_offset >> 32ull : 0;
}
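
/* Editor's illustration (not part of the original source): two pinned BOs at
 * GTT addresses 0x1_0000_1000 and 0x2_0000_1000 sit exactly 4 GiB apart and
 * alias in the VF cache -- both truncate to 0x0000_1000 in the low 32 bits --
 * but their high bits (0x1 vs. 0x2) differ, which is exactly what the
 * transition tracking below compares before deciding to invalidate.
 */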

/* The VF cache designers apparently cut corners, and made the cache key's
 * <VertexBufferIndex, Memory Address> tuple only consider the bottom 32 bits
 * of the address. If you happen to have two vertex buffers which get placed
 * exactly 4 GiB apart and use them in back-to-back draw calls, you can get
 * collisions. (These collisions can happen within a single batch.)
 *
 * In the soft-pin world, we'd like to assign addresses up front, and never
 * move buffers. So, we need to do a VF cache invalidate if the buffer for
 * a particular VB slot has different [48:32] address bits than the last one.
 *
 * In the relocation world, we have no idea what the addresses will be, so
 * we can't apply this workaround. Instead, we tell the kernel to move it
 * to the low 4GB regardless.
 *
 * This HW issue is gone on Gen11+.
 */
static void
vf_invalidate_for_vb_48bit_transitions(struct brw_context *brw)
{
#if GEN_GEN >= 8 && GEN_GEN < 11
   bool need_invalidate = false;

   for (unsigned i = 0; i < brw->vb.nr_buffers; i++) {
      uint16_t high_bits = pinned_bo_high_bits(brw->vb.buffers[i].bo);

      if (high_bits != brw->vb.last_bo_high_bits[i]) {
         need_invalidate = true;
         brw->vb.last_bo_high_bits[i] = high_bits;
      }
   }

   if (brw->draw.draw_params_bo) {
      uint16_t high_bits = pinned_bo_high_bits(brw->draw.draw_params_bo);

      if (brw->vb.last_bo_high_bits[brw->vb.nr_buffers] != high_bits) {
         need_invalidate = true;
         brw->vb.last_bo_high_bits[brw->vb.nr_buffers] = high_bits;
      }
   }

   if (brw->draw.derived_draw_params_bo) {
      uint16_t high_bits = pinned_bo_high_bits(brw->draw.derived_draw_params_bo);

      if (brw->vb.last_bo_high_bits[brw->vb.nr_buffers + 1] != high_bits) {
         need_invalidate = true;
         brw->vb.last_bo_high_bits[brw->vb.nr_buffers + 1] = high_bits;
      }
   }

   if (need_invalidate) {
      brw_emit_pipe_control_flush(brw, PIPE_CONTROL_VF_CACHE_INVALIDATE | PIPE_CONTROL_CS_STALL);
   }
#endif
}

static void
vf_invalidate_for_ib_48bit_transition(struct brw_context *brw)
{
#if GEN_GEN >= 8
   uint16_t high_bits = pinned_bo_high_bits(brw->ib.bo);

   if (high_bits != brw->ib.last_bo_high_bits) {
      brw_emit_pipe_control_flush(brw, PIPE_CONTROL_VF_CACHE_INVALIDATE);
      brw->ib.last_bo_high_bits = high_bits;
   }
#endif
}

static void
genX(emit_vertices)(struct brw_context *brw)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   uint32_t *dw;

   brw_prepare_vertices(brw);
   brw_prepare_shader_draw_parameters(brw);

#if GEN_GEN < 6
   brw_emit_query_begin(brw);
#endif

   const struct brw_vs_prog_data *vs_prog_data =
      brw_vs_prog_data(brw->vs.base.prog_data);

#if GEN_GEN >= 8
   struct gl_context *ctx = &brw->ctx;
   const bool uses_edge_flag = (ctx->Polygon.FrontMode != GL_FILL ||
                                ctx->Polygon.BackMode != GL_FILL);

   if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid) {
      unsigned vue = brw->vb.nr_enabled;

      /* The element for the edge flags must always be last, so we have to
       * insert the SGVS before it in that case.
       */
      if (uses_edge_flag) {
         assert(vue > 0);
         vue--;
      }

      WARN_ONCE(vue >= 33,
                "Trying to insert VID/IID past 33rd vertex element, "
                "need to reorder the vertex attributes.");

      brw_batch_emit(brw, GENX(3DSTATE_VF_SGVS), vfs) {
         if (vs_prog_data->uses_vertexid) {
            vfs.VertexIDEnable = true;
            vfs.VertexIDComponentNumber = 2;
            vfs.VertexIDElementOffset = vue;
         }

         if (vs_prog_data->uses_instanceid) {
            vfs.InstanceIDEnable = true;
            vfs.InstanceIDComponentNumber = 3;
            vfs.InstanceIDElementOffset = vue;
         }
      }

      brw_batch_emit(brw, GENX(3DSTATE_VF_INSTANCING), vfi) {
         vfi.InstancingEnable = true;
         vfi.VertexElementIndex = vue;
      }
   } else {
      brw_batch_emit(brw, GENX(3DSTATE_VF_SGVS), vfs);
   }
#endif

   const bool uses_draw_params =
      vs_prog_data->uses_firstvertex ||
      vs_prog_data->uses_baseinstance;

   const bool uses_derived_draw_params =
      vs_prog_data->uses_drawid ||
      vs_prog_data->uses_is_indexed_draw;

   const bool needs_sgvs_element = (uses_draw_params ||
                                    vs_prog_data->uses_instanceid ||
                                    vs_prog_data->uses_vertexid);

   unsigned nr_elements =
      brw->vb.nr_enabled + needs_sgvs_element + uses_derived_draw_params;

#if GEN_GEN < 8
   /* If any of the formats of vb.enabled needs more than one upload, we need
    * to account for the extra element in nr_elements.
    */
   for (unsigned i = 0; i < brw->vb.nr_enabled; i++) {
      struct brw_vertex_element *input = brw->vb.enabled[i];
      const struct gl_array_attributes *glattrib = input->glattrib;
      uint32_t format = brw_get_vertex_surface_type(brw, &glattrib->Format);

      if (uploads_needed(format, input->is_dual_slot) > 1)
         nr_elements++;
   }
#endif

   /* If the VS doesn't read any inputs (calculating vertex position from
    * a state variable for some reason, for example), emit a single pad
    * VERTEX_ELEMENT struct and bail.
    *
    * The stale VB state stays in place, but the stale buffers don't do
    * anything unless a VE loads from them.
    */
   if (nr_elements == 0) {
      dw = brw_batch_emitn(brw, GENX(3DSTATE_VERTEX_ELEMENTS),
                           1 + GENX(VERTEX_ELEMENT_STATE_length));
      struct GENX(VERTEX_ELEMENT_STATE) elem = {
         .Valid = true,
         .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT,
         .Component0Control = VFCOMP_STORE_0,
         .Component1Control = VFCOMP_STORE_0,
         .Component2Control = VFCOMP_STORE_0,
         .Component3Control = VFCOMP_STORE_1_FP,
      };
      GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem);
      return;
   }

   /* Now emit 3DSTATE_VERTEX_BUFFERS and 3DSTATE_VERTEX_ELEMENTS packets. */
   const unsigned nr_buffers = brw->vb.nr_buffers +
      uses_draw_params + uses_derived_draw_params;

   vf_invalidate_for_vb_48bit_transitions(brw);

   if (nr_buffers) {
      assert(nr_buffers <= (GEN_GEN >= 6 ? 33 : 17));

      dw = brw_batch_emitn(brw, GENX(3DSTATE_VERTEX_BUFFERS),
                           1 + GENX(VERTEX_BUFFER_STATE_length) * nr_buffers);

      for (unsigned i = 0; i < brw->vb.nr_buffers; i++) {
         const struct brw_vertex_buffer *buffer = &brw->vb.buffers[i];
         /* Prior to Haswell and Bay Trail we have to use 4-component formats
          * to fake 3-component ones. In particular, we do this for
          * half-float and 8 and 16-bit integer formats. This means that the
          * vertex element may poke over the end of the buffer by 2 bytes.
          */
         const unsigned padding =
            (GEN_GEN <= 7 && !GEN_IS_HASWELL && !devinfo->is_baytrail) * 2;
         const unsigned end = buffer->offset + buffer->size + padding;
         dw = genX(emit_vertex_buffer_state)(brw, dw, i, buffer->bo,
                                             buffer->offset,
                                             end,
                                             buffer->stride,
                                             buffer->step_rate);
      }

      if (uses_draw_params) {
         dw = genX(emit_vertex_buffer_state)(brw, dw, brw->vb.nr_buffers,
                                             brw->draw.draw_params_bo,
                                             brw->draw.draw_params_offset,
                                             brw->draw.draw_params_bo->size,
                                             0 /* stride */,
                                             0 /* step rate */);
      }

      if (uses_derived_draw_params) {
         dw = genX(emit_vertex_buffer_state)(brw, dw, brw->vb.nr_buffers + 1,
                                             brw->draw.derived_draw_params_bo,
                                             brw->draw.derived_draw_params_offset,
                                             brw->draw.derived_draw_params_bo->size,
                                             0 /* stride */,
                                             0 /* step rate */);
      }
   }

   /* The hardware allows one more VERTEX_ELEMENTS than VERTEX_BUFFERS,
    * presumably for VertexID/InstanceID.
    */
#if GEN_GEN >= 6
   assert(nr_elements <= 34);
   const struct brw_vertex_element *gen6_edgeflag_input = NULL;
#else
   assert(nr_elements <= 18);
#endif

   dw = brw_batch_emitn(brw, GENX(3DSTATE_VERTEX_ELEMENTS),
                        1 + GENX(VERTEX_ELEMENT_STATE_length) * nr_elements);
   unsigned i;
   for (i = 0; i < brw->vb.nr_enabled; i++) {
      const struct brw_vertex_element *input = brw->vb.enabled[i];
      const struct gl_array_attributes *glattrib = input->glattrib;
      uint32_t format = brw_get_vertex_surface_type(brw, &glattrib->Format);
      uint32_t comp0 = VFCOMP_STORE_SRC;
      uint32_t comp1 = VFCOMP_STORE_SRC;
      uint32_t comp2 = VFCOMP_STORE_SRC;
      uint32_t comp3 = VFCOMP_STORE_SRC;
      const unsigned num_uploads = GEN_GEN < 8 ?
         uploads_needed(format, input->is_dual_slot) : 1;

#if GEN_GEN >= 8
      /* From the BDW PRM, Volume 2d, page 588 (VERTEX_ELEMENT_STATE):
       * "Any SourceElementFormat of *64*_PASSTHRU cannot be used with an
       * element which has edge flag enabled."
       */
      assert(!(is_passthru_format(format) && uses_edge_flag));
#endif

      /* The gen4 driver expects edgeflag to come in as a float, and passes
       * that float on to the tests in the clipper. Mesa's current vertex
       * attribute value for EdgeFlag is stored as a float, which works out.
       * glEdgeFlagPointer, on the other hand, gives us an unnormalized
       * integer ubyte. Just rewrite that to convert to a float.
       *
       * Gen6+ passes edgeflag as sideband along with the vertex, instead
       * of in the VUE. We have to upload it sideband as the last vertex
       * element according to the B-Spec.
       */
#if GEN_GEN >= 6
      if (input == &brw->vb.inputs[VERT_ATTRIB_EDGEFLAG]) {
         gen6_edgeflag_input = input;
         continue;
      }
#endif

      for (unsigned c = 0; c < num_uploads; c++) {
         const uint32_t upload_format = GEN_GEN >= 8 ? format :
            downsize_format_if_needed(format, c);
         /* If we need more than one upload, the offset stride is 128
          * bits (16 bytes), as each previous upload consumes a full
          * entry. */
         const unsigned offset = input->offset + c * 16;

         const struct gl_array_attributes *glattrib = input->glattrib;
         const int size = (GEN_GEN < 8 && is_passthru_format(format)) ?
            upload_format_size(upload_format) : glattrib->Format.Size;

         switch (size) {
         case 0: comp0 = VFCOMP_STORE_0; /* fallthrough */
         case 1: comp1 = VFCOMP_STORE_0; /* fallthrough */
         case 2: comp2 = VFCOMP_STORE_0; /* fallthrough */
         case 3:
            if (GEN_GEN >= 8 && glattrib->Format.Doubles) {
               comp3 = VFCOMP_STORE_0;
            } else if (glattrib->Format.Integer) {
               comp3 = VFCOMP_STORE_1_INT;
            } else {
               comp3 = VFCOMP_STORE_1_FP;
            }

            break;
         }

#if GEN_GEN >= 8
         /* From the BDW PRM, Volume 2d, page 586 (VERTEX_ELEMENT_STATE):
          *
          * "When SourceElementFormat is set to one of the *64*_PASSTHRU
          * formats, 64-bit components are stored in the URB without any
          * conversion. In this case, vertex elements must be written as 128
          * or 256 bits, with VFCOMP_STORE_0 being used to pad the output as
          * required. E.g., if R64_PASSTHRU is used to copy a 64-bit Red
          * component into the URB, Component 1 must be specified as
          * VFCOMP_STORE_0 (with Components 2,3 set to VFCOMP_NOSTORE) in
          * order to output a 128-bit vertex element, or Components 1-3 must
          * be specified as VFCOMP_STORE_0 in order to output a 256-bit vertex
          * element. Likewise, use of R64G64B64_PASSTHRU requires Component 3
          * to be specified as VFCOMP_STORE_0 in order to output a 256-bit
          * vertex element."
          */
         if (glattrib->Format.Doubles && !input->is_dual_slot) {
            /* Store vertex elements which correspond to double and dvec2 vertex
             * shader inputs as 128-bit vertex elements, instead of 256-bits.
             */
            comp2 = VFCOMP_NOSTORE;
            comp3 = VFCOMP_NOSTORE;
         }
#endif

         struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
            .VertexBufferIndex = input->buffer,
            .Valid = true,
            .SourceElementFormat = upload_format,
            .SourceElementOffset = offset,
            .Component0Control = comp0,
            .Component1Control = comp1,
            .Component2Control = comp2,
            .Component3Control = comp3,
#if GEN_GEN < 5
            .DestinationElementOffset = i * 4,
#endif
         };

         GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state);
         dw += GENX(VERTEX_ELEMENT_STATE_length);
      }
   }

   if (needs_sgvs_element) {
      struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
         .Valid = true,
         .Component0Control = VFCOMP_STORE_0,
         .Component1Control = VFCOMP_STORE_0,
         .Component2Control = VFCOMP_STORE_0,
         .Component3Control = VFCOMP_STORE_0,
#if GEN_GEN < 5
         .DestinationElementOffset = i * 4,
#endif
      };

#if GEN_GEN >= 8
      if (uses_draw_params) {
         elem_state.VertexBufferIndex = brw->vb.nr_buffers;
         elem_state.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
         elem_state.Component0Control = VFCOMP_STORE_SRC;
         elem_state.Component1Control = VFCOMP_STORE_SRC;
      }
#else
      elem_state.VertexBufferIndex = brw->vb.nr_buffers;
      elem_state.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
      if (uses_draw_params) {
         elem_state.Component0Control = VFCOMP_STORE_SRC;
         elem_state.Component1Control = VFCOMP_STORE_SRC;
      }

      if (vs_prog_data->uses_vertexid)
         elem_state.Component2Control = VFCOMP_STORE_VID;

      if (vs_prog_data->uses_instanceid)
         elem_state.Component3Control = VFCOMP_STORE_IID;
#endif

      GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state);
      dw += GENX(VERTEX_ELEMENT_STATE_length);
   }

   if (uses_derived_draw_params) {
      struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
         .Valid = true,
         .VertexBufferIndex = brw->vb.nr_buffers + 1,
         .SourceElementFormat = ISL_FORMAT_R32G32_UINT,
         .Component0Control = VFCOMP_STORE_SRC,
         .Component1Control = VFCOMP_STORE_SRC,
         .Component2Control = VFCOMP_STORE_0,
         .Component3Control = VFCOMP_STORE_0,
#if GEN_GEN < 5
         .DestinationElementOffset = i * 4,
#endif
      };

      GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state);
      dw += GENX(VERTEX_ELEMENT_STATE_length);
   }

#if GEN_GEN >= 6
   if (gen6_edgeflag_input) {
      const struct gl_array_attributes *glattrib = gen6_edgeflag_input->glattrib;
      const uint32_t format = brw_get_vertex_surface_type(brw, &glattrib->Format);

      struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
         .Valid = true,
         .VertexBufferIndex = gen6_edgeflag_input->buffer,
         .EdgeFlagEnable = true,
         .SourceElementFormat = format,
         .SourceElementOffset = gen6_edgeflag_input->offset,
         .Component0Control = VFCOMP_STORE_SRC,
         .Component1Control = VFCOMP_STORE_0,
         .Component2Control = VFCOMP_STORE_0,
         .Component3Control = VFCOMP_STORE_0,
      };

      GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state);
      dw += GENX(VERTEX_ELEMENT_STATE_length);
   }
#endif

#if GEN_GEN >= 8
   for (unsigned i = 0, j = 0; i < brw->vb.nr_enabled; i++) {
      const struct brw_vertex_element *input = brw->vb.enabled[i];
      const struct brw_vertex_buffer *buffer = &brw->vb.buffers[input->buffer];
      unsigned element_index;

      /* The edge flag element is reordered to be the last one in the code
       * above so we need to compensate for that in the element indices used
       * below.
       */
      if (input == gen6_edgeflag_input)
         element_index = nr_elements - 1;
      else
         element_index = j++;

      brw_batch_emit(brw, GENX(3DSTATE_VF_INSTANCING), vfi) {
         vfi.VertexElementIndex = element_index;
         vfi.InstancingEnable = buffer->step_rate != 0;
         vfi.InstanceDataStepRate = buffer->step_rate;
      }
   }

   if (vs_prog_data->uses_drawid) {
      const unsigned element = brw->vb.nr_enabled + needs_sgvs_element;

      brw_batch_emit(brw, GENX(3DSTATE_VF_INSTANCING), vfi) {
         vfi.VertexElementIndex = element;
      }
   }
#endif
}

static const struct brw_tracked_state genX(vertices) = {
   .dirty = {
      .mesa = _NEW_POLYGON,
      .brw = BRW_NEW_BATCH |
             BRW_NEW_BLORP |
             BRW_NEW_VERTEX_PROGRAM |
             BRW_NEW_VERTICES |
             BRW_NEW_VS_PROG_DATA,
   },
   .emit = genX(emit_vertices),
};

static void
genX(emit_index_buffer)(struct brw_context *brw)
{
   const struct _mesa_index_buffer *index_buffer = brw->ib.ib;

   if (index_buffer == NULL)
      return;

   vf_invalidate_for_ib_48bit_transition(brw);

   brw_batch_emit(brw, GENX(3DSTATE_INDEX_BUFFER), ib) {
#if GEN_GEN < 8 && !GEN_IS_HASWELL
      assert(brw->ib.enable_cut_index == brw->prim_restart.enable_cut_index);
      ib.CutIndexEnable = brw->ib.enable_cut_index;
#endif
      ib.IndexFormat = brw_get_index_type(index_buffer->index_size);

      /* The VF cache designers apparently cut corners, and made the cache
       * only consider the bottom 32 bits of memory addresses. If you happen
       * to have two index buffers which get placed exactly 4 GiB apart and
       * use them in back-to-back draw calls, you can get collisions. To work
       * around this problem, we restrict index buffers to the low 32 bits of
       * the address space.
       */
      ib.BufferStartingAddress = ro_32_bo(brw->ib.bo, 0);
#if GEN_GEN >= 8
      ib.MOCS = GEN_GEN >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
      ib.BufferSize = brw->ib.size;
#else
      ib.BufferEndingAddress = ro_bo(brw->ib.bo, brw->ib.size - 1);
#endif
   }
}

static const struct brw_tracked_state genX(index_buffer) = {
   .dirty = {
      .mesa = 0,
      .brw = BRW_NEW_BATCH |
             BRW_NEW_BLORP |
             BRW_NEW_INDEX_BUFFER,
   },
   .emit = genX(emit_index_buffer),
};

#if GEN_IS_HASWELL || GEN_GEN >= 8
static void
genX(upload_cut_index)(struct brw_context *brw)
{
   const struct gl_context *ctx = &brw->ctx;

   brw_batch_emit(brw, GENX(3DSTATE_VF), vf) {
      if (ctx->Array._PrimitiveRestart && brw->ib.ib) {
         vf.IndexedDrawCutIndexEnable = true;
         vf.CutIndex = _mesa_primitive_restart_index(ctx, brw->ib.index_size);
      }
   }
}

const struct brw_tracked_state genX(cut_index) = {
   .dirty = {
      .mesa = _NEW_TRANSFORM,
      .brw = BRW_NEW_INDEX_BUFFER,
   },
   .emit = genX(upload_cut_index),
};
#endif

#if GEN_GEN >= 6
/**
 * Determine the appropriate attribute override value to store into the
 * 3DSTATE_SF structure for a given fragment shader attribute. The attribute
 * override value contains two pieces of information: the location of the
 * attribute in the VUE (relative to urb_entry_read_offset, see below), and a
 * flag indicating whether to "swizzle" the attribute based on the direction
 * the triangle is facing.
 *
 * If an attribute is "swizzled", then the given VUE location is used for
 * front-facing triangles, and the VUE location that immediately follows is
 * used for back-facing triangles. We use this to implement the mapping from
 * gl_FrontColor/gl_BackColor to gl_Color.
 *
 * urb_entry_read_offset is the offset into the VUE at which the SF unit is
 * being instructed to begin reading attribute data. It can be set to a
 * nonzero value to prevent the SF unit from wasting time reading elements of
 * the VUE that are not needed by the fragment shader. It is measured in
 * 256-bit increments.
 */
static void
genX(get_attr_override)(struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr,
                        const struct brw_vue_map *vue_map,
                        int urb_entry_read_offset, int fs_attr,
                        bool two_side_color, uint32_t *max_source_attr)
{
   /* Find the VUE slot for this attribute. */
   int slot = vue_map->varying_to_slot[fs_attr];

   /* Viewport and Layer are stored in the VUE header. We need to override
    * them to zero if earlier stages didn't write them, as GL requires that
    * they read back as zero when not explicitly set.
    */
   if (fs_attr == VARYING_SLOT_VIEWPORT || fs_attr == VARYING_SLOT_LAYER) {
      attr->ComponentOverrideX = true;
      attr->ComponentOverrideW = true;
      attr->ConstantSource = CONST_0000;

      if (!(vue_map->slots_valid & VARYING_BIT_LAYER))
         attr->ComponentOverrideY = true;
      if (!(vue_map->slots_valid & VARYING_BIT_VIEWPORT))
         attr->ComponentOverrideZ = true;

      return;
   }

   /* If there was only a back color written but not front, use back
    * as the color instead of undefined
    */
   if (slot == -1 && fs_attr == VARYING_SLOT_COL0)
      slot = vue_map->varying_to_slot[VARYING_SLOT_BFC0];
   if (slot == -1 && fs_attr == VARYING_SLOT_COL1)
      slot = vue_map->varying_to_slot[VARYING_SLOT_BFC1];

   if (slot == -1) {
      /* This attribute does not exist in the VUE--that means that the vertex
       * shader did not write to it. This means that either:
       *
       * (a) This attribute is a texture coordinate, and it is going to be
       * replaced with point coordinates (as a consequence of a call to
       * glTexEnvi(GL_POINT_SPRITE, GL_COORD_REPLACE, GL_TRUE)), so the
       * hardware will ignore whatever attribute override we supply.
       *
       * (b) This attribute is read by the fragment shader but not written by
       * the vertex shader, so its value is undefined. Therefore the
       * attribute override we supply doesn't matter.
       *
       * (c) This attribute is gl_PrimitiveID, and it wasn't written by the
       * previous shader stage.
       *
       * Note that we don't have to worry about the cases where the attribute
       * is gl_PointCoord or is undergoing point sprite coordinate
       * replacement, because in those cases, this function isn't called.
       *
       * In case (c), we need to program the attribute overrides so that the
       * primitive ID will be stored in this slot. In every other case, the
       * attribute override we supply doesn't matter. So just go ahead and
       * program primitive ID in every case.
       */
      attr->ComponentOverrideW = true;
      attr->ComponentOverrideX = true;
      attr->ComponentOverrideY = true;
      attr->ComponentOverrideZ = true;
      attr->ConstantSource = PRIM_ID;
      return;
   }

   /* Compute the location of the attribute relative to urb_entry_read_offset.
    * Each increment of urb_entry_read_offset represents a 256-bit value, so
    * it counts for two 128-bit VUE slots.
    */
   int source_attr = slot - 2 * urb_entry_read_offset;
   assert(source_attr >= 0 && source_attr < 32);

   /* If we are doing two-sided color, and the VUE slot following this one
    * represents a back-facing color, then we need to instruct the SF unit to
    * do back-facing swizzling.
    */
   bool swizzling = two_side_color &&
      ((vue_map->slot_to_varying[slot] == VARYING_SLOT_COL0 &&
        vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC0) ||
       (vue_map->slot_to_varying[slot] == VARYING_SLOT_COL1 &&
        vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC1));

   /* Update max_source_attr. If swizzling, the SF will read this slot + 1. */
   if (*max_source_attr < source_attr + swizzling)
      *max_source_attr = source_attr + swizzling;

   attr->SourceAttribute = source_attr;
   if (swizzling)
      attr->SwizzleSelect = INPUTATTR_FACING;
}
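
/* Worked example (editor's note, not in the original source): with
 * urb_entry_read_offset = 1 the SF starts reading 256 bits (two 128-bit
 * slots) into the VUE, so an attribute living in VUE slot 6 becomes
 * source_attr = 6 - 2 * 1 = 4. If that slot holds gl_FrontColor and slot 7
 * holds gl_BackColor, two-sided color selects the FACING swizzle and the SF
 * also reads slot 7, so max_source_attr is bumped to 5.
 */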


static void
genX(calculate_attr_overrides)(const struct brw_context *brw,
                               struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr_overrides,
                               uint32_t *point_sprite_enables,
                               uint32_t *urb_entry_read_length,
                               uint32_t *urb_entry_read_offset)
{
   const struct gl_context *ctx = &brw->ctx;

   /* _NEW_POINT */
   const struct gl_point_attrib *point = &ctx->Point;

   /* BRW_NEW_FRAGMENT_PROGRAM */
   const struct gl_program *fp = brw->programs[MESA_SHADER_FRAGMENT];

   /* BRW_NEW_FS_PROG_DATA */
   const struct brw_wm_prog_data *wm_prog_data =
      brw_wm_prog_data(brw->wm.base.prog_data);
   uint32_t max_source_attr = 0;

   *point_sprite_enables = 0;

   int first_slot =
      brw_compute_first_urb_slot_required(fp->info.inputs_read,
                                          &brw->vue_map_geom_out);

   /* Each URB offset packs two varying slots */
   assert(first_slot % 2 == 0);
   *urb_entry_read_offset = first_slot / 2;

   /* From the Ivybridge PRM, Vol 2 Part 1, 3DSTATE_SBE,
    * description of dw10 Point Sprite Texture Coordinate Enable:
    *
    * "This field must be programmed to zero when non-point primitives
    * are rendered."
    *
    * The SandyBridge PRM doesn't explicitly say that point sprite enables
    * must be programmed to zero when rendering non-point primitives, but
    * the IvyBridge PRM does, and if we don't, we get garbage.
    *
    * This is not required on Haswell, as the hardware ignores this state
    * when drawing non-points -- although we do still need to be careful to
    * correctly set the attr overrides.
    *
    * _NEW_POLYGON
    * BRW_NEW_PRIMITIVE | BRW_NEW_GS_PROG_DATA | BRW_NEW_TES_PROG_DATA
    */
   bool drawing_points = brw_is_drawing_points(brw);

   for (int attr = 0; attr < VARYING_SLOT_MAX; attr++) {
      int input_index = wm_prog_data->urb_setup[attr];

      if (input_index < 0)
         continue;

      /* _NEW_POINT */
      bool point_sprite = false;
      if (drawing_points) {
         if (point->PointSprite &&
             (attr >= VARYING_SLOT_TEX0 && attr <= VARYING_SLOT_TEX7) &&
             (point->CoordReplace & (1u << (attr - VARYING_SLOT_TEX0)))) {
            point_sprite = true;
         }

         if (attr == VARYING_SLOT_PNTC)
            point_sprite = true;

         if (point_sprite)
            *point_sprite_enables |= (1 << input_index);
      }

      /* BRW_NEW_VUE_MAP_GEOM_OUT | _NEW_LIGHT | _NEW_PROGRAM */
      struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attribute = { 0 };

      if (!point_sprite) {
         genX(get_attr_override)(&attribute,
                                 &brw->vue_map_geom_out,
                                 *urb_entry_read_offset, attr,
                                 _mesa_vertex_program_two_side_enabled(ctx),
                                 &max_source_attr);
      }

      /* The hardware can only apply overrides to the first 16 attributes;
       * the remaining (up to 16) attributes have to be lined up so that the
       * input index equals the output index. We'll need to do some tweaking
       * to make sure that's the case.
       */
      if (input_index < 16)
         attr_overrides[input_index] = attribute;
      else
         assert(attribute.SourceAttribute == input_index);
   }

   /* From the Sandy Bridge PRM, Volume 2, Part 1, documentation for
    * 3DSTATE_SF DWord 1 bits 15:11, "Vertex URB Entry Read Length":
    *
    * "This field should be set to the minimum length required to read the
    * maximum source attribute. The maximum source attribute is indicated
    * by the maximum value of the enabled Attribute # Source Attribute if
    * Attribute Swizzle Enable is set, Number of Output Attributes-1 if
    * enable is not set.
    * read_length = ceiling((max_source_attr + 1) / 2)
    *
    * [errata] Corruption/Hang possible if length programmed larger than
    * recommended"
    *
    * Similar text exists for Ivy Bridge.
    */
   *urb_entry_read_length = DIV_ROUND_UP(max_source_attr + 1, 2);
}
#endif
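
/* Worked example (editor's note, not in the original source): continuing the
 * numbers above, max_source_attr = 5 gives
 * read_length = DIV_ROUND_UP(5 + 1, 2) = 3, i.e. the SF reads three 256-bit
 * chunks (VUE slots 0-5 past the read offset). Programming anything larger
 * risks the corruption/hang called out in the PRM errata quoted above.
 */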

/* ---------------------------------------------------------------------- */

#if GEN_GEN >= 8
typedef struct GENX(3DSTATE_WM_DEPTH_STENCIL) DEPTH_STENCIL_GENXML;
#elif GEN_GEN >= 6
typedef struct GENX(DEPTH_STENCIL_STATE) DEPTH_STENCIL_GENXML;
#else
typedef struct GENX(COLOR_CALC_STATE) DEPTH_STENCIL_GENXML;
#endif

static inline void
set_depth_stencil_bits(struct brw_context *brw, DEPTH_STENCIL_GENXML *ds)
{
   struct gl_context *ctx = &brw->ctx;

   /* _NEW_BUFFERS */
   struct intel_renderbuffer *depth_irb =
      intel_get_renderbuffer(ctx->DrawBuffer, BUFFER_DEPTH);

   /* _NEW_DEPTH */
   struct gl_depthbuffer_attrib *depth = &ctx->Depth;

   /* _NEW_STENCIL */
   struct gl_stencil_attrib *stencil = &ctx->Stencil;
   const int b = stencil->_BackFace;

   if (depth->Test && depth_irb) {
      ds->DepthTestEnable = true;
      ds->DepthBufferWriteEnable = brw_depth_writes_enabled(brw);
      ds->DepthTestFunction = intel_translate_compare_func(depth->Func);
   }

   if (brw->stencil_enabled) {
      ds->StencilTestEnable = true;
      ds->StencilWriteMask = stencil->WriteMask[0] & 0xff;
      ds->StencilTestMask = stencil->ValueMask[0] & 0xff;

      ds->StencilTestFunction =
         intel_translate_compare_func(stencil->Function[0]);
      ds->StencilFailOp =
         intel_translate_stencil_op(stencil->FailFunc[0]);
      ds->StencilPassDepthPassOp =
         intel_translate_stencil_op(stencil->ZPassFunc[0]);
      ds->StencilPassDepthFailOp =
         intel_translate_stencil_op(stencil->ZFailFunc[0]);

      ds->StencilBufferWriteEnable = brw->stencil_write_enabled;

      if (brw->stencil_two_sided) {
         ds->DoubleSidedStencilEnable = true;
         ds->BackfaceStencilWriteMask = stencil->WriteMask[b] & 0xff;
         ds->BackfaceStencilTestMask = stencil->ValueMask[b] & 0xff;

         ds->BackfaceStencilTestFunction =
            intel_translate_compare_func(stencil->Function[b]);
         ds->BackfaceStencilFailOp =
            intel_translate_stencil_op(stencil->FailFunc[b]);
         ds->BackfaceStencilPassDepthPassOp =
            intel_translate_stencil_op(stencil->ZPassFunc[b]);
         ds->BackfaceStencilPassDepthFailOp =
            intel_translate_stencil_op(stencil->ZFailFunc[b]);
      }

#if GEN_GEN <= 5 || GEN_GEN >= 9
      ds->StencilReferenceValue = _mesa_get_stencil_ref(ctx, 0);
      ds->BackfaceStencilReferenceValue = _mesa_get_stencil_ref(ctx, b);
#endif
   }
}

#if GEN_GEN >= 6
static void
genX(upload_depth_stencil_state)(struct brw_context *brw)
{
#if GEN_GEN >= 8
   brw_batch_emit(brw, GENX(3DSTATE_WM_DEPTH_STENCIL), wmds) {
      set_depth_stencil_bits(brw, &wmds);
   }
#else
   uint32_t ds_offset;
   brw_state_emit(brw, GENX(DEPTH_STENCIL_STATE), 64, &ds_offset, ds) {
      set_depth_stencil_bits(brw, &ds);
   }

   /* Now upload a pointer to the indirect state */
#if GEN_GEN == 6
   brw_batch_emit(brw, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
      ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
      ptr.DEPTH_STENCIL_STATEChange = true;
   }
#else
   brw_batch_emit(brw, GENX(3DSTATE_DEPTH_STENCIL_STATE_POINTERS), ptr) {
      ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
   }
#endif
#endif
}

static const struct brw_tracked_state genX(depth_stencil_state) = {
   .dirty = {
      .mesa = _NEW_BUFFERS |
              _NEW_DEPTH |
              _NEW_STENCIL,
      .brw = BRW_NEW_BLORP |
             (GEN_GEN >= 8 ? BRW_NEW_CONTEXT
                           : BRW_NEW_BATCH |
                             BRW_NEW_STATE_BASE_ADDRESS),
   },
   .emit = genX(upload_depth_stencil_state),
};
#endif

/* ---------------------------------------------------------------------- */

#if GEN_GEN <= 5

static void
genX(upload_clip_state)(struct brw_context *brw)
{
   struct gl_context *ctx = &brw->ctx;

   ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
   brw_state_emit(brw, GENX(CLIP_STATE), 32, &brw->clip.state_offset, clip) {
      clip.KernelStartPointer = KSP(brw, brw->clip.prog_offset);
      clip.GRFRegisterCount =
         DIV_ROUND_UP(brw->clip.prog_data->total_grf, 16) - 1;
      clip.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
      clip.SingleProgramFlow = true;
      clip.VertexURBEntryReadLength = brw->clip.prog_data->urb_read_length;
      clip.ConstantURBEntryReadLength = brw->clip.prog_data->curb_read_length;

      /* BRW_NEW_PUSH_CONSTANT_ALLOCATION */
      clip.ConstantURBEntryReadOffset = brw->curbe.clip_start * 2;
      clip.DispatchGRFStartRegisterForURBData = 1;
      clip.VertexURBEntryReadOffset = 0;

      /* BRW_NEW_URB_FENCE */
      clip.NumberofURBEntries = brw->urb.nr_clip_entries;
      clip.URBEntryAllocationSize = brw->urb.vsize - 1;

      if (brw->urb.nr_clip_entries >= 10) {
         /* Half of the URB entries go to each thread, and it has to be an
          * even number.
          */
         assert(brw->urb.nr_clip_entries % 2 == 0);

         /* Although up to 16 concurrent Clip threads are allowed on Ironlake,
          * only 2 threads can output VUEs at a time.
          */
         clip.MaximumNumberofThreads = (GEN_GEN == 5 ? 16 : 2) - 1;
      } else {
         assert(brw->urb.nr_clip_entries >= 5);
         clip.MaximumNumberofThreads = 1 - 1;
      }

      clip.VertexPositionSpace = VPOS_NDCSPACE;
      clip.UserClipFlagsMustClipEnable = true;
      clip.GuardbandClipTestEnable = true;

      clip.ClipperViewportStatePointer =
         ro_bo(brw->batch.state.bo, brw->clip.vp_offset);

      clip.ScreenSpaceViewportXMin = -1;
      clip.ScreenSpaceViewportXMax = 1;
      clip.ScreenSpaceViewportYMin = -1;
      clip.ScreenSpaceViewportYMax = 1;

      clip.ViewportXYClipTestEnable = true;
      clip.ViewportZClipTestEnable = !(ctx->Transform.DepthClampNear &&
                                       ctx->Transform.DepthClampFar);

      /* _NEW_TRANSFORM */
      if (GEN_GEN == 5 || GEN_IS_G4X) {
         clip.UserClipDistanceClipTestEnableBitmask =
            ctx->Transform.ClipPlanesEnabled;
      } else {
         /* Up to 6 actual clip flags, plus the 7th for the negative RHW
          * workaround.
          */
         clip.UserClipDistanceClipTestEnableBitmask =
            (ctx->Transform.ClipPlanesEnabled & 0x3f) | 0x40;
      }

      if (ctx->Transform.ClipDepthMode == GL_ZERO_TO_ONE)
         clip.APIMode = APIMODE_D3D;
      else
         clip.APIMode = APIMODE_OGL;

      clip.GuardbandClipTestEnable = true;

      clip.ClipMode = brw->clip.prog_data->clip_mode;

#if GEN_IS_G4X
      clip.NegativeWClipTestEnable = true;
#endif
   }
}

const struct brw_tracked_state genX(clip_state) = {
   .dirty = {
      .mesa = _NEW_TRANSFORM |
              _NEW_VIEWPORT,
      .brw = BRW_NEW_BATCH |
             BRW_NEW_BLORP |
             BRW_NEW_CLIP_PROG_DATA |
             BRW_NEW_PUSH_CONSTANT_ALLOCATION |
             BRW_NEW_PROGRAM_CACHE |
             BRW_NEW_URB_FENCE,
   },
   .emit = genX(upload_clip_state),
};

#else

static void
genX(upload_clip_state)(struct brw_context *brw)
{
   struct gl_context *ctx = &brw->ctx;

   /* _NEW_BUFFERS */
   struct gl_framebuffer *fb = ctx->DrawBuffer;

   /* BRW_NEW_FS_PROG_DATA */
   struct brw_wm_prog_data *wm_prog_data =
      brw_wm_prog_data(brw->wm.base.prog_data);

   brw_batch_emit(brw, GENX(3DSTATE_CLIP), clip) {
      clip.StatisticsEnable = !brw->meta_in_progress;

      if (wm_prog_data->barycentric_interp_modes &
          BRW_BARYCENTRIC_NONPERSPECTIVE_BITS)
         clip.NonPerspectiveBarycentricEnable = true;

#if GEN_GEN >= 7
      clip.EarlyCullEnable = true;
#endif

#if GEN_GEN == 7
      clip.FrontWinding = brw->polygon_front_bit != fb->FlipY;

      if (ctx->Polygon.CullFlag) {
         switch (ctx->Polygon.CullFaceMode) {
         case GL_FRONT:
            clip.CullMode = CULLMODE_FRONT;
            break;
         case GL_BACK:
            clip.CullMode = CULLMODE_BACK;
            break;
         case GL_FRONT_AND_BACK:
            clip.CullMode = CULLMODE_BOTH;
            break;
         default:
            unreachable("Should not get here: invalid CullFlag");
         }
      } else {
         clip.CullMode = CULLMODE_NONE;
      }
#endif

#if GEN_GEN < 8
      clip.UserClipDistanceCullTestEnableBitmask =
         brw_vue_prog_data(brw->vs.base.prog_data)->cull_distance_mask;

      clip.ViewportZClipTestEnable = !(ctx->Transform.DepthClampNear &&
                                       ctx->Transform.DepthClampFar);
#endif

      /* _NEW_LIGHT */
      if (ctx->Light.ProvokingVertex == GL_FIRST_VERTEX_CONVENTION) {
         clip.TriangleStripListProvokingVertexSelect = 0;
         clip.TriangleFanProvokingVertexSelect = 1;
         clip.LineStripListProvokingVertexSelect = 0;
      } else {
         clip.TriangleStripListProvokingVertexSelect = 2;
         clip.TriangleFanProvokingVertexSelect = 2;
         clip.LineStripListProvokingVertexSelect = 1;
      }

      /* _NEW_TRANSFORM */
      clip.UserClipDistanceClipTestEnableBitmask =
         ctx->Transform.ClipPlanesEnabled;

#if GEN_GEN >= 8
      clip.ForceUserClipDistanceClipTestEnableBitmask = true;
#endif

      if (ctx->Transform.ClipDepthMode == GL_ZERO_TO_ONE)
         clip.APIMode = APIMODE_D3D;
      else
         clip.APIMode = APIMODE_OGL;

      clip.GuardbandClipTestEnable = true;

      /* BRW_NEW_VIEWPORT_COUNT */
      const unsigned viewport_count = brw->clip.viewport_count;

      if (ctx->RasterDiscard) {
         clip.ClipMode = CLIPMODE_REJECT_ALL;
#if GEN_GEN == 6
         perf_debug("Rasterizer discard is currently implemented via the "
                    "clipper; having the GS not write primitives would "
                    "likely be faster.\n");
#endif
      } else {
         clip.ClipMode = CLIPMODE_NORMAL;
      }

      clip.ClipEnable = true;

      /* _NEW_POLYGON,
       * BRW_NEW_GEOMETRY_PROGRAM | BRW_NEW_TES_PROG_DATA | BRW_NEW_PRIMITIVE
       */
      if (!brw_is_drawing_points(brw) && !brw_is_drawing_lines(brw))
         clip.ViewportXYClipTestEnable = true;

      clip.MinimumPointWidth = 0.125;
      clip.MaximumPointWidth = 255.875;
      clip.MaximumVPIndex = viewport_count - 1;
      if (_mesa_geometric_layers(fb) == 0)
         clip.ForceZeroRTAIndexEnable = true;
   }
}

static const struct brw_tracked_state genX(clip_state) = {
   .dirty = {
      .mesa = _NEW_BUFFERS |
              _NEW_LIGHT |
              _NEW_POLYGON |
              _NEW_TRANSFORM,
      .brw = BRW_NEW_BLORP |
             BRW_NEW_CONTEXT |
             BRW_NEW_FS_PROG_DATA |
             BRW_NEW_GS_PROG_DATA |
             BRW_NEW_VS_PROG_DATA |
             BRW_NEW_META_IN_PROGRESS |
             BRW_NEW_PRIMITIVE |
             BRW_NEW_RASTERIZER_DISCARD |
             BRW_NEW_TES_PROG_DATA |
             BRW_NEW_VIEWPORT_COUNT,
   },
   .emit = genX(upload_clip_state),
};
#endif

/* ---------------------------------------------------------------------- */

static void
genX(upload_sf)(struct brw_context *brw)
{
   struct gl_context *ctx = &brw->ctx;
   float point_size;

#if GEN_GEN <= 7
   /* _NEW_BUFFERS */
   bool flip_y = ctx->DrawBuffer->FlipY;
   UNUSED const bool multisampled_fbo =
      _mesa_geometric_samples(ctx->DrawBuffer) > 1;
#endif

#if GEN_GEN < 6
   const struct brw_sf_prog_data *sf_prog_data = brw->sf.prog_data;

   ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;

   brw_state_emit(brw, GENX(SF_STATE), 64, &brw->sf.state_offset, sf) {
      sf.KernelStartPointer = KSP(brw, brw->sf.prog_offset);
      sf.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
      sf.GRFRegisterCount = DIV_ROUND_UP(sf_prog_data->total_grf, 16) - 1;
      sf.DispatchGRFStartRegisterForURBData = 3;
      sf.VertexURBEntryReadOffset = BRW_SF_URB_ENTRY_READ_OFFSET;
      sf.VertexURBEntryReadLength = sf_prog_data->urb_read_length;
      sf.NumberofURBEntries = brw->urb.nr_sf_entries;
      sf.URBEntryAllocationSize = brw->urb.sfsize - 1;

      /* The STATE_PREFETCH command description describes this state as being
       * loaded through the GPE (L2 ISC), so it's in the INSTRUCTION domain.
       */
      sf.SetupViewportStateOffset =
         ro_bo(brw->batch.state.bo, brw->sf.vp_offset);

      sf.PointRasterizationRule = RASTRULE_UPPER_RIGHT;

      /* sf.ConstantURBEntryReadLength = stage_prog_data->curb_read_length; */
      /* sf.ConstantURBEntryReadOffset = brw->curbe.vs_start * 2; */

      sf.MaximumNumberofThreads =
         MIN2(GEN_GEN == 5 ? 48 : 24, brw->urb.nr_sf_entries) - 1;

      sf.SpritePointEnable = ctx->Point.PointSprite;

      sf.DestinationOriginHorizontalBias = 0.5;
      sf.DestinationOriginVerticalBias = 0.5;
#else
   brw_batch_emit(brw, GENX(3DSTATE_SF), sf) {
      sf.StatisticsEnable = true;
#endif
      sf.ViewportTransformEnable = true;

#if GEN_GEN == 7
      /* _NEW_BUFFERS */
      sf.DepthBufferSurfaceFormat = brw_depthbuffer_format(brw);
#endif

#if GEN_GEN <= 7
      /* _NEW_POLYGON */
      sf.FrontWinding = brw->polygon_front_bit != flip_y;
#if GEN_GEN >= 6
      sf.GlobalDepthOffsetEnableSolid = ctx->Polygon.OffsetFill;
      sf.GlobalDepthOffsetEnableWireframe = ctx->Polygon.OffsetLine;
      sf.GlobalDepthOffsetEnablePoint = ctx->Polygon.OffsetPoint;

      switch (ctx->Polygon.FrontMode) {
      case GL_FILL:
         sf.FrontFaceFillMode = FILL_MODE_SOLID;
         break;
      case GL_LINE:
         sf.FrontFaceFillMode = FILL_MODE_WIREFRAME;
         break;
      case GL_POINT:
         sf.FrontFaceFillMode = FILL_MODE_POINT;
         break;
      default:
         unreachable("not reached");
      }

      switch (ctx->Polygon.BackMode) {
      case GL_FILL:
         sf.BackFaceFillMode = FILL_MODE_SOLID;
         break;
      case GL_LINE:
         sf.BackFaceFillMode = FILL_MODE_WIREFRAME;
         break;
      case GL_POINT:
         sf.BackFaceFillMode = FILL_MODE_POINT;
         break;
      default:
         unreachable("not reached");
      }

      if (multisampled_fbo && ctx->Multisample.Enabled)
         sf.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;

      sf.GlobalDepthOffsetConstant = ctx->Polygon.OffsetUnits * 2;
      sf.GlobalDepthOffsetScale = ctx->Polygon.OffsetFactor;
      sf.GlobalDepthOffsetClamp = ctx->Polygon.OffsetClamp;
#endif

      sf.ScissorRectangleEnable = true;

      if (ctx->Polygon.CullFlag) {
         switch (ctx->Polygon.CullFaceMode) {
         case GL_FRONT:
            sf.CullMode = CULLMODE_FRONT;
            break;
         case GL_BACK:
            sf.CullMode = CULLMODE_BACK;
            break;
         case GL_FRONT_AND_BACK:
            sf.CullMode = CULLMODE_BOTH;
            break;
         default:
            unreachable("not reached");
         }
      } else {
         sf.CullMode = CULLMODE_NONE;
      }

#if GEN_IS_HASWELL
      sf.LineStippleEnable = ctx->Line.StippleFlag;
#endif

#endif

      /* _NEW_LINE */
#if GEN_GEN == 8
      const struct gen_device_info *devinfo = &brw->screen->devinfo;

      if (devinfo->is_cherryview)
         sf.CHVLineWidth = brw_get_line_width(brw);
      else
         sf.LineWidth = brw_get_line_width(brw);
#else
      sf.LineWidth = brw_get_line_width(brw);
#endif

      if (ctx->Line.SmoothFlag) {
         sf.LineEndCapAntialiasingRegionWidth = _10pixels;
#if GEN_GEN <= 7
         sf.AntiAliasingEnable = true;
#endif
      }

      /* _NEW_POINT - Clamp to ARB_point_parameters user limits */
      point_size = CLAMP(ctx->Point.Size, ctx->Point.MinSize, ctx->Point.MaxSize);
      /* Clamp to the hardware limits */
      sf.PointWidth = CLAMP(point_size, 0.125f, 255.875f);
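
      /* Editor's illustration (not part of the original source): with
       * application-visible limits of, say, MinSize = 1.0 and MaxSize = 64.0,
       * a requested gl_PointSize of 100.0 clamps to 64.0 in the first CLAMP
       * above; the second CLAMP only kicks in when the GL limits exceed the
       * hardware's representable 0.125-255.875 range.
       */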

      /* _NEW_PROGRAM | _NEW_POINT, BRW_NEW_VUE_MAP_GEOM_OUT */
      if (use_state_point_size(brw))
         sf.PointWidthSource = State;

#if GEN_GEN >= 8
      /* _NEW_POINT | _NEW_MULTISAMPLE */
      if ((ctx->Point.SmoothFlag || _mesa_is_multisample_enabled(ctx)) &&
          !ctx->Point.PointSprite)
         sf.SmoothPointEnable = true;
#endif

#if GEN_GEN == 10
      /* _NEW_BUFFERS
       * Smooth Point Enable bit MUST not be set when NUM_MULTISAMPLES > 1.
       */
      const bool multisampled_fbo =
         _mesa_geometric_samples(ctx->DrawBuffer) > 1;
      if (multisampled_fbo)
         sf.SmoothPointEnable = false;
#endif

#if GEN_IS_G4X || GEN_GEN >= 5
      sf.AALineDistanceMode = AALINEDISTANCE_TRUE;
#endif

      /* _NEW_LIGHT */
      if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION) {
         sf.TriangleStripListProvokingVertexSelect = 2;
         sf.TriangleFanProvokingVertexSelect = 2;
         sf.LineStripListProvokingVertexSelect = 1;
      } else {
         sf.TriangleFanProvokingVertexSelect = 1;
      }

#if GEN_GEN == 6
      /* BRW_NEW_FS_PROG_DATA */
      const struct brw_wm_prog_data *wm_prog_data =
         brw_wm_prog_data(brw->wm.base.prog_data);

      sf.AttributeSwizzleEnable = true;
      sf.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;

      /*
       * Window coordinates in an FBO are inverted, which means point
       * sprite origin must be inverted, too.
       */
      if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) == flip_y) {
         sf.PointSpriteTextureCoordinateOrigin = LOWERLEFT;
      } else {
         sf.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
      }

      /* BRW_NEW_VUE_MAP_GEOM_OUT | BRW_NEW_FRAGMENT_PROGRAM |
       * _NEW_POINT | _NEW_LIGHT | _NEW_PROGRAM | BRW_NEW_FS_PROG_DATA
       */
      uint32_t urb_entry_read_length;
      uint32_t urb_entry_read_offset;
      uint32_t point_sprite_enables;
      genX(calculate_attr_overrides)(brw, sf.Attribute, &point_sprite_enables,
                                     &urb_entry_read_length,
                                     &urb_entry_read_offset);
      sf.VertexURBEntryReadLength = urb_entry_read_length;
      sf.VertexURBEntryReadOffset = urb_entry_read_offset;
      sf.PointSpriteTextureCoordinateEnable = point_sprite_enables;
      sf.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
#endif
   }
}

static const struct brw_tracked_state genX(sf_state) = {
   .dirty = {
      .mesa = _NEW_LIGHT |
              _NEW_LINE |
              _NEW_POINT |
              _NEW_PROGRAM |
              (GEN_GEN >= 6 ? _NEW_MULTISAMPLE : 0) |
              (GEN_GEN <= 7 ? _NEW_BUFFERS | _NEW_POLYGON : 0) |
              (GEN_GEN == 10 ? _NEW_BUFFERS : 0),
      .brw = BRW_NEW_BLORP |
             BRW_NEW_VUE_MAP_GEOM_OUT |
             (GEN_GEN <= 5 ? BRW_NEW_BATCH |
                             BRW_NEW_PROGRAM_CACHE |
                             BRW_NEW_SF_PROG_DATA |
                             BRW_NEW_SF_VP |
                             BRW_NEW_URB_FENCE
                           : 0) |
             (GEN_GEN >= 6 ? BRW_NEW_CONTEXT : 0) |
             (GEN_GEN >= 6 && GEN_GEN <= 7 ?
                             BRW_NEW_GS_PROG_DATA |
                             BRW_NEW_PRIMITIVE |
                             BRW_NEW_TES_PROG_DATA
                           : 0) |
             (GEN_GEN == 6 ? BRW_NEW_FS_PROG_DATA |
                             BRW_NEW_FRAGMENT_PROGRAM
                           : 0),
   },
   .emit = genX(upload_sf),
};

/* ---------------------------------------------------------------------- */

static bool
brw_color_buffer_write_enabled(struct brw_context *brw)
{
   struct gl_context *ctx = &brw->ctx;
   /* BRW_NEW_FRAGMENT_PROGRAM */
   const struct gl_program *fp = brw->programs[MESA_SHADER_FRAGMENT];
   unsigned i;

   /* _NEW_BUFFERS */
   for (i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; i++) {
      struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];
      uint64_t outputs_written = fp->info.outputs_written;

      /* _NEW_COLOR */
      if (rb && (outputs_written & BITFIELD64_BIT(FRAG_RESULT_COLOR) ||
                 outputs_written & BITFIELD64_BIT(FRAG_RESULT_DATA0 + i)) &&
          GET_COLORMASK(ctx->Color.ColorMask, i)) {
         return true;
      }
   }

   return false;
}

static void
genX(upload_wm)(struct brw_context *brw)
{
   struct gl_context *ctx = &brw->ctx;

   /* BRW_NEW_FS_PROG_DATA */
   const struct brw_wm_prog_data *wm_prog_data =
      brw_wm_prog_data(brw->wm.base.prog_data);

   UNUSED bool writes_depth =
      wm_prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF;
   UNUSED struct brw_stage_state *stage_state = &brw->wm.base;
   UNUSED const struct gen_device_info *devinfo = &brw->screen->devinfo;

#if GEN_GEN == 6
   /* We can't fold this into gen6_upload_wm_push_constants(), because
    * according to the SNB PRM, vol 2 part 1 section 7.2.2
    * (3DSTATE_CONSTANT_PS [DevSNB]):
    *
    * "[DevSNB]: This packet must be followed by WM_STATE."
    */
   brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_PS), wmcp) {
      if (wm_prog_data->base.nr_params != 0) {
         wmcp.Buffer0Valid = true;
         /* Pointer to the WM constant buffer. Covered by the set of
          * state flags from gen6_upload_wm_push_constants.
          */
         wmcp.ConstantBody.PointertoConstantBuffer0 = stage_state->push_const_offset;
         wmcp.ConstantBody.ConstantBuffer0ReadLength = stage_state->push_const_size - 1;
      }
   }
#endif

#if GEN_GEN >= 6
   brw_batch_emit(brw, GENX(3DSTATE_WM), wm) {
#else
   ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
   brw_state_emit(brw, GENX(WM_STATE), 64, &stage_state->state_offset, wm) {
#endif

#if GEN_GEN <= 6
      wm._8PixelDispatchEnable = wm_prog_data->dispatch_8;
      wm._16PixelDispatchEnable = wm_prog_data->dispatch_16;
      wm._32PixelDispatchEnable = wm_prog_data->dispatch_32;
#endif

#if GEN_GEN == 4
      /* On gen4, we only have one shader kernel */
      if (brw_wm_state_has_ksp(wm, 0)) {
         assert(brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0) == 0);
         wm.KernelStartPointer0 = KSP(brw, stage_state->prog_offset);
         wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 0);
         wm.DispatchGRFStartRegisterForConstantSetupData0 =
            brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 0);
      }
#elif GEN_GEN == 5
      /* On gen5, we have multiple shader kernels but only one GRF start
       * register for all kernels
       */
      wm.KernelStartPointer0 = stage_state->prog_offset +
                               brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0);
      wm.KernelStartPointer1 = stage_state->prog_offset +
                               brw_wm_prog_data_prog_offset(wm_prog_data, wm, 1);
      wm.KernelStartPointer2 = stage_state->prog_offset +
                               brw_wm_prog_data_prog_offset(wm_prog_data, wm, 2);
1845
1846 wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 0);
1847 wm.GRFRegisterCount1 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 1);
1848 wm.GRFRegisterCount2 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 2);
1849
1850 wm.DispatchGRFStartRegisterForConstantSetupData0 =
1851 wm_prog_data->base.dispatch_grf_start_reg;
1852
1853 /* Dispatch GRF Start should be the same for all shaders on gen5 */
1854 if (brw_wm_state_has_ksp(wm, 1)) {
1855 assert(wm_prog_data->base.dispatch_grf_start_reg ==
1856 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 1));
1857 }
1858 if (brw_wm_state_has_ksp(wm, 2)) {
1859 assert(wm_prog_data->base.dispatch_grf_start_reg ==
1860 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 2));
1861 }
1862 #elif GEN_GEN == 6
1863 /* On gen6, we have multiple shader kernels and we no longer specify a
1864 * register count for each one.
1865 */
1866 wm.KernelStartPointer0 = stage_state->prog_offset +
1867 brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0);
1868 wm.KernelStartPointer1 = stage_state->prog_offset +
1869 brw_wm_prog_data_prog_offset(wm_prog_data, wm, 1);
1870 wm.KernelStartPointer2 = stage_state->prog_offset +
1871 brw_wm_prog_data_prog_offset(wm_prog_data, wm, 2);
1872
1873 wm.DispatchGRFStartRegisterForConstantSetupData0 =
1874 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 0);
1875 wm.DispatchGRFStartRegisterForConstantSetupData1 =
1876 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 1);
1877 wm.DispatchGRFStartRegisterForConstantSetupData2 =
1878 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 2);
1879 #endif
1880
1881 #if GEN_GEN <= 5
1882 wm.ConstantURBEntryReadLength = wm_prog_data->base.curb_read_length;
1883 /* BRW_NEW_PUSH_CONSTANT_ALLOCATION */
1884 wm.ConstantURBEntryReadOffset = brw->curbe.wm_start * 2;
1885 wm.SetupURBEntryReadLength = wm_prog_data->num_varying_inputs * 2;
1886 wm.SetupURBEntryReadOffset = 0;
1887 wm.EarlyDepthTestEnable = true;
1888 #endif
1889
1890 #if GEN_GEN >= 6
1891 wm.LineAntialiasingRegionWidth = _10pixels;
1892 wm.LineEndCapAntialiasingRegionWidth = _05pixels;
1893
1894 wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
1895 wm.BarycentricInterpolationMode = wm_prog_data->barycentric_interp_modes;
1896 #else
1897 if (stage_state->sampler_count)
1898 wm.SamplerStatePointer =
1899 ro_bo(brw->batch.state.bo, stage_state->sampler_offset);
1900
1901 wm.LineAntialiasingRegionWidth = _05pixels;
1902 wm.LineEndCapAntialiasingRegionWidth = _10pixels;
1903
1904 /* _NEW_POLYGON */
1905 if (ctx->Polygon.OffsetFill) {
1906 wm.GlobalDepthOffsetEnable = true;
1907 /* Something weird is going on with legacy_global_depth_bias,
1908 * offset_constant, scaling and MRD. This value passes glean
1909 * but gives some odd results elsewhere (e.g. the
1910 * quad-offset-units test).
1911 */
1912 wm.GlobalDepthOffsetConstant = ctx->Polygon.OffsetUnits * 2;
1913
1914 /* This is the only value that passes glean:
1915 */
1916 wm.GlobalDepthOffsetScale = ctx->Polygon.OffsetFactor;
1917 }
1918
1919 wm.DepthCoefficientURBReadOffset = 1;
1920 #endif
1921
1922 /* BRW_NEW_STATS_WM */
1923 wm.StatisticsEnable = GEN_GEN >= 6 || brw->stats_wm;
1924
1925 #if GEN_GEN < 7
1926 if (wm_prog_data->base.use_alt_mode)
1927 wm.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
1928
1929 /* WA_1606682166 */
1930 wm.SamplerCount = (GEN_GEN == 5 || GEN_GEN == 11) ?
1931 0 : DIV_ROUND_UP(stage_state->sampler_count, 4);
1932
1933 wm.BindingTableEntryCount =
1934 wm_prog_data->base.binding_table.size_bytes / 4;
1935 wm.MaximumNumberofThreads = devinfo->max_wm_threads - 1;
1936
1937 #if GEN_GEN == 6
1938 wm.DualSourceBlendEnable =
1939 wm_prog_data->dual_src_blend && (ctx->Color.BlendEnabled & 1) &&
1940 ctx->Color.Blend[0]._UsesDualSrc;
1941 wm.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
1942 wm.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
1943
1944 /* From the SNB PRM, volume 2 part 1, page 281:
1945 * "If the PS kernel does not need the Position XY Offsets
1946 * to compute a Position XY value, then this field should be
1947 * programmed to POSOFFSET_NONE."
1948 *
1949 * "SW Recommendation: If the PS kernel needs the Position Offsets
1950 * to compute a Position XY value, this field should match Position
1951 * ZW Interpolation Mode to ensure a consistent position.xyzw
1952 * computation."
1953 * We only require XY sample offsets. So, this recommendation doesn't
1954 * look useful at the moment. We might need it in the future.
1955 */
1956 if (wm_prog_data->uses_pos_offset)
1957 wm.PositionXYOffsetSelect = POSOFFSET_SAMPLE;
1958 else
1959 wm.PositionXYOffsetSelect = POSOFFSET_NONE;
1960 #endif
1961
1962 if (wm_prog_data->base.total_scratch) {
1963 wm.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0);
1964 wm.PerThreadScratchSpace =
1965 ffs(stage_state->per_thread_scratch) - 11;
1966 }
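
/* Illustration (values assumed): per_thread_scratch is a power of two,
 * and PerThreadScratchSpace encodes log2(bytes) - 10, so 2KB of scratch
 * per thread packs as ffs(2048) - 11 = 1 and 1MB as ffs(1 << 20) - 11
 * = 10. */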
1967
1968 wm.PixelShaderComputedDepth = writes_depth;
1969 #endif
1970
1971 /* _NEW_LINE */
1972 wm.LineStippleEnable = ctx->Line.StippleFlag;
1973
1974 /* _NEW_POLYGON */
1975 wm.PolygonStippleEnable = ctx->Polygon.StippleFlag;
1976
1977 #if GEN_GEN < 8
1978
1979 #if GEN_GEN >= 6
1980 wm.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;
1981
1982 /* _NEW_BUFFERS */
1983 const bool multisampled_fbo = _mesa_geometric_samples(ctx->DrawBuffer) > 1;
1984
1985 if (multisampled_fbo) {
1986 /* _NEW_MULTISAMPLE */
1987 if (ctx->Multisample.Enabled)
1988 wm.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
1989 else
1990 wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
1991
1992 if (wm_prog_data->persample_dispatch)
1993 wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
1994 else
1995 wm.MultisampleDispatchMode = MSDISPMODE_PERPIXEL;
1996 } else {
1997 wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
1998 wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
1999 }
2000 #endif
2001 wm.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
2002 if (wm_prog_data->uses_kill ||
2003 _mesa_is_alpha_test_enabled(ctx) ||
2004 _mesa_is_alpha_to_coverage_enabled(ctx) ||
2005 (GEN_GEN >= 6 && wm_prog_data->uses_omask)) {
2006 wm.PixelShaderKillsPixel = true;
2007 }
2008
2009 /* _NEW_BUFFERS | _NEW_COLOR */
2010 if (brw_color_buffer_write_enabled(brw) || writes_depth ||
2011 wm.PixelShaderKillsPixel ||
2012 (GEN_GEN >= 6 && wm_prog_data->has_side_effects)) {
2013 wm.ThreadDispatchEnable = true;
2014 }
2015
2016 #if GEN_GEN >= 7
2017 wm.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
2018 wm.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask;
2019 #endif
2020
2021 /* The "UAV access enable" bits are unnecessary on HSW because they only
2022 * seem to have an effect on the HW-assisted coherency mechanism which we
2023 * don't need, and the rasterization-related UAV_ONLY flag and the
2024 * DISPATCH_ENABLE bit can be set independently from it.
2025 * Cf. gen8_upload_ps_extra().
2026 *
2027 * BRW_NEW_FRAGMENT_PROGRAM | BRW_NEW_FS_PROG_DATA | _NEW_BUFFERS |
2028 * _NEW_COLOR
2029 */
2030 #if GEN_IS_HASWELL
2031 if (!(brw_color_buffer_write_enabled(brw) || writes_depth) &&
2032 wm_prog_data->has_side_effects)
2033 wm.PSUAVonly = ON;
2034 #endif
2035 #endif
2036
2037 #if GEN_GEN >= 7
2038 /* BRW_NEW_FS_PROG_DATA */
2039 if (wm_prog_data->early_fragment_tests)
2040 wm.EarlyDepthStencilControl = EDSC_PREPS;
2041 else if (wm_prog_data->has_side_effects)
2042 wm.EarlyDepthStencilControl = EDSC_PSEXEC;
2043 #endif
2044 }
2045
2046 #if GEN_GEN <= 5
2047 if (brw->wm.offset_clamp != ctx->Polygon.OffsetClamp) {
2048 brw_batch_emit(brw, GENX(3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP), clamp) {
2049 clamp.GlobalDepthOffsetClamp = ctx->Polygon.OffsetClamp;
2050 }
2051
2052 brw->wm.offset_clamp = ctx->Polygon.OffsetClamp;
2053 }
2054 #endif
2055 }
2056
2057 static const struct brw_tracked_state genX(wm_state) = {
2058 .dirty = {
2059 .mesa = _NEW_LINE |
2060 _NEW_POLYGON |
2061 (GEN_GEN < 8 ? _NEW_BUFFERS |
2062 _NEW_COLOR :
2063 0) |
2064 (GEN_GEN == 6 ? _NEW_PROGRAM_CONSTANTS : 0) |
2065 (GEN_GEN < 6 ? _NEW_POLYGONSTIPPLE : 0) |
2066 (GEN_GEN < 8 && GEN_GEN >= 6 ? _NEW_MULTISAMPLE : 0),
2067 .brw = BRW_NEW_BLORP |
2068 BRW_NEW_FS_PROG_DATA |
2069 (GEN_GEN < 6 ? BRW_NEW_PUSH_CONSTANT_ALLOCATION |
2070 BRW_NEW_FRAGMENT_PROGRAM |
2071 BRW_NEW_PROGRAM_CACHE |
2072 BRW_NEW_SAMPLER_STATE_TABLE |
2073 BRW_NEW_STATS_WM
2074 : 0) |
2075 (GEN_GEN < 7 ? BRW_NEW_BATCH : BRW_NEW_CONTEXT),
2076 },
2077 .emit = genX(upload_wm),
2078 };
2079
2080 /* ---------------------------------------------------------------------- */
2081
2082 /* We restrict scratch buffers to the bottom 32 bits of the address space
2083 * by using rw_32_bo().
2084 *
2085 * General State Base Address is a bit broken. If the address + size as
2086 * seen by STATE_BASE_ADDRESS overflows 48 bits, the GPU appears to treat
2087 * all accesses to the buffer as being out of bounds and returns zero.
2088 */
2089
2090 #define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix) \
2091 pkt.KernelStartPointer = KSP(brw, stage_state->prog_offset); \
2092 /* WA_1606682166 */ \
2093 pkt.SamplerCount = \
2094 GEN_GEN == 11 ? \
2095 0 : \
2096 DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4); \
2097 /* Gen 11 workarounds table #2056 WABTPPrefetchDisable suggests \
2098 * disabling prefetching of binding tables in A0 and B0 steppings. \
2099 * TODO: Revisit this WA on C0 stepping. \
2100 */ \
2101 pkt.BindingTableEntryCount = \
2102 GEN_GEN == 11 ? \
2103 0 : \
2104 stage_prog_data->binding_table.size_bytes / 4; \
2105 pkt.FloatingPointMode = stage_prog_data->use_alt_mode; \
2106 \
2107 if (stage_prog_data->total_scratch) { \
2108 pkt.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0); \
2109 pkt.PerThreadScratchSpace = \
2110 ffs(stage_state->per_thread_scratch) - 11; \
2111 } \
2112 \
2113 pkt.DispatchGRFStartRegisterForURBData = \
2114 stage_prog_data->dispatch_grf_start_reg; \
2115 pkt.prefix##URBEntryReadLength = vue_prog_data->urb_read_length; \
2116 pkt.prefix##URBEntryReadOffset = 0; \
2117 \
2118 pkt.StatisticsEnable = true; \
2119 pkt.Enable = true;
2120
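/* Illustration (values assumed) of the SamplerCount encoding in the macro
 * above: the field counts groups of four samplers, so 5 bound samplers
 * report as DIV_ROUND_UP(CLAMP(5, 0, 16), 4) = 2, with the CLAMP keeping
 * pathological counts inside the 0-16 range the field can express. */
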
2121 static void
2122 genX(upload_vs_state)(struct brw_context *brw)
2123 {
2124 UNUSED struct gl_context *ctx = &brw->ctx;
2125 const struct gen_device_info *devinfo = &brw->screen->devinfo;
2126 struct brw_stage_state *stage_state = &brw->vs.base;
2127
2128 /* BRW_NEW_VS_PROG_DATA */
2129 const struct brw_vue_prog_data *vue_prog_data =
2130 brw_vue_prog_data(brw->vs.base.prog_data);
2131 const struct brw_stage_prog_data *stage_prog_data = &vue_prog_data->base;
2132
2133 assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8 ||
2134 vue_prog_data->dispatch_mode == DISPATCH_MODE_4X2_DUAL_OBJECT);
2135 assert(GEN_GEN < 11 ||
2136 vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8);
2137
2138 #if GEN_GEN == 6
2139 /* From the BSpec, 3D Pipeline > Geometry > Vertex Shader > State,
2140 * 3DSTATE_VS, Dword 5.0 "VS Function Enable":
2141 *
2142 * [DevSNB] A pipeline flush must be programmed prior to a 3DSTATE_VS
2143 * command that causes the VS Function Enable to toggle. Pipeline
2144 * flush can be executed by sending a PIPE_CONTROL command with CS
2145 * stall bit set and a post sync operation.
2146 *
2147 * We've already done such a flush at the start of state upload, so we
2148 * don't need to do another one here.
2149 */
2150 brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_VS), cvs) {
2151 if (stage_state->push_const_size != 0) {
2152 cvs.Buffer0Valid = true;
2153 cvs.ConstantBody.PointertoConstantBuffer0 = stage_state->push_const_offset;
2154 cvs.ConstantBody.ConstantBuffer0ReadLength = stage_state->push_const_size - 1;
2155 }
2156 }
2157 #endif
2158
2159 if (GEN_GEN == 7 && devinfo->is_ivybridge)
2160 gen7_emit_vs_workaround_flush(brw);
2161
2162 #if GEN_GEN >= 6
2163 brw_batch_emit(brw, GENX(3DSTATE_VS), vs) {
2164 #else
2165 ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
2166 brw_state_emit(brw, GENX(VS_STATE), 32, &stage_state->state_offset, vs) {
2167 #endif
2168 INIT_THREAD_DISPATCH_FIELDS(vs, Vertex);
2169
2170 vs.MaximumNumberofThreads = devinfo->max_vs_threads - 1;
2171
2172 #if GEN_GEN < 6
2173 vs.GRFRegisterCount = DIV_ROUND_UP(vue_prog_data->total_grf, 16) - 1;
2174 vs.ConstantURBEntryReadLength = stage_prog_data->curb_read_length;
2175 vs.ConstantURBEntryReadOffset = brw->curbe.vs_start * 2;
2176
2177 vs.NumberofURBEntries = brw->urb.nr_vs_entries >> (GEN_GEN == 5 ? 2 : 0);
2178 vs.URBEntryAllocationSize = brw->urb.vsize - 1;
2179
2180 vs.MaximumNumberofThreads =
2181 CLAMP(brw->urb.nr_vs_entries / 2, 1, devinfo->max_vs_threads) - 1;
2182
2183 vs.StatisticsEnable = false;
2184 vs.SamplerStatePointer =
2185 ro_bo(brw->batch.state.bo, stage_state->sampler_offset);
2186 #endif
2187
2188 #if GEN_GEN == 5
2189 /* Force single program flow on Ironlake. We cannot reliably get
2190 * all applications working without it. See:
2191 * https://bugs.freedesktop.org/show_bug.cgi?id=29172
2192 *
2193 * The most notable and reliably failing application is the Humus
2194 * demo "CelShading"
2195 */
2196 vs.SingleProgramFlow = true;
2197 vs.SamplerCount = 0; /* hardware requirement */
2198 #endif
2199
2200 #if GEN_GEN >= 8
2201 vs.SIMD8DispatchEnable =
2202 vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8;
2203
2204 vs.UserClipDistanceCullTestEnableBitmask =
2205 vue_prog_data->cull_distance_mask;
2206 #endif
2207 }
2208
2209 #if GEN_GEN == 6
2210 /* Based on my reading of the simulator, the VS constants don't get
2211 * pulled into the VS FF unit until an appropriate pipeline flush
2212 * happens, and instead the 3DSTATE_CONSTANT_VS packet just adds
2213 * references to them into a little FIFO. The flushes are common,
2214 * but don't reliably happen between this and a 3DPRIMITIVE, causing
2215 * the primitive to use the wrong constants. Then the FIFO
2216 * containing the constant setup gets added to again on the next
2217 * constants change, and eventually when a flush does happen the
2218 * unit is overwhelmed by constant changes and dies.
2219 *
2220 * To avoid this, send a PIPE_CONTROL down the line that will
2221 * update the unit immediately loading the constants. The flush
2222 * type bits here were those set by the STATE_BASE_ADDRESS whose
2223 * move in a82a43e8d99e1715dd11c9c091b5ab734079b6a6 triggered the
2224 * bug reports that led to this workaround, and may be more than
2225 * what is strictly required to avoid the issue.
2226 */
2227 brw_emit_pipe_control_flush(brw,
2228 PIPE_CONTROL_DEPTH_STALL |
2229 PIPE_CONTROL_INSTRUCTION_INVALIDATE |
2230 PIPE_CONTROL_STATE_CACHE_INVALIDATE);
2231 #endif
2232 }
2233
2234 static const struct brw_tracked_state genX(vs_state) = {
2235 .dirty = {
2236 .mesa = (GEN_GEN == 6 ? (_NEW_PROGRAM_CONSTANTS | _NEW_TRANSFORM) : 0),
2237 .brw = BRW_NEW_BATCH |
2238 BRW_NEW_BLORP |
2239 BRW_NEW_CONTEXT |
2240 BRW_NEW_VS_PROG_DATA |
2241 (GEN_GEN == 6 ? BRW_NEW_VERTEX_PROGRAM : 0) |
2242 (GEN_GEN <= 5 ? BRW_NEW_PUSH_CONSTANT_ALLOCATION |
2243 BRW_NEW_PROGRAM_CACHE |
2244 BRW_NEW_SAMPLER_STATE_TABLE |
2245 BRW_NEW_URB_FENCE
2246 : 0),
2247 },
2248 .emit = genX(upload_vs_state),
2249 };
2250
2251 /* ---------------------------------------------------------------------- */
2252
2253 static void
2254 genX(upload_cc_viewport)(struct brw_context *brw)
2255 {
2256 struct gl_context *ctx = &brw->ctx;
2257
2258 /* BRW_NEW_VIEWPORT_COUNT */
2259 const unsigned viewport_count = brw->clip.viewport_count;
2260
2261 struct GENX(CC_VIEWPORT) ccv;
2262 uint32_t cc_vp_offset;
2263 uint32_t *cc_map =
2264 brw_state_batch(brw, 4 * GENX(CC_VIEWPORT_length) * viewport_count,
2265 32, &cc_vp_offset);
2266
2267 for (unsigned i = 0; i < viewport_count; i++) {
2268 /* _NEW_VIEWPORT | _NEW_TRANSFORM */
2269 const struct gl_viewport_attrib *vp = &ctx->ViewportArray[i];
2270 if (ctx->Transform.DepthClampNear && ctx->Transform.DepthClampFar) {
2271 ccv.MinimumDepth = MIN2(vp->Near, vp->Far);
2272 ccv.MaximumDepth = MAX2(vp->Near, vp->Far);
2273 } else if (ctx->Transform.DepthClampNear) {
2274 ccv.MinimumDepth = MIN2(vp->Near, vp->Far);
2275 ccv.MaximumDepth = 1.0;
2276 } else if (ctx->Transform.DepthClampFar) {
2277 ccv.MinimumDepth = 0.0;
2278 ccv.MaximumDepth = MAX2(vp->Near, vp->Far);
2279 } else {
2280 ccv.MinimumDepth = 0.0;
2281 ccv.MaximumDepth = 1.0;
2282 }
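/* e.g. (values assumed) with glDepthRangef(0.2, 0.6) and only the near
 * clamp (AMD_depth_clamp_separate) enabled, fragment depth is limited
 * to [0.2, 1.0]. */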
2283 GENX(CC_VIEWPORT_pack)(NULL, cc_map, &ccv);
2284 cc_map += GENX(CC_VIEWPORT_length);
2285 }
2286
2287 #if GEN_GEN >= 7
2288 brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), ptr) {
2289 ptr.CCViewportPointer = cc_vp_offset;
2290 }
2291 #elif GEN_GEN == 6
2292 brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) {
2293 vp.CCViewportStateChange = 1;
2294 vp.PointertoCC_VIEWPORT = cc_vp_offset;
2295 }
2296 #else
2297 brw->cc.vp_offset = cc_vp_offset;
2298 ctx->NewDriverState |= BRW_NEW_CC_VP;
2299 #endif
2300 }
2301
2302 const struct brw_tracked_state genX(cc_vp) = {
2303 .dirty = {
2304 .mesa = _NEW_TRANSFORM |
2305 _NEW_VIEWPORT,
2306 .brw = BRW_NEW_BATCH |
2307 BRW_NEW_BLORP |
2308 BRW_NEW_VIEWPORT_COUNT,
2309 },
2310 .emit = genX(upload_cc_viewport)
2311 };
2312
2313 /* ---------------------------------------------------------------------- */
2314
2315 static void
2316 set_scissor_bits(const struct gl_context *ctx, int i,
2317 bool flip_y, unsigned fb_width, unsigned fb_height,
2318 struct GENX(SCISSOR_RECT) *sc)
2319 {
2320 int bbox[4];
2321
2322 bbox[0] = MAX2(ctx->ViewportArray[i].X, 0);
2323 bbox[1] = MIN2(bbox[0] + ctx->ViewportArray[i].Width, fb_width);
2324 bbox[2] = CLAMP(ctx->ViewportArray[i].Y, 0, fb_height);
2325 bbox[3] = MIN2(bbox[2] + ctx->ViewportArray[i].Height, fb_height);
2326 _mesa_intersect_scissor_bounding_box(ctx, i, bbox);
2327
2328 if (bbox[0] == bbox[1] || bbox[2] == bbox[3]) {
2329 /* If the scissor was out of bounds and got clamped to 0 width/height
2330 * at the bounds, the subtraction of 1 from maximums could produce a
2331 * negative number and thus not clip anything. Instead, just provide
2332 * a min > max scissor inside the bounds, which produces the expected
2333 * result of rendering nothing.
2334 */
2335 sc->ScissorRectangleXMin = 1;
2336 sc->ScissorRectangleXMax = 0;
2337 sc->ScissorRectangleYMin = 1;
2338 sc->ScissorRectangleYMax = 0;
2339 } else if (!flip_y) {
2340 /* texmemory: Y=0=bottom */
2341 sc->ScissorRectangleXMin = bbox[0];
2342 sc->ScissorRectangleXMax = bbox[1] - 1;
2343 sc->ScissorRectangleYMin = bbox[2];
2344 sc->ScissorRectangleYMax = bbox[3] - 1;
2345 } else {
2346 /* memory: Y=0=top */
2347 sc->ScissorRectangleXMin = bbox[0];
2348 sc->ScissorRectangleXMax = bbox[1] - 1;
2349 sc->ScissorRectangleYMin = fb_height - bbox[3];
2350 sc->ScissorRectangleYMax = fb_height - bbox[2] - 1;
2351 }
2352 }
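
/* A self-checking sketch (not part of the original file; all values are
 * assumed) of the conventions handled above: Mesa's bbox is min-inclusive
 * and max-exclusive, the hardware rectangle is inclusive at both ends,
 * and flip_y measures Y from the top of the buffer.
 */
MAYBE_UNUSED static void
example_scissor_conventions(void)
{
   const int fb_height = 100;
   const int bbox[4] = { 10, 20, 30, 40 };   /* x:[10,20), y:[30,40) */

   /* flip_y = false: Y=0 at the bottom; just make the maxes inclusive. */
   assert(bbox[1] - 1 == 19 && bbox[3] - 1 == 39);

   /* flip_y = true: Y=0 at the top, so the Y range is mirrored. */
   assert(fb_height - bbox[3] == 60 && fb_height - bbox[2] - 1 == 69);
}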
2353
2354 #if GEN_GEN >= 6
2355 static void
2356 genX(upload_scissor_state)(struct brw_context *brw)
2357 {
2358 struct gl_context *ctx = &brw->ctx;
2359 const bool flip_y = ctx->DrawBuffer->FlipY;
2360 struct GENX(SCISSOR_RECT) scissor;
2361 uint32_t scissor_state_offset;
2362 const unsigned int fb_width = _mesa_geometric_width(ctx->DrawBuffer);
2363 const unsigned int fb_height = _mesa_geometric_height(ctx->DrawBuffer);
2364 uint32_t *scissor_map;
2365
2366 /* BRW_NEW_VIEWPORT_COUNT */
2367 const unsigned viewport_count = brw->clip.viewport_count;
2368
2369 scissor_map = brw_state_batch(
2370 brw, GENX(SCISSOR_RECT_length) * sizeof(uint32_t) * viewport_count,
2371 32, &scissor_state_offset);
2372
2373 /* _NEW_SCISSOR | _NEW_BUFFERS | _NEW_VIEWPORT */
2374
2375 /* The scissor only needs to handle the intersection of drawable and
2376 * scissor rect. Clipping to the boundaries of static shared buffers
2377 * for front/back/depth is covered by looping over cliprects in brw_draw.c.
2378 *
2379 * Note that the hardware's coordinates are inclusive, while Mesa's min is
2380 * inclusive but max is exclusive.
2381 */
2382 for (unsigned i = 0; i < viewport_count; i++) {
2383 set_scissor_bits(ctx, i, flip_y, fb_width, fb_height, &scissor);
2384 GENX(SCISSOR_RECT_pack)(
2385 NULL, scissor_map + i * GENX(SCISSOR_RECT_length), &scissor);
2386 }
2387
2388 brw_batch_emit(brw, GENX(3DSTATE_SCISSOR_STATE_POINTERS), ptr) {
2389 ptr.ScissorRectPointer = scissor_state_offset;
2390 }
2391 }
2392
2393 static const struct brw_tracked_state genX(scissor_state) = {
2394 .dirty = {
2395 .mesa = _NEW_BUFFERS |
2396 _NEW_SCISSOR |
2397 _NEW_VIEWPORT,
2398 .brw = BRW_NEW_BATCH |
2399 BRW_NEW_BLORP |
2400 BRW_NEW_VIEWPORT_COUNT,
2401 },
2402 .emit = genX(upload_scissor_state),
2403 };
2404 #endif
2405
2406 /* ---------------------------------------------------------------------- */
2407
2408 static void
2409 brw_calculate_guardband_size(uint32_t fb_width, uint32_t fb_height,
2410 float m00, float m11, float m30, float m31,
2411 float *xmin, float *xmax,
2412 float *ymin, float *ymax)
2413 {
2414 /* According to the "Vertex X,Y Clamping and Quantization" section of the
2415 * Strips and Fans documentation:
2416 *
2417 * "The vertex X and Y screen-space coordinates are also /clamped/ to the
2418 * fixed-point "guardband" range supported by the rasterization hardware"
2419 *
2420 * and
2421 *
2422 * "In almost all circumstances, if an object’s vertices are actually
2423 * modified by this clamping (i.e., had X or Y coordinates outside of
2424 * the guardband extent) the rendered object will not match the intended
2425 * result. Therefore software should take steps to ensure that this does
2426 * not happen - e.g., by clipping objects such that they do not exceed
2427 * these limits after the Drawing Rectangle is applied."
2428 *
2429 * I believe the fundamental restriction is that the rasterizer (in
2430 * the SF/WM stages) have a limit on the number of pixels that can be
2431 * rasterized. We need to ensure any coordinates beyond the rasterizer
2432 * limit are handled by the clipper. So effectively that limit becomes
2433 * the clipper's guardband size.
2434 *
2435 * It goes on to say:
2436 *
2437 * "In addition, in order to be correctly rendered, objects must have a
2438 * screenspace bounding box not exceeding 8K in the X or Y direction.
2439 * This additional restriction must also be comprehended by software,
2440 * i.e., enforced by use of clipping."
2441 *
2442 * This makes no sense. Gen7+ hardware supports 16K render targets,
2443 * and you definitely need to be able to draw polygons that fill the
2444 * surface. Our assumption is that the rasterizer was limited to 8K
2445 * on Sandybridge, which only supports 8K surfaces, and it was actually
2446 * increased to 16K on Ivybridge and later.
2447 *
2448 * So, limit the guardband to 16K on Gen7+ and 8K on Sandybridge.
2449 */
2450 const float gb_size = GEN_GEN >= 7 ? 16384.0f : 8192.0f;
2451
2452 /* Workaround: prevent gpu hangs on SandyBridge
2453 * by disabling guardband clipping for odd dimensions.
2454 */
2455 if (GEN_GEN == 6 && (fb_width & 1 || fb_height & 1)) {
2456 *xmin = -1.0f;
2457 *xmax = 1.0f;
2458 *ymin = -1.0f;
2459 *ymax = 1.0f;
2460 return;
2461 }
2462
2463 if (m00 != 0 && m11 != 0) {
2464 /* First, we compute the screen-space render area */
2465 const float ss_ra_xmin = MIN3( 0, m30 + m00, m30 - m00);
2466 const float ss_ra_xmax = MAX3( fb_width, m30 + m00, m30 - m00);
2467 const float ss_ra_ymin = MIN3( 0, m31 + m11, m31 - m11);
2468 const float ss_ra_ymax = MAX3(fb_height, m31 + m11, m31 - m11);
2469
2470 /* We want the guardband to be centered on that */
2471 const float ss_gb_xmin = (ss_ra_xmin + ss_ra_xmax) / 2 - gb_size;
2472 const float ss_gb_xmax = (ss_ra_xmin + ss_ra_xmax) / 2 + gb_size;
2473 const float ss_gb_ymin = (ss_ra_ymin + ss_ra_ymax) / 2 - gb_size;
2474 const float ss_gb_ymax = (ss_ra_ymin + ss_ra_ymax) / 2 + gb_size;
2475
2476 /* Now we need it in native device coordinates */
2477 const float ndc_gb_xmin = (ss_gb_xmin - m30) / m00;
2478 const float ndc_gb_xmax = (ss_gb_xmax - m30) / m00;
2479 const float ndc_gb_ymin = (ss_gb_ymin - m31) / m11;
2480 const float ndc_gb_ymax = (ss_gb_ymax - m31) / m11;
2481
2482 /* Thanks to Y-flipping and ORIGIN_UPPER_LEFT, the Y coordinates may be
2483 * flipped upside-down. X should be fine though.
2484 */
2485 assert(ndc_gb_xmin <= ndc_gb_xmax);
2486 *xmin = ndc_gb_xmin;
2487 *xmax = ndc_gb_xmax;
2488 *ymin = MIN2(ndc_gb_ymin, ndc_gb_ymax);
2489 *ymax = MAX2(ndc_gb_ymin, ndc_gb_ymax);
2490 } else {
2491 /* The viewport scales to 0, so nothing will be rendered. */
2492 *xmin = 0.0f;
2493 *xmax = 0.0f;
2494 *ymin = 0.0f;
2495 *ymax = 0.0f;
2496 }
2497 }
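
/* Worked example (values assumed): for a 1920x1080 framebuffer with a
 * full-screen viewport, m00 = 960 and m30 = 960, so the screen-space
 * render area is [0, 1920] in X with center 960. On Gen7+ (gb_size =
 * 16384) the guardband spans [960 - 16384, 960 + 16384], which maps back
 * through (x - m30) / m00 to roughly [-17.07, +17.07] in NDC; geometry
 * may extend about 17 half-viewport widths from the center before the
 * clipper has to take over. */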
2498
2499 static void
2500 genX(upload_sf_clip_viewport)(struct brw_context *brw)
2501 {
2502 struct gl_context *ctx = &brw->ctx;
2503 float y_scale, y_bias;
2504
2505 /* BRW_NEW_VIEWPORT_COUNT */
2506 const unsigned viewport_count = brw->clip.viewport_count;
2507
2508 /* _NEW_BUFFERS */
2509 const bool flip_y = ctx->DrawBuffer->FlipY;
2510 const uint32_t fb_width = _mesa_geometric_width(ctx->DrawBuffer);
2511 const uint32_t fb_height = _mesa_geometric_height(ctx->DrawBuffer);
2512
2513 #if GEN_GEN >= 7
2514 #define clv sfv
2515 struct GENX(SF_CLIP_VIEWPORT) sfv;
2516 uint32_t sf_clip_vp_offset;
2517 uint32_t *sf_clip_map =
2518 brw_state_batch(brw, GENX(SF_CLIP_VIEWPORT_length) * 4 * viewport_count,
2519 64, &sf_clip_vp_offset);
2520 #else
2521 struct GENX(SF_VIEWPORT) sfv;
2522 struct GENX(CLIP_VIEWPORT) clv;
2523 uint32_t sf_vp_offset, clip_vp_offset;
2524 uint32_t *sf_map =
2525 brw_state_batch(brw, GENX(SF_VIEWPORT_length) * 4 * viewport_count,
2526 32, &sf_vp_offset);
2527 uint32_t *clip_map =
2528 brw_state_batch(brw, GENX(CLIP_VIEWPORT_length) * 4 * viewport_count,
2529 32, &clip_vp_offset);
2530 #endif
2531
2532 /* _NEW_BUFFERS */
2533 if (flip_y) {
2534 y_scale = -1.0;
2535 y_bias = (float)fb_height;
2536 } else {
2537 y_scale = 1.0;
2538 y_bias = 0;
2539 }
2540
2541 for (unsigned i = 0; i < viewport_count; i++) {
2542 /* _NEW_VIEWPORT: Guardband Clipping */
2543 float scale[3], translate[3], gb_xmin, gb_xmax, gb_ymin, gb_ymax;
2544 _mesa_get_viewport_xform(ctx, i, scale, translate);
2545
2546 sfv.ViewportMatrixElementm00 = scale[0];
2547 sfv.ViewportMatrixElementm11 = scale[1] * y_scale;
2548 sfv.ViewportMatrixElementm22 = scale[2];
2549 sfv.ViewportMatrixElementm30 = translate[0];
2550 sfv.ViewportMatrixElementm31 = translate[1] * y_scale + y_bias;
2551 sfv.ViewportMatrixElementm32 = translate[2];
2552 brw_calculate_guardband_size(fb_width, fb_height,
2553 sfv.ViewportMatrixElementm00,
2554 sfv.ViewportMatrixElementm11,
2555 sfv.ViewportMatrixElementm30,
2556 sfv.ViewportMatrixElementm31,
2557 &gb_xmin, &gb_xmax, &gb_ymin, &gb_ymax);
2558
2559
2560 clv.XMinClipGuardband = gb_xmin;
2561 clv.XMaxClipGuardband = gb_xmax;
2562 clv.YMinClipGuardband = gb_ymin;
2563 clv.YMaxClipGuardband = gb_ymax;
2564
2565 #if GEN_GEN < 6
2566 set_scissor_bits(ctx, i, flip_y, fb_width, fb_height,
2567 &sfv.ScissorRectangle);
2568 #elif GEN_GEN >= 8
2569 /* _NEW_VIEWPORT | _NEW_BUFFERS: Screen Space Viewport
2570 * The hardware will take the intersection of the drawing rectangle,
2571 * scissor rectangle, and the viewport extents. However, emitting
2572 * 3DSTATE_DRAWING_RECTANGLE is expensive since it requires a full
2573 * pipeline stall so we're better off just being a little more clever
2574 * with our viewport so we can emit it once at context creation time.
2575 */
2576 const float viewport_Xmin = MAX2(ctx->ViewportArray[i].X, 0);
2577 const float viewport_Ymin = MAX2(ctx->ViewportArray[i].Y, 0);
2578 const float viewport_Xmax =
2579 MIN2(ctx->ViewportArray[i].X + ctx->ViewportArray[i].Width, fb_width);
2580 const float viewport_Ymax =
2581 MIN2(ctx->ViewportArray[i].Y + ctx->ViewportArray[i].Height, fb_height);
2582
2583 if (flip_y) {
2584 sfv.XMinViewPort = viewport_Xmin;
2585 sfv.XMaxViewPort = viewport_Xmax - 1;
2586 sfv.YMinViewPort = fb_height - viewport_Ymax;
2587 sfv.YMaxViewPort = fb_height - viewport_Ymin - 1;
2588 } else {
2589 sfv.XMinViewPort = viewport_Xmin;
2590 sfv.XMaxViewPort = viewport_Xmax - 1;
2591 sfv.YMinViewPort = viewport_Ymin;
2592 sfv.YMaxViewPort = viewport_Ymax - 1;
2593 }
2594 #endif
2595
2596 #if GEN_GEN >= 7
2597 GENX(SF_CLIP_VIEWPORT_pack)(NULL, sf_clip_map, &sfv);
2598 sf_clip_map += GENX(SF_CLIP_VIEWPORT_length);
2599 #else
2600 GENX(SF_VIEWPORT_pack)(NULL, sf_map, &sfv);
2601 GENX(CLIP_VIEWPORT_pack)(NULL, clip_map, &clv);
2602 sf_map += GENX(SF_VIEWPORT_length);
2603 clip_map += GENX(CLIP_VIEWPORT_length);
2604 #endif
2605 }
2606
2607 #if GEN_GEN >= 7
2608 brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), ptr) {
2609 ptr.SFClipViewportPointer = sf_clip_vp_offset;
2610 }
2611 #elif GEN_GEN == 6
2612 brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) {
2613 vp.SFViewportStateChange = 1;
2614 vp.CLIPViewportStateChange = 1;
2615 vp.PointertoCLIP_VIEWPORT = clip_vp_offset;
2616 vp.PointertoSF_VIEWPORT = sf_vp_offset;
2617 }
2618 #else
2619 brw->sf.vp_offset = sf_vp_offset;
2620 brw->clip.vp_offset = clip_vp_offset;
2621 brw->ctx.NewDriverState |= BRW_NEW_SF_VP | BRW_NEW_CLIP_VP;
2622 #endif
2623 }
2624
2625 static const struct brw_tracked_state genX(sf_clip_viewport) = {
2626 .dirty = {
2627 .mesa = _NEW_BUFFERS |
2628 _NEW_VIEWPORT |
2629 (GEN_GEN <= 5 ? _NEW_SCISSOR : 0),
2630 .brw = BRW_NEW_BATCH |
2631 BRW_NEW_BLORP |
2632 BRW_NEW_VIEWPORT_COUNT,
2633 },
2634 .emit = genX(upload_sf_clip_viewport),
2635 };
2636
2637 /* ---------------------------------------------------------------------- */
2638
2639 static void
2640 genX(upload_gs_state)(struct brw_context *brw)
2641 {
2642 UNUSED struct gl_context *ctx = &brw->ctx;
2643 UNUSED const struct gen_device_info *devinfo = &brw->screen->devinfo;
2644 const struct brw_stage_state *stage_state = &brw->gs.base;
2645 const struct gl_program *gs_prog = brw->programs[MESA_SHADER_GEOMETRY];
2646 /* BRW_NEW_GEOMETRY_PROGRAM */
2647 bool active = GEN_GEN >= 6 && gs_prog;
2648
2649 /* BRW_NEW_GS_PROG_DATA */
2650 struct brw_stage_prog_data *stage_prog_data = stage_state->prog_data;
2651 UNUSED const struct brw_vue_prog_data *vue_prog_data =
2652 brw_vue_prog_data(stage_prog_data);
2653 #if GEN_GEN >= 7
2654 const struct brw_gs_prog_data *gs_prog_data =
2655 brw_gs_prog_data(stage_prog_data);
2656 #endif
2657
2658 #if GEN_GEN == 6
2659 brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_GS), cgs) {
2660 if (active && stage_state->push_const_size != 0) {
2661 cgs.Buffer0Valid = true;
2662 cgs.ConstantBody.PointertoConstantBuffer0 = stage_state->push_const_offset;
2663 cgs.ConstantBody.ConstantBuffer0ReadLength = stage_state->push_const_size - 1;
2664 }
2665 }
2666 #endif
2667
2668 #if GEN_GEN == 7 && !GEN_IS_HASWELL
2669 /**
2670 * From Graphics BSpec: 3D-Media-GPGPU Engine > 3D Pipeline Stages >
2671 * Geometry > Geometry Shader > State:
2672 *
2673 * "Note: Because of corruption in IVB:GT2, software needs to flush the
2674 * whole fixed function pipeline when the GS enable changes value in
2675 * the 3DSTATE_GS."
2676 *
2677 * The hardware architects have clarified that in this context "flush the
2678 * whole fixed function pipeline" means to emit a PIPE_CONTROL with the "CS
2679 * Stall" bit set.
2680 */
2681 if (devinfo->gt == 2 && brw->gs.enabled != active)
2682 gen7_emit_cs_stall_flush(brw);
2683 #endif
2684
2685 #if GEN_GEN >= 6
2686 brw_batch_emit(brw, GENX(3DSTATE_GS), gs) {
2687 #else
2688 ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
2689 brw_state_emit(brw, GENX(GS_STATE), 32, &brw->ff_gs.state_offset, gs) {
2690 #endif
2691
2692 #if GEN_GEN >= 6
2693 if (active) {
2694 INIT_THREAD_DISPATCH_FIELDS(gs, Vertex);
2695
2696 #if GEN_GEN >= 7
2697 gs.OutputVertexSize = gs_prog_data->output_vertex_size_hwords * 2 - 1;
2698 gs.OutputTopology = gs_prog_data->output_topology;
2699 gs.ControlDataHeaderSize =
2700 gs_prog_data->control_data_header_size_hwords;
2701
2702 gs.InstanceControl = gs_prog_data->invocations - 1;
2703 gs.DispatchMode = vue_prog_data->dispatch_mode;
2704
2705 gs.IncludePrimitiveID = gs_prog_data->include_primitive_id;
2706
2707 gs.ControlDataFormat = gs_prog_data->control_data_format;
2708 #endif
2709
2710 /* Note: the meaning of the GEN7_GS_REORDER_TRAILING bit changes between
2711 * Ivy Bridge and Haswell.
2712 *
2713 * On Ivy Bridge, setting this bit causes the vertices of a triangle
2714 * strip to be delivered to the geometry shader in an order that does
2715 * not strictly follow the OpenGL spec, but preserves triangle
2716 * orientation. For example, if the vertices are (1, 2, 3, 4, 5), then
2717 * the geometry shader sees triangles:
2718 *
2719 * (1, 2, 3), (2, 4, 3), (3, 4, 5)
2720 *
2721 * (Clearing the bit is even worse, because it fails to preserve
2722 * orientation).
2723 *
2724 * Triangle strips with adjacency are always ordered in a way that preserves
2725 * triangle orientation but does not strictly follow the OpenGL spec,
2726 * regardless of the setting of this bit.
2727 *
2728 * On Haswell, both triangle strips and triangle strips with adjacency
2729 * are always ordered in a way that preserves triangle orientation.
2730 * Setting this bit causes the ordering to strictly follow the OpenGL
2731 * spec.
2732 *
2733 * So in either case we want to set the bit. Unfortunately on Ivy
2734 * Bridge this will get the order close to correct but not perfect.
2735 */
2736 gs.ReorderMode = TRAILING;
2737 gs.MaximumNumberofThreads =
2738 GEN_GEN == 8 ? (devinfo->max_gs_threads / 2 - 1)
2739 : (devinfo->max_gs_threads - 1);
2740
2741 #if GEN_GEN < 7
2742 gs.SOStatisticsEnable = true;
2743 if (gs_prog->info.has_transform_feedback_varyings)
2744 gs.SVBIPayloadEnable = _mesa_is_xfb_active_and_unpaused(ctx);
2745
2746 /* GEN6_GS_SPF_MODE and GEN6_GS_VECTOR_MASK_ENABLE are enabled, as
2747 * was previously done for gen6.
2748 *
2749 * TODO: test with both disabled to see if the HW is behaving
2750 * as expected, like in gen7.
2751 */
2752 gs.SingleProgramFlow = true;
2753 gs.VectorMaskEnable = true;
2754 #endif
2755
2756 #if GEN_GEN >= 8
2757 gs.ExpectedVertexCount = gs_prog_data->vertices_in;
2758
2759 if (gs_prog_data->static_vertex_count != -1) {
2760 gs.StaticOutput = true;
2761 gs.StaticOutputVertexCount = gs_prog_data->static_vertex_count;
2762 }
2763 gs.IncludeVertexHandles = vue_prog_data->include_vue_handles;
2764
2765 gs.UserClipDistanceCullTestEnableBitmask =
2766 vue_prog_data->cull_distance_mask;
2767
2768 const int urb_entry_write_offset = 1;
2769 const uint32_t urb_entry_output_length =
2770 DIV_ROUND_UP(vue_prog_data->vue_map.num_slots, 2) -
2771 urb_entry_write_offset;
2772
2773 gs.VertexURBEntryOutputReadOffset = urb_entry_write_offset;
2774 gs.VertexURBEntryOutputLength = MAX2(urb_entry_output_length, 1);
2775 #endif
2776 }
2777 #endif
2778
2779 #if GEN_GEN <= 6
2780 if (!active && brw->ff_gs.prog_active) {
2781 /* In gen6, transform feedback for the VS stage is done with an
2782 * ad-hoc GS program. This function provides the needed 3DSTATE_GS
2783 * for this.
2784 */
2785 gs.KernelStartPointer = KSP(brw, brw->ff_gs.prog_offset);
2786 gs.SingleProgramFlow = true;
2787 gs.DispatchGRFStartRegisterForURBData = GEN_GEN == 6 ? 2 : 1;
2788 gs.VertexURBEntryReadLength = brw->ff_gs.prog_data->urb_read_length;
2789
2790 #if GEN_GEN <= 5
2791 gs.GRFRegisterCount =
2792 DIV_ROUND_UP(brw->ff_gs.prog_data->total_grf, 16) - 1;
2793 /* BRW_NEW_URB_FENCE */
2794 gs.NumberofURBEntries = brw->urb.nr_gs_entries;
2795 gs.URBEntryAllocationSize = brw->urb.vsize - 1;
2796 gs.MaximumNumberofThreads = brw->urb.nr_gs_entries >= 8 ? 1 : 0;
2797 gs.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
2798 #else
2799 gs.Enable = true;
2800 gs.VectorMaskEnable = true;
2801 gs.SVBIPayloadEnable = true;
2802 gs.SVBIPostIncrementEnable = true;
2803 gs.SVBIPostIncrementValue =
2804 brw->ff_gs.prog_data->svbi_postincrement_value;
2805 gs.SOStatisticsEnable = true;
2806 gs.MaximumNumberofThreads = devinfo->max_gs_threads - 1;
2807 #endif
2808 }
2809 #endif
2810 if (!active && !brw->ff_gs.prog_active) {
2811 #if GEN_GEN < 8
2812 gs.DispatchGRFStartRegisterForURBData = 1;
2813 #if GEN_GEN >= 7
2814 gs.IncludeVertexHandles = true;
2815 #endif
2816 #endif
2817 }
2818
2819 #if GEN_GEN >= 6
2820 gs.StatisticsEnable = true;
2821 #endif
2822 #if GEN_GEN == 5 || GEN_GEN == 6
2823 gs.RenderingEnabled = true;
2824 #endif
2825 #if GEN_GEN <= 5
2826 gs.MaximumVPIndex = brw->clip.viewport_count - 1;
2827 #endif
2828 }
2829
2830 #if GEN_GEN == 6
2831 brw->gs.enabled = active;
2832 #endif
2833 }
2834
2835 static const struct brw_tracked_state genX(gs_state) = {
2836 .dirty = {
2837 .mesa = (GEN_GEN == 6 ? _NEW_PROGRAM_CONSTANTS : 0),
2838 .brw = BRW_NEW_BATCH |
2839 BRW_NEW_BLORP |
2840 (GEN_GEN <= 5 ? BRW_NEW_PUSH_CONSTANT_ALLOCATION |
2841 BRW_NEW_PROGRAM_CACHE |
2842 BRW_NEW_URB_FENCE |
2843 BRW_NEW_VIEWPORT_COUNT
2844 : 0) |
2845 (GEN_GEN >= 6 ? BRW_NEW_CONTEXT |
2846 BRW_NEW_GEOMETRY_PROGRAM |
2847 BRW_NEW_GS_PROG_DATA
2848 : 0) |
2849 (GEN_GEN < 7 ? BRW_NEW_FF_GS_PROG_DATA : 0),
2850 },
2851 .emit = genX(upload_gs_state),
2852 };
2853
2854 /* ---------------------------------------------------------------------- */
2855
2856 UNUSED static GLenum
2857 fix_dual_blend_alpha_to_one(GLenum function)
2858 {
2859 switch (function) {
2860 case GL_SRC1_ALPHA:
2861 return GL_ONE;
2862
2863 case GL_ONE_MINUS_SRC1_ALPHA:
2864 return GL_ZERO;
2865 }
2866
2867 return function;
2868 }
2869
2870 #define blend_factor(x) brw_translate_blend_factor(x)
2871 #define blend_eqn(x) brw_translate_blend_equation(x)
2872
2873 /**
2874 * Modify blend function to force destination alpha to 1.0
2875 *
2876 * If \c function specifies a blend function that uses destination alpha,
2877 * replace it with a function that hard-wires destination alpha to 1.0. This
2878 * is used when rendering to xRGB targets.
2879 */
2880 static GLenum
2881 brw_fix_xRGB_alpha(GLenum function)
2882 {
2883 switch (function) {
2884 case GL_DST_ALPHA:
2885 return GL_ONE;
2886
2887 case GL_ONE_MINUS_DST_ALPHA:
2888 case GL_SRC_ALPHA_SATURATE:
2889 return GL_ZERO;
2890 }
2891
2892 return function;
2893 }
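
/* A small self-checking sketch (not part of the original file) of the
 * rewrite above: an xRGB destination behaves as if alpha were 1.0, so
 * blend factors that read destination alpha collapse to constants.
 */
MAYBE_UNUSED static void
example_fix_xRGB_alpha(void)
{
   assert(brw_fix_xRGB_alpha(GL_DST_ALPHA) == GL_ONE);            /* 1.0 */
   assert(brw_fix_xRGB_alpha(GL_ONE_MINUS_DST_ALPHA) == GL_ZERO); /* 1.0 - 1.0 */
   assert(brw_fix_xRGB_alpha(GL_SRC_ALPHA) == GL_SRC_ALPHA);      /* untouched */
}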
2894
2895 #if GEN_GEN >= 6
2896 typedef struct GENX(BLEND_STATE_ENTRY) BLEND_ENTRY_GENXML;
2897 #else
2898 typedef struct GENX(COLOR_CALC_STATE) BLEND_ENTRY_GENXML;
2899 #endif
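
/* Pre-Gen6 there is no separate BLEND_STATE: the blend controls live
 * directly in COLOR_CALC_STATE, so aliasing the two types lets
 * set_blend_entry_bits() below fill either layout. */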
2900
2901 UNUSED static bool
2902 set_blend_entry_bits(struct brw_context *brw, BLEND_ENTRY_GENXML *entry, int i,
2903 bool alpha_to_one)
2904 {
2905 struct gl_context *ctx = &brw->ctx;
2906
2907 /* _NEW_BUFFERS */
2908 const struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];
2909
2910 bool independent_alpha_blend = false;
2911
2912 /* Used for implementing the following bit of GL_EXT_texture_integer:
2913 * "Per-fragment operations that require floating-point color
2914 * components, including multisample alpha operations, alpha test,
2915 * blending, and dithering, have no effect when the corresponding
2916 * colors are written to an integer color buffer."
2917 */
2918 const bool integer = ctx->DrawBuffer->_IntegerBuffers & (0x1 << i);
2919
2920 const unsigned blend_enabled = GEN_GEN >= 6 ?
2921 ctx->Color.BlendEnabled & (1 << i) : ctx->Color.BlendEnabled;
2922
2923 /* _NEW_COLOR */
2924 if (ctx->Color.ColorLogicOpEnabled) {
2925 GLenum rb_type = rb ? _mesa_get_format_datatype(rb->Format)
2926 : GL_UNSIGNED_NORMALIZED;
2927 WARN_ONCE(ctx->Color.LogicOp != GL_COPY &&
2928 rb_type != GL_UNSIGNED_NORMALIZED &&
2929 rb_type != GL_FLOAT, "Ignoring %s logic op on %s "
2930 "renderbuffer\n",
2931 _mesa_enum_to_string(ctx->Color.LogicOp),
2932 _mesa_enum_to_string(rb_type));
2933 if (GEN_GEN >= 8 || rb_type == GL_UNSIGNED_NORMALIZED) {
2934 entry->LogicOpEnable = true;
2935 entry->LogicOpFunction = ctx->Color._LogicOp;
2936 }
2937 } else if (blend_enabled && !ctx->Color._AdvancedBlendMode
2938 && (GEN_GEN <= 5 || !integer)) {
2939 GLenum eqRGB = ctx->Color.Blend[i].EquationRGB;
2940 GLenum eqA = ctx->Color.Blend[i].EquationA;
2941 GLenum srcRGB = ctx->Color.Blend[i].SrcRGB;
2942 GLenum dstRGB = ctx->Color.Blend[i].DstRGB;
2943 GLenum srcA = ctx->Color.Blend[i].SrcA;
2944 GLenum dstA = ctx->Color.Blend[i].DstA;
2945
2946 if (eqRGB == GL_MIN || eqRGB == GL_MAX)
2947 srcRGB = dstRGB = GL_ONE;
2948
2949 if (eqA == GL_MIN || eqA == GL_MAX)
2950 srcA = dstA = GL_ONE;
2951
2952 /* Due to hardware limitations, the destination may have information
2953 * in an alpha channel even when the format specifies no alpha
2954 * channel. In order to avoid getting any incorrect blending due to
2955 * that alpha channel, coerce the blend factors to values that will
2956 * not read the alpha channel, but will instead use the correct
2957 * implicit value for alpha.
2958 */
2959 if (rb && !_mesa_base_format_has_channel(rb->_BaseFormat,
2960 GL_TEXTURE_ALPHA_TYPE)) {
2961 srcRGB = brw_fix_xRGB_alpha(srcRGB);
2962 srcA = brw_fix_xRGB_alpha(srcA);
2963 dstRGB = brw_fix_xRGB_alpha(dstRGB);
2964 dstA = brw_fix_xRGB_alpha(dstA);
2965 }
2966
2967 /* From the BLEND_STATE docs, DWord 0, Bit 29 (AlphaToOne Enable):
2968 * "If Dual Source Blending is enabled, this bit must be disabled."
2969 *
2970 * We override SRC1_ALPHA to ONE and ONE_MINUS_SRC1_ALPHA to ZERO,
2971 * and leave it enabled anyway.
2972 */
2973 if (GEN_GEN >= 6 && ctx->Color.Blend[i]._UsesDualSrc && alpha_to_one) {
2974 srcRGB = fix_dual_blend_alpha_to_one(srcRGB);
2975 srcA = fix_dual_blend_alpha_to_one(srcA);
2976 dstRGB = fix_dual_blend_alpha_to_one(dstRGB);
2977 dstA = fix_dual_blend_alpha_to_one(dstA);
2978 }
2979
2980 /* BRW_NEW_FS_PROG_DATA */
2981 const struct brw_wm_prog_data *wm_prog_data =
2982 brw_wm_prog_data(brw->wm.base.prog_data);
2983
2984 /* The Dual Source Blending documentation says:
2985 *
2986 * "If SRC1 is included in a src/dst blend factor and
2987 * a DualSource RT Write message is not used, results
2988 * are UNDEFINED. (This reflects the same restriction in DX APIs,
2989 * where undefined results are produced if “o1” is not written
2990 * by a PS – there are no default values defined).
2991 * If SRC1 is not included in a src/dst blend factor,
2992 * dual source blending must be disabled."
2993 *
2994 * There is no way to gracefully fix this undefined situation
2995 * so we just disable the blending to prevent possible issues.
2996 */
2997 entry->ColorBufferBlendEnable =
2998 !ctx->Color.Blend[0]._UsesDualSrc || wm_prog_data->dual_src_blend;
2999
3000 entry->DestinationBlendFactor = blend_factor(dstRGB);
3001 entry->SourceBlendFactor = blend_factor(srcRGB);
3002 entry->DestinationAlphaBlendFactor = blend_factor(dstA);
3003 entry->SourceAlphaBlendFactor = blend_factor(srcA);
3004 entry->ColorBlendFunction = blend_eqn(eqRGB);
3005 entry->AlphaBlendFunction = blend_eqn(eqA);
3006
3007 if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB)
3008 independent_alpha_blend = true;
3009 }
3010
3011 return independent_alpha_blend;
3012 }
3013
3014 #if GEN_GEN >= 6
3015 static void
3016 genX(upload_blend_state)(struct brw_context *brw)
3017 {
3018 struct gl_context *ctx = &brw->ctx;
3019 int size;
3020
3021 /* We need at least one BLEND_STATE written, because we might do
3022 * thread dispatch even if _NumColorDrawBuffers is 0 (for example
3023 * for computed depth or alpha test), which will do an FB write
3024 * with render target 0, which will reference BLEND_STATE[0] for
3025 * alpha test enable.
3026 */
3027 int nr_draw_buffers = ctx->DrawBuffer->_NumColorDrawBuffers;
3028 if (nr_draw_buffers == 0 && ctx->Color.AlphaEnabled)
3029 nr_draw_buffers = 1;
3030
3031 size = GENX(BLEND_STATE_ENTRY_length) * 4 * nr_draw_buffers;
3032 #if GEN_GEN >= 8
3033 size += GENX(BLEND_STATE_length) * 4;
3034 #endif
3035
3036 uint32_t *blend_map;
3037 blend_map = brw_state_batch(brw, size, 64, &brw->cc.blend_state_offset);
3038
3039 #if GEN_GEN >= 8
3040 struct GENX(BLEND_STATE) blend = { 0 };
3041 {
3042 #else
3043 for (int i = 0; i < nr_draw_buffers; i++) {
3044 struct GENX(BLEND_STATE_ENTRY) entry = { 0 };
3045 #define blend entry
3046 #endif
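/* On Gen8+ the "global" bits set below live once in a BLEND_STATE header
 * that precedes the per-RT entries, while on Gen6-7 every
 * BLEND_STATE_ENTRY carries its own copy; the `blend' alias lets the
 * shared code write whichever struct is in scope. */
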
3047 /* OpenGL specification 3.3 (page 196), section 4.1.3 says:
3048 * "If drawbuffer zero is not NONE and the buffer it references has an
3049 * integer format, the SAMPLE_ALPHA_TO_COVERAGE and SAMPLE_ALPHA_TO_ONE
3050 * operations are skipped."
3051 */
3052 if (!(ctx->DrawBuffer->_IntegerBuffers & 0x1)) {
3053 /* _NEW_MULTISAMPLE */
3054 if (_mesa_is_multisample_enabled(ctx)) {
3055 if (ctx->Multisample.SampleAlphaToCoverage) {
3056 blend.AlphaToCoverageEnable = true;
3057 blend.AlphaToCoverageDitherEnable = GEN_GEN >= 7;
3058 }
3059 if (ctx->Multisample.SampleAlphaToOne)
3060 blend.AlphaToOneEnable = true;
3061 }
3062
3063 /* _NEW_COLOR */
3064 if (ctx->Color.AlphaEnabled) {
3065 blend.AlphaTestEnable = true;
3066 blend.AlphaTestFunction =
3067 intel_translate_compare_func(ctx->Color.AlphaFunc);
3068 }
3069
3070 if (ctx->Color.DitherFlag) {
3071 blend.ColorDitherEnable = true;
3072 }
3073 }
3074
3075 #if GEN_GEN >= 8
3076 for (int i = 0; i < nr_draw_buffers; i++) {
3077 struct GENX(BLEND_STATE_ENTRY) entry = { 0 };
3078 #else
3079 {
3080 #endif
3081 blend.IndependentAlphaBlendEnable =
3082 set_blend_entry_bits(brw, &entry, i, blend.AlphaToOneEnable) ||
3083 blend.IndependentAlphaBlendEnable;
3084
3085 /* See section 8.1.6 "Pre-Blend Color Clamping" of the
3086 * SandyBridge PRM Volume 2 Part 1 for HW requirements.
3087 *
3088 * We do our ARB_color_buffer_float CLAMP_FRAGMENT_COLOR
3089 * clamping in the fragment shader. For its clamping of
3090 * blending, the spec says:
3091 *
3092 * "RESOLVED: For fixed-point color buffers, the inputs and
3093 * the result of the blending equation are clamped. For
3094 * floating-point color buffers, no clamping occurs."
3095 *
3096 * So, generally, we want clamping to the render target's range.
3097 * And, good news, the hardware tables for both pre- and
3098 * post-blend color clamping are either ignored, or any are
3099 * allowed, or clamping is required but RT range clamping is a
3100 * valid option.
3101 */
3102 entry.PreBlendColorClampEnable = true;
3103 entry.PostBlendColorClampEnable = true;
3104 entry.ColorClampRange = COLORCLAMP_RTFORMAT;
3105
3106 entry.WriteDisableRed = !GET_COLORMASK_BIT(ctx->Color.ColorMask, i, 0);
3107 entry.WriteDisableGreen = !GET_COLORMASK_BIT(ctx->Color.ColorMask, i, 1);
3108 entry.WriteDisableBlue = !GET_COLORMASK_BIT(ctx->Color.ColorMask, i, 2);
3109 entry.WriteDisableAlpha = !GET_COLORMASK_BIT(ctx->Color.ColorMask, i, 3);
3110
3111 #if GEN_GEN >= 8
3112 GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[1 + i * 2], &entry);
3113 #else
3114 GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[i * 2], &entry);
3115 #endif
3116 }
3117 }
3118
3119 #if GEN_GEN >= 8
3120 GENX(BLEND_STATE_pack)(NULL, blend_map, &blend);
3121 #endif
3122
3123 #if GEN_GEN < 7
3124 brw_batch_emit(brw, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
3125 ptr.PointertoBLEND_STATE = brw->cc.blend_state_offset;
3126 ptr.BLEND_STATEChange = true;
3127 }
3128 #else
3129 brw_batch_emit(brw, GENX(3DSTATE_BLEND_STATE_POINTERS), ptr) {
3130 ptr.BlendStatePointer = brw->cc.blend_state_offset;
3131 #if GEN_GEN >= 8
3132 ptr.BlendStatePointerValid = true;
3133 #endif
3134 }
3135 #endif
3136 }
3137
3138 static const struct brw_tracked_state genX(blend_state) = {
3139 .dirty = {
3140 .mesa = _NEW_BUFFERS |
3141 _NEW_COLOR |
3142 _NEW_MULTISAMPLE,
3143 .brw = BRW_NEW_BATCH |
3144 BRW_NEW_BLORP |
3145 BRW_NEW_FS_PROG_DATA |
3146 BRW_NEW_STATE_BASE_ADDRESS,
3147 },
3148 .emit = genX(upload_blend_state),
3149 };
3150 #endif
3151
3152 /* ---------------------------------------------------------------------- */
3153
3154 #if GEN_GEN >= 7
3155 UNUSED static const uint32_t push_constant_opcodes[] = {
3156 [MESA_SHADER_VERTEX] = 21,
3157 [MESA_SHADER_TESS_CTRL] = 25, /* HS */
3158 [MESA_SHADER_TESS_EVAL] = 26, /* DS */
3159 [MESA_SHADER_GEOMETRY] = 22,
3160 [MESA_SHADER_FRAGMENT] = 23,
3161 [MESA_SHADER_COMPUTE] = 0,
3162 };
3163
3164 static void
3165 genX(upload_push_constant_packets)(struct brw_context *brw)
3166 {
3167 const struct gen_device_info *devinfo = &brw->screen->devinfo;
3168 struct gl_context *ctx = &brw->ctx;
3169
3170 UNUSED uint32_t mocs = GEN_GEN < 8 ? GEN7_MOCS_L3 : 0;
3171
3172 struct brw_stage_state *stage_states[] = {
3173 &brw->vs.base,
3174 &brw->tcs.base,
3175 &brw->tes.base,
3176 &brw->gs.base,
3177 &brw->wm.base,
3178 };
3179
3180 if (GEN_GEN == 7 && !GEN_IS_HASWELL && !devinfo->is_baytrail &&
3181 stage_states[MESA_SHADER_VERTEX]->push_constants_dirty)
3182 gen7_emit_vs_workaround_flush(brw);
3183
3184 for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
3185 struct brw_stage_state *stage_state = stage_states[stage];
3186 UNUSED struct gl_program *prog = ctx->_Shader->CurrentProgram[stage];
3187
3188 if (!stage_state->push_constants_dirty)
3189 continue;
3190
3191 brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_VS), pkt) {
3192 pkt._3DCommandSubOpcode = push_constant_opcodes[stage];
3193 if (stage_state->prog_data) {
3194 #if GEN_GEN >= 8 || GEN_IS_HASWELL
3195 /* The Skylake PRM contains the following restriction:
3196 *
3197 * "The driver must ensure The following case does not occur
3198 * without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
3199 * buffer 3 read length equal to zero committed followed by a
3200 * 3DSTATE_CONSTANT_* with buffer 0 read length not equal to
3201 * zero committed."
3202 *
3203 * To avoid this, we program the buffers in the highest slots.
3204 * This way, slot 0 is only used if slot 3 is also used.
3205 */
3206 int n = 3;
3207
3208 for (int i = 3; i >= 0; i--) {
3209 const struct brw_ubo_range *range =
3210 &stage_state->prog_data->ubo_ranges[i];
3211
3212 if (range->length == 0)
3213 continue;
3214
3215 const struct gl_uniform_block *block =
3216 prog->sh.UniformBlocks[range->block];
3217 const struct gl_buffer_binding *binding =
3218 &ctx->UniformBufferBindings[block->Binding];
3219
3220 if (binding->BufferObject == ctx->Shared->NullBufferObj) {
3221 static unsigned msg_id = 0;
3222 _mesa_gl_debugf(ctx, &msg_id, MESA_DEBUG_SOURCE_API,
3223 MESA_DEBUG_TYPE_UNDEFINED,
3224 MESA_DEBUG_SEVERITY_HIGH,
3225 "UBO %d unbound, %s shader uniform data "
3226 "will be undefined.",
3227 range->block,
3228 _mesa_shader_stage_to_string(stage));
3229 continue;
3230 }
3231
3232 assert(binding->Offset % 32 == 0);
3233
3234 struct brw_bo *bo = intel_bufferobj_buffer(brw,
3235 intel_buffer_object(binding->BufferObject),
3236 binding->Offset, range->length * 32, false);
3237
3238 pkt.ConstantBody.ReadLength[n] = range->length;
3239 pkt.ConstantBody.Buffer[n] =
3240 ro_bo(bo, range->start * 32 + binding->Offset);
3241 n--;
3242 }
3243
3244 if (stage_state->push_const_size > 0) {
3245 assert(n >= 0);
3246 pkt.ConstantBody.ReadLength[n] = stage_state->push_const_size;
3247 pkt.ConstantBody.Buffer[n] =
3248 ro_bo(stage_state->push_const_bo,
3249 stage_state->push_const_offset);
3250 }
3251 #else
3252 pkt.ConstantBody.ReadLength[0] = stage_state->push_const_size;
3253 pkt.ConstantBody.Buffer[0].offset =
3254 stage_state->push_const_offset | mocs;
3255 #endif
3256 }
3257 }
3258
3259 stage_state->push_constants_dirty = false;
3260 brw->ctx.NewDriverState |= GEN_GEN >= 9 ? BRW_NEW_SURFACES : 0;
3261 }
3262 }
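
/* A standalone sketch (not part of the original file) of the
 * highest-slot-first packing used above: UBO range read lengths are
 * assigned to ConstantBody slots from 3 downward, with any push-constant
 * buffer taking the next slot down, so slot 0 is only occupied once the
 * higher slots are. This satisfies the Skylake PRM restriction quoted in
 * the loop.
 */
MAYBE_UNUSED static void
example_pack_constant_slots(const unsigned range_lengths[4],
                            unsigned push_const_length,
                            unsigned read_length_out[4])
{
   for (int i = 0; i < 4; i++)
      read_length_out[i] = 0;

   int n = 3;
   for (int i = 3; i >= 0; i--) {
      if (range_lengths[i] == 0)
         continue;
      read_length_out[n--] = range_lengths[i];   /* highest free slot */
   }

   if (push_const_length > 0) {
      assert(n >= 0);
      read_length_out[n] = push_const_length;
   }
}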
3263
3264 const struct brw_tracked_state genX(push_constant_packets) = {
3265 .dirty = {
3266 .mesa = 0,
3267 .brw = BRW_NEW_DRAW_CALL,
3268 },
3269 .emit = genX(upload_push_constant_packets),
3270 };
3271 #endif
3272
3273 #if GEN_GEN >= 6
3274 static void
3275 genX(upload_vs_push_constants)(struct brw_context *brw)
3276 {
3277 struct brw_stage_state *stage_state = &brw->vs.base;
3278
3279 /* BRW_NEW_VERTEX_PROGRAM */
3280 const struct gl_program *vp = brw->programs[MESA_SHADER_VERTEX];
3281 /* BRW_NEW_VS_PROG_DATA */
3282 const struct brw_stage_prog_data *prog_data = brw->vs.base.prog_data;
3283
3284 gen6_upload_push_constants(brw, vp, prog_data, stage_state);
3285 }
3286
3287 static const struct brw_tracked_state genX(vs_push_constants) = {
3288 .dirty = {
3289 .mesa = _NEW_PROGRAM_CONSTANTS |
3290 _NEW_TRANSFORM,
3291 .brw = BRW_NEW_BATCH |
3292 BRW_NEW_BLORP |
3293 BRW_NEW_VERTEX_PROGRAM |
3294 BRW_NEW_VS_PROG_DATA,
3295 },
3296 .emit = genX(upload_vs_push_constants),
3297 };
3298
3299 static void
3300 genX(upload_gs_push_constants)(struct brw_context *brw)
3301 {
3302 struct brw_stage_state *stage_state = &brw->gs.base;
3303
3304 /* BRW_NEW_GEOMETRY_PROGRAM */
3305 const struct gl_program *gp = brw->programs[MESA_SHADER_GEOMETRY];
3306
3307 /* BRW_NEW_GS_PROG_DATA */
3308 struct brw_stage_prog_data *prog_data = brw->gs.base.prog_data;
3309
3310 gen6_upload_push_constants(brw, gp, prog_data, stage_state);
3311 }
3312
3313 static const struct brw_tracked_state genX(gs_push_constants) = {
3314 .dirty = {
3315 .mesa = _NEW_PROGRAM_CONSTANTS |
3316 _NEW_TRANSFORM,
3317 .brw = BRW_NEW_BATCH |
3318 BRW_NEW_BLORP |
3319 BRW_NEW_GEOMETRY_PROGRAM |
3320 BRW_NEW_GS_PROG_DATA,
3321 },
3322 .emit = genX(upload_gs_push_constants),
3323 };
3324
3325 static void
3326 genX(upload_wm_push_constants)(struct brw_context *brw)
3327 {
3328 struct brw_stage_state *stage_state = &brw->wm.base;
3329 /* BRW_NEW_FRAGMENT_PROGRAM */
3330 const struct gl_program *fp = brw->programs[MESA_SHADER_FRAGMENT];
3331 /* BRW_NEW_FS_PROG_DATA */
3332 const struct brw_stage_prog_data *prog_data = brw->wm.base.prog_data;
3333
3334 gen6_upload_push_constants(brw, fp, prog_data, stage_state);
3335 }
3336
3337 static const struct brw_tracked_state genX(wm_push_constants) = {
3338 .dirty = {
3339 .mesa = _NEW_PROGRAM_CONSTANTS,
3340 .brw = BRW_NEW_BATCH |
3341 BRW_NEW_BLORP |
3342 BRW_NEW_FRAGMENT_PROGRAM |
3343 BRW_NEW_FS_PROG_DATA,
3344 },
3345 .emit = genX(upload_wm_push_constants),
3346 };
3347 #endif
3348
3349 /* ---------------------------------------------------------------------- */
3350
3351 #if GEN_GEN >= 6
3352 static unsigned
3353 genX(determine_sample_mask)(struct brw_context *brw)
3354 {
3355 struct gl_context *ctx = &brw->ctx;
3356 float coverage = 1.0f;
3357    bool coverage_invert = false;
3358 unsigned sample_mask = ~0u;
3359
3360 /* BRW_NEW_NUM_SAMPLES */
3361 unsigned num_samples = brw->num_samples;
3362
3363 if (_mesa_is_multisample_enabled(ctx)) {
3364 if (ctx->Multisample.SampleCoverage) {
3365 coverage = ctx->Multisample.SampleCoverageValue;
3366 coverage_invert = ctx->Multisample.SampleCoverageInvert;
3367 }
3368 if (ctx->Multisample.SampleMask) {
3369 sample_mask = ctx->Multisample.SampleMaskValue;
3370 }
3371 }
3372
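   /* Worked example (illustrative): num_samples = 4 with a SampleCoverage
    * value of 0.6 gives coverage_int = 2, so coverage_bits = 0b0011; if
    * inversion is requested this becomes 0b1100 before ANDing with
    * sample_mask.
    */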
3373 if (num_samples > 1) {
3374 int coverage_int = (int) (num_samples * coverage + 0.5f);
3375 uint32_t coverage_bits = (1 << coverage_int) - 1;
3376 if (coverage_invert)
3377 coverage_bits ^= (1 << num_samples) - 1;
3378 return coverage_bits & sample_mask;
3379 } else {
3380 return 1;
3381 }
3382 }
3383
3384 static void
3385 genX(emit_3dstate_multisample2)(struct brw_context *brw,
3386 unsigned num_samples)
3387 {
3388 unsigned log2_samples = ffs(num_samples) - 1;
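   /* e.g. num_samples = 8: ffs(8) - 1 = 3, the log2 encoding the packet
    * expects.
    */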
3389
3390 brw_batch_emit(brw, GENX(3DSTATE_MULTISAMPLE), multi) {
3391 multi.PixelLocation = CENTER;
3392 multi.NumberofMultisamples = log2_samples;
3393 #if GEN_GEN == 6
3394 GEN_SAMPLE_POS_4X(multi.Sample);
3395 #elif GEN_GEN == 7
3396 switch (num_samples) {
3397 case 1:
3398 GEN_SAMPLE_POS_1X(multi.Sample);
3399 break;
3400 case 2:
3401 GEN_SAMPLE_POS_2X(multi.Sample);
3402 break;
3403 case 4:
3404 GEN_SAMPLE_POS_4X(multi.Sample);
3405 break;
3406 case 8:
3407 GEN_SAMPLE_POS_8X(multi.Sample);
3408 break;
3409 default:
3410 break;
3411 }
3412 #endif
3413 }
3414 }
3415
3416 static void
3417 genX(upload_multisample_state)(struct brw_context *brw)
3418 {
3419 assert(brw->num_samples > 0 && brw->num_samples <= 16);
3420
3421 genX(emit_3dstate_multisample2)(brw, brw->num_samples);
3422
3423 brw_batch_emit(brw, GENX(3DSTATE_SAMPLE_MASK), sm) {
3424 sm.SampleMask = genX(determine_sample_mask)(brw);
3425 }
3426 }
3427
3428 static const struct brw_tracked_state genX(multisample_state) = {
3429 .dirty = {
3430 .mesa = _NEW_MULTISAMPLE |
3431 (GEN_GEN == 10 ? _NEW_BUFFERS : 0),
3432 .brw = BRW_NEW_BLORP |
3433 BRW_NEW_CONTEXT |
3434 BRW_NEW_NUM_SAMPLES,
3435 },
3436 .emit = genX(upload_multisample_state)
3437 };
3438 #endif
3439
3440 /* ---------------------------------------------------------------------- */
3441
3442 static void
3443 genX(upload_color_calc_state)(struct brw_context *brw)
3444 {
3445 struct gl_context *ctx = &brw->ctx;
3446
3447 brw_state_emit(brw, GENX(COLOR_CALC_STATE), 64, &brw->cc.state_offset, cc) {
3448 #if GEN_GEN <= 5
3449 cc.IndependentAlphaBlendEnable =
3450 set_blend_entry_bits(brw, &cc, 0, false);
3451 set_depth_stencil_bits(brw, &cc);
3452
3453 if (ctx->Color.AlphaEnabled &&
3454 ctx->DrawBuffer->_NumColorDrawBuffers <= 1) {
3455 cc.AlphaTestEnable = true;
3456 cc.AlphaTestFunction =
3457 intel_translate_compare_func(ctx->Color.AlphaFunc);
3458 }
3459
3460 cc.ColorDitherEnable = ctx->Color.DitherFlag;
3461
3462 cc.StatisticsEnable = brw->stats_wm;
3463
3464 cc.CCViewportStatePointer =
3465 ro_bo(brw->batch.state.bo, brw->cc.vp_offset);
3466 #else
3467 /* _NEW_COLOR */
3468 cc.BlendConstantColorRed = ctx->Color.BlendColorUnclamped[0];
3469 cc.BlendConstantColorGreen = ctx->Color.BlendColorUnclamped[1];
3470 cc.BlendConstantColorBlue = ctx->Color.BlendColorUnclamped[2];
3471 cc.BlendConstantColorAlpha = ctx->Color.BlendColorUnclamped[3];
3472
3473 #if GEN_GEN < 9
3474 /* _NEW_STENCIL */
3475 cc.StencilReferenceValue = _mesa_get_stencil_ref(ctx, 0);
3476 cc.BackfaceStencilReferenceValue =
3477 _mesa_get_stencil_ref(ctx, ctx->Stencil._BackFace);
3478 #endif
3479
3480 #endif
3481
3482 /* _NEW_COLOR */
3483 UNCLAMPED_FLOAT_TO_UBYTE(cc.AlphaReferenceValueAsUNORM8,
3484 ctx->Color.AlphaRef);
3485 }
3486
3487 #if GEN_GEN >= 6
3488 brw_batch_emit(brw, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
3489 ptr.ColorCalcStatePointer = brw->cc.state_offset;
3490 #if GEN_GEN != 7
3491 ptr.ColorCalcStatePointerValid = true;
3492 #endif
3493 }
3494 #else
3495 brw->ctx.NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
3496 #endif
3497 }
3498
3499 static const struct brw_tracked_state genX(color_calc_state) = {
3500 .dirty = {
3501 .mesa = _NEW_COLOR |
3502 _NEW_STENCIL |
3503 (GEN_GEN <= 5 ? _NEW_BUFFERS |
3504 _NEW_DEPTH
3505 : 0),
3506 .brw = BRW_NEW_BATCH |
3507 BRW_NEW_BLORP |
3508 (GEN_GEN <= 5 ? BRW_NEW_CC_VP |
3509 BRW_NEW_STATS_WM
3510 : BRW_NEW_CC_STATE |
3511 BRW_NEW_STATE_BASE_ADDRESS),
3512 },
3513 .emit = genX(upload_color_calc_state),
3514 };
3515
3516
3517 /* ---------------------------------------------------------------------- */
3518
3519 #if GEN_GEN >= 7
3520 static void
3521 genX(upload_sbe)(struct brw_context *brw)
3522 {
3523 struct gl_context *ctx = &brw->ctx;
3524 /* BRW_NEW_FRAGMENT_PROGRAM */
3525 UNUSED const struct gl_program *fp = brw->programs[MESA_SHADER_FRAGMENT];
3526 /* BRW_NEW_FS_PROG_DATA */
3527 const struct brw_wm_prog_data *wm_prog_data =
3528 brw_wm_prog_data(brw->wm.base.prog_data);
3529 #if GEN_GEN >= 8
3530 struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attr_overrides[16] = { { 0 } };
3531 #else
3532 #define attr_overrides sbe.Attribute
3533 #endif
3534 uint32_t urb_entry_read_length;
3535 uint32_t urb_entry_read_offset;
3536 uint32_t point_sprite_enables;
3537
3538 brw_batch_emit(brw, GENX(3DSTATE_SBE), sbe) {
3539 sbe.AttributeSwizzleEnable = true;
3540 sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
3541
3542 /* _NEW_BUFFERS */
3543 bool flip_y = ctx->DrawBuffer->FlipY;
3544
3545 /* _NEW_POINT
3546 *
3547 * Window coordinates in an FBO are inverted, which means point
3548 * sprite origin must be inverted.
3549 */
3550 if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) == flip_y)
3551 sbe.PointSpriteTextureCoordinateOrigin = LOWERLEFT;
3552 else
3553 sbe.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
3554
3555 /* _NEW_POINT | _NEW_LIGHT | _NEW_PROGRAM,
3556 * BRW_NEW_FS_PROG_DATA | BRW_NEW_FRAGMENT_PROGRAM |
3557 * BRW_NEW_GS_PROG_DATA | BRW_NEW_PRIMITIVE | BRW_NEW_TES_PROG_DATA |
3558 * BRW_NEW_VUE_MAP_GEOM_OUT
3559 */
3560 genX(calculate_attr_overrides)(brw,
3561 attr_overrides,
3562 &point_sprite_enables,
3563 &urb_entry_read_length,
3564 &urb_entry_read_offset);
3565
3566       /* Typically, the URB entry read length and offset should be programmed
3567        * in 3DSTATE_VS and 3DSTATE_GS; SBE inherits them from the last active
3568        * stage which produces geometry.  However, we don't know the proper
3569        * values until we call calculate_attr_overrides().
3570        *
3571        * To fit with our existing code, we override the inherited values and
3572        * specify them here directly, as we did on previous generations.
3573        */
3574 sbe.VertexURBEntryReadLength = urb_entry_read_length;
3575 sbe.VertexURBEntryReadOffset = urb_entry_read_offset;
3576 sbe.PointSpriteTextureCoordinateEnable = point_sprite_enables;
3577 sbe.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
3578
3579 #if GEN_GEN >= 8
3580 sbe.ForceVertexURBEntryReadLength = true;
3581 sbe.ForceVertexURBEntryReadOffset = true;
3582 #endif
3583
3584 #if GEN_GEN >= 9
3585 /* prepare the active component dwords */
3586 for (int i = 0; i < 32; i++)
3587 sbe.AttributeActiveComponentFormat[i] = ACTIVE_COMPONENT_XYZW;
3588 #endif
3589 }
3590
3591 #if GEN_GEN >= 8
3592 brw_batch_emit(brw, GENX(3DSTATE_SBE_SWIZ), sbes) {
3593 for (int i = 0; i < 16; i++)
3594 sbes.Attribute[i] = attr_overrides[i];
3595 }
3596 #endif
3597
3598 #undef attr_overrides
3599 }
3600
3601 static const struct brw_tracked_state genX(sbe_state) = {
3602 .dirty = {
3603 .mesa = _NEW_BUFFERS |
3604 _NEW_LIGHT |
3605 _NEW_POINT |
3606 _NEW_POLYGON |
3607 _NEW_PROGRAM,
3608 .brw = BRW_NEW_BLORP |
3609 BRW_NEW_CONTEXT |
3610 BRW_NEW_FRAGMENT_PROGRAM |
3611 BRW_NEW_FS_PROG_DATA |
3612 BRW_NEW_GS_PROG_DATA |
3613 BRW_NEW_TES_PROG_DATA |
3614 BRW_NEW_VUE_MAP_GEOM_OUT |
3615 (GEN_GEN == 7 ? BRW_NEW_PRIMITIVE
3616 : 0),
3617 },
3618 .emit = genX(upload_sbe),
3619 };
3620 #endif
3621
3622 /* ---------------------------------------------------------------------- */
3623
3624 #if GEN_GEN >= 7
3625 /**
3626 * Outputs the 3DSTATE_SO_DECL_LIST command.
3627 *
3628 * The data output is a series of 64-bit entries containing a SO_DECL per
3629 * stream. We only have one stream of rendering coming out of the GS unit, so
3630 * we only emit stream 0 (low 16 bits) SO_DECLs.
3631 */
3632 static void
3633 genX(upload_3dstate_so_decl_list)(struct brw_context *brw,
3634 const struct brw_vue_map *vue_map)
3635 {
3636 struct gl_context *ctx = &brw->ctx;
3637 /* BRW_NEW_TRANSFORM_FEEDBACK */
3638 struct gl_transform_feedback_object *xfb_obj =
3639 ctx->TransformFeedback.CurrentObject;
3640 const struct gl_transform_feedback_info *linked_xfb_info =
3641 xfb_obj->program->sh.LinkedTransformFeedback;
3642 struct GENX(SO_DECL) so_decl[MAX_VERTEX_STREAMS][128];
3643 int buffer_mask[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
3644 int next_offset[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
3645 int decls[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
3646 int max_decls = 0;
3647 STATIC_ASSERT(ARRAY_SIZE(so_decl[0]) >= MAX_PROGRAM_OUTPUTS);
3648
3649 memset(so_decl, 0, sizeof(so_decl));
3650
3651 /* Construct the list of SO_DECLs to be emitted. The formatting of the
3652 * command feels strange -- each dword pair contains a SO_DECL per stream.
3653 */
3654 for (unsigned i = 0; i < linked_xfb_info->NumOutputs; i++) {
3655 const struct gl_transform_feedback_output *output =
3656 &linked_xfb_info->Outputs[i];
3657 const int buffer = output->OutputBuffer;
3658 const int varying = output->OutputRegister;
3659 const unsigned stream_id = output->StreamId;
3660 assert(stream_id < MAX_VERTEX_STREAMS);
3661
3662 buffer_mask[stream_id] |= 1 << buffer;
3663
3664 assert(vue_map->varying_to_slot[varying] >= 0);
3665
3666 /* Mesa doesn't store entries for gl_SkipComponents in the Outputs[]
3667 * array. Instead, it simply increments DstOffset for the following
3668 * input by the number of components that should be skipped.
3669 *
3670 * Our hardware is unusual in that it requires us to program SO_DECLs
3671 * for fake "hole" components, rather than simply taking the offset
3672 * for each real varying. Each hole can have size 1, 2, 3, or 4; we
3673 * program as many size = 4 holes as we can, then a final hole to
3674 * accommodate the final 1, 2, or 3 remaining.
3675 */
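      /* e.g. (illustrative) a 7-component gap emits one hole with
       * ComponentMask 0xf followed by a second hole with 0x7.
       */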
3676 int skip_components = output->DstOffset - next_offset[buffer];
3677
3678 while (skip_components > 0) {
3679 so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
3680 .HoleFlag = 1,
3681 .OutputBufferSlot = output->OutputBuffer,
3682 .ComponentMask = (1 << MIN2(skip_components, 4)) - 1,
3683 };
3684 skip_components -= 4;
3685 }
3686
3687 next_offset[buffer] = output->DstOffset + output->NumComponents;
3688
3689 so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
3690 .OutputBufferSlot = output->OutputBuffer,
3691 .RegisterIndex = vue_map->varying_to_slot[varying],
3692 .ComponentMask =
3693 ((1 << output->NumComponents) - 1) << output->ComponentOffset,
3694 };
3695
3696 if (decls[stream_id] > max_decls)
3697 max_decls = decls[stream_id];
3698 }
3699
3700 uint32_t *dw;
3701 dw = brw_batch_emitn(brw, GENX(3DSTATE_SO_DECL_LIST), 3 + 2 * max_decls,
3702 .StreamtoBufferSelects0 = buffer_mask[0],
3703 .StreamtoBufferSelects1 = buffer_mask[1],
3704 .StreamtoBufferSelects2 = buffer_mask[2],
3705 .StreamtoBufferSelects3 = buffer_mask[3],
3706 .NumEntries0 = decls[0],
3707 .NumEntries1 = decls[1],
3708 .NumEntries2 = decls[2],
3709 .NumEntries3 = decls[3]);
3710
3711 for (int i = 0; i < max_decls; i++) {
3712 GENX(SO_DECL_ENTRY_pack)(
3713 brw, dw + 2 + i * 2,
3714 &(struct GENX(SO_DECL_ENTRY)) {
3715 .Stream0Decl = so_decl[0][i],
3716 .Stream1Decl = so_decl[1][i],
3717 .Stream2Decl = so_decl[2][i],
3718 .Stream3Decl = so_decl[3][i],
3719 });
3720 }
3721 }
3722
3723 static void
3724 genX(upload_3dstate_so_buffers)(struct brw_context *brw)
3725 {
3726 struct gl_context *ctx = &brw->ctx;
3727 /* BRW_NEW_TRANSFORM_FEEDBACK */
3728 struct gl_transform_feedback_object *xfb_obj =
3729 ctx->TransformFeedback.CurrentObject;
3730 #if GEN_GEN < 8
3731 const struct gl_transform_feedback_info *linked_xfb_info =
3732 xfb_obj->program->sh.LinkedTransformFeedback;
3733 #else
3734 struct brw_transform_feedback_object *brw_obj =
3735 (struct brw_transform_feedback_object *) xfb_obj;
3736 uint32_t mocs_wb = GEN_GEN >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
3737 #endif
3738
3739    /* Set up the (up to) four output buffers.  These are the ranges defined
3740     * in the gl_transform_feedback_object.
3741     */
3742 for (int i = 0; i < 4; i++) {
3743 struct intel_buffer_object *bufferobj =
3744 intel_buffer_object(xfb_obj->Buffers[i]);
3745 uint32_t start = xfb_obj->Offset[i];
3746 uint32_t end = ALIGN(start + xfb_obj->Size[i], 4);
3747 uint32_t const size = end - start;
3748
3749 if (!bufferobj || !size) {
3750 brw_batch_emit(brw, GENX(3DSTATE_SO_BUFFER), sob) {
3751 sob.SOBufferIndex = i;
3752 }
3753 continue;
3754 }
3755
3756 assert(start % 4 == 0);
3757 struct brw_bo *bo =
3758 intel_bufferobj_buffer(brw, bufferobj, start, size, true);
3759 assert(end <= bo->size);
3760
3761 brw_batch_emit(brw, GENX(3DSTATE_SO_BUFFER), sob) {
3762 sob.SOBufferIndex = i;
3763
3764 sob.SurfaceBaseAddress = rw_bo(bo, start);
3765 #if GEN_GEN < 8
3766 sob.SurfacePitch = linked_xfb_info->Buffers[i].Stride * 4;
3767 sob.SurfaceEndAddress = rw_bo(bo, end);
3768 #else
3769 sob.SOBufferEnable = true;
3770 sob.StreamOffsetWriteEnable = true;
3771 sob.StreamOutputBufferOffsetAddressEnable = true;
3772 sob.MOCS = mocs_wb;
3773
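         /* SurfaceSize is expressed in DWords, minus one: e.g. a 64-byte
          * range (16 DWords) programs 15.  The MAX2 keeps the minus-one
          * encoding legal for ranges smaller than one DWord.
          */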
3774 sob.SurfaceSize = MAX2(xfb_obj->Size[i] / 4, 1) - 1;
3775 sob.StreamOutputBufferOffsetAddress =
3776 rw_bo(brw_obj->offset_bo, i * sizeof(uint32_t));
3777
3778 if (brw_obj->zero_offsets) {
3779 /* Zero out the offset and write that to offset_bo */
3780 sob.StreamOffset = 0;
3781 } else {
3782 /* Use offset_bo as the "Stream Offset." */
3783 sob.StreamOffset = 0xFFFFFFFF;
3784 }
3785 #endif
3786 }
3787 }
3788
3789 #if GEN_GEN >= 8
3790 brw_obj->zero_offsets = false;
3791 #endif
3792 }
3793
3794 static bool
3795 query_active(struct gl_query_object *q)
3796 {
3797 return q && q->Active;
3798 }
3799
3800 static void
3801 genX(upload_3dstate_streamout)(struct brw_context *brw, bool active,
3802 const struct brw_vue_map *vue_map)
3803 {
3804 struct gl_context *ctx = &brw->ctx;
3805 /* BRW_NEW_TRANSFORM_FEEDBACK */
3806 struct gl_transform_feedback_object *xfb_obj =
3807 ctx->TransformFeedback.CurrentObject;
3808
3809 brw_batch_emit(brw, GENX(3DSTATE_STREAMOUT), sos) {
3810 if (active) {
3811 int urb_entry_read_offset = 0;
3812 int urb_entry_read_length = (vue_map->num_slots + 1) / 2 -
3813 urb_entry_read_offset;
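         /* Each URB read unit is a 256-bit row holding two vec4 slots,
          * hence (num_slots + 1) / 2: e.g. 9 slots -> 5 rows.
          */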
3814
3815 sos.SOFunctionEnable = true;
3816 sos.SOStatisticsEnable = true;
3817
3818 /* BRW_NEW_RASTERIZER_DISCARD */
3819 if (ctx->RasterDiscard) {
3820 if (!query_active(ctx->Query.PrimitivesGenerated[0])) {
3821 sos.RenderingDisable = true;
3822 } else {
3823 perf_debug("Rasterizer discard with a GL_PRIMITIVES_GENERATED "
3824 "query active relies on the clipper.\n");
3825 }
3826 }
3827
3828 /* _NEW_LIGHT */
3829 if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION)
3830 sos.ReorderMode = TRAILING;
3831
3832 #if GEN_GEN < 8
3833 sos.SOBufferEnable0 = xfb_obj->Buffers[0] != NULL;
3834 sos.SOBufferEnable1 = xfb_obj->Buffers[1] != NULL;
3835 sos.SOBufferEnable2 = xfb_obj->Buffers[2] != NULL;
3836 sos.SOBufferEnable3 = xfb_obj->Buffers[3] != NULL;
3837 #else
3838 const struct gl_transform_feedback_info *linked_xfb_info =
3839 xfb_obj->program->sh.LinkedTransformFeedback;
3840 /* Set buffer pitches; 0 means unbound. */
3841 if (xfb_obj->Buffers[0])
3842 sos.Buffer0SurfacePitch = linked_xfb_info->Buffers[0].Stride * 4;
3843 if (xfb_obj->Buffers[1])
3844 sos.Buffer1SurfacePitch = linked_xfb_info->Buffers[1].Stride * 4;
3845 if (xfb_obj->Buffers[2])
3846 sos.Buffer2SurfacePitch = linked_xfb_info->Buffers[2].Stride * 4;
3847 if (xfb_obj->Buffers[3])
3848 sos.Buffer3SurfacePitch = linked_xfb_info->Buffers[3].Stride * 4;
3849 #endif
3850
3851 /* We always read the whole vertex. This could be reduced at some
3852 * point by reading less and offsetting the register index in the
3853 * SO_DECLs.
3854 */
3855 sos.Stream0VertexReadOffset = urb_entry_read_offset;
3856 sos.Stream0VertexReadLength = urb_entry_read_length - 1;
3857 sos.Stream1VertexReadOffset = urb_entry_read_offset;
3858 sos.Stream1VertexReadLength = urb_entry_read_length - 1;
3859 sos.Stream2VertexReadOffset = urb_entry_read_offset;
3860 sos.Stream2VertexReadLength = urb_entry_read_length - 1;
3861 sos.Stream3VertexReadOffset = urb_entry_read_offset;
3862 sos.Stream3VertexReadLength = urb_entry_read_length - 1;
3863 }
3864 }
3865 }
3866
3867 static void
3868 genX(upload_sol)(struct brw_context *brw)
3869 {
3870 struct gl_context *ctx = &brw->ctx;
3871 /* BRW_NEW_TRANSFORM_FEEDBACK */
3872 bool active = _mesa_is_xfb_active_and_unpaused(ctx);
3873
3874 if (active) {
3875 genX(upload_3dstate_so_buffers)(brw);
3876
3877 /* BRW_NEW_VUE_MAP_GEOM_OUT */
3878 genX(upload_3dstate_so_decl_list)(brw, &brw->vue_map_geom_out);
3879 }
3880
3881 /* Finally, set up the SOL stage. This command must always follow updates to
3882 * the nonpipelined SOL state (3DSTATE_SO_BUFFER, 3DSTATE_SO_DECL_LIST) or
3883     * MMIO register updates (currently performed by the kernel at each batch
3884 * emit).
3885 */
3886 genX(upload_3dstate_streamout)(brw, active, &brw->vue_map_geom_out);
3887 }
3888
3889 static const struct brw_tracked_state genX(sol_state) = {
3890 .dirty = {
3891 .mesa = _NEW_LIGHT,
3892 .brw = BRW_NEW_BATCH |
3893 BRW_NEW_BLORP |
3894 BRW_NEW_RASTERIZER_DISCARD |
3895 BRW_NEW_VUE_MAP_GEOM_OUT |
3896 BRW_NEW_TRANSFORM_FEEDBACK,
3897 },
3898 .emit = genX(upload_sol),
3899 };
3900 #endif
3901
3902 /* ---------------------------------------------------------------------- */
3903
3904 #if GEN_GEN >= 7
3905 static void
3906 genX(upload_ps)(struct brw_context *brw)
3907 {
3908 UNUSED const struct gl_context *ctx = &brw->ctx;
3909 UNUSED const struct gen_device_info *devinfo = &brw->screen->devinfo;
3910
3911 /* BRW_NEW_FS_PROG_DATA */
3912 const struct brw_wm_prog_data *prog_data =
3913 brw_wm_prog_data(brw->wm.base.prog_data);
3914 const struct brw_stage_state *stage_state = &brw->wm.base;
3915
3919 brw_batch_emit(brw, GENX(3DSTATE_PS), ps) {
3920 /* Initialize the execution mask with VMask. Otherwise, derivatives are
3921 * incorrect for subspans where some of the pixels are unlit. We believe
3922 * the bit just didn't take effect in previous generations.
3923 */
3924 ps.VectorMaskEnable = GEN_GEN >= 8;
3925
3926 /* WA_1606682166:
3927 * "Incorrect TDL's SSP address shift in SARB for 16:6 & 18:8 modes.
3928 * Disable the Sampler state prefetch functionality in the SARB by
3929 * programming 0xB000[30] to '1'."
3930 */
3931 ps.SamplerCount = GEN_GEN == 11 ?
3932 0 : DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4);
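      /* e.g. 7 samplers: DIV_ROUND_UP(7, 4) = 2 -- the count is programmed
       * in multiples of four samplers (and forced to 0 by the Gen11
       * workaround above).
       */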
3933
3934 /* BRW_NEW_FS_PROG_DATA */
3935       /* Gen11 workarounds table #2056 WABTPPrefetchDisable suggests disabling
3936        * prefetching of binding tables on A0 and B0 steppings.
3937 * TODO: Revisit this workaround on C0 stepping.
3938 */
3939 ps.BindingTableEntryCount = GEN_GEN == 11 ?
3940 0 :
3941 prog_data->base.binding_table.size_bytes / 4;
3942
3943 if (prog_data->base.use_alt_mode)
3944 ps.FloatingPointMode = Alternate;
3945
3946 /* Haswell requires the sample mask to be set in this packet as well as
3947 * in 3DSTATE_SAMPLE_MASK; the values should match.
3948 */
3949
3950 /* _NEW_BUFFERS, _NEW_MULTISAMPLE */
3951 #if GEN_IS_HASWELL
3952       ps.SampleMask = genX(determine_sample_mask)(brw);
3953 #endif
3954
3955       /* 3DSTATE_PS expects the number of threads per PSD, which is always 64
3956        * on pre-Gen11 and 128 on Gen11+; on Gen11+, a programmed value of k
3957        * implies 2(k+1) threads.  It implicitly scales for different GT levels
3958        * (which have some number of PSDs).
3959        *
3960        * On Gen8 the format is U8-2, whereas on Gen9+ it is U9-1.
3961 */
3962 #if GEN_GEN >= 9
3963 ps.MaximumNumberofThreadsPerPSD = 64 - 1;
3964 #elif GEN_GEN >= 8
3965 ps.MaximumNumberofThreadsPerPSD = 64 - 2;
3966 #else
3967 ps.MaximumNumberofThreads = devinfo->max_wm_threads - 1;
3968 #endif
3969
3970 if (prog_data->base.nr_params > 0 ||
3971 prog_data->base.ubo_ranges[0].length > 0)
3972 ps.PushConstantEnable = true;
3973
3974 #if GEN_GEN < 8
3975 /* From the IVB PRM, volume 2 part 1, page 287:
3976 * "This bit is inserted in the PS payload header and made available to
3977 * the DataPort (either via the message header or via header bypass) to
3978 * indicate that oMask data (one or two phases) is included in Render
3979 * Target Write messages. If present, the oMask data is used to mask off
3980 * samples."
3981 */
3982 ps.oMaskPresenttoRenderTarget = prog_data->uses_omask;
3983
3984 /* The hardware wedges if you have this bit set but don't turn on any
3985 * dual source blend factors.
3986 *
3987 * BRW_NEW_FS_PROG_DATA | _NEW_COLOR
3988 */
3989 ps.DualSourceBlendEnable = prog_data->dual_src_blend &&
3990 (ctx->Color.BlendEnabled & 1) &&
3991 ctx->Color.Blend[0]._UsesDualSrc;
3992
3993 /* BRW_NEW_FS_PROG_DATA */
3994 ps.AttributeEnable = (prog_data->num_varying_inputs != 0);
3995 #endif
3996
3997 /* From the documentation for this packet:
3998 * "If the PS kernel does not need the Position XY Offsets to
3999 * compute a Position Value, then this field should be programmed
4000 * to POSOFFSET_NONE."
4001 *
4002 * "SW Recommendation: If the PS kernel needs the Position Offsets
4003 * to compute a Position XY value, this field should match Position
4004 * ZW Interpolation Mode to ensure a consistent position.xyzw
4005 * computation."
4006 *
4007 * We only require XY sample offsets. So, this recommendation doesn't
4008        * look useful at the moment.  We might need it in the future.
4009 */
4010 if (prog_data->uses_pos_offset)
4011 ps.PositionXYOffsetSelect = POSOFFSET_SAMPLE;
4012 else
4013 ps.PositionXYOffsetSelect = POSOFFSET_NONE;
4014
4015 ps._8PixelDispatchEnable = prog_data->dispatch_8;
4016 ps._16PixelDispatchEnable = prog_data->dispatch_16;
4017 ps._32PixelDispatchEnable = prog_data->dispatch_32;
4018
4019 /* From the Sky Lake PRM 3DSTATE_PS::32 Pixel Dispatch Enable:
4020 *
4021 * "When NUM_MULTISAMPLES = 16 or FORCE_SAMPLE_COUNT = 16, SIMD32
4022 * Dispatch must not be enabled for PER_PIXEL dispatch mode."
4023 *
4024 * Since 16x MSAA is first introduced on SKL, we don't need to apply
4025 * the workaround on any older hardware.
4026 *
4027 * BRW_NEW_NUM_SAMPLES
4028 */
4029 if (GEN_GEN >= 9 && !prog_data->persample_dispatch &&
4030 brw->num_samples == 16) {
4031 assert(ps._8PixelDispatchEnable || ps._16PixelDispatchEnable);
4032 ps._32PixelDispatchEnable = false;
4033 }
4034
4035 ps.DispatchGRFStartRegisterForConstantSetupData0 =
4036 brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 0);
4037 ps.DispatchGRFStartRegisterForConstantSetupData1 =
4038 brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 1);
4039 ps.DispatchGRFStartRegisterForConstantSetupData2 =
4040 brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 2);
4041
4042 ps.KernelStartPointer0 = stage_state->prog_offset +
4043 brw_wm_prog_data_prog_offset(prog_data, ps, 0);
4044 ps.KernelStartPointer1 = stage_state->prog_offset +
4045 brw_wm_prog_data_prog_offset(prog_data, ps, 1);
4046 ps.KernelStartPointer2 = stage_state->prog_offset +
4047 brw_wm_prog_data_prog_offset(prog_data, ps, 2);
4048
4049 if (prog_data->base.total_scratch) {
4050 ps.ScratchSpaceBasePointer =
4051 rw_32_bo(stage_state->scratch_bo,
4052 ffs(stage_state->per_thread_scratch) - 11);
4053 }
4054 }
4055 }
4056
4057 static const struct brw_tracked_state genX(ps_state) = {
4058 .dirty = {
4059 .mesa = _NEW_MULTISAMPLE |
4060 (GEN_GEN < 8 ? _NEW_BUFFERS |
4061 _NEW_COLOR
4062 : 0),
4063 .brw = BRW_NEW_BATCH |
4064 BRW_NEW_BLORP |
4065 BRW_NEW_FS_PROG_DATA |
4066 (GEN_GEN >= 9 ? BRW_NEW_NUM_SAMPLES : 0),
4067 },
4068 .emit = genX(upload_ps),
4069 };
4070 #endif
4071
4072 /* ---------------------------------------------------------------------- */
4073
4074 #if GEN_GEN >= 7
4075 static void
4076 genX(upload_hs_state)(struct brw_context *brw)
4077 {
4078 const struct gen_device_info *devinfo = &brw->screen->devinfo;
4079 struct brw_stage_state *stage_state = &brw->tcs.base;
4080 struct brw_stage_prog_data *stage_prog_data = stage_state->prog_data;
4081 const struct brw_vue_prog_data *vue_prog_data =
4082 brw_vue_prog_data(stage_prog_data);
4083
4084    /* BRW_NEW_TCS_PROG_DATA */
4085 struct brw_tcs_prog_data *tcs_prog_data =
4086 brw_tcs_prog_data(stage_prog_data);
4087
4088 if (!tcs_prog_data) {
4089 brw_batch_emit(brw, GENX(3DSTATE_HS), hs);
4090 } else {
4091 brw_batch_emit(brw, GENX(3DSTATE_HS), hs) {
4092 INIT_THREAD_DISPATCH_FIELDS(hs, Vertex);
4093
4094 hs.InstanceCount = tcs_prog_data->instances - 1;
4095 hs.IncludeVertexHandles = true;
4096
4097 hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1;
4098 }
4099 }
4100 }
4101
4102 static const struct brw_tracked_state genX(hs_state) = {
4103 .dirty = {
4104 .mesa = 0,
4105 .brw = BRW_NEW_BATCH |
4106 BRW_NEW_BLORP |
4107 BRW_NEW_TCS_PROG_DATA |
4108 BRW_NEW_TESS_PROGRAMS,
4109 },
4110 .emit = genX(upload_hs_state),
4111 };
4112
4113 static void
4114 genX(upload_ds_state)(struct brw_context *brw)
4115 {
4116 const struct gen_device_info *devinfo = &brw->screen->devinfo;
4117 const struct brw_stage_state *stage_state = &brw->tes.base;
4118 struct brw_stage_prog_data *stage_prog_data = stage_state->prog_data;
4119
4120 /* BRW_NEW_TES_PROG_DATA */
4121 const struct brw_tes_prog_data *tes_prog_data =
4122 brw_tes_prog_data(stage_prog_data);
4123 const struct brw_vue_prog_data *vue_prog_data =
4124 brw_vue_prog_data(stage_prog_data);
4125
4126 if (!tes_prog_data) {
4127 brw_batch_emit(brw, GENX(3DSTATE_DS), ds);
4128 } else {
4129 assert(GEN_GEN < 11 ||
4130 vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8);
4131
4132 brw_batch_emit(brw, GENX(3DSTATE_DS), ds) {
4133 INIT_THREAD_DISPATCH_FIELDS(ds, Patch);
4134
4135 ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1;
4136 ds.ComputeWCoordinateEnable =
4137 tes_prog_data->domain == BRW_TESS_DOMAIN_TRI;
4138
4139 #if GEN_GEN >= 8
4140 if (vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8)
4141 ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
4142 ds.UserClipDistanceCullTestEnableBitmask =
4143 vue_prog_data->cull_distance_mask;
4144 #endif
4145 }
4146 }
4147 }
4148
4149 static const struct brw_tracked_state genX(ds_state) = {
4150 .dirty = {
4151 .mesa = 0,
4152 .brw = BRW_NEW_BATCH |
4153 BRW_NEW_BLORP |
4154 BRW_NEW_TESS_PROGRAMS |
4155 BRW_NEW_TES_PROG_DATA,
4156 },
4157 .emit = genX(upload_ds_state),
4158 };
4159
4160 /* ---------------------------------------------------------------------- */
4161
4162 static void
4163 genX(upload_te_state)(struct brw_context *brw)
4164 {
4165 /* BRW_NEW_TESS_PROGRAMS */
4166    bool active = brw->programs[MESA_SHADER_TESS_EVAL] != NULL;
4167
4168 /* BRW_NEW_TES_PROG_DATA */
4169 const struct brw_tes_prog_data *tes_prog_data =
4170 brw_tes_prog_data(brw->tes.base.prog_data);
4171
4172 if (active) {
4173 brw_batch_emit(brw, GENX(3DSTATE_TE), te) {
4174 te.Partitioning = tes_prog_data->partitioning;
4175 te.OutputTopology = tes_prog_data->output_topology;
4176 te.TEDomain = tes_prog_data->domain;
4177 te.TEEnable = true;
4178 te.MaximumTessellationFactorOdd = 63.0;
4179 te.MaximumTessellationFactorNotOdd = 64.0;
4180 }
4181 } else {
4182 brw_batch_emit(brw, GENX(3DSTATE_TE), te);
4183 }
4184 }
4185
4186 static const struct brw_tracked_state genX(te_state) = {
4187 .dirty = {
4188 .mesa = 0,
4189 .brw = BRW_NEW_BLORP |
4190 BRW_NEW_CONTEXT |
4191 BRW_NEW_TES_PROG_DATA |
4192 BRW_NEW_TESS_PROGRAMS,
4193 },
4194    .emit = genX(upload_te_state),
4195 };
4196
4197 /* ---------------------------------------------------------------------- */
4198
4199 static void
4200 genX(upload_tes_push_constants)(struct brw_context *brw)
4201 {
4202 struct brw_stage_state *stage_state = &brw->tes.base;
4203 /* BRW_NEW_TESS_PROGRAMS */
4204 const struct gl_program *tep = brw->programs[MESA_SHADER_TESS_EVAL];
4205
4206 /* BRW_NEW_TES_PROG_DATA */
4207 const struct brw_stage_prog_data *prog_data = brw->tes.base.prog_data;
4208 gen6_upload_push_constants(brw, tep, prog_data, stage_state);
4209 }
4210
4211 static const struct brw_tracked_state genX(tes_push_constants) = {
4212 .dirty = {
4213 .mesa = _NEW_PROGRAM_CONSTANTS,
4214 .brw = BRW_NEW_BATCH |
4215 BRW_NEW_BLORP |
4216 BRW_NEW_TESS_PROGRAMS |
4217 BRW_NEW_TES_PROG_DATA,
4218 },
4219 .emit = genX(upload_tes_push_constants),
4220 };
4221
4222 static void
4223 genX(upload_tcs_push_constants)(struct brw_context *brw)
4224 {
4225 struct brw_stage_state *stage_state = &brw->tcs.base;
4226 /* BRW_NEW_TESS_PROGRAMS */
4227 const struct gl_program *tcp = brw->programs[MESA_SHADER_TESS_CTRL];
4228
4229 /* BRW_NEW_TCS_PROG_DATA */
4230 const struct brw_stage_prog_data *prog_data = brw->tcs.base.prog_data;
4231
4232 gen6_upload_push_constants(brw, tcp, prog_data, stage_state);
4233 }
4234
4235 static const struct brw_tracked_state genX(tcs_push_constants) = {
4236 .dirty = {
4237 .mesa = _NEW_PROGRAM_CONSTANTS,
4238 .brw = BRW_NEW_BATCH |
4239 BRW_NEW_BLORP |
4240 BRW_NEW_DEFAULT_TESS_LEVELS |
4241 BRW_NEW_TESS_PROGRAMS |
4242 BRW_NEW_TCS_PROG_DATA,
4243 },
4244 .emit = genX(upload_tcs_push_constants),
4245 };
4246
4247 #endif
4248
4249 /* ---------------------------------------------------------------------- */
4250
4251 #if GEN_GEN >= 7
4252 static void
4253 genX(upload_cs_push_constants)(struct brw_context *brw)
4254 {
4255 struct brw_stage_state *stage_state = &brw->cs.base;
4256
4257 /* BRW_NEW_COMPUTE_PROGRAM */
4258 const struct gl_program *cp = brw->programs[MESA_SHADER_COMPUTE];
4259
4260 if (cp) {
4261 /* BRW_NEW_CS_PROG_DATA */
4262 struct brw_cs_prog_data *cs_prog_data =
4263 brw_cs_prog_data(brw->cs.base.prog_data);
4264
4265 _mesa_shader_write_subroutine_indices(&brw->ctx, MESA_SHADER_COMPUTE);
4266 brw_upload_cs_push_constants(brw, cp, cs_prog_data, stage_state);
4267 }
4268 }
4269
4270 const struct brw_tracked_state genX(cs_push_constants) = {
4271 .dirty = {
4272 .mesa = _NEW_PROGRAM_CONSTANTS,
4273 .brw = BRW_NEW_BATCH |
4274 BRW_NEW_BLORP |
4275 BRW_NEW_COMPUTE_PROGRAM |
4276 BRW_NEW_CS_PROG_DATA,
4277 },
4278 .emit = genX(upload_cs_push_constants),
4279 };
4280
4281 /**
4282 * Creates a new CS constant buffer reflecting the current CS program's
4283 * constants, if needed by the CS program.
4284 */
4285 static void
4286 genX(upload_cs_pull_constants)(struct brw_context *brw)
4287 {
4288 struct brw_stage_state *stage_state = &brw->cs.base;
4289
4290 /* BRW_NEW_COMPUTE_PROGRAM */
4291 struct brw_program *cp =
4292 (struct brw_program *) brw->programs[MESA_SHADER_COMPUTE];
4293
4294 /* BRW_NEW_CS_PROG_DATA */
4295 const struct brw_stage_prog_data *prog_data = brw->cs.base.prog_data;
4296
4297 _mesa_shader_write_subroutine_indices(&brw->ctx, MESA_SHADER_COMPUTE);
4298 /* _NEW_PROGRAM_CONSTANTS */
4299 brw_upload_pull_constants(brw, BRW_NEW_SURFACES, &cp->program,
4300 stage_state, prog_data);
4301 }
4302
4303 const struct brw_tracked_state genX(cs_pull_constants) = {
4304 .dirty = {
4305 .mesa = _NEW_PROGRAM_CONSTANTS,
4306 .brw = BRW_NEW_BATCH |
4307 BRW_NEW_BLORP |
4308 BRW_NEW_COMPUTE_PROGRAM |
4309 BRW_NEW_CS_PROG_DATA,
4310 },
4311 .emit = genX(upload_cs_pull_constants),
4312 };
4313
4314 static void
4315 genX(upload_cs_state)(struct brw_context *brw)
4316 {
4317 if (!brw->cs.base.prog_data)
4318 return;
4319
4320 uint32_t offset;
4321 uint32_t *desc = (uint32_t*) brw_state_batch(
4322 brw, GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t), 64,
4323 &offset);
4324
4325 struct brw_stage_state *stage_state = &brw->cs.base;
4326 struct brw_stage_prog_data *prog_data = stage_state->prog_data;
4327 struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data);
4328 const struct gen_device_info *devinfo = &brw->screen->devinfo;
4329
4330 if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
4331 brw_emit_buffer_surface_state(
4332 brw, &stage_state->surf_offset[
4333 prog_data->binding_table.shader_time_start],
4334 brw->shader_time.bo, 0, ISL_FORMAT_RAW,
4335 brw->shader_time.bo->size, 1,
4336 RELOC_WRITE);
4337 }
4338
4339 uint32_t *bind = brw_state_batch(brw, prog_data->binding_table.size_bytes,
4340 32, &stage_state->bind_bo_offset);
4341
4342 /* The MEDIA_VFE_STATE documentation for Gen8+ says:
4343 *
4344 * "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
4345 * the only bits that are changed are scoreboard related: Scoreboard
4346 * Enable, Scoreboard Type, Scoreboard Mask, Scoreboard * Delta. For
4347 * these scoreboard related states, a MEDIA_STATE_FLUSH is sufficient."
4348 *
4349 * Earlier generations say "MI_FLUSH" instead of "stalling PIPE_CONTROL",
4350 * but MI_FLUSH isn't really a thing, so we assume they meant PIPE_CONTROL.
4351 */
4352 brw_emit_pipe_control_flush(brw, PIPE_CONTROL_CS_STALL);
4353
4354 brw_batch_emit(brw, GENX(MEDIA_VFE_STATE), vfe) {
4355 if (prog_data->total_scratch) {
4356 uint32_t per_thread_scratch_value;
4357
4358 if (GEN_GEN >= 8) {
4359 /* Broadwell's Per Thread Scratch Space is in the range [0, 11]
4360 * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M.
4361 */
4362 per_thread_scratch_value = ffs(stage_state->per_thread_scratch) - 11;
4363 } else if (GEN_IS_HASWELL) {
4364 /* Haswell's Per Thread Scratch Space is in the range [0, 10]
4365 * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M.
4366 */
4367 per_thread_scratch_value = ffs(stage_state->per_thread_scratch) - 12;
4368 } else {
4369 /* Earlier platforms use the range [0, 11] to mean [1kB, 12kB]
4370 * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB.
4371 */
4372 per_thread_scratch_value = stage_state->per_thread_scratch / 1024 - 1;
4373 }
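         /* e.g. per_thread_scratch = 2MB: ffs(2MB) = 22, so Gen8+ programs
          * 22 - 11 = 11 and Haswell programs 22 - 12 = 10, matching the 2M
          * entries in the tables above.
          */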
4374 vfe.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0);
4375 vfe.PerThreadScratchSpace = per_thread_scratch_value;
4376 }
4377
4378       /* If brw->screen->subslice_total is greater than one, then
4379        * devinfo->max_cs_threads stores the number of threads per sub-slice;
4380        * thus we need to multiply that number by the subslice count to get
4381        * the actual maximum number of threads; the -1 is because the HW
4382        * has a bias of 1 (it would not make sense to say the maximum number
4383        * of threads is 0).
4384        */
4385 const uint32_t subslices = MAX2(brw->screen->subslice_total, 1);
4386 vfe.MaximumNumberofThreads = devinfo->max_cs_threads * subslices - 1;
4387 vfe.NumberofURBEntries = GEN_GEN >= 8 ? 2 : 0;
4388 #if GEN_GEN < 11
4389 vfe.ResetGatewayTimer =
4390 Resettingrelativetimerandlatchingtheglobaltimestamp;
4391 #endif
4392 #if GEN_GEN < 9
4393 vfe.BypassGatewayControl = BypassingOpenGatewayCloseGatewayprotocol;
4394 #endif
4395 #if GEN_GEN == 7
4396 vfe.GPGPUMode = 1;
4397 #endif
4398
4399       /* We are uploading duplicated copies of the push constant uniforms for
4400        * each thread.  Although the local id data needs to vary per thread,
4401        * the rest of the uniform data does not.  Unfortunately this duplication
4402        * is required for gen7.  As of Haswell it can be avoided, but the older
4403        * mechanism with duplicated data continues to work.
4404 *
4405 * FINISHME: As of Haswell, we could make use of the
4406 * INTERFACE_DESCRIPTOR_DATA "Cross-Thread Constant Data Read Length"
4407 * field to only store one copy of uniform data.
4408 *
4409 * FINISHME: Broadwell adds a new alternative "Indirect Payload Storage"
4410 * which is described in the GPGPU_WALKER command and in the Broadwell
4411 * PRM Volume 7: 3D Media GPGPU, under Media GPGPU Pipeline => Mode of
4412 * Operations => GPGPU Mode => Indirect Payload Storage.
4413 *
4414 * Note: The constant data is built in brw_upload_cs_push_constants
4415 * below.
4416 */
4417 vfe.URBEntryAllocationSize = GEN_GEN >= 8 ? 2 : 0;
4418
4419 const uint32_t vfe_curbe_allocation =
4420 ALIGN(cs_prog_data->push.per_thread.regs * cs_prog_data->threads +
4421 cs_prog_data->push.cross_thread.regs, 2);
4422 vfe.CURBEAllocationSize = vfe_curbe_allocation;
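      /* e.g. (illustrative) 2 per-thread regs * 8 threads + 1 cross-thread
       * reg = 17, aligned up to 18.
       */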
4423 }
4424
4425 if (cs_prog_data->push.total.size > 0) {
4426 brw_batch_emit(brw, GENX(MEDIA_CURBE_LOAD), curbe) {
4427 curbe.CURBETotalDataLength =
4428 ALIGN(cs_prog_data->push.total.size, 64);
4429 curbe.CURBEDataStartAddress = stage_state->push_const_offset;
4430 }
4431 }
4432
4433 /* BRW_NEW_SURFACES and BRW_NEW_*_CONSTBUF */
4434 memcpy(bind, stage_state->surf_offset,
4435 prog_data->binding_table.size_bytes);
4436 const struct GENX(INTERFACE_DESCRIPTOR_DATA) idd = {
4437 .KernelStartPointer = brw->cs.base.prog_offset,
4438 .SamplerStatePointer = stage_state->sampler_offset,
4439 .SamplerCount = DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4),
4440 .BindingTablePointer = stage_state->bind_bo_offset,
4441 .ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs,
4442 .NumberofThreadsinGPGPUThreadGroup = cs_prog_data->threads,
4443 .SharedLocalMemorySize = encode_slm_size(GEN_GEN,
4444 prog_data->total_shared),
4445 .BarrierEnable = cs_prog_data->uses_barrier,
4446 #if GEN_GEN >= 8 || GEN_IS_HASWELL
4447 .CrossThreadConstantDataReadLength =
4448 cs_prog_data->push.cross_thread.regs,
4449 #endif
4450 };
4451
4452 GENX(INTERFACE_DESCRIPTOR_DATA_pack)(brw, desc, &idd);
4453
4454 brw_batch_emit(brw, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), load) {
4455 load.InterfaceDescriptorTotalLength =
4456 GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
4457 load.InterfaceDescriptorDataStartAddress = offset;
4458 }
4459 }
4460
4461 static const struct brw_tracked_state genX(cs_state) = {
4462 .dirty = {
4463 .mesa = _NEW_PROGRAM_CONSTANTS,
4464 .brw = BRW_NEW_BATCH |
4465 BRW_NEW_BLORP |
4466 BRW_NEW_CS_PROG_DATA |
4467 BRW_NEW_SAMPLER_STATE_TABLE |
4468 BRW_NEW_SURFACES,
4469 },
4470 .emit = genX(upload_cs_state)
4471 };
4472
4473 #define GPGPU_DISPATCHDIMX 0x2500
4474 #define GPGPU_DISPATCHDIMY 0x2504
4475 #define GPGPU_DISPATCHDIMZ 0x2508
4476
4477 #define MI_PREDICATE_SRC0 0x2400
4478 #define MI_PREDICATE_SRC1 0x2408
4479
4480 static void
4481 prepare_indirect_gpgpu_walker(struct brw_context *brw)
4482 {
4483 GLintptr indirect_offset = brw->compute.num_work_groups_offset;
4484 struct brw_bo *bo = brw->compute.num_work_groups_bo;
4485
4486 emit_lrm(brw, GPGPU_DISPATCHDIMX, ro_bo(bo, indirect_offset + 0));
4487 emit_lrm(brw, GPGPU_DISPATCHDIMY, ro_bo(bo, indirect_offset + 4));
4488 emit_lrm(brw, GPGPU_DISPATCHDIMZ, ro_bo(bo, indirect_offset + 8));
4489
4490 #if GEN_GEN <= 7
4491 /* Clear upper 32-bits of SRC0 and all 64-bits of SRC1 */
4492 emit_lri(brw, MI_PREDICATE_SRC0 + 4, 0);
4493 emit_lri(brw, MI_PREDICATE_SRC1 , 0);
4494 emit_lri(brw, MI_PREDICATE_SRC1 + 4, 0);
4495
4496 /* Load compute_dispatch_indirect_x_size into SRC0 */
4497 emit_lrm(brw, MI_PREDICATE_SRC0, ro_bo(bo, indirect_offset + 0));
4498
4499 /* predicate = (compute_dispatch_indirect_x_size == 0); */
4500 brw_batch_emit(brw, GENX(MI_PREDICATE), mip) {
4501 mip.LoadOperation = LOAD_LOAD;
4502 mip.CombineOperation = COMBINE_SET;
4503 mip.CompareOperation = COMPARE_SRCS_EQUAL;
4504 }
4505
4506 /* Load compute_dispatch_indirect_y_size into SRC0 */
4507 emit_lrm(brw, MI_PREDICATE_SRC0, ro_bo(bo, indirect_offset + 4));
4508
4509 /* predicate |= (compute_dispatch_indirect_y_size == 0); */
4510 brw_batch_emit(brw, GENX(MI_PREDICATE), mip) {
4511 mip.LoadOperation = LOAD_LOAD;
4512 mip.CombineOperation = COMBINE_OR;
4513 mip.CompareOperation = COMPARE_SRCS_EQUAL;
4514 }
4515
4516 /* Load compute_dispatch_indirect_z_size into SRC0 */
4517 emit_lrm(brw, MI_PREDICATE_SRC0, ro_bo(bo, indirect_offset + 8));
4518
4519 /* predicate |= (compute_dispatch_indirect_z_size == 0); */
4520 brw_batch_emit(brw, GENX(MI_PREDICATE), mip) {
4521 mip.LoadOperation = LOAD_LOAD;
4522 mip.CombineOperation = COMBINE_OR;
4523 mip.CompareOperation = COMPARE_SRCS_EQUAL;
4524 }
4525
4526 /* predicate = !predicate; */
4527 #define COMPARE_FALSE 1
4528 brw_batch_emit(brw, GENX(MI_PREDICATE), mip) {
4529 mip.LoadOperation = LOAD_LOADINV;
4530 mip.CombineOperation = COMBINE_OR;
4531 mip.CompareOperation = COMPARE_FALSE;
4532 }
4533 #endif
4534 }
4535
4536 static void
4537 genX(emit_gpgpu_walker)(struct brw_context *brw)
4538 {
4539 const struct brw_cs_prog_data *prog_data =
4540 brw_cs_prog_data(brw->cs.base.prog_data);
4541
4542 const GLuint *num_groups = brw->compute.num_work_groups;
4543
4544 bool indirect = brw->compute.num_work_groups_bo != NULL;
4545 if (indirect)
4546 prepare_indirect_gpgpu_walker(brw);
4547
4548 const unsigned simd_size = prog_data->simd_size;
4549 unsigned group_size = prog_data->local_size[0] *
4550 prog_data->local_size[1] * prog_data->local_size[2];
4551
4552 uint32_t right_mask = 0xffffffffu >> (32 - simd_size);
4553 const unsigned right_non_aligned = group_size & (simd_size - 1);
4554 if (right_non_aligned != 0)
4555 right_mask >>= (simd_size - right_non_aligned);
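   /* e.g. simd_size = 16, group_size = 40: right_mask starts as 0xffff,
    * 40 & 15 = 8, so right_mask becomes 0xff and the final thread executes
    * only 8 channels.
    */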
4556
4557 brw_batch_emit(brw, GENX(GPGPU_WALKER), ggw) {
4558 ggw.IndirectParameterEnable = indirect;
4559 ggw.PredicateEnable = GEN_GEN <= 7 && indirect;
4560 ggw.SIMDSize = prog_data->simd_size / 16;
4561 ggw.ThreadDepthCounterMaximum = 0;
4562 ggw.ThreadHeightCounterMaximum = 0;
4563 ggw.ThreadWidthCounterMaximum = prog_data->threads - 1;
4564 ggw.ThreadGroupIDXDimension = num_groups[0];
4565 ggw.ThreadGroupIDYDimension = num_groups[1];
4566 ggw.ThreadGroupIDZDimension = num_groups[2];
4567 ggw.RightExecutionMask = right_mask;
4568 ggw.BottomExecutionMask = 0xffffffff;
4569 }
4570
4571 brw_batch_emit(brw, GENX(MEDIA_STATE_FLUSH), msf);
4572 }
4573
4574 #endif
4575
4576 /* ---------------------------------------------------------------------- */
4577
4578 #if GEN_GEN >= 8
4579 static void
4580 genX(upload_raster)(struct brw_context *brw)
4581 {
4582 const struct gl_context *ctx = &brw->ctx;
4583
4584 /* _NEW_BUFFERS */
4585 const bool flip_y = ctx->DrawBuffer->FlipY;
4586
4587 /* _NEW_POLYGON */
4588 const struct gl_polygon_attrib *polygon = &ctx->Polygon;
4589
4590 /* _NEW_POINT */
4591 const struct gl_point_attrib *point = &ctx->Point;
4592
4593 brw_batch_emit(brw, GENX(3DSTATE_RASTER), raster) {
4594 if (brw->polygon_front_bit != flip_y)
4595 raster.FrontWinding = CounterClockwise;
4596
4597 if (polygon->CullFlag) {
4598 switch (polygon->CullFaceMode) {
4599 case GL_FRONT:
4600 raster.CullMode = CULLMODE_FRONT;
4601 break;
4602 case GL_BACK:
4603 raster.CullMode = CULLMODE_BACK;
4604 break;
4605 case GL_FRONT_AND_BACK:
4606 raster.CullMode = CULLMODE_BOTH;
4607 break;
4608 default:
4609 unreachable("not reached");
4610 }
4611 } else {
4612 raster.CullMode = CULLMODE_NONE;
4613 }
4614
4615 raster.SmoothPointEnable = point->SmoothFlag;
4616
4617 raster.DXMultisampleRasterizationEnable =
4618 _mesa_is_multisample_enabled(ctx);
4619
4620 raster.GlobalDepthOffsetEnableSolid = polygon->OffsetFill;
4621 raster.GlobalDepthOffsetEnableWireframe = polygon->OffsetLine;
4622 raster.GlobalDepthOffsetEnablePoint = polygon->OffsetPoint;
4623
4624 switch (polygon->FrontMode) {
4625 case GL_FILL:
4626 raster.FrontFaceFillMode = FILL_MODE_SOLID;
4627 break;
4628 case GL_LINE:
4629 raster.FrontFaceFillMode = FILL_MODE_WIREFRAME;
4630 break;
4631 case GL_POINT:
4632 raster.FrontFaceFillMode = FILL_MODE_POINT;
4633 break;
4634 default:
4635 unreachable("not reached");
4636 }
4637
4638 switch (polygon->BackMode) {
4639 case GL_FILL:
4640 raster.BackFaceFillMode = FILL_MODE_SOLID;
4641 break;
4642 case GL_LINE:
4643 raster.BackFaceFillMode = FILL_MODE_WIREFRAME;
4644 break;
4645 case GL_POINT:
4646 raster.BackFaceFillMode = FILL_MODE_POINT;
4647 break;
4648 default:
4649 unreachable("not reached");
4650 }
4651
4652 /* _NEW_LINE */
4653 raster.AntialiasingEnable = ctx->Line.SmoothFlag;
4654
4655 #if GEN_GEN == 10
4656 /* _NEW_BUFFERS
4657 * Antialiasing Enable bit MUST not be set when NUM_MULTISAMPLES > 1.
4658 */
4659 const bool multisampled_fbo =
4660 _mesa_geometric_samples(ctx->DrawBuffer) > 1;
4661 if (multisampled_fbo)
4662 raster.AntialiasingEnable = false;
4663 #endif
4664
4665 /* _NEW_SCISSOR */
4666 raster.ScissorRectangleEnable = ctx->Scissor.EnableFlags;
4667
4668 /* _NEW_TRANSFORM */
4669 #if GEN_GEN < 9
4670 if (!(ctx->Transform.DepthClampNear &&
4671 ctx->Transform.DepthClampFar))
4672 raster.ViewportZClipTestEnable = true;
4673 #endif
4674
4675 #if GEN_GEN >= 9
4676 if (!ctx->Transform.DepthClampNear)
4677 raster.ViewportZNearClipTestEnable = true;
4678
4679 if (!ctx->Transform.DepthClampFar)
4680 raster.ViewportZFarClipTestEnable = true;
4681 #endif
4682
4683 /* BRW_NEW_CONSERVATIVE_RASTERIZATION */
4684 #if GEN_GEN >= 9
4685 raster.ConservativeRasterizationEnable =
4686 ctx->IntelConservativeRasterization;
4687 #endif
4688
4689 raster.GlobalDepthOffsetClamp = polygon->OffsetClamp;
4690 raster.GlobalDepthOffsetScale = polygon->OffsetFactor;
4691
4692 raster.GlobalDepthOffsetConstant = polygon->OffsetUnits * 2;
4693 }
4694 }
4695
4696 static const struct brw_tracked_state genX(raster_state) = {
4697 .dirty = {
4698 .mesa = _NEW_BUFFERS |
4699 _NEW_LINE |
4700 _NEW_MULTISAMPLE |
4701 _NEW_POINT |
4702 _NEW_POLYGON |
4703 _NEW_SCISSOR |
4704 _NEW_TRANSFORM,
4705 .brw = BRW_NEW_BLORP |
4706 BRW_NEW_CONTEXT |
4707 BRW_NEW_CONSERVATIVE_RASTERIZATION,
4708 },
4709 .emit = genX(upload_raster),
4710 };
4711 #endif
4712
4713 /* ---------------------------------------------------------------------- */
4714
4715 #if GEN_GEN >= 8
4716 static void
4717 genX(upload_ps_extra)(struct brw_context *brw)
4718 {
4719 UNUSED struct gl_context *ctx = &brw->ctx;
4720
4721 const struct brw_wm_prog_data *prog_data =
4722 brw_wm_prog_data(brw->wm.base.prog_data);
4723
4724 brw_batch_emit(brw, GENX(3DSTATE_PS_EXTRA), psx) {
4725 psx.PixelShaderValid = true;
4726 psx.PixelShaderComputedDepthMode = prog_data->computed_depth_mode;
4727 psx.PixelShaderKillsPixel = prog_data->uses_kill;
4728 psx.AttributeEnable = prog_data->num_varying_inputs != 0;
4729 psx.PixelShaderUsesSourceDepth = prog_data->uses_src_depth;
4730 psx.PixelShaderUsesSourceW = prog_data->uses_src_w;
4731 psx.PixelShaderIsPerSample = prog_data->persample_dispatch;
4732
4733 /* _NEW_MULTISAMPLE | BRW_NEW_CONSERVATIVE_RASTERIZATION */
4734 if (prog_data->uses_sample_mask) {
4735 #if GEN_GEN >= 9
4736 if (prog_data->post_depth_coverage)
4737 psx.InputCoverageMaskState = ICMS_DEPTH_COVERAGE;
4738 else if (prog_data->inner_coverage && ctx->IntelConservativeRasterization)
4739 psx.InputCoverageMaskState = ICMS_INNER_CONSERVATIVE;
4740 else
4741 psx.InputCoverageMaskState = ICMS_NORMAL;
4742 #else
4743 psx.PixelShaderUsesInputCoverageMask = true;
4744 #endif
4745 }
4746
4747 psx.oMaskPresenttoRenderTarget = prog_data->uses_omask;
4748 #if GEN_GEN >= 9
4749 psx.PixelShaderPullsBary = prog_data->pulls_bary;
4750 psx.PixelShaderComputesStencil = prog_data->computed_stencil;
4751 #endif
4752
4753 /* The stricter cross-primitive coherency guarantees that the hardware
4754 * gives us with the "Accesses UAV" bit set for at least one shader stage
4755 * and the "UAV coherency required" bit set on the 3DPRIMITIVE command
4756 * are redundant within the current image, atomic counter and SSBO GL
4757 * APIs, which all have very loose ordering and coherency requirements
4758 * and generally rely on the application to insert explicit barriers when
4759 * a shader invocation is expected to see the memory writes performed by
4760 * the invocations of some previous primitive. Regardless of the value
4761 * of "UAV coherency required", the "Accesses UAV" bits will implicitly
4762        * cause a DC flush (useless in most cases) when the lowermost stage
4763        * with the bit set finishes execution.
4764 *
4765 * It would be nice to disable it, but in some cases we can't because on
4766 * Gen8+ it also has an influence on rasterization via the PS UAV-only
4767 * signal (which could be set independently from the coherency mechanism
4768 * in the 3DSTATE_WM command on Gen7), and because in some cases it will
4769 * determine whether the hardware skips execution of the fragment shader
4770 * or not via the ThreadDispatchEnable signal. However if we know that
4771 * GEN8_PS_BLEND_HAS_WRITEABLE_RT is going to be set and
4772 * GEN8_PSX_PIXEL_SHADER_NO_RT_WRITE is not set it shouldn't make any
4773 * difference so we may just disable it here.
4774 *
4775 * Gen8 hardware tries to compute ThreadDispatchEnable for us but doesn't
4776 * take into account KillPixels when no depth or stencil writes are
4777 * enabled. In order for occlusion queries to work correctly with no
4778        * attachments, we need to force-enable it here.
4779 *
4780 * BRW_NEW_FS_PROG_DATA | BRW_NEW_FRAGMENT_PROGRAM | _NEW_BUFFERS |
4781 * _NEW_COLOR
4782 */
4783 if ((prog_data->has_side_effects || prog_data->uses_kill) &&
4784 !brw_color_buffer_write_enabled(brw))
4785 psx.PixelShaderHasUAV = true;
4786 }
4787 }
4788
4789 const struct brw_tracked_state genX(ps_extra) = {
4790 .dirty = {
4791 .mesa = _NEW_BUFFERS | _NEW_COLOR,
4792 .brw = BRW_NEW_BLORP |
4793 BRW_NEW_CONTEXT |
4794 BRW_NEW_FRAGMENT_PROGRAM |
4795 BRW_NEW_FS_PROG_DATA |
4796 BRW_NEW_CONSERVATIVE_RASTERIZATION,
4797 },
4798 .emit = genX(upload_ps_extra),
4799 };
4800 #endif
4801
4802 /* ---------------------------------------------------------------------- */
4803
4804 #if GEN_GEN >= 8
4805 static void
4806 genX(upload_ps_blend)(struct brw_context *brw)
4807 {
4808 struct gl_context *ctx = &brw->ctx;
4809
4810 /* _NEW_BUFFERS */
4811 struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[0];
4812 const bool buffer0_is_integer = ctx->DrawBuffer->_IntegerBuffers & 0x1;
4813
4814 /* _NEW_COLOR */
4815 struct gl_colorbuffer_attrib *color = &ctx->Color;
4816
4817 brw_batch_emit(brw, GENX(3DSTATE_PS_BLEND), pb) {
4818 /* BRW_NEW_FRAGMENT_PROGRAM | _NEW_BUFFERS | _NEW_COLOR */
4819 pb.HasWriteableRT = brw_color_buffer_write_enabled(brw);
4820
4821 bool alpha_to_one = false;
4822
4823 if (!buffer0_is_integer) {
4824 /* _NEW_MULTISAMPLE */
4825
4826 if (_mesa_is_multisample_enabled(ctx)) {
4827 pb.AlphaToCoverageEnable = ctx->Multisample.SampleAlphaToCoverage;
4828 alpha_to_one = ctx->Multisample.SampleAlphaToOne;
4829 }
4830
4831 pb.AlphaTestEnable = color->AlphaEnabled;
4832 }
4833
4834 /* Used for implementing the following bit of GL_EXT_texture_integer:
4835 * "Per-fragment operations that require floating-point color
4836 * components, including multisample alpha operations, alpha test,
4837 * blending, and dithering, have no effect when the corresponding
4838 * colors are written to an integer color buffer."
4839 *
4840 * The OpenGL specification 3.3 (page 196), section 4.1.3 says:
4841 * "If drawbuffer zero is not NONE and the buffer it references has an
4842 * integer format, the SAMPLE_ALPHA_TO_COVERAGE and SAMPLE_ALPHA_TO_ONE
4843 * operations are skipped."
4844 */
4845 if (rb && !buffer0_is_integer && (color->BlendEnabled & 1)) {
4846 GLenum eqRGB = color->Blend[0].EquationRGB;
4847 GLenum eqA = color->Blend[0].EquationA;
4848 GLenum srcRGB = color->Blend[0].SrcRGB;
4849 GLenum dstRGB = color->Blend[0].DstRGB;
4850 GLenum srcA = color->Blend[0].SrcA;
4851 GLenum dstA = color->Blend[0].DstA;
4852
4853 if (eqRGB == GL_MIN || eqRGB == GL_MAX)
4854 srcRGB = dstRGB = GL_ONE;
4855
4856 if (eqA == GL_MIN || eqA == GL_MAX)
4857 srcA = dstA = GL_ONE;
4858
4859 /* Due to hardware limitations, the destination may have information
4860 * in an alpha channel even when the format specifies no alpha
4861 * channel. In order to avoid getting any incorrect blending due to
4862 * that alpha channel, coerce the blend factors to values that will
4863 * not read the alpha channel, but will instead use the correct
4864 * implicit value for alpha.
4865 */
4866 if (!_mesa_base_format_has_channel(rb->_BaseFormat,
4867 GL_TEXTURE_ALPHA_TYPE)) {
4868 srcRGB = brw_fix_xRGB_alpha(srcRGB);
4869 srcA = brw_fix_xRGB_alpha(srcA);
4870 dstRGB = brw_fix_xRGB_alpha(dstRGB);
4871 dstA = brw_fix_xRGB_alpha(dstA);
4872 }
4873
4874 /* Alpha to One doesn't work with Dual Color Blending. Override
4875 * SRC1_ALPHA to ONE and ONE_MINUS_SRC1_ALPHA to ZERO.
4876 */
4877 if (alpha_to_one && color->Blend[0]._UsesDualSrc) {
4878 srcRGB = fix_dual_blend_alpha_to_one(srcRGB);
4879 srcA = fix_dual_blend_alpha_to_one(srcA);
4880 dstRGB = fix_dual_blend_alpha_to_one(dstRGB);
4881 dstA = fix_dual_blend_alpha_to_one(dstA);
4882 }
4883
4884 /* BRW_NEW_FS_PROG_DATA */
4885 const struct brw_wm_prog_data *wm_prog_data =
4886 brw_wm_prog_data(brw->wm.base.prog_data);
4887
4888 /* The Dual Source Blending documentation says:
4889 *
4890 * "If SRC1 is included in a src/dst blend factor and
4891 * a DualSource RT Write message is not used, results
4892 * are UNDEFINED. (This reflects the same restriction in DX APIs,
4893 * where undefined results are produced if “o1” is not written
4894 * by a PS – there are no default values defined).
4895 * If SRC1 is not included in a src/dst blend factor,
4896 * dual source blending must be disabled."
4897 *
4898 * There is no way to gracefully fix this undefined situation
4899 * so we just disable the blending to prevent possible issues.
4900 */
4901 pb.ColorBufferBlendEnable =
4902 !color->Blend[0]._UsesDualSrc || wm_prog_data->dual_src_blend;
4903 pb.SourceAlphaBlendFactor = brw_translate_blend_factor(srcA);
4904 pb.DestinationAlphaBlendFactor = brw_translate_blend_factor(dstA);
4905 pb.SourceBlendFactor = brw_translate_blend_factor(srcRGB);
4906 pb.DestinationBlendFactor = brw_translate_blend_factor(dstRGB);
4907
4908 pb.IndependentAlphaBlendEnable =
4909 srcA != srcRGB || dstA != dstRGB || eqA != eqRGB;
4910 }
4911 }
4912 }
4913
4914 static const struct brw_tracked_state genX(ps_blend) = {
4915 .dirty = {
4916 .mesa = _NEW_BUFFERS |
4917 _NEW_COLOR |
4918 _NEW_MULTISAMPLE,
4919 .brw = BRW_NEW_BLORP |
4920 BRW_NEW_CONTEXT |
4921 BRW_NEW_FRAGMENT_PROGRAM |
4922 BRW_NEW_FS_PROG_DATA,
4923 },
4924 .emit = genX(upload_ps_blend)
4925 };
4926 #endif
4927
4928 /* ---------------------------------------------------------------------- */
4929
4930 #if GEN_GEN >= 8
4931 static void
4932 genX(emit_vf_topology)(struct brw_context *brw)
4933 {
4934 brw_batch_emit(brw, GENX(3DSTATE_VF_TOPOLOGY), vftopo) {
4935 vftopo.PrimitiveTopologyType = brw->primitive;
4936 }
4937 }
4938
4939 static const struct brw_tracked_state genX(vf_topology) = {
4940 .dirty = {
4941 .mesa = 0,
4942 .brw = BRW_NEW_BLORP |
4943 BRW_NEW_PRIMITIVE,
4944 },
4945 .emit = genX(emit_vf_topology),
4946 };
4947 #endif
4948
4949 /* ---------------------------------------------------------------------- */
4950
4951 #if GEN_GEN >= 7
4952 static void
4953 genX(emit_mi_report_perf_count)(struct brw_context *brw,
4954 struct brw_bo *bo,
4955 uint32_t offset_in_bytes,
4956 uint32_t report_id)
4957 {
4958 brw_batch_emit(brw, GENX(MI_REPORT_PERF_COUNT), mi_rpc) {
4959 mi_rpc.MemoryAddress = ggtt_bo(bo, offset_in_bytes);
4960 mi_rpc.ReportID = report_id;
4961 }
4962 }
4963 #endif
4964
4965 /* ---------------------------------------------------------------------- */
4966
4967 /**
4968 * Emit a 3DSTATE_SAMPLER_STATE_POINTERS_{VS,HS,GS,DS,PS} packet.
4969 */
4970 static void
4971 genX(emit_sampler_state_pointers_xs)(MAYBE_UNUSED struct brw_context *brw,
4972 MAYBE_UNUSED struct brw_stage_state *stage_state)
4973 {
4974 #if GEN_GEN >= 7
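   /* _3DCommandSubOpcode values for the
    * 3DSTATE_SAMPLER_STATE_POINTERS_{VS,HS,DS,GS,PS} packets; we emit the
    * VS packet and patch its sub-opcode below to address the desired stage.
    */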
4975 static const uint16_t packet_headers[] = {
4976 [MESA_SHADER_VERTEX] = 43,
4977 [MESA_SHADER_TESS_CTRL] = 44,
4978 [MESA_SHADER_TESS_EVAL] = 45,
4979 [MESA_SHADER_GEOMETRY] = 46,
4980 [MESA_SHADER_FRAGMENT] = 47,
4981 };
4982
4983 /* Ivybridge requires a workaround flush before VS packets. */
4984 if (GEN_GEN == 7 && !GEN_IS_HASWELL &&
4985 stage_state->stage == MESA_SHADER_VERTEX) {
4986 gen7_emit_vs_workaround_flush(brw);
4987 }
4988
4989 brw_batch_emit(brw, GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ptr) {
4990 ptr._3DCommandSubOpcode = packet_headers[stage_state->stage];
4991 ptr.PointertoVSSamplerState = stage_state->sampler_offset;
4992 }
4993 #endif
4994 }
4995
4996 UNUSED static bool
4997 has_component(mesa_format format, int i)
4998 {
4999 if (_mesa_is_format_color_format(format))
5000 return _mesa_format_has_color_component(format, i);
5001
5002 /* depth and stencil have only one component */
5003 return i == 0;
5004 }
5005
5006 /**
5007 * Upload SAMPLER_BORDER_COLOR_STATE.
5008 */
5009 static void
5010 genX(upload_default_color)(struct brw_context *brw,
5011 const struct gl_sampler_object *sampler,
5012 MAYBE_UNUSED mesa_format format, GLenum base_format,
5013 bool is_integer_format, bool is_stencil_sampling,
5014 uint32_t *sdc_offset)
5015 {
5016 union gl_color_union color;
5017
5018 switch (base_format) {
5019 case GL_DEPTH_COMPONENT:
5020       /* The GL spec says that the border color for depth textures is
5021        * taken from the R channel, while the hardware uses A.  Replicate
5022        * R into all the channels for safety.
5023 */
5024 color.ui[0] = sampler->BorderColor.ui[0];
5025 color.ui[1] = sampler->BorderColor.ui[0];
5026 color.ui[2] = sampler->BorderColor.ui[0];
5027 color.ui[3] = sampler->BorderColor.ui[0];
5028 break;
5029 case GL_ALPHA:
5030 color.ui[0] = 0u;
5031 color.ui[1] = 0u;
5032 color.ui[2] = 0u;
5033 color.ui[3] = sampler->BorderColor.ui[3];
5034 break;
5035 case GL_INTENSITY:
5036 color.ui[0] = sampler->BorderColor.ui[0];
5037 color.ui[1] = sampler->BorderColor.ui[0];
5038 color.ui[2] = sampler->BorderColor.ui[0];
5039 color.ui[3] = sampler->BorderColor.ui[0];
5040 break;
5041 case GL_LUMINANCE:
5042 color.ui[0] = sampler->BorderColor.ui[0];
5043 color.ui[1] = sampler->BorderColor.ui[0];
5044 color.ui[2] = sampler->BorderColor.ui[0];
5045 color.ui[3] = float_as_int(1.0);
5046 break;
5047 case GL_LUMINANCE_ALPHA:
5048 color.ui[0] = sampler->BorderColor.ui[0];
5049 color.ui[1] = sampler->BorderColor.ui[0];
5050 color.ui[2] = sampler->BorderColor.ui[0];
5051 color.ui[3] = sampler->BorderColor.ui[3];
5052 break;
5053 default:
5054 color.ui[0] = sampler->BorderColor.ui[0];
5055 color.ui[1] = sampler->BorderColor.ui[1];
5056 color.ui[2] = sampler->BorderColor.ui[2];
5057 color.ui[3] = sampler->BorderColor.ui[3];
5058 break;
5059 }
5060
5061 /* In some cases we use an RGBA surface format for GL RGB textures,
5062 * where we've initialized the A channel to 1.0. We also have to set
5063 * the border color alpha to 1.0 in that case.
5064 */
5065 if (base_format == GL_RGB)
5066 color.ui[3] = float_as_int(1.0);
5067
5068 int alignment = 32;
5069 if (GEN_GEN >= 8) {
5070 alignment = 64;
5071 } else if (GEN_IS_HASWELL && (is_integer_format || is_stencil_sampling)) {
5072 alignment = 512;
5073 }
5074
5075 uint32_t *sdc = brw_state_batch(
5076 brw, GENX(SAMPLER_BORDER_COLOR_STATE_length) * sizeof(uint32_t),
5077 alignment, sdc_offset);
5078
5079 struct GENX(SAMPLER_BORDER_COLOR_STATE) state = { 0 };
5080
5081 #define ASSIGN(dst, src) \
5082 do { \
5083 dst = src; \
5084 } while (0)
5085
5086 #define ASSIGNu16(dst, src) \
5087 do { \
5088 dst = (uint16_t)src; \
5089 } while (0)
5090
5091 #define ASSIGNu8(dst, src) \
5092 do { \
5093 dst = (uint8_t)src; \
5094 } while (0)
5095
5096 #define BORDER_COLOR_ATTR(macro, _color_type, src) \
5097 macro(state.BorderColor ## _color_type ## Red, src[0]); \
5098 macro(state.BorderColor ## _color_type ## Green, src[1]); \
5099 macro(state.BorderColor ## _color_type ## Blue, src[2]); \
5100 macro(state.BorderColor ## _color_type ## Alpha, src[3]);
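
/* The ASSIGN* helpers exist so BORDER_COLOR_ATTR can token-paste the
 * per-channel field names while accepting any converter with the uniform
 * shape macro(dst, src), including the UNCLAMPED_FLOAT_TO_* helpers used
 * in the Gen5/6 path below.
 */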
5101
5102 #if GEN_GEN >= 8
5103    /* On Gen8+, the border color is represented as four 32-bit floats,
5104 * integers, or unsigned values, interpreted according to the surface
5105 * format. This matches the sampler->BorderColor union exactly; just
5106 * memcpy the values.
5107 */
5108 BORDER_COLOR_ATTR(ASSIGN, 32bit, color.ui);
5109 #elif GEN_IS_HASWELL
5110 if (is_integer_format || is_stencil_sampling) {
5111 bool stencil = format == MESA_FORMAT_S_UINT8 || is_stencil_sampling;
5112 const int bits_per_channel =
5113 _mesa_get_format_bits(format, stencil ? GL_STENCIL_BITS : GL_RED_BITS);
5114
5115 /* From the Haswell PRM, "Command Reference: Structures", Page 36:
5116 * "If any color channel is missing from the surface format,
5117 * corresponding border color should be programmed as zero and if
5118 * alpha channel is missing, corresponding Alpha border color should
5119 * be programmed as 1."
5120 */
5121 unsigned c[4] = { 0, 0, 0, 1 };
5122 for (int i = 0; i < 4; i++) {
5123 if (has_component(format, i))
5124 c[i] = color.ui[i];
5125 }
5126
5127 switch (bits_per_channel) {
5128 case 8:
5129 /* Copy RGBA in order. */
5130 BORDER_COLOR_ATTR(ASSIGNu8, 8bit, c);
5131 break;
5132 case 10:
5133 /* R10G10B10A2_UINT is treated like a 16-bit format. */
5134 case 16:
5135 BORDER_COLOR_ATTR(ASSIGNu16, 16bit, c);
5136 break;
5137 case 32:
5138 if (base_format == GL_RG) {
5139 /* Careful inspection of the tables reveals that for RG32 formats,
5140 * the green channel needs to go where blue normally belongs.
5141 */
5142 state.BorderColor32bitRed = c[0];
5143 state.BorderColor32bitBlue = c[1];
5144 state.BorderColor32bitAlpha = 1;
5145 } else {
5146 /* Copy RGBA in order. */
5147 BORDER_COLOR_ATTR(ASSIGN, 32bit, c);
5148 }
5149 break;
5150 default:
5151 assert(!"Invalid number of bits per channel in integer format.");
5152 break;
5153 }
5154 } else {
5155 BORDER_COLOR_ATTR(ASSIGN, Float, color.f);
5156 }
5157 #elif GEN_GEN == 5 || GEN_GEN == 6
5158 BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_UBYTE, Unorm, color.f);
5159 BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_USHORT, Unorm16, color.f);
5160 BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_SHORT, Snorm16, color.f);
5161
5162 #define MESA_FLOAT_TO_HALF(dst, src) \
5163 dst = _mesa_float_to_half(src);
5164
5165 BORDER_COLOR_ATTR(MESA_FLOAT_TO_HALF, Float16, color.f);
5166
5167 #undef MESA_FLOAT_TO_HALF
5168
5169 state.BorderColorSnorm8Red = state.BorderColorSnorm16Red >> 8;
5170 state.BorderColorSnorm8Green = state.BorderColorSnorm16Green >> 8;
5171 state.BorderColorSnorm8Blue = state.BorderColorSnorm16Blue >> 8;
5172 state.BorderColorSnorm8Alpha = state.BorderColorSnorm16Alpha >> 8;
5173
5174 BORDER_COLOR_ATTR(ASSIGN, Float, color.f);
5175 #elif GEN_GEN == 4
5176 BORDER_COLOR_ATTR(ASSIGN, , color.f);
5177 #else
5178 BORDER_COLOR_ATTR(ASSIGN, Float, color.f);
5179 #endif
5180
5181 #undef ASSIGN
5182 #undef BORDER_COLOR_ATTR
5183
5184 GENX(SAMPLER_BORDER_COLOR_STATE_pack)(brw, sdc, &state);
5185 }
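
/* A worked example of the Haswell integer path above, assuming an RG32UI
 * texture with BorderColor.ui = {7, 9, x, x}: has_component() keeps only
 * R and G, giving c = {7, 9, 0, 1}; the RG32 special case then writes
 * Red = 7, Blue = 9 (green routed to the blue slot) and Alpha = 1.
 */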
5186
5187 static uint32_t
5188 translate_wrap_mode(GLenum wrap, MAYBE_UNUSED bool using_nearest)
5189 {
5190 switch (wrap) {
5191 case GL_REPEAT:
5192 return TCM_WRAP;
5193 case GL_CLAMP:
5194 #if GEN_GEN >= 8
5195 /* GL_CLAMP is the weird mode where coordinates are clamped to
5196 * [0.0, 1.0], so linear filtering of coordinates outside of
5197        * [0.0, 1.0] gives you half edge texel value and half border
5198 * color.
5199 *
5200 * Gen8+ supports this natively.
5201 */
5202 return TCM_HALF_BORDER;
5203 #else
5204 /* On Gen4-7.5, we clamp the coordinates in the fragment shader
5205        * and set clamp_border here, which gets the desired result.
5206 * We just use clamp(_to_edge) for nearest, because for nearest
5207 * clamping to 1.0 gives border color instead of the desired
5208 * edge texels.
5209 */
5210 if (using_nearest)
5211 return TCM_CLAMP;
5212 else
5213 return TCM_CLAMP_BORDER;
5214 #endif
5215 case GL_CLAMP_TO_EDGE:
5216 return TCM_CLAMP;
5217 case GL_CLAMP_TO_BORDER:
5218 return TCM_CLAMP_BORDER;
5219 case GL_MIRRORED_REPEAT:
5220 return TCM_MIRROR;
5221 case GL_MIRROR_CLAMP_TO_EDGE:
5222 return TCM_MIRROR_ONCE;
5223 default:
5224 return TCM_WRAP;
5225 }
5226 }
5227
5228 /**
5229 * Return true if the given wrap mode requires the border color to exist.
5230 */
5231 static bool
5232 wrap_mode_needs_border_color(unsigned wrap_mode)
5233 {
5234 #if GEN_GEN >= 8
5235 return wrap_mode == TCM_CLAMP_BORDER ||
5236 wrap_mode == TCM_HALF_BORDER;
5237 #else
5238 return wrap_mode == TCM_CLAMP_BORDER;
5239 #endif
5240 }
5241
5242 /**
5243  * Sets the sampler state for a single unit based on the given
5244  * texture object and sampler object.
5245 */
5246 static void
5247 genX(update_sampler_state)(struct brw_context *brw,
5248 GLenum target, bool tex_cube_map_seamless,
5249 GLfloat tex_unit_lod_bias,
5250 mesa_format format, GLenum base_format,
5251 const struct gl_texture_object *texObj,
5252 const struct gl_sampler_object *sampler,
5253 uint32_t *sampler_state)
5254 {
5255 struct GENX(SAMPLER_STATE) samp_st = { 0 };
5256
5257 /* Select min and mip filters. */
5258 switch (sampler->MinFilter) {
5259 case GL_NEAREST:
5260 samp_st.MinModeFilter = MAPFILTER_NEAREST;
5261 samp_st.MipModeFilter = MIPFILTER_NONE;
5262 break;
5263 case GL_LINEAR:
5264 samp_st.MinModeFilter = MAPFILTER_LINEAR;
5265 samp_st.MipModeFilter = MIPFILTER_NONE;
5266 break;
5267 case GL_NEAREST_MIPMAP_NEAREST:
5268 samp_st.MinModeFilter = MAPFILTER_NEAREST;
5269 samp_st.MipModeFilter = MIPFILTER_NEAREST;
5270 break;
5271 case GL_LINEAR_MIPMAP_NEAREST:
5272 samp_st.MinModeFilter = MAPFILTER_LINEAR;
5273 samp_st.MipModeFilter = MIPFILTER_NEAREST;
5274 break;
5275 case GL_NEAREST_MIPMAP_LINEAR:
5276 samp_st.MinModeFilter = MAPFILTER_NEAREST;
5277 samp_st.MipModeFilter = MIPFILTER_LINEAR;
5278 break;
5279 case GL_LINEAR_MIPMAP_LINEAR:
5280 samp_st.MinModeFilter = MAPFILTER_LINEAR;
5281 samp_st.MipModeFilter = MIPFILTER_LINEAR;
5282 break;
5283 default:
5284 unreachable("not reached");
5285 }
5286
5287 /* Select mag filter. */
5288 samp_st.MagModeFilter = sampler->MagFilter == GL_LINEAR ?
5289 MAPFILTER_LINEAR : MAPFILTER_NEAREST;
5290
5291 /* Enable anisotropic filtering if desired. */
5292 samp_st.MaximumAnisotropy = RATIO21;
5293
5294 if (sampler->MaxAnisotropy > 1.0f) {
5295 if (samp_st.MinModeFilter == MAPFILTER_LINEAR)
5296 samp_st.MinModeFilter = MAPFILTER_ANISOTROPIC;
5297 if (samp_st.MagModeFilter == MAPFILTER_LINEAR)
5298 samp_st.MagModeFilter = MAPFILTER_ANISOTROPIC;
5299
5300 if (sampler->MaxAnisotropy > 2.0f) {
5301 samp_st.MaximumAnisotropy =
5302 MIN2((sampler->MaxAnisotropy - 2) / 2, RATIO161);
5303 }
5304 }
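
   /* A worked example of the encoding above, assuming the genxml values
    * run RATIO21 = 0 through RATIO161 = 7 (i.e. ratio = 2 * (value + 1)):
    *    MaxAnisotropy = 16  ->  (16 - 2) / 2 = 7 = RATIO161 (16:1)
    *    MaxAnisotropy = 4   ->  (4 - 2) / 2  = 1 = RATIO41  (4:1)
    */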
5305
5306 /* Set address rounding bits if not using nearest filtering. */
5307 if (samp_st.MinModeFilter != MAPFILTER_NEAREST) {
5308 samp_st.UAddressMinFilterRoundingEnable = true;
5309 samp_st.VAddressMinFilterRoundingEnable = true;
5310 samp_st.RAddressMinFilterRoundingEnable = true;
5311 }
5312
5313 if (samp_st.MagModeFilter != MAPFILTER_NEAREST) {
5314 samp_st.UAddressMagFilterRoundingEnable = true;
5315 samp_st.VAddressMagFilterRoundingEnable = true;
5316 samp_st.RAddressMagFilterRoundingEnable = true;
5317 }
5318
5319 bool either_nearest =
5320 sampler->MinFilter == GL_NEAREST || sampler->MagFilter == GL_NEAREST;
5321 unsigned wrap_s = translate_wrap_mode(sampler->WrapS, either_nearest);
5322 unsigned wrap_t = translate_wrap_mode(sampler->WrapT, either_nearest);
5323 unsigned wrap_r = translate_wrap_mode(sampler->WrapR, either_nearest);
5324
5325 if (target == GL_TEXTURE_CUBE_MAP ||
5326 target == GL_TEXTURE_CUBE_MAP_ARRAY) {
5327 /* Cube maps must use the same wrap mode for all three coordinate
5328 * dimensions. Prior to Haswell, only CUBE and CLAMP are valid.
5329 *
5330 * Ivybridge and Baytrail seem to have problems with CUBE mode and
5331 * integer formats. Fall back to CLAMP for now.
5332 */
5333 if ((tex_cube_map_seamless || sampler->CubeMapSeamless) &&
5334 !(GEN_GEN == 7 && !GEN_IS_HASWELL && texObj->_IsIntegerFormat)) {
5335 wrap_s = TCM_CUBE;
5336 wrap_t = TCM_CUBE;
5337 wrap_r = TCM_CUBE;
5338 } else {
5339 wrap_s = TCM_CLAMP;
5340 wrap_t = TCM_CLAMP;
5341 wrap_r = TCM_CLAMP;
5342 }
5343 } else if (target == GL_TEXTURE_1D) {
5344       /* There's a bug in 1D texture sampling: it actually pays
5345        * attention to the wrap_t value, though it should not.
5346 * Override the wrap_t value here to GL_REPEAT to keep
5347 * any nonexistent border pixels from floating in.
5348 */
5349 wrap_t = TCM_WRAP;
5350 }
5351
5352 samp_st.TCXAddressControlMode = wrap_s;
5353 samp_st.TCYAddressControlMode = wrap_t;
5354 samp_st.TCZAddressControlMode = wrap_r;
5355
5356 samp_st.ShadowFunction =
5357 sampler->CompareMode == GL_COMPARE_R_TO_TEXTURE_ARB ?
5358 intel_translate_shadow_compare_func(sampler->CompareFunc) : 0;
5359
5360 #if GEN_GEN >= 7
5361    /* Select the anisotropic filtering algorithm. */
5362 samp_st.AnisotropicAlgorithm =
5363 samp_st.MinModeFilter == MAPFILTER_ANISOTROPIC ?
5364 EWAApproximation : LEGACY;
5365 #endif
5366
5367 #if GEN_GEN >= 6
5368 samp_st.NonnormalizedCoordinateEnable = target == GL_TEXTURE_RECTANGLE;
5369 #endif
5370
5371 const float hw_max_lod = GEN_GEN >= 7 ? 14 : 13;
5372 samp_st.MinLOD = CLAMP(sampler->MinLod, 0, hw_max_lod);
5373 samp_st.MaxLOD = CLAMP(sampler->MaxLod, 0, hw_max_lod);
5374 samp_st.TextureLODBias =
5375 CLAMP(tex_unit_lod_bias + sampler->LodBias, -16, 15);
5376
5377 #if GEN_GEN == 6
5378 samp_st.BaseMipLevel =
5379 CLAMP(texObj->MinLevel + texObj->BaseLevel, 0, hw_max_lod);
5380 samp_st.MinandMagStateNotEqual =
5381 samp_st.MinModeFilter != samp_st.MagModeFilter;
5382 #endif
5383
5384 /* Upload the border color if necessary. If not, just point it at
5385     * offset 0 (the start of the batch); the color should be ignored,
5386 * but that address won't fault in case something reads it anyway.
5387 */
5388 uint32_t border_color_offset = 0;
5389 if (wrap_mode_needs_border_color(wrap_s) ||
5390 wrap_mode_needs_border_color(wrap_t) ||
5391 wrap_mode_needs_border_color(wrap_r)) {
5392 genX(upload_default_color)(brw, sampler, format, base_format,
5393 texObj->_IsIntegerFormat,
5394 texObj->StencilSampling,
5395 &border_color_offset);
5396 }
5397 #if GEN_GEN < 6
5398 samp_st.BorderColorPointer =
5399 ro_bo(brw->batch.state.bo, border_color_offset);
5400 #else
5401 samp_st.BorderColorPointer = border_color_offset;
5402 #endif
5403
5404 #if GEN_GEN >= 8
5405 samp_st.LODPreClampMode = CLAMP_MODE_OGL;
5406 #else
5407 samp_st.LODPreClampEnable = true;
5408 #endif
5409
5410 GENX(SAMPLER_STATE_pack)(brw, sampler_state, &samp_st);
5411 }
5412
5413 static void
5414 update_sampler_state(struct brw_context *brw,
5415 int unit,
5416 uint32_t *sampler_state)
5417 {
5418 struct gl_context *ctx = &brw->ctx;
5419 const struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
5420 const struct gl_texture_object *texObj = texUnit->_Current;
5421 const struct gl_sampler_object *sampler = _mesa_get_samplerobj(ctx, unit);
5422
5423    /* Buffer textures don't use samplers at all. */
5424 if (texObj->Target == GL_TEXTURE_BUFFER)
5425 return;
5426
5427 struct gl_texture_image *firstImage = texObj->Image[0][texObj->BaseLevel];
5428 genX(update_sampler_state)(brw, texObj->Target,
5429 ctx->Texture.CubeMapSeamless,
5430 texUnit->LodBias,
5431 firstImage->TexFormat, firstImage->_BaseFormat,
5432 texObj, sampler,
5433 sampler_state);
5434 }
5435
5436 static void
5437 genX(upload_sampler_state_table)(struct brw_context *brw,
5438 struct gl_program *prog,
5439 struct brw_stage_state *stage_state)
5440 {
5441 struct gl_context *ctx = &brw->ctx;
5442 uint32_t sampler_count = stage_state->sampler_count;
5443
5444 GLbitfield SamplersUsed = prog->SamplersUsed;
5445
5446 if (sampler_count == 0)
5447 return;
5448
5449 /* SAMPLER_STATE is 4 DWords on all platforms. */
5450 const int dwords = GENX(SAMPLER_STATE_length);
5451 const int size_in_bytes = dwords * sizeof(uint32_t);
5452
5453 uint32_t *sampler_state = brw_state_batch(brw,
5454 sampler_count * size_in_bytes,
5455 32, &stage_state->sampler_offset);
5457
5458 for (unsigned s = 0; s < sampler_count; s++) {
5459 if (SamplersUsed & (1 << s)) {
5460 const unsigned unit = prog->SamplerUnits[s];
5461 if (ctx->Texture.Unit[unit]._Current) {
5462 update_sampler_state(brw, unit, sampler_state);
5463 }
5464 }
5465
5466 sampler_state += dwords;
5467 }
5468
5469 if (GEN_GEN >= 7 && stage_state->stage != MESA_SHADER_COMPUTE) {
5470 /* Emit a 3DSTATE_SAMPLER_STATE_POINTERS_XS packet. */
5471 genX(emit_sampler_state_pointers_xs)(brw, stage_state);
5472 } else {
5473 /* Flag that the sampler state table pointer has changed; later atoms
5474 * will handle it.
5475 */
5476 brw->ctx.NewDriverState |= BRW_NEW_SAMPLER_STATE_TABLE;
5477 }
5478 }
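
/* For reference, the resulting table layout: a tightly packed array of
 * 4-dword SAMPLER_STATE entries, so (using the names above) entry s for
 * a stage lives at
 *
 *    stage_state->sampler_offset +
 *    s * GENX(SAMPLER_STATE_length) * sizeof(uint32_t)
 *
 * within the batch state buffer.
 */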
5479
5480 static void
5481 genX(upload_fs_samplers)(struct brw_context *brw)
5482 {
5483 /* BRW_NEW_FRAGMENT_PROGRAM */
5484 struct gl_program *fs = brw->programs[MESA_SHADER_FRAGMENT];
5485 genX(upload_sampler_state_table)(brw, fs, &brw->wm.base);
5486 }
5487
5488 static const struct brw_tracked_state genX(fs_samplers) = {
5489 .dirty = {
5490 .mesa = _NEW_TEXTURE,
5491 .brw = BRW_NEW_BATCH |
5492 BRW_NEW_BLORP |
5493 BRW_NEW_FRAGMENT_PROGRAM,
5494 },
5495 .emit = genX(upload_fs_samplers),
5496 };
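
/* The remaining *_samplers atoms follow the same pattern: re-emit the
 * table whenever texture state changes (_NEW_TEXTURE), the batch is
 * reset (BRW_NEW_BATCH, since the table lives in batch state memory),
 * or the stage's program changes.
 */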
5497
5498 static void
5499 genX(upload_vs_samplers)(struct brw_context *brw)
5500 {
5501 /* BRW_NEW_VERTEX_PROGRAM */
5502 struct gl_program *vs = brw->programs[MESA_SHADER_VERTEX];
5503 genX(upload_sampler_state_table)(brw, vs, &brw->vs.base);
5504 }
5505
5506 static const struct brw_tracked_state genX(vs_samplers) = {
5507 .dirty = {
5508 .mesa = _NEW_TEXTURE,
5509 .brw = BRW_NEW_BATCH |
5510 BRW_NEW_BLORP |
5511 BRW_NEW_VERTEX_PROGRAM,
5512 },
5513 .emit = genX(upload_vs_samplers),
5514 };
5515
5516 #if GEN_GEN >= 6
5517 static void
5518 genX(upload_gs_samplers)(struct brw_context *brw)
5519 {
5520 /* BRW_NEW_GEOMETRY_PROGRAM */
5521 struct gl_program *gs = brw->programs[MESA_SHADER_GEOMETRY];
5522 if (!gs)
5523 return;
5524
5525 genX(upload_sampler_state_table)(brw, gs, &brw->gs.base);
5526 }
5527
5528
5529 static const struct brw_tracked_state genX(gs_samplers) = {
5530 .dirty = {
5531 .mesa = _NEW_TEXTURE,
5532 .brw = BRW_NEW_BATCH |
5533 BRW_NEW_BLORP |
5534 BRW_NEW_GEOMETRY_PROGRAM,
5535 },
5536 .emit = genX(upload_gs_samplers),
5537 };
5538 #endif
5539
5540 #if GEN_GEN >= 7
5541 static void
5542 genX(upload_tcs_samplers)(struct brw_context *brw)
5543 {
5544 /* BRW_NEW_TESS_PROGRAMS */
5545 struct gl_program *tcs = brw->programs[MESA_SHADER_TESS_CTRL];
5546 if (!tcs)
5547 return;
5548
5549 genX(upload_sampler_state_table)(brw, tcs, &brw->tcs.base);
5550 }
5551
5552 static const struct brw_tracked_state genX(tcs_samplers) = {
5553 .dirty = {
5554 .mesa = _NEW_TEXTURE,
5555 .brw = BRW_NEW_BATCH |
5556 BRW_NEW_BLORP |
5557 BRW_NEW_TESS_PROGRAMS,
5558 },
5559 .emit = genX(upload_tcs_samplers),
5560 };
5561 #endif
5562
5563 #if GEN_GEN >= 7
5564 static void
5565 genX(upload_tes_samplers)(struct brw_context *brw)
5566 {
5567 /* BRW_NEW_TESS_PROGRAMS */
5568 struct gl_program *tes = brw->programs[MESA_SHADER_TESS_EVAL];
5569 if (!tes)
5570 return;
5571
5572 genX(upload_sampler_state_table)(brw, tes, &brw->tes.base);
5573 }
5574
5575 static const struct brw_tracked_state genX(tes_samplers) = {
5576 .dirty = {
5577 .mesa = _NEW_TEXTURE,
5578 .brw = BRW_NEW_BATCH |
5579 BRW_NEW_BLORP |
5580 BRW_NEW_TESS_PROGRAMS,
5581 },
5582 .emit = genX(upload_tes_samplers),
5583 };
5584 #endif
5585
5586 #if GEN_GEN >= 7
5587 static void
5588 genX(upload_cs_samplers)(struct brw_context *brw)
5589 {
5590 /* BRW_NEW_COMPUTE_PROGRAM */
5591 struct gl_program *cs = brw->programs[MESA_SHADER_COMPUTE];
5592 if (!cs)
5593 return;
5594
5595 genX(upload_sampler_state_table)(brw, cs, &brw->cs.base);
5596 }
5597
5598 const struct brw_tracked_state genX(cs_samplers) = {
5599 .dirty = {
5600 .mesa = _NEW_TEXTURE,
5601 .brw = BRW_NEW_BATCH |
5602 BRW_NEW_BLORP |
5603 BRW_NEW_COMPUTE_PROGRAM,
5604 },
5605 .emit = genX(upload_cs_samplers),
5606 };
5607 #endif
5608
5609 /* ---------------------------------------------------------------------- */
5610
5611 #if GEN_GEN <= 5
5612
5613 static void genX(upload_blend_constant_color)(struct brw_context *brw)
5614 {
5615 struct gl_context *ctx = &brw->ctx;
5616
5617 brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_COLOR), blend_cc) {
5618 blend_cc.BlendConstantColorRed = ctx->Color.BlendColorUnclamped[0];
5619 blend_cc.BlendConstantColorGreen = ctx->Color.BlendColorUnclamped[1];
5620 blend_cc.BlendConstantColorBlue = ctx->Color.BlendColorUnclamped[2];
5621 blend_cc.BlendConstantColorAlpha = ctx->Color.BlendColorUnclamped[3];
5622 }
5623 }
5624
5625 static const struct brw_tracked_state genX(blend_constant_color) = {
5626 .dirty = {
5627 .mesa = _NEW_COLOR,
5628 .brw = BRW_NEW_CONTEXT |
5629 BRW_NEW_BLORP,
5630 },
5631 .emit = genX(upload_blend_constant_color)
5632 };
5633 #endif
5634
5635 /* ---------------------------------------------------------------------- */
5636
5637 void
5638 genX(init_atoms)(struct brw_context *brw)
5639 {
5640 #if GEN_GEN < 6
5641 static const struct brw_tracked_state *render_atoms[] =
5642 {
5643       /* Once all the programs are done, we know how large the URB
5644        * entries need to be and can decide if we need to change the
5645        * URB layout.
5646 */
5647 &brw_curbe_offsets,
5648 &brw_recalculate_urb_fence,
5649
5650 &genX(cc_vp),
5651 &genX(color_calc_state),
5652
5653 /* Surface state setup. Must come before the VS/WM unit. The binding
5654 * table upload must be last.
5655 */
5656 &brw_vs_pull_constants,
5657 &brw_wm_pull_constants,
5658 &brw_renderbuffer_surfaces,
5659 &brw_renderbuffer_read_surfaces,
5660 &brw_texture_surfaces,
5661 &brw_vs_binding_table,
5662 &brw_wm_binding_table,
5663
5664 &genX(fs_samplers),
5665 &genX(vs_samplers),
5666
5667 /* These set up state for brw_psp_urb_cbs */
5668 &genX(wm_state),
5669 &genX(sf_clip_viewport),
5670 &genX(sf_state),
5671 &genX(vs_state), /* always required, enabled or not */
5672 &genX(clip_state),
5673 &genX(gs_state),
5674
5675       /* Command packets: */
5677 &brw_binding_table_pointers,
5678 &genX(blend_constant_color),
5679
5680 &brw_depthbuffer,
5681
5682 &genX(polygon_stipple),
5683 &genX(polygon_stipple_offset),
5684
5685 &genX(line_stipple),
5686
5687 &brw_psp_urb_cbs,
5688
5689 &genX(drawing_rect),
5690 &brw_indices, /* must come before brw_vertices */
5691 &genX(index_buffer),
5692 &genX(vertices),
5693
5694 &brw_constant_buffer
5695 };
5696 #elif GEN_GEN == 6
5697 static const struct brw_tracked_state *render_atoms[] =
5698 {
5699 &genX(sf_clip_viewport),
5700
5701 /* Command packets: */
5702
5703 &genX(cc_vp),
5704
5705 &gen6_urb,
5706 &genX(blend_state), /* must do before cc unit */
5707 &genX(color_calc_state), /* must do before cc unit */
5708 &genX(depth_stencil_state), /* must do before cc unit */
5709
5710 &genX(vs_push_constants), /* Before vs_state */
5711 &genX(gs_push_constants), /* Before gs_state */
5712 &genX(wm_push_constants), /* Before wm_state */
5713
5714 /* Surface state setup. Must come before the VS/WM unit. The binding
5715 * table upload must be last.
5716 */
5717 &brw_vs_pull_constants,
5718 &brw_vs_ubo_surfaces,
5719 &brw_gs_pull_constants,
5720 &brw_gs_ubo_surfaces,
5721 &brw_wm_pull_constants,
5722 &brw_wm_ubo_surfaces,
5723 &gen6_renderbuffer_surfaces,
5724 &brw_renderbuffer_read_surfaces,
5725 &brw_texture_surfaces,
5726 &gen6_sol_surface,
5727 &brw_vs_binding_table,
5728 &gen6_gs_binding_table,
5729 &brw_wm_binding_table,
5730
5731 &genX(fs_samplers),
5732 &genX(vs_samplers),
5733 &genX(gs_samplers),
5734 &gen6_sampler_state,
5735 &genX(multisample_state),
5736
5737 &genX(vs_state),
5738 &genX(gs_state),
5739 &genX(clip_state),
5740 &genX(sf_state),
5741 &genX(wm_state),
5742
5743 &genX(scissor_state),
5744
5745 &gen6_binding_table_pointers,
5746
5747 &brw_depthbuffer,
5748
5749 &genX(polygon_stipple),
5750 &genX(polygon_stipple_offset),
5751
5752 &genX(line_stipple),
5753
5754 &genX(drawing_rect),
5755
5756 &brw_indices, /* must come before brw_vertices */
5757 &genX(index_buffer),
5758 &genX(vertices),
5759 };
5760 #elif GEN_GEN == 7
5761 static const struct brw_tracked_state *render_atoms[] =
5762 {
5763 /* Command packets: */
5764
5765 &genX(cc_vp),
5766 &genX(sf_clip_viewport),
5767
5768 &gen7_l3_state,
5769 &gen7_push_constant_space,
5770 &gen7_urb,
5771 &genX(blend_state), /* must do before cc unit */
5772 &genX(color_calc_state), /* must do before cc unit */
5773 &genX(depth_stencil_state), /* must do before cc unit */
5774
5775 &brw_vs_image_surfaces, /* Before vs push/pull constants and binding table */
5776 &brw_tcs_image_surfaces, /* Before tcs push/pull constants and binding table */
5777 &brw_tes_image_surfaces, /* Before tes push/pull constants and binding table */
5778 &brw_gs_image_surfaces, /* Before gs push/pull constants and binding table */
5779 &brw_wm_image_surfaces, /* Before wm push/pull constants and binding table */
5780
5781 &genX(vs_push_constants), /* Before vs_state */
5782 &genX(tcs_push_constants),
5783 &genX(tes_push_constants),
5784 &genX(gs_push_constants), /* Before gs_state */
5785 &genX(wm_push_constants), /* Before wm_surfaces and constant_buffer */
5786
5787 /* Surface state setup. Must come before the VS/WM unit. The binding
5788 * table upload must be last.
5789 */
5790 &brw_vs_pull_constants,
5791 &brw_vs_ubo_surfaces,
5792 &brw_tcs_pull_constants,
5793 &brw_tcs_ubo_surfaces,
5794 &brw_tes_pull_constants,
5795 &brw_tes_ubo_surfaces,
5796 &brw_gs_pull_constants,
5797 &brw_gs_ubo_surfaces,
5798 &brw_wm_pull_constants,
5799 &brw_wm_ubo_surfaces,
5800 &gen6_renderbuffer_surfaces,
5801 &brw_renderbuffer_read_surfaces,
5802 &brw_texture_surfaces,
5803
5804 &genX(push_constant_packets),
5805
5806 &brw_vs_binding_table,
5807 &brw_tcs_binding_table,
5808 &brw_tes_binding_table,
5809 &brw_gs_binding_table,
5810 &brw_wm_binding_table,
5811
5812 &genX(fs_samplers),
5813 &genX(vs_samplers),
5814 &genX(tcs_samplers),
5815 &genX(tes_samplers),
5816 &genX(gs_samplers),
5817 &genX(multisample_state),
5818
5819 &genX(vs_state),
5820 &genX(hs_state),
5821 &genX(te_state),
5822 &genX(ds_state),
5823 &genX(gs_state),
5824 &genX(sol_state),
5825 &genX(clip_state),
5826 &genX(sbe_state),
5827 &genX(sf_state),
5828 &genX(wm_state),
5829 &genX(ps_state),
5830
5831 &genX(scissor_state),
5832
5833 &brw_depthbuffer,
5834
5835 &genX(polygon_stipple),
5836 &genX(polygon_stipple_offset),
5837
5838 &genX(line_stipple),
5839
5840 &genX(drawing_rect),
5841
5842 &brw_indices, /* must come before brw_vertices */
5843 &genX(index_buffer),
5844 &genX(vertices),
5845
5846 #if GEN_IS_HASWELL
5847 &genX(cut_index),
5848 #endif
5849 };
5850 #elif GEN_GEN >= 8
5851 static const struct brw_tracked_state *render_atoms[] =
5852 {
5853 &genX(cc_vp),
5854 &genX(sf_clip_viewport),
5855
5856 &gen7_l3_state,
5857 &gen7_push_constant_space,
5858 &gen7_urb,
5859 &genX(blend_state),
5860 &genX(color_calc_state),
5861
5862 &brw_vs_image_surfaces, /* Before vs push/pull constants and binding table */
5863 &brw_tcs_image_surfaces, /* Before tcs push/pull constants and binding table */
5864 &brw_tes_image_surfaces, /* Before tes push/pull constants and binding table */
5865 &brw_gs_image_surfaces, /* Before gs push/pull constants and binding table */
5866 &brw_wm_image_surfaces, /* Before wm push/pull constants and binding table */
5867
5868 &genX(vs_push_constants), /* Before vs_state */
5869 &genX(tcs_push_constants),
5870 &genX(tes_push_constants),
5871 &genX(gs_push_constants), /* Before gs_state */
5872 &genX(wm_push_constants), /* Before wm_surfaces and constant_buffer */
5873
5874 /* Surface state setup. Must come before the VS/WM unit. The binding
5875 * table upload must be last.
5876 */
5877 &brw_vs_pull_constants,
5878 &brw_vs_ubo_surfaces,
5879 &brw_tcs_pull_constants,
5880 &brw_tcs_ubo_surfaces,
5881 &brw_tes_pull_constants,
5882 &brw_tes_ubo_surfaces,
5883 &brw_gs_pull_constants,
5884 &brw_gs_ubo_surfaces,
5885 &brw_wm_pull_constants,
5886 &brw_wm_ubo_surfaces,
5887 &gen6_renderbuffer_surfaces,
5888 &brw_renderbuffer_read_surfaces,
5889 &brw_texture_surfaces,
5890
5891 &genX(push_constant_packets),
5892
5893 &brw_vs_binding_table,
5894 &brw_tcs_binding_table,
5895 &brw_tes_binding_table,
5896 &brw_gs_binding_table,
5897 &brw_wm_binding_table,
5898
5899 &genX(fs_samplers),
5900 &genX(vs_samplers),
5901 &genX(tcs_samplers),
5902 &genX(tes_samplers),
5903 &genX(gs_samplers),
5904 &genX(multisample_state),
5905
5906 &genX(vs_state),
5907 &genX(hs_state),
5908 &genX(te_state),
5909 &genX(ds_state),
5910 &genX(gs_state),
5911 &genX(sol_state),
5912 &genX(clip_state),
5913 &genX(raster_state),
5914 &genX(sbe_state),
5915 &genX(sf_state),
5916 &genX(ps_blend),
5917 &genX(ps_extra),
5918 &genX(ps_state),
5919 &genX(depth_stencil_state),
5920 &genX(wm_state),
5921
5922 &genX(scissor_state),
5923
5924 &brw_depthbuffer,
5925
5926 &genX(polygon_stipple),
5927 &genX(polygon_stipple_offset),
5928
5929 &genX(line_stipple),
5930
5931 &genX(drawing_rect),
5932
5933 &genX(vf_topology),
5934
5935 &brw_indices,
5936 &genX(index_buffer),
5937 &genX(vertices),
5938
5939 &genX(cut_index),
5940 &gen8_pma_fix,
5941 };
5942 #endif
5943
5944 STATIC_ASSERT(ARRAY_SIZE(render_atoms) <= ARRAY_SIZE(brw->render_atoms));
5945 brw_copy_pipeline_atoms(brw, BRW_RENDER_PIPELINE,
5946 render_atoms, ARRAY_SIZE(render_atoms));
5947
5948 #if GEN_GEN >= 7
5949 static const struct brw_tracked_state *compute_atoms[] =
5950 {
5951 &gen7_l3_state,
5952 &brw_cs_image_surfaces,
5953 &genX(cs_push_constants),
5954 &genX(cs_pull_constants),
5955 &brw_cs_ubo_surfaces,
5956 &brw_cs_texture_surfaces,
5957 &brw_cs_work_groups_surface,
5958 &genX(cs_samplers),
5959 &genX(cs_state),
5960 };
5961
5962 STATIC_ASSERT(ARRAY_SIZE(compute_atoms) <= ARRAY_SIZE(brw->compute_atoms));
5963 brw_copy_pipeline_atoms(brw, BRW_COMPUTE_PIPELINE,
5964 compute_atoms, ARRAY_SIZE(compute_atoms));
5965
5966 brw->vtbl.emit_mi_report_perf_count = genX(emit_mi_report_perf_count);
5967 brw->vtbl.emit_compute_walker = genX(emit_gpgpu_walker);
5968 #endif
5969 }
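
/* A minimal sketch, not the actual upload loop (which lives in
 * brw_state_upload.c and also handles flushing and debug output), of how
 * the atom lists installed above are consumed: each atom's dirty bits
 * are intersected with the accumulated state flags, and .emit fires on
 * any overlap.  Parameter types are approximated.
 */
MAYBE_UNUSED static void
sketch_run_atoms(struct brw_context *brw,
                 const struct brw_tracked_state **atoms, int count,
                 uint32_t mesa_dirty, uint64_t brw_dirty)
{
   for (int i = 0; i < count; i++) {
      const struct brw_tracked_state *atom = atoms[i];

      if ((atom->dirty.mesa & mesa_dirty) || (atom->dirty.brw & brw_dirty))
         atom->emit(brw);
   }
}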