#include <assert.h>
-#include "common/gen_device_info.h"
+#include "dev/gen_device_info.h"
#include "common/gen_sample_positions.h"
#include "genxml/gen_macros.h"
+#include "common/gen_guardband.h"
#include "main/bufferobj.h"
#include "main/context.h"
#include "main/macros.h"
#include "main/state.h"
+#include "genX_boilerplate.h"
+
#include "brw_context.h"
#include "brw_draw.h"
#include "brw_multisample_state.h"
#include "main/viewport.h"
#include "util/half_float.h"
-UNUSED static void *
-emit_dwords(struct brw_context *brw, unsigned n)
-{
- intel_batchbuffer_begin(brw, n, RENDER_RING);
- uint32_t *map = brw->batch.map_next;
- brw->batch.map_next += n;
- intel_batchbuffer_advance(brw);
- return map;
-}
-
-struct brw_address {
- struct brw_bo *bo;
- unsigned reloc_flags;
- uint32_t offset;
-};
-
-#define __gen_address_type struct brw_address
-#define __gen_user_data struct brw_context
-
-static uint64_t
-__gen_combine_address(struct brw_context *brw, void *location,
- struct brw_address address, uint32_t delta)
-{
- struct intel_batchbuffer *batch = &brw->batch;
- uint32_t offset;
-
- if (address.bo == NULL) {
- return address.offset + delta;
- } else {
- if (GEN_GEN < 6 && brw_ptr_in_state_buffer(batch, location)) {
- offset = (char *) location - (char *) brw->batch.state.map;
- return brw_state_reloc(batch, offset, address.bo,
- address.offset + delta,
- address.reloc_flags);
- }
-
- assert(!brw_ptr_in_state_buffer(batch, location));
-
- offset = (char *) location - (char *) brw->batch.batch.map;
- return brw_batch_reloc(batch, offset, address.bo,
- address.offset + delta,
- address.reloc_flags);
- }
-}
-
-static struct brw_address
-rw_bo(struct brw_bo *bo, uint32_t offset)
-{
- return (struct brw_address) {
- .bo = bo,
- .offset = offset,
- .reloc_flags = RELOC_WRITE,
- };
-}
-
-static struct brw_address
-ro_bo(struct brw_bo *bo, uint32_t offset)
-{
- return (struct brw_address) {
- .bo = bo,
- .offset = offset,
- };
-}
-
-UNUSED static struct brw_address
-ggtt_bo(struct brw_bo *bo, uint32_t offset)
-{
- return (struct brw_address) {
- .bo = bo,
- .offset = offset,
- .reloc_flags = RELOC_WRITE | RELOC_NEEDS_GGTT,
- };
-}
-
#if GEN_GEN == 4
static struct brw_address
KSP(struct brw_context *brw, uint32_t offset)
}
#else
static uint32_t
-KSP(struct brw_context *brw, uint32_t offset)
+KSP(UNUSED struct brw_context *brw, uint32_t offset)
{
return offset;
}
#endif
-#include "genxml/genX_pack.h"
-
-#define _brw_cmd_length(cmd) cmd ## _length
-#define _brw_cmd_length_bias(cmd) cmd ## _length_bias
-#define _brw_cmd_header(cmd) cmd ## _header
-#define _brw_cmd_pack(cmd) cmd ## _pack
-
-#define brw_batch_emit(brw, cmd, name) \
- for (struct cmd name = { _brw_cmd_header(cmd) }, \
- *_dst = emit_dwords(brw, _brw_cmd_length(cmd)); \
- __builtin_expect(_dst != NULL, 1); \
- _brw_cmd_pack(cmd)(brw, (void *)_dst, &name), \
- _dst = NULL)
-
-#define brw_batch_emitn(brw, cmd, n, ...) ({ \
- uint32_t *_dw = emit_dwords(brw, n); \
- struct cmd template = { \
- _brw_cmd_header(cmd), \
- .DWordLength = n - _brw_cmd_length_bias(cmd), \
- __VA_ARGS__ \
- }; \
- _brw_cmd_pack(cmd)(brw, _dw, &template); \
- _dw + 1; /* Array starts at dw[1] */ \
- })
-
-#define brw_state_emit(brw, cmd, align, offset, name) \
- for (struct cmd name = {}, \
- *_dst = brw_state_batch(brw, _brw_cmd_length(cmd) * 4, \
- align, offset); \
- __builtin_expect(_dst != NULL, 1); \
- _brw_cmd_pack(cmd)(brw, (void *)_dst, &name), \
- _dst = NULL)
+#if GEN_GEN >= 7
+/* Emit MI_LOAD_REGISTER_MEM: load a 32-bit MMIO register from a buffer
+ * address.  The MI_LOAD_REGISTER_MEM command only exists on Gen7+,
+ * hence the guard above.
+ */
+static void
+emit_lrm(struct brw_context *brw, uint32_t reg, struct brw_address addr)
+{
+ brw_batch_emit(brw, GENX(MI_LOAD_REGISTER_MEM), lrm) {
+ lrm.RegisterAddress = reg;
+ lrm.MemoryAddress = addr;
+ }
+}
+#endif
+
+#if GEN_GEN == 7
+/* Emit MI_LOAD_REGISTER_IMM: load a 32-bit MMIO register with an
+ * immediate DWord.  NOTE(review): compiled for GEN_GEN == 7 only —
+ * presumably the callers in this file only need it there; confirm.
+ */
+static void
+emit_lri(struct brw_context *brw, uint32_t reg, uint32_t imm)
+{
+ brw_batch_emit(brw, GENX(MI_LOAD_REGISTER_IMM), lri) {
+ lri.RegisterOffset = reg;
+ lri.DataDWord = imm;
+ }
+}
+#endif
/**
* Polygon stipple packet
* to a FBO (i.e. any named frame buffer object), we *don't*
* need to invert - we already match the layout.
*/
- if (_mesa_is_winsys_fbo(ctx->DrawBuffer)) {
+ if (ctx->DrawBuffer->FlipY) {
for (unsigned i = 0; i < 32; i++)
poly.PatternRow[i] = ctx->PolygonStipple[31 - i]; /* invert */
} else {
* to a user-created FBO then our native pixel coordinate system
* works just fine, and there's no window system to worry about.
*/
- if (_mesa_is_winsys_fbo(ctx->DrawBuffer)) {
+ if (ctx->DrawBuffer->FlipY) {
poly.PolygonStippleYOffset =
(32 - (_mesa_geometric_height(ctx->DrawBuffer) & 31)) & 31;
}
unsigned buffer_nr,
struct brw_bo *bo,
unsigned start_offset,
- unsigned end_offset,
+ UNUSED unsigned end_offset,
unsigned stride,
- unsigned step_rate)
+ UNUSED unsigned step_rate)
{
struct GENX(VERTEX_BUFFER_STATE) buf_state = {
.VertexBufferIndex = buffer_nr,
.BufferPitch = stride,
- .BufferStartingAddress = ro_bo(bo, start_offset),
+
+ /* The VF cache designers apparently cut corners, and made the cache
+ * only consider the bottom 32 bits of memory addresses. If you happen
+ * to have two vertex buffers which get placed exactly 4 GiB apart and
+ * use them in back-to-back draw calls, you can get collisions. To work
+ * around this problem, we restrict vertex buffers to the low 32 bits of
+ * the address space.
+ */
+ .BufferStartingAddress = ro_32_bo(bo, start_offset),
#if GEN_GEN >= 8
.BufferSize = end_offset - start_offset,
#endif
#endif
#if GEN_GEN == 11
- .VertexBufferMOCS = ICL_MOCS_WB,
+ .MOCS = ICL_MOCS_WB,
#elif GEN_GEN == 10
- .VertexBufferMOCS = CNL_MOCS_WB,
+ .MOCS = CNL_MOCS_WB,
#elif GEN_GEN == 9
- .VertexBufferMOCS = SKL_MOCS_WB,
+ .MOCS = SKL_MOCS_WB,
#elif GEN_GEN == 8
- .VertexBufferMOCS = BDW_MOCS_WB,
+ .MOCS = BDW_MOCS_WB,
#elif GEN_GEN == 7
- .VertexBufferMOCS = GEN7_MOCS_L3,
+ .MOCS = GEN7_MOCS_L3,
#endif
};
}
}
+/* Return the high address bits (gtt_offset >> 32, truncated to 16 bits)
+ * of a softpinned BO, or 0 when the BO is not EXEC_OBJECT_PINNED (i.e.
+ * the kernel relocates it and the final address is unknown up front).
+ * Used to detect 4 GiB "bucket" transitions for the VF cache workaround
+ * below.
+ */
+static UNUSED uint16_t
+pinned_bo_high_bits(struct brw_bo *bo)
+{
+ return (bo->kflags & EXEC_OBJECT_PINNED) ? bo->gtt_offset >> 32ull : 0;
+}
+
+/* The VF cache designers apparently cut corners, and made the cache key's
+ * <VertexBufferIndex, Memory Address> tuple only consider the bottom 32 bits
+ * of the address. If you happen to have two vertex buffers which get placed
+ * exactly 4 GiB apart and use them in back-to-back draw calls, you can get
+ * collisions. (These collisions can happen within a single batch.)
+ *
+ * In the soft-pin world, we'd like to assign addresses up front, and never
+ * move buffers. So, we need to do a VF cache invalidate if the buffer for
+ * a particular VB slot has different [48:32] address bits than the last one.
+ *
+ * In the relocation world, we have no idea what the addresses will be, so
+ * we can't apply this workaround. Instead, we tell the kernel to move it
+ * to the low 4GB regardless.
+ *
+ * This HW issue is gone on Gen11+.
+ */
+static void
+vf_invalidate_for_vb_48bit_transitions(struct brw_context *brw)
+{
+#if GEN_GEN >= 8 && GEN_GEN < 11
+ bool need_invalidate = false;
+
+ /* Compare each bound VB slot's address[48:32] against the value seen
+ * last time; record the new value either way. */
+ for (unsigned i = 0; i < brw->vb.nr_buffers; i++) {
+ uint16_t high_bits = pinned_bo_high_bits(brw->vb.buffers[i].bo);
+
+ if (high_bits != brw->vb.last_bo_high_bits[i]) {
+ need_invalidate = true;
+ brw->vb.last_bo_high_bits[i] = high_bits;
+ }
+ }
+
+ /* The draw-parameters buffer occupies VB slot nr_buffers. */
+ if (brw->draw.draw_params_bo) {
+ uint16_t high_bits = pinned_bo_high_bits(brw->draw.draw_params_bo);
+
+ if (brw->vb.last_bo_high_bits[brw->vb.nr_buffers] != high_bits) {
+ need_invalidate = true;
+ brw->vb.last_bo_high_bits[brw->vb.nr_buffers] = high_bits;
+ }
+ }
+
+ /* The derived draw-parameters buffer occupies VB slot nr_buffers + 1. */
+ if (brw->draw.derived_draw_params_bo) {
+ uint16_t high_bits = pinned_bo_high_bits(brw->draw.derived_draw_params_bo);
+
+ if (brw->vb.last_bo_high_bits[brw->vb.nr_buffers + 1] != high_bits) {
+ need_invalidate = true;
+ brw->vb.last_bo_high_bits[brw->vb.nr_buffers + 1] = high_bits;
+ }
+ }
+
+ /* One combined invalidate covers every slot that changed. */
+ if (need_invalidate) {
+ brw_emit_pipe_control_flush(brw, PIPE_CONTROL_VF_CACHE_INVALIDATE | PIPE_CONTROL_CS_STALL);
+ }
+#endif
+}
+
+/* Index-buffer flavor of the VF cache 4 GiB-aliasing workaround: emit a
+ * VF cache invalidate when the index buffer's address[48:32] differs from
+ * the last index buffer emitted.  NOTE(review): unlike the VB variant,
+ * this is guarded by GEN_GEN >= 8 with no GEN_GEN < 11 upper bound —
+ * confirm whether Gen11+ still needs it here.
+ */
+static void
+vf_invalidate_for_ib_48bit_transition(struct brw_context *brw)
+{
+#if GEN_GEN >= 8
+ uint16_t high_bits = pinned_bo_high_bits(brw->ib.bo);
+
+ if (high_bits != brw->ib.last_bo_high_bits) {
+ brw_emit_pipe_control_flush(brw, PIPE_CONTROL_VF_CACHE_INVALIDATE);
+ brw->ib.last_bo_high_bits = high_bits;
+ }
+#endif
+}
+
static void
genX(emit_vertices)(struct brw_context *brw)
{
}
#endif
- const bool needs_sgvs_element = (vs_prog_data->uses_basevertex ||
- vs_prog_data->uses_baseinstance ||
+ const bool uses_draw_params =
+ vs_prog_data->uses_firstvertex ||
+ vs_prog_data->uses_baseinstance;
+
+ const bool uses_derived_draw_params =
+ vs_prog_data->uses_drawid ||
+ vs_prog_data->uses_is_indexed_draw;
+
+ const bool needs_sgvs_element = (uses_draw_params ||
vs_prog_data->uses_instanceid ||
vs_prog_data->uses_vertexid);
unsigned nr_elements =
- brw->vb.nr_enabled + needs_sgvs_element + vs_prog_data->uses_drawid;
+ brw->vb.nr_enabled + needs_sgvs_element + uses_derived_draw_params;
#if GEN_GEN < 8
/* If any of the formats of vb.enabled needs more that one upload, we need
*/
for (unsigned i = 0; i < brw->vb.nr_enabled; i++) {
struct brw_vertex_element *input = brw->vb.enabled[i];
- uint32_t format = brw_get_vertex_surface_type(brw, input->glarray);
+ const struct gl_array_attributes *glattrib = input->glattrib;
+ uint32_t format = brw_get_vertex_surface_type(brw, &glattrib->Format);
if (uploads_needed(format, input->is_dual_slot) > 1)
nr_elements++;
1 + GENX(VERTEX_ELEMENT_STATE_length));
struct GENX(VERTEX_ELEMENT_STATE) elem = {
.Valid = true,
- .SourceElementFormat = (enum GENX(SURFACE_FORMAT)) ISL_FORMAT_R32G32B32A32_FLOAT,
+ .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT,
.Component0Control = VFCOMP_STORE_0,
.Component1Control = VFCOMP_STORE_0,
.Component2Control = VFCOMP_STORE_0,
}
/* Now emit 3DSTATE_VERTEX_BUFFERS and 3DSTATE_VERTEX_ELEMENTS packets. */
- const bool uses_draw_params =
- vs_prog_data->uses_basevertex ||
- vs_prog_data->uses_baseinstance;
const unsigned nr_buffers = brw->vb.nr_buffers +
- uses_draw_params + vs_prog_data->uses_drawid;
+ uses_draw_params + uses_derived_draw_params;
+
+ vf_invalidate_for_vb_48bit_transitions(brw);
if (nr_buffers) {
assert(nr_buffers <= (GEN_GEN >= 6 ? 33 : 17));
0 /* step rate */);
}
- if (vs_prog_data->uses_drawid) {
+ if (uses_derived_draw_params) {
dw = genX(emit_vertex_buffer_state)(brw, dw, brw->vb.nr_buffers + 1,
- brw->draw.draw_id_bo,
- brw->draw.draw_id_offset,
- brw->draw.draw_id_bo->size,
+ brw->draw.derived_draw_params_bo,
+ brw->draw.derived_draw_params_offset,
+ brw->draw.derived_draw_params_bo->size,
0 /* stride */,
0 /* step rate */);
}
unsigned i;
for (i = 0; i < brw->vb.nr_enabled; i++) {
const struct brw_vertex_element *input = brw->vb.enabled[i];
- uint32_t format = brw_get_vertex_surface_type(brw, input->glarray);
+ const struct gl_array_attributes *glattrib = input->glattrib;
+ uint32_t format = brw_get_vertex_surface_type(brw, &glattrib->Format);
uint32_t comp0 = VFCOMP_STORE_SRC;
uint32_t comp1 = VFCOMP_STORE_SRC;
uint32_t comp2 = VFCOMP_STORE_SRC;
* entry. */
const unsigned offset = input->offset + c * 16;
+ const struct gl_array_attributes *glattrib = input->glattrib;
const int size = (GEN_GEN < 8 && is_passthru_format(format)) ?
- upload_format_size(upload_format) : input->glarray->Size;
+ upload_format_size(upload_format) : glattrib->Format.Size;
switch (size) {
case 0: comp0 = VFCOMP_STORE_0;
case 1: comp1 = VFCOMP_STORE_0;
case 2: comp2 = VFCOMP_STORE_0;
case 3:
- if (GEN_GEN >= 8 && input->glarray->Doubles) {
+ if (GEN_GEN >= 8 && glattrib->Format.Doubles) {
comp3 = VFCOMP_STORE_0;
- } else if (input->glarray->Integer) {
+ } else if (glattrib->Format.Integer) {
comp3 = VFCOMP_STORE_1_INT;
} else {
comp3 = VFCOMP_STORE_1_FP;
* to be specified as VFCOMP_STORE_0 in order to output a 256-bit
* vertex element."
*/
- if (input->glarray->Doubles && !input->is_dual_slot) {
+ if (glattrib->Format.Doubles && !input->is_dual_slot) {
/* Store vertex elements which correspond to double and dvec2 vertex
* shader inputs as 128-bit vertex elements, instead of 256-bits.
*/
};
#if GEN_GEN >= 8
- if (vs_prog_data->uses_basevertex ||
- vs_prog_data->uses_baseinstance) {
+ if (uses_draw_params) {
elem_state.VertexBufferIndex = brw->vb.nr_buffers;
- elem_state.SourceElementFormat = (enum GENX(SURFACE_FORMAT)) ISL_FORMAT_R32G32_UINT;
+ elem_state.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
elem_state.Component0Control = VFCOMP_STORE_SRC;
elem_state.Component1Control = VFCOMP_STORE_SRC;
}
#else
elem_state.VertexBufferIndex = brw->vb.nr_buffers;
- elem_state.SourceElementFormat = (enum GENX(SURFACE_FORMAT)) ISL_FORMAT_R32G32_UINT;
- if (vs_prog_data->uses_basevertex)
+ elem_state.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
+ if (uses_draw_params) {
elem_state.Component0Control = VFCOMP_STORE_SRC;
-
- if (vs_prog_data->uses_baseinstance)
elem_state.Component1Control = VFCOMP_STORE_SRC;
+ }
if (vs_prog_data->uses_vertexid)
elem_state.Component2Control = VFCOMP_STORE_VID;
dw += GENX(VERTEX_ELEMENT_STATE_length);
}
- if (vs_prog_data->uses_drawid) {
+ if (uses_derived_draw_params) {
struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
.Valid = true,
.VertexBufferIndex = brw->vb.nr_buffers + 1,
- .SourceElementFormat = (enum GENX(SURFACE_FORMAT)) ISL_FORMAT_R32_UINT,
+ .SourceElementFormat = ISL_FORMAT_R32G32_UINT,
.Component0Control = VFCOMP_STORE_SRC,
- .Component1Control = VFCOMP_STORE_0,
+ .Component1Control = VFCOMP_STORE_SRC,
.Component2Control = VFCOMP_STORE_0,
.Component3Control = VFCOMP_STORE_0,
#if GEN_GEN < 5
#if GEN_GEN >= 6
if (gen6_edgeflag_input) {
- const uint32_t format =
- brw_get_vertex_surface_type(brw, gen6_edgeflag_input->glarray);
+ const struct gl_array_attributes *glattrib = gen6_edgeflag_input->glattrib;
+ const uint32_t format = brw_get_vertex_surface_type(brw, &glattrib->Format);
struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
.Valid = true,
.mesa = _NEW_POLYGON,
.brw = BRW_NEW_BATCH |
BRW_NEW_BLORP |
+ BRW_NEW_VERTEX_PROGRAM |
BRW_NEW_VERTICES |
BRW_NEW_VS_PROG_DATA,
},
if (index_buffer == NULL)
return;
+ vf_invalidate_for_ib_48bit_transition(brw);
+
brw_batch_emit(brw, GENX(3DSTATE_INDEX_BUFFER), ib) {
#if GEN_GEN < 8 && !GEN_IS_HASWELL
- ib.CutIndexEnable = brw->prim_restart.enable_cut_index;
+ assert(brw->ib.enable_cut_index == brw->prim_restart.enable_cut_index);
+ ib.CutIndexEnable = brw->ib.enable_cut_index;
#endif
ib.IndexFormat = brw_get_index_type(index_buffer->index_size);
- ib.BufferStartingAddress = ro_bo(brw->ib.bo, 0);
+
+ /* The VF cache designers apparently cut corners, and made the cache
+ * only consider the bottom 32 bits of memory addresses. If you happen
+ * to have two index buffers which get placed exactly 4 GiB apart and
+ * use them in back-to-back draw calls, you can get collisions. To work
+ * around this problem, we restrict index buffers to the low 32 bits of
+ * the address space.
+ */
+ ib.BufferStartingAddress = ro_32_bo(brw->ib.bo, 0);
#if GEN_GEN >= 8
- ib.IndexBufferMOCS = GEN_GEN >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
+ ib.MOCS = GEN_GEN >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
ib.BufferSize = brw->ib.size;
#else
ib.BufferEndingAddress = ro_bo(brw->ib.bo, brw->ib.size - 1);
};
#endif
+/* Enable vertex-fetch statistics gathering via 3DSTATE_VF_STATISTICS.
+ * Re-emitted on new batches/contexts (see the dirty bits below).
+ */
+static void
+genX(upload_vf_statistics)(struct brw_context *brw)
+{
+ brw_batch_emit(brw, GENX(3DSTATE_VF_STATISTICS), vf) {
+ vf.StatisticsEnable = true;
+ }
+}
+
+/* Atom: re-emit VF statistics state whenever BLORP runs or a new
+ * context/batch invalidates pipeline state; no GL state dependency. */
+const struct brw_tracked_state genX(vf_statistics) = {
+ .dirty = {
+ .mesa = 0,
+ .brw = BRW_NEW_BLORP | BRW_NEW_CONTEXT,
+ },
+ .emit = genX(upload_vf_statistics),
+};
+
#if GEN_GEN >= 6
/**
* Determine the appropriate attribute override value to store into the
clip.ScreenSpaceViewportYMax = 1;
clip.ViewportXYClipTestEnable = true;
- clip.ViewportZClipTestEnable = !ctx->Transform.DepthClamp;
+ clip.ViewportZClipTestEnable = !(ctx->Transform.DepthClampNear &&
+ ctx->Transform.DepthClampFar);
/* _NEW_TRANSFORM */
if (GEN_GEN == 5 || GEN_IS_G4X) {
#endif
#if GEN_GEN == 7
- clip.FrontWinding = brw->polygon_front_bit == _mesa_is_user_fbo(fb);
+ clip.FrontWinding = brw->polygon_front_bit != fb->FlipY;
if (ctx->Polygon.CullFlag) {
switch (ctx->Polygon.CullFaceMode) {
clip.UserClipDistanceCullTestEnableBitmask =
brw_vue_prog_data(brw->vs.base.prog_data)->cull_distance_mask;
- clip.ViewportZClipTestEnable = !ctx->Transform.DepthClamp;
+ clip.ViewportZClipTestEnable = !(ctx->Transform.DepthClampNear &&
+ ctx->Transform.DepthClampFar);
#endif
/* _NEW_LIGHT */
#if GEN_GEN <= 7
/* _NEW_BUFFERS */
- bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
+ bool flip_y = ctx->DrawBuffer->FlipY;
UNUSED const bool multisampled_fbo =
_mesa_geometric_samples(ctx->DrawBuffer) > 1;
#endif
#if GEN_GEN <= 7
/* _NEW_POLYGON */
- sf.FrontWinding = brw->polygon_front_bit == render_to_fbo;
+ sf.FrontWinding = brw->polygon_front_bit != flip_y;
#if GEN_GEN >= 6
sf.GlobalDepthOffsetEnableSolid = ctx->Polygon.OffsetFill;
sf.GlobalDepthOffsetEnableWireframe = ctx->Polygon.OffsetLine;
if (ctx->Line.SmoothFlag) {
sf.LineEndCapAntialiasingRegionWidth = _10pixels;
#if GEN_GEN <= 7
- sf.AntiAliasingEnable = true;
+ sf.AntialiasingEnable = true;
#endif
}
* Window coordinates in an FBO are inverted, which means point
* sprite origin must be inverted, too.
*/
- if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) != render_to_fbo) {
+ if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) == flip_y) {
sf.PointSpriteTextureCoordinateOrigin = LOWERLEFT;
} else {
sf.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
/* Pointer to the WM constant buffer. Covered by the set of
* state flags from gen6_upload_wm_push_constants.
*/
- wmcp.PointertoPSConstantBuffer0 = stage_state->push_const_offset;
- wmcp.PSConstantBuffer0ReadLength = stage_state->push_const_size - 1;
+ wmcp.ConstantBody.PointertoConstantBuffer0 = stage_state->push_const_offset;
+ wmcp.ConstantBody.ConstantBuffer0ReadLength = stage_state->push_const_size - 1;
}
}
#endif
#if GEN_GEN >= 6
brw_batch_emit(brw, GENX(3DSTATE_WM), wm) {
- wm.LineAntialiasingRegionWidth = _10pixels;
- wm.LineEndCapAntialiasingRegionWidth = _05pixels;
-
- wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
- wm.BarycentricInterpolationMode = wm_prog_data->barycentric_interp_modes;
#else
ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
brw_state_emit(brw, GENX(WM_STATE), 64, &stage_state->state_offset, wm) {
- if (wm_prog_data->dispatch_8 && wm_prog_data->dispatch_16) {
- /* These two fields should be the same pre-gen6, which is why we
- * only have one hardware field to program for both dispatch
- * widths.
- */
- assert(wm_prog_data->base.dispatch_grf_start_reg ==
- wm_prog_data->dispatch_grf_start_reg_2);
+#endif
+
+#if GEN_GEN <= 6
+ wm._8PixelDispatchEnable = wm_prog_data->dispatch_8;
+ wm._16PixelDispatchEnable = wm_prog_data->dispatch_16;
+ wm._32PixelDispatchEnable = wm_prog_data->dispatch_32;
+#endif
+
+#if GEN_GEN == 4
+ /* On gen4, we only have one shader kernel */
+ if (brw_wm_state_has_ksp(wm, 0)) {
+ assert(brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0) == 0);
+ wm.KernelStartPointer0 = KSP(brw, stage_state->prog_offset);
+ wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 0);
+ wm.DispatchGRFStartRegisterForConstantSetupData0 =
+ brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 0);
}
+#elif GEN_GEN == 5
+ /* On gen5, we have multiple shader kernels but only one GRF start
+ * register for all kernels
+ */
+ wm.KernelStartPointer0 = stage_state->prog_offset +
+ brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0);
+ wm.KernelStartPointer1 = stage_state->prog_offset +
+ brw_wm_prog_data_prog_offset(wm_prog_data, wm, 1);
+ wm.KernelStartPointer2 = stage_state->prog_offset +
+ brw_wm_prog_data_prog_offset(wm_prog_data, wm, 2);
- if (wm_prog_data->dispatch_8 || wm_prog_data->dispatch_16)
- wm.GRFRegisterCount0 = wm_prog_data->reg_blocks_0;
+ wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 0);
+ wm.GRFRegisterCount1 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 1);
+ wm.GRFRegisterCount2 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 2);
- if (stage_state->sampler_count)
- wm.SamplerStatePointer =
- ro_bo(brw->batch.state.bo, stage_state->sampler_offset);
-#if GEN_GEN == 5
- if (wm_prog_data->prog_offset_2)
- wm.GRFRegisterCount2 = wm_prog_data->reg_blocks_2;
+ wm.DispatchGRFStartRegisterForConstantSetupData0 =
+ wm_prog_data->base.dispatch_grf_start_reg;
+
+ /* Dispatch GRF Start should be the same for all shaders on gen5 */
+ if (brw_wm_state_has_ksp(wm, 1)) {
+ assert(wm_prog_data->base.dispatch_grf_start_reg ==
+ brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 1));
+ }
+ if (brw_wm_state_has_ksp(wm, 2)) {
+ assert(wm_prog_data->base.dispatch_grf_start_reg ==
+ brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 2));
+ }
+#elif GEN_GEN == 6
+ /* On gen6, we have multiple shader kernels and we no longer specify a
+ * register count for each one.
+ */
+ wm.KernelStartPointer0 = stage_state->prog_offset +
+ brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0);
+ wm.KernelStartPointer1 = stage_state->prog_offset +
+ brw_wm_prog_data_prog_offset(wm_prog_data, wm, 1);
+ wm.KernelStartPointer2 = stage_state->prog_offset +
+ brw_wm_prog_data_prog_offset(wm_prog_data, wm, 2);
+
+ wm.DispatchGRFStartRegisterForConstantSetupData0 =
+ brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 0);
+ wm.DispatchGRFStartRegisterForConstantSetupData1 =
+ brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 1);
+ wm.DispatchGRFStartRegisterForConstantSetupData2 =
+ brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 2);
#endif
- wm.SetupURBEntryReadLength = wm_prog_data->num_varying_inputs * 2;
+#if GEN_GEN <= 5
wm.ConstantURBEntryReadLength = wm_prog_data->base.curb_read_length;
/* BRW_NEW_PUSH_CONSTANT_ALLOCATION */
wm.ConstantURBEntryReadOffset = brw->curbe.wm_start * 2;
+ wm.SetupURBEntryReadLength = wm_prog_data->num_varying_inputs * 2;
+ wm.SetupURBEntryReadOffset = 0;
wm.EarlyDepthTestEnable = true;
+#endif
+
+#if GEN_GEN >= 6
+ wm.LineAntialiasingRegionWidth = _10pixels;
+ wm.LineEndCapAntialiasingRegionWidth = _05pixels;
+
+ wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
+ wm.BarycentricInterpolationMode = wm_prog_data->barycentric_interp_modes;
+#else
+ if (stage_state->sampler_count)
+ wm.SamplerStatePointer =
+ ro_bo(brw->batch.state.bo, stage_state->sampler_offset);
+
wm.LineAntialiasingRegionWidth = _05pixels;
wm.LineEndCapAntialiasingRegionWidth = _10pixels;
wm.BindingTableEntryCount =
wm_prog_data->base.binding_table.size_bytes / 4;
wm.MaximumNumberofThreads = devinfo->max_wm_threads - 1;
- wm._8PixelDispatchEnable = wm_prog_data->dispatch_8;
- wm._16PixelDispatchEnable = wm_prog_data->dispatch_16;
- wm.DispatchGRFStartRegisterForConstantSetupData0 =
- wm_prog_data->base.dispatch_grf_start_reg;
- if (GEN_GEN == 6 ||
- wm_prog_data->dispatch_8 || wm_prog_data->dispatch_16) {
- wm.KernelStartPointer0 = KSP(brw, stage_state->prog_offset);
- }
-
-#if GEN_GEN >= 5
- if (GEN_GEN == 6 || wm_prog_data->prog_offset_2) {
- wm.KernelStartPointer2 =
- KSP(brw, stage_state->prog_offset + wm_prog_data->prog_offset_2);
- }
-#endif
#if GEN_GEN == 6
wm.DualSourceBlendEnable =
wm.PositionXYOffsetSelect = POSOFFSET_SAMPLE;
else
wm.PositionXYOffsetSelect = POSOFFSET_NONE;
-
- wm.DispatchGRFStartRegisterForConstantSetupData2 =
- wm_prog_data->dispatch_grf_start_reg_2;
#endif
if (wm_prog_data->base.total_scratch) {
- wm.ScratchSpaceBasePointer = rw_bo(stage_state->scratch_bo, 0);
+ wm.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0);
wm.PerThreadScratchSpace =
ffs(stage_state->per_thread_scratch) - 11;
}
/* ---------------------------------------------------------------------- */
+/* We restrict scratch buffers to the bottom 32 bits of the address space
+ * by using rw_32_bo().
+ *
+ * General State Base Address is a bit broken. If the address + size as
+ * seen by STATE_BASE_ADDRESS overflows 48 bits, the GPU appears to treat
+ * all accesses to the buffer as being out of bounds and returns zero.
+ */
+
#define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix) \
pkt.KernelStartPointer = KSP(brw, stage_state->prog_offset); \
+ /* WA_1606682166 */ \
pkt.SamplerCount = \
+ GEN_GEN == 11 ? \
+ 0 : \
DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4); \
+ /* Gen 11 workarounds table #2056 WABTPPrefetchDisable suggests to \
+ * disable prefetching of binding tables in A0 and B0 steppings. \
+ * TODO: Revisit this WA on C0 stepping. \
+ */ \
pkt.BindingTableEntryCount = \
+ GEN_GEN == 11 ? \
+ 0 : \
stage_prog_data->binding_table.size_bytes / 4; \
pkt.FloatingPointMode = stage_prog_data->use_alt_mode; \
\
if (stage_prog_data->total_scratch) { \
- pkt.ScratchSpaceBasePointer = rw_bo(stage_state->scratch_bo, 0); \
+ pkt.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0); \
pkt.PerThreadScratchSpace = \
ffs(stage_state->per_thread_scratch) - 11; \
} \
brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_VS), cvs) {
if (stage_state->push_const_size != 0) {
cvs.Buffer0Valid = true;
- cvs.PointertoVSConstantBuffer0 = stage_state->push_const_offset;
- cvs.VSConstantBuffer0ReadLength = stage_state->push_const_size - 1;
+ cvs.ConstantBody.PointertoConstantBuffer0 = stage_state->push_const_offset;
+ cvs.ConstantBody.ConstantBuffer0ReadLength = stage_state->push_const_size - 1;
}
}
#endif
for (unsigned i = 0; i < viewport_count; i++) {
/* _NEW_VIEWPORT | _NEW_TRANSFORM */
const struct gl_viewport_attrib *vp = &ctx->ViewportArray[i];
- if (ctx->Transform.DepthClamp) {
+ if (ctx->Transform.DepthClampNear && ctx->Transform.DepthClampFar) {
+ ccv.MinimumDepth = MIN2(vp->Near, vp->Far);
+ ccv.MaximumDepth = MAX2(vp->Near, vp->Far);
+ } else if (ctx->Transform.DepthClampNear) {
ccv.MinimumDepth = MIN2(vp->Near, vp->Far);
+ ccv.MaximumDepth = 0.0;
+ } else if (ctx->Transform.DepthClampFar) {
+ ccv.MinimumDepth = 0.0;
ccv.MaximumDepth = MAX2(vp->Near, vp->Far);
} else {
ccv.MinimumDepth = 0.0;
static void
set_scissor_bits(const struct gl_context *ctx, int i,
- bool render_to_fbo, unsigned fb_width, unsigned fb_height,
+ bool flip_y, unsigned fb_width, unsigned fb_height,
struct GENX(SCISSOR_RECT) *sc)
{
int bbox[4];
bbox[0] = MAX2(ctx->ViewportArray[i].X, 0);
bbox[1] = MIN2(bbox[0] + ctx->ViewportArray[i].Width, fb_width);
- bbox[2] = MAX2(ctx->ViewportArray[i].Y, 0);
+ bbox[2] = CLAMP(ctx->ViewportArray[i].Y, 0, fb_height);
bbox[3] = MIN2(bbox[2] + ctx->ViewportArray[i].Height, fb_height);
_mesa_intersect_scissor_bounding_box(ctx, i, bbox);
sc->ScissorRectangleXMax = 0;
sc->ScissorRectangleYMin = 1;
sc->ScissorRectangleYMax = 0;
- } else if (render_to_fbo) {
+ } else if (!flip_y) {
/* texmemory: Y=0=bottom */
sc->ScissorRectangleXMin = bbox[0];
sc->ScissorRectangleXMax = bbox[1] - 1;
genX(upload_scissor_state)(struct brw_context *brw)
{
struct gl_context *ctx = &brw->ctx;
- const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
+ const bool flip_y = ctx->DrawBuffer->FlipY;
struct GENX(SCISSOR_RECT) scissor;
uint32_t scissor_state_offset;
const unsigned int fb_width = _mesa_geometric_width(ctx->DrawBuffer);
* inclusive but max is exclusive.
*/
for (unsigned i = 0; i < viewport_count; i++) {
- set_scissor_bits(ctx, i, render_to_fbo, fb_width, fb_height, &scissor);
+ set_scissor_bits(ctx, i, flip_y, fb_width, fb_height, &scissor);
GENX(SCISSOR_RECT_pack)(
NULL, scissor_map + i * GENX(SCISSOR_RECT_length), &scissor);
}
/* ---------------------------------------------------------------------- */
-static void
-brw_calculate_guardband_size(uint32_t fb_width, uint32_t fb_height,
- float m00, float m11, float m30, float m31,
- float *xmin, float *xmax,
- float *ymin, float *ymax)
-{
- /* According to the "Vertex X,Y Clamping and Quantization" section of the
- * Strips and Fans documentation:
- *
- * "The vertex X and Y screen-space coordinates are also /clamped/ to the
- * fixed-point "guardband" range supported by the rasterization hardware"
- *
- * and
- *
- * "In almost all circumstances, if an object’s vertices are actually
- * modified by this clamping (i.e., had X or Y coordinates outside of
- * the guardband extent the rendered object will not match the intended
- * result. Therefore software should take steps to ensure that this does
- * not happen - e.g., by clipping objects such that they do not exceed
- * these limits after the Drawing Rectangle is applied."
- *
- * I believe the fundamental restriction is that the rasterizer (in
- * the SF/WM stages) have a limit on the number of pixels that can be
- * rasterized. We need to ensure any coordinates beyond the rasterizer
- * limit are handled by the clipper. So effectively that limit becomes
- * the clipper's guardband size.
- *
- * It goes on to say:
- *
- * "In addition, in order to be correctly rendered, objects must have a
- * screenspace bounding box not exceeding 8K in the X or Y direction.
- * This additional restriction must also be comprehended by software,
- * i.e., enforced by use of clipping."
- *
- * This makes no sense. Gen7+ hardware supports 16K render targets,
- * and you definitely need to be able to draw polygons that fill the
- * surface. Our assumption is that the rasterizer was limited to 8K
- * on Sandybridge, which only supports 8K surfaces, and it was actually
- * increased to 16K on Ivybridge and later.
- *
- * So, limit the guardband to 16K on Gen7+ and 8K on Sandybridge.
- */
- const float gb_size = GEN_GEN >= 7 ? 16384.0f : 8192.0f;
-
- if (m00 != 0 && m11 != 0) {
- /* First, we compute the screen-space render area */
- const float ss_ra_xmin = MIN3( 0, m30 + m00, m30 - m00);
- const float ss_ra_xmax = MAX3( fb_width, m30 + m00, m30 - m00);
- const float ss_ra_ymin = MIN3( 0, m31 + m11, m31 - m11);
- const float ss_ra_ymax = MAX3(fb_height, m31 + m11, m31 - m11);
-
- /* We want the guardband to be centered on that */
- const float ss_gb_xmin = (ss_ra_xmin + ss_ra_xmax) / 2 - gb_size;
- const float ss_gb_xmax = (ss_ra_xmin + ss_ra_xmax) / 2 + gb_size;
- const float ss_gb_ymin = (ss_ra_ymin + ss_ra_ymax) / 2 - gb_size;
- const float ss_gb_ymax = (ss_ra_ymin + ss_ra_ymax) / 2 + gb_size;
-
- /* Now we need it in native device coordinates */
- const float ndc_gb_xmin = (ss_gb_xmin - m30) / m00;
- const float ndc_gb_xmax = (ss_gb_xmax - m30) / m00;
- const float ndc_gb_ymin = (ss_gb_ymin - m31) / m11;
- const float ndc_gb_ymax = (ss_gb_ymax - m31) / m11;
-
- /* Thanks to Y-flipping and ORIGIN_UPPER_LEFT, the Y coordinates may be
- * flipped upside-down. X should be fine though.
- */
- assert(ndc_gb_xmin <= ndc_gb_xmax);
- *xmin = ndc_gb_xmin;
- *xmax = ndc_gb_xmax;
- *ymin = MIN2(ndc_gb_ymin, ndc_gb_ymax);
- *ymax = MAX2(ndc_gb_ymin, ndc_gb_ymax);
- } else {
- /* The viewport scales to 0, so nothing will be rendered. */
- *xmin = 0.0f;
- *xmax = 0.0f;
- *ymin = 0.0f;
- *ymax = 0.0f;
- }
-}
-
static void
genX(upload_sf_clip_viewport)(struct brw_context *brw)
{
const unsigned viewport_count = brw->clip.viewport_count;
/* _NEW_BUFFERS */
- const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
+ const bool flip_y = ctx->DrawBuffer->FlipY;
const uint32_t fb_width = (float)_mesa_geometric_width(ctx->DrawBuffer);
const uint32_t fb_height = (float)_mesa_geometric_height(ctx->DrawBuffer);
#endif
/* _NEW_BUFFERS */
- if (render_to_fbo) {
- y_scale = 1.0;
- y_bias = 0;
- } else {
+ if (flip_y) {
y_scale = -1.0;
y_bias = (float)fb_height;
+ } else {
+ y_scale = 1.0;
+ y_bias = 0;
}
for (unsigned i = 0; i < brw->clip.viewport_count; i++) {
sfv.ViewportMatrixElementm30 = translate[0],
sfv.ViewportMatrixElementm31 = translate[1] * y_scale + y_bias,
sfv.ViewportMatrixElementm32 = translate[2],
- brw_calculate_guardband_size(fb_width, fb_height,
+ gen_calculate_guardband_size(fb_width, fb_height,
sfv.ViewportMatrixElementm00,
sfv.ViewportMatrixElementm11,
sfv.ViewportMatrixElementm30,
clv.YMaxClipGuardband = gb_ymax;
#if GEN_GEN < 6
- set_scissor_bits(ctx, i, render_to_fbo, fb_width, fb_height,
+ set_scissor_bits(ctx, i, flip_y, fb_width, fb_height,
&sfv.ScissorRectangle);
#elif GEN_GEN >= 8
/* _NEW_VIEWPORT | _NEW_BUFFERS: Screen Space Viewport
const float viewport_Ymax =
MIN2(ctx->ViewportArray[i].Y + ctx->ViewportArray[i].Height, fb_height);
- if (render_to_fbo) {
+ if (flip_y) {
sfv.XMinViewPort = viewport_Xmin;
sfv.XMaxViewPort = viewport_Xmax - 1;
- sfv.YMinViewPort = viewport_Ymin;
- sfv.YMaxViewPort = viewport_Ymax - 1;
+ sfv.YMinViewPort = fb_height - viewport_Ymax;
+ sfv.YMaxViewPort = fb_height - viewport_Ymin - 1;
} else {
sfv.XMinViewPort = viewport_Xmin;
sfv.XMaxViewPort = viewport_Xmax - 1;
- sfv.YMinViewPort = fb_height - viewport_Ymax;
- sfv.YMaxViewPort = fb_height - viewport_Ymin - 1;
+ sfv.YMinViewPort = viewport_Ymin;
+ sfv.YMaxViewPort = viewport_Ymax - 1;
}
#endif
brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_GS), cgs) {
if (active && stage_state->push_const_size != 0) {
cgs.Buffer0Valid = true;
- cgs.PointertoGSConstantBuffer0 = stage_state->push_const_offset;
- cgs.GSConstantBuffer0ReadLength = stage_state->push_const_size - 1;
+ cgs.ConstantBody.PointertoConstantBuffer0 = stage_state->push_const_offset;
+ cgs.ConstantBody.ConstantBuffer0ReadLength = stage_state->push_const_size - 1;
}
}
#endif
#if GEN_GEN < 7
gs.SOStatisticsEnable = true;
if (gs_prog->info.has_transform_feedback_varyings)
- gs.SVBIPayloadEnable = true;
+ gs.SVBIPayloadEnable = _mesa_is_xfb_active_and_unpaused(ctx);
/* GEN6_GS_SPF_MODE and GEN6_GS_VECTOR_MASK_ENABLE are enabled as it
* was previously done for gen6.
dstA = fix_dual_blend_alpha_to_one(dstA);
}
- entry->ColorBufferBlendEnable = true;
+ /* BRW_NEW_FS_PROG_DATA */
+ const struct brw_wm_prog_data *wm_prog_data =
+ brw_wm_prog_data(brw->wm.base.prog_data);
+
+ /* The Dual Source Blending documentation says:
+ *
+ * "If SRC1 is included in a src/dst blend factor and
+ * a DualSource RT Write message is not used, results
+ * are UNDEFINED. (This reflects the same restriction in DX APIs,
+ * where undefined results are produced if “o1” is not written
+ * by a PS – there are no default values defined).
+ * If SRC1 is not included in a src/dst blend factor,
+ * dual source blending must be disabled."
+ *
+ * There is no way to gracefully fix this undefined situation
+ * so we just disable the blending to prevent possible issues.
+ */
+ entry->ColorBufferBlendEnable =
+ !ctx->Color.Blend[0]._UsesDualSrc || wm_prog_data->dual_src_blend;
+
entry->DestinationBlendFactor = blend_factor(dstRGB);
entry->SourceBlendFactor = blend_factor(srcRGB);
entry->DestinationAlphaBlendFactor = blend_factor(dstA);
_NEW_MULTISAMPLE,
.brw = BRW_NEW_BATCH |
BRW_NEW_BLORP |
+ BRW_NEW_FS_PROG_DATA |
BRW_NEW_STATE_BASE_ADDRESS,
},
.emit = genX(upload_blend_state),
if (binding->BufferObject == ctx->Shared->NullBufferObj) {
static unsigned msg_id = 0;
- _mesa_gl_debug(ctx, &msg_id, MESA_DEBUG_SOURCE_API,
- MESA_DEBUG_TYPE_UNDEFINED,
- MESA_DEBUG_SEVERITY_HIGH,
- "UBO %d unbound, %s shader uniform data "
- "will be undefined.",
- range->block,
- _mesa_shader_stage_to_string(stage));
+ _mesa_gl_debugf(ctx, &msg_id, MESA_DEBUG_SOURCE_API,
+ MESA_DEBUG_TYPE_UNDEFINED,
+ MESA_DEBUG_SEVERITY_HIGH,
+ "UBO %d unbound, %s shader uniform data "
+ "will be undefined.",
+ range->block,
+ _mesa_shader_stage_to_string(stage));
continue;
}
sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
/* _NEW_BUFFERS */
- bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
+ bool flip_y = ctx->DrawBuffer->FlipY;
/* _NEW_POINT
*
* Window coordinates in an FBO are inverted, which means point
* sprite origin must be inverted.
*/
- if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) != render_to_fbo)
+ if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) == flip_y)
sbe.PointSpriteTextureCoordinateOrigin = LOWERLEFT;
else
sbe.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
for (int i = 0; i < 4; i++) {
struct intel_buffer_object *bufferobj =
intel_buffer_object(xfb_obj->Buffers[i]);
+ uint32_t start = xfb_obj->Offset[i];
+ uint32_t end = ALIGN(start + xfb_obj->Size[i], 4);
+ uint32_t const size = end - start;
- if (!bufferobj) {
+ if (!bufferobj || !size) {
brw_batch_emit(brw, GENX(3DSTATE_SO_BUFFER), sob) {
sob.SOBufferIndex = i;
}
continue;
}
- uint32_t start = xfb_obj->Offset[i];
assert(start % 4 == 0);
- uint32_t end = ALIGN(start + xfb_obj->Size[i], 4);
struct brw_bo *bo =
- intel_bufferobj_buffer(brw, bufferobj, start, end - start, true);
+ intel_bufferobj_buffer(brw, bufferobj, start, size, true);
assert(end <= bo->size);
brw_batch_emit(brw, GENX(3DSTATE_SO_BUFFER), sob) {
sob.SOBufferEnable = true;
sob.StreamOffsetWriteEnable = true;
sob.StreamOutputBufferOffsetAddressEnable = true;
- sob.SOBufferMOCS = mocs_wb;
+ sob.MOCS = mocs_wb;
sob.SurfaceSize = MAX2(xfb_obj->Size[i] / 4, 1) - 1;
sob.StreamOutputBufferOffsetAddress =
*/
ps.VectorMaskEnable = GEN_GEN >= 8;
- ps.SamplerCount =
- DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4);
+ /* WA_1606682166:
+ * "Incorrect TDL's SSP address shift in SARB for 16:6 & 18:8 modes.
+ * Disable the Sampler state prefetch functionality in the SARB by
+ * programming 0xB000[30] to '1'."
+ */
+ ps.SamplerCount = GEN_GEN == 11 ?
+ 0 : DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4);
/* BRW_NEW_FS_PROG_DATA */
- ps.BindingTableEntryCount = prog_data->base.binding_table.size_bytes / 4;
+ /* Gen 11 workarounds table #2056 WABTPPrefetchDisable suggests to disable
+ * prefetching of binding tables in A0 and B0 steppings.
+ * TODO: Revisit this workaround on C0 stepping.
+ */
+ ps.BindingTableEntryCount = GEN_GEN == 11 ?
+ 0 :
+ prog_data->base.binding_table.size_bytes / 4;
if (prog_data->base.use_alt_mode)
ps.FloatingPointMode = Alternate;
ps._8PixelDispatchEnable = prog_data->dispatch_8;
ps._16PixelDispatchEnable = prog_data->dispatch_16;
+ ps._32PixelDispatchEnable = prog_data->dispatch_32;
+
+ /* From the Sky Lake PRM 3DSTATE_PS::32 Pixel Dispatch Enable:
+ *
+ * "When NUM_MULTISAMPLES = 16 or FORCE_SAMPLE_COUNT = 16, SIMD32
+ * Dispatch must not be enabled for PER_PIXEL dispatch mode."
+ *
+ * Since 16x MSAA is first introduced on SKL, we don't need to apply
+ * the workaround on any older hardware.
+ *
+ * BRW_NEW_NUM_SAMPLES
+ */
+ if (GEN_GEN >= 9 && !prog_data->persample_dispatch &&
+ brw->num_samples == 16) {
+ assert(ps._8PixelDispatchEnable || ps._16PixelDispatchEnable);
+ ps._32PixelDispatchEnable = false;
+ }
+
ps.DispatchGRFStartRegisterForConstantSetupData0 =
- prog_data->base.dispatch_grf_start_reg;
+ brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 0);
+ ps.DispatchGRFStartRegisterForConstantSetupData1 =
+ brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 1);
ps.DispatchGRFStartRegisterForConstantSetupData2 =
- prog_data->dispatch_grf_start_reg_2;
+ brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 2);
- ps.KernelStartPointer0 = stage_state->prog_offset;
+ ps.KernelStartPointer0 = stage_state->prog_offset +
+ brw_wm_prog_data_prog_offset(prog_data, ps, 0);
+ ps.KernelStartPointer1 = stage_state->prog_offset +
+ brw_wm_prog_data_prog_offset(prog_data, ps, 1);
ps.KernelStartPointer2 = stage_state->prog_offset +
- prog_data->prog_offset_2;
+ brw_wm_prog_data_prog_offset(prog_data, ps, 2);
if (prog_data->base.total_scratch) {
ps.ScratchSpaceBasePointer =
- rw_bo(stage_state->scratch_bo,
- ffs(stage_state->per_thread_scratch) - 11);
+ rw_32_bo(stage_state->scratch_bo,
+ ffs(stage_state->per_thread_scratch) - 11);
}
}
}
: 0),
.brw = BRW_NEW_BATCH |
BRW_NEW_BLORP |
- BRW_NEW_FS_PROG_DATA,
+ BRW_NEW_FS_PROG_DATA |
+ (GEN_GEN >= 9 ? BRW_NEW_NUM_SAMPLES : 0),
},
.emit = genX(upload_ps),
};
hs.IncludeVertexHandles = true;
hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1;
+
+#if GEN_GEN >= 9
+ hs.DispatchMode = vue_prog_data->dispatch_mode;
+ hs.IncludePrimitiveID = tcs_prog_data->include_primitive_id;
+#endif
}
}
}
*/
per_thread_scratch_value = stage_state->per_thread_scratch / 1024 - 1;
}
- vfe.ScratchSpaceBasePointer = rw_bo(stage_state->scratch_bo, 0);
+ vfe.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0);
vfe.PerThreadScratchSpace = per_thread_scratch_value;
}
const struct GENX(INTERFACE_DESCRIPTOR_DATA) idd = {
.KernelStartPointer = brw->cs.base.prog_offset,
.SamplerStatePointer = stage_state->sampler_offset,
- .SamplerCount = DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4),
+ /* WA_1606682166 */
+ .SamplerCount = GEN_GEN == 11 ? 0 :
+ DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4),
.BindingTablePointer = stage_state->bind_bo_offset,
.ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs,
.NumberofThreadsinGPGPUThreadGroup = cs_prog_data->threads,
.emit = genX(upload_cs_state)
};
+#define GPGPU_DISPATCHDIMX 0x2500
+#define GPGPU_DISPATCHDIMY 0x2504
+#define GPGPU_DISPATCHDIMZ 0x2508
+
+#define MI_PREDICATE_SRC0 0x2400
+#define MI_PREDICATE_SRC1 0x2408
+
+/* Set up an indirect compute dispatch: load the X/Y/Z workgroup counts from
+ * the num_work_groups buffer object into the GPGPU_DISPATCHDIM* registers,
+ * and on Gen7 and earlier additionally program MI_PREDICATE so that the
+ * subsequent GPGPU_WALKER (emitted with PredicateEnable) is skipped whenever
+ * any dimension is zero.
+ */
+static void
+prepare_indirect_gpgpu_walker(struct brw_context *brw)
+{
+   GLintptr indirect_offset = brw->compute.num_work_groups_offset;
+   struct brw_bo *bo = brw->compute.num_work_groups_bo;
+
+   /* The three dimensions are laid out as consecutive 32-bit values. */
+   emit_lrm(brw, GPGPU_DISPATCHDIMX, ro_bo(bo, indirect_offset + 0));
+   emit_lrm(brw, GPGPU_DISPATCHDIMY, ro_bo(bo, indirect_offset + 4));
+   emit_lrm(brw, GPGPU_DISPATCHDIMZ, ro_bo(bo, indirect_offset + 8));
+
+#if GEN_GEN <= 7
+   /* Build: predicate = !(x == 0 || y == 0 || z == 0), i.e. render only
+    * when every dimension is non-zero.  Each comparison below is done by
+    * loading one dimension into SRC0 and comparing against SRC1 (zero).
+    */
+
+   /* Clear upper 32-bits of SRC0 and all 64-bits of SRC1 */
+   emit_lri(brw, MI_PREDICATE_SRC0 + 4, 0);
+   emit_lri(brw, MI_PREDICATE_SRC1    , 0);
+   emit_lri(brw, MI_PREDICATE_SRC1 + 4, 0);
+
+   /* Load compute_dispatch_indirect_x_size into SRC0 */
+   emit_lrm(brw, MI_PREDICATE_SRC0, ro_bo(bo, indirect_offset + 0));
+
+   /* predicate = (compute_dispatch_indirect_x_size == 0); */
+   brw_batch_emit(brw, GENX(MI_PREDICATE), mip) {
+      mip.LoadOperation    = LOAD_LOAD;
+      mip.CombineOperation = COMBINE_SET;
+      mip.CompareOperation = COMPARE_SRCS_EQUAL;
+   }
+
+   /* Load compute_dispatch_indirect_y_size into SRC0 */
+   emit_lrm(brw, MI_PREDICATE_SRC0, ro_bo(bo, indirect_offset + 4));
+
+   /* predicate |= (compute_dispatch_indirect_y_size == 0); */
+   brw_batch_emit(brw, GENX(MI_PREDICATE), mip) {
+      mip.LoadOperation    = LOAD_LOAD;
+      mip.CombineOperation = COMBINE_OR;
+      mip.CompareOperation = COMPARE_SRCS_EQUAL;
+   }
+
+   /* Load compute_dispatch_indirect_z_size into SRC0 */
+   emit_lrm(brw, MI_PREDICATE_SRC0, ro_bo(bo, indirect_offset + 8));
+
+   /* predicate |= (compute_dispatch_indirect_z_size == 0); */
+   brw_batch_emit(brw, GENX(MI_PREDICATE), mip) {
+      mip.LoadOperation    = LOAD_LOAD;
+      mip.CombineOperation = COMBINE_OR;
+      mip.CompareOperation = COMPARE_SRCS_EQUAL;
+   }
+
+   /* predicate = !predicate; */
+   /* NOTE(review): COMPARE_FALSE is defined locally here — presumably it is
+    * missing from the generated genxml enums; confirm before relying on it
+    * elsewhere.
+    */
+#define COMPARE_FALSE                           1
+   brw_batch_emit(brw, GENX(MI_PREDICATE), mip) {
+      mip.LoadOperation    = LOAD_LOADINV;
+      mip.CombineOperation = COMBINE_OR;
+      mip.CompareOperation = COMPARE_FALSE;
+   }
+#endif
+}
+
+/* Emit a GPGPU_WALKER command launching the bound compute shader over the
+ * current dispatch grid.  Handles both direct dispatch (dimensions taken
+ * from brw->compute.num_work_groups) and indirect dispatch (dimensions
+ * loaded from a buffer object by prepare_indirect_gpgpu_walker, predicated
+ * on Gen7 and earlier), then flushes media state.
+ */
+static void
+genX(emit_gpgpu_walker)(struct brw_context *brw)
+{
+   const struct brw_cs_prog_data *prog_data =
+      brw_cs_prog_data(brw->cs.base.prog_data);
+
+   const GLuint *num_groups = brw->compute.num_work_groups;
+
+   /* A non-NULL BO means the dimensions come from GPU memory (glDispatchComputeIndirect). */
+   bool indirect = brw->compute.num_work_groups_bo != NULL;
+   if (indirect)
+      prepare_indirect_gpgpu_walker(brw);
+
+   const unsigned simd_size = prog_data->simd_size;
+   unsigned group_size = prog_data->local_size[0] *
+      prog_data->local_size[1] * prog_data->local_size[2];
+
+   /* Execution mask for the final, possibly partial, SIMD iteration of each
+    * workgroup: start with all simd_size channels enabled, then drop the
+    * high channels when group_size is not a multiple of simd_size.
+    */
+   uint32_t right_mask = 0xffffffffu >> (32 - simd_size);
+   const unsigned right_non_aligned = group_size & (simd_size - 1);
+   if (right_non_aligned != 0)
+      right_mask >>= (simd_size - right_non_aligned);
+
+   brw_batch_emit(brw, GENX(GPGPU_WALKER), ggw) {
+      ggw.IndirectParameterEnable      = indirect;
+      /* Gen8+ GPGPU_WALKER handles zero-sized indirect dispatches itself;
+       * older gens rely on the MI_PREDICATE set up above — TODO confirm
+       * against the PRM for each generation.
+       */
+      ggw.PredicateEnable              = GEN_GEN <= 7 && indirect;
+      /* simd_size / 16 maps 8 -> 0, 16 -> 1, 32 -> 2 (the field's encoding). */
+      ggw.SIMDSize                     = prog_data->simd_size / 16;
+      ggw.ThreadDepthCounterMaximum    = 0;
+      ggw.ThreadHeightCounterMaximum   = 0;
+      ggw.ThreadWidthCounterMaximum    = prog_data->threads - 1;
+      ggw.ThreadGroupIDXDimension      = num_groups[0];
+      ggw.ThreadGroupIDYDimension      = num_groups[1];
+      ggw.ThreadGroupIDZDimension      = num_groups[2];
+      ggw.RightExecutionMask           = right_mask;
+      ggw.BottomExecutionMask          = 0xffffffff;
+   }
+
+   brw_batch_emit(brw, GENX(MEDIA_STATE_FLUSH), msf);
+}
+
#endif
/* ---------------------------------------------------------------------- */
const struct gl_context *ctx = &brw->ctx;
/* _NEW_BUFFERS */
- const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
+ const bool flip_y = ctx->DrawBuffer->FlipY;
/* _NEW_POLYGON */
const struct gl_polygon_attrib *polygon = &ctx->Polygon;
const struct gl_point_attrib *point = &ctx->Point;
brw_batch_emit(brw, GENX(3DSTATE_RASTER), raster) {
- if (brw->polygon_front_bit == render_to_fbo)
+ if (brw->polygon_front_bit != flip_y)
raster.FrontWinding = CounterClockwise;
if (polygon->CullFlag) {
raster.ScissorRectangleEnable = ctx->Scissor.EnableFlags;
/* _NEW_TRANSFORM */
- if (!ctx->Transform.DepthClamp) {
+#if GEN_GEN < 9
+ if (!(ctx->Transform.DepthClampNear &&
+ ctx->Transform.DepthClampFar))
+ raster.ViewportZClipTestEnable = true;
+#endif
+
#if GEN_GEN >= 9
- raster.ViewportZFarClipTestEnable = true;
+ if (!ctx->Transform.DepthClampNear)
raster.ViewportZNearClipTestEnable = true;
-#else
- raster.ViewportZClipTestEnable = true;
+
+ if (!ctx->Transform.DepthClampFar)
+ raster.ViewportZFarClipTestEnable = true;
#endif
- }
/* BRW_NEW_CONSERVATIVE_RASTERIZATION */
#if GEN_GEN >= 9
dstA = fix_dual_blend_alpha_to_one(dstA);
}
- pb.ColorBufferBlendEnable = true;
+ /* BRW_NEW_FS_PROG_DATA */
+ const struct brw_wm_prog_data *wm_prog_data =
+ brw_wm_prog_data(brw->wm.base.prog_data);
+
+ /* The Dual Source Blending documentation says:
+ *
+ * "If SRC1 is included in a src/dst blend factor and
+ * a DualSource RT Write message is not used, results
+ * are UNDEFINED. (This reflects the same restriction in DX APIs,
+ * where undefined results are produced if “o1” is not written
+ * by a PS – there are no default values defined).
+ * If SRC1 is not included in a src/dst blend factor,
+ * dual source blending must be disabled."
+ *
+ * There is no way to gracefully fix this undefined situation
+ * so we just disable the blending to prevent possible issues.
+ */
+ pb.ColorBufferBlendEnable =
+ !color->Blend[0]._UsesDualSrc || wm_prog_data->dual_src_blend;
pb.SourceAlphaBlendFactor = brw_translate_blend_factor(srcA);
pb.DestinationAlphaBlendFactor = brw_translate_blend_factor(dstA);
pb.SourceBlendFactor = brw_translate_blend_factor(srcRGB);
_NEW_MULTISAMPLE,
.brw = BRW_NEW_BLORP |
BRW_NEW_CONTEXT |
- BRW_NEW_FRAGMENT_PROGRAM,
+ BRW_NEW_FRAGMENT_PROGRAM |
+ BRW_NEW_FS_PROG_DATA,
},
.emit = genX(upload_ps_blend)
};
* Emit a 3DSTATE_SAMPLER_STATE_POINTERS_{VS,HS,GS,DS,PS} packet.
*/
static void
-genX(emit_sampler_state_pointers_xs)(struct brw_context *brw,
- struct brw_stage_state *stage_state)
+genX(emit_sampler_state_pointers_xs)(UNUSED struct brw_context *brw,
+ UNUSED struct brw_stage_state *stage_state)
{
#if GEN_GEN >= 7
static const uint16_t packet_headers[] = {
}
static uint32_t
-translate_wrap_mode(struct brw_context *brw, GLenum wrap, bool using_nearest)
+translate_wrap_mode(GLenum wrap, UNUSED bool using_nearest)
{
switch (wrap) {
case GL_REPEAT:
mesa_format format, GLenum base_format,
const struct gl_texture_object *texObj,
const struct gl_sampler_object *sampler,
- uint32_t *sampler_state,
- uint32_t batch_offset_for_sampler_state)
+ uint32_t *sampler_state)
{
struct GENX(SAMPLER_STATE) samp_st = { 0 };
bool either_nearest =
sampler->MinFilter == GL_NEAREST || sampler->MagFilter == GL_NEAREST;
- unsigned wrap_s = translate_wrap_mode(brw, sampler->WrapS, either_nearest);
- unsigned wrap_t = translate_wrap_mode(brw, sampler->WrapT, either_nearest);
- unsigned wrap_r = translate_wrap_mode(brw, sampler->WrapR, either_nearest);
+ unsigned wrap_s = translate_wrap_mode(sampler->WrapS, either_nearest);
+ unsigned wrap_t = translate_wrap_mode(sampler->WrapT, either_nearest);
+ unsigned wrap_r = translate_wrap_mode(sampler->WrapR, either_nearest);
if (target == GL_TEXTURE_CUBE_MAP ||
target == GL_TEXTURE_CUBE_MAP_ARRAY) {
static void
update_sampler_state(struct brw_context *brw,
int unit,
- uint32_t *sampler_state,
- uint32_t batch_offset_for_sampler_state)
+ uint32_t *sampler_state)
{
struct gl_context *ctx = &brw->ctx;
const struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
texUnit->LodBias,
firstImage->TexFormat, firstImage->_BaseFormat,
texObj, sampler,
- sampler_state, batch_offset_for_sampler_state);
+ sampler_state);
}
static void
32, &stage_state->sampler_offset);
/* memset(sampler_state, 0, sampler_count * size_in_bytes); */
- uint32_t batch_offset_for_sampler_state = stage_state->sampler_offset;
-
for (unsigned s = 0; s < sampler_count; s++) {
if (SamplersUsed & (1 << s)) {
const unsigned unit = prog->SamplerUnits[s];
if (ctx->Texture.Unit[unit]._Current) {
- update_sampler_state(brw, unit, sampler_state,
- batch_offset_for_sampler_state);
+ update_sampler_state(brw, unit, sampler_state);
}
}
sampler_state += dwords;
- batch_offset_for_sampler_state += size_in_bytes;
}
if (GEN_GEN >= 7 && stage_state->stage != MESA_SHADER_COMPUTE) {
#if GEN_GEN < 6
static const struct brw_tracked_state *render_atoms[] =
{
+ &genX(vf_statistics),
+
/* Once all the programs are done, we know how large urb entry
* sizes need to be and can decide if we need to change the urb
* layout.
#elif GEN_GEN == 6
static const struct brw_tracked_state *render_atoms[] =
{
+ &genX(vf_statistics),
+
&genX(sf_clip_viewport),
/* Command packets: */
#elif GEN_GEN == 7
static const struct brw_tracked_state *render_atoms[] =
{
+ &genX(vf_statistics),
+
/* Command packets: */
&genX(cc_vp),
&genX(scissor_state),
- &gen7_depthbuffer,
+ &brw_depthbuffer,
&genX(polygon_stipple),
&genX(polygon_stipple_offset),
#elif GEN_GEN >= 8
static const struct brw_tracked_state *render_atoms[] =
{
+ &genX(vf_statistics),
+
&genX(cc_vp),
&genX(sf_clip_viewport),
&genX(scissor_state),
- &gen7_depthbuffer,
+ &brw_depthbuffer,
&genX(polygon_stipple),
&genX(polygon_stipple_offset),
compute_atoms, ARRAY_SIZE(compute_atoms));
brw->vtbl.emit_mi_report_perf_count = genX(emit_mi_report_perf_count);
+ brw->vtbl.emit_compute_walker = genX(emit_gpgpu_walker);
#endif
}