From ddc6f4d069f6280786683b0bffdd76f0e3a8ebdf Mon Sep 17 00:00:00 2001 From: Rafael Antognolli Date: Tue, 21 Mar 2017 07:30:03 -0700 Subject: [PATCH] i965: Port gen7+ 3DSTATE_SOL to genxml. Emit 3DSTATE_SOL on Gen7+ using brw_batch_emit helper, that uses pack structs from genxml. v2: - Add helpers to assign struct brw_address (Kristian) v3: - Rename MOCS -> SOBufferMOCS - Do not re-declare MOCS macros (Ken). - Style and code reorganization (Ken). Signed-off-by: Rafael Antognolli Reviewed-by: Kenneth Graunke --- src/mesa/drivers/dri/i965/Makefile.sources | 1 - src/mesa/drivers/dri/i965/brw_state.h | 6 - src/mesa/drivers/dri/i965/gen7_sol_state.c | 307 ---------------- src/mesa/drivers/dri/i965/gen8_sol_state.c | 95 ----- src/mesa/drivers/dri/i965/genX_state_upload.c | 341 +++++++++++++++++- 5 files changed, 338 insertions(+), 412 deletions(-) delete mode 100644 src/mesa/drivers/dri/i965/gen8_sol_state.c diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources index 47680a7ccb9..bfcf57cbbfa 100644 --- a/src/mesa/drivers/dri/i965/Makefile.sources +++ b/src/mesa/drivers/dri/i965/Makefile.sources @@ -111,7 +111,6 @@ i965_FILES = \ gen8_hs_state.c \ gen8_multisample_state.c \ gen8_ps_state.c \ - gen8_sol_state.c \ gen8_surface_state.c \ gen8_viewport_state.c \ gen8_vs_state.c \ diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h index 3df975a86a6..94f758b3d36 100644 --- a/src/mesa/drivers/dri/i965/brw_state.h +++ b/src/mesa/drivers/dri/i965/brw_state.h @@ -135,7 +135,6 @@ extern const struct brw_tracked_state gen7_l3_state; extern const struct brw_tracked_state gen7_ps_state; extern const struct brw_tracked_state gen7_push_constant_space; extern const struct brw_tracked_state gen7_sf_clip_viewport; -extern const struct brw_tracked_state gen7_sol_state; extern const struct brw_tracked_state gen7_te_state; extern const struct brw_tracked_state gen7_tes_push_constants; extern const struct 
brw_tracked_state gen7_urb; @@ -299,11 +298,6 @@ void gen8_upload_ps_state(struct brw_context *brw, void gen8_upload_ps_extra(struct brw_context *brw, const struct brw_wm_prog_data *prog_data); -/* gen7_sol_state.c */ -void gen7_upload_3dstate_so_decl_list(struct brw_context *brw, - const struct brw_vue_map *vue_map); -void gen8_upload_3dstate_so_buffers(struct brw_context *brw); - /* gen8_surface_state.c */ void gen8_init_vtable_surface_functions(struct brw_context *brw); diff --git a/src/mesa/drivers/dri/i965/gen7_sol_state.c b/src/mesa/drivers/dri/i965/gen7_sol_state.c index f1bd19c24f0..f54b370cd40 100644 --- a/src/mesa/drivers/dri/i965/gen7_sol_state.c +++ b/src/mesa/drivers/dri/i965/gen7_sol_state.c @@ -35,313 +35,6 @@ #include "intel_buffer_objects.h" #include "main/transformfeedback.h" -static void -upload_3dstate_so_buffers(struct brw_context *brw) -{ - struct gl_context *ctx = &brw->ctx; - /* BRW_NEW_TRANSFORM_FEEDBACK */ - struct gl_transform_feedback_object *xfb_obj = - ctx->TransformFeedback.CurrentObject; - const struct gl_transform_feedback_info *linked_xfb_info = - xfb_obj->program->sh.LinkedTransformFeedback; - int i; - - /* Set up the up to 4 output buffers. These are the ranges defined in the - * gl_transform_feedback_object. - */ - for (i = 0; i < 4; i++) { - struct intel_buffer_object *bufferobj = - intel_buffer_object(xfb_obj->Buffers[i]); - struct brw_bo *bo; - uint32_t start, end; - uint32_t stride; - - if (!xfb_obj->Buffers[i]) { - /* The pitch of 0 in this command indicates that the buffer is - * unbound and won't be written to. 
- */ - BEGIN_BATCH(4); - OUT_BATCH(_3DSTATE_SO_BUFFER << 16 | (4 - 2)); - OUT_BATCH((i << SO_BUFFER_INDEX_SHIFT)); - OUT_BATCH(0); - OUT_BATCH(0); - ADVANCE_BATCH(); - - continue; - } - - stride = linked_xfb_info->Buffers[i].Stride * 4; - - start = xfb_obj->Offset[i]; - assert(start % 4 == 0); - end = ALIGN(start + xfb_obj->Size[i], 4); - bo = intel_bufferobj_buffer(brw, bufferobj, start, end - start); - assert(end <= bo->size); - - BEGIN_BATCH(4); - OUT_BATCH(_3DSTATE_SO_BUFFER << 16 | (4 - 2)); - OUT_BATCH((i << SO_BUFFER_INDEX_SHIFT) | stride); - OUT_RELOC(bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, start); - OUT_RELOC(bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, end); - ADVANCE_BATCH(); - } -} - -/** - * Outputs the 3DSTATE_SO_DECL_LIST command. - * - * The data output is a series of 64-bit entries containing a SO_DECL per - * stream. We only have one stream of rendering coming out of the GS unit, so - * we only emit stream 0 (low 16 bits) SO_DECLs. - */ -void -gen7_upload_3dstate_so_decl_list(struct brw_context *brw, - const struct brw_vue_map *vue_map) -{ - struct gl_context *ctx = &brw->ctx; - /* BRW_NEW_TRANSFORM_FEEDBACK */ - struct gl_transform_feedback_object *xfb_obj = - ctx->TransformFeedback.CurrentObject; - const struct gl_transform_feedback_info *linked_xfb_info = - xfb_obj->program->sh.LinkedTransformFeedback; - uint16_t so_decl[MAX_VERTEX_STREAMS][128]; - int buffer_mask[MAX_VERTEX_STREAMS] = {0, 0, 0, 0}; - int next_offset[BRW_MAX_SOL_BUFFERS] = {0, 0, 0, 0}; - int decls[MAX_VERTEX_STREAMS] = {0, 0, 0, 0}; - int max_decls = 0; - STATIC_ASSERT(ARRAY_SIZE(so_decl[0]) >= MAX_PROGRAM_OUTPUTS); - - memset(so_decl, 0, sizeof(so_decl)); - - /* Construct the list of SO_DECLs to be emitted. The formatting of the - * command is feels strange -- each dword pair contains a SO_DECL per stream. 
- */ - for (unsigned i = 0; i < linked_xfb_info->NumOutputs; i++) { - int buffer = linked_xfb_info->Outputs[i].OutputBuffer; - uint16_t decl = 0; - int varying = linked_xfb_info->Outputs[i].OutputRegister; - const unsigned components = linked_xfb_info->Outputs[i].NumComponents; - unsigned component_mask = (1 << components) - 1; - unsigned stream_id = linked_xfb_info->Outputs[i].StreamId; - unsigned decl_buffer_slot = buffer << SO_DECL_OUTPUT_BUFFER_SLOT_SHIFT; - assert(stream_id < MAX_VERTEX_STREAMS); - - /* gl_PointSize is stored in VARYING_SLOT_PSIZ.w - * gl_Layer is stored in VARYING_SLOT_PSIZ.y - * gl_ViewportIndex is stored in VARYING_SLOT_PSIZ.z - */ - if (varying == VARYING_SLOT_PSIZ) { - assert(components == 1); - component_mask <<= 3; - } else if (varying == VARYING_SLOT_LAYER) { - assert(components == 1); - component_mask <<= 1; - } else if (varying == VARYING_SLOT_VIEWPORT) { - assert(components == 1); - component_mask <<= 2; - } else { - component_mask <<= linked_xfb_info->Outputs[i].ComponentOffset; - } - - buffer_mask[stream_id] |= 1 << buffer; - - decl |= decl_buffer_slot; - if (varying == VARYING_SLOT_LAYER || varying == VARYING_SLOT_VIEWPORT) { - decl |= vue_map->varying_to_slot[VARYING_SLOT_PSIZ] << - SO_DECL_REGISTER_INDEX_SHIFT; - } else { - assert(vue_map->varying_to_slot[varying] >= 0); - decl |= vue_map->varying_to_slot[varying] << - SO_DECL_REGISTER_INDEX_SHIFT; - } - decl |= component_mask << SO_DECL_COMPONENT_MASK_SHIFT; - - /* Mesa doesn't store entries for gl_SkipComponents in the Outputs[] - * array. Instead, it simply increments DstOffset for the following - * input by the number of components that should be skipped. - * - * Our hardware is unusual in that it requires us to program SO_DECLs - * for fake "hole" components, rather than simply taking the offset - * for each real varying. 
Each hole can have size 1, 2, 3, or 4; we - * program as many size = 4 holes as we can, then a final hole to - * accommodate the final 1, 2, or 3 remaining. - */ - int skip_components = - linked_xfb_info->Outputs[i].DstOffset - next_offset[buffer]; - - next_offset[buffer] += skip_components; - - while (skip_components >= 4) { - so_decl[stream_id][decls[stream_id]++] = - SO_DECL_HOLE_FLAG | 0xf | decl_buffer_slot; - skip_components -= 4; - } - if (skip_components > 0) - so_decl[stream_id][decls[stream_id]++] = - SO_DECL_HOLE_FLAG | ((1 << skip_components) - 1) | - decl_buffer_slot; - - assert(linked_xfb_info->Outputs[i].DstOffset == next_offset[buffer]); - - next_offset[buffer] += components; - - so_decl[stream_id][decls[stream_id]++] = decl; - - if (decls[stream_id] > max_decls) - max_decls = decls[stream_id]; - } - - BEGIN_BATCH(max_decls * 2 + 3); - OUT_BATCH(_3DSTATE_SO_DECL_LIST << 16 | (max_decls * 2 + 1)); - - OUT_BATCH((buffer_mask[0] << SO_STREAM_TO_BUFFER_SELECTS_0_SHIFT) | - (buffer_mask[1] << SO_STREAM_TO_BUFFER_SELECTS_1_SHIFT) | - (buffer_mask[2] << SO_STREAM_TO_BUFFER_SELECTS_2_SHIFT) | - (buffer_mask[3] << SO_STREAM_TO_BUFFER_SELECTS_3_SHIFT)); - - OUT_BATCH((decls[0] << SO_NUM_ENTRIES_0_SHIFT) | - (decls[1] << SO_NUM_ENTRIES_1_SHIFT) | - (decls[2] << SO_NUM_ENTRIES_2_SHIFT) | - (decls[3] << SO_NUM_ENTRIES_3_SHIFT)); - - for (int i = 0; i < max_decls; i++) { - /* Stream 1 | Stream 0 */ - OUT_BATCH(((uint32_t) so_decl[1][i]) << 16 | so_decl[0][i]); - /* Stream 3 | Stream 2 */ - OUT_BATCH(((uint32_t) so_decl[3][i]) << 16 | so_decl[2][i]); - } - - ADVANCE_BATCH(); -} - -static bool -query_active(struct gl_query_object *q) -{ - return q && q->Active; -} - -static void -upload_3dstate_streamout(struct brw_context *brw, bool active, - const struct brw_vue_map *vue_map) -{ - struct gl_context *ctx = &brw->ctx; - /* BRW_NEW_TRANSFORM_FEEDBACK */ - struct gl_transform_feedback_object *xfb_obj = - ctx->TransformFeedback.CurrentObject; - uint32_t dw1 = 0, dw2 = 
0, dw3 = 0, dw4 = 0; - int i; - - if (active) { - const struct gl_transform_feedback_info *linked_xfb_info = - xfb_obj->program->sh.LinkedTransformFeedback; - int urb_entry_read_offset = 0; - int urb_entry_read_length = (vue_map->num_slots + 1) / 2 - - urb_entry_read_offset; - - dw1 |= SO_FUNCTION_ENABLE; - dw1 |= SO_STATISTICS_ENABLE; - - /* BRW_NEW_RASTERIZER_DISCARD */ - if (ctx->RasterDiscard) { - if (!query_active(ctx->Query.PrimitivesGenerated[0])) { - dw1 |= SO_RENDERING_DISABLE; - } else { - perf_debug("Rasterizer discard with a GL_PRIMITIVES_GENERATED " - "query active relies on the clipper."); - } - } - - /* _NEW_LIGHT */ - if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION) - dw1 |= SO_REORDER_TRAILING; - - if (brw->gen < 8) { - for (i = 0; i < 4; i++) { - if (xfb_obj->Buffers[i]) { - dw1 |= SO_BUFFER_ENABLE(i); - } - } - } - - /* We always read the whole vertex. This could be reduced at some - * point by reading less and offsetting the register index in the - * SO_DECLs. - */ - dw2 |= SET_FIELD(urb_entry_read_offset, SO_STREAM_0_VERTEX_READ_OFFSET); - dw2 |= SET_FIELD(urb_entry_read_length - 1, SO_STREAM_0_VERTEX_READ_LENGTH); - - dw2 |= SET_FIELD(urb_entry_read_offset, SO_STREAM_1_VERTEX_READ_OFFSET); - dw2 |= SET_FIELD(urb_entry_read_length - 1, SO_STREAM_1_VERTEX_READ_LENGTH); - - dw2 |= SET_FIELD(urb_entry_read_offset, SO_STREAM_2_VERTEX_READ_OFFSET); - dw2 |= SET_FIELD(urb_entry_read_length - 1, SO_STREAM_2_VERTEX_READ_LENGTH); - - dw2 |= SET_FIELD(urb_entry_read_offset, SO_STREAM_3_VERTEX_READ_OFFSET); - dw2 |= SET_FIELD(urb_entry_read_length - 1, SO_STREAM_3_VERTEX_READ_LENGTH); - - if (brw->gen >= 8) { - /* Set buffer pitches; 0 means unbound. 
*/ - if (xfb_obj->Buffers[0]) - dw3 |= linked_xfb_info->Buffers[0].Stride * 4; - if (xfb_obj->Buffers[1]) - dw3 |= (linked_xfb_info->Buffers[1].Stride * 4) << 16; - if (xfb_obj->Buffers[2]) - dw4 |= linked_xfb_info->Buffers[2].Stride * 4; - if (xfb_obj->Buffers[3]) - dw4 |= (linked_xfb_info->Buffers[3].Stride * 4) << 16; - } - } - - const int dwords = brw->gen >= 8 ? 5 : 3; - - BEGIN_BATCH(dwords); - OUT_BATCH(_3DSTATE_STREAMOUT << 16 | (dwords - 2)); - OUT_BATCH(dw1); - OUT_BATCH(dw2); - if (dwords > 3) { - OUT_BATCH(dw3); - OUT_BATCH(dw4); - } - ADVANCE_BATCH(); -} - -static void -upload_sol_state(struct brw_context *brw) -{ - struct gl_context *ctx = &brw->ctx; - /* BRW_NEW_TRANSFORM_FEEDBACK */ - bool active = _mesa_is_xfb_active_and_unpaused(ctx); - - if (active) { - if (brw->gen >= 8) - gen8_upload_3dstate_so_buffers(brw); - else - upload_3dstate_so_buffers(brw); - - /* BRW_NEW_VUE_MAP_GEOM_OUT */ - gen7_upload_3dstate_so_decl_list(brw, &brw->vue_map_geom_out); - } - - /* Finally, set up the SOL stage. This command must always follow updates to - * the nonpipelined SOL state (3DSTATE_SO_BUFFER, 3DSTATE_SO_DECL_LIST) or - * MMIO register updates (current performed by the kernel at each batch - * emit). 
- */ - upload_3dstate_streamout(brw, active, &brw->vue_map_geom_out); -} - -const struct brw_tracked_state gen7_sol_state = { - .dirty = { - .mesa = _NEW_LIGHT, - .brw = BRW_NEW_BATCH | - BRW_NEW_BLORP | - BRW_NEW_RASTERIZER_DISCARD | - BRW_NEW_VUE_MAP_GEOM_OUT | - BRW_NEW_TRANSFORM_FEEDBACK, - }, - .emit = upload_sol_state, -}; - void gen7_begin_transform_feedback(struct gl_context *ctx, GLenum mode, struct gl_transform_feedback_object *obj) diff --git a/src/mesa/drivers/dri/i965/gen8_sol_state.c b/src/mesa/drivers/dri/i965/gen8_sol_state.c deleted file mode 100644 index 6866539c3b2..00000000000 --- a/src/mesa/drivers/dri/i965/gen8_sol_state.c +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Copyright © 2012 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- */ - -/** - * @file gen8_sol_state.c - * - * Controls the stream output logic (SOL) stage of the gen8 hardware, which is - * used to implement GL_EXT_transform_feedback. - */ - -#include "brw_context.h" -#include "brw_state.h" -#include "brw_defines.h" -#include "intel_batchbuffer.h" -#include "intel_buffer_objects.h" -#include "main/transformfeedback.h" - -void -gen8_upload_3dstate_so_buffers(struct brw_context *brw) -{ - struct gl_context *ctx = &brw->ctx; - /* BRW_NEW_TRANSFORM_FEEDBACK */ - struct gl_transform_feedback_object *xfb_obj = - ctx->TransformFeedback.CurrentObject; - struct brw_transform_feedback_object *brw_obj = - (struct brw_transform_feedback_object *) xfb_obj; - uint32_t mocs_wb = brw->gen >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB; - - /* Set up the up to 4 output buffers. These are the ranges defined in the - * gl_transform_feedback_object. - */ - for (int i = 0; i < 4; i++) { - struct intel_buffer_object *bufferobj = - intel_buffer_object(xfb_obj->Buffers[i]); - - if (!bufferobj) { - BEGIN_BATCH(8); - OUT_BATCH(_3DSTATE_SO_BUFFER << 16 | (8 - 2)); - OUT_BATCH((i << SO_BUFFER_INDEX_SHIFT)); - OUT_BATCH(0); - OUT_BATCH(0); - OUT_BATCH(0); - OUT_BATCH(0); - OUT_BATCH(0); - OUT_BATCH(0); - ADVANCE_BATCH(); - continue; - } - - uint32_t start = xfb_obj->Offset[i]; - assert(start % 4 == 0); - uint32_t end = ALIGN(start + xfb_obj->Size[i], 4); - struct brw_bo *bo = - intel_bufferobj_buffer(brw, bufferobj, start, end - start); - assert(end <= bo->size); - - BEGIN_BATCH(8); - OUT_BATCH(_3DSTATE_SO_BUFFER << 16 | (8 - 2)); - OUT_BATCH(GEN8_SO_BUFFER_ENABLE | (i << SO_BUFFER_INDEX_SHIFT) | - GEN8_SO_BUFFER_OFFSET_WRITE_ENABLE | - GEN8_SO_BUFFER_OFFSET_ADDRESS_ENABLE | - (mocs_wb << 22)); - OUT_RELOC64(bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, start); - OUT_BATCH(xfb_obj->Size[i] / 4 - 1); - OUT_RELOC64(brw_obj->offset_bo, - I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, - i * sizeof(uint32_t)); - if (brw_obj->zero_offsets) - OUT_BATCH(0); /* 
Zero out the offset and write that to offset_bo */ - else - OUT_BATCH(0xFFFFFFFF); /* Use offset_bo as the "Stream Offset." */ - ADVANCE_BATCH(); - } - brw_obj->zero_offsets = false; -} diff --git a/src/mesa/drivers/dri/i965/genX_state_upload.c b/src/mesa/drivers/dri/i965/genX_state_upload.c index 575be7e4d3a..c06132cec6e 100644 --- a/src/mesa/drivers/dri/i965/genX_state_upload.c +++ b/src/mesa/drivers/dri/i965/genX_state_upload.c @@ -31,11 +31,13 @@ #include "brw_util.h" #include "intel_batchbuffer.h" +#include "intel_buffer_objects.h" #include "intel_fbo.h" #include "main/fbobject.h" #include "main/framebuffer.h" #include "main/stencil.h" +#include "main/transformfeedback.h" UNUSED static void * emit_dwords(struct brw_context *brw, unsigned n) @@ -80,6 +82,28 @@ __gen_combine_address(struct brw_context *brw, void *location, } } +static inline struct brw_address +render_bo(struct brw_bo *bo, uint32_t offset) +{ + return (struct brw_address) { + .bo = bo, + .offset = offset, + .read_domains = I915_GEM_DOMAIN_RENDER, + .write_domain = I915_GEM_DOMAIN_RENDER, + }; +} + +static inline struct brw_address +instruction_bo(struct brw_bo *bo, uint32_t offset) +{ + return (struct brw_address) { + .bo = bo, + .offset = offset, + .read_domains = I915_GEM_DOMAIN_INSTRUCTION, + .write_domain = I915_GEM_DOMAIN_INSTRUCTION, + }; +} + #include "genxml/genX_pack.h" #define _brw_cmd_length(cmd) cmd ## _length @@ -94,11 +118,12 @@ __gen_combine_address(struct brw_context *brw, void *location, _brw_cmd_pack(cmd)(brw, (void *)_dst, &name), \ _dst = NULL) -#define brw_batch_emitn(brw, cmd, n) ({ \ +#define brw_batch_emitn(brw, cmd, n, ...) 
({ \ uint32_t *_dw = emit_dwords(brw, n); \ struct cmd template = { \ _brw_cmd_header(cmd), \ .DWordLength = n - _brw_cmd_length_bias(cmd), \ + __VA_ARGS__ \ }; \ _brw_cmd_pack(cmd)(brw, _dw, &template); \ _dw + 1; /* Array starts at dw[1] */ \ @@ -860,6 +885,316 @@ static const struct brw_tracked_state genX(sbe_state) = { }, .emit = genX(upload_sbe), }; + +/* ---------------------------------------------------------------------- */ + +/** + * Outputs the 3DSTATE_SO_DECL_LIST command. + * + * The data output is a series of 64-bit entries containing a SO_DECL per + * stream. We only have one stream of rendering coming out of the GS unit, so + * we only emit stream 0 (low 16 bits) SO_DECLs. + */ +static void +genX(upload_3dstate_so_decl_list)(struct brw_context *brw, + const struct brw_vue_map *vue_map) +{ + struct gl_context *ctx = &brw->ctx; + /* BRW_NEW_TRANSFORM_FEEDBACK */ + struct gl_transform_feedback_object *xfb_obj = + ctx->TransformFeedback.CurrentObject; + const struct gl_transform_feedback_info *linked_xfb_info = + xfb_obj->program->sh.LinkedTransformFeedback; + struct GENX(SO_DECL) so_decl[MAX_VERTEX_STREAMS][128]; + int buffer_mask[MAX_VERTEX_STREAMS] = {0, 0, 0, 0}; + int next_offset[MAX_VERTEX_STREAMS] = {0, 0, 0, 0}; + int decls[MAX_VERTEX_STREAMS] = {0, 0, 0, 0}; + int max_decls = 0; + STATIC_ASSERT(ARRAY_SIZE(so_decl[0]) >= MAX_PROGRAM_OUTPUTS); + + memset(so_decl, 0, sizeof(so_decl)); + + /* Construct the list of SO_DECLs to be emitted. The formatting of the + * command feels strange -- each dword pair contains a SO_DECL per stream. 
+ */ + for (unsigned i = 0; i < linked_xfb_info->NumOutputs; i++) { + int buffer = linked_xfb_info->Outputs[i].OutputBuffer; + struct GENX(SO_DECL) decl = {0}; + int varying = linked_xfb_info->Outputs[i].OutputRegister; + const unsigned components = linked_xfb_info->Outputs[i].NumComponents; + unsigned component_mask = (1 << components) - 1; + unsigned stream_id = linked_xfb_info->Outputs[i].StreamId; + unsigned decl_buffer_slot = buffer; + assert(stream_id < MAX_VERTEX_STREAMS); + + /* gl_PointSize is stored in VARYING_SLOT_PSIZ.w + * gl_Layer is stored in VARYING_SLOT_PSIZ.y + * gl_ViewportIndex is stored in VARYING_SLOT_PSIZ.z + */ + if (varying == VARYING_SLOT_PSIZ) { + assert(components == 1); + component_mask <<= 3; + } else if (varying == VARYING_SLOT_LAYER) { + assert(components == 1); + component_mask <<= 1; + } else if (varying == VARYING_SLOT_VIEWPORT) { + assert(components == 1); + component_mask <<= 2; + } else { + component_mask <<= linked_xfb_info->Outputs[i].ComponentOffset; + } + + buffer_mask[stream_id] |= 1 << buffer; + + decl.OutputBufferSlot = decl_buffer_slot; + if (varying == VARYING_SLOT_LAYER || varying == VARYING_SLOT_VIEWPORT) { + decl.RegisterIndex = vue_map->varying_to_slot[VARYING_SLOT_PSIZ]; + } else { + assert(vue_map->varying_to_slot[varying] >= 0); + decl.RegisterIndex = vue_map->varying_to_slot[varying]; + } + decl.ComponentMask = component_mask; + + /* Mesa doesn't store entries for gl_SkipComponents in the Outputs[] + * array. Instead, it simply increments DstOffset for the following + * input by the number of components that should be skipped. + * + * Our hardware is unusual in that it requires us to program SO_DECLs + * for fake "hole" components, rather than simply taking the offset + * for each real varying. Each hole can have size 1, 2, 3, or 4; we + * program as many size = 4 holes as we can, then a final hole to + * accommodate the final 1, 2, or 3 remaining. 
+ */ + int skip_components = + linked_xfb_info->Outputs[i].DstOffset - next_offset[buffer]; + + next_offset[buffer] += skip_components; + + while (skip_components >= 4) { + struct GENX(SO_DECL) *d = &so_decl[stream_id][decls[stream_id]++]; + d->HoleFlag = 1; + d->OutputBufferSlot = decl_buffer_slot; + d->ComponentMask = 0xf; + skip_components -= 4; + } + + if (skip_components > 0) { + struct GENX(SO_DECL) *d = &so_decl[stream_id][decls[stream_id]++]; + d->HoleFlag = 1; + d->OutputBufferSlot = decl_buffer_slot; + d->ComponentMask = (1 << skip_components) - 1; + } + + assert(linked_xfb_info->Outputs[i].DstOffset == next_offset[buffer]); + + next_offset[buffer] += components; + + so_decl[stream_id][decls[stream_id]++] = decl; + + if (decls[stream_id] > max_decls) + max_decls = decls[stream_id]; + } + + uint32_t *dw; + dw = brw_batch_emitn(brw, GENX(3DSTATE_SO_DECL_LIST), 3 + 2 * max_decls, + .StreamtoBufferSelects0 = buffer_mask[0], + .StreamtoBufferSelects1 = buffer_mask[1], + .StreamtoBufferSelects2 = buffer_mask[2], + .StreamtoBufferSelects3 = buffer_mask[3], + .NumEntries0 = decls[0], + .NumEntries1 = decls[1], + .NumEntries2 = decls[2], + .NumEntries3 = decls[3]); + + for (int i = 0; i < max_decls; i++) { + GENX(SO_DECL_ENTRY_pack)( + brw, dw + 2 + i * 2, + &(struct GENX(SO_DECL_ENTRY)) { + .Stream0Decl = so_decl[0][i], + .Stream1Decl = so_decl[1][i], + .Stream2Decl = so_decl[2][i], + .Stream3Decl = so_decl[3][i], + }); + } +} + +static void +genX(upload_3dstate_so_buffers)(struct brw_context *brw) +{ + struct gl_context *ctx = &brw->ctx; + /* BRW_NEW_TRANSFORM_FEEDBACK */ + struct gl_transform_feedback_object *xfb_obj = + ctx->TransformFeedback.CurrentObject; +#if GEN_GEN < 8 + const struct gl_transform_feedback_info *linked_xfb_info = + xfb_obj->program->sh.LinkedTransformFeedback; +#else + struct brw_transform_feedback_object *brw_obj = + (struct brw_transform_feedback_object *) xfb_obj; + uint32_t mocs_wb = brw->gen >= 9 ? 
SKL_MOCS_WB : BDW_MOCS_WB; +#endif + + /* Set up the up to 4 output buffers. These are the ranges defined in the + * gl_transform_feedback_object. + */ + for (int i = 0; i < 4; i++) { + struct intel_buffer_object *bufferobj = + intel_buffer_object(xfb_obj->Buffers[i]); + + if (!bufferobj) { + brw_batch_emit(brw, GENX(3DSTATE_SO_BUFFER), sob) { + sob.SOBufferIndex = i; + } + continue; + } + + uint32_t start = xfb_obj->Offset[i]; + assert(start % 4 == 0); + uint32_t end = ALIGN(start + xfb_obj->Size[i], 4); + struct brw_bo *bo = + intel_bufferobj_buffer(brw, bufferobj, start, end - start); + assert(end <= bo->size); + + brw_batch_emit(brw, GENX(3DSTATE_SO_BUFFER), sob) { + sob.SOBufferIndex = i; + + sob.SurfaceBaseAddress = render_bo(bo, start); +#if GEN_GEN < 8 + sob.SurfacePitch = linked_xfb_info->Buffers[i].Stride * 4; + sob.SurfaceEndAddress = render_bo(bo, end); +#else + sob.SOBufferEnable = true; + sob.StreamOffsetWriteEnable = true; + sob.StreamOutputBufferOffsetAddressEnable = true; + sob.SOBufferMOCS = mocs_wb; + + sob.SurfaceSize = MAX2(xfb_obj->Size[i] / 4, 1) - 1; + sob.StreamOutputBufferOffsetAddress = + instruction_bo(brw_obj->offset_bo, i * sizeof(uint32_t)); + + if (brw_obj->zero_offsets) { + /* Zero out the offset and write that to offset_bo */ + sob.StreamOffset = 0; + } else { + /* Use offset_bo as the "Stream Offset." 
*/ + sob.StreamOffset = 0xFFFFFFFF; + } +#endif + } + } + +#if GEN_GEN >= 8 + brw_obj->zero_offsets = false; +#endif +} + +static inline bool +query_active(struct gl_query_object *q) +{ + return q && q->Active; +} + +static void +genX(upload_3dstate_streamout)(struct brw_context *brw, bool active, + const struct brw_vue_map *vue_map) +{ + struct gl_context *ctx = &brw->ctx; + /* BRW_NEW_TRANSFORM_FEEDBACK */ + struct gl_transform_feedback_object *xfb_obj = + ctx->TransformFeedback.CurrentObject; + + brw_batch_emit(brw, GENX(3DSTATE_STREAMOUT), sos) { + if (active) { + int urb_entry_read_offset = 0; + int urb_entry_read_length = (vue_map->num_slots + 1) / 2 - + urb_entry_read_offset; + + sos.SOFunctionEnable = true; + sos.SOStatisticsEnable = true; + + /* BRW_NEW_RASTERIZER_DISCARD */ + if (ctx->RasterDiscard) { + if (!query_active(ctx->Query.PrimitivesGenerated[0])) { + sos.RenderingDisable = true; + } else { + perf_debug("Rasterizer discard with a GL_PRIMITIVES_GENERATED " + "query active relies on the clipper."); + } + } + + /* _NEW_LIGHT */ + if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION) + sos.ReorderMode = TRAILING; + +#if GEN_GEN < 8 + sos.SOBufferEnable0 = xfb_obj->Buffers[0] != NULL; + sos.SOBufferEnable1 = xfb_obj->Buffers[1] != NULL; + sos.SOBufferEnable2 = xfb_obj->Buffers[2] != NULL; + sos.SOBufferEnable3 = xfb_obj->Buffers[3] != NULL; +#else + const struct gl_transform_feedback_info *linked_xfb_info = + xfb_obj->program->sh.LinkedTransformFeedback; + /* Set buffer pitches; 0 means unbound. */ + if (xfb_obj->Buffers[0]) + sos.Buffer0SurfacePitch = linked_xfb_info->Buffers[0].Stride * 4; + if (xfb_obj->Buffers[1]) + sos.Buffer1SurfacePitch = linked_xfb_info->Buffers[1].Stride * 4; + if (xfb_obj->Buffers[2]) + sos.Buffer2SurfacePitch = linked_xfb_info->Buffers[2].Stride * 4; + if (xfb_obj->Buffers[3]) + sos.Buffer3SurfacePitch = linked_xfb_info->Buffers[3].Stride * 4; +#endif + + /* We always read the whole vertex. 
This could be reduced at some + * point by reading less and offsetting the register index in the + * SO_DECLs. + */ + sos.Stream0VertexReadOffset = urb_entry_read_offset; + sos.Stream0VertexReadLength = urb_entry_read_length - 1; + sos.Stream1VertexReadOffset = urb_entry_read_offset; + sos.Stream1VertexReadLength = urb_entry_read_length - 1; + sos.Stream2VertexReadOffset = urb_entry_read_offset; + sos.Stream2VertexReadLength = urb_entry_read_length - 1; + sos.Stream3VertexReadOffset = urb_entry_read_offset; + sos.Stream3VertexReadLength = urb_entry_read_length - 1; + } + } +} + +static void +genX(upload_sol)(struct brw_context *brw) +{ + struct gl_context *ctx = &brw->ctx; + /* BRW_NEW_TRANSFORM_FEEDBACK */ + bool active = _mesa_is_xfb_active_and_unpaused(ctx); + + if (active) { + genX(upload_3dstate_so_buffers)(brw); + + /* BRW_NEW_VUE_MAP_GEOM_OUT */ + genX(upload_3dstate_so_decl_list)(brw, &brw->vue_map_geom_out); + } + + /* Finally, set up the SOL stage. This command must always follow updates to + * the nonpipelined SOL state (3DSTATE_SO_BUFFER, 3DSTATE_SO_DECL_LIST) or + * MMIO register updates (currently performed by the kernel at each batch + * emit). 
+ */ + genX(upload_3dstate_streamout)(brw, active, &brw->vue_map_geom_out); +} + +static const struct brw_tracked_state genX(sol_state) = { + .dirty = { + .mesa = _NEW_LIGHT, + .brw = BRW_NEW_BATCH | + BRW_NEW_BLORP | + BRW_NEW_RASTERIZER_DISCARD | + BRW_NEW_VUE_MAP_GEOM_OUT | + BRW_NEW_TRANSFORM_FEEDBACK, + }, + .emit = genX(upload_sol), +}; + #endif /* ---------------------------------------------------------------------- */ @@ -1178,7 +1513,7 @@ genX(init_atoms)(struct brw_context *brw) &gen7_te_state, &gen7_ds_state, &gen7_gs_state, - &gen7_sol_state, + &genX(sol_state), &genX(clip_state), &genX(sbe_state), &genX(sf_state), @@ -1265,7 +1600,7 @@ genX(init_atoms)(struct brw_context *brw) &gen7_te_state, &gen8_ds_state, &gen8_gs_state, - &gen7_sol_state, + &genX(sol_state), &genX(clip_state), &genX(raster_state), &genX(sbe_state), -- 2.30.2