X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fmesa%2Fdrivers%2Fdri%2Fi965%2Fgen7_sol_state.c;h=17752742d465eb8499e6797cf61b89ebcab83176;hb=93d2b5c57632f5cc57e71511bc6e33f8474e40fd;hp=1e484dc1bcf72e8468405c808c5fa1cce7d4e87b;hpb=d1e4e9960cbdfce6078cdc377809ea76c2eb7078;p=mesa.git diff --git a/src/mesa/drivers/dri/i965/gen7_sol_state.c b/src/mesa/drivers/dri/i965/gen7_sol_state.c index 1e484dc1bcf..17752742d46 100644 --- a/src/mesa/drivers/dri/i965/gen7_sol_state.c +++ b/src/mesa/drivers/dri/i965/gen7_sol_state.c @@ -38,16 +38,12 @@ static void upload_3dstate_so_buffers(struct brw_context *brw) { - struct intel_context *intel = &brw->intel; - struct gl_context *ctx = &intel->ctx; - /* BRW_NEW_VERTEX_PROGRAM */ - const struct gl_shader_program *vs_prog = - ctx->Shader.CurrentVertexProgram; - const struct gl_transform_feedback_info *linked_xfb_info = - &vs_prog->LinkedTransformFeedback; + struct gl_context *ctx = &brw->ctx; /* BRW_NEW_TRANSFORM_FEEDBACK */ struct gl_transform_feedback_object *xfb_obj = ctx->TransformFeedback.CurrentObject; + const struct gl_transform_feedback_info *linked_xfb_info = + &xfb_obj->shader_program->LinkedTransformFeedback; int i; /* Set up the up to 4 output buffers. These are the ranges defined in the @@ -74,23 +70,14 @@ upload_3dstate_so_buffers(struct brw_context *brw) continue; } - bo = intel_bufferobj_buffer(intel, bufferobj, INTEL_WRITE_PART); - stride = linked_xfb_info->BufferStride[i] * 4; + stride = linked_xfb_info->Buffers[i].Stride * 4; start = xfb_obj->Offset[i]; assert(start % 4 == 0); end = ALIGN(start + xfb_obj->Size[i], 4); + bo = intel_bufferobj_buffer(brw, bufferobj, start, end - start); assert(end <= bo->size); - /* If we don't have hardware contexts, then we reset our offsets at the - * start of every batch, so we track the number of vertices written in - * software and increment our pointers by that many. - */ - if (!intel->hw_ctx) { - start += brw->sol.offset_0_batch_start * stride; - assert(start <= end); - } - BEGIN_BATCH(4); OUT_BATCH(_3DSTATE_SO_BUFFER << 16 | (4 - 2)); OUT_BATCH((i << SO_BUFFER_INDEX_SHIFT) | stride); @@ -107,92 +94,143 @@ upload_3dstate_so_buffers(struct brw_context *brw) * stream. We only have one stream of rendering coming out of the GS unit, so * we only emit stream 0 (low 16 bits) SO_DECLs. */ -static void -upload_3dstate_so_decl_list(struct brw_context *brw, - const struct brw_vue_map *vue_map) +void +gen7_upload_3dstate_so_decl_list(struct brw_context *brw, + const struct brw_vue_map *vue_map) { - struct intel_context *intel = &brw->intel; - struct gl_context *ctx = &intel->ctx; - /* BRW_NEW_VERTEX_PROGRAM */ - const struct gl_shader_program *vs_prog = - ctx->Shader.CurrentVertexProgram; + struct gl_context *ctx = &brw->ctx; /* BRW_NEW_TRANSFORM_FEEDBACK */ + struct gl_transform_feedback_object *xfb_obj = + ctx->TransformFeedback.CurrentObject; const struct gl_transform_feedback_info *linked_xfb_info = - &vs_prog->LinkedTransformFeedback; - int i; - uint16_t so_decl[128]; - int buffer_mask = 0; - int next_offset[4] = {0, 0, 0, 0}; + &xfb_obj->shader_program->LinkedTransformFeedback; + uint16_t so_decl[MAX_VERTEX_STREAMS][128]; + int buffer_mask[MAX_VERTEX_STREAMS] = {0, 0, 0, 0}; + int next_offset[MAX_VERTEX_STREAMS] = {0, 0, 0, 0}; + int decls[MAX_VERTEX_STREAMS] = {0, 0, 0, 0}; + int max_decls = 0; + STATIC_ASSERT(ARRAY_SIZE(so_decl[0]) >= MAX_PROGRAM_OUTPUTS); - STATIC_ASSERT(ARRAY_SIZE(so_decl) >= MAX_PROGRAM_OUTPUTS); + memset(so_decl, 0, sizeof(so_decl)); /* Construct the list of SO_DECLs to be emitted. The formatting of the * command is feels strange -- each dword pair contains a SO_DECL per stream. */ - for (i = 0; i < linked_xfb_info->NumOutputs; i++) { + for (unsigned i = 0; i < linked_xfb_info->NumOutputs; i++) { int buffer = linked_xfb_info->Outputs[i].OutputBuffer; uint16_t decl = 0; int varying = linked_xfb_info->Outputs[i].OutputRegister; - unsigned component_mask = - (1 << linked_xfb_info->Outputs[i].NumComponents) - 1; - - /* gl_PointSize is stored in VARYING_SLOT_PSIZ.w. */ + const unsigned components = linked_xfb_info->Outputs[i].NumComponents; + unsigned component_mask = (1 << components) - 1; + unsigned stream_id = linked_xfb_info->Outputs[i].StreamId; + unsigned decl_buffer_slot = buffer << SO_DECL_OUTPUT_BUFFER_SLOT_SHIFT; + assert(stream_id < MAX_VERTEX_STREAMS); + + /* gl_PointSize is stored in VARYING_SLOT_PSIZ.w + * gl_Layer is stored in VARYING_SLOT_PSIZ.y + * gl_ViewportIndex is stored in VARYING_SLOT_PSIZ.z + */ if (varying == VARYING_SLOT_PSIZ) { - assert(linked_xfb_info->Outputs[i].NumComponents == 1); + assert(components == 1); component_mask <<= 3; + } else if (varying == VARYING_SLOT_LAYER) { + assert(components == 1); + component_mask <<= 1; + } else if (varying == VARYING_SLOT_VIEWPORT) { + assert(components == 1); + component_mask <<= 2; } else { component_mask <<= linked_xfb_info->Outputs[i].ComponentOffset; } - buffer_mask |= 1 << buffer; + buffer_mask[stream_id] |= 1 << buffer; - decl |= buffer << SO_DECL_OUTPUT_BUFFER_SLOT_SHIFT; - decl |= vue_map->varying_to_slot[varying] << - SO_DECL_REGISTER_INDEX_SHIFT; + decl |= decl_buffer_slot; + if (varying == VARYING_SLOT_LAYER || varying == VARYING_SLOT_VIEWPORT) { + decl |= vue_map->varying_to_slot[VARYING_SLOT_PSIZ] << + SO_DECL_REGISTER_INDEX_SHIFT; + } else { + assert(vue_map->varying_to_slot[varying] >= 0); + decl |= vue_map->varying_to_slot[varying] << + SO_DECL_REGISTER_INDEX_SHIFT; + } decl |= component_mask << SO_DECL_COMPONENT_MASK_SHIFT; - /* This assert should be true until GL_ARB_transform_feedback_instanced - * is added and we start using the hole flag. + /* Mesa doesn't store entries for gl_SkipComponents in the Outputs[] + * array. Instead, it simply increments DstOffset for the following + * input by the number of components that should be skipped. + * + * Our hardware is unusual in that it requires us to program SO_DECLs + * for fake "hole" components, rather than simply taking the offset + * for each real varying. Each hole can have size 1, 2, 3, or 4; we + * program as many size = 4 holes as we can, then a final hole to + * accommodate the final 1, 2, or 3 remaining. */ + int skip_components = + linked_xfb_info->Outputs[i].DstOffset - next_offset[buffer]; + + next_offset[buffer] += skip_components; + + while (skip_components >= 4) { + so_decl[stream_id][decls[stream_id]++] = + SO_DECL_HOLE_FLAG | 0xf | decl_buffer_slot; + skip_components -= 4; + } + if (skip_components > 0) + so_decl[stream_id][decls[stream_id]++] = + SO_DECL_HOLE_FLAG | ((1 << skip_components) - 1) | + decl_buffer_slot; + assert(linked_xfb_info->Outputs[i].DstOffset == next_offset[buffer]); - next_offset[buffer] += linked_xfb_info->Outputs[i].NumComponents; + next_offset[buffer] += components; - so_decl[i] = decl; + so_decl[stream_id][decls[stream_id]++] = decl; + + if (decls[stream_id] > max_decls) + max_decls = decls[stream_id]; } - BEGIN_BATCH(linked_xfb_info->NumOutputs * 2 + 3); - OUT_BATCH(_3DSTATE_SO_DECL_LIST << 16 | - (linked_xfb_info->NumOutputs * 2 + 1)); + BEGIN_BATCH(max_decls * 2 + 3); + OUT_BATCH(_3DSTATE_SO_DECL_LIST << 16 | (max_decls * 2 + 1)); - OUT_BATCH((buffer_mask << SO_STREAM_TO_BUFFER_SELECTS_0_SHIFT) | - (0 << SO_STREAM_TO_BUFFER_SELECTS_1_SHIFT) | - (0 << SO_STREAM_TO_BUFFER_SELECTS_2_SHIFT) | - (0 << SO_STREAM_TO_BUFFER_SELECTS_3_SHIFT)); + OUT_BATCH((buffer_mask[0] << SO_STREAM_TO_BUFFER_SELECTS_0_SHIFT) | + (buffer_mask[1] << SO_STREAM_TO_BUFFER_SELECTS_1_SHIFT) | + (buffer_mask[2] << SO_STREAM_TO_BUFFER_SELECTS_2_SHIFT) | + (buffer_mask[3] << SO_STREAM_TO_BUFFER_SELECTS_3_SHIFT)); - OUT_BATCH((linked_xfb_info->NumOutputs << SO_NUM_ENTRIES_0_SHIFT) | - (0 << SO_NUM_ENTRIES_1_SHIFT) | - (0 << SO_NUM_ENTRIES_2_SHIFT) | - (0 << SO_NUM_ENTRIES_3_SHIFT)); + OUT_BATCH((decls[0] << SO_NUM_ENTRIES_0_SHIFT) | + (decls[1] << SO_NUM_ENTRIES_1_SHIFT) | + (decls[2] << SO_NUM_ENTRIES_2_SHIFT) | + (decls[3] << SO_NUM_ENTRIES_3_SHIFT)); - for (i = 0; i < linked_xfb_info->NumOutputs; i++) { - OUT_BATCH(so_decl[i]); - OUT_BATCH(0); + for (int i = 0; i < max_decls; i++) { + /* Stream 1 | Stream 0 */ + OUT_BATCH(((uint32_t) so_decl[1][i]) << 16 | so_decl[0][i]); + /* Stream 3 | Stream 2 */ + OUT_BATCH(((uint32_t) so_decl[3][i]) << 16 | so_decl[2][i]); } ADVANCE_BATCH(); } +static bool +query_active(struct gl_query_object *q) +{ + return q && q->Active; +} + static void upload_3dstate_streamout(struct brw_context *brw, bool active, const struct brw_vue_map *vue_map) { - struct intel_context *intel = &brw->intel; - struct gl_context *ctx = &intel->ctx; + struct gl_context *ctx = &brw->ctx; /* BRW_NEW_TRANSFORM_FEEDBACK */ struct gl_transform_feedback_object *xfb_obj = ctx->TransformFeedback.CurrentObject; - uint32_t dw1 = 0, dw2 = 0; + const struct gl_transform_feedback_info *linked_xfb_info = + &xfb_obj->shader_program->LinkedTransformFeedback; + uint32_t dw1 = 0, dw2 = 0, dw3 = 0, dw4 = 0; int i; if (active) { @@ -203,50 +241,85 @@ upload_3dstate_streamout(struct brw_context *brw, bool active, dw1 |= SO_FUNCTION_ENABLE; dw1 |= SO_STATISTICS_ENABLE; + /* BRW_NEW_RASTERIZER_DISCARD */ + if (ctx->RasterDiscard) { + if (!query_active(ctx->Query.PrimitivesGenerated[0])) { + dw1 |= SO_RENDERING_DISABLE; + } else { + perf_debug("Rasterizer discard with a GL_PRIMITIVES_GENERATED " + "query active relies on the clipper."); + } + } + /* _NEW_LIGHT */ if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION) dw1 |= SO_REORDER_TRAILING; - for (i = 0; i < 4; i++) { - if (xfb_obj->Buffers[i]) { - dw1 |= SO_BUFFER_ENABLE(i); - } + if (brw->gen < 8) { + for (i = 0; i < 4; i++) { + if (xfb_obj->Buffers[i]) { + dw1 |= SO_BUFFER_ENABLE(i); + } + } } /* We always read the whole vertex. This could be reduced at some * point by reading less and offsetting the register index in the * SO_DECLs. */ - dw2 |= urb_entry_read_offset << SO_STREAM_0_VERTEX_READ_OFFSET_SHIFT; - dw2 |= (urb_entry_read_length - 1) << - SO_STREAM_0_VERTEX_READ_LENGTH_SHIFT; + dw2 |= SET_FIELD(urb_entry_read_offset, SO_STREAM_0_VERTEX_READ_OFFSET); + dw2 |= SET_FIELD(urb_entry_read_length - 1, SO_STREAM_0_VERTEX_READ_LENGTH); + + dw2 |= SET_FIELD(urb_entry_read_offset, SO_STREAM_1_VERTEX_READ_OFFSET); + dw2 |= SET_FIELD(urb_entry_read_length - 1, SO_STREAM_1_VERTEX_READ_LENGTH); + + dw2 |= SET_FIELD(urb_entry_read_offset, SO_STREAM_2_VERTEX_READ_OFFSET); + dw2 |= SET_FIELD(urb_entry_read_length - 1, SO_STREAM_2_VERTEX_READ_LENGTH); + + dw2 |= SET_FIELD(urb_entry_read_offset, SO_STREAM_3_VERTEX_READ_OFFSET); + dw2 |= SET_FIELD(urb_entry_read_length - 1, SO_STREAM_3_VERTEX_READ_LENGTH); + + if (brw->gen >= 8) { + /* Set buffer pitches; 0 means unbound. */ + if (xfb_obj->Buffers[0]) + dw3 |= linked_xfb_info->Buffers[0].Stride * 4; + if (xfb_obj->Buffers[1]) + dw3 |= (linked_xfb_info->Buffers[1].Stride * 4) << 16; + if (xfb_obj->Buffers[2]) + dw4 |= linked_xfb_info->Buffers[2].Stride * 4; + if (xfb_obj->Buffers[3]) + dw4 |= (linked_xfb_info->Buffers[3].Stride * 4) << 16; + } } - BEGIN_BATCH(3); - OUT_BATCH(_3DSTATE_STREAMOUT << 16 | (3 - 2)); + const int dwords = brw->gen >= 8 ? 5 : 3; + + BEGIN_BATCH(dwords); + OUT_BATCH(_3DSTATE_STREAMOUT << 16 | (dwords - 2)); OUT_BATCH(dw1); OUT_BATCH(dw2); + if (dwords > 3) { + OUT_BATCH(dw3); + OUT_BATCH(dw4); + } ADVANCE_BATCH(); } static void upload_sol_state(struct brw_context *brw) { - struct intel_context *intel = &brw->intel; - struct gl_context *ctx = &intel->ctx; + struct gl_context *ctx = &brw->ctx; /* BRW_NEW_TRANSFORM_FEEDBACK */ bool active = _mesa_is_xfb_active_and_unpaused(ctx); if (active) { - upload_3dstate_so_buffers(brw); - /* BRW_NEW_VUE_MAP_GEOM_OUT */ - upload_3dstate_so_decl_list(brw, &brw->vue_map_geom_out); + if (brw->gen >= 8) + gen8_upload_3dstate_so_buffers(brw); + else + upload_3dstate_so_buffers(brw); - /* If we don't have hardware contexts, then some other client may have - * changed the SO write offsets, and we need to rewrite them. - */ - if (!intel->hw_ctx) - intel->batch.needs_sol_reset = true; + /* BRW_NEW_VUE_MAP_GEOM_OUT */ + gen7_upload_3dstate_so_decl_list(brw, &brw->vue_map_geom_out); } /* Finally, set up the SOL stage. This command must always follow updates to @@ -259,27 +332,268 @@ upload_sol_state(struct brw_context *brw) const struct brw_tracked_state gen7_sol_state = { .dirty = { - .mesa = (_NEW_LIGHT), - .brw = (BRW_NEW_BATCH | - BRW_NEW_VERTEX_PROGRAM | - BRW_NEW_VUE_MAP_GEOM_OUT | - BRW_NEW_TRANSFORM_FEEDBACK) + .mesa = _NEW_LIGHT, + .brw = BRW_NEW_BATCH | + BRW_NEW_BLORP | + BRW_NEW_RASTERIZER_DISCARD | + BRW_NEW_VUE_MAP_GEOM_OUT | + BRW_NEW_TRANSFORM_FEEDBACK, }, .emit = upload_sol_state, }; +/** + * Tally the number of primitives generated so far. + * + * The buffer contains a series of pairs: + * (, ) ; + * (, ) ; + * + * For each stream, we subtract the pair of values (end - start) to get the + * number of primitives generated during one section. We accumulate these + * values, adding them up to get the total number of primitives generated. + */ +static void +gen7_tally_prims_generated(struct brw_context *brw, + struct brw_transform_feedback_object *obj) +{ + /* If the current batch is still contributing to the number of primitives + * generated, flush it now so the results will be present when mapped. + */ + if (drm_intel_bo_references(brw->batch.bo, obj->prim_count_bo)) + intel_batchbuffer_flush(brw); + + if (unlikely(brw->perf_debug && drm_intel_bo_busy(obj->prim_count_bo))) + perf_debug("Stalling for # of transform feedback primitives written.\n"); + + drm_intel_bo_map(obj->prim_count_bo, false); + uint64_t *prim_counts = obj->prim_count_bo->virtual; + + assert(obj->prim_count_buffer_index % (2 * BRW_MAX_XFB_STREAMS) == 0); + int pairs = obj->prim_count_buffer_index / (2 * BRW_MAX_XFB_STREAMS); + + for (int i = 0; i < pairs; i++) { + for (int s = 0; s < BRW_MAX_XFB_STREAMS; s++) { + obj->prims_generated[s] += + prim_counts[BRW_MAX_XFB_STREAMS + s] - prim_counts[s]; + } + prim_counts += 2 * BRW_MAX_XFB_STREAMS; /* move to the next pair */ + } + + drm_intel_bo_unmap(obj->prim_count_bo); + + /* We've already gathered up the old data; we can safely overwrite it now. */ + obj->prim_count_buffer_index = 0; +} + +/** + * Store the SO_NUM_PRIMS_WRITTEN counters for each stream (4 uint64_t values) + * to prim_count_bo. + * + * If prim_count_bo is out of space, gather up the results so far into + * prims_generated[] and allocate a new buffer with enough space. + * + * The number of primitives written is used to compute the number of vertices + * written to a transform feedback stream, which is required to implement + * DrawTransformFeedback(). + */ +static void +gen7_save_primitives_written_counters(struct brw_context *brw, + struct brw_transform_feedback_object *obj) +{ + const int streams = BRW_MAX_XFB_STREAMS; + + /* Check if there's enough space for a new pair of four values. */ + if (obj->prim_count_bo != NULL && + obj->prim_count_buffer_index + 2 * streams >= 4096 / sizeof(uint64_t)) { + /* Gather up the results so far and release the BO. */ + gen7_tally_prims_generated(brw, obj); + } + + /* Flush any drawing so that the counters have the right values. */ + brw_emit_mi_flush(brw); + + /* Emit MI_STORE_REGISTER_MEM commands to write the values. */ + for (int i = 0; i < streams; i++) { + int offset = (obj->prim_count_buffer_index + i) * sizeof(uint64_t); + brw_store_register_mem64(brw, obj->prim_count_bo, + GEN7_SO_NUM_PRIMS_WRITTEN(i), + offset); + } + + /* Update where to write data to. */ + obj->prim_count_buffer_index += streams; +} + +/** + * Compute the number of vertices written by this transform feedback operation. + */ +static void +brw_compute_xfb_vertices_written(struct brw_context *brw, + struct brw_transform_feedback_object *obj) +{ + if (obj->vertices_written_valid || !obj->base.EndedAnytime) + return; + + unsigned vertices_per_prim = 0; + + switch (obj->primitive_mode) { + case GL_POINTS: + vertices_per_prim = 1; + break; + case GL_LINES: + vertices_per_prim = 2; + break; + case GL_TRIANGLES: + vertices_per_prim = 3; + break; + default: + unreachable("Invalid transform feedback primitive mode."); + } + + /* Get the number of primitives generated. */ + gen7_tally_prims_generated(brw, obj); + + for (int i = 0; i < BRW_MAX_XFB_STREAMS; i++) { + obj->vertices_written[i] = vertices_per_prim * obj->prims_generated[i]; + } + obj->vertices_written_valid = true; +} + +/** + * GetTransformFeedbackVertexCount() driver hook. + * + * Returns the number of vertices written to a particular stream by the last + * Begin/EndTransformFeedback block. Used to implement DrawTransformFeedback(). + */ +GLsizei +brw_get_transform_feedback_vertex_count(struct gl_context *ctx, + struct gl_transform_feedback_object *obj, + GLuint stream) +{ + struct brw_context *brw = brw_context(ctx); + struct brw_transform_feedback_object *brw_obj = + (struct brw_transform_feedback_object *) obj; + + assert(obj->EndedAnytime); + assert(stream < BRW_MAX_XFB_STREAMS); + + brw_compute_xfb_vertices_written(brw, brw_obj); + return brw_obj->vertices_written[stream]; +} + +void +gen7_begin_transform_feedback(struct gl_context *ctx, GLenum mode, + struct gl_transform_feedback_object *obj) +{ + struct brw_context *brw = brw_context(ctx); + struct brw_transform_feedback_object *brw_obj = + (struct brw_transform_feedback_object *) obj; + + /* Reset the SO buffer offsets to 0. */ + if (brw->gen >= 8) { + brw_obj->zero_offsets = true; + } else { + intel_batchbuffer_flush(brw); + brw->batch.needs_sol_reset = true; + } + + /* We're about to lose the information needed to compute the number of + * vertices written during the last Begin/EndTransformFeedback section, + * so we can't delay it any further. + */ + brw_compute_xfb_vertices_written(brw, brw_obj); + + /* No primitives have been generated yet. */ + for (int i = 0; i < BRW_MAX_XFB_STREAMS; i++) { + brw_obj->prims_generated[i] = 0; + } + + /* Store the starting value of the SO_NUM_PRIMS_WRITTEN counters. */ + gen7_save_primitives_written_counters(brw, brw_obj); + + brw_obj->primitive_mode = mode; +} + void gen7_end_transform_feedback(struct gl_context *ctx, struct gl_transform_feedback_object *obj) { - /* Because we have to rely on the kernel to reset our SO write offsets, and - * we only get to do it once per batchbuffer, flush the batch after feedback - * so another transform feedback can get the write offset reset it needs. - * - * This also covers any cache flushing required. + /* After EndTransformFeedback, it's likely that the client program will try + * to draw using the contents of the transform feedback buffer as vertex + * input. In order for this to work, we need to flush the data through at + * least the GS stage of the pipeline, and flush out the render cache. For + * simplicity, just do a full flush. + */ + struct brw_context *brw = brw_context(ctx); + struct brw_transform_feedback_object *brw_obj = + (struct brw_transform_feedback_object *) obj; + + /* Store the ending value of the SO_NUM_PRIMS_WRITTEN counters. */ + if (!obj->Paused) + gen7_save_primitives_written_counters(brw, brw_obj); + + /* EndTransformFeedback() means that we need to update the number of + * vertices written. Since it's only necessary if DrawTransformFeedback() + * is called and it means mapping a buffer object, we delay computing it + * until it's absolutely necessary to try and avoid stalls. + */ + brw_obj->vertices_written_valid = false; +} + +void +gen7_pause_transform_feedback(struct gl_context *ctx, + struct gl_transform_feedback_object *obj) +{ + struct brw_context *brw = brw_context(ctx); + struct brw_transform_feedback_object *brw_obj = + (struct brw_transform_feedback_object *) obj; + + /* Flush any drawing so that the counters have the right values. */ + brw_emit_mi_flush(brw); + + /* Save the SOL buffer offset register values. */ + if (brw->gen < 8) { + for (int i = 0; i < 4; i++) { + BEGIN_BATCH(3); + OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2)); + OUT_BATCH(GEN7_SO_WRITE_OFFSET(i)); + OUT_RELOC(brw_obj->offset_bo, + I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, + i * sizeof(uint32_t)); + ADVANCE_BATCH(); + } + } + + /* Store the temporary ending value of the SO_NUM_PRIMS_WRITTEN counters. + * While this operation is paused, other transform feedback actions may + * occur, which will contribute to the counters. We need to exclude that + * from our counts. */ + gen7_save_primitives_written_counters(brw, brw_obj); +} + +void +gen7_resume_transform_feedback(struct gl_context *ctx, + struct gl_transform_feedback_object *obj) +{ struct brw_context *brw = brw_context(ctx); - struct intel_context *intel = &brw->intel; + struct brw_transform_feedback_object *brw_obj = + (struct brw_transform_feedback_object *) obj; + + /* Reload the SOL buffer offset registers. */ + if (brw->gen < 8) { + for (int i = 0; i < 4; i++) { + BEGIN_BATCH(3); + OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (3 - 2)); + OUT_BATCH(GEN7_SO_WRITE_OFFSET(i)); + OUT_RELOC(brw_obj->offset_bo, + I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, + i * sizeof(uint32_t)); + ADVANCE_BATCH(); + } + } - intel_batchbuffer_flush(intel); + /* Store the new starting value of the SO_NUM_PRIMS_WRITTEN counters. */ + gen7_save_primitives_written_counters(brw, brw_obj); }