diff --git a/src/mesa/drivers/dri/i965/brw_curbe.c b/src/mesa/drivers/dri/i965/brw_curbe.c
index 5da92cc9629..b777bce70c1 100644
--- a/src/mesa/drivers/dri/i965/brw_curbe.c
+++ b/src/mesa/drivers/dri/i965/brw_curbe.c
@@ -1,6 +1,6 @@
 /*
  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ Intel funded Tungsten Graphics to
  develop this 3D driver.
 
  Permission is hereby granted, free of charge, to any person obtaining
@@ -26,12 +26,30 @@
  **********************************************************************/
  /*
   * Authors:
-  *   Keith Whitwell
+  *   Keith Whitwell
   */
 
+/** @file brw_curbe.c
+ *
+ * Push constant handling for gen4/5.
+ *
+ * Push constants are constant values (such as GLSL uniforms) that are
+ * pre-loaded into a shader stage's register space at thread spawn time.  On
+ * gen4 and gen5, we create a blob in memory containing all the push constants
+ * for all the stages in order.  At CMD_CONST_BUFFER time that blob is loaded
+ * into URB space as a constant URB entry (CURBE) so that it can be accessed
+ * quickly at thread setup time.  Each individual fixed function unit's state
+ * (brw_vs_state.c for example) tells the hardware which subset of the CURBE
+ * it wants in its register space, and we calculate those areas here under the
+ * BRW_NEW_CURBE_OFFSETS state flag.  The brw_urb.c allocation will control
+ * how many CURBEs can be loaded into the hardware at once before a pipeline
+ * stall occurs at CMD_CONST_BUFFER time.
+ *
+ * On gen6+, constant handling becomes a much simpler set of per-unit state.
+ * See gen6_upload_vec4_push_constants() in gen6_vs_state.c for that code.
+ */
 
-#include "main/glheader.h"
 #include "main/context.h"
 #include "main/macros.h"
 #include "main/enums.h"
@@ -39,7 +57,7 @@
 #include "program/prog_print.h"
 #include "program/prog_statevars.h"
 #include "intel_batchbuffer.h"
-#include "intel_regions.h"
+#include "intel_buffer_objects.h"
 #include "brw_context.h"
 #include "brw_defines.h"
 #include "brw_state.h"
@@ -47,20 +65,20 @@
 
 /**
- * Partition the CURBE between the various users of constant values:
- * Note that vertex and fragment shaders can now fetch constants out
- * of constant buffers.  We no longer allocatea block of the GRF for
- * constants.  That greatly reduces the demand for space in the CURBE.
- * Some of the comments within are dated...
+ * Partition the CURBE between the various users of constant values.
+ *
+ * If the users all fit within the previous allocatation, we avoid changing
+ * the layout because that means reuploading all unit state and uploading new
+ * constant buffers.
  */
 static void calculate_curbe_offsets( struct brw_context *brw )
 {
    struct gl_context *ctx = &brw->ctx;
-   /* CACHE_NEW_WM_PROG */
-   const GLuint nr_fp_regs = (brw->wm.prog_data->nr_params + 15) / 16;
+   /* BRW_NEW_FS_PROG_DATA */
+   const GLuint nr_fp_regs = (brw->wm.prog_data->base.nr_params + 15) / 16;
 
-   /* BRW_NEW_VERTEX_PROGRAM */
-   const GLuint nr_vp_regs = (brw->vs.prog_data->base.nr_params + 15) / 16;
+   /* BRW_NEW_VS_PROG_DATA */
+   const GLuint nr_vp_regs = (brw->vs.prog_data->base.base.nr_params + 15) / 16;
    GLuint nr_clip_regs = 0;
    GLuint total_regs;
@@ -73,22 +91,15 @@ static void calculate_curbe_offsets( struct brw_context *brw )
 
    total_regs = nr_fp_regs + nr_vp_regs + nr_clip_regs;
 
-   /* This can happen - what to do?  Probably rather than falling
-    * back, the best thing to do is emit programs which code the
-    * constants as immediate values.  Could do this either as a static
-    * cap on WM and VS, or adaptively.
+   /* The CURBE allocation size is limited to 32 512-bit units (128 EU
+    * registers, or 1024 floats).  See CS_URB_STATE in the gen4 or gen5
+    * (volume 1, part 1) PRMs.
     *
-    * Unfortunately, this is currently dependent on the results of the
-    * program generation process (in the case of wm), so this would
-    * introduce the need to re-generate programs in the event of a
-    * curbe allocation failure.
-    */
-   /* Max size is 32 - just large enough to
-    * hold the 128 parameters allowed by
-    * the fragment and vertex program
-    * api's.  It's not clear what happens
-    * when both VP and FP want to use 128
-    * parameters, though.
+    * Note that in brw_fs.cpp we're only loading up to 16 EU registers of
+    * values as push constants before spilling to pull constants, and in
+    * brw_vec4.cpp we're loading up to 32 registers of push constants.  An EU
+    * register is 1/2 of one of these URB entry units, so that leaves us 16 EU
+    * regs for clip.
     */
    assert(total_regs <= 32);
@@ -114,15 +125,15 @@ static void calculate_curbe_offsets( struct brw_context *brw )
       brw->curbe.total_size = reg;
 
       if (0)
-	 printf("curbe wm %d+%d clip %d+%d vs %d+%d\n",
-		brw->curbe.wm_start,
-		brw->curbe.wm_size,
-		brw->curbe.clip_start,
-		brw->curbe.clip_size,
-		brw->curbe.vs_start,
-		brw->curbe.vs_size );
-
-      brw->state.dirty.brw |= BRW_NEW_CURBE_OFFSETS;
+	 fprintf(stderr, "curbe wm %d+%d clip %d+%d vs %d+%d\n",
+		 brw->curbe.wm_start,
+		 brw->curbe.wm_size,
+		 brw->curbe.clip_start,
+		 brw->curbe.clip_size,
+		 brw->curbe.vs_start,
+		 brw->curbe.vs_size );
+
+      brw->ctx.NewDriverState |= BRW_NEW_CURBE_OFFSETS;
    }
 }
@@ -130,8 +141,9 @@ static void calculate_curbe_offsets( struct brw_context *brw )
 const struct brw_tracked_state brw_curbe_offsets = {
    .dirty = {
       .mesa = _NEW_TRANSFORM,
-      .brw = BRW_NEW_VERTEX_PROGRAM | BRW_NEW_CONTEXT,
-      .cache = CACHE_NEW_WM_PROG
+      .brw = BRW_NEW_CONTEXT |
+             BRW_NEW_FS_PROG_DATA |
+             BRW_NEW_VS_PROG_DATA,
    },
    .emit = calculate_curbe_offsets
 };
@@ -139,18 +151,17 @@ const struct brw_tracked_state brw_curbe_offsets = {
 
 
-/* Define the number of curbes within CS's urb allocation.  Multiple
- * urb entries -> multiple curbes.  These will be used by
- * fixed-function hardware in a double-buffering scheme to avoid a
- * pipeline stall each time the contents of the curbe is changed.
+/** Uploads the CS_URB_STATE packet.
+ *
+ * Just like brw_vs_state.c and brw_wm_state.c define a URB entry size and
+ * number of entries for their stages, constant buffers do so using this state
+ * packet.  Having multiple CURBEs in the URB at the same time allows the
+ * hardware to avoid a pipeline stall between primitives using different
+ * constant buffer contents.
  */
 void brw_upload_cs_urb_state(struct brw_context *brw)
 {
    BEGIN_BATCH(2);
-   /* It appears that this is the state packet for the CS unit, ie. the
-    * urb entries detailed here are housed in the CS range from the
-    * URB_FENCE command.
-    */
    OUT_BATCH(CMD_CS_URB_STATE << 16 | (2-2));
 
    /* BRW_NEW_URB_FENCE */
@@ -161,10 +172,10 @@ void brw_upload_cs_urb_state(struct brw_context *brw)
       assert(brw->urb.nr_cs_entries);
       OUT_BATCH((brw->urb.csize - 1) << 4 | brw->urb.nr_cs_entries);
    }
-   CACHED_BATCH();
+   ADVANCE_BATCH();
 }
 
-static GLfloat fixed_plane[6][4] = {
+static const GLfloat fixed_plane[6][4] = {
    { 0, 0, -1, 1 },
    { 0, 0,  1, 1 },
    { 0, -1, 0, 1 },
@@ -173,34 +184,41 @@ static GLfloat fixed_plane[6][4] = {
    { 1, 0, 0, 1 }
 };
 
-/* Upload a new set of constants.  Too much variability to go into the
- * cache mechanism, but maybe would benefit from a comparison against
- * the current uploaded set of constants.
+/**
+ * Gathers together all the uniform values into a block of memory to be
+ * uploaded into the CURBE, then emits the state packet telling the hardware
+ * the new location.
  */
 static void
 brw_upload_constant_buffer(struct brw_context *brw)
 {
    struct gl_context *ctx = &brw->ctx;
+   /* BRW_NEW_CURBE_OFFSETS */
    const GLuint sz = brw->curbe.total_size;
    const GLuint bufsz = sz * 16 * sizeof(GLfloat);
-   GLfloat *buf;
+   gl_constant_value *buf;
    GLuint i;
    gl_clip_plane *clip_planes;
 
    if (sz == 0) {
-      brw->curbe.last_bufsz = 0;
       goto emit;
    }
 
-   buf = brw->curbe.next_buf;
+   buf = intel_upload_space(brw, bufsz, 64,
+                            &brw->curbe.curbe_bo, &brw->curbe.curbe_offset);
+
+   STATIC_ASSERT(sizeof(gl_constant_value) == sizeof(float));
 
    /* fragment shader constants */
    if (brw->curbe.wm_size) {
+      _mesa_load_state_parameters(ctx, brw->fragment_program->Base.Parameters);
+
+      /* BRW_NEW_CURBE_OFFSETS */
       GLuint offset = brw->curbe.wm_start * 16;
 
-      /* copy float constants */
-      for (i = 0; i < brw->wm.prog_data->nr_params; i++) {
-         buf[offset + i] = *brw->wm.prog_data->param[i];
+      /* BRW_NEW_FS_PROG_DATA | _NEW_PROGRAM_CONSTANTS: copy uniform values */
+      for (i = 0; i < brw->wm.prog_data->base.nr_params; i++) {
+         buf[offset + i] = *brw->wm.prog_data->base.param[i];
       }
    }
@@ -212,10 +230,10 @@ brw_upload_constant_buffer(struct brw_context *brw)
       /* If any planes are going this way, send them all this way:
        */
       for (i = 0; i < 6; i++) {
-         buf[offset + i * 4 + 0] = fixed_plane[i][0];
-         buf[offset + i * 4 + 1] = fixed_plane[i][1];
-         buf[offset + i * 4 + 2] = fixed_plane[i][2];
-         buf[offset + i * 4 + 3] = fixed_plane[i][3];
+         buf[offset + i * 4 + 0].f = fixed_plane[i][0];
+         buf[offset + i * 4 + 1].f = fixed_plane[i][1];
+         buf[offset + i * 4 + 2].f = fixed_plane[i][2];
+         buf[offset + i * 4 + 3].f = fixed_plane[i][3];
       }
 
       /* Clip planes: _NEW_TRANSFORM plus _NEW_PROJECTION to get to
@@ -224,10 +242,10 @@
      clip_planes = brw_select_clip_planes(ctx);
      for (j = 0; j < MAX_CLIP_PLANES; j++) {
         if (ctx->Transform.ClipPlanesEnabled & (1<<j)) {
-           buf[offset + i * 4 + 0] = clip_planes[j][0];
-           buf[offset + i * 4 + 1] = clip_planes[j][1];
-           buf[offset + i * 4 + 2] = clip_planes[j][2];
-           buf[offset + i * 4 + 3] = clip_planes[j][3];
+           buf[offset + i * 4 + 0].f = clip_planes[j][0];
+           buf[offset + i * 4 + 1].f = clip_planes[j][1];
+           buf[offset + i * 4 + 2].f = clip_planes[j][2];
+           buf[offset + i * 4 + 3].f = clip_planes[j][3];
           i++;
        }
     }
  }
 
   /* vertex shader constants */
   if (brw->curbe.vs_size) {
+      _mesa_load_state_parameters(ctx, brw->vertex_program->Base.Parameters);
+
      GLuint offset = brw->curbe.vs_start * 16;
 
-      for (i = 0; i < brw->vs.prog_data->base.nr_params; i++) {
-         buf[offset + i] = *brw->vs.prog_data->base.param[i];
+      /* BRW_NEW_VS_PROG_DATA | _NEW_PROGRAM_CONSTANTS: copy uniform values */
+      for (i = 0; i < brw->vs.prog_data->base.base.nr_params; i++) {
+         buf[offset + i] = *brw->vs.prog_data->base.base.param[i];
      }
   }
 
   if (0) {
      for (i = 0; i < sz*16; i+=4)
-         printf("curbe %d.%d: %f %f %f %f\n", i/8, i&4,
-                buf[i+0], buf[i+1], buf[i+2], buf[i+3]);
-
-      printf("last_buf %p buf %p sz %d/%d cmp %d\n",
-             brw->curbe.last_buf, buf,
-             bufsz, brw->curbe.last_bufsz,
-             brw->curbe.last_buf ? memcmp(buf, brw->curbe.last_buf, bufsz) : -1);
-   }
-
-   if (brw->curbe.curbe_bo != NULL &&
-       bufsz == brw->curbe.last_bufsz &&
-       memcmp(buf, brw->curbe.last_buf, bufsz) == 0) {
-      /* constants have not changed */
-   } else {
-      /* Update the record of what our last set of constants was.  We
-       * don't just flip the pointers because we don't fill in the
-       * data in the padding between the entries.
-       */
-      memcpy(brw->curbe.last_buf, buf, bufsz);
-      brw->curbe.last_bufsz = bufsz;
-
-      if (brw->curbe.curbe_bo != NULL &&
-          brw->curbe.curbe_next_offset + bufsz > brw->curbe.curbe_bo->size)
-      {
-         drm_intel_gem_bo_unmap_gtt(brw->curbe.curbe_bo);
-         drm_intel_bo_unreference(brw->curbe.curbe_bo);
-         brw->curbe.curbe_bo = NULL;
-      }
-
-      if (brw->curbe.curbe_bo == NULL) {
-         /* Allocate a single page for CURBE entries for this batchbuffer.
-          * They're generally around 64b.
-          */
-         brw->curbe.curbe_bo = drm_intel_bo_alloc(brw->bufmgr, "CURBE",
-                                                  4096, 1 << 6);
-         brw->curbe.curbe_next_offset = 0;
-         drm_intel_gem_bo_map_gtt(brw->curbe.curbe_bo);
-         assert(bufsz < 4096);
-      }
-
-      brw->curbe.curbe_offset = brw->curbe.curbe_next_offset;
-      brw->curbe.curbe_next_offset += bufsz;
-      brw->curbe.curbe_next_offset = ALIGN(brw->curbe.curbe_next_offset, 64);
-
-      /* Copy data to the buffer:
-       */
-      memcpy(brw->curbe.curbe_bo->virtual + brw->curbe.curbe_offset,
-             buf,
-             bufsz);
+         fprintf(stderr, "curbe %d.%d: %f %f %f %f\n", i/8, i&4,
+                 buf[i+0].f, buf[i+1].f, buf[i+2].f, buf[i+3].f);
   }
 
   /* Because this provokes an action (ie copy the constants into the
@@ -310,6 +284,14 @@ brw_upload_constant_buffer(struct brw_context *brw)
    */
 emit:
+   /* BRW_NEW_URB_FENCE: From the gen4 PRM, volume 1, section 3.9.8
+    * (CONSTANT_BUFFER (CURBE Load)):
+    *
+    *     "Modifying the CS URB allocation via URB_FENCE invalidates any
+    *     previous CURBE entries. Therefore software must subsequently
+    *     [re]issue a CONSTANT_BUFFER command before CURBE data can be used
+    *     in the pipeline."
+    */
    BEGIN_BATCH(2);
    if (brw->curbe.total_size == 0) {
       OUT_BATCH((CMD_CONST_BUFFER << 16) | (2 - 2));
@@ -321,18 +303,43 @@ emit:
                (brw->curbe.total_size - 1) + brw->curbe.curbe_offset);
    }
    ADVANCE_BATCH();
+
+   /* Work around a Broadwater/Crestline depth interpolator bug.  The
+    * following sequence will cause GPU hangs:
+    *
+    * 1. Change state so that all depth related fields in CC_STATE are
+    *    disabled, and in WM_STATE, only "PS Use Source Depth" is enabled.
+    * 2. Emit a CONSTANT_BUFFER packet.
+    * 3. Draw via 3DPRIMITIVE.
+    *
+    * The recommended workaround is to emit a non-pipelined state change after
+    * emitting CONSTANT_BUFFER, in order to drain the windowizer pipeline.
+    *
+    * We arbitrarily choose 3DSTATE_GLOBAL_DEPTH_CLAMP_OFFSET (as it's small),
+    * and always emit it when "PS Use Source Depth" is set.  We could be more
+    * precise, but the additional complexity is probably not worth it.
+    *
+    * BRW_NEW_FRAGMENT_PROGRAM
+    */
+   if (brw->gen == 4 && !brw->is_g4x &&
+       (brw->fragment_program->Base.InputsRead & (1 << VARYING_SLOT_POS))) {
+      BEGIN_BATCH(2);
+      OUT_BATCH(_3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP << 16 | (2 - 2));
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+   }
 }
 
 const struct brw_tracked_state brw_constant_buffer = {
    .dirty = {
       .mesa = _NEW_PROGRAM_CONSTANTS,
-      .brw = (BRW_NEW_FRAGMENT_PROGRAM |
-              BRW_NEW_VERTEX_PROGRAM |
-              BRW_NEW_URB_FENCE | /* Implicit - hardware requires this, not used above */
-              BRW_NEW_PSP | /* Implicit - hardware requires this, not used above */
-              BRW_NEW_CURBE_OFFSETS |
-              BRW_NEW_BATCH),
-      .cache = (CACHE_NEW_WM_PROG)
+      .brw = BRW_NEW_BATCH |
+             BRW_NEW_CURBE_OFFSETS |
+             BRW_NEW_FRAGMENT_PROGRAM |
+             BRW_NEW_FS_PROG_DATA |
+             BRW_NEW_PSP | /* Implicit - hardware requires this, not used above */
+             BRW_NEW_URB_FENCE |
+             BRW_NEW_VS_PROG_DATA,
    },
    .emit = brw_upload_constant_buffer,
 };
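Editor's illustration, not part of the commit above: the following standalone sketch only shows the arithmetic that calculate_curbe_offsets() performs. Each CURBE unit is 512 bits (16 floats), so a stage needing N uniform floats is charged (N + 15) / 16 units, and the total is capped at 32 units (1024 floats) per the new comment. The helper name curbe_units() and the example parameter counts are made up for this sketch.

#include <assert.h>
#include <stdio.h>

/* Round a float count up to whole 512-bit CURBE units, mirroring the
 * (nr_params + 15) / 16 expressions in the diff above. */
static unsigned curbe_units(unsigned nr_params)
{
   return (nr_params + 15) / 16;
}

int main(void)
{
   unsigned nr_fp_regs = curbe_units(20);   /* e.g. 20 fragment uniform floats -> 2 units */
   unsigned nr_vp_regs = curbe_units(6);    /* e.g. 6 vertex uniform floats    -> 1 unit  */
   unsigned total_regs = nr_fp_regs + nr_vp_regs;

   assert(total_regs <= 32);                /* mirrors assert(total_regs <= 32) above */
   printf("wm %u + vs %u = %u of 32 CURBE units\n",
          nr_fp_regs, nr_vp_regs, total_regs);
   return 0;
}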
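A second illustrative sketch, again not taken from the commit: brw_upload_constant_buffer() fills the CURBE blob by walking prog_data's param[] table, where each entry points at the current value of one uniform component, and copying those values into the upload buffer (buf[offset + i] = *param[i]). The constant_value union and gather_params() below are hypothetical stand-ins for gl_constant_value and that inner copy loop.

#include <stdio.h>

/* Stand-in for gl_constant_value: one 32-bit uniform component. */
typedef union { float f; int i; unsigned u; } constant_value;

/* Flatten a table of pointers-to-uniform-values into a contiguous block,
 * the same shape as the wm/vs copy loops in the diff above. */
static void gather_params(constant_value *dst, const constant_value **param,
                          unsigned nr_params)
{
   for (unsigned i = 0; i < nr_params; i++)
      dst[i] = *param[i];
}

int main(void)
{
   constant_value uniform_a = { .f = 1.0f };
   constant_value uniform_b = { .f = 0.5f };
   const constant_value *params[] = { &uniform_a, &uniform_b };
   constant_value curbe[2];

   gather_params(curbe, params, 2);
   printf("%f %f\n", curbe[0].f, curbe[1].f);
   return 0;
}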