From: Ian Romanick Date: Mon, 10 Nov 2014 14:06:47 +0000 (-0800) Subject: i965: Micro-optimize brw_get_index_type X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=6ed53c27efc1b3eeb987b84b997633bc9456e5aa;p=mesa.git i965: Micro-optimize brw_get_index_type With the switch-statement, GCC 4.8.3 produces a small pile of code with a branch. 00000000 : 000000: 8b 54 24 04 mov 0x4(%esp),%edx 000004: b8 01 00 00 00 mov $0x1,%eax 000009: 81 fa 03 14 00 00 cmp $0x1403,%edx 00000f: 74 0d je 00001e 000011: 31 c0 xor %eax,%eax 000013: 81 fa 05 14 00 00 cmp $0x1405,%edx 000019: 0f 94 c0 sete %al 00001c: 01 c0 add %eax,%eax 00001e: c3 ret However, this could be two instructions. 00000000 : 000000: 2d 01 14 00 00 sub $0x1401,%eax 000005: d1 e8 shr %eax 000007: 90 nop 000008: 90 nop 000009: 90 nop 00000a: 90 nop 00000b: c3 ret The function was also moved to the header so that it could be inlined at the two call sites. Without this, 32-bit also needs to pull the parameter from the stack. This means there is a push, a call, a move, and a ret added to a two instruction function. The above code shows the function with __attribute__((regparm=1)), but even this adds several extra instructions. There is also an extra instruction on 64-bit to move the parameter to %eax for the subtract. On Bay Trail-D using Fedora 20 compile flags (-m64 -O2 -mtune=generic for 64-bit and -m32 -march=i686 -mtune=atom for 32-bit), affects Gl32Batch7: 32-bit: Difference at 95.0% confidence 0.818589% +/- 0.234661% (n=40) 64-bit: Difference at 95.0% confidence 0.54554% +/- 0.354092% (n=40) Signed-off-by: Ian Romanick Reviewed-by: Kenneth Graunke Reviewed-by: Matt Turner --- diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index c1787a8cf7f..0e7bd21b019 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -1601,7 +1601,27 @@ gl_clip_plane *brw_select_clip_planes(struct gl_context *ctx); /* brw_draw_upload.c */ unsigned brw_get_vertex_surface_type(struct brw_context *brw, const struct gl_client_array *glarray); -unsigned brw_get_index_type(GLenum type); + +static inline unsigned +brw_get_index_type(GLenum type) +{ + assert((type == GL_UNSIGNED_BYTE) + || (type == GL_UNSIGNED_SHORT) + || (type == GL_UNSIGNED_INT)); + + /* The possible values for type are GL_UNSIGNED_BYTE (0x1401), + * GL_UNSIGNED_SHORT (0x1403), and GL_UNSIGNED_INT (0x1405) which we want + * to map to scale factors of 0, 1, and 2, respectively. These scale + * factors are then left-shfited by 8 to be in the correct position in the + * CMD_INDEX_BUFFER packet. + * + * Subtracting 0x1401 gives 0, 2, and 4. Shifting left by 7 afterwards + * gives 0x00000000, 0x00000100, and 0x00000200. These just happen to be + * the values the need to be written in the CMD_INDEX_BUFFER packet. + */ + return (type - 0x1401) << 7; +} + void brw_prepare_vertices(struct brw_context *brw); /* brw_wm_surface_state.c */ diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c b/src/mesa/drivers/dri/i965/brw_draw_upload.c index 8123da8377c..52dcb6f2c4c 100644 --- a/src/mesa/drivers/dri/i965/brw_draw_upload.c +++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c @@ -344,17 +344,6 @@ brw_get_vertex_surface_type(struct brw_context *brw, } } -unsigned -brw_get_index_type(GLenum type) -{ - switch (type) { - case GL_UNSIGNED_BYTE: return BRW_INDEX_BYTE; - case GL_UNSIGNED_SHORT: return BRW_INDEX_WORD; - case GL_UNSIGNED_INT: return BRW_INDEX_DWORD; - default: unreachable("not reached"); - } -} - static void copy_array_to_vbo_array(struct brw_context *brw, struct brw_vertex_element *element, @@ -963,7 +952,7 @@ static void brw_emit_index_buffer(struct brw_context *brw) BEGIN_BATCH(3); OUT_BATCH(CMD_INDEX_BUFFER << 16 | cut_index_setting | - brw_get_index_type(index_buffer->type) << 8 | + brw_get_index_type(index_buffer->type) | 1); OUT_RELOC(brw->ib.bo, I915_GEM_DOMAIN_VERTEX, 0, diff --git a/src/mesa/drivers/dri/i965/gen8_draw_upload.c b/src/mesa/drivers/dri/i965/gen8_draw_upload.c index 0d6feec8ab0..1af90ecc6a4 100644 --- a/src/mesa/drivers/dri/i965/gen8_draw_upload.c +++ b/src/mesa/drivers/dri/i965/gen8_draw_upload.c @@ -251,7 +251,7 @@ gen8_emit_index_buffer(struct brw_context *brw) BEGIN_BATCH(5); OUT_BATCH(CMD_INDEX_BUFFER << 16 | (5 - 2)); - OUT_BATCH(brw_get_index_type(index_buffer->type) << 8 | mocs_wb); + OUT_BATCH(brw_get_index_type(index_buffer->type) | mocs_wb); OUT_RELOC64(brw->ib.bo, I915_GEM_DOMAIN_VERTEX, 0, 0); OUT_BATCH(brw->ib.bo->size); ADVANCE_BATCH();