From 35fe7ed7d3c45f5263ae42bcedecc00ba6adf91d Mon Sep 17 00:00:00 2001 From: Francisco Jerez Date: Fri, 1 Nov 2013 11:29:13 -0700 Subject: [PATCH] i965/gen7: Add instruction latency estimates for untyped atomics and reads. The latency information has been obtained empirically from measurements taken on Haswell and Ivy Bridge. Acked-by: Paul Berry Reviewed-by: Matt Turner --- .../dri/i965/brw_schedule_instructions.cpp | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp index fe0de080544..5a425133a9d 100644 --- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp +++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp @@ -352,6 +352,45 @@ schedule_node::set_latency_gen7(bool is_haswell) * then around 140. Presumably this is cache hit vs miss. */ latency = 50; + case SHADER_OPCODE_UNTYPED_ATOMIC: + /* Test code: + * mov(8) g112<1>ud 0x00000000ud { align1 WE_all 1Q }; + * mov(1) g112.7<1>ud g1.7<0,1,0>ud { align1 WE_all }; + * mov(8) g113<1>ud 0x00000000ud { align1 WE_normal 1Q }; + * send(8) g4<1>ud g112<8,8,1>ud + * data (38, 5, 6) mlen 2 rlen 1 { align1 WE_normal 1Q }; + * + * Running it 100 times as fragment shader on a 128x128 quad + * gives an average latency of 13867 cycles per atomic op, + * standard deviation 3%. Note that this is a rather + * pessimistic estimate, the actual latency in cases with few + * collisions between threads and favorable pipelining has been + * seen to be reduced by a factor of 100. + */ + latency = 14000; + break; + + case SHADER_OPCODE_UNTYPED_SURFACE_READ: + /* Test code: + * mov(8) g112<1>UD 0x00000000UD { align1 WE_all 1Q }; + * mov(1) g112.7<1>UD g1.7<0,1,0>UD { align1 WE_all }; + * mov(8) g113<1>UD 0x00000000UD { align1 WE_normal 1Q }; + * send(8) g4<1>UD g112<8,8,1>UD + * data (38, 6, 5) mlen 2 rlen 1 { align1 WE_normal 1Q }; + * . + * . [repeats 8 times] + * . + * mov(8) g112<1>UD 0x00000000UD { align1 WE_all 1Q }; + * mov(1) g112.7<1>UD g1.7<0,1,0>UD { align1 WE_all }; + * mov(8) g113<1>UD 0x00000000UD { align1 WE_normal 1Q }; + * send(8) g4<1>UD g112<8,8,1>UD + * data (38, 6, 5) mlen 2 rlen 1 { align1 WE_normal 1Q }; + * + * Running it 100 times as fragment shader on a 128x128 quad + * gives an average latency of 583 cycles per surface read, + * standard deviation 0.9%. + */ + latency = is_haswell ? 300 : 600; break; default: -- 2.30.2