From: Jason Ekstrand Date: Thu, 21 Feb 2019 16:32:01 +0000 (-0600) Subject: intel/schedule_instructions: Move some comments X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=95ae400abcda4f692fd31c9132462d904f939ec3;p=mesa.git intel/schedule_instructions: Move some comments Reviewed-by: Caio Marcelo de Oliveira Filho --- diff --git a/src/intel/compiler/brw_schedule_instructions.cpp b/src/intel/compiler/brw_schedule_instructions.cpp index 4a516223cf9..1d5ee56bd4a 100644 --- a/src/intel/compiler/brw_schedule_instructions.cpp +++ b/src/intel/compiler/brw_schedule_instructions.cpp @@ -368,44 +368,13 @@ schedule_node::set_latency_gen7(bool is_haswell) break; case SHADER_OPCODE_UNTYPED_ATOMIC: - /* Test code: - * mov(8) g112<1>ud 0x00000000ud { align1 WE_all 1Q }; - * mov(1) g112.7<1>ud g1.7<0,1,0>ud { align1 WE_all }; - * mov(8) g113<1>ud 0x00000000ud { align1 WE_normal 1Q }; - * send(8) g4<1>ud g112<8,8,1>ud - * data (38, 5, 6) mlen 2 rlen 1 { align1 WE_normal 1Q }; - * - * Running it 100 times as fragment shader on a 128x128 quad - * gives an average latency of 13867 cycles per atomic op, - * standard deviation 3%. Note that this is a rather - * pessimistic estimate, the actual latency in cases with few - * collisions between threads and favorable pipelining has been - * seen to be reduced by a factor of 100. - */ + /* See GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP */ latency = 14000; break; case SHADER_OPCODE_UNTYPED_SURFACE_READ: case SHADER_OPCODE_UNTYPED_SURFACE_WRITE: - /* Test code: - * mov(8) g112<1>UD 0x00000000UD { align1 WE_all 1Q }; - * mov(1) g112.7<1>UD g1.7<0,1,0>UD { align1 WE_all }; - * mov(8) g113<1>UD 0x00000000UD { align1 WE_normal 1Q }; - * send(8) g4<1>UD g112<8,8,1>UD - * data (38, 6, 5) mlen 2 rlen 1 { align1 WE_normal 1Q }; - * . - * . [repeats 8 times] - * . - * mov(8) g112<1>UD 0x00000000UD { align1 WE_all 1Q }; - * mov(1) g112.7<1>UD g1.7<0,1,0>UD { align1 WE_all }; - * mov(8) g113<1>UD 0x00000000UD { align1 WE_normal 1Q }; - * send(8) g4<1>UD g112<8,8,1>UD - * data (38, 6, 5) mlen 2 rlen 1 { align1 WE_normal 1Q }; - * - * Running it 100 times as fragment shader on a 128x128 quad - * gives an average latency of 583 cycles per surface read, - * standard deviation 0.9%. - */ + /* See also GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ */ latency = is_haswell ? 300 : 600; break; @@ -460,13 +429,44 @@ schedule_node::set_latency_gen7(bool is_haswell) case GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ: case GEN7_DATAPORT_DC_UNTYPED_SURFACE_WRITE: - /* See also SHADER_OPCODE_UNTYPED_SURFACE_READ */ + /* Test code: + * mov(8) g112<1>UD 0x00000000UD { align1 WE_all 1Q }; + * mov(1) g112.7<1>UD g1.7<0,1,0>UD { align1 WE_all }; + * mov(8) g113<1>UD 0x00000000UD { align1 WE_normal 1Q }; + * send(8) g4<1>UD g112<8,8,1>UD + * data (38, 6, 5) mlen 2 rlen 1 { align1 WE_normal 1Q }; + * . + * . [repeats 8 times] + * . + * mov(8) g112<1>UD 0x00000000UD { align1 WE_all 1Q }; + * mov(1) g112.7<1>UD g1.7<0,1,0>UD { align1 WE_all }; + * mov(8) g113<1>UD 0x00000000UD { align1 WE_normal 1Q }; + * send(8) g4<1>UD g112<8,8,1>UD + * data (38, 6, 5) mlen 2 rlen 1 { align1 WE_normal 1Q }; + * + * Running it 100 times as fragment shader on a 128x128 quad + * gives an average latency of 583 cycles per surface read, + * standard deviation 0.9%. + */ assert(!is_haswell); latency = 600; break; case GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP: - /* See also SHADER_OPCODE_UNTYPED_ATOMIC */ + /* Test code: + * mov(8) g112<1>ud 0x00000000ud { align1 WE_all 1Q }; + * mov(1) g112.7<1>ud g1.7<0,1,0>ud { align1 WE_all }; + * mov(8) g113<1>ud 0x00000000ud { align1 WE_normal 1Q }; + * send(8) g4<1>ud g112<8,8,1>ud + * data (38, 5, 6) mlen 2 rlen 1 { align1 WE_normal 1Q }; + * + * Running it 100 times as fragment shader on a 128x128 quad + * gives an average latency of 13867 cycles per atomic op, + * standard deviation 3%. Note that this is a rather + * pessimistic estimate, the actual latency in cases with few + * collisions between threads and favorable pipelining has been + * seen to be reduced by a factor of 100. + */ assert(!is_haswell); latency = 14000; break; @@ -486,7 +486,7 @@ schedule_node::set_latency_gen7(bool is_haswell) case GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_READ: case GEN8_DATAPORT_DC_PORT1_A64_SCATTERED_WRITE: case GEN9_DATAPORT_DC_PORT1_A64_SCATTERED_READ: - /* See also SHADER_OPCODE_UNTYPED_SURFACE_READ */ + /* See also GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ */ latency = 300; break; @@ -497,7 +497,7 @@ schedule_node::set_latency_gen7(bool is_haswell) case GEN9_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_FLOAT_OP: case GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_OP: case GEN9_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_FLOAT_OP: - /* See also SHADER_OPCODE_UNTYPED_ATOMIC */ + /* See also GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP */ latency = 14000; break;