{3, 3, 3, 3, 3}, /* cost of unaligned SSE store
in 32,64,128,256 and 512-bit */
3, 3, /* SSE->integer and integer->SSE moves */
+ 5, 0, /* Gather load static, per_elt. */
+ 5, 0, /* Gather store static, per_elt. */
0, /* size of l1 cache */
0, /* size of l2 cache */
0, /* size of prefetch block */
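Each table gains two pairs of values: a static cost plus a per-element cost for
gather loads, and the same pair for gather stores (scatters). Below is a minimal
sketch of how such entries would plausibly be combined, assuming field names like
gather_static/gather_per_elt and scatter_static/scatter_per_elt and a total cost
of static + per_elt * nelts; the struct definition and its consumers are not part
of this excerpt, so the names are an assumption.

#include <stdio.h>

/* Hypothetical mirror of the new processor_costs fields; only the
   comment text appears in the hunks, the field names are assumed.  */
struct gather_costs
{
  int gather_static, gather_per_elt;	/* Gather load: static + per_elt * n.  */
  int scatter_static, scatter_per_elt;	/* Gather store: static + per_elt * n.  */
};

/* Cost of a gather load of NELTS elements under costs C.  */
static int
gather_load_cost (const struct gather_costs *c, int nelts)
{
  return c->gather_static + c->gather_per_elt * nelts;
}

int
main (void)
{
  /* Values from the size-cost hunk above: 5, 0 / 5, 0.  */
  struct gather_costs size_cost = { 5, 0, 5, 0 };
  printf ("4-elt gather load cost: %d\n", gather_load_cost (&size_cost, 4));
  return 0;
}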
in 32,64,128,256 and 512-bit */
{4, 8, 16, 32, 64}, /* cost of unaligned stores. */
3, 3, /* SSE->integer and integer->SSE moves */
+ 4, 4, /* Gather load static, per_elt. */
+ 4, 4, /* Gather store static, per_elt. */
0, /* size of l1 cache */
0, /* size of l2 cache */
0, /* size of prefetch block */
in 32,64,128,256 and 512-bit */
{4, 8, 16, 32, 64}, /* cost of unaligned stores. */
3, 3, /* SSE->integer and integer->SSE moves */
+ 4, 4, /* Gather load static, per_elt. */
+ 4, 4, /* Gather store static, per_elt. */
4, /* size of l1 cache. 486 has 8kB cache
shared for code and data, so 4kB is
not really precise. */
in 32,64,128,256 and 512-bit */
{4, 8, 16, 32, 64}, /* cost of unaligned stores. */
3, 3, /* SSE->integer and integer->SSE moves */
+ 4, 4, /* Gather load static, per_elt. */
+ 4, 4, /* Gather store static, per_elt. */
8, /* size of l1 cache. */
8, /* size of l2 cache */
0, /* size of prefetch block */
in 32,64,128,256 and 512-bit */
{4, 8, 16, 32, 64}, /* cost of unaligned stores. */
3, 3, /* SSE->integer and integer->SSE moves */
+ 4, 4, /* Gather load static, per_elt. */
+ 4, 4, /* Gather store static, per_elt. */
8, /* size of l1 cache. */
8, /* size of l2 cache */
0, /* size of prefetch block */
in 32,64,128,256 and 512-bit */
{4, 8, 16, 32, 64}, /* cost of unaligned stores. */
3, 3, /* SSE->integer and integer->SSE moves */
+ 4, 4, /* Gather load static, per_elt. */
+ 4, 4, /* Gather store static, per_elt. */
8, /* size of l1 cache. */
256, /* size of l2 cache */
32, /* size of prefetch block */
in 32,64,128,256 and 512-bit */
{2, 2, 8, 16, 32}, /* cost of unaligned stores. */
6, 6, /* SSE->integer and integer->SSE moves */
+ 2, 2, /* Gather load static, per_elt. */
+ 2, 2, /* Gather store static, per_elt. */
64, /* size of l1 cache. */
128, /* size of l2 cache. */
32, /* size of prefetch block */
in 32,64,128,256 and 512-bit */
{2, 2, 8, 16, 32}, /* cost of unaligned stores. */
6, 6, /* SSE->integer and integer->SSE moves */
+ 2, 2, /* Gather load static, per_elt. */
+ 2, 2, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
32, /* size of l2 cache. Some models
have integrated l2 cache, but
in 32,64,128,256 and 512-bit */
{4, 4, 5, 10, 20}, /* cost of unaligned stores. */
5, 5, /* SSE->integer and integer->SSE moves */
+ 4, 4, /* Gather load static, per_elt. */
+ 4, 4, /* Gather store static, per_elt. */
64, /* size of l1 cache. */
256, /* size of l2 cache. */
64, /* size of prefetch block */
in 32,64,128,256 and 512-bit */
{4, 4, 5, 10, 20}, /* cost of unaligned stores. */
5, 5, /* SSE->integer and integer->SSE moves */
+ 4, 4, /* Gather load static, per_elt. */
+ 4, 4, /* Gather store static, per_elt. */
64, /* size of l1 cache. */
512, /* size of l2 cache. */
64, /* size of prefetch block */
                                1/1  1/1
    MOVD reg32, xmmreg  Double  FADD 3
                                1/1  1/1 */
+ 4, 4, /* Gather load static, per_elt. */
+ 4, 4, /* Gather store static, per_elt. */
64, /* size of l1 cache. */
512, /* size of l2 cache. */
64, /* size of prefetch block */
in 32,64,128,256 and 512-bit */
{10, 10, 10, 20, 30}, /* cost of unaligned stores. */
16, 20, /* SSE->integer and integer->SSE moves */
+ 12, 12, /* Gather load static, per_elt. */
+ 10, 10, /* Gather store static, per_elt. */
16, /* size of l1 cache. */
2048, /* size of l2 cache. */
64, /* size of prefetch block */
in 32,64,128,256 and 512-bit */
{10, 10, 10, 20, 30}, /* cost of unaligned stores. */
16, 20, /* SSE->integer and integer->SSE moves */
+ 12, 12, /* Gather load static, per_elt. */
+ 10, 10, /* Gather store static, per_elt. */
16, /* size of l1 cache. */
2048, /* size of l2 cache. */
64, /* size of prefetch block */
in 32,64,128,256 and 512-bit */
{10, 10, 10, 20, 30}, /* cost of unaligned stores. */
16, 20, /* SSE->integer and integer->SSE moves */
+ 12, 12, /* Gather load static, per_elt. */
+ 10, 10, /* Gather store static, per_elt. */
16, /* size of l1 cache. */
2048, /* size of l2 cache. */
64, /* size of prefetch block */
in 32,64,128,256 and 512-bit */
{10, 10, 10, 20, 30}, /* cost of unaligned stores. */
16, 20, /* SSE->integer and integer->SSE moves */
+ 12, 12, /* Gather load static, per_elt. */
+ 10, 10, /* Gather store static, per_elt. */
16, /* size of l1 cache. */
2048, /* size of l2 cache. */
64, /* size of prefetch block */
in 32,64,128,256 and 512-bit. */
{8, 8, 8, 8, 16}, /* cost of unaligned stores. */
6, 6, /* SSE->integer and integer->SSE moves. */
+ /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPS is 35 uops,
+ throughput 12. Approx 9 uops do not depend on vector size and every load
+ is 7 uops. */
+ 18, 8, /* Gather load static, per_elt. */
+ 18, 10, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
512, /* size of l2 cache. */
64, /* size of prefetch block. */
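The znver1 split above can be sanity-checked directly: roughly 9 size-independent
uops plus 7 uops per loaded element reproduces the measured counts, assuming the
two measurements are the 2-element and 4-element (xmm) gather forms. A minimal
check of that arithmetic:

#include <stdio.h>

/* znver1 gather uop estimate from the comment above:
   ~9 uops independent of vector size, ~7 uops per loaded element.  */
static int
znver1_gather_uops (int nelts)
{
  return 9 + 7 * nelts;
}

int
main (void)
{
  printf ("2 elts: %d uops (measured 23)\n", znver1_gather_uops (2));
  printf ("4 elts: %d uops (measured ~35)\n", znver1_gather_uops (4));
  return 0;
}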
in 32,64,128,256 and 512-bit */
{10, 10, 12, 24, 48}, /* cost of unaligned stores. */
14, 14, /* SSE->integer and integer->SSE moves */
+ 10, 10, /* Gather load static, per_elt. */
+ 10, 10, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
512, /* size of l2 cache. */
64, /* size of prefetch block */
in 32,64,128,256 and 512-bit */
{10, 10, 12, 24, 48}, /* cost of unaligned stores. */
14, 14, /* SSE->integer and integer->SSE moves */
+ 10, 10, /* Gather load static, per_elt. */
+ 10, 10, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
2048, /* size of l2 cache. */
64, /* size of prefetch block */
in 32,64,128,256 and 512-bit */
{32, 32, 32, 64, 128}, /* cost of unaligned stores. */
20, 12, /* SSE->integer and integer->SSE moves */
+ 16, 16, /* Gather load static, per_elt. */
+ 16, 16, /* Gather store static, per_elt. */
8, /* size of l1 cache. */
256, /* size of l2 cache. */
64, /* size of prefetch block */
in 32,64,128,256 and 512-bit */
{24, 24, 24, 48, 96}, /* cost of unaligned stores. */
20, 12, /* SSE->integer and integer->SSE moves */
+ 12, 12, /* Gather load static, per_elt. */
+ 12, 12, /* Gather store static, per_elt. */
8, /* size of l1 cache. */
1024, /* size of l2 cache. */
64, /* size of prefetch block */
in 32,64,128,256 and 512-bit */
{16, 16, 16, 32, 64}, /* cost of unaligned stores. */
8, 6, /* SSE->integer and integer->SSE moves */
+ 8, 8, /* Gather load static, per_elt. */
+ 8, 8, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
256, /* size of l2 cache. */
64, /* size of prefetch block */
in 32,64,128,256 and 512-bit */
{16, 16, 16, 32, 64}, /* cost of unaligned stores. */
8, 6, /* SSE->integer and integer->SSE moves */
+ 8, 8, /* Gather load static, per_elt. */
+ 8, 8, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
256, /* size of l2 cache. */
64, /* size of prefetch block */
in 32,64,128,256 and 512-bit */
{10, 10, 10, 10, 10}, /* cost of unaligned stores. */
4, 4, /* SSE->integer and integer->SSE moves */
+ 6, 6, /* Gather load static, per_elt. */
+ 6, 6, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
256, /* size of l2 cache. */
64, /* size of prefetch block */
in 32,64,128,256 and 512-bit */
{10, 10, 10, 15, 20}, /* cost of unaligned stores. */
20, 20, /* SSE->integer and integer->SSE moves */
+ 6, 6, /* Gather load static, per_elt. */
+ 6, 6, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
512, /* size of l2 cache. */
64, /* size of prefetch block */
in 32,64,128,256 and 512-bit */
{6, 6, 6, 6, 12}, /* cost of unaligned stores. */
2, 2, /* SSE->integer and integer->SSE moves */
+ /* VGATHERDPD is 7 uops, rec. throughput 5, while VGATHERDPS is 9 uops,
+ rec. throughput 6.
+ So 5 uops statically and one uop per load. */
+ 10, 6, /* Gather load static, per_elt. */
+ 10, 6, /* Gather store static, per_elt. */
64, /* size of l1 cache. */
512, /* size of l2 cache. */
64, /* size of prefetch block */
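The same exercise works for the core numbers: 5 static uops plus one uop per
loaded element gives 7 uops for a 2-element gather and 9 uops for a 4-element
gather, matching the comment (again assuming the xmm forms). A minimal sketch:

#include <stdio.h>

/* Core/Skylake gather uop estimate from the comment above:
   5 static uops plus one uop per loaded element.  */
static int
core_gather_uops (int nelts)
{
  return 5 + nelts;
}

int
main (void)
{
  printf ("2 elts: %d uops (measured 7)\n", core_gather_uops (2));
  printf ("4 elts: %d uops (measured 9)\n", core_gather_uops (4));
  return 0;
}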