From df41dbaf7eae8dcb0841b62647f7504ed13a1d64 Mon Sep 17 00:00:00 2001 From: Jan Hubicka Date: Mon, 23 Oct 2017 17:10:09 +0200 Subject: [PATCH] i386.c (dimode_scalar_chain::compute_convert_gain): Use xmm_move instead of sse_move. * i386.c (dimode_scalar_chain::compute_convert_gain): Use xmm_move instead of sse_move. (sse_store_index): New function. (ix86_register_move_cost): Be more sensible about mismatch stall; model AVX moves correctly; make difference between sse->integer and integer->sse. (ix86_builtin_vectorization_cost): Model correctly aligned and unaligned moves; make difference between SSE and AVX. * i386.h (processor_costs): Remove sse_move; add xmm_move, ymm_move and zmm_move. Increase size of sse load and store tables; add unaligned load and store tables; add ssemmx_to_integer. * x86-tune-costs.h: Update all entries according to real move latencies from Agner Fog's manual and chip documentation. From-SVN: r254012 --- gcc/ChangeLog | 16 + gcc/config/i386/i386.c | 75 ++-- gcc/config/i386/i386.h | 16 +- gcc/config/i386/x86-tune-costs.h | 642 ++++++++++++++++++------------- 4 files changed, 456 insertions(+), 293 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index cc64d7d3a2a..be739723a35 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,19 @@ +2017-10-23 Jan Hubicka + + * i386.c (dimode_scalar_chain::compute_convert_gain): Use + xmm_move instead of sse_move. + (sse_store_index): New function. + (ix86_register_move_cost): Be more sensible about mismatch stall; + model AVX moves correctly; make difference between sse->integer and + integer->sse. + (ix86_builtin_vectorization_cost): Model correctly aligned and unaligned + moves; make difference between SSE and AVX. + * i386.h (processor_costs): Remove sse_move; add xmm_move, ymm_move + and zmm_move. Increase size of sse load and store tables; + add unaligned load and store tables; add ssemmx_to_integer. 
+ * x86-tune-costs.h: Update all entries according to real + move latencies from Agner Fog's manual and chip documentation. + 2017-10-23 Jakub Jelinek PR target/82628 diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 5840a273685..b182cc6c0ee 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -1601,7 +1601,7 @@ dimode_scalar_chain::compute_convert_gain () rtx dst = SET_DEST (def_set); if (REG_P (src) && REG_P (dst)) - gain += COSTS_N_INSNS (2) - ix86_cost->sse_move; + gain += COSTS_N_INSNS (2) - ix86_cost->xmm_move; else if (REG_P (src) && MEM_P (dst)) gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1]; else if (MEM_P (src) && REG_P (dst)) @@ -38634,6 +38634,28 @@ ix86_can_change_mode_class (machine_mode from, machine_mode to, return true; } +/* Return index of MODE in the sse load/store tables. */ + +static inline int +sse_store_index (machine_mode mode) +{ + switch (GET_MODE_SIZE (mode)) + { + case 4: + return 0; + case 8: + return 1; + case 16: + return 2; + case 32: + return 3; + case 64: + return 4; + default: + return -1; + } +} + /* Return the cost of moving data of mode M between a register and memory. A value of 2 is the default; this cost is relative to those in `REGISTER_MOVE_COST'. @@ -38677,21 +38699,9 @@ inline_memory_move_cost (machine_mode mode, enum reg_class regclass, } if (SSE_CLASS_P (regclass)) { - int index; - switch (GET_MODE_SIZE (mode)) - { - case 4: - index = 0; - break; - case 8: - index = 1; - break; - case 16: - index = 2; - break; - default: - return 100; - } + int index = sse_store_index (mode); + if (index == -1) + return 100; if (in == 2) return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]); return in ? 
ix86_cost->sse_load [index] : ix86_cost->sse_store [index]; @@ -38794,8 +38804,10 @@ ix86_register_move_cost (machine_mode mode, reg_class_t class1_i, /* In case of copying from general_purpose_register we may emit multiple stores followed by single load causing memory size mismatch stall. Count this as arbitrarily high cost of 20. */ - if (targetm.class_max_nregs (class1, mode) - > targetm.class_max_nregs (class2, mode)) + if (GET_MODE_BITSIZE (mode) > BITS_PER_WORD + && TARGET_MEMORY_MISMATCH_STALL + && targetm.class_max_nregs (class1, mode) + > targetm.class_max_nregs (class2, mode)) cost += 20; /* In the case of FP/MMX moves, the registers actually overlap, and we @@ -38817,12 +38829,19 @@ ix86_register_move_cost (machine_mode mode, reg_class_t class1_i, where integer modes in MMX/SSE registers are not tieable because of missing QImode and HImode moves to, from or between MMX/SSE registers. */ - return MAX (8, ix86_cost->mmxsse_to_integer); + return MAX (8, MMX_CLASS_P (class1) || MMX_CLASS_P (class2) + ? 
ix86_cost->mmxsse_to_integer : ix86_cost->ssemmx_to_integer); if (MAYBE_FLOAT_CLASS_P (class1)) return ix86_cost->fp_move; if (MAYBE_SSE_CLASS_P (class1)) - return ix86_cost->sse_move; + { + if (GET_MODE_BITSIZE (mode) <= 128) + return ix86_cost->xmm_move; + if (GET_MODE_BITSIZE (mode) <= 256) + return ix86_cost->ymm_move; + return ix86_cost->zmm_move; + } if (MAYBE_MMX_CLASS_P (class1)) return ix86_cost->mmx_move; return 2; @@ -44370,6 +44389,7 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, { bool fp = false; machine_mode mode = TImode; + int index; if (vectype != NULL) { fp = FLOAT_TYPE_P (vectype); @@ -44397,13 +44417,16 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, true); case vector_load: + index = sse_store_index (mode); + gcc_assert (index >= 0); return ix86_vec_cost (mode, - COSTS_N_INSNS (ix86_cost->sse_load[2]) / 2, + COSTS_N_INSNS (ix86_cost->sse_load[index]) / 2, true); case vector_store: + index = sse_store_index (mode); return ix86_vec_cost (mode, - COSTS_N_INSNS (ix86_cost->sse_store[2]) / 2, + COSTS_N_INSNS (ix86_cost->sse_store[index]) / 2, true); case vec_to_scalar: @@ -44414,14 +44437,18 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, Do that incrementally. 
*/ case unaligned_load: case vector_gather_load: + index = sse_store_index (mode); return ix86_vec_cost (mode, - COSTS_N_INSNS (ix86_cost->sse_load[2]), + COSTS_N_INSNS + (ix86_cost->sse_unaligned_load[index]) / 2, true); case unaligned_store: case vector_scatter_store: + index = sse_store_index (mode); return ix86_vec_cost (mode, - COSTS_N_INSNS (ix86_cost->sse_store[2]), + COSTS_N_INSNS + (ix86_cost->sse_unaligned_store[index]) / 2, true); case cond_branch_taken: diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 4a71977808e..27fc9f08cc7 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -242,13 +242,17 @@ struct processor_costs { in SImode and DImode */ const int mmx_store[2]; /* cost of storing MMX register in SImode and DImode */ - const int sse_move; /* cost of moving SSE register. */ - const int sse_load[3]; /* cost of loading SSE register - in SImode, DImode and TImode*/ - const int sse_store[3]; /* cost of storing SSE register - in SImode, DImode and TImode*/ + const int xmm_move, ymm_move, /* cost of moving XMM and YMM register. */ + zmm_move; + const int sse_load[5]; /* cost of loading SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ + const int sse_unaligned_load[5];/* cost of unaligned load. */ + const int sse_store[5]; /* cost of storing SSE register + in SImode, DImode and TImode. */ + const int sse_unaligned_store[5];/* cost of unaligned store. */ const int mmxsse_to_integer; /* cost of moving mmxsse register to - integer and vice versa. */ + integer. */ + const int ssemmx_to_integer; /* cost of moving integer to mmxsse register. */ const int l1_cache_size; /* size of l1 cache, in kilobytes. */ const int l2_cache_size; /* size of l2 cache, in kilobytes. */ const int prefetch_block; /* bytes moved to cache for prefetch. 
*/ diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h index 0bfcac44fe1..e31d7cef2eb 100644 --- a/gcc/config/i386/x86-tune-costs.h +++ b/gcc/config/i386/x86-tune-costs.h @@ -1,4 +1,26 @@ +/* Costs of operations of individual x86 CPUs. + Copyright (C) 1988-2017 Free Software Foundation, Inc. +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. + +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +. */ /* Processor costs (relative to an add) */ /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */ #define COSTS_N_BYTES(N) ((N) * 2) @@ -33,6 +55,8 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */ COSTS_N_BYTES (3), /* cost of movzx */ 0, /* "large" insn */ 2, /* MOVE_RATIO */ + + /* All move costs are relative to integer->integer move times 2. */ 2, /* cost for loading QImode using movzbl */ {2, 2, 2}, /* cost of loading integer registers in QImode, HImode and SImode. 
@@ -48,12 +72,16 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */ in SImode and DImode */ {3, 3}, /* cost of storing MMX registers in SImode and DImode */ - 3, /* cost of moving SSE register */ - {3, 3, 3}, /* cost of loading SSE registers - in SImode, DImode and TImode */ - {3, 3, 3}, /* cost of storing SSE registers - in SImode, DImode and TImode */ - 3, /* MMX or SSE register to integer */ + 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */ + {3, 3, 3, 3, 3}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit */ + {3, 3, 3, 3, 3}, /* cost of unaligned SSE load + in 128bit, 256bit and 512bit */ + {3, 3, 3, 3, 3}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit */ + {3, 3, 3, 3, 3}, /* cost of unaligned SSE store + in 128bit, 256bit and 512bit */ + 3, 3, /* SSE->integer and integer->SSE moves */ 0, /* size of l1 cache */ 0, /* size of l2 cache */ 0, /* size of prefetch block */ @@ -112,6 +140,9 @@ struct processor_costs i386_cost = { /* 386 specific costs */ COSTS_N_INSNS (2), /* cost of movzx */ 15, /* "large" insn */ 3, /* MOVE_RATIO */ + + /* All move costs are relative to integer->integer move times 2 and thus + they are latency*2. */ 4, /* cost for loading QImode using movzbl */ {2, 4, 2}, /* cost of loading integer registers in QImode, HImode and SImode. @@ -127,12 +158,14 @@ struct processor_costs i386_cost = { /* 386 specific costs */ in SImode and DImode */ {4, 8}, /* cost of storing MMX registers in SImode and DImode */ - 2, /* cost of moving SSE register */ - {4, 8, 16}, /* cost of loading SSE registers - in SImode, DImode and TImode */ - {4, 8, 16}, /* cost of storing SSE registers - in SImode, DImode and TImode */ - 3, /* MMX or SSE register to integer */ + 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ + {4, 8, 16, 32, 64}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit */ + {4, 8, 16, 32, 64}, /* cost of unaligned loads. 
*/ + {4, 8, 16, 32, 64}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit */ + {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ + 3, 3, /* SSE->integer and integer->SSE moves */ 0, /* size of l1 cache */ 0, /* size of l2 cache */ 0, /* size of prefetch block */ @@ -190,6 +223,9 @@ struct processor_costs i486_cost = { /* 486 specific costs */ COSTS_N_INSNS (2), /* cost of movzx */ 15, /* "large" insn */ 3, /* MOVE_RATIO */ + + /* All move costs are relative to integer->integer move times 2 and thus + they are latency*2. */ 4, /* cost for loading QImode using movzbl */ {2, 4, 2}, /* cost of loading integer registers in QImode, HImode and SImode. @@ -205,12 +241,14 @@ struct processor_costs i486_cost = { /* 486 specific costs */ in SImode and DImode */ {4, 8}, /* cost of storing MMX registers in SImode and DImode */ - 2, /* cost of moving SSE register */ - {4, 8, 16}, /* cost of loading SSE registers - in SImode, DImode and TImode */ - {4, 8, 16}, /* cost of storing SSE registers - in SImode, DImode and TImode */ - 3, /* MMX or SSE register to integer */ + 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ + {4, 8, 16, 32, 64}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit */ + {4, 8, 16, 32, 64}, /* cost of unaligned loads. */ + {4, 8, 16, 32, 64}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit */ + {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ + 3, 3, /* SSE->integer and integer->SSE moves */ 4, /* size of l1 cache. 486 has 8kB cache shared for code and data, so 4kB is not really precise. */ @@ -270,6 +308,9 @@ struct processor_costs pentium_cost = { COSTS_N_INSNS (2), /* cost of movzx */ 8, /* "large" insn */ 6, /* MOVE_RATIO */ + + /* All move costs are relative to integer->integer move times 2 and thus + they are latency*2. */ 6, /* cost for loading QImode using movzbl */ {2, 4, 2}, /* cost of loading integer registers in QImode, HImode and SImode. 
@@ -285,12 +326,14 @@ struct processor_costs pentium_cost = { in SImode and DImode */ {8, 8}, /* cost of storing MMX registers in SImode and DImode */ - 2, /* cost of moving SSE register */ - {4, 8, 16}, /* cost of loading SSE registers - in SImode, DImode and TImode */ - {4, 8, 16}, /* cost of storing SSE registers - in SImode, DImode and TImode */ - 3, /* MMX or SSE register to integer */ + 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ + {4, 8, 16, 32, 64}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit */ + {4, 8, 16, 32, 64}, /* cost of unaligned loads. */ + {4, 8, 16, 32, 64}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit */ + {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ + 3, 3, /* SSE->integer and integer->SSE moves */ 8, /* size of l1 cache. */ 8, /* size of l2 cache */ 0, /* size of prefetch block */ @@ -341,6 +384,9 @@ struct processor_costs lakemont_cost = { COSTS_N_INSNS (2), /* cost of movzx */ 8, /* "large" insn */ 17, /* MOVE_RATIO */ + + /* All move costs are relative to integer->integer move times 2 and thus + they are latency*2. */ 6, /* cost for loading QImode using movzbl */ {2, 4, 2}, /* cost of loading integer registers in QImode, HImode and SImode. @@ -356,12 +402,14 @@ struct processor_costs lakemont_cost = { in SImode and DImode */ {8, 8}, /* cost of storing MMX registers in SImode and DImode */ - 2, /* cost of moving SSE register */ - {4, 8, 16}, /* cost of loading SSE registers - in SImode, DImode and TImode */ - {4, 8, 16}, /* cost of storing SSE registers - in SImode, DImode and TImode */ - 3, /* MMX or SSE register to integer */ + 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ + {4, 8, 16, 32, 64}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit */ + {4, 8, 16, 32, 64}, /* cost of unaligned loads. */ + {4, 8, 16, 32, 64}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit */ + {4, 8, 16, 32, 64}, /* cost of unaligned stores. 
*/ + 3, 3, /* SSE->integer and integer->SSE moves */ 8, /* size of l1 cache. */ 8, /* size of l2 cache */ 0, /* size of prefetch block */ @@ -427,6 +475,9 @@ struct processor_costs pentiumpro_cost = { COSTS_N_INSNS (1), /* cost of movzx */ 8, /* "large" insn */ 6, /* MOVE_RATIO */ + + /* All move costs are relative to integer->integer move times 2 and thus + they are latency*2. */ 2, /* cost for loading QImode using movzbl */ {4, 4, 4}, /* cost of loading integer registers in QImode, HImode and SImode. @@ -442,12 +493,14 @@ struct processor_costs pentiumpro_cost = { in SImode and DImode */ {2, 2}, /* cost of storing MMX registers in SImode and DImode */ - 2, /* cost of moving SSE register */ - {2, 2, 8}, /* cost of loading SSE registers - in SImode, DImode and TImode */ - {2, 2, 8}, /* cost of storing SSE registers - in SImode, DImode and TImode */ - 3, /* MMX or SSE register to integer */ + 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ + {4, 8, 16, 32, 64}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit */ + {4, 8, 16, 32, 64}, /* cost of unaligned loads. */ + {4, 8, 16, 32, 64}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit */ + {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ + 3, 3, /* SSE->integer and integer->SSE moves */ 8, /* size of l1 cache. */ 256, /* size of l2 cache */ 32, /* size of prefetch block */ @@ -504,13 +557,16 @@ struct processor_costs geode_cost = { COSTS_N_INSNS (1), /* cost of movzx */ 8, /* "large" insn */ 4, /* MOVE_RATIO */ - 1, /* cost for loading QImode using movzbl */ - {1, 1, 1}, /* cost of loading integer registers + + /* All move costs are relative to integer->integer move times 2 and thus + they are latency*2. */ + 2, /* cost for loading QImode using movzbl */ + {2, 2, 2}, /* cost of loading integer registers in QImode, HImode and SImode. Relative to reg-reg move (2). 
*/ - {1, 1, 1}, /* cost of storing integer registers */ - 1, /* cost of reg,reg fld/fst */ - {1, 1, 1}, /* cost of loading fp registers + {2, 2, 2}, /* cost of storing integer registers */ + 2, /* cost of reg,reg fld/fst */ + {2, 2, 2}, /* cost of loading fp registers in SFmode, DFmode and XFmode */ {4, 6, 6}, /* cost of storing fp registers in SFmode, DFmode and XFmode */ @@ -520,12 +576,14 @@ struct processor_costs geode_cost = { in SImode and DImode */ {2, 2}, /* cost of storing MMX registers in SImode and DImode */ - 2, /* cost of moving SSE register */ - {2, 2, 8}, /* cost of loading SSE registers - in SImode, DImode and TImode */ - {2, 2, 8}, /* cost of storing SSE registers - in SImode, DImode and TImode */ - 3, /* MMX or SSE register to integer */ + 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ + {2, 2, 8, 16, 32}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit */ + {2, 2, 8, 16, 32}, /* cost of unaligned loads. */ + {2, 2, 8, 16, 32}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit */ + {2, 2, 8, 16, 32}, /* cost of unaligned stores. */ + 6, 6, /* SSE->integer and integer->SSE moves */ 64, /* size of l1 cache. */ 128, /* size of l2 cache. */ 32, /* size of prefetch block */ @@ -582,6 +640,9 @@ struct processor_costs k6_cost = { COSTS_N_INSNS (2), /* cost of movzx */ 8, /* "large" insn */ 4, /* MOVE_RATIO */ + + /* All move costs are relative to integer->integer move times 2 and thus + they are latency*2. */ 3, /* cost for loading QImode using movzbl */ {4, 5, 4}, /* cost of loading integer registers in QImode, HImode and SImode. 
@@ -597,12 +658,14 @@ struct processor_costs k6_cost = { in SImode and DImode */ {2, 2}, /* cost of storing MMX registers in SImode and DImode */ - 2, /* cost of moving SSE register */ - {2, 2, 8}, /* cost of loading SSE registers - in SImode, DImode and TImode */ - {2, 2, 8}, /* cost of storing SSE registers - in SImode, DImode and TImode */ - 6, /* MMX or SSE register to integer */ + 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ + {2, 2, 8, 16, 32}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit */ + {2, 2, 8, 16, 32}, /* cost of unaligned loads. */ + {2, 2, 8, 16, 32}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit */ + {2, 2, 8, 16, 32}, /* cost of unaligned stores. */ + 6, 6, /* SSE->integer and integer->SSE moves */ 32, /* size of l1 cache. */ 32, /* size of l2 cache. Some models have integrated l2 cache, but @@ -665,6 +728,9 @@ struct processor_costs athlon_cost = { COSTS_N_INSNS (1), /* cost of movzx */ 8, /* "large" insn */ 9, /* MOVE_RATIO */ + + /* All move costs are relative to integer->integer move times 2 and thus + they are latency*2. */ 4, /* cost for loading QImode using movzbl */ {3, 4, 3}, /* cost of loading integer registers in QImode, HImode and SImode. @@ -680,12 +746,14 @@ struct processor_costs athlon_cost = { in SImode and DImode */ {4, 4}, /* cost of storing MMX registers in SImode and DImode */ - 2, /* cost of moving SSE register */ - {4, 4, 6}, /* cost of loading SSE registers - in SImode, DImode and TImode */ - {4, 4, 5}, /* cost of storing SSE registers - in SImode, DImode and TImode */ - 5, /* MMX or SSE register to integer */ + 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ + {4, 4, 6, 12, 24}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit */ + {4, 4, 6, 12, 24}, /* cost of unaligned loads. */ + {4, 4, 5, 10, 20}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit */ + {4, 4, 5, 10, 20}, /* cost of unaligned stores. 
*/ + 5, 5, /* SSE->integer and integer->SSE moves */ 64, /* size of l1 cache. */ 256, /* size of l2 cache. */ 64, /* size of prefetch block */ @@ -750,6 +818,9 @@ struct processor_costs k8_cost = { COSTS_N_INSNS (1), /* cost of movzx */ 8, /* "large" insn */ 9, /* MOVE_RATIO */ + + /* All move costs are relative to integer->integer move times 2 and thus + they are latency*2. */ 4, /* cost for loading QImode using movzbl */ {3, 4, 3}, /* cost of loading integer registers in QImode, HImode and SImode. @@ -765,12 +836,14 @@ struct processor_costs k8_cost = { in SImode and DImode */ {4, 4}, /* cost of storing MMX registers in SImode and DImode */ - 2, /* cost of moving SSE register */ - {4, 3, 6}, /* cost of loading SSE registers - in SImode, DImode and TImode */ - {4, 4, 5}, /* cost of storing SSE registers - in SImode, DImode and TImode */ - 5, /* MMX or SSE register to integer */ + 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ + {4, 3, 6, 12, 24}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit */ + {4, 3, 6, 12, 24}, /* cost of unaligned loads. */ + {4, 4, 5, 10, 20}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit */ + {4, 4, 5, 10, 20}, /* cost of unaligned stores. */ + 5, 5, /* SSE->integer and integer->SSE moves */ 64, /* size of l1 cache. */ 512, /* size of l2 cache. */ 64, /* size of prefetch block */ @@ -839,6 +912,9 @@ struct processor_costs amdfam10_cost = { COSTS_N_INSNS (1), /* cost of movzx */ 8, /* "large" insn */ 9, /* MOVE_RATIO */ + + /* All move costs are relative to integer->integer move times 2 and thus + they are latency*2. */ 4, /* cost for loading QImode using movzbl */ {3, 4, 3}, /* cost of loading integer registers in QImode, HImode and SImode. 
@@ -854,12 +930,14 @@ struct processor_costs amdfam10_cost = { in SImode and DImode */ {4, 4}, /* cost of storing MMX registers in SImode and DImode */ - 2, /* cost of moving SSE register */ - {4, 4, 3}, /* cost of loading SSE registers - in SImode, DImode and TImode */ - {4, 4, 5}, /* cost of storing SSE registers - in SImode, DImode and TImode */ - 3, /* MMX or SSE register to integer */ + 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ + {4, 4, 3, 6, 12}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit */ + {4, 4, 3, 7, 12}, /* cost of unaligned loads. */ + {4, 4, 5, 10, 20}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit */ + {4, 4, 5, 10, 20}, /* cost of unaligned stores. */ + 3, 3, /* SSE->integer and integer->SSE moves */ /* On K8: MOVD reg64, xmmreg Double FSTORE 4 MOVD reg32, xmmreg Double FSTORE 4 @@ -937,35 +1015,32 @@ const struct processor_costs bdver1_cost = { COSTS_N_INSNS (1), /* cost of movzx */ 8, /* "large" insn */ 9, /* MOVE_RATIO */ - 4, /* cost for loading QImode using movzbl */ - {5, 5, 4}, /* cost of loading integer registers + + /* All move costs are relative to integer->integer move times 2 and thus + they are latency*2. */ + 8, /* cost for loading QImode using movzbl */ + {8, 8, 8}, /* cost of loading integer registers in QImode, HImode and SImode. Relative to reg-reg move (2). 
*/ - {4, 4, 4}, /* cost of storing integer registers */ - 2, /* cost of reg,reg fld/fst */ - {5, 5, 12}, /* cost of loading fp registers + {8, 8, 8}, /* cost of storing integer registers */ + 4, /* cost of reg,reg fld/fst */ + {12, 12, 28}, /* cost of loading fp registers in SFmode, DFmode and XFmode */ - {4, 4, 8}, /* cost of storing fp registers + {10, 10, 18}, /* cost of storing fp registers in SFmode, DFmode and XFmode */ - 2, /* cost of moving MMX register */ - {4, 4}, /* cost of loading MMX registers + 4, /* cost of moving MMX register */ + {12, 12}, /* cost of loading MMX registers in SImode and DImode */ - {4, 4}, /* cost of storing MMX registers + {10, 10}, /* cost of storing MMX registers in SImode and DImode */ - 2, /* cost of moving SSE register */ - {4, 4, 4}, /* cost of loading SSE registers - in SImode, DImode and TImode */ - {4, 4, 4}, /* cost of storing SSE registers - in SImode, DImode and TImode */ - 2, /* MMX or SSE register to integer */ - /* On K8: - MOVD reg64, xmmreg Double FSTORE 4 - MOVD reg32, xmmreg Double FSTORE 4 - On AMDFAM10: - MOVD reg64, xmmreg Double FADD 3 - 1/1 1/1 - MOVD reg32, xmmreg Double FADD 3 - 1/1 1/1 */ + 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ + {12, 12, 10, 20, 30}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit */ + {12, 12, 10, 20, 30}, /* cost of unaligned loads. */ + {10, 10, 10, 20, 30}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit */ + {10, 10, 10, 20, 30}, /* cost of unaligned stores. */ + 16, 20, /* SSE->integer and integer->SSE moves */ 16, /* size of l1 cache. */ 2048, /* size of l2 cache. 
*/ 64, /* size of prefetch block */ @@ -1037,35 +1112,32 @@ const struct processor_costs bdver2_cost = { COSTS_N_INSNS (1), /* cost of movzx */ 8, /* "large" insn */ 9, /* MOVE_RATIO */ - 4, /* cost for loading QImode using movzbl */ - {5, 5, 4}, /* cost of loading integer registers + + /* All move costs are relative to integer->integer move times 2 and thus + they are latency*2. */ + 8, /* cost for loading QImode using movzbl */ + {8, 8, 8}, /* cost of loading integer registers in QImode, HImode and SImode. Relative to reg-reg move (2). */ - {4, 4, 4}, /* cost of storing integer registers */ - 2, /* cost of reg,reg fld/fst */ - {5, 5, 12}, /* cost of loading fp registers + {8, 8, 8}, /* cost of storing integer registers */ + 4, /* cost of reg,reg fld/fst */ + {12, 12, 28}, /* cost of loading fp registers in SFmode, DFmode and XFmode */ - {4, 4, 8}, /* cost of storing fp registers + {10, 10, 18}, /* cost of storing fp registers in SFmode, DFmode and XFmode */ - 2, /* cost of moving MMX register */ - {4, 4}, /* cost of loading MMX registers + 4, /* cost of moving MMX register */ + {12, 12}, /* cost of loading MMX registers in SImode and DImode */ - {4, 4}, /* cost of storing MMX registers + {10, 10}, /* cost of storing MMX registers in SImode and DImode */ - 2, /* cost of moving SSE register */ - {4, 4, 4}, /* cost of loading SSE registers - in SImode, DImode and TImode */ - {4, 4, 4}, /* cost of storing SSE registers - in SImode, DImode and TImode */ - 2, /* MMX or SSE register to integer */ - /* On K8: - MOVD reg64, xmmreg Double FSTORE 4 - MOVD reg32, xmmreg Double FSTORE 4 - On AMDFAM10: - MOVD reg64, xmmreg Double FADD 3 - 1/1 1/1 - MOVD reg32, xmmreg Double FADD 3 - 1/1 1/1 */ + 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ + {12, 12, 10, 20, 30}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit */ + {12, 12, 10, 20, 30}, /* cost of unaligned loads. 
*/ + {10, 10, 10, 20, 30}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit */ + {10, 10, 10, 20, 30}, /* cost of unaligned stores. */ + 16, 20, /* SSE->integer and integer->SSE moves */ 16, /* size of l1 cache. */ 2048, /* size of l2 cache. */ 64, /* size of prefetch block */ @@ -1136,27 +1208,32 @@ struct processor_costs bdver3_cost = { COSTS_N_INSNS (1), /* cost of movzx */ 8, /* "large" insn */ 9, /* MOVE_RATIO */ - 4, /* cost for loading QImode using movzbl */ - {5, 5, 4}, /* cost of loading integer registers + + /* All move costs are relative to integer->integer move times 2 and thus + they are latency*2. */ + 8, /* cost for loading QImode using movzbl */ + {8, 8, 8}, /* cost of loading integer registers in QImode, HImode and SImode. Relative to reg-reg move (2). */ - {4, 4, 4}, /* cost of storing integer registers */ - 2, /* cost of reg,reg fld/fst */ - {5, 5, 12}, /* cost of loading fp registers + {8, 8, 8}, /* cost of storing integer registers */ + 4, /* cost of reg,reg fld/fst */ + {12, 12, 28}, /* cost of loading fp registers in SFmode, DFmode and XFmode */ - {4, 4, 8}, /* cost of storing fp registers + {10, 10, 18}, /* cost of storing fp registers in SFmode, DFmode and XFmode */ - 2, /* cost of moving MMX register */ - {4, 4}, /* cost of loading MMX registers + 4, /* cost of moving MMX register */ + {12, 12}, /* cost of loading MMX registers in SImode and DImode */ - {4, 4}, /* cost of storing MMX registers + {10, 10}, /* cost of storing MMX registers in SImode and DImode */ - 2, /* cost of moving SSE register */ - {4, 4, 4}, /* cost of loading SSE registers - in SImode, DImode and TImode */ - {4, 4, 4}, /* cost of storing SSE registers - in SImode, DImode and TImode */ - 2, /* MMX or SSE register to integer */ + 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ + {12, 12, 10, 20, 30}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit */ + {12, 12, 10, 20, 30}, /* cost of unaligned loads. 
*/ + {10, 10, 10, 20, 30}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit */ + {10, 10, 10, 20, 30}, /* cost of unaligned stores. */ + 16, 20, /* SSE->integer and integer->SSE moves */ 16, /* size of l1 cache. */ 2048, /* size of l2 cache. */ 64, /* size of prefetch block */ @@ -1226,27 +1303,32 @@ struct processor_costs bdver4_cost = { COSTS_N_INSNS (1), /* cost of movzx */ 8, /* "large" insn */ 9, /* MOVE_RATIO */ - 4, /* cost for loading QImode using movzbl */ - {5, 5, 4}, /* cost of loading integer registers + + /* All move costs are relative to integer->integer move times 2 and thus + they are latency*2. */ + 8, /* cost for loading QImode using movzbl */ + {8, 8, 8}, /* cost of loading integer registers in QImode, HImode and SImode. Relative to reg-reg move (2). */ - {4, 4, 4}, /* cost of storing integer registers */ - 2, /* cost of reg,reg fld/fst */ - {5, 5, 12}, /* cost of loading fp registers + {8, 8, 8}, /* cost of storing integer registers */ + 4, /* cost of reg,reg fld/fst */ + {12, 12, 28}, /* cost of loading fp registers in SFmode, DFmode and XFmode */ - {4, 4, 8}, /* cost of storing fp registers + {10, 10, 18}, /* cost of storing fp registers in SFmode, DFmode and XFmode */ - 2, /* cost of moving MMX register */ - {4, 4}, /* cost of loading MMX registers + 4, /* cost of moving MMX register */ + {12, 12}, /* cost of loading MMX registers in SImode and DImode */ - {4, 4}, /* cost of storing MMX registers + {10, 10}, /* cost of storing MMX registers in SImode and DImode */ - 2, /* cost of moving SSE register */ - {4, 4, 4}, /* cost of loading SSE registers - in SImode, DImode and TImode */ - {4, 4, 4}, /* cost of storing SSE registers - in SImode, DImode and TImode */ - 2, /* MMX or SSE register to integer */ + 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ + {12, 12, 10, 20, 30}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit */ + {12, 12, 10, 20, 30}, /* cost of unaligned loads. 
*/ + {10, 10, 10, 20, 30}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit */ + {10, 10, 10, 20, 30}, /* cost of unaligned stores. */ + 16, 20, /* SSE->integer and integer->SSE moves */ 16, /* size of l1 cache. */ 2048, /* size of l2 cache. */ 64, /* size of prefetch block */ @@ -1321,6 +1403,9 @@ struct processor_costs znver1_cost = { 8, /* "large" insn. */ 9, /* MOVE_RATIO. */ + /* All move costs are relative to integer->integer move times 2 and thus + they are latency*2. */ + /* reg-reg moves are done by renaming and thus they are even cheaper than 1 cycle. Becuase reg-reg move cost is 2 and the following tables correspond to doubles of latencies, we do not model this correctly. It does not @@ -1342,12 +1427,14 @@ struct processor_costs znver1_cost = { in SImode and DImode. */ {8, 8}, /* cost of storing MMX registers in SImode and DImode. */ - 2, /* cost of moving SSE register. */ - {6, 6, 6}, /* cost of loading SSE registers - in SImode, DImode and TImode. */ - {8, 8, 8}, /* cost of storing SSE registers - in SImode, DImode and TImode. */ - 6, /* MMX or SSE register to integer. */ + 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */ + {6, 6, 6, 10, 20}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit. */ + {6, 6, 6, 10, 20}, /* cost of unaligned loads. */ + {8, 8, 8, 8, 16}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit. */ + {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ + 6, 6, /* SSE->integer and integer->SSE moves. */ 32, /* size of l1 cache. */ 512, /* size of l2 cache. */ 64, /* size of prefetch block. */ @@ -1426,35 +1513,32 @@ const struct processor_costs btver1_cost = { COSTS_N_INSNS (1), /* cost of movzx */ 8, /* "large" insn */ 9, /* MOVE_RATIO */ - 4, /* cost for loading QImode using movzbl */ - {3, 4, 3}, /* cost of loading integer registers + + /* All move costs are relative to integer->integer move times 2 and thus + they are latency*2. 
*/ + 8, /* cost for loading QImode using movzbl */ + {6, 8, 6}, /* cost of loading integer registers in QImode, HImode and SImode. Relative to reg-reg move (2). */ - {3, 4, 3}, /* cost of storing integer registers */ + {6, 8, 6}, /* cost of storing integer registers */ 4, /* cost of reg,reg fld/fst */ - {4, 4, 12}, /* cost of loading fp registers + {12, 12, 28}, /* cost of loading fp registers in SFmode, DFmode and XFmode */ - {6, 6, 8}, /* cost of storing fp registers + {12, 12, 38}, /* cost of storing fp registers in SFmode, DFmode and XFmode */ - 2, /* cost of moving MMX register */ - {3, 3}, /* cost of loading MMX registers + 4, /* cost of moving MMX register */ + {10, 10}, /* cost of loading MMX registers in SImode and DImode */ - {4, 4}, /* cost of storing MMX registers + {12, 12}, /* cost of storing MMX registers in SImode and DImode */ - 2, /* cost of moving SSE register */ - {4, 4, 3}, /* cost of loading SSE registers - in SImode, DImode and TImode */ - {4, 4, 5}, /* cost of storing SSE registers - in SImode, DImode and TImode */ - 3, /* MMX or SSE register to integer */ - /* On K8: - MOVD reg64, xmmreg Double FSTORE 4 - MOVD reg32, xmmreg Double FSTORE 4 - On AMDFAM10: - MOVD reg64, xmmreg Double FADD 3 - 1/1 1/1 - MOVD reg32, xmmreg Double FADD 3 - 1/1 1/1 */ + 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ + {10, 10, 12, 24, 48}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit */ + {10, 10, 12, 24, 48}, /* cost of unaligned loads. */ + {10, 10, 12, 24, 48}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit */ + {10, 10, 12, 24, 48}, /* cost of unaligned stores. */ + 14, 14, /* SSE->integer and integer->SSE moves */ 32, /* size of l1 cache. */ 512, /* size of l2 cache. 
*/ 64, /* size of prefetch block */ @@ -1514,35 +1598,32 @@ const struct processor_costs btver2_cost = { COSTS_N_INSNS (1), /* cost of movzx */ 8, /* "large" insn */ 9, /* MOVE_RATIO */ - 4, /* cost for loading QImode using movzbl */ - {3, 4, 3}, /* cost of loading integer registers + + /* All move costs are relative to integer->integer move times 2 and thus + they are latency*2. */ + 8, /* cost for loading QImode using movzbl */ + {8, 8, 6}, /* cost of loading integer registers in QImode, HImode and SImode. Relative to reg-reg move (2). */ - {3, 4, 3}, /* cost of storing integer registers */ + {8, 8, 6}, /* cost of storing integer registers */ 4, /* cost of reg,reg fld/fst */ - {4, 4, 12}, /* cost of loading fp registers + {12, 12, 28}, /* cost of loading fp registers in SFmode, DFmode and XFmode */ - {6, 6, 8}, /* cost of storing fp registers + {12, 12, 38}, /* cost of storing fp registers in SFmode, DFmode and XFmode */ - 2, /* cost of moving MMX register */ - {3, 3}, /* cost of loading MMX registers + 4, /* cost of moving MMX register */ + {10, 10}, /* cost of loading MMX registers in SImode and DImode */ - {4, 4}, /* cost of storing MMX registers + {12, 12}, /* cost of storing MMX registers in SImode and DImode */ - 2, /* cost of moving SSE register */ - {4, 4, 3}, /* cost of loading SSE registers - in SImode, DImode and TImode */ - {4, 4, 5}, /* cost of storing SSE registers - in SImode, DImode and TImode */ - 3, /* MMX or SSE register to integer */ - /* On K8: - MOVD reg64, xmmreg Double FSTORE 4 - MOVD reg32, xmmreg Double FSTORE 4 - On AMDFAM10: - MOVD reg64, xmmreg Double FADD 3 - 1/1 1/1 - MOVD reg32, xmmreg Double FADD 3 - 1/1 1/1 */ + 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ + {10, 10, 12, 24, 48}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit */ + {10, 10, 12, 24, 48}, /* cost of unaligned loads. 
*/ + {10, 10, 12, 24, 48}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit */ + {10, 10, 12, 24, 48}, /* cost of unaligned stores. */ + 14, 14, /* SSE->integer and integer->SSE moves */ 32, /* size of l1 cache. */ 2048, /* size of l2 cache. */ 64, /* size of prefetch block */ @@ -1601,27 +1682,32 @@ struct processor_costs pentium4_cost = { COSTS_N_INSNS (1), /* cost of movzx */ 16, /* "large" insn */ 6, /* MOVE_RATIO */ - 2, /* cost for loading QImode using movzbl */ + + /* All move costs are relative to integer->integer move times 2 and thus + they are latency*2. */ + 5, /* cost for loading QImode using movzbl */ {4, 5, 4}, /* cost of loading integer registers in QImode, HImode and SImode. Relative to reg-reg move (2). */ {2, 3, 2}, /* cost of storing integer registers */ - 2, /* cost of reg,reg fld/fst */ - {2, 2, 6}, /* cost of loading fp registers + 12, /* cost of reg,reg fld/fst */ + {14, 14, 14}, /* cost of loading fp registers in SFmode, DFmode and XFmode */ - {4, 4, 6}, /* cost of storing fp registers + {14, 14, 14}, /* cost of storing fp registers in SFmode, DFmode and XFmode */ - 2, /* cost of moving MMX register */ - {2, 2}, /* cost of loading MMX registers + 12, /* cost of moving MMX register */ + {16, 16}, /* cost of loading MMX registers in SImode and DImode */ - {2, 2}, /* cost of storing MMX registers + {16, 16}, /* cost of storing MMX registers in SImode and DImode */ - 12, /* cost of moving SSE register */ - {12, 12, 12}, /* cost of loading SSE registers - in SImode, DImode and TImode */ - {2, 2, 8}, /* cost of storing SSE registers - in SImode, DImode and TImode */ - 10, /* MMX or SSE register to integer */ + 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */ + {16, 16, 16, 32, 64}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit */ + {32, 32, 32, 64, 128}, /* cost of unaligned loads. 
*/ + {16, 16, 16, 32, 64}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit */ + {32, 32, 32, 64, 128}, /* cost of unaligned stores. */ + 20, 12, /* SSE->integer and integer->SSE moves */ 8, /* size of l1 cache. */ 256, /* size of l2 cache. */ 64, /* size of prefetch block */ @@ -1683,27 +1769,32 @@ struct processor_costs nocona_cost = { COSTS_N_INSNS (1), /* cost of movzx */ 16, /* "large" insn */ 17, /* MOVE_RATIO */ + + /* All move costs are relative to integer->integer move times 2 and thus + they are latency*2. */ 4, /* cost for loading QImode using movzbl */ {4, 4, 4}, /* cost of loading integer registers in QImode, HImode and SImode. Relative to reg-reg move (2). */ {4, 4, 4}, /* cost of storing integer registers */ - 3, /* cost of reg,reg fld/fst */ - {12, 12, 12}, /* cost of loading fp registers + 12, /* cost of reg,reg fld/fst */ + {14, 14, 14}, /* cost of loading fp registers in SFmode, DFmode and XFmode */ - {4, 4, 4}, /* cost of storing fp registers + {14, 14, 14}, /* cost of storing fp registers in SFmode, DFmode and XFmode */ - 6, /* cost of moving MMX register */ + 14, /* cost of moving MMX register */ {12, 12}, /* cost of loading MMX registers in SImode and DImode */ {12, 12}, /* cost of storing MMX registers in SImode and DImode */ - 6, /* cost of moving SSE register */ - {12, 12, 12}, /* cost of loading SSE registers - in SImode, DImode and TImode */ - {12, 12, 12}, /* cost of storing SSE registers - in SImode, DImode and TImode */ - 8, /* MMX or SSE register to integer */ + 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */ + {12, 12, 12, 24, 48}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit */ + {24, 24, 24, 48, 96}, /* cost of unaligned loads. */ + {12, 12, 12, 24, 48}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit */ + {24, 24, 24, 48, 96}, /* cost of unaligned stores. */ + 20, 12, /* SSE->integer and integer->SSE moves */ 8, /* size of l1 cache. */ 1024, /* size of l2 cache. 
*/ 64, /* size of prefetch block */ @@ -1763,27 +1854,32 @@ struct processor_costs atom_cost = { COSTS_N_INSNS (1), /* cost of movzx */ 8, /* "large" insn */ 17, /* MOVE_RATIO */ - 4, /* cost for loading QImode using movzbl */ - {4, 4, 4}, /* cost of loading integer registers + + /* All move costs are relative to integer->integer move times 2 and thus + they are latency*2. */ + 6, /* cost for loading QImode using movzbl */ + {6, 6, 6}, /* cost of loading integer registers in QImode, HImode and SImode. Relative to reg-reg move (2). */ - {4, 4, 4}, /* cost of storing integer registers */ + {6, 6, 6}, /* cost of storing integer registers */ 4, /* cost of reg,reg fld/fst */ - {12, 12, 12}, /* cost of loading fp registers + {6, 6, 18}, /* cost of loading fp registers in SFmode, DFmode and XFmode */ - {6, 6, 8}, /* cost of storing fp registers + {14, 14, 24}, /* cost of storing fp registers in SFmode, DFmode and XFmode */ 2, /* cost of moving MMX register */ {8, 8}, /* cost of loading MMX registers in SImode and DImode */ - {8, 8}, /* cost of storing MMX registers + {10, 10}, /* cost of storing MMX registers in SImode and DImode */ - 2, /* cost of moving SSE register */ - {8, 8, 8}, /* cost of loading SSE registers - in SImode, DImode and TImode */ - {8, 8, 8}, /* cost of storing SSE registers - in SImode, DImode and TImode */ - 5, /* MMX or SSE register to integer */ + 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ + {8, 8, 8, 16, 32}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit */ + {16, 16, 16, 32, 64}, /* cost of unaligned loads. */ + {8, 8, 8, 16, 32}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit */ + {16, 16, 16, 32, 64}, /* cost of unaligned stores. */ + 8, 6, /* SSE->integer and integer->SSE moves */ 32, /* size of l1 cache. */ 256, /* size of l2 cache. 
*/ 64, /* size of prefetch block */ @@ -1843,27 +1939,32 @@ struct processor_costs slm_cost = { COSTS_N_INSNS (1), /* cost of movzx */ 8, /* "large" insn */ 17, /* MOVE_RATIO */ - 4, /* cost for loading QImode using movzbl */ - {4, 4, 4}, /* cost of loading integer registers + + /* All move costs are relative to integer->integer move times 2 and thus + they are latency*2. */ + 8, /* cost for loading QImode using movzbl */ + {8, 8, 8}, /* cost of loading integer registers in QImode, HImode and SImode. Relative to reg-reg move (2). */ - {4, 4, 4}, /* cost of storing integer registers */ - 4, /* cost of reg,reg fld/fst */ - {12, 12, 12}, /* cost of loading fp registers + {6, 6, 6}, /* cost of storing integer registers */ + 2, /* cost of reg,reg fld/fst */ + {8, 8, 18}, /* cost of loading fp registers in SFmode, DFmode and XFmode */ - {6, 6, 8}, /* cost of storing fp registers + {6, 6, 18}, /* cost of storing fp registers in SFmode, DFmode and XFmode */ 2, /* cost of moving MMX register */ {8, 8}, /* cost of loading MMX registers in SImode and DImode */ - {8, 8}, /* cost of storing MMX registers + {6, 6}, /* cost of storing MMX registers in SImode and DImode */ - 2, /* cost of moving SSE register */ - {8, 8, 8}, /* cost of loading SSE registers - in SImode, DImode and TImode */ - {8, 8, 8}, /* cost of storing SSE registers - in SImode, DImode and TImode */ - 5, /* MMX or SSE register to integer */ + 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ + {8, 8, 8, 16, 32}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit */ + {16, 16, 16, 32, 64}, /* cost of unaligned loads. */ + {8, 8, 8, 16, 32}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit */ + {16, 16, 16, 32, 64}, /* cost of unaligned stores. */ + 8, 6, /* SSE->integer and integer->SSE moves */ 32, /* size of l1 cache. */ 256, /* size of l2 cache. 
*/ 64, /* size of prefetch block */ @@ -1923,6 +2024,9 @@ struct processor_costs intel_cost = { COSTS_N_INSNS (1), /* cost of movzx */ 8, /* "large" insn */ 17, /* MOVE_RATIO */ + + /* All move costs are relative to integer->integer move times 2 and thus + they are latency*2. */ 6, /* cost for loading QImode using movzbl */ {4, 4, 4}, /* cost of loading integer registers in QImode, HImode and SImode. @@ -1938,12 +2042,14 @@ struct processor_costs intel_cost = { in SImode and DImode */ {6, 6}, /* cost of storing MMX registers in SImode and DImode */ - 2, /* cost of moving SSE register */ - {6, 6, 6}, /* cost of loading SSE registers - in SImode, DImode and TImode */ - {6, 6, 6}, /* cost of storing SSE registers - in SImode, DImode and TImode */ - 2, /* MMX or SSE register to integer */ + 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */ + {6, 6, 6, 6, 6}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit */ + {10, 10, 10, 10, 10}, /* cost of unaligned loads. */ + {6, 6, 6, 6, 6}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit */ + {10, 10, 10, 10, 10}, /* cost of unaligned stores. */ + 4, 4, /* SSE->integer and integer->SSE moves */ 32, /* size of l1 cache. */ 256, /* size of l2 cache. */ 64, /* size of prefetch block */ @@ -2010,6 +2116,9 @@ struct processor_costs generic_cost = { COSTS_N_INSNS (1), /* cost of movzx */ 8, /* "large" insn */ 17, /* MOVE_RATIO */ + + /* All move costs are relative to integer->integer move times 2 and thus + they are latency*2. */ 4, /* cost for loading QImode using movzbl */ {4, 4, 4}, /* cost of loading integer registers in QImode, HImode and SImode. 
@@ -2025,12 +2134,14 @@ struct processor_costs generic_cost = { in SImode and DImode */ {6, 6}, /* cost of storing MMX registers in SImode and DImode */ - 2, /* cost of moving SSE register */ - {6, 6, 6}, /* cost of loading SSE registers - in SImode, DImode and TImode */ - {6, 6, 6}, /* cost of storing SSE registers - in SImode, DImode and TImode */ - 6, /* MMX or SSE register to integer */ + 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */ + {6, 6, 6, 10, 15}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit */ + {10, 10, 10, 15, 20}, /* cost of unaligned loads. */ + {6, 6, 6, 10, 15}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit */ + {10, 10, 10, 15, 20}, /* cost of unaligned stores. */ + 20, 20, /* SSE->integer and integer->SSE moves */ 32, /* size of l1 cache. */ 512, /* size of l2 cache. */ 64, /* size of prefetch block */ @@ -2102,6 +2213,9 @@ struct processor_costs core_cost = { COSTS_N_INSNS (1), /* cost of movzx */ 8, /* "large" insn */ 17, /* MOVE_RATIO */ + + /* All move costs are relative to integer->integer move times 2 and thus + they are latency*2. */ 6, /* cost for loading QImode using movzbl */ {4, 4, 4}, /* cost of loading integer registers in QImode, HImode and SImode. @@ -2117,12 +2231,14 @@ struct processor_costs core_cost = { in SImode and DImode */ {6, 6}, /* cost of storing MMX registers in SImode and DImode */ - 2, /* cost of moving SSE register */ - {6, 6, 6}, /* cost of loading SSE registers - in SImode, DImode and TImode */ - {6, 6, 6}, /* cost of storing SSE registers - in SImode, DImode and TImode */ - 2, /* MMX or SSE register to integer */ + 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ + {6, 6, 6, 6, 12}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit */ + {6, 6, 6, 6, 12}, /* cost of unaligned loads. */ + {6, 6, 6, 6, 12}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit */ + {6, 6, 6, 6, 12}, /* cost of unaligned stores. 
*/ + 2, 2, /* SSE->integer and integer->SSE moves */ 64, /* size of l1 cache. */ 512, /* size of l2 cache. */ 64, /* size of prefetch block */ -- 2.30.2