From 1133125eb84a5326b5e59595b00b5ec8add169dc Mon Sep 17 00:00:00 2001 From: Harsha Jagasia Date: Fri, 14 May 2010 17:35:11 +0000 Subject: [PATCH] config.gcc: Add support for --with-cpu option for bdver1. 2010-05-14 Harsha Jagasia * config.gcc: Add support for --with-cpu option for bdver1. * config/i386/i386.h (TARGET_BDVER1): New macro. (ix86_tune_indices): Change SSE_UNALIGNED_MOVE_OPTIMAL to SSE_UNALIGNED_LOAD_OPTIMAL. Add SSE_UNALIGNED_STORE_OPTIMAL. (ix86_tune_features): Change SSE_UNALIGNED_MOVE_OPTIMAL to SSE_UNALIGNED_LOAD_OPTIMAL. Add SSE_UNALIGNED_STORE_OPTIMAL. Add SSE_PACKED_SINGLE_INSN_OPTIMAL. (TARGET_CPU_DEFAULT_NAMES): Add bdver1. (processor_type): Add PROCESSOR_BDVER1. * config/i386/i386.md: Add bdver1 as a new cpu attribute to match processor_type in config/i386/i386.h. Add check for TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL to emit movaps instead of movapd when replacing movsd or movss for SSE and AVX. Add check for TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL to emit packed xor instead of packed double/packed integer xor for SSE and AVX when moving a zero value. * config/i386/sse.md: Add check for TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL to emit movaps instead of movapd/movdqa for SSE and AVX. Add check for TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL to emit packed single logical operations i.e. and, or and xor instead of packed double logical operations for SSE and AVX. * config/i386/i386-c.c: (ix86_target_macros_internal): Add PROCESSOR_BDVER1. * config/i386/driver-i386.c: Turn on -mtune=native for BDVER1. (has_fma4, has_xop): New. * config/i386/i386.c (bdver1_cost): New variable. (m_BDVER1): New macro. (m_AMD_MULTIPLE): Add m_BDVER1. 
(x86_tune_use_leave, x86_tune_push_memory, x86_tune_unroll_strlen, x86_tune_deep_branch_prediction, x86_tune_use_sahf, x86_tune_movx, x86_tune_use_simode_fiop, x86_tune_promote_qimode, x86_tune_add_esp_8, x86_tune_sub_esp_4, x86_tune_sub_esp_8, x86_tune_integer_dfmode_moves, x86_tune_partial_reg_dependency, x86_tune_sse_partial_reg_dependency, x86_tune_sse_unaligned_load_optimal, x86_tune_sse_unaligned_store_optimal, x86_tune_sse_typeless_stores, x86_tune_memory_mismatch_stall, x86_tune_use_ffreep, x86_tune_inter_unit_moves, x86_tune_inter_unit_conversions, x86_tune_use_bt, x86_tune_pad_returns, x86_tune_slow_imul_imm32_mem, x86_tune_slow_imul_imm8, x86_tune_fuse_cmp_and_branch): Enable/disable for bdver1. (processor_target_table): Add bdver1_cost. (cpu_names): Add bdver1. (override_options): Set up PROCESSOR_BDVER1 for bdver1 entry in processor_alias_table. (ix86_expand_vector_move_misalign): Change TARGET_SSE_UNALIGNED_MOVE_OPTIMAL to TARGET_SSE_UNALIGNED_LOAD_OPTIMAL. Check for TARGET_SSE_UNALIGNED_STORE_OPTIMAL. Check for TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL to emit movups instead of movupd/movdqu for SSE and AVX. (ix86_tune_issue_rate): Add PROCESSOR_BDVER1. (ix86_tune_adjust_cost): Add code for bdver1. (standard_sse_constant_opcode): Add check for TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL to emit packed single xor instead of packed double xor for SSE and AVX. From-SVN: r159399 --- gcc/ChangeLog | 58 +++++++++ gcc/config.gcc | 24 ++-- gcc/config/i386/driver-i386.c | 12 ++ gcc/config/i386/i386-c.c | 7 ++ gcc/config/i386/i386.c | 224 ++++++++++++++++++++++++++++------ gcc/config/i386/i386.h | 15 ++- gcc/config/i386/i386.md | 62 ++++++++-- gcc/config/i386/sse.md | 48 ++++++-- 8 files changed, 381 insertions(+), 69 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 3d84fe834f4..d360ebec8c3 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,61 @@ +2010-05-14 Harsha Jagasia + + * config.gcc: Add support for --with-cpu option for bdver1. 
+ * config/i386/i386.h (TARGET_BDVER1): New macro. + (ix86_tune_indices): Change SSE_UNALIGNED_MOVE_OPTIMAL + to SSE_UNALIGNED_LOAD_OPTIMAL. Add SSE_UNALIGNED_STORE_OPTIMAL. + (ix86_tune_features): Change SSE_UNALIGNED_MOVE_OPTIMAL + to SSE_UNALIGNED_LOAD_OPTIMAL. Add SSE_UNALIGNED_STORE_OPTIMAL. + Add SSE_PACKED_SINGLE_INSN_OPTIMAL. + (TARGET_CPU_DEFAULT_NAMES): Add bdver1. + (processor_type): Add PROCESSOR_BDVER1. + * config/i386/i386.md: Add bdver1 as a new cpu attribute to match + processor_type in config/i386/i386.h. + Add check for TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL to emit + movaps instead of movapd when replacing + movsd or movss for SSE and AVX. + Add check for TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL + to emit packed xor instead of packed double/packed integer + xor for SSE and AVX when moving a zero value. + * config/i386/sse.md: Add check for TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL + to emit movaps instead of movapd/movdqa for SSE and AVX. + Add check for TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL to emit packed single + logical operations i.e. and, or and xor instead of packed double logical + operations for SSE and AVX. + * config/i386/i386-c.c: + (ix86_target_macros_internal): Add PROCESSOR_BDVER1. + * config/i386/driver-i386.c: Turn on -mtune=native for BDVER1. + (has_fma4, has_xop): New. + * config/i386/i386.c (bdver1_cost): New variable. + (m_BDVER1): New macro. + (m_AMD_MULTIPLE): Add m_BDVER1. 
+ (x86_tune_use_leave, x86_tune_push_memory, x86_tune_unroll_strlen, + x86_tune_deep_branch_prediction, x86_tune_use_sahf, x86_tune_movx, + x86_tune_use_simode_fiop, x86_tune_promote_qimode, + x86_tune_add_esp_8, x86_tune_sub_esp_4, x86_tune_sub_esp_8, + x86_tune_integer_dfmode_moves, x86_tune_partial_reg_dependency, + x86_tune_sse_partial_reg_dependency, x86_tune_sse_unaligned_load_optimal, + x86_tune_sse_unaligned_store_optimal, x86_tune_sse_typeless_stores, + x86_tune_memory_mismatch_stall, x86_tune_use_ffreep, + x86_tune_inter_unit_moves, x86_tune_inter_unit_conversions, + x86_tune_use_bt, x86_tune_pad_returns, x86_tune_slow_imul_imm32_mem, + x86_tune_slow_imul_imm8, x86_tune_fuse_cmp_and_branch): + Enable/disable for bdver1. + (processor_target_table): Add bdver1_cost. + (cpu_names): Add bdver1. + (override_options): Set up PROCESSOR_BDVER1 for bdver1 entry in + processor_alias_table. + (ix86_expand_vector_move_misalign): Change + TARGET_SSE_UNALIGNED_MOVE_OPTIMAL to TARGET_SSE_UNALIGNED_LOAD_OPTIMAL. + Check for TARGET_SSE_UNALIGNED_STORE_OPTIMAL. + Check for TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL to emit movups instead + of movupd/movdqu for SSE and AVX. + (ix86_tune_issue_rate): Add PROCESSOR_BDVER1. + (ix86_tune_adjust_cost): Add code for bdver1. + (standard_sse_constant_opcode): Add check for + TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL to emit packed single xor instead + of packed double xor for SSE and AVX. 
+ 2010-05-14 Pat Haugen * tree-ssa-loop.prefetch.c (prune_ref_by_group_reuse): Cast abs() diff --git a/gcc/config.gcc b/gcc/config.gcc index 21433fc690a..da56fbecc20 100644 --- a/gcc/config.gcc +++ b/gcc/config.gcc @@ -1139,7 +1139,7 @@ i[34567]86-*-linux* | i[34567]86-*-kfreebsd*-gnu | i[34567]86-*-knetbsd*-gnu | i need_64bit_hwint=yes need_64bit_isa=yes case X"${with_cpu}" in - Xgeneric|Xatom|Xcore2|Xnocona|Xx86-64|Xamdfam10|Xbarcelona|Xk8|Xopteron|Xathlon64|Xathlon-fx|Xathlon64-sse3|Xk8-sse3|Xopteron-sse3) + Xgeneric|Xatom|Xcore2|Xnocona|Xx86-64|Xbdver1|Xamdfam10|Xbarcelona|Xk8|Xopteron|Xathlon64|Xathlon-fx|Xathlon64-sse3|Xk8-sse3|Xopteron-sse3) ;; X) if test x$with_cpu_64 = x; then @@ -1148,7 +1148,7 @@ i[34567]86-*-linux* | i[34567]86-*-kfreebsd*-gnu | i[34567]86-*-knetbsd*-gnu | i ;; *) echo "Unsupported CPU used in --with-cpu=$with_cpu, supported values:" 1>&2 - echo "generic atom core2 nocona x86-64 amdfam10 barcelona k8 opteron athlon64 athlon-fx athlon64-sse3 k8-sse3 opteron-sse3" 1>&2 + echo "generic atom core2 nocona x86-64 bdver1 amdfam10 barcelona k8 opteron athlon64 athlon-fx athlon64-sse3 k8-sse3 opteron-sse3" 1>&2 exit 1 ;; esac @@ -1266,7 +1266,7 @@ i[34567]86-*-solaris2*) need_64bit_isa=yes use_gcc_stdint=wrap case X"${with_cpu}" in - Xgeneric|Xatom|Xcore2|Xnocona|Xx86-64|Xamdfam10|Xbarcelona|Xk8|Xopteron|Xathlon64|Xathlon-fx|Xathlon64-sse3|Xk8-sse3|Xopteron-sse3) + Xgeneric|Xatom|Xcore2|Xnocona|Xx86-64|Xbdver1|Xamdfam10|Xbarcelona|Xk8|Xopteron|Xathlon64|Xathlon-fx|Xathlon64-sse3|Xk8-sse3|Xopteron-sse3) ;; X) if test x$with_cpu_64 = x; then @@ -1275,7 +1275,7 @@ i[34567]86-*-solaris2*) ;; *) echo "Unsupported CPU used in --with-cpu=$with_cpu, supported values:" 1>&2 - echo "generic atom core2 nocona x86-64 amdfam10 barcelona k8 opteron athlon64 athlon-fx athlon64-sse3 k8-sse3 opteron-sse3" 1>&2 + echo "generic atom core2 nocona x86-64 bdver1 amdfam10 barcelona k8 opteron athlon64 athlon-fx athlon64-sse3 k8-sse3 opteron-sse3" 1>&2 exit 1 ;; esac 
@@ -1346,7 +1346,7 @@ i[34567]86-*-mingw* | x86_64-*-mingw*) if test x$enable_targets = xall; then tm_defines="${tm_defines} TARGET_BI_ARCH=1" case X"${with_cpu}" in - Xgeneric|Xatom|Xcore2|Xnocona|Xx86-64|Xamdfam10|Xbarcelona|Xk8|Xopteron|Xathlon64|Xathlon-fx|Xathlon64-sse3|Xk8-sse3|Xopteron-sse3) + Xgeneric|Xatom|Xcore2|Xnocona|Xx86-64|Xbdver1|Xamdfam10|Xbarcelona|Xk8|Xopteron|Xathlon64|Xathlon-fx|Xathlon64-sse3|Xk8-sse3|Xopteron-sse3) ;; X) if test x$with_cpu_64 = x; then @@ -1355,7 +1355,7 @@ i[34567]86-*-mingw* | x86_64-*-mingw*) ;; *) echo "Unsupported CPU used in --with-cpu=$with_cpu, supported values:" 1>&2 - echo "generic atom core2 nocona x86-64 amdfam10 barcelona k8 opteron athlon64 athlon-fx athlon64-sse3 k8-sse3 opteron-sse3" 1>&2 + echo "generic atom core2 nocona x86-64 bdver1 amdfam10 barcelona k8 opteron athlon64 athlon-fx athlon64-sse3 k8-sse3 opteron-sse3" 1>&2 exit 1 ;; esac @@ -2626,6 +2626,10 @@ case ${target} in ;; i686-*-* | i786-*-*) case ${target_noncanonical} in + bdver1-*) + arch=bdver1 + cpu=bdver1 + ;; amdfam10-*|barcelona-*) arch=amdfam10 cpu=amdfam10 @@ -2703,6 +2707,10 @@ case ${target} in ;; x86_64-*-*) case ${target_noncanonical} in + bdver1-*) + arch=bdver1 + cpu=bdver1 + ;; amdfam10-*|barcelona-*) arch=amdfam10 cpu=amdfam10 @@ -3109,8 +3117,8 @@ case "${target}" in ;; "" | x86-64 | generic | native \ | k8 | k8-sse3 | athlon64 | athlon64-sse3 | opteron \ - | opteron-sse3 | athlon-fx | amdfam10 | barcelona \ - | nocona | core2 | atom) + | opteron-sse3 | athlon-fx | bdver1 | amdfam10 \ + | barcelona | nocona | core2 | atom) # OK ;; *) diff --git a/gcc/config/i386/driver-i386.c b/gcc/config/i386/driver-i386.c index 063279aa629..8a768577c39 100644 --- a/gcc/config/i386/driver-i386.c +++ b/gcc/config/i386/driver-i386.c @@ -396,6 +396,7 @@ const char *host_detect_local_cpu (int argc, const char **argv) unsigned int has_movbe = 0, has_sse4_1 = 0, has_sse4_2 = 0; unsigned int has_popcnt = 0, has_aes = 0, has_avx = 0; unsigned int 
has_pclmul = 0, has_abm = 0, has_lwp = 0; + unsigned int has_fma4 = 0, has_xop = 0; bool arch; @@ -460,6 +461,8 @@ const char *host_detect_local_cpu (int argc, const char **argv) has_sse4a = ecx & bit_SSE4a; has_abm = ecx & bit_ABM; has_lwp = ecx & bit_LWP; + has_fma4 = ecx & bit_FMA4; + has_xop = ecx & bit_XOP; has_longmode = edx & bit_LM; has_3dnowp = edx & bit_3DNOWP; @@ -490,6 +493,8 @@ const char *host_detect_local_cpu (int argc, const char **argv) if (name == SIG_GEODE) processor = PROCESSOR_GEODE; + else if (has_xop) + processor = PROCESSOR_BDVER1; else if (has_sse4a) processor = PROCESSOR_AMDFAM10; else if (has_sse2 || has_longmode) @@ -629,6 +634,9 @@ const char *host_detect_local_cpu (int argc, const char **argv) case PROCESSOR_AMDFAM10: cpu = "amdfam10"; break; + case PROCESSOR_BDVER1: + cpu = "bdver1"; + break; default: /* Use something reasonable. */ @@ -674,6 +682,10 @@ const char *host_detect_local_cpu (int argc, const char **argv) options = concat (options, " -mabm", NULL); if (has_lwp) options = concat (options, " -mlwp", NULL); + if (has_fma4) + options = concat (options, " -mfma4", NULL); + if (has_xop) + options = concat (options, " -mxop", NULL); if (has_avx) options = concat (options, " -mavx", NULL); diff --git a/gcc/config/i386/i386-c.c b/gcc/config/i386/i386-c.c index 35eab492b6c..285f6ef3c93 100644 --- a/gcc/config/i386/i386-c.c +++ b/gcc/config/i386/i386-c.c @@ -107,6 +107,10 @@ ix86_target_macros_internal (int isa_flag, def_or_undef (parse_in, "__amdfam10"); def_or_undef (parse_in, "__amdfam10__"); break; + case PROCESSOR_BDVER1: + def_or_undef (parse_in, "__bdver1"); + def_or_undef (parse_in, "__bdver1__"); + break; case PROCESSOR_PENTIUM4: def_or_undef (parse_in, "__pentium4"); def_or_undef (parse_in, "__pentium4__"); @@ -182,6 +186,9 @@ ix86_target_macros_internal (int isa_flag, case PROCESSOR_AMDFAM10: def_or_undef (parse_in, "__tune_amdfam10__"); break; + case PROCESSOR_BDVER1: + def_or_undef (parse_in, "__tune_bdver1__"); + break; 
case PROCESSOR_PENTIUM4: def_or_undef (parse_in, "__tune_pentium4__"); break; diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index ee5e931f3aa..b99586a52ae 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -819,6 +819,93 @@ struct processor_costs amdfam10_cost = { 1, /* cond_not_taken_branch_cost. */ }; +struct processor_costs bdver1_cost = { + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (2), /* cost of a lea instruction */ + COSTS_N_INSNS (1), /* variable shift costs */ + COSTS_N_INSNS (1), /* constant shift costs */ + {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ + COSTS_N_INSNS (4), /* HI */ + COSTS_N_INSNS (3), /* SI */ + COSTS_N_INSNS (4), /* DI */ + COSTS_N_INSNS (5)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (35), /* HI */ + COSTS_N_INSNS (51), /* SI */ + COSTS_N_INSNS (83), /* DI */ + COSTS_N_INSNS (83)}, /* other */ + COSTS_N_INSNS (1), /* cost of movsx */ + COSTS_N_INSNS (1), /* cost of movzx */ + 8, /* "large" insn */ + 9, /* MOVE_RATIO */ + 4, /* cost for loading QImode using movzbl */ + {3, 4, 3}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). 
*/ + {3, 4, 3}, /* cost of storing integer registers */ + 4, /* cost of reg,reg fld/fst */ + {4, 4, 12}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {6, 6, 8}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {3, 3}, /* cost of loading MMX registers + in SImode and DImode */ + {4, 4}, /* cost of storing MMX registers + in SImode and DImode */ + 2, /* cost of moving SSE register */ + {4, 4, 3}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {4, 4, 5}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 3, /* MMX or SSE register to integer */ + /* On K8 + MOVD reg64, xmmreg Double FSTORE 4 + MOVD reg32, xmmreg Double FSTORE 4 + On AMDFAM10 + MOVD reg64, xmmreg Double FADD 3 + 1/1 1/1 + MOVD reg32, xmmreg Double FADD 3 + 1/1 1/1 */ + 64, /* size of l1 cache. */ + 1024, /* size of l2 cache. */ + 64, /* size of prefetch block */ + /* New AMD processors never drop prefetches; if they cannot be performed + immediately, they are queued. We set number of simultaneous prefetches + to a large constant to reflect this (it probably is not a good idea not + to limit number of prefetches at all, as their execution also takes some + time). */ + 100, /* number of parallel prefetches */ + 2, /* Branch cost */ + COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (4), /* cost of FMUL instruction. */ + COSTS_N_INSNS (19), /* cost of FDIV instruction. */ + COSTS_N_INSNS (2), /* cost of FABS instruction. */ + COSTS_N_INSNS (2), /* cost of FCHS instruction. */ + COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ + + /* BDVER1 has optimized REP instruction for medium sized blocks, but for + very small blocks it is better to use loop. For large blocks, libcall can + do nontemporary accesses and beat inline considerably. 
*/ + {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}}, + {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}, + {{libcall, {{8, loop}, {24, unrolled_loop}, + {2048, rep_prefix_4_byte}, {-1, libcall}}}, + {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}, + 4, /* scalar_stmt_cost. */ + 2, /* scalar load_cost. */ + 2, /* scalar_store_cost. */ + 6, /* vec_stmt_cost. */ + 0, /* vec_to_scalar_cost. */ + 2, /* scalar_to_vec_cost. */ + 2, /* vec_align_load_cost. */ + 2, /* vec_unalign_load_cost. */ + 2, /* vec_store_cost. */ + 2, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. */ +}; + static const struct processor_costs pentium4_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ @@ -1276,7 +1363,8 @@ const struct processor_costs *ix86_cost = &pentium_cost; #define m_ATHLON (1<mode) && ix86_binary_operator_ok (, mode, operands)" - "vp\t{%2, %1, %0|%0, %1, %2}" +{ + if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) + return "vps\t{%2, %1, %0|%0, %1, %2}"; + else + return "vp\t{%2, %1, %0|%0, %1, %2}"; +} [(set_attr "type" "sselog") (set_attr "prefix" "vex") (set_attr "mode" "")]) @@ -1631,7 +1648,12 @@ (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "xm")))] "SSE_VEC_FLOAT_MODE_P (mode) && ix86_binary_operator_ok (, mode, operands)" - "p\t{%2, %0|%0, %2}" +{ + if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) + return "ps\t{%2, %0|%0, %2}"; + else + return "p\t{%2, %0|%0, %2}"; +} [(set_attr "type" "sselog") (set_attr "mode" "")]) @@ -1687,7 +1709,12 @@ (match_operand:MODEF 1 "register_operand" "x") (match_operand:MODEF 2 "register_operand" "x")))] "AVX_FLOAT_MODE_P (mode)" - "vp\t{%2, %1, %0|%0, %1, %2}" +{ + if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) + return "vps\t{%2, %1, %0|%0, %1, %2}"; + else + return "vp\t{%2, %1, %0|%0, %1, %2}"; +} [(set_attr "type" "sselog") (set_attr "prefix" "vex") (set_attr "mode" "")]) @@ -1698,7 +1725,12 @@ (match_operand:MODEF 1 "register_operand" "0") 
(match_operand:MODEF 2 "register_operand" "x")))] "SSE_FLOAT_MODE_P (mode)" - "p\t{%2, %0|%0, %2}" +{ + if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) + return "ps\t{%2, %0|%0, %2}"; + else + return "p\t{%2, %0|%0, %2}"; +} [(set_attr "type" "sselog") (set_attr "mode" "")]) -- 2.30.2