gcc/config/i386/x86-tune-costs.h

   1
   2 /* Processor costs (relative to an add).  */
   3 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes.
        Under that assumption COSTS_N_BYTES (2), the size of one addition,
        evaluates to the same value as COSTS_N_INSNS (1), so costs written
        with either macro are on the same scale.  */
   4 #define COSTS_N_BYTES(N) ((N) * 2)
   5
   /* Placeholder stringop_algs initializer: names libcall as the leading
      algorithm and has a single {-1, libcall, false} entry.  The i386, i486,
      pentium and pentiumpro string-operation tables below use it as their
      second array element.  */
   6 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
   7
   /* memcpy strategy table referenced by ix86_size_cost.  Both array elements
      name rep_prefix_1_byte as the leading algorithm and carry a single
      {-1, rep_prefix_1_byte, false} entry.
      NOTE(review): the meaning of the two array slots and of the -1 bound is
      set by the stringop_algs declaration, which is not visible here --
      confirm there.  */
   8 static stringop_algs ix86_size_memcpy[2] = {
   9   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
  10   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
  /* memset strategy table referenced by ix86_size_cost.  Both array elements
     name rep_prefix_1_byte as the leading algorithm and carry a single
     {-1, rep_prefix_1_byte, false} entry.
     NOTE(review): slot and -1 semantics come from the stringop_algs
     declaration, which is not visible here -- confirm there.  */
  11 static stringop_algs ix86_size_memset[2] = {
  12   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
  13   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
  14
  /* Cost table used when tuning for size.  Instruction costs are written
     with COSTS_N_BYTES rather than COSTS_N_INSNS.  The initializer is
     positional, so the entries must stay in the order of the fields of
     struct processor_costs (declared elsewhere); the trailing comment on
     each line names the field it fills.  Cache sizes and prefetch
     parameters are all 0, and the string-operation tables are
     ix86_size_memcpy and ix86_size_memset.  */
  15 const
  16 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
  17   COSTS_N_BYTES (2),                    /* cost of an add instruction */
  18   COSTS_N_BYTES (3),                    /* cost of a lea instruction */
  19   COSTS_N_BYTES (2),                    /* variable shift costs */
  20   COSTS_N_BYTES (3),                    /* constant shift costs */
  21   {COSTS_N_BYTES (3),                   /* cost of starting multiply for QI */
  22    COSTS_N_BYTES (3),                   /*                               HI */
  23    COSTS_N_BYTES (3),                   /*                               SI */
  24    COSTS_N_BYTES (3),                   /*                               DI */
  25    COSTS_N_BYTES (5)},                  /*                            other */
  26   0,                                    /* cost of multiply per each bit set */
  27   {COSTS_N_BYTES (3),                   /* cost of a divide/mod for QI */
  28    COSTS_N_BYTES (3),                   /*                          HI */
  29    COSTS_N_BYTES (3),                   /*                          SI */
  30    COSTS_N_BYTES (3),                   /*                          DI */
  31    COSTS_N_BYTES (5)},                  /*                          other */
  32   COSTS_N_BYTES (3),                    /* cost of movsx */
  33   COSTS_N_BYTES (3),                    /* cost of movzx */
  34   0,                                    /* "large" insn */
  35   2,                                    /* MOVE_RATIO */
  36   2,                                 /* cost for loading QImode using movzbl */
  37   {2, 2, 2},                            /* cost of loading integer registers
  38                                            in QImode, HImode and SImode.
  39                                            Relative to reg-reg move (2).  */
  40   {2, 2, 2},                            /* cost of storing integer registers */
  41   2,                                    /* cost of reg,reg fld/fst */
  42   {2, 2, 2},                            /* cost of loading fp registers
  43                                            in SFmode, DFmode and XFmode */
  44   {2, 2, 2},                            /* cost of storing fp registers
  45                                            in SFmode, DFmode and XFmode */
  46   3,                                    /* cost of moving MMX register */
  47   {3, 3},                               /* cost of loading MMX registers
  48                                            in SImode and DImode */
  49   {3, 3},                               /* cost of storing MMX registers
  50                                            in SImode and DImode */
  51   3,                                    /* cost of moving SSE register */
  52   {3, 3, 3},                            /* cost of loading SSE registers
  53                                            in SImode, DImode and TImode */
  54   {3, 3, 3},                            /* cost of storing SSE registers
  55                                            in SImode, DImode and TImode */
  56   3,                                    /* MMX or SSE register to integer */
  57   0,                                    /* size of l1 cache  */
  58   0,                                    /* size of l2 cache  */
  59   0,                                    /* size of prefetch block */
  60   0,                                    /* number of parallel prefetches */
  61   2,                                    /* Branch cost */
  62   COSTS_N_BYTES (2),                    /* cost of FADD and FSUB insns.  */
  63   COSTS_N_BYTES (2),                    /* cost of FMUL instruction.  */
  64   COSTS_N_BYTES (2),                    /* cost of FDIV instruction.  */
  65   COSTS_N_BYTES (2),                    /* cost of FABS instruction.  */
  66   COSTS_N_BYTES (2),                    /* cost of FCHS instruction.  */
  67   COSTS_N_BYTES (2),                    /* cost of FSQRT instruction.  */
  68
  69   COSTS_N_BYTES (2),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
  70   COSTS_N_BYTES (2),                    /* cost of MULSS instruction.  */
  71   COSTS_N_BYTES (2),                    /* cost of MULSD instruction.  */
  72   COSTS_N_BYTES (2),                    /* cost of DIVSS instruction.  */
  73   COSTS_N_BYTES (2),                    /* cost of DIVSD instruction.  */
  74   COSTS_N_BYTES (2),                    /* cost of SQRTSS instruction.  */
  75   COSTS_N_BYTES (2),                    /* cost of SQRTSD instruction.  */
  76   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
  77   ix86_size_memcpy,
  78   ix86_size_memset,
  79   1,                                    /* scalar_stmt_cost.  */
  80   1,                                    /* scalar_load_cost.  */
  81   1,                                    /* scalar_store_cost.  */
  82   1,                                    /* vec_stmt_cost.  */
  83   1,                                    /* vec_to_scalar_cost.  */
  84   1,                                    /* scalar_to_vec_cost.  */
  85   1,                                    /* vec_align_load_cost.  */
  86   1,                                    /* vec_unalign_load_cost.  */
  87   1,                                    /* vec_store_cost.  */
  88   1,                                    /* cond_taken_branch_cost.  */
  89   1,                                    /* cond_not_taken_branch_cost.  */
  90 };
  91
  92 /* Processor costs (relative to an add) */
  /* memcpy strategy table referenced by i386_cost.  The first element names
     rep_prefix_1_byte with a single {-1, rep_prefix_1_byte, false} entry;
     the second element is the DUMMY_STRINGOP_ALGS placeholder.  */
  93 static stringop_algs i386_memcpy[2] = {
  94   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
  95   DUMMY_STRINGOP_ALGS};
  /* memset strategy table referenced by i386_cost.  The first element names
     rep_prefix_1_byte with a single {-1, rep_prefix_1_byte, false} entry;
     the second element is the DUMMY_STRINGOP_ALGS placeholder.  */
  96 static stringop_algs i386_memset[2] = {
  97   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
  98   DUMMY_STRINGOP_ALGS};
  99
 /* Cost table for the 386.  Instruction costs are written with
    COSTS_N_INSNS.  The initializer is positional, so the entries must stay
    in the order of the fields of struct processor_costs (declared
    elsewhere); the trailing comment on each line names the field it fills.
    Cache sizes and prefetch parameters are all 0, the SSE arithmetic costs
    repeat the x87 values, and the string-operation tables are i386_memcpy
    and i386_memset.  */
 100 static const
 101 struct processor_costs i386_cost = {    /* 386 specific costs */
 102   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 103   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 104   COSTS_N_INSNS (3),                    /* variable shift costs */
 105   COSTS_N_INSNS (2),                    /* constant shift costs */
 106   {COSTS_N_INSNS (6),                   /* cost of starting multiply for QI */
 107    COSTS_N_INSNS (6),                   /*                               HI */
 108    COSTS_N_INSNS (6),                   /*                               SI */
 109    COSTS_N_INSNS (6),                   /*                               DI */
 110    COSTS_N_INSNS (6)},                  /*                            other */
 111   COSTS_N_INSNS (1),                    /* cost of multiply per each bit set */
 112   {COSTS_N_INSNS (23),                  /* cost of a divide/mod for QI */
 113    COSTS_N_INSNS (23),                  /*                          HI */
 114    COSTS_N_INSNS (23),                  /*                          SI */
 115    COSTS_N_INSNS (23),                  /*                          DI */
 116    COSTS_N_INSNS (23)},                 /*                          other */
 117   COSTS_N_INSNS (3),                    /* cost of movsx */
 118   COSTS_N_INSNS (2),                    /* cost of movzx */
 119   15,                                   /* "large" insn */
 120   3,                                    /* MOVE_RATIO */
 121   4,                                 /* cost for loading QImode using movzbl */
 122   {2, 4, 2},                            /* cost of loading integer registers
 123                                            in QImode, HImode and SImode.
 124                                            Relative to reg-reg move (2).  */
 125   {2, 4, 2},                            /* cost of storing integer registers */
 126   2,                                    /* cost of reg,reg fld/fst */
 127   {8, 8, 8},                            /* cost of loading fp registers
 128                                            in SFmode, DFmode and XFmode */
 129   {8, 8, 8},                            /* cost of storing fp registers
 130                                            in SFmode, DFmode and XFmode */
 131   2,                                    /* cost of moving MMX register */
 132   {4, 8},                               /* cost of loading MMX registers
 133                                            in SImode and DImode */
 134   {4, 8},                               /* cost of storing MMX registers
 135                                            in SImode and DImode */
 136   2,                                    /* cost of moving SSE register */
 137   {4, 8, 16},                           /* cost of loading SSE registers
 138                                            in SImode, DImode and TImode */
 139   {4, 8, 16},                           /* cost of storing SSE registers
 140                                            in SImode, DImode and TImode */
 141   3,                                    /* MMX or SSE register to integer */
 142   0,                                    /* size of l1 cache  */
 143   0,                                    /* size of l2 cache  */
 144   0,                                    /* size of prefetch block */
 145   0,                                    /* number of parallel prefetches */
 146   1,                                    /* Branch cost */
 147   COSTS_N_INSNS (23),                   /* cost of FADD and FSUB insns.  */
 148   COSTS_N_INSNS (27),                   /* cost of FMUL instruction.  */
 149   COSTS_N_INSNS (88),                   /* cost of FDIV instruction.  */
 150   COSTS_N_INSNS (22),                   /* cost of FABS instruction.  */
 151   COSTS_N_INSNS (24),                   /* cost of FCHS instruction.  */
 152   COSTS_N_INSNS (122),                  /* cost of FSQRT instruction.  */
 153
 154   COSTS_N_INSNS (23),                   /* cost of ADDSS/SD SUBSS/SD insns.  */
 155   COSTS_N_INSNS (27),                   /* cost of MULSS instruction.  */
 156   COSTS_N_INSNS (27),                   /* cost of MULSD instruction.  */
 157   COSTS_N_INSNS (88),                   /* cost of DIVSS instruction.  */
 158   COSTS_N_INSNS (88),                   /* cost of DIVSD instruction.  */
 159   COSTS_N_INSNS (122),                  /* cost of SQRTSS instruction.  */
 160   COSTS_N_INSNS (122),                  /* cost of SQRTSD instruction.  */
 161   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 162   i386_memcpy,
 163   i386_memset,
 164   1,                                    /* scalar_stmt_cost.  */
 165   1,                                    /* scalar_load_cost.  */
 166   1,                                    /* scalar_store_cost.  */
 167   1,                                    /* vec_stmt_cost.  */
 168   1,                                    /* vec_to_scalar_cost.  */
 169   1,                                    /* scalar_to_vec_cost.  */
 170   1,                                    /* vec_align_load_cost.  */
 171   2,                                    /* vec_unalign_load_cost.  */
 172   1,                                    /* vec_store_cost.  */
 173   3,                                    /* cond_taken_branch_cost.  */
 174   1,                                    /* cond_not_taken_branch_cost.  */
 175 };
 176
 /* memcpy strategy table referenced by i486_cost.  The first element names
    rep_prefix_4_byte with a single {-1, rep_prefix_4_byte, false} entry;
    the second element is the DUMMY_STRINGOP_ALGS placeholder.  */
 177 static stringop_algs i486_memcpy[2] = {
 178   {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
 179   DUMMY_STRINGOP_ALGS};
 /* memset strategy table referenced by i486_cost.  The first element names
    rep_prefix_4_byte with a single {-1, rep_prefix_4_byte, false} entry;
    the second element is the DUMMY_STRINGOP_ALGS placeholder.  */
 180 static stringop_algs i486_memset[2] = {
 181   {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
 182   DUMMY_STRINGOP_ALGS};
 183
 /* Cost table for the 486.  Instruction costs are written with
    COSTS_N_INSNS.  The initializer is positional, so the entries must stay
    in the order of the fields of struct processor_costs (declared
    elsewhere); the trailing comment on each line names the field it fills.
    The string-operation tables are i486_memcpy and i486_memset.
    NOTE(review): "cost of multiply per each bit set" is a bare 1 here,
    whereas i386_cost writes COSTS_N_INSNS (1) for the same field -- confirm
    the unscaled value is intended.
    NOTE(review): DIVSD is 74 while DIVSS and FDIV are 73 -- confirm this
    one-off difference is intended.  */
 184 static const
 185 struct processor_costs i486_cost = {    /* 486 specific costs */
 186   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 187   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 188   COSTS_N_INSNS (3),                    /* variable shift costs */
 189   COSTS_N_INSNS (2),                    /* constant shift costs */
 190   {COSTS_N_INSNS (12),                  /* cost of starting multiply for QI */
 191    COSTS_N_INSNS (12),                  /*                               HI */
 192    COSTS_N_INSNS (12),                  /*                               SI */
 193    COSTS_N_INSNS (12),                  /*                               DI */
 194    COSTS_N_INSNS (12)},                 /*                            other */
 195   1,                                    /* cost of multiply per each bit set */
 196   {COSTS_N_INSNS (40),                  /* cost of a divide/mod for QI */
 197    COSTS_N_INSNS (40),                  /*                          HI */
 198    COSTS_N_INSNS (40),                  /*                          SI */
 199    COSTS_N_INSNS (40),                  /*                          DI */
 200    COSTS_N_INSNS (40)},                 /*                          other */
 201   COSTS_N_INSNS (3),                    /* cost of movsx */
 202   COSTS_N_INSNS (2),                    /* cost of movzx */
 203   15,                                   /* "large" insn */
 204   3,                                    /* MOVE_RATIO */
 205   4,                                 /* cost for loading QImode using movzbl */
 206   {2, 4, 2},                            /* cost of loading integer registers
 207                                            in QImode, HImode and SImode.
 208                                            Relative to reg-reg move (2).  */
 209   {2, 4, 2},                            /* cost of storing integer registers */
 210   2,                                    /* cost of reg,reg fld/fst */
 211   {8, 8, 8},                            /* cost of loading fp registers
 212                                            in SFmode, DFmode and XFmode */
 213   {8, 8, 8},                            /* cost of storing fp registers
 214                                            in SFmode, DFmode and XFmode */
 215   2,                                    /* cost of moving MMX register */
 216   {4, 8},                               /* cost of loading MMX registers
 217                                            in SImode and DImode */
 218   {4, 8},                               /* cost of storing MMX registers
 219                                            in SImode and DImode */
 220   2,                                    /* cost of moving SSE register */
 221   {4, 8, 16},                           /* cost of loading SSE registers
 222                                            in SImode, DImode and TImode */
 223   {4, 8, 16},                           /* cost of storing SSE registers
 224                                            in SImode, DImode and TImode */
 225   3,                                    /* MMX or SSE register to integer */
 226   4,                                    /* size of l1 cache.  486 has 8kB cache
 227                                            shared for code and data, so 4kB is
 228                                            not really precise.  */
 229   4,                                    /* size of l2 cache  */
 230   0,                                    /* size of prefetch block */
 231   0,                                    /* number of parallel prefetches */
 232   1,                                    /* Branch cost */
 233   COSTS_N_INSNS (8),                    /* cost of FADD and FSUB insns.  */
 234   COSTS_N_INSNS (16),                   /* cost of FMUL instruction.  */
 235   COSTS_N_INSNS (73),                   /* cost of FDIV instruction.  */
 236   COSTS_N_INSNS (3),                    /* cost of FABS instruction.  */
 237   COSTS_N_INSNS (3),                    /* cost of FCHS instruction.  */
 238   COSTS_N_INSNS (83),                   /* cost of FSQRT instruction.  */
 239
 240   COSTS_N_INSNS (8),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 241   COSTS_N_INSNS (16),                   /* cost of MULSS instruction.  */
 242   COSTS_N_INSNS (16),                   /* cost of MULSD instruction.  */
 243   COSTS_N_INSNS (73),                   /* cost of DIVSS instruction.  */
 244   COSTS_N_INSNS (74),                   /* cost of DIVSD instruction.  */
 245   COSTS_N_INSNS (83),                   /* cost of SQRTSS instruction.  */
 246   COSTS_N_INSNS (83),                   /* cost of SQRTSD instruction.  */
 247   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 248   i486_memcpy,
 249   i486_memset,
 250   1,                                    /* scalar_stmt_cost.  */
 251   1,                                    /* scalar_load_cost.  */
 252   1,                                    /* scalar_store_cost.  */
 253   1,                                    /* vec_stmt_cost.  */
 254   1,                                    /* vec_to_scalar_cost.  */
 255   1,                                    /* scalar_to_vec_cost.  */
 256   1,                                    /* vec_align_load_cost.  */
 257   2,                                    /* vec_unalign_load_cost.  */
 258   1,                                    /* vec_store_cost.  */
 259   3,                                    /* cond_taken_branch_cost.  */
 260   1,                                    /* cond_not_taken_branch_cost.  */
 261 };
 262
 /* memcpy strategy table referenced by pentium_cost and lakemont_cost.  The
    first element names libcall as the leading algorithm, followed by a
    {256, rep_prefix_4_byte, false} entry and a {-1, libcall, false} entry;
    the second element is the DUMMY_STRINGOP_ALGS placeholder.
    NOTE(review): the entries look like {size bound, algorithm, flag} with
    -1 as the catch-all -- confirm against the stringop_algs declaration.  */
 263 static stringop_algs pentium_memcpy[2] = {
 264   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 265   DUMMY_STRINGOP_ALGS};
 /* memset strategy table referenced by pentium_cost and lakemont_cost.  The
    first element names libcall as the leading algorithm with a single
    {-1, rep_prefix_4_byte, false} entry; the second element is the
    DUMMY_STRINGOP_ALGS placeholder.  */
 266 static stringop_algs pentium_memset[2] = {
 267   {libcall, {{-1, rep_prefix_4_byte, false}}},
 268   DUMMY_STRINGOP_ALGS};
 269
 /* Cost table named for the Pentium.  Instruction costs are written with
    COSTS_N_INSNS.  The initializer is positional, so the entries must stay
    in the order of the fields of struct processor_costs (declared
    elsewhere); the trailing comment on each line names the field it fills.
    The l1 and l2 cache sizes are both 8, the prefetch parameters are 0, the
    SSE arithmetic costs repeat the x87 values, and the string-operation
    tables are pentium_memcpy and pentium_memset.  */
 270 static const
 271 struct processor_costs pentium_cost = {
 272   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 273   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 274   COSTS_N_INSNS (4),                    /* variable shift costs */
 275   COSTS_N_INSNS (1),                    /* constant shift costs */
 276   {COSTS_N_INSNS (11),                  /* cost of starting multiply for QI */
 277    COSTS_N_INSNS (11),                  /*                               HI */
 278    COSTS_N_INSNS (11),                  /*                               SI */
 279    COSTS_N_INSNS (11),                  /*                               DI */
 280    COSTS_N_INSNS (11)},                 /*                            other */
 281   0,                                    /* cost of multiply per each bit set */
 282   {COSTS_N_INSNS (25),                  /* cost of a divide/mod for QI */
 283    COSTS_N_INSNS (25),                  /*                          HI */
 284    COSTS_N_INSNS (25),                  /*                          SI */
 285    COSTS_N_INSNS (25),                  /*                          DI */
 286    COSTS_N_INSNS (25)},                 /*                          other */
 287   COSTS_N_INSNS (3),                    /* cost of movsx */
 288   COSTS_N_INSNS (2),                    /* cost of movzx */
 289   8,                                    /* "large" insn */
 290   6,                                    /* MOVE_RATIO */
 291   6,                                 /* cost for loading QImode using movzbl */
 292   {2, 4, 2},                            /* cost of loading integer registers
 293                                            in QImode, HImode and SImode.
 294                                            Relative to reg-reg move (2).  */
 295   {2, 4, 2},                            /* cost of storing integer registers */
 296   2,                                    /* cost of reg,reg fld/fst */
 297   {2, 2, 6},                            /* cost of loading fp registers
 298                                            in SFmode, DFmode and XFmode */
 299   {4, 4, 6},                            /* cost of storing fp registers
 300                                            in SFmode, DFmode and XFmode */
 301   8,                                    /* cost of moving MMX register */
 302   {8, 8},                               /* cost of loading MMX registers
 303                                            in SImode and DImode */
 304   {8, 8},                               /* cost of storing MMX registers
 305                                            in SImode and DImode */
 306   2,                                    /* cost of moving SSE register */
 307   {4, 8, 16},                           /* cost of loading SSE registers
 308                                            in SImode, DImode and TImode */
 309   {4, 8, 16},                           /* cost of storing SSE registers
 310                                            in SImode, DImode and TImode */
 311   3,                                    /* MMX or SSE register to integer */
 312   8,                                    /* size of l1 cache.  */
 313   8,                                    /* size of l2 cache  */
 314   0,                                    /* size of prefetch block */
 315   0,                                    /* number of parallel prefetches */
 316   2,                                    /* Branch cost */
 317   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
 318   COSTS_N_INSNS (3),                    /* cost of FMUL instruction.  */
 319   COSTS_N_INSNS (39),                   /* cost of FDIV instruction.  */
 320   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
 321   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
 322   COSTS_N_INSNS (70),                   /* cost of FSQRT instruction.  */
 323
 324   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 325   COSTS_N_INSNS (3),                    /* cost of MULSS instruction.  */
 326   COSTS_N_INSNS (3),                    /* cost of MULSD instruction.  */
 327   COSTS_N_INSNS (39),                   /* cost of DIVSS instruction.  */
 328   COSTS_N_INSNS (39),                   /* cost of DIVSD instruction.  */
 329   COSTS_N_INSNS (70),                   /* cost of SQRTSS instruction.  */
 330   COSTS_N_INSNS (70),                   /* cost of SQRTSD instruction.  */
 331   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 332   pentium_memcpy,
 333   pentium_memset,
 334   1,                                    /* scalar_stmt_cost.  */
 335   1,                                    /* scalar_load_cost.  */
 336   1,                                    /* scalar_store_cost.  */
 337   1,                                    /* vec_stmt_cost.  */
 338   1,                                    /* vec_to_scalar_cost.  */
 339   1,                                    /* scalar_to_vec_cost.  */
 340   1,                                    /* vec_align_load_cost.  */
 341   2,                                    /* vec_unalign_load_cost.  */
 342   1,                                    /* vec_store_cost.  */
 343   3,                                    /* cond_taken_branch_cost.  */
 344   1,                                    /* cond_not_taken_branch_cost.  */
 345 };
 346
 /* Cost table named for Lakemont.  Instruction costs are written with
    COSTS_N_INSNS.  The initializer is positional, so the entries must stay
    in the order of the fields of struct processor_costs (declared
    elsewhere); the trailing comment on each line names the field it fills.
    The lea cost is COSTS_N_INSNS (1) + 1, i.e. slightly above the add cost.
    The SSE arithmetic costs differ from the x87 ones, and the
    string-operation tables are the Pentium ones (pentium_memcpy and
    pentium_memset).  */
 347 static const
 348 struct processor_costs lakemont_cost = {
 349   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 350   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
 351   COSTS_N_INSNS (1),                    /* variable shift costs */
 352   COSTS_N_INSNS (1),                    /* constant shift costs */
 353   {COSTS_N_INSNS (11),                  /* cost of starting multiply for QI */
 354    COSTS_N_INSNS (11),                  /*                               HI */
 355    COSTS_N_INSNS (11),                  /*                               SI */
 356    COSTS_N_INSNS (11),                  /*                               DI */
 357    COSTS_N_INSNS (11)},                 /*                            other */
 358   0,                                    /* cost of multiply per each bit set */
 359   {COSTS_N_INSNS (25),                  /* cost of a divide/mod for QI */
 360    COSTS_N_INSNS (25),                  /*                          HI */
 361    COSTS_N_INSNS (25),                  /*                          SI */
 362    COSTS_N_INSNS (25),                  /*                          DI */
 363    COSTS_N_INSNS (25)},                 /*                          other */
 364   COSTS_N_INSNS (3),                    /* cost of movsx */
 365   COSTS_N_INSNS (2),                    /* cost of movzx */
 366   8,                                    /* "large" insn */
 367   17,                                   /* MOVE_RATIO */
 368   6,                                 /* cost for loading QImode using movzbl */
 369   {2, 4, 2},                            /* cost of loading integer registers
 370                                            in QImode, HImode and SImode.
 371                                            Relative to reg-reg move (2).  */
 372   {2, 4, 2},                            /* cost of storing integer registers */
 373   2,                                    /* cost of reg,reg fld/fst */
 374   {2, 2, 6},                            /* cost of loading fp registers
 375                                            in SFmode, DFmode and XFmode */
 376   {4, 4, 6},                            /* cost of storing fp registers
 377                                            in SFmode, DFmode and XFmode */
 378   8,                                    /* cost of moving MMX register */
 379   {8, 8},                               /* cost of loading MMX registers
 380                                            in SImode and DImode */
 381   {8, 8},                               /* cost of storing MMX registers
 382                                            in SImode and DImode */
 383   2,                                    /* cost of moving SSE register */
 384   {4, 8, 16},                           /* cost of loading SSE registers
 385                                            in SImode, DImode and TImode */
 386   {4, 8, 16},                           /* cost of storing SSE registers
 387                                            in SImode, DImode and TImode */
 388   3,                                    /* MMX or SSE register to integer */
 389   8,                                    /* size of l1 cache.  */
 390   8,                                    /* size of l2 cache  */
 391   0,                                    /* size of prefetch block */
 392   0,                                    /* number of parallel prefetches */
 393   2,                                    /* Branch cost */
 394   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
 395   COSTS_N_INSNS (3),                    /* cost of FMUL instruction.  */
 396   COSTS_N_INSNS (39),                   /* cost of FDIV instruction.  */
 397   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
 398   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
 399   COSTS_N_INSNS (70),                   /* cost of FSQRT instruction.  */
 400
 401   COSTS_N_INSNS (5),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 402   COSTS_N_INSNS (5),                    /* cost of MULSS instruction.  */
 403   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
 404   COSTS_N_INSNS (31),                   /* cost of DIVSS instruction.  */
 405   COSTS_N_INSNS (60),                   /* cost of DIVSD instruction.  */
 406   COSTS_N_INSNS (31),                   /* cost of SQRTSS instruction.  */
 407   COSTS_N_INSNS (63),                   /* cost of SQRTSD instruction.  */
 408   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 409   pentium_memcpy,
 410   pentium_memset,
 411   1,                                    /* scalar_stmt_cost.  */
 412   1,                                    /* scalar_load_cost.  */
 413   1,                                    /* scalar_store_cost.  */
 414   1,                                    /* vec_stmt_cost.  */
 415   1,                                    /* vec_to_scalar_cost.  */
 416   1,                                    /* scalar_to_vec_cost.  */
 417   1,                                    /* vec_align_load_cost.  */
 418   2,                                    /* vec_unalign_load_cost.  */
 419   1,                                    /* vec_store_cost.  */
 420   3,                                    /* cond_taken_branch_cost.  */
 421   1,                                    /* cond_not_taken_branch_cost.  */
 422 };
 423
 424 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
 425    (we ensure the alignment).  For small blocks an inline loop is still a
 426    noticeable win; for bigger blocks either rep movsl or rep movsb is the
 427    way to go.  Rep movsb apparently has a more expensive startup time in
 428    the CPU, but after 4K the difference is down in the noise.
        The memcpy table below lists, in order, loop up to 128,
        unrolled_loop up to 1024, rep_prefix_4_byte up to 8192 and
        rep_prefix_1_byte for the -1 entry; its second element is the
        DUMMY_STRINGOP_ALGS placeholder.
        NOTE(review): reading each entry as {size bound, algorithm, flag}
        with -1 as the catch-all is an assumption -- confirm against the
        stringop_algs declaration.  */
 429 static stringop_algs pentiumpro_memcpy[2] = {
 430   {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
 431                        {8192, rep_prefix_4_byte, false},
 432                        {-1, rep_prefix_1_byte, false}}},
 433   DUMMY_STRINGOP_ALGS};
 /* PentiumPro memset strategy table.  The first element names
    rep_prefix_4_byte as the leading algorithm, followed by entries for
    unrolled_loop at 1024, rep_prefix_4_byte at 8192 and libcall at -1; the
    second element is the DUMMY_STRINGOP_ALGS placeholder.
    NOTE(review): reading each entry as {size bound, algorithm, flag} with
    -1 as the catch-all is an assumption -- confirm against the
    stringop_algs declaration.  */
 434 static stringop_algs pentiumpro_memset[2] = {
 435   {rep_prefix_4_byte, {{1024, unrolled_loop, false},
 436                        {8192, rep_prefix_4_byte, false},
 437                        {-1, libcall, false}}},
 438   DUMMY_STRINGOP_ALGS};
 439 static const
 440 struct processor_costs pentiumpro_cost = {
     /* Cost table for PentiumPro tuning.  The initializers are positional,
        so their order has to follow the field order of struct
        processor_costs; the comment after each entry names what it sets.
        COSTS_N_INSNS values are relative to the cost of an add.  */
 441   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 442   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 443   COSTS_N_INSNS (1),                    /* variable shift costs */
 444   COSTS_N_INSNS (1),                    /* constant shift costs */
 445   {COSTS_N_INSNS (4),                   /* cost of starting multiply for QI */
 446    COSTS_N_INSNS (4),                   /*                               HI */
 447    COSTS_N_INSNS (4),                   /*                               SI */
 448    COSTS_N_INSNS (4),                   /*                               DI */
 449    COSTS_N_INSNS (4)},                  /*                            other */
 450   0,                                    /* cost of multiply per each bit set */
 451   {COSTS_N_INSNS (17),                  /* cost of a divide/mod for QI */
 452    COSTS_N_INSNS (17),                  /*                          HI */
 453    COSTS_N_INSNS (17),                  /*                          SI */
 454    COSTS_N_INSNS (17),                  /*                          DI */
 455    COSTS_N_INSNS (17)},                 /*                          other */
 456   COSTS_N_INSNS (1),                    /* cost of movsx */
 457   COSTS_N_INSNS (1),                    /* cost of movzx */
 458   8,                                    /* "large" insn */
 459   6,                                    /* MOVE_RATIO */
 460   2,                                 /* cost for loading QImode using movzbl */
 461   {4, 4, 4},                            /* cost of loading integer registers
 462                                            in QImode, HImode and SImode.
 463                                            Relative to reg-reg move (2).  */
 464   {2, 2, 2},                            /* cost of storing integer registers */
 465   2,                                    /* cost of reg,reg fld/fst */
 466   {2, 2, 6},                            /* cost of loading fp registers
 467                                            in SFmode, DFmode and XFmode */
 468   {4, 4, 6},                            /* cost of storing fp registers
 469                                            in SFmode, DFmode and XFmode */
 470   2,                                    /* cost of moving MMX register */
 471   {2, 2},                               /* cost of loading MMX registers
 472                                            in SImode and DImode */
 473   {2, 2},                               /* cost of storing MMX registers
 474                                            in SImode and DImode */
 475   2,                                    /* cost of moving SSE register */
 476   {2, 2, 8},                            /* cost of loading SSE registers
 477                                            in SImode, DImode and TImode */
 478   {2, 2, 8},                            /* cost of storing SSE registers
 479                                            in SImode, DImode and TImode */
 480   3,                                    /* MMX or SSE register to integer */
 481   8,                                    /* size of l1 cache.  */
 482   256,                                  /* size of l2 cache.  */
 483   32,                                   /* size of prefetch block */
 484   6,                                    /* number of parallel prefetches */
 485   2,                                    /* Branch cost */
 486   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
 487   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
 488   COSTS_N_INSNS (56),                   /* cost of FDIV instruction.  */
 489   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
 490   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
 491   COSTS_N_INSNS (56),                   /* cost of FSQRT instruction.  */
 492
 493   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 494   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
 495   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
 496   COSTS_N_INSNS (18),                   /* cost of DIVSS instruction.  */
 497   COSTS_N_INSNS (18),                   /* cost of DIVSD instruction.  */
 498   COSTS_N_INSNS (31),                   /* cost of SQRTSS instruction.  */
 499   COSTS_N_INSNS (31),                   /* cost of SQRTSD instruction.  */
 500   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 501   pentiumpro_memcpy,                    /* memcpy algorithm table */
 502   pentiumpro_memset,                    /* memset algorithm table */
 503   1,                                    /* scalar_stmt_cost.  */
 504   1,                                    /* scalar_load_cost.  */
 505   1,                                    /* scalar_store_cost.  */
 506   1,                                    /* vec_stmt_cost.  */
 507   1,                                    /* vec_to_scalar_cost.  */
 508   1,                                    /* scalar_to_vec_cost.  */
 509   1,                                    /* vec_align_load_cost.  */
 510   2,                                    /* vec_unalign_load_cost.  */
 511   1,                                    /* vec_store_cost.  */
 512   3,                                    /* cond_taken_branch_cost.  */
 513   1,                                    /* cond_not_taken_branch_cost.  */
 514 };
 515
 516 static stringop_algs geode_memcpy[2] = {
     /* Algorithm table for memcpy under Geode tuning; referenced from
        geode_cost.
        NOTE(review): the {256, rep_prefix_4_byte, false} triple presumably
        means "use rep_prefix_4_byte up to 256 bytes", and -1 "no upper
        bound" -- confirm against the stringop_algs declaration.  */
 517   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 518   DUMMY_STRINGOP_ALGS};                 /* placeholder entry: libcall only */
 519 static stringop_algs geode_memset[2] = {
     /* Algorithm table for memset under Geode tuning; referenced from
        geode_cost.  The entries are identical to those of geode_memcpy.  */
 520   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 521   DUMMY_STRINGOP_ALGS};                 /* placeholder entry: libcall only */
 522 static const
 523 struct processor_costs geode_cost = {
     /* Cost table for Geode tuning.  Entries are positional initializers of
        struct processor_costs, so their order must not change; each one is
        labelled by the comment that follows it.  COSTS_N_INSNS values are
        relative to the cost of an add.  */
 524   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 525   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 526   COSTS_N_INSNS (2),                    /* variable shift costs */
 527   COSTS_N_INSNS (1),                    /* constant shift costs */
 528   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
 529    COSTS_N_INSNS (4),                   /*                               HI */
 530    COSTS_N_INSNS (7),                   /*                               SI */
 531    COSTS_N_INSNS (7),                   /*                               DI */
 532    COSTS_N_INSNS (7)},                  /*                            other */
 533   0,                                    /* cost of multiply per each bit set */
 534   {COSTS_N_INSNS (15),                  /* cost of a divide/mod for QI */
 535    COSTS_N_INSNS (23),                  /*                          HI */
 536    COSTS_N_INSNS (39),                  /*                          SI */
 537    COSTS_N_INSNS (39),                  /*                          DI */
 538    COSTS_N_INSNS (39)},                 /*                          other */
 539   COSTS_N_INSNS (1),                    /* cost of movsx */
 540   COSTS_N_INSNS (1),                    /* cost of movzx */
 541   8,                                    /* "large" insn */
 542   4,                                    /* MOVE_RATIO */
 543   1,                                 /* cost for loading QImode using movzbl */
 544   {1, 1, 1},                            /* cost of loading integer registers
 545                                            in QImode, HImode and SImode.
 546                                            Relative to reg-reg move (2).  */
 547   {1, 1, 1},                            /* cost of storing integer registers */
 548   1,                                    /* cost of reg,reg fld/fst */
 549   {1, 1, 1},                            /* cost of loading fp registers
 550                                            in SFmode, DFmode and XFmode */
 551   {4, 6, 6},                            /* cost of storing fp registers
 552                                            in SFmode, DFmode and XFmode */
 553
 554   2,                                    /* cost of moving MMX register */
 555   {2, 2},                               /* cost of loading MMX registers
 556                                            in SImode and DImode */
 557   {2, 2},                               /* cost of storing MMX registers
 558                                            in SImode and DImode */
 559   2,                                    /* cost of moving SSE register */
 560   {2, 2, 8},                            /* cost of loading SSE registers
 561                                            in SImode, DImode and TImode */
 562   {2, 2, 8},                            /* cost of storing SSE registers
 563                                            in SImode, DImode and TImode */
 564   3,                                    /* MMX or SSE register to integer */
 565   64,                                   /* size of l1 cache.  */
 566   128,                                  /* size of l2 cache.  */
 567   32,                                   /* size of prefetch block */
 568   1,                                    /* number of parallel prefetches */
 569   1,                                    /* Branch cost */
 570   COSTS_N_INSNS (6),                    /* cost of FADD and FSUB insns.  */
 571   COSTS_N_INSNS (11),                   /* cost of FMUL instruction.  */
 572   COSTS_N_INSNS (47),                   /* cost of FDIV instruction.  */
 573   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
 574   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
 575   COSTS_N_INSNS (54),                   /* cost of FSQRT instruction.  */
 576
 577   COSTS_N_INSNS (6),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 578   COSTS_N_INSNS (11),                   /* cost of MULSS instruction.  */
 579   COSTS_N_INSNS (11),                   /* cost of MULSD instruction.  */
 580   COSTS_N_INSNS (47),                   /* cost of DIVSS instruction.  */
 581   COSTS_N_INSNS (47),                   /* cost of DIVSD instruction.  */
 582   COSTS_N_INSNS (54),                   /* cost of SQRTSS instruction.  */
 583   COSTS_N_INSNS (54),                   /* cost of SQRTSD instruction.  */
 584   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 585   geode_memcpy,                         /* memcpy algorithm table */
 586   geode_memset,                         /* memset algorithm table */
 587   1,                                    /* scalar_stmt_cost.  */
 588   1,                                    /* scalar_load_cost.  */
 589   1,                                    /* scalar_store_cost.  */
 590   1,                                    /* vec_stmt_cost.  */
 591   1,                                    /* vec_to_scalar_cost.  */
 592   1,                                    /* scalar_to_vec_cost.  */
 593   1,                                    /* vec_align_load_cost.  */
 594   2,                                    /* vec_unalign_load_cost.  */
 595   1,                                    /* vec_store_cost.  */
 596   3,                                    /* cond_taken_branch_cost.  */
 597   1,                                    /* cond_not_taken_branch_cost.  */
 598 };
 599
 600 static stringop_algs k6_memcpy[2] = {
     /* Algorithm table for memcpy under K6 tuning; referenced from k6_cost.
        NOTE(review): the {256, rep_prefix_4_byte, false} triple presumably
        means "use rep_prefix_4_byte up to 256 bytes", and -1 "no upper
        bound" -- confirm against the stringop_algs declaration.  */
 601   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 602   DUMMY_STRINGOP_ALGS};                 /* placeholder entry: libcall only */
 603 static stringop_algs k6_memset[2] = {
     /* Algorithm table for memset under K6 tuning; referenced from k6_cost.
        The entries are identical to those of k6_memcpy.  */
 604   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 605   DUMMY_STRINGOP_ALGS};                 /* placeholder entry: libcall only */
 606 static const
 607 struct processor_costs k6_cost = {
     /* Cost table for K6 tuning.  The values are matched to the fields of
        struct processor_costs purely by position; the comment after each
        value says which cost it describes.  COSTS_N_INSNS values are
        relative to the cost of an add.  */
 608   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 609   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
 610   COSTS_N_INSNS (1),                    /* variable shift costs */
 611   COSTS_N_INSNS (1),                    /* constant shift costs */
 612   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
 613    COSTS_N_INSNS (3),                   /*                               HI */
 614    COSTS_N_INSNS (3),                   /*                               SI */
 615    COSTS_N_INSNS (3),                   /*                               DI */
 616    COSTS_N_INSNS (3)},                  /*                            other */
 617   0,                                    /* cost of multiply per each bit set */
 618   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
 619    COSTS_N_INSNS (18),                  /*                          HI */
 620    COSTS_N_INSNS (18),                  /*                          SI */
 621    COSTS_N_INSNS (18),                  /*                          DI */
 622    COSTS_N_INSNS (18)},                 /*                          other */
 623   COSTS_N_INSNS (2),                    /* cost of movsx */
 624   COSTS_N_INSNS (2),                    /* cost of movzx */
 625   8,                                    /* "large" insn */
 626   4,                                    /* MOVE_RATIO */
 627   3,                                 /* cost for loading QImode using movzbl */
 628   {4, 5, 4},                            /* cost of loading integer registers
 629                                            in QImode, HImode and SImode.
 630                                            Relative to reg-reg move (2).  */
 631   {2, 3, 2},                            /* cost of storing integer registers */
 632   4,                                    /* cost of reg,reg fld/fst */
 633   {6, 6, 6},                            /* cost of loading fp registers
 634                                            in SFmode, DFmode and XFmode */
 635   {4, 4, 4},                            /* cost of storing fp registers
 636                                            in SFmode, DFmode and XFmode */
 637   2,                                    /* cost of moving MMX register */
 638   {2, 2},                               /* cost of loading MMX registers
 639                                            in SImode and DImode */
 640   {2, 2},                               /* cost of storing MMX registers
 641                                            in SImode and DImode */
 642   2,                                    /* cost of moving SSE register */
 643   {2, 2, 8},                            /* cost of loading SSE registers
 644                                            in SImode, DImode and TImode */
 645   {2, 2, 8},                            /* cost of storing SSE registers
 646                                            in SImode, DImode and TImode */
 647   6,                                    /* MMX or SSE register to integer */
 648   32,                                   /* size of l1 cache.  */
 649   32,                                   /* size of l2 cache.  Some models
 650                                            have integrated l2 cache, but
 651                                            optimizing for k6 is not important
 652                                            enough to worry about that.  */
 653   32,                                   /* size of prefetch block */
 654   1,                                    /* number of parallel prefetches */
 655   1,                                    /* Branch cost */
 656   COSTS_N_INSNS (2),                    /* cost of FADD and FSUB insns.  */
 657   COSTS_N_INSNS (2),                    /* cost of FMUL instruction.  */
 658   COSTS_N_INSNS (56),                   /* cost of FDIV instruction.  */
 659   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
 660   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
 661   COSTS_N_INSNS (56),                   /* cost of FSQRT instruction.  */
 662
 663   COSTS_N_INSNS (2),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 664   COSTS_N_INSNS (2),                    /* cost of MULSS instruction.  */
 665   COSTS_N_INSNS (2),                    /* cost of MULSD instruction.  */
 666   COSTS_N_INSNS (56),                   /* cost of DIVSS instruction.  */
 667   COSTS_N_INSNS (56),                   /* cost of DIVSD instruction.  */
 668   COSTS_N_INSNS (56),                   /* cost of SQRTSS instruction.  */
 669   COSTS_N_INSNS (56),                   /* cost of SQRTSD instruction.  */
 670   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 671   k6_memcpy,                            /* memcpy algorithm table */
 672   k6_memset,                            /* memset algorithm table */
 673   1,                                    /* scalar_stmt_cost.  */
 674   1,                                    /* scalar_load_cost.  */
 675   1,                                    /* scalar_store_cost.  */
 676   1,                                    /* vec_stmt_cost.  */
 677   1,                                    /* vec_to_scalar_cost.  */
 678   1,                                    /* scalar_to_vec_cost.  */
 679   1,                                    /* vec_align_load_cost.  */
 680   2,                                    /* vec_unalign_load_cost.  */
 681   1,                                    /* vec_store_cost.  */
 682   3,                                    /* cond_taken_branch_cost.  */
 683   1,                                    /* cond_not_taken_branch_cost.  */
 684 };
 685
 686 /* For some reason, Athlon deals better with the REP prefix (relative to
 687    loops) than K8 does.  Alignment becomes important after 8 bytes for memcpy
 688    and 128 bytes for memset.  */
 689 static stringop_algs athlon_memcpy[2] = {
     /* Algorithm table for memcpy under Athlon tuning; referenced from
        athlon_cost.
        NOTE(review): the {2048, rep_prefix_4_byte, false} triple presumably
        means "use rep_prefix_4_byte up to 2048 bytes", and -1 "no upper
        bound" -- confirm against the stringop_algs declaration.  */
 690   {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 691   DUMMY_STRINGOP_ALGS};                 /* placeholder entry: libcall only */
 692 static stringop_algs athlon_memset[2] = {
     /* Algorithm table for memset under Athlon tuning; referenced from
        athlon_cost.  The entries are identical to those of athlon_memcpy.  */
 693   {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 694   DUMMY_STRINGOP_ALGS};                 /* placeholder entry: libcall only */
 695 static const
 696 struct processor_costs athlon_cost = {
     /* Cost table for Athlon tuning.  Every value is a positional
        initializer for struct processor_costs, so the order is significant;
        the trailing comments identify the individual costs.  COSTS_N_INSNS
        values are relative to the cost of an add.  */
 697   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 698   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
 699   COSTS_N_INSNS (1),                    /* variable shift costs */
 700   COSTS_N_INSNS (1),                    /* constant shift costs */
 701   {COSTS_N_INSNS (5),                   /* cost of starting multiply for QI */
 702    COSTS_N_INSNS (5),                   /*                               HI */
 703    COSTS_N_INSNS (5),                   /*                               SI */
 704    COSTS_N_INSNS (5),                   /*                               DI */
 705    COSTS_N_INSNS (5)},                  /*                            other */
 706   0,                                    /* cost of multiply per each bit set */
 707   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
 708    COSTS_N_INSNS (26),                  /*                          HI */
 709    COSTS_N_INSNS (42),                  /*                          SI */
 710    COSTS_N_INSNS (74),                  /*                          DI */
 711    COSTS_N_INSNS (74)},                 /*                          other */
 712   COSTS_N_INSNS (1),                    /* cost of movsx */
 713   COSTS_N_INSNS (1),                    /* cost of movzx */
 714   8,                                    /* "large" insn */
 715   9,                                    /* MOVE_RATIO */
 716   4,                                 /* cost for loading QImode using movzbl */
 717   {3, 4, 3},                            /* cost of loading integer registers
 718                                            in QImode, HImode and SImode.
 719                                            Relative to reg-reg move (2).  */
 720   {3, 4, 3},                            /* cost of storing integer registers */
 721   4,                                    /* cost of reg,reg fld/fst */
 722   {4, 4, 12},                           /* cost of loading fp registers
 723                                            in SFmode, DFmode and XFmode */
 724   {6, 6, 8},                            /* cost of storing fp registers
 725                                            in SFmode, DFmode and XFmode */
 726   2,                                    /* cost of moving MMX register */
 727   {4, 4},                               /* cost of loading MMX registers
 728                                            in SImode and DImode */
 729   {4, 4},                               /* cost of storing MMX registers
 730                                            in SImode and DImode */
 731   2,                                    /* cost of moving SSE register */
 732   {4, 4, 6},                            /* cost of loading SSE registers
 733                                            in SImode, DImode and TImode */
 734   {4, 4, 5},                            /* cost of storing SSE registers
 735                                            in SImode, DImode and TImode */
 736   5,                                    /* MMX or SSE register to integer */
 737   64,                                   /* size of l1 cache.  */
 738   256,                                  /* size of l2 cache.  */
 739   64,                                   /* size of prefetch block */
 740   6,                                    /* number of parallel prefetches */
 741   5,                                    /* Branch cost */
 742   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
 743   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
 744   COSTS_N_INSNS (24),                   /* cost of FDIV instruction.  */
 745   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
 746   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
 747   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
 748
 749   COSTS_N_INSNS (4),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 750   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
 751   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
 752   /* 11-16 (NOTE(review): presumably the DIVSS latency range -- confirm).  */
 753   COSTS_N_INSNS (16),                   /* cost of DIVSS instruction.  */
 754   COSTS_N_INSNS (24),                   /* cost of DIVSD instruction.  */
 755   COSTS_N_INSNS (19),                   /* cost of SQRTSS instruction.  */
 756   COSTS_N_INSNS (19),                   /* cost of SQRTSD instruction.  */
 757   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 758   athlon_memcpy,                        /* memcpy algorithm table */
 759   athlon_memset,                        /* memset algorithm table */
 760   1,                                    /* scalar_stmt_cost.  */
 761   1,                                    /* scalar_load_cost.  */
 762   1,                                    /* scalar_store_cost.  */
 763   1,                                    /* vec_stmt_cost.  */
 764   1,                                    /* vec_to_scalar_cost.  */
 765   1,                                    /* scalar_to_vec_cost.  */
 766   1,                                    /* vec_align_load_cost.  */
 767   2,                                    /* vec_unalign_load_cost.  */
 768   1,                                    /* vec_store_cost.  */
 769   3,                                    /* cond_taken_branch_cost.  */
 770   1,                                    /* cond_not_taken_branch_cost.  */
 771 };
 772
 773 /* K8 has an optimized REP instruction for medium-sized blocks, but for very
 774    small blocks it is better to use a loop.  For large blocks, a libcall can
 775    use non-temporal accesses and beat inline code considerably.  */
 776 static stringop_algs k8_memcpy[2] = {
     /* Algorithm table for memcpy under K8 tuning; referenced from k8_cost.
        NOTE(review): the first entry ends in rep_prefix_4_byte while the
        second uses rep_prefix_8_byte, so the two slots presumably serve
        32-bit and 64-bit code respectively -- confirm against the users of
        stringop_algs.  */
 777   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
 778              {-1, rep_prefix_4_byte, false}}},
 779   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
 780              {-1, libcall, false}}}};
 781 static stringop_algs k8_memset[2] = {
     /* Algorithm table for memset under K8 tuning; referenced from k8_cost.
        NOTE(review): the first entry uses rep_prefix_4_byte and the second
        rep_prefix_8_byte, so the two slots presumably serve 32-bit and
        64-bit code respectively -- confirm against the users of
        stringop_algs.  */
 782   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
 783              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 784   {libcall, {{48, unrolled_loop, false},
 785              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
 786 static const
 787 struct processor_costs k8_cost = {
     /* Cost table for K8 tuning.  The initializers are positional and must
        stay in the field order of struct processor_costs; each is labelled
        by the comment that follows it.  COSTS_N_INSNS values are relative
        to the cost of an add.  */
 788   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 789   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
 790   COSTS_N_INSNS (1),                    /* variable shift costs */
 791   COSTS_N_INSNS (1),                    /* constant shift costs */
 792   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
 793    COSTS_N_INSNS (4),                   /*                               HI */
 794    COSTS_N_INSNS (3),                   /*                               SI */
 795    COSTS_N_INSNS (4),                   /*                               DI */
 796    COSTS_N_INSNS (5)},                  /*                            other */
 797   0,                                    /* cost of multiply per each bit set */
 798   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
 799    COSTS_N_INSNS (26),                  /*                          HI */
 800    COSTS_N_INSNS (42),                  /*                          SI */
 801    COSTS_N_INSNS (74),                  /*                          DI */
 802    COSTS_N_INSNS (74)},                 /*                          other */
 803   COSTS_N_INSNS (1),                    /* cost of movsx */
 804   COSTS_N_INSNS (1),                    /* cost of movzx */
 805   8,                                    /* "large" insn */
 806   9,                                    /* MOVE_RATIO */
 807   4,                                 /* cost for loading QImode using movzbl */
 808   {3, 4, 3},                            /* cost of loading integer registers
 809                                            in QImode, HImode and SImode.
 810                                            Relative to reg-reg move (2).  */
 811   {3, 4, 3},                            /* cost of storing integer registers */
 812   4,                                    /* cost of reg,reg fld/fst */
 813   {4, 4, 12},                           /* cost of loading fp registers
 814                                            in SFmode, DFmode and XFmode */
 815   {6, 6, 8},                            /* cost of storing fp registers
 816                                            in SFmode, DFmode and XFmode */
 817   2,                                    /* cost of moving MMX register */
 818   {3, 3},                               /* cost of loading MMX registers
 819                                            in SImode and DImode */
 820   {4, 4},                               /* cost of storing MMX registers
 821                                            in SImode and DImode */
 822   2,                                    /* cost of moving SSE register */
 823   {4, 3, 6},                            /* cost of loading SSE registers
 824                                            in SImode, DImode and TImode */
 825   {4, 4, 5},                            /* cost of storing SSE registers
 826                                            in SImode, DImode and TImode */
 827   5,                                    /* MMX or SSE register to integer */
 828   64,                                   /* size of l1 cache.  */
 829   512,                                  /* size of l2 cache.  */
 830   64,                                   /* size of prefetch block */
 831   /* New AMD processors never drop prefetches; if they cannot be performed
 832      immediately, they are queued.  We set number of simultaneous prefetches
 833      to a large constant to reflect this (it probably is not a good idea not
 834      to limit number of prefetches at all, as their execution also takes some
 835      time).  */
 836   100,                                  /* number of parallel prefetches */
 837   3,                                    /* Branch cost */
 838   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
 839   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
 840   COSTS_N_INSNS (19),                   /* cost of FDIV instruction.  */
 841   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
 842   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
 843   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
 844
 845   COSTS_N_INSNS (4),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 846   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
 847   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
 848   /* 11-16 (NOTE(review): presumably the DIVSS latency range -- confirm).  */
 849   COSTS_N_INSNS (16),                   /* cost of DIVSS instruction.  */
 850   COSTS_N_INSNS (20),                   /* cost of DIVSD instruction.  */
 851   COSTS_N_INSNS (19),                   /* cost of SQRTSS instruction.  */
 852   COSTS_N_INSNS (27),                   /* cost of SQRTSD instruction.  */
 853   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 854   k8_memcpy,                            /* memcpy algorithm table */
 855   k8_memset,                            /* memset algorithm table */
 856   4,                                    /* scalar_stmt_cost.  */
 857   2,                                    /* scalar_load_cost.  */
 858   2,                                    /* scalar_store_cost.  */
 859   5,                                    /* vec_stmt_cost.  */
 860   0,                                    /* vec_to_scalar_cost.  */
 861   2,                                    /* scalar_to_vec_cost.  */
 862   2,                                    /* vec_align_load_cost.  */
 863   3,                                    /* vec_unalign_load_cost.  */
 864   3,                                    /* vec_store_cost.  */
 865   3,                                    /* cond_taken_branch_cost.  */
 866   2,                                    /* cond_not_taken_branch_cost.  */
 867 };
 868
 869 /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
 870    very small blocks it is better to use a loop.  For large blocks, a libcall
 871    can use non-temporal accesses and beat inline code considerably.  */
 872 static stringop_algs amdfam10_memcpy[2] = {
     /* Algorithm table for memcpy under AMDFAM10 tuning.  The entries are
        identical to those of k8_memcpy.
        NOTE(review): the first entry ends in rep_prefix_4_byte while the
        second uses rep_prefix_8_byte, so the two slots presumably serve
        32-bit and 64-bit code respectively -- confirm against the users of
        stringop_algs.  */
 873   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
 874              {-1, rep_prefix_4_byte, false}}},
 875   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
 876              {-1, libcall, false}}}};
 877 static stringop_algs amdfam10_memset[2] = {
     /* Algorithm table for memset under AMDFAM10 tuning.  The entries are
        identical to those of k8_memset.
        NOTE(review): the first entry uses rep_prefix_4_byte and the second
        rep_prefix_8_byte, so the two slots presumably serve 32-bit and
        64-bit code respectively -- confirm against the users of
        stringop_algs.  */
 878   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
 879              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 880   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
 881              {-1, libcall, false}}}};
 882 struct processor_costs amdfam10_cost = {
 883   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 884   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
 885   COSTS_N_INSNS (1),                    /* variable shift costs */
 886   COSTS_N_INSNS (1),                    /* constant shift costs */
 887   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
 888    COSTS_N_INSNS (4),                   /*                               HI */
 889    COSTS_N_INSNS (3),                   /*                               SI */
 890    COSTS_N_INSNS (4),                   /*                               DI */
 891    COSTS_N_INSNS (5)},                  /*                            other */
 892   0,                                    /* cost of multiply per each bit set */
 893   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
 894    COSTS_N_INSNS (35),                  /*                          HI */
 895    COSTS_N_INSNS (51),                  /*                          SI */
 896    COSTS_N_INSNS (83),                  /*                          DI */
 897    COSTS_N_INSNS (83)},                 /*                          other */
 898   COSTS_N_INSNS (1),                    /* cost of movsx */
 899   COSTS_N_INSNS (1),                    /* cost of movzx */
 900   8,                                    /* "large" insn */
 901   9,                                    /* MOVE_RATIO */
 902   4,                                 /* cost for loading QImode using movzbl */
 903   {3, 4, 3},                            /* cost of loading integer registers
 904                                            in QImode, HImode and SImode.
 905                                            Relative to reg-reg move (2).  */
 906   {3, 4, 3},                            /* cost of storing integer registers */
 907   4,                                    /* cost of reg,reg fld/fst */
 908   {4, 4, 12},                           /* cost of loading fp registers
 909                                            in SFmode, DFmode and XFmode */
 910   {6, 6, 8},                            /* cost of storing fp registers
 911                                            in SFmode, DFmode and XFmode */
 912   2,                                    /* cost of moving MMX register */
 913   {3, 3},                               /* cost of loading MMX registers
 914                                            in SImode and DImode */
 915   {4, 4},                               /* cost of storing MMX registers
 916                                            in SImode and DImode */
 917   2,                                    /* cost of moving SSE register */
 918   {4, 4, 3},                            /* cost of loading SSE registers
 919                                            in SImode, DImode and TImode */
 920   {4, 4, 5},                            /* cost of storing SSE registers
 921                                            in SImode, DImode and TImode */
 922   3,                                    /* MMX or SSE register to integer */
 923                                         /* On K8:
 924                                             MOVD reg64, xmmreg Double FSTORE 4
 925                                             MOVD reg32, xmmreg Double FSTORE 4
 926                                            On AMDFAM10:
 927                                             MOVD reg64, xmmreg Double FADD 3
 928                                                                1/1  1/1
 929                                             MOVD reg32, xmmreg Double FADD 3
 930                                                                1/1  1/1 */
 931   64,                                   /* size of l1 cache.  */
 932   512,                                  /* size of l2 cache.  */
 933   64,                                   /* size of prefetch block */
 934   /* New AMD processors never drop prefetches; if they cannot be performed
 935      immediately, they are queued.  We set number of simultaneous prefetches
 936      to a large constant to reflect this (it probably is not a good idea not
 937      to limit number of prefetches at all, as their execution also takes some
 938      time).  */
 939   100,                                  /* number of parallel prefetches */
 940   2,                                    /* Branch cost */
 941   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
 942   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
 943   COSTS_N_INSNS (19),                   /* cost of FDIV instruction.  */
 944   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
 945   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
 946   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
 947
 948   COSTS_N_INSNS (4),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 949   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
 950   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
 951   /* 11-16  */
 952   COSTS_N_INSNS (16),                   /* cost of DIVSS instruction.  */
 953   COSTS_N_INSNS (20),                   /* cost of DIVSD instruction.  */
 954   COSTS_N_INSNS (19),                   /* cost of SQRTSS instruction.  */
 955   COSTS_N_INSNS (27),                   /* cost of SQRTSD instruction.  */
 956   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 957   amdfam10_memcpy,
 958   amdfam10_memset,
 959   4,                                    /* scalar_stmt_cost.  */
 960   2,                                    /* scalar load_cost.  */
 961   2,                                    /* scalar_store_cost.  */
 962   6,                                    /* vec_stmt_cost.  */
 963   0,                                    /* vec_to_scalar_cost.  */
 964   2,                                    /* scalar_to_vec_cost.  */
 965   2,                                    /* vec_align_load_cost.  */
 966   2,                                    /* vec_unalign_load_cost.  */
 967   2,                                    /* vec_store_cost.  */
 968   2,                                    /* cond_taken_branch_cost.  */
 969   1,                                    /* cond_not_taken_branch_cost.  */
 970 };
 971
 972 /*  BDVER1 has an optimized REP instruction for medium sized blocks, but for
 973     very small blocks it is better to use a loop.  For large blocks, libcall
 974     can do non-temporal accesses and beat inline considerably.  */
 975 static stringop_algs bdver1_memcpy[2] = {
       /* memcpy expansion strategies for BDVER1 tuning, two tables.
          NOTE(review): each table reads as {algorithm, {{limit, algorithm,
          flag}, ...}} -- presumably the leading algorithm is used when the
          block size is unknown and each entry picks an algorithm for blocks
          up to LIMIT bytes, -1 meaning no limit; confirm against the
          stringop_algs declaration, which is not visible here.  */
       /* Table [0]: loop up to 6, unrolled_loop up to 14, then
          rep_prefix_4_byte.  Presumably the 32-bit variant -- TODO confirm.  */
 976   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
 977              {-1, rep_prefix_4_byte, false}}},
       /* Table [1]: loop up to 16, rep_prefix_8_byte up to 8192, then
          libcall.  Presumably the 64-bit variant -- TODO confirm.  */
 978   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
 979              {-1, libcall, false}}}};
 980 static stringop_algs bdver1_memset[2] = {
       /* memset expansion strategies for BDVER1 tuning, two tables.
          NOTE(review): entries read as {limit, algorithm, flag}, presumably
          "use ALGORITHM for blocks up to LIMIT bytes" with -1 meaning no
          limit; confirm against the stringop_algs declaration, which is not
          visible here.  */
       /* Table [0]: loop up to 8, unrolled_loop up to 24, rep_prefix_4_byte
          up to 2048, then libcall.  Presumably the 32-bit variant -- TODO
          confirm.  */
 981   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
 982              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
       /* Table [1]: unrolled_loop up to 48, rep_prefix_8_byte up to 8192,
          then libcall.  Presumably the 64-bit variant -- TODO confirm.  */
 983   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
 984              {-1, libcall, false}}}};
 985
 986 const struct processor_costs bdver1_cost = {
       /* Cost table used when tuning for BDVER1.  The initializer is
          positional, so entries must stay in the member order of struct
          processor_costs (declared elsewhere; not visible in this section).
          COSTS_N_INSNS values are relative to an add instruction; the bare
          integers are raw values for the member named in the trailing
          comment.  */
 987   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 988   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 989   COSTS_N_INSNS (1),                    /* variable shift costs */
 990   COSTS_N_INSNS (1),                    /* constant shift costs */
 991   {COSTS_N_INSNS (4),                   /* cost of starting multiply for QI */
 992    COSTS_N_INSNS (4),                   /*                               HI */
 993    COSTS_N_INSNS (4),                   /*                               SI */
 994    COSTS_N_INSNS (6),                   /*                               DI */
 995    COSTS_N_INSNS (6)},                  /*                            other */
 996   0,                                    /* cost of multiply per each bit set */
 997   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
 998    COSTS_N_INSNS (35),                  /*                          HI */
 999    COSTS_N_INSNS (51),                  /*                          SI */
1000    COSTS_N_INSNS (83),                  /*                          DI */
1001    COSTS_N_INSNS (83)},                 /*                          other */
1002   COSTS_N_INSNS (1),                    /* cost of movsx */
1003   COSTS_N_INSNS (1),                    /* cost of movzx */
1004   8,                                    /* "large" insn */
1005   9,                                    /* MOVE_RATIO */
1006   4,                                 /* cost for loading QImode using movzbl */
1007   {5, 5, 4},                            /* cost of loading integer registers
1008                                            in QImode, HImode and SImode.
1009                                            Relative to reg-reg move (2).  */
1010   {4, 4, 4},                            /* cost of storing integer registers */
1011   2,                                    /* cost of reg,reg fld/fst */
1012   {5, 5, 12},                           /* cost of loading fp registers
1013                                            in SFmode, DFmode and XFmode */
1014   {4, 4, 8},                            /* cost of storing fp registers
1015                                            in SFmode, DFmode and XFmode */
1016   2,                                    /* cost of moving MMX register */
1017   {4, 4},                               /* cost of loading MMX registers
1018                                            in SImode and DImode */
1019   {4, 4},                               /* cost of storing MMX registers
1020                                            in SImode and DImode */
1021   2,                                    /* cost of moving SSE register */
1022   {4, 4, 4},                            /* cost of loading SSE registers
1023                                            in SImode, DImode and TImode */
1024   {4, 4, 4},                            /* cost of storing SSE registers
1025                                            in SImode, DImode and TImode */
1026   2,                                    /* MMX or SSE register to integer */
                                             /* NOTE(review): the note below quotes
                                                K8 and AMDFAM10 timings (4 and 3);
                                                the value used here is 2 -- confirm
                                                which figure applies to BDVER1.  */
1027                                         /* On K8:
1028                                             MOVD reg64, xmmreg Double FSTORE 4
1029                                             MOVD reg32, xmmreg Double FSTORE 4
1030                                            On AMDFAM10:
1031                                             MOVD reg64, xmmreg Double FADD 3
1032                                                                1/1  1/1
1033                                             MOVD reg32, xmmreg Double FADD 3
1034                                                                1/1  1/1 */
1035   16,                                   /* size of l1 cache.  */
1036   2048,                                 /* size of l2 cache.  */
1037   64,                                   /* size of prefetch block */
1038   /* New AMD processors never drop prefetches; if they cannot be performed
1039      immediately, they are queued.  We set number of simultaneous prefetches
1040      to a large constant to reflect this (it probably is not a good idea not
1041      to limit number of prefetches at all, as their execution also takes some
1042      time).  */
1043   100,                                  /* number of parallel prefetches */
1044   2,                                    /* Branch cost */
1045   COSTS_N_INSNS (6),                    /* cost of FADD and FSUB insns.  */
1046   COSTS_N_INSNS (6),                    /* cost of FMUL instruction.  */
1047   COSTS_N_INSNS (42),                   /* cost of FDIV instruction.  */
1048   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1049   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1050   COSTS_N_INSNS (52),                   /* cost of FSQRT instruction.  */
1051
1052   COSTS_N_INSNS (6),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1053   COSTS_N_INSNS (6),                    /* cost of MULSS instruction.  */
1054   COSTS_N_INSNS (6),                    /* cost of MULSD instruction.  */
1055   /* 9-24; the upper bound of the range is used.  */
1056   COSTS_N_INSNS (24),                   /* cost of DIVSS instruction.  */
1057   /* 9-27; the upper bound of the range is used.  */
1058   COSTS_N_INSNS (27),                   /* cost of DIVSD instruction.  */
1059   COSTS_N_INSNS (15),                   /* cost of SQRTSS instruction.  */
1060   COSTS_N_INSNS (26),                   /* cost of SQRTSD instruction.  */
1061   1, 2, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1062   bdver1_memcpy,
1063   bdver1_memset,
1064   6,                                    /* scalar_stmt_cost.  */
1065   4,                                    /* scalar_load_cost.  */
1066   4,                                    /* scalar_store_cost.  */
1067   6,                                    /* vec_stmt_cost.  */
1068   0,                                    /* vec_to_scalar_cost.  */
1069   2,                                    /* scalar_to_vec_cost.  */
1070   4,                                    /* vec_align_load_cost.  */
1071   4,                                    /* vec_unalign_load_cost.  */
1072   4,                                    /* vec_store_cost.  */
1073   4,                                    /* cond_taken_branch_cost.  */
1074   2,                                    /* cond_not_taken_branch_cost.  */
1075 };
1076
1077 /*  BDVER2 has an optimized REP instruction for medium sized blocks, but for
1078     very small blocks it is better to use a loop.  For large blocks, libcall
1079     can do non-temporal accesses and beat inline considerably.  */
1080
1081 static stringop_algs bdver2_memcpy[2] = {
       /* memcpy expansion strategies for BDVER2 tuning, two tables.
          NOTE(review): each table reads as {algorithm, {{limit, algorithm,
          flag}, ...}} -- presumably the leading algorithm is used when the
          block size is unknown and each entry picks an algorithm for blocks
          up to LIMIT bytes, -1 meaning no limit; confirm against the
          stringop_algs declaration, which is not visible here.  */
       /* Table [0]: loop up to 6, unrolled_loop up to 14, then
          rep_prefix_4_byte.  Presumably the 32-bit variant -- TODO confirm.  */
1082   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1083              {-1, rep_prefix_4_byte, false}}},
       /* Table [1]: loop up to 16, rep_prefix_8_byte up to 8192, then
          libcall.  Presumably the 64-bit variant -- TODO confirm.  */
1084   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1085              {-1, libcall, false}}}};
1086 static stringop_algs bdver2_memset[2] = {
       /* memset expansion strategies for BDVER2 tuning, two tables.
          NOTE(review): entries read as {limit, algorithm, flag}, presumably
          "use ALGORITHM for blocks up to LIMIT bytes" with -1 meaning no
          limit; confirm against the stringop_algs declaration, which is not
          visible here.  */
       /* Table [0]: loop up to 8, unrolled_loop up to 24, rep_prefix_4_byte
          up to 2048, then libcall.  Presumably the 32-bit variant -- TODO
          confirm.  */
1087   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1088              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
       /* Table [1]: unrolled_loop up to 48, rep_prefix_8_byte up to 8192,
          then libcall.  Presumably the 64-bit variant -- TODO confirm.  */
1089   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1090              {-1, libcall, false}}}};
1091
1092 const struct processor_costs bdver2_cost = {
       /* Cost table used when tuning for BDVER2.  The initializer is
          positional, so entries must stay in the member order of struct
          processor_costs (declared elsewhere; not visible in this section).
          COSTS_N_INSNS values are relative to an add instruction; the bare
          integers are raw values for the member named in the trailing
          comment.  */
1093   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1094   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
1095   COSTS_N_INSNS (1),                    /* variable shift costs */
1096   COSTS_N_INSNS (1),                    /* constant shift costs */
1097   {COSTS_N_INSNS (4),                   /* cost of starting multiply for QI */
1098    COSTS_N_INSNS (4),                   /*                               HI */
1099    COSTS_N_INSNS (4),                   /*                               SI */
1100    COSTS_N_INSNS (6),                   /*                               DI */
1101    COSTS_N_INSNS (6)},                  /*                            other */
1102   0,                                    /* cost of multiply per each bit set */
1103   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
1104    COSTS_N_INSNS (35),                  /*                          HI */
1105    COSTS_N_INSNS (51),                  /*                          SI */
1106    COSTS_N_INSNS (83),                  /*                          DI */
1107    COSTS_N_INSNS (83)},                 /*                          other */
1108   COSTS_N_INSNS (1),                    /* cost of movsx */
1109   COSTS_N_INSNS (1),                    /* cost of movzx */
1110   8,                                    /* "large" insn */
1111   9,                                    /* MOVE_RATIO */
1112   4,                                 /* cost for loading QImode using movzbl */
1113   {5, 5, 4},                            /* cost of loading integer registers
1114                                            in QImode, HImode and SImode.
1115                                            Relative to reg-reg move (2).  */
1116   {4, 4, 4},                            /* cost of storing integer registers */
1117   2,                                    /* cost of reg,reg fld/fst */
1118   {5, 5, 12},                           /* cost of loading fp registers
1119                                            in SFmode, DFmode and XFmode */
1120   {4, 4, 8},                            /* cost of storing fp registers
1121                                            in SFmode, DFmode and XFmode */
1122   2,                                    /* cost of moving MMX register */
1123   {4, 4},                               /* cost of loading MMX registers
1124                                            in SImode and DImode */
1125   {4, 4},                               /* cost of storing MMX registers
1126                                            in SImode and DImode */
1127   2,                                    /* cost of moving SSE register */
1128   {4, 4, 4},                            /* cost of loading SSE registers
1129                                            in SImode, DImode and TImode */
1130   {4, 4, 4},                            /* cost of storing SSE registers
1131                                            in SImode, DImode and TImode */
1132   2,                                    /* MMX or SSE register to integer */
                                             /* NOTE(review): the note below quotes
                                                K8 and AMDFAM10 timings (4 and 3);
                                                the value used here is 2 -- confirm
                                                which figure applies to BDVER2.  */
1133                                         /* On K8:
1134                                             MOVD reg64, xmmreg Double FSTORE 4
1135                                             MOVD reg32, xmmreg Double FSTORE 4
1136                                            On AMDFAM10:
1137                                             MOVD reg64, xmmreg Double FADD 3
1138                                                                1/1  1/1
1139                                             MOVD reg32, xmmreg Double FADD 3
1140                                                                1/1  1/1 */
1141   16,                                   /* size of l1 cache.  */
1142   2048,                                 /* size of l2 cache.  */
1143   64,                                   /* size of prefetch block */
1144   /* New AMD processors never drop prefetches; if they cannot be performed
1145      immediately, they are queued.  We set number of simultaneous prefetches
1146      to a large constant to reflect this (it probably is not a good idea not
1147      to limit number of prefetches at all, as their execution also takes some
1148      time).  */
1149   100,                                  /* number of parallel prefetches */
1150   2,                                    /* Branch cost */
1151   COSTS_N_INSNS (6),                    /* cost of FADD and FSUB insns.  */
1152   COSTS_N_INSNS (6),                    /* cost of FMUL instruction.  */
1153   COSTS_N_INSNS (42),                   /* cost of FDIV instruction.  */
1154   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1155   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1156   COSTS_N_INSNS (52),                   /* cost of FSQRT instruction.  */
1157
1158   COSTS_N_INSNS (6),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1159   COSTS_N_INSNS (6),                    /* cost of MULSS instruction.  */
1160   COSTS_N_INSNS (6),                    /* cost of MULSD instruction.  */
1161   /* 9-24; the upper bound of the range is used.  */
1162   COSTS_N_INSNS (24),                   /* cost of DIVSS instruction.  */
1163   /* 9-27; the upper bound of the range is used.  */
1164   COSTS_N_INSNS (27),                   /* cost of DIVSD instruction.  */
1165   COSTS_N_INSNS (15),                   /* cost of SQRTSS instruction.  */
1166   COSTS_N_INSNS (26),                   /* cost of SQRTSD instruction.  */
1167   1, 2, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1168   bdver2_memcpy,
1169   bdver2_memset,
1170   6,                                    /* scalar_stmt_cost.  */
1171   4,                                    /* scalar_load_cost.  */
1172   4,                                    /* scalar_store_cost.  */
1173   6,                                    /* vec_stmt_cost.  */
1174   0,                                    /* vec_to_scalar_cost.  */
1175   2,                                    /* scalar_to_vec_cost.  */
1176   4,                                    /* vec_align_load_cost.  */
1177   4,                                    /* vec_unalign_load_cost.  */
1178   4,                                    /* vec_store_cost.  */
1179   4,                                    /* cond_taken_branch_cost.  */
1180   2,                                    /* cond_not_taken_branch_cost.  */
1181 };
1182
1183
1184 /*  BDVER3 has an optimized REP instruction for medium sized blocks, but for
1185     very small blocks it is better to use a loop.  For large blocks, libcall
1186     can do non-temporal accesses and beat inline considerably.  */
1187 static stringop_algs bdver3_memcpy[2] = {
       /* memcpy expansion strategies for BDVER3 tuning, two tables.
          NOTE(review): each table reads as {algorithm, {{limit, algorithm,
          flag}, ...}} -- presumably the leading algorithm is used when the
          block size is unknown and each entry picks an algorithm for blocks
          up to LIMIT bytes, -1 meaning no limit; confirm against the
          stringop_algs declaration, which is not visible here.  */
       /* Table [0]: loop up to 6, unrolled_loop up to 14, then
          rep_prefix_4_byte.  Presumably the 32-bit variant -- TODO confirm.  */
1188   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1189              {-1, rep_prefix_4_byte, false}}},
       /* Table [1]: loop up to 16, rep_prefix_8_byte up to 8192, then
          libcall.  Presumably the 64-bit variant -- TODO confirm.  */
1190   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1191              {-1, libcall, false}}}};
1192 static stringop_algs bdver3_memset[2] = {
       /* memset expansion strategies for BDVER3 tuning, two tables.
          NOTE(review): entries read as {limit, algorithm, flag}, presumably
          "use ALGORITHM for blocks up to LIMIT bytes" with -1 meaning no
          limit; confirm against the stringop_algs declaration, which is not
          visible here.  */
       /* Table [0]: loop up to 8, unrolled_loop up to 24, rep_prefix_4_byte
          up to 2048, then libcall.  Presumably the 32-bit variant -- TODO
          confirm.  */
1193   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1194              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
       /* Table [1]: unrolled_loop up to 48, rep_prefix_8_byte up to 8192,
          then libcall.  Presumably the 64-bit variant -- TODO confirm.  */
1195   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1196              {-1, libcall, false}}}};
1197 const struct processor_costs bdver3_cost = {
       /* Cost table used when tuning for BDVER3.  It is read-only data and is
          const-qualified like bdver1_cost and bdver2_cost.  The initializer is
          positional, so entries must stay in the member order of struct
          processor_costs (declared elsewhere; not visible in this section).
          COSTS_N_INSNS values are relative to an add instruction; the bare
          integers are raw values for the member named in the trailing
          comment.  */
1198   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1199   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
1200   COSTS_N_INSNS (1),                    /* variable shift costs */
1201   COSTS_N_INSNS (1),                    /* constant shift costs */
1202   {COSTS_N_INSNS (4),                   /* cost of starting multiply for QI */
1203    COSTS_N_INSNS (4),                   /*                               HI */
1204    COSTS_N_INSNS (4),                   /*                               SI */
1205    COSTS_N_INSNS (6),                   /*                               DI */
1206    COSTS_N_INSNS (6)},                  /*                            other */
1207   0,                                    /* cost of multiply per each bit set */
1208   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
1209    COSTS_N_INSNS (35),                  /*                          HI */
1210    COSTS_N_INSNS (51),                  /*                          SI */
1211    COSTS_N_INSNS (83),                  /*                          DI */
1212    COSTS_N_INSNS (83)},                 /*                          other */
1213   COSTS_N_INSNS (1),                    /* cost of movsx */
1214   COSTS_N_INSNS (1),                    /* cost of movzx */
1215   8,                                    /* "large" insn */
1216   9,                                    /* MOVE_RATIO */
1217   4,                                 /* cost for loading QImode using movzbl */
1218   {5, 5, 4},                            /* cost of loading integer registers
1219                                            in QImode, HImode and SImode.
1220                                            Relative to reg-reg move (2).  */
1221   {4, 4, 4},                            /* cost of storing integer registers */
1222   2,                                    /* cost of reg,reg fld/fst */
1223   {5, 5, 12},                           /* cost of loading fp registers
1224                                            in SFmode, DFmode and XFmode */
1225   {4, 4, 8},                            /* cost of storing fp registers
1226                                            in SFmode, DFmode and XFmode */
1227   2,                                    /* cost of moving MMX register */
1228   {4, 4},                               /* cost of loading MMX registers
1229                                            in SImode and DImode */
1230   {4, 4},                               /* cost of storing MMX registers
1231                                            in SImode and DImode */
1232   2,                                    /* cost of moving SSE register */
1233   {4, 4, 4},                            /* cost of loading SSE registers
1234                                            in SImode, DImode and TImode */
1235   {4, 4, 4},                            /* cost of storing SSE registers
1236                                            in SImode, DImode and TImode */
1237   2,                                    /* MMX or SSE register to integer */
1238   16,                                   /* size of l1 cache.  */
1239   2048,                                 /* size of l2 cache.  */
1240   64,                                   /* size of prefetch block */
1241   /* New AMD processors never drop prefetches; if they cannot be performed
1242      immediately, they are queued.  We set number of simultaneous prefetches
1243      to a large constant to reflect this (it probably is not a good idea not
1244      to limit number of prefetches at all, as their execution also takes some
1245      time).  */
1246   100,                                  /* number of parallel prefetches */
1247   2,                                    /* Branch cost */
1248   COSTS_N_INSNS (6),                    /* cost of FADD and FSUB insns.  */
1249   COSTS_N_INSNS (6),                    /* cost of FMUL instruction.  */
1250   COSTS_N_INSNS (42),                   /* cost of FDIV instruction.  */
1251   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1252   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1253   COSTS_N_INSNS (52),                   /* cost of FSQRT instruction.  */
1254
1255   COSTS_N_INSNS (6),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1256   COSTS_N_INSNS (6),                    /* cost of MULSS instruction.  */
1257   COSTS_N_INSNS (6),                    /* cost of MULSD instruction.  */
1258   /* 9-24; the upper bound of the range is used.  */
1259   COSTS_N_INSNS (24),                   /* cost of DIVSS instruction.  */
1260   /* 9-27; the upper bound of the range is used.  */
1261   COSTS_N_INSNS (27),                   /* cost of DIVSD instruction.  */
1262   COSTS_N_INSNS (15),                   /* cost of SQRTSS instruction.  */
1263   COSTS_N_INSNS (26),                   /* cost of SQRTSD instruction.  */
1264   1, 2, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1265   bdver3_memcpy,
1266   bdver3_memset,
1267   6,                                    /* scalar_stmt_cost.  */
1268   4,                                    /* scalar_load_cost.  */
1269   4,                                    /* scalar_store_cost.  */
1270   6,                                    /* vec_stmt_cost.  */
1271   0,                                    /* vec_to_scalar_cost.  */
1272   2,                                    /* scalar_to_vec_cost.  */
1273   4,                                    /* vec_align_load_cost.  */
1274   4,                                    /* vec_unalign_load_cost.  */
1275   4,                                    /* vec_store_cost.  */
1276   4,                                    /* cond_taken_branch_cost.  */
1277   2,                                    /* cond_not_taken_branch_cost.  */
1278 };
1279
1280 /*  BDVER4 has an optimized REP instruction for medium sized blocks, but for
1281     very small blocks it is better to use a loop.  For large blocks, libcall
1282     can do non-temporal accesses and beat inline considerably.  */
1283 static stringop_algs bdver4_memcpy[2] = {
       /* memcpy expansion strategies for BDVER4 tuning, two tables.
          NOTE(review): each table reads as {algorithm, {{limit, algorithm,
          flag}, ...}} -- presumably the leading algorithm is used when the
          block size is unknown and each entry picks an algorithm for blocks
          up to LIMIT bytes, -1 meaning no limit; confirm against the
          stringop_algs declaration, which is not visible here.  */
       /* Table [0]: loop up to 6, unrolled_loop up to 14, then
          rep_prefix_4_byte.  Presumably the 32-bit variant -- TODO confirm.  */
1284   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1285              {-1, rep_prefix_4_byte, false}}},
       /* Table [1]: loop up to 16, rep_prefix_8_byte up to 8192, then
          libcall.  Presumably the 64-bit variant -- TODO confirm.  */
1286   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1287              {-1, libcall, false}}}};
1288 static stringop_algs bdver4_memset[2] = {
       /* memset expansion strategies for BDVER4 tuning, two tables.
          NOTE(review): entries read as {limit, algorithm, flag}, presumably
          "use ALGORITHM for blocks up to LIMIT bytes" with -1 meaning no
          limit; confirm against the stringop_algs declaration, which is not
          visible here.  */
       /* Table [0]: loop up to 8, unrolled_loop up to 24, rep_prefix_4_byte
          up to 2048, then libcall.  Presumably the 32-bit variant -- TODO
          confirm.  */
1289   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1290              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
       /* Table [1]: unrolled_loop up to 48, rep_prefix_8_byte up to 8192,
          then libcall.  Presumably the 64-bit variant -- TODO confirm.  */
1291   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1292              {-1, libcall, false}}}};
1293 const struct processor_costs bdver4_cost = {
       /* Cost table used when tuning for BDVER4.  It is read-only data and is
          const-qualified like bdver1_cost and bdver2_cost.  The initializer is
          positional, so entries must stay in the member order of struct
          processor_costs (declared elsewhere; not visible in this section).
          COSTS_N_INSNS values are relative to an add instruction; the bare
          integers are raw values for the member named in the trailing
          comment.  */
1294   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1295   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
1296   COSTS_N_INSNS (1),                    /* variable shift costs */
1297   COSTS_N_INSNS (1),                    /* constant shift costs */
1298   {COSTS_N_INSNS (4),                   /* cost of starting multiply for QI */
1299    COSTS_N_INSNS (4),                   /*                               HI */
1300    COSTS_N_INSNS (4),                   /*                               SI */
1301    COSTS_N_INSNS (6),                   /*                               DI */
1302    COSTS_N_INSNS (6)},                  /*                            other */
1303   0,                                    /* cost of multiply per each bit set */
1304   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
1305    COSTS_N_INSNS (35),                  /*                          HI */
1306    COSTS_N_INSNS (51),                  /*                          SI */
1307    COSTS_N_INSNS (83),                  /*                          DI */
1308    COSTS_N_INSNS (83)},                 /*                          other */
1309   COSTS_N_INSNS (1),                    /* cost of movsx */
1310   COSTS_N_INSNS (1),                    /* cost of movzx */
1311   8,                                    /* "large" insn */
1312   9,                                    /* MOVE_RATIO */
1313   4,                                 /* cost for loading QImode using movzbl */
1314   {5, 5, 4},                            /* cost of loading integer registers
1315                                            in QImode, HImode and SImode.
1316                                            Relative to reg-reg move (2).  */
1317   {4, 4, 4},                            /* cost of storing integer registers */
1318   2,                                    /* cost of reg,reg fld/fst */
1319   {5, 5, 12},                           /* cost of loading fp registers
1320                                            in SFmode, DFmode and XFmode */
1321   {4, 4, 8},                            /* cost of storing fp registers
1322                                            in SFmode, DFmode and XFmode */
1323   2,                                    /* cost of moving MMX register */
1324   {4, 4},                               /* cost of loading MMX registers
1325                                            in SImode and DImode */
1326   {4, 4},                               /* cost of storing MMX registers
1327                                            in SImode and DImode */
1328   2,                                    /* cost of moving SSE register */
1329   {4, 4, 4},                            /* cost of loading SSE registers
1330                                            in SImode, DImode and TImode */
1331   {4, 4, 4},                            /* cost of storing SSE registers
1332                                            in SImode, DImode and TImode */
1333   2,                                    /* MMX or SSE register to integer */
1334   16,                                   /* size of l1 cache.  */
1335   2048,                                 /* size of l2 cache.  */
1336   64,                                   /* size of prefetch block */
1337   /* New AMD processors never drop prefetches; if they cannot be performed
1338      immediately, they are queued.  We set number of simultaneous prefetches
1339      to a large constant to reflect this (it probably is not a good idea not
1340      to limit number of prefetches at all, as their execution also takes some
1341      time).  */
1342   100,                                  /* number of parallel prefetches */
1343   2,                                    /* Branch cost */
1344   COSTS_N_INSNS (6),                    /* cost of FADD and FSUB insns.  */
1345   COSTS_N_INSNS (6),                    /* cost of FMUL instruction.  */
1346   COSTS_N_INSNS (42),                   /* cost of FDIV instruction.  */
1347   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1348   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1349   COSTS_N_INSNS (52),                   /* cost of FSQRT instruction.  */
1350
1351   COSTS_N_INSNS (6),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1352   COSTS_N_INSNS (6),                    /* cost of MULSS instruction.  */
1353   COSTS_N_INSNS (6),                    /* cost of MULSD instruction.  */
1354   /* 9-24; the upper bound of the range is used.  */
1355   COSTS_N_INSNS (24),                   /* cost of DIVSS instruction.  */
1356   /* 9-27; the upper bound of the range is used.  */
1357   COSTS_N_INSNS (27),                   /* cost of DIVSD instruction.  */
1358   COSTS_N_INSNS (15),                   /* cost of SQRTSS instruction.  */
1359   COSTS_N_INSNS (26),                   /* cost of SQRTSD instruction.  */
1360   1, 2, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1361   bdver4_memcpy,
1362   bdver4_memset,
1363   6,                                    /* scalar_stmt_cost.  */
1364   4,                                    /* scalar_load_cost.  */
1365   4,                                    /* scalar_store_cost.  */
1366   6,                                    /* vec_stmt_cost.  */
1367   0,                                    /* vec_to_scalar_cost.  */
1368   2,                                    /* scalar_to_vec_cost.  */
1369   4,                                    /* vec_align_load_cost.  */
1370   4,                                    /* vec_unalign_load_cost.  */
1371   4,                                    /* vec_store_cost.  */
1372   4,                                    /* cond_taken_branch_cost.  */
1373   2,                                    /* cond_not_taken_branch_cost.  */
1374 };
1375
1376
1377 /*  ZNVER1 has an optimized REP instruction for medium sized blocks, but for
1378     very small blocks it is better to use a loop.  For large blocks, libcall
1379     can do non-temporal accesses and beat inline considerably.  */
1380 static stringop_algs znver1_memcpy[2] = {
       /* memcpy expansion strategies for ZNVER1 tuning, two tables.
          NOTE(review): each table reads as {algorithm, {{limit, algorithm,
          flag}, ...}} -- presumably the leading algorithm is used when the
          block size is unknown and each entry picks an algorithm for blocks
          up to LIMIT bytes, -1 meaning no limit; confirm against the
          stringop_algs declaration, which is not visible here.  */
       /* Table [0]: loop up to 6, unrolled_loop up to 14, then
          rep_prefix_4_byte.  Presumably the 32-bit variant -- TODO confirm.  */
1381   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1382              {-1, rep_prefix_4_byte, false}}},
       /* Table [1]: loop up to 16, rep_prefix_8_byte up to 8192, then
          libcall.  Presumably the 64-bit variant -- TODO confirm.  */
1383   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1384              {-1, libcall, false}}}};
1385 static stringop_algs znver1_memset[2] = {
     /* memset expansion strategies referenced by znver1_cost.
        NOTE(review): each {N, alg, flag} triple presumably selects ALG for
        blocks of up to N bytes, with -1 meaning "no upper bound" -- confirm
        against the declaration of stringop_algs.  */
     /* Element [0]: 4-byte REP prefix up to 2048, libcall beyond that
        (presumably the 32-bit variant -- confirm).  */
1386   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1387              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
     /* Element [1]: 8-byte REP prefix up to 8192, libcall beyond that
        (presumably the 64-bit variant -- confirm).  */
1388   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1389              {-1, libcall, false}}}};
1390 struct processor_costs znver1_cost = {
     /* Cost table for ZNVER1.  This is a positional initializer, so the
        entries below must stay in step with the field order of
        struct processor_costs, which is declared elsewhere.
        NOTE(review): unlike btver1_cost and btver2_cost this object is not
        declared const -- confirm that this is intentional.  */
1391   COSTS_N_INSNS (1),                    /* cost of an add instruction.  */
1392   COSTS_N_INSNS (1),                    /* cost of a lea instruction.  */
1393   COSTS_N_INSNS (1),                    /* variable shift costs.  */
1394   COSTS_N_INSNS (1),                    /* constant shift costs.  */
1395   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI.  */
1396    COSTS_N_INSNS (3),                   /*                               HI.  */
1397    COSTS_N_INSNS (3),                   /*                               SI.  */
1398    COSTS_N_INSNS (3),                   /*                               DI.  */
1399    COSTS_N_INSNS (3)},                  /*                            other.  */
1400   0,                                    /* cost of multiply per each bit
1401                                             set.  */
1402    /* Depending on parameters, idiv can get faster on ryzen.  This is upper
1403       bound.  */
1404   {COSTS_N_INSNS (16),                  /* cost of a divide/mod for QI.  */
1405    COSTS_N_INSNS (22),                  /*                          HI.  */
1406    COSTS_N_INSNS (30),                  /*                          SI.  */
1407    COSTS_N_INSNS (45),                  /*                          DI.  */
1408    COSTS_N_INSNS (45)},                 /*                          other.  */
1409   COSTS_N_INSNS (1),                    /* cost of movsx.  */
1410   COSTS_N_INSNS (1),                    /* cost of movzx.  */
1411   8,                                    /* "large" insn.  */
1412   9,                                    /* MOVE_RATIO.  */
1413   4,                                    /* cost for loading QImode using
1414                                            movzbl.  */
1415   {5, 5, 4},                            /* cost of loading integer registers
1416                                            in QImode, HImode and SImode.
1417                                            Relative to reg-reg move (2).  */
1418   {4, 4, 4},                            /* cost of storing integer
1419                                            registers.  */
1420   2,                                    /* cost of reg,reg fld/fst.  */
1421   {5, 5, 12},                           /* cost of loading fp registers
1422                                            in SFmode, DFmode and XFmode.  */
1423   {4, 4, 8},                            /* cost of storing fp registers
1424                                            in SFmode, DFmode and XFmode.  */
1425   2,                                    /* cost of moving MMX register.  */
1426   {4, 4},                               /* cost of loading MMX registers
1427                                            in SImode and DImode.  */
1428   {4, 4},                               /* cost of storing MMX registers
1429                                            in SImode and DImode.  */
1430   2,                                    /* cost of moving SSE register.  */
1431   {4, 4, 4},                            /* cost of loading SSE registers
1432                                            in SImode, DImode and TImode.  */
1433   {4, 4, 4},                            /* cost of storing SSE registers
1434                                            in SImode, DImode and TImode.  */
1435   2,                                    /* MMX or SSE register to integer.  */
1436   32,                                   /* size of l1 cache.  */
1437   512,                                  /* size of l2 cache.  */
1438   64,                                   /* size of prefetch block.  */
1439   /* New AMD processors never drop prefetches; if they cannot be performed
1440      immediately, they are queued.  We set number of simultaneous prefetches
1441      to a large constant to reflect this (it probably is not a good idea not
1442      to limit number of prefetches at all, as their execution also takes some
1443      time).  */
1444   100,                                  /* number of parallel prefetches.  */
1445   3,                                    /* Branch cost.  */
1446   COSTS_N_INSNS (5),                    /* cost of FADD and FSUB insns.  */
1447   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
1448   /* Latency of fdiv is 8-15.  */
1449   COSTS_N_INSNS (15),                   /* cost of FDIV instruction.  */
1450   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
1451   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
1452   /* Latency of fsqrt is 4-10.  */
1453   COSTS_N_INSNS (10),                   /* cost of FSQRT instruction.  */
1454
1455   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1456   COSTS_N_INSNS (3),                    /* cost of MULSS instruction.  */
1457   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
1458   COSTS_N_INSNS (10),                   /* cost of DIVSS instruction.  */
1459   /* 9-13 (presumably the latency range of divsd; the upper end is used).  */
1460   COSTS_N_INSNS (13),                   /* cost of DIVSD instruction.  */
1461   COSTS_N_INSNS (10),                   /* cost of SQRTSS instruction.  */
1462   COSTS_N_INSNS (15),                   /* cost of SQRTSD instruction.  */
1463   /* Zen can execute 4 integer operations per cycle. FP operations take 3 cycles
1464      and it can execute 2 integer additions and 2 multiplications thus
1465      reassociation may make sense up to width of 6.  SPEC2k6 benchmarks suggest
1466      that 4 works better than 6 probably due to register pressure.
1467
1468      Integer vector operations are taken by FP unit and execute 3 vector
1469      plus/minus operations per cycle but only one multiply.  This is adjusted
1470      in ix86_reassociation_width.  */
1471   4, 4, 3, 6,                           /* reassoc int, fp, vec_int, vec_fp.  */
1472   znver1_memcpy,
1473   znver1_memset,
1474   6,                                    /* scalar_stmt_cost.  */
1475   4,                                    /* scalar_load_cost.  */
1476   4,                                    /* scalar_store_cost.  */
1477   6,                                    /* vec_stmt_cost.  */
1478   0,                                    /* vec_to_scalar_cost.  */
1479   2,                                    /* scalar_to_vec_cost.  */
1480   4,                                    /* vec_align_load_cost.  */
1481   4,                                    /* vec_unalign_load_cost.  */
1482   4,                                    /* vec_store_cost.  */
1483   4,                                    /* cond_taken_branch_cost.  */
1484   2,                                    /* cond_not_taken_branch_cost.  */
1485 };
1486
1487   /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1488      very small blocks it is better to use a loop.  For large blocks, a libcall
1489      can do non-temporal accesses and beat inline code considerably.  */
1490 static stringop_algs btver1_memcpy[2] = {
     /* memcpy expansion strategies referenced by btver1_cost.
        NOTE(review): each {N, alg, flag} triple presumably selects ALG for
        blocks of up to N bytes, with -1 meaning "no upper bound" -- confirm
        against the declaration of stringop_algs.  */
     /* Element [0]: ends with the 4-byte REP prefix (presumably the 32-bit
        variant -- confirm).  */
1491   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1492              {-1, rep_prefix_4_byte, false}}},
     /* Element [1]: uses the 8-byte REP prefix up to 8192 and a libcall
        beyond that (presumably the 64-bit variant -- confirm).  */
1493   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1494              {-1, libcall, false}}}};
1495 static stringop_algs btver1_memset[2] = {
     /* memset expansion strategies referenced by btver1_cost.
        NOTE(review): each {N, alg, flag} triple presumably selects ALG for
        blocks of up to N bytes, with -1 meaning "no upper bound" -- confirm
        against the declaration of stringop_algs.  */
     /* Element [0]: 4-byte REP prefix up to 2048, libcall beyond that
        (presumably the 32-bit variant -- confirm).  */
1496   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1497              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
     /* Element [1]: 8-byte REP prefix up to 8192, libcall beyond that
        (presumably the 64-bit variant -- confirm).  */
1498   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1499              {-1, libcall, false}}}};
1500 const struct processor_costs btver1_cost = {
     /* Cost table for BTVER1.  This is a positional initializer, so the
        entries below must stay in step with the field order of
        struct processor_costs, which is declared elsewhere.  */
1501   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1502   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
1503   COSTS_N_INSNS (1),                    /* variable shift costs */
1504   COSTS_N_INSNS (1),                    /* constant shift costs */
1505   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
1506    COSTS_N_INSNS (4),                   /*                               HI */
1507    COSTS_N_INSNS (3),                   /*                               SI */
1508    COSTS_N_INSNS (4),                   /*                               DI */
1509    COSTS_N_INSNS (5)},                  /*                            other */
1510   0,                                    /* cost of multiply per each bit set */
1511   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
1512    COSTS_N_INSNS (35),                  /*                          HI */
1513    COSTS_N_INSNS (51),                  /*                          SI */
1514    COSTS_N_INSNS (83),                  /*                          DI */
1515    COSTS_N_INSNS (83)},                 /*                          other */
1516   COSTS_N_INSNS (1),                    /* cost of movsx */
1517   COSTS_N_INSNS (1),                    /* cost of movzx */
1518   8,                                    /* "large" insn */
1519   9,                                    /* MOVE_RATIO */
1520   4,                                 /* cost for loading QImode using movzbl */
1521   {3, 4, 3},                            /* cost of loading integer registers
1522                                            in QImode, HImode and SImode.
1523                                            Relative to reg-reg move (2).  */
1524   {3, 4, 3},                            /* cost of storing integer registers */
1525   4,                                    /* cost of reg,reg fld/fst */
1526   {4, 4, 12},                           /* cost of loading fp registers
1527                                            in SFmode, DFmode and XFmode */
1528   {6, 6, 8},                            /* cost of storing fp registers
1529                                            in SFmode, DFmode and XFmode */
1530   2,                                    /* cost of moving MMX register */
1531   {3, 3},                               /* cost of loading MMX registers
1532                                            in SImode and DImode */
1533   {4, 4},                               /* cost of storing MMX registers
1534                                            in SImode and DImode */
1535   2,                                    /* cost of moving SSE register */
1536   {4, 4, 3},                            /* cost of loading SSE registers
1537                                            in SImode, DImode and TImode */
1538   {4, 4, 5},                            /* cost of storing SSE registers
1539                                            in SImode, DImode and TImode */
1540   3,                                    /* MMX or SSE register to integer */
                                             /* NOTE(review): the timings quoted below
                                                are for K8 and AMDFAM10, not BTVER1 --
                                                presumably carried over from an older
                                                table; confirm they still apply.  */
1541                                         /* On K8:
1542                                            MOVD reg64, xmmreg Double FSTORE 4
1543                                            MOVD reg32, xmmreg Double FSTORE 4
1544                                            On AMDFAM10:
1545                                            MOVD reg64, xmmreg Double FADD 3
1546                                                                1/1  1/1
1547                                             MOVD reg32, xmmreg Double FADD 3
1548                                                                1/1  1/1 */
1549   32,                                   /* size of l1 cache.  */
1550   512,                                  /* size of l2 cache.  */
1551   64,                                   /* size of prefetch block */
1552   100,                                  /* number of parallel prefetches */
1553   2,                                    /* Branch cost */
1554   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
1555   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
1556   COSTS_N_INSNS (19),                   /* cost of FDIV instruction.  */
1557   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1558   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1559   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
1560
1561   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1562   COSTS_N_INSNS (2),                    /* cost of MULSS instruction.  */
1563   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
1564   COSTS_N_INSNS (13),                   /* cost of DIVSS instruction.  */
1565   COSTS_N_INSNS (17),                   /* cost of DIVSD instruction.  */
1566   COSTS_N_INSNS (14),                   /* cost of SQRTSS instruction.  */
1567   COSTS_N_INSNS (48),                   /* cost of SQRTSD instruction.  */
1568   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1569   btver1_memcpy,
1570   btver1_memset,
1571   4,                                    /* scalar_stmt_cost.  */
1572   2,                                    /* scalar_load_cost.  */
1573   2,                                    /* scalar_store_cost.  */
1574   6,                                    /* vec_stmt_cost.  */
1575   0,                                    /* vec_to_scalar_cost.  */
1576   2,                                    /* scalar_to_vec_cost.  */
1577   2,                                    /* vec_align_load_cost.  */
1578   2,                                    /* vec_unalign_load_cost.  */
1579   2,                                    /* vec_store_cost.  */
1580   2,                                    /* cond_taken_branch_cost.  */
1581   1,                                    /* cond_not_taken_branch_cost.  */
1582 };
1583
1584 static stringop_algs btver2_memcpy[2] = {
     /* memcpy expansion strategies referenced by btver2_cost; the values are
        the same as those in btver1_memcpy.
        NOTE(review): each {N, alg, flag} triple presumably selects ALG for
        blocks of up to N bytes, with -1 meaning "no upper bound" -- confirm
        against the declaration of stringop_algs.  */
     /* Element [0]: ends with the 4-byte REP prefix (presumably the 32-bit
        variant -- confirm).  */
1585   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1586              {-1, rep_prefix_4_byte, false}}},
     /* Element [1]: uses the 8-byte REP prefix up to 8192 and a libcall
        beyond that (presumably the 64-bit variant -- confirm).  */
1587   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1588              {-1, libcall, false}}}};
1589 static stringop_algs btver2_memset[2] = {
     /* memset expansion strategies referenced by btver2_cost; the values are
        the same as those in btver1_memset.
        NOTE(review): each {N, alg, flag} triple presumably selects ALG for
        blocks of up to N bytes, with -1 meaning "no upper bound" -- confirm
        against the declaration of stringop_algs.  */
     /* Element [0]: 4-byte REP prefix up to 2048, libcall beyond that
        (presumably the 32-bit variant -- confirm).  */
1590   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1591              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
     /* Element [1]: 8-byte REP prefix up to 8192, libcall beyond that
        (presumably the 64-bit variant -- confirm).  */
1592   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1593              {-1, libcall, false}}}};
1594 const struct processor_costs btver2_cost = {
     /* Cost table for BTVER2.  This is a positional initializer, so the
        entries below must stay in step with the field order of
        struct processor_costs, which is declared elsewhere.  */
1595   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1596   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
1597   COSTS_N_INSNS (1),                    /* variable shift costs */
1598   COSTS_N_INSNS (1),                    /* constant shift costs */
1599   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
1600    COSTS_N_INSNS (4),                   /*                               HI */
1601    COSTS_N_INSNS (3),                   /*                               SI */
1602    COSTS_N_INSNS (4),                   /*                               DI */
1603    COSTS_N_INSNS (5)},                  /*                            other */
1604   0,                                    /* cost of multiply per each bit set */
1605   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
1606    COSTS_N_INSNS (35),                  /*                          HI */
1607    COSTS_N_INSNS (51),                  /*                          SI */
1608    COSTS_N_INSNS (83),                  /*                          DI */
1609    COSTS_N_INSNS (83)},                 /*                          other */
1610   COSTS_N_INSNS (1),                    /* cost of movsx */
1611   COSTS_N_INSNS (1),                    /* cost of movzx */
1612   8,                                    /* "large" insn */
1613   9,                                    /* MOVE_RATIO */
1614   4,                                 /* cost for loading QImode using movzbl */
1615   {3, 4, 3},                            /* cost of loading integer registers
1616                                            in QImode, HImode and SImode.
1617                                            Relative to reg-reg move (2).  */
1618   {3, 4, 3},                            /* cost of storing integer registers */
1619   4,                                    /* cost of reg,reg fld/fst */
1620   {4, 4, 12},                           /* cost of loading fp registers
1621                                            in SFmode, DFmode and XFmode */
1622   {6, 6, 8},                            /* cost of storing fp registers
1623                                            in SFmode, DFmode and XFmode */
1624   2,                                    /* cost of moving MMX register */
1625   {3, 3},                               /* cost of loading MMX registers
1626                                            in SImode and DImode */
1627   {4, 4},                               /* cost of storing MMX registers
1628                                            in SImode and DImode */
1629   2,                                    /* cost of moving SSE register */
1630   {4, 4, 3},                            /* cost of loading SSE registers
1631                                            in SImode, DImode and TImode */
1632   {4, 4, 5},                            /* cost of storing SSE registers
1633                                            in SImode, DImode and TImode */
1634   3,                                    /* MMX or SSE register to integer */
                                             /* NOTE(review): the timings quoted below
                                                are for K8 and AMDFAM10, not BTVER2 --
                                                presumably carried over from an older
                                                table; confirm they still apply.  */
1635                                         /* On K8:
1636                                            MOVD reg64, xmmreg Double FSTORE 4
1637                                            MOVD reg32, xmmreg Double FSTORE 4
1638                                            On AMDFAM10:
1639                                            MOVD reg64, xmmreg Double FADD 3
1640                                                                1/1  1/1
1641                                             MOVD reg32, xmmreg Double FADD 3
1642                                                                1/1  1/1 */
1643   32,                                   /* size of l1 cache.  */
1644   2048,                                 /* size of l2 cache.  */
1645   64,                                   /* size of prefetch block */
1646   100,                                  /* number of parallel prefetches */
1647   2,                                    /* Branch cost */
1648   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
1649   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
1650   COSTS_N_INSNS (19),                   /* cost of FDIV instruction.  */
1651   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1652   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1653   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
1654
1655   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1656   COSTS_N_INSNS (2),                    /* cost of MULSS instruction.  */
1657   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
1658   COSTS_N_INSNS (13),                   /* cost of DIVSS instruction.  */
1659   COSTS_N_INSNS (19),                   /* cost of DIVSD instruction.  */
1660   COSTS_N_INSNS (16),                   /* cost of SQRTSS instruction.  */
1661   COSTS_N_INSNS (21),                   /* cost of SQRTSD instruction.  */
1662   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1663   btver2_memcpy,
1664   btver2_memset,
1665   4,                                    /* scalar_stmt_cost.  */
1666   2,                                    /* scalar_load_cost.  */
1667   2,                                    /* scalar_store_cost.  */
1668   6,                                    /* vec_stmt_cost.  */
1669   0,                                    /* vec_to_scalar_cost.  */
1670   2,                                    /* scalar_to_vec_cost.  */
1671   2,                                    /* vec_align_load_cost.  */
1672   2,                                    /* vec_unalign_load_cost.  */
1673   2,                                    /* vec_store_cost.  */
1674   2,                                    /* cond_taken_branch_cost.  */
1675   1,                                    /* cond_not_taken_branch_cost.  */
1676 };
1677
1678 static stringop_algs pentium4_memcpy[2] = {
     /* memcpy expansion strategies referenced by pentium4_cost.
        NOTE(review): each {N, alg, flag} triple presumably selects ALG for
        blocks of up to N bytes, with -1 meaning "no upper bound" -- confirm
        against the declaration of stringop_algs.  */
     /* Element [0]: byte loop up to 12, then the 4-byte REP prefix.  */
1679   {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
     /* Element [1]: DUMMY_STRINGOP_ALGS, which the top of this file defines
        as an unconditional libcall.  */
1680   DUMMY_STRINGOP_ALGS};
1681 static stringop_algs pentium4_memset[2] = {
     /* memset expansion strategies referenced by pentium4_cost.
        NOTE(review): each {N, alg, flag} triple presumably selects ALG for
        blocks of up to N bytes, with -1 meaning "no upper bound" -- confirm
        against the declaration of stringop_algs.  */
     /* Element [0]: byte loop, then loop, then the 4-byte REP prefix up to
        20480, and a libcall beyond that.  */
1682   {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1683              {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
     /* Element [1]: DUMMY_STRINGOP_ALGS, which the top of this file defines
        as an unconditional libcall.  */
1684   DUMMY_STRINGOP_ALGS};
1685
1686 static const
1687 struct processor_costs pentium4_cost = {
     /* Cost table for Pentium 4.  This is a positional initializer, so the
        entries below must stay in step with the field order of
        struct processor_costs, which is declared elsewhere.  */
1688   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1689   COSTS_N_INSNS (3),                    /* cost of a lea instruction */
1690   COSTS_N_INSNS (4),                    /* variable shift costs */
1691   COSTS_N_INSNS (4),                    /* constant shift costs */
1692   {COSTS_N_INSNS (15),                  /* cost of starting multiply for QI */
1693    COSTS_N_INSNS (15),                  /*                               HI */
1694    COSTS_N_INSNS (15),                  /*                               SI */
1695    COSTS_N_INSNS (15),                  /*                               DI */
1696    COSTS_N_INSNS (15)},                 /*                            other */
1697   0,                                    /* cost of multiply per each bit set */
1698   {COSTS_N_INSNS (56),                  /* cost of a divide/mod for QI */
1699    COSTS_N_INSNS (56),                  /*                          HI */
1700    COSTS_N_INSNS (56),                  /*                          SI */
1701    COSTS_N_INSNS (56),                  /*                          DI */
1702    COSTS_N_INSNS (56)},                 /*                          other */
1703   COSTS_N_INSNS (1),                    /* cost of movsx */
1704   COSTS_N_INSNS (1),                    /* cost of movzx */
1705   16,                                   /* "large" insn */
1706   6,                                    /* MOVE_RATIO */
1707   2,                                 /* cost for loading QImode using movzbl */
1708   {4, 5, 4},                            /* cost of loading integer registers
1709                                            in QImode, HImode and SImode.
1710                                            Relative to reg-reg move (2).  */
1711   {2, 3, 2},                            /* cost of storing integer registers */
1712   2,                                    /* cost of reg,reg fld/fst */
1713   {2, 2, 6},                            /* cost of loading fp registers
1714                                            in SFmode, DFmode and XFmode */
1715   {4, 4, 6},                            /* cost of storing fp registers
1716                                            in SFmode, DFmode and XFmode */
1717   2,                                    /* cost of moving MMX register */
1718   {2, 2},                               /* cost of loading MMX registers
1719                                            in SImode and DImode */
1720   {2, 2},                               /* cost of storing MMX registers
1721                                            in SImode and DImode */
1722   12,                                   /* cost of moving SSE register */
1723   {12, 12, 12},                         /* cost of loading SSE registers
1724                                            in SImode, DImode and TImode */
1725   {2, 2, 8},                            /* cost of storing SSE registers
1726                                            in SImode, DImode and TImode */
1727   10,                                   /* MMX or SSE register to integer */
1728   8,                                    /* size of l1 cache.  */
1729   256,                                  /* size of l2 cache.  */
1730   64,                                   /* size of prefetch block */
1731   6,                                    /* number of parallel prefetches */
1732   2,                                    /* Branch cost */
1733   COSTS_N_INSNS (5),                    /* cost of FADD and FSUB insns.  */
1734   COSTS_N_INSNS (7),                    /* cost of FMUL instruction.  */
1735   COSTS_N_INSNS (43),                   /* cost of FDIV instruction.  */
1736   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1737   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1738   COSTS_N_INSNS (43),                   /* cost of FSQRT instruction.  */
1739
1740   COSTS_N_INSNS (4),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1741   COSTS_N_INSNS (6),                    /* cost of MULSS instruction.  */
1742   COSTS_N_INSNS (6),                    /* cost of MULSD instruction.  */
1743   COSTS_N_INSNS (23),                   /* cost of DIVSS instruction.  */
1744   COSTS_N_INSNS (38),                   /* cost of DIVSD instruction.  */
1745   COSTS_N_INSNS (23),                   /* cost of SQRTSS instruction.  */
1746   COSTS_N_INSNS (38),                   /* cost of SQRTSD instruction.  */
1747   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1748   pentium4_memcpy,
1749   pentium4_memset,
1750   1,                                    /* scalar_stmt_cost.  */
1751   1,                                    /* scalar_load_cost.  */
1752   1,                                    /* scalar_store_cost.  */
1753   1,                                    /* vec_stmt_cost.  */
1754   1,                                    /* vec_to_scalar_cost.  */
1755   1,                                    /* scalar_to_vec_cost.  */
1756   1,                                    /* vec_align_load_cost.  */
1757   2,                                    /* vec_unalign_load_cost.  */
1758   1,                                    /* vec_store_cost.  */
1759   3,                                    /* cond_taken_branch_cost.  */
1760   1,                                    /* cond_not_taken_branch_cost.  */
1761 };
1762
1763 static stringop_algs nocona_memcpy[2] = {
     /* memcpy expansion strategies referenced by nocona_cost.
        NOTE(review): each {N, alg, flag} triple presumably selects ALG for
        blocks of up to N bytes, with -1 meaning "no upper bound" -- confirm
        against the declaration of stringop_algs.  */
     /* Element [0]: same values as element [0] of pentium4_memcpy.  */
1764   {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
     /* Element [1]: loop, then the 8-byte REP prefix up to 20000, then an
        unrolled loop up to 100000, and a libcall beyond that (presumably the
        64-bit variant -- confirm).  */
1765   {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1766              {100000, unrolled_loop, false}, {-1, libcall, false}}}};
1767
1768 static stringop_algs nocona_memset[2] = {
     /* memset expansion strategies referenced by nocona_cost.
        NOTE(review): each {N, alg, flag} triple presumably selects ALG for
        blocks of up to N bytes, with -1 meaning "no upper bound" -- confirm
        against the declaration of stringop_algs.  */
     /* Element [0]: same values as element [0] of pentium4_memset.  */
1769   {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1770              {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
     /* Element [1]: loop, unrolled loop, then the 8-byte REP prefix up to
        8192, and a libcall beyond that (presumably the 64-bit variant --
        confirm).  */
1771   {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1772              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1773
1774 static const
1775 struct processor_costs nocona_cost = {
     /* Cost table for Nocona.  This is a positional initializer, so the
        entries below must stay in step with the field order of
        struct processor_costs, which is declared elsewhere.  */
1776   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1777   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
1778   COSTS_N_INSNS (1),                    /* variable shift costs */
1779   COSTS_N_INSNS (1),                    /* constant shift costs */
1780   {COSTS_N_INSNS (10),                  /* cost of starting multiply for QI */
1781    COSTS_N_INSNS (10),                  /*                               HI */
1782    COSTS_N_INSNS (10),                  /*                               SI */
1783    COSTS_N_INSNS (10),                  /*                               DI */
1784    COSTS_N_INSNS (10)},                 /*                            other */
1785   0,                                    /* cost of multiply per each bit set */
1786   {COSTS_N_INSNS (66),                  /* cost of a divide/mod for QI */
1787    COSTS_N_INSNS (66),                  /*                          HI */
1788    COSTS_N_INSNS (66),                  /*                          SI */
1789    COSTS_N_INSNS (66),                  /*                          DI */
1790    COSTS_N_INSNS (66)},                 /*                          other */
1791   COSTS_N_INSNS (1),                    /* cost of movsx */
1792   COSTS_N_INSNS (1),                    /* cost of movzx */
1793   16,                                   /* "large" insn */
1794   17,                                   /* MOVE_RATIO */
1795   4,                                 /* cost for loading QImode using movzbl */
1796   {4, 4, 4},                            /* cost of loading integer registers
1797                                            in QImode, HImode and SImode.
1798                                            Relative to reg-reg move (2).  */
1799   {4, 4, 4},                            /* cost of storing integer registers */
1800   3,                                    /* cost of reg,reg fld/fst */
1801   {12, 12, 12},                         /* cost of loading fp registers
1802                                            in SFmode, DFmode and XFmode */
1803   {4, 4, 4},                            /* cost of storing fp registers
1804                                            in SFmode, DFmode and XFmode */
1805   6,                                    /* cost of moving MMX register */
1806   {12, 12},                             /* cost of loading MMX registers
1807                                            in SImode and DImode */
1808   {12, 12},                             /* cost of storing MMX registers
1809                                            in SImode and DImode */
1810   6,                                    /* cost of moving SSE register */
1811   {12, 12, 12},                         /* cost of loading SSE registers
1812                                            in SImode, DImode and TImode */
1813   {12, 12, 12},                         /* cost of storing SSE registers
1814                                            in SImode, DImode and TImode */
1815   8,                                    /* MMX or SSE register to integer */
1816   8,                                    /* size of l1 cache.  */
1817   1024,                                 /* size of l2 cache.  */
1818   64,                                   /* size of prefetch block */
1819   8,                                    /* number of parallel prefetches */
1820   1,                                    /* Branch cost */
1821   COSTS_N_INSNS (6),                    /* cost of FADD and FSUB insns.  */
1822   COSTS_N_INSNS (8),                    /* cost of FMUL instruction.  */
1823   COSTS_N_INSNS (40),                   /* cost of FDIV instruction.  */
1824   COSTS_N_INSNS (3),                    /* cost of FABS instruction.  */
1825   COSTS_N_INSNS (3),                    /* cost of FCHS instruction.  */
1826   COSTS_N_INSNS (44),                   /* cost of FSQRT instruction.  */
1827
1828   COSTS_N_INSNS (5),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1829   COSTS_N_INSNS (7),                    /* cost of MULSS instruction.  */
1830   COSTS_N_INSNS (7),                    /* cost of MULSD instruction.  */
1831   COSTS_N_INSNS (32),                   /* cost of DIVSS instruction.  */
1832   COSTS_N_INSNS (40),                   /* cost of DIVSD instruction.  */
1833   COSTS_N_INSNS (32),                   /* cost of SQRTSS instruction.  */
1834   COSTS_N_INSNS (41),                   /* cost of SQRTSD instruction.  */
1835   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1836   nocona_memcpy,
1837   nocona_memset,
1838   1,                                    /* scalar_stmt_cost.  */
1839   1,                                    /* scalar_load_cost.  */
1840   1,                                    /* scalar_store_cost.  */
1841   1,                                    /* vec_stmt_cost.  */
1842   1,                                    /* vec_to_scalar_cost.  */
1843   1,                                    /* scalar_to_vec_cost.  */
1844   1,                                    /* vec_align_load_cost.  */
1845   2,                                    /* vec_unalign_load_cost.  */
1846   1,                                    /* vec_store_cost.  */
1847   3,                                    /* cond_taken_branch_cost.  */
1848   1,                                    /* cond_not_taken_branch_cost.  */
1849 };
1850
1851 static stringop_algs atom_memcpy[2] = {  /* memcpy strategies referenced by atom_cost; [0]/[1] presumably 32-/64-bit -- confirm.  */
1852   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1853   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1854              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1855 static stringop_algs atom_memset[2] = {  /* memset strategies referenced by atom_cost; [0]/[1] presumably 32-/64-bit -- confirm.  */
1856   {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1857              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1858   {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1859              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1860 static const
1861 struct processor_costs atom_cost = {  /* Positional initializer: entries must follow struct processor_costs field order.  */
1862   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1863   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
1864   COSTS_N_INSNS (1),                    /* variable shift costs */
1865   COSTS_N_INSNS (1),                    /* constant shift costs */
1866   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
1867    COSTS_N_INSNS (4),                   /*                               HI */
1868    COSTS_N_INSNS (3),                   /*                               SI */
1869    COSTS_N_INSNS (4),                   /*                               DI */
1870    COSTS_N_INSNS (2)},                  /*                            other */
1871   0,                                    /* cost of multiply per each bit set */
1872   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
1873    COSTS_N_INSNS (26),                  /*                          HI */
1874    COSTS_N_INSNS (42),                  /*                          SI */
1875    COSTS_N_INSNS (74),                  /*                          DI */
1876    COSTS_N_INSNS (74)},                 /*                          other */
1877   COSTS_N_INSNS (1),                    /* cost of movsx */
1878   COSTS_N_INSNS (1),                    /* cost of movzx */
1879   8,                                    /* "large" insn */
1880   17,                                   /* MOVE_RATIO */
1881   4,                                    /* cost for loading QImode using movzbl */
1882   {4, 4, 4},                            /* cost of loading integer registers
1883                                            in QImode, HImode and SImode.
1884                                            Relative to reg-reg move (2).  */
1885   {4, 4, 4},                            /* cost of storing integer registers */
1886   4,                                    /* cost of reg,reg fld/fst */
1887   {12, 12, 12},                         /* cost of loading fp registers
1888                                            in SFmode, DFmode and XFmode */
1889   {6, 6, 8},                            /* cost of storing fp registers
1890                                            in SFmode, DFmode and XFmode */
1891   2,                                    /* cost of moving MMX register */
1892   {8, 8},                               /* cost of loading MMX registers
1893                                            in SImode and DImode */
1894   {8, 8},                               /* cost of storing MMX registers
1895                                            in SImode and DImode */
1896   2,                                    /* cost of moving SSE register */
1897   {8, 8, 8},                            /* cost of loading SSE registers
1898                                            in SImode, DImode and TImode */
1899   {8, 8, 8},                            /* cost of storing SSE registers
1900                                            in SImode, DImode and TImode */
1901   5,                                    /* MMX or SSE register to integer */
1902   32,                                   /* size of l1 cache.  */
1903   256,                                  /* size of l2 cache.  */
1904   64,                                   /* size of prefetch block */
1905   6,                                    /* number of parallel prefetches */
1906   3,                                    /* Branch cost */
1907   COSTS_N_INSNS (8),                    /* cost of FADD and FSUB insns.  */
1908   COSTS_N_INSNS (8),                    /* cost of FMUL instruction.  */
1909   COSTS_N_INSNS (20),                   /* cost of FDIV instruction.  */
1910   COSTS_N_INSNS (8),                    /* cost of FABS instruction.  */
1911   COSTS_N_INSNS (8),                    /* cost of FCHS instruction.  */
1912   COSTS_N_INSNS (40),                   /* cost of FSQRT instruction.  */
1913
1914   COSTS_N_INSNS (5),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1915   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
1916   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
1917   COSTS_N_INSNS (31),                   /* cost of DIVSS instruction.  */
1918   COSTS_N_INSNS (60),                   /* cost of DIVSD instruction.  */
1919   COSTS_N_INSNS (31),                   /* cost of SQRTSS instruction.  */
1920   COSTS_N_INSNS (63),                   /* cost of SQRTSD instruction.  */
1921   2, 2, 2, 2,                           /* reassoc int, fp, vec_int, vec_fp.  */
1922   atom_memcpy,
1923   atom_memset,
1924   1,                                    /* scalar_stmt_cost.  */
1925   1,                                    /* scalar_load_cost.  */
1926   1,                                    /* scalar_store_cost.  */
1927   1,                                    /* vec_stmt_cost.  */
1928   1,                                    /* vec_to_scalar_cost.  */
1929   1,                                    /* scalar_to_vec_cost.  */
1930   1,                                    /* vec_align_load_cost.  */
1931   2,                                    /* vec_unalign_load_cost.  */
1932   1,                                    /* vec_store_cost.  */
1933   3,                                    /* cond_taken_branch_cost.  */
1934   1,                                    /* cond_not_taken_branch_cost.  */
1935 };
1936
1937 static stringop_algs slm_memcpy[2] = {  /* memcpy strategies referenced by slm_cost; [0]/[1] presumably 32-/64-bit -- confirm.  */
1938   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1939   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1940              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1941 static stringop_algs slm_memset[2] = {  /* memset strategies referenced by slm_cost; [0]/[1] presumably 32-/64-bit -- confirm.  */
1942   {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1943              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1944   {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1945              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1946 static const
1947 struct processor_costs slm_cost = {  /* Positional initializer: entries must follow struct processor_costs field order.  */
1948   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1949   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
1950   COSTS_N_INSNS (1),                    /* variable shift costs */
1951   COSTS_N_INSNS (1),                    /* constant shift costs */
1952   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
1953    COSTS_N_INSNS (3),                   /*                               HI */
1954    COSTS_N_INSNS (3),                   /*                               SI */
1955    COSTS_N_INSNS (4),                   /*                               DI */
1956    COSTS_N_INSNS (2)},                  /*                            other */
1957   0,                                    /* cost of multiply per each bit set */
1958   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
1959    COSTS_N_INSNS (26),                  /*                          HI */
1960    COSTS_N_INSNS (42),                  /*                          SI */
1961    COSTS_N_INSNS (74),                  /*                          DI */
1962    COSTS_N_INSNS (74)},                 /*                          other */
1963   COSTS_N_INSNS (1),                    /* cost of movsx */
1964   COSTS_N_INSNS (1),                    /* cost of movzx */
1965   8,                                    /* "large" insn */
1966   17,                                   /* MOVE_RATIO */
1967   4,                                    /* cost for loading QImode using movzbl */
1968   {4, 4, 4},                            /* cost of loading integer registers
1969                                            in QImode, HImode and SImode.
1970                                            Relative to reg-reg move (2).  */
1971   {4, 4, 4},                            /* cost of storing integer registers */
1972   4,                                    /* cost of reg,reg fld/fst */
1973   {12, 12, 12},                         /* cost of loading fp registers
1974                                            in SFmode, DFmode and XFmode */
1975   {6, 6, 8},                            /* cost of storing fp registers
1976                                            in SFmode, DFmode and XFmode */
1977   2,                                    /* cost of moving MMX register */
1978   {8, 8},                               /* cost of loading MMX registers
1979                                            in SImode and DImode */
1980   {8, 8},                               /* cost of storing MMX registers
1981                                            in SImode and DImode */
1982   2,                                    /* cost of moving SSE register */
1983   {8, 8, 8},                            /* cost of loading SSE registers
1984                                            in SImode, DImode and TImode */
1985   {8, 8, 8},                            /* cost of storing SSE registers
1986                                            in SImode, DImode and TImode */
1987   5,                                    /* MMX or SSE register to integer */
1988   32,                                   /* size of l1 cache.  */
1989   256,                                  /* size of l2 cache.  */
1990   64,                                   /* size of prefetch block */
1991   6,                                    /* number of parallel prefetches */
1992   3,                                    /* Branch cost */
1993   COSTS_N_INSNS (8),                    /* cost of FADD and FSUB insns.  */
1994   COSTS_N_INSNS (8),                    /* cost of FMUL instruction.  */
1995   COSTS_N_INSNS (20),                   /* cost of FDIV instruction.  */
1996   COSTS_N_INSNS (8),                    /* cost of FABS instruction.  */
1997   COSTS_N_INSNS (8),                    /* cost of FCHS instruction.  */
1998   COSTS_N_INSNS (40),                   /* cost of FSQRT instruction.  */
1999
2000   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2001   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
2002   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
2003   COSTS_N_INSNS (39),                   /* cost of DIVSS instruction.  */
2004   COSTS_N_INSNS (69),                   /* cost of DIVSD instruction.  */
2005   COSTS_N_INSNS (20),                   /* cost of SQRTSS instruction.  */
2006   COSTS_N_INSNS (35),                   /* cost of SQRTSD instruction.  */
2007   1, 2, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
2008   slm_memcpy,
2009   slm_memset,
2010   1,                                    /* scalar_stmt_cost.  */
2011   1,                                    /* scalar_load_cost.  */
2012   1,                                    /* scalar_store_cost.  */
2013   1,                                    /* vec_stmt_cost.  */
2014   4,                                    /* vec_to_scalar_cost.  */
2015   1,                                    /* scalar_to_vec_cost.  */
2016   1,                                    /* vec_align_load_cost.  */
2017   2,                                    /* vec_unalign_load_cost.  */
2018   1,                                    /* vec_store_cost.  */
2019   3,                                    /* cond_taken_branch_cost.  */
2020   1,                                    /* cond_not_taken_branch_cost.  */
2021 };
2022
2023 static stringop_algs intel_memcpy[2] = {  /* memcpy strategies referenced by intel_cost; [0]/[1] presumably 32-/64-bit -- confirm.  */
2024   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
2025   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
2026              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
2027 static stringop_algs intel_memset[2] = {  /* memset strategies referenced by intel_cost; [0]/[1] presumably 32-/64-bit -- confirm.  */
2028   {libcall, {{8, loop, false}, {15, unrolled_loop, false},
2029              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2030   {libcall, {{24, loop, false}, {32, unrolled_loop, false},
2031              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
2032 static const
2033 struct processor_costs intel_cost = {  /* Positional initializer: entries must follow struct processor_costs field order.  */
2034   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2035   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
2036   COSTS_N_INSNS (1),                    /* variable shift costs */
2037   COSTS_N_INSNS (1),                    /* constant shift costs */
2038   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
2039    COSTS_N_INSNS (3),                   /*                               HI */
2040    COSTS_N_INSNS (3),                   /*                               SI */
2041    COSTS_N_INSNS (4),                   /*                               DI */
2042    COSTS_N_INSNS (2)},                  /*                            other */
2043   0,                                    /* cost of multiply per each bit set */
2044   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
2045    COSTS_N_INSNS (26),                  /*                          HI */
2046    COSTS_N_INSNS (42),                  /*                          SI */
2047    COSTS_N_INSNS (74),                  /*                          DI */
2048    COSTS_N_INSNS (74)},                 /*                          other */
2049   COSTS_N_INSNS (1),                    /* cost of movsx */
2050   COSTS_N_INSNS (1),                    /* cost of movzx */
2051   8,                                    /* "large" insn */
2052   17,                                   /* MOVE_RATIO */
2053   4,                                    /* cost for loading QImode using movzbl */
2054   {4, 4, 4},                            /* cost of loading integer registers
2055                                            in QImode, HImode and SImode.
2056                                            Relative to reg-reg move (2).  */
2057   {4, 4, 4},                            /* cost of storing integer registers */
2058   4,                                    /* cost of reg,reg fld/fst */
2059   {12, 12, 12},                         /* cost of loading fp registers
2060                                            in SFmode, DFmode and XFmode */
2061   {6, 6, 8},                            /* cost of storing fp registers
2062                                            in SFmode, DFmode and XFmode */
2063   2,                                    /* cost of moving MMX register */
2064   {8, 8},                               /* cost of loading MMX registers
2065                                            in SImode and DImode */
2066   {8, 8},                               /* cost of storing MMX registers
2067                                            in SImode and DImode */
2068   2,                                    /* cost of moving SSE register */
2069   {8, 8, 8},                            /* cost of loading SSE registers
2070                                            in SImode, DImode and TImode */
2071   {8, 8, 8},                            /* cost of storing SSE registers
2072                                            in SImode, DImode and TImode */
2073   5,                                    /* MMX or SSE register to integer */
2074   32,                                   /* size of l1 cache.  */
2075   256,                                  /* size of l2 cache.  */
2076   64,                                   /* size of prefetch block */
2077   6,                                    /* number of parallel prefetches */
2078   3,                                    /* Branch cost */
2079   COSTS_N_INSNS (8),                    /* cost of FADD and FSUB insns.  */
2080   COSTS_N_INSNS (8),                    /* cost of FMUL instruction.  */
2081   COSTS_N_INSNS (20),                   /* cost of FDIV instruction.  */
2082   COSTS_N_INSNS (8),                    /* cost of FABS instruction.  */
2083   COSTS_N_INSNS (8),                    /* cost of FCHS instruction.  */
2084   COSTS_N_INSNS (40),                   /* cost of FSQRT instruction.  */
2085
2086   COSTS_N_INSNS (8),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2087   COSTS_N_INSNS (8),                    /* cost of MULSS instruction.  */
2088   COSTS_N_INSNS (8),                    /* cost of MULSD instruction.  */
2089   COSTS_N_INSNS (20),                   /* cost of DIVSS instruction.  */
2090   COSTS_N_INSNS (20),                   /* cost of DIVSD instruction.  */
2091   COSTS_N_INSNS (40),                   /* cost of SQRTSS instruction.  */
2092   COSTS_N_INSNS (40),                   /* cost of SQRTSD instruction.  */
2093   1, 4, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
2094   intel_memcpy,
2095   intel_memset,
2096   1,                                    /* scalar_stmt_cost.  */
2097   1,                                    /* scalar_load_cost.  */
2098   1,                                    /* scalar_store_cost.  */
2099   1,                                    /* vec_stmt_cost.  */
2100   4,                                    /* vec_to_scalar_cost.  */
2101   1,                                    /* scalar_to_vec_cost.  */
2102   1,                                    /* vec_align_load_cost.  */
2103   2,                                    /* vec_unalign_load_cost.  */
2104   1,                                    /* vec_store_cost.  */
2105   3,                                    /* cond_taken_branch_cost.  */
2106   1,                                    /* cond_not_taken_branch_cost.  */
2107 };
2108
2109 /* Generic should produce code tuned for Core-i7 (and newer chips)
2110    and btver1 (and newer chips).  */
2111
2112 static stringop_algs generic_memcpy[2] = {  /* memcpy strategies referenced by generic_cost; [0]/[1] presumably 32-/64-bit -- confirm.  */
2113   {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2114              {-1, libcall, false}}},
2115   {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2116              {-1, libcall, false}}}};
2117 static stringop_algs generic_memset[2] = {  /* memset strategies referenced by generic_cost; [0]/[1] presumably 32-/64-bit -- confirm.  */
2118   {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2119              {-1, libcall, false}}},
2120   {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2121              {-1, libcall, false}}}};
2122 static const
2123 struct processor_costs generic_cost = {  /* Positional initializer: entries must follow struct processor_costs field order.  */
2124   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2125   /* On all chips taken into consideration lea is 2 cycles and more.  With
2126      this cost however our current implementation of synth_mult results in
2127      use of unnecessary temporary registers causing regression on several
2128      SPECfp benchmarks.  */
2129   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
2130   COSTS_N_INSNS (1),                    /* variable shift costs */
2131   COSTS_N_INSNS (1),                    /* constant shift costs */
2132   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
2133    COSTS_N_INSNS (4),                   /*                               HI */
2134    COSTS_N_INSNS (3),                   /*                               SI */
2135    COSTS_N_INSNS (4),                   /*                               DI */
2136    COSTS_N_INSNS (2)},                  /*                            other */
2137   0,                                    /* cost of multiply per each bit set */
2138   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
2139    COSTS_N_INSNS (26),                  /*                          HI */
2140    COSTS_N_INSNS (42),                  /*                          SI */
2141    COSTS_N_INSNS (74),                  /*                          DI */
2142    COSTS_N_INSNS (74)},                 /*                          other */
2143   COSTS_N_INSNS (1),                    /* cost of movsx */
2144   COSTS_N_INSNS (1),                    /* cost of movzx */
2145   8,                                    /* "large" insn */
2146   17,                                   /* MOVE_RATIO */
2147   4,                                 /* cost for loading QImode using movzbl */
2148   {4, 4, 4},                            /* cost of loading integer registers
2149                                            in QImode, HImode and SImode.
2150                                            Relative to reg-reg move (2).  */
2151   {4, 4, 4},                            /* cost of storing integer registers */
2152   4,                                    /* cost of reg,reg fld/fst */
2153   {12, 12, 12},                         /* cost of loading fp registers
2154                                            in SFmode, DFmode and XFmode */
2155   {6, 6, 8},                            /* cost of storing fp registers
2156                                            in SFmode, DFmode and XFmode */
2157   2,                                    /* cost of moving MMX register */
2158   {8, 8},                               /* cost of loading MMX registers
2159                                            in SImode and DImode */
2160   {8, 8},                               /* cost of storing MMX registers
2161                                            in SImode and DImode */
2162   2,                                    /* cost of moving SSE register */
2163   {8, 8, 8},                            /* cost of loading SSE registers
2164                                            in SImode, DImode and TImode */
2165   {8, 8, 8},                            /* cost of storing SSE registers
2166                                            in SImode, DImode and TImode */
2167   5,                                    /* MMX or SSE register to integer */
2168   32,                                   /* size of l1 cache.  */
2169   512,                                  /* size of l2 cache.  */
2170   64,                                   /* size of prefetch block */
2171   6,                                    /* number of parallel prefetches */
2172   /* Benchmarks show large regressions on K8 sixtrack benchmark when this
2173      value is increased to perhaps more appropriate value of 5.  */
2174   3,                                    /* Branch cost */
2175   COSTS_N_INSNS (8),                    /* cost of FADD and FSUB insns.  */
2176   COSTS_N_INSNS (8),                    /* cost of FMUL instruction.  */
2177   COSTS_N_INSNS (20),                   /* cost of FDIV instruction.  */
2178   COSTS_N_INSNS (8),                    /* cost of FABS instruction.  */
2179   COSTS_N_INSNS (8),                    /* cost of FCHS instruction.  */
2180   COSTS_N_INSNS (40),                   /* cost of FSQRT instruction.  */
2181
2182   COSTS_N_INSNS (8),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2183   COSTS_N_INSNS (8),                    /* cost of MULSS instruction.  */
2184   COSTS_N_INSNS (8),                    /* cost of MULSD instruction.  */
2185   COSTS_N_INSNS (20),                   /* cost of DIVSS instruction.  */
2186   COSTS_N_INSNS (20),                   /* cost of DIVSD instruction.  */
2187   COSTS_N_INSNS (40),                   /* cost of SQRTSS instruction.  */
2188   COSTS_N_INSNS (40),                   /* cost of SQRTSD instruction.  */
2189   1, 2, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
2190   generic_memcpy,
2191   generic_memset,
2192   1,                                    /* scalar_stmt_cost.  */
2193   1,                                    /* scalar_load_cost.  */
2194   1,                                    /* scalar_store_cost.  */
2195   1,                                    /* vec_stmt_cost.  */
2196   1,                                    /* vec_to_scalar_cost.  */
2197   1,                                    /* scalar_to_vec_cost.  */
2198   1,                                    /* vec_align_load_cost.  */
2199   2,                                    /* vec_unalign_load_cost.  */
2200   1,                                    /* vec_store_cost.  */
2201   3,                                    /* cond_taken_branch_cost.  */
2202   1,                                    /* cond_not_taken_branch_cost.  */
2203 };
2204
2205 /* core_cost should produce code tuned for the Core family of CPUs.  */
2206 static stringop_algs core_memcpy[2] = {  /* memcpy strategies referenced by core_cost; [0]/[1] presumably 32-/64-bit -- confirm.  */
2207   {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
2208   {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
2209              {-1, libcall, false}}}};
2210 static stringop_algs core_memset[2] = {  /* memset strategies referenced by core_cost; [0]/[1] presumably 32-/64-bit -- confirm.  */
2211   {libcall, {{6, loop_1_byte, true},
2212              {24, loop, true},
2213              {8192, rep_prefix_4_byte, true},
2214              {-1, libcall, false}}},
2215   {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
2216              {-1, libcall, false}}}};
2217
2218 static const
2219 struct processor_costs core_cost = {  /* Positional initializer: entries must follow struct processor_costs field order.  */
2220   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2221   /* On all chips taken into consideration lea is 2 cycles and more.  With
2222      this cost however our current implementation of synth_mult results in
2223      use of unnecessary temporary registers causing regression on several
2224      SPECfp benchmarks.  */
2225   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
2226   COSTS_N_INSNS (1),                    /* variable shift costs */
2227   COSTS_N_INSNS (1),                    /* constant shift costs */
2228   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
2229    COSTS_N_INSNS (4),                   /*                               HI */
2230    COSTS_N_INSNS (3),                   /*                               SI */
2231    COSTS_N_INSNS (4),                   /*                               DI */
2232    COSTS_N_INSNS (2)},                  /*                            other */
2233   0,                                    /* cost of multiply per each bit set */
2234   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
2235    COSTS_N_INSNS (26),                  /*                          HI */
2236    COSTS_N_INSNS (42),                  /*                          SI */
2237    COSTS_N_INSNS (74),                  /*                          DI */
2238    COSTS_N_INSNS (74)},                 /*                          other */
2239   COSTS_N_INSNS (1),                    /* cost of movsx */
2240   COSTS_N_INSNS (1),                    /* cost of movzx */
2241   8,                                    /* "large" insn */
2242   17,                                   /* MOVE_RATIO */
2243   4,                                 /* cost for loading QImode using movzbl */
2244   {4, 4, 4},                            /* cost of loading integer registers
2245                                            in QImode, HImode and SImode.
2246                                            Relative to reg-reg move (2).  */
2247   {4, 4, 4},                            /* cost of storing integer registers */
2248   4,                                    /* cost of reg,reg fld/fst */
2249   {12, 12, 12},                         /* cost of loading fp registers
2250                                            in SFmode, DFmode and XFmode */
2251   {6, 6, 8},                            /* cost of storing fp registers
2252                                            in SFmode, DFmode and XFmode */
2253   2,                                    /* cost of moving MMX register */
2254   {8, 8},                               /* cost of loading MMX registers
2255                                            in SImode and DImode */
2256   {8, 8},                               /* cost of storing MMX registers
2257                                            in SImode and DImode */
2258   2,                                    /* cost of moving SSE register */
2259   {8, 8, 8},                            /* cost of loading SSE registers
2260                                            in SImode, DImode and TImode */
2261   {8, 8, 8},                            /* cost of storing SSE registers
2262                                            in SImode, DImode and TImode */
2263   5,                                    /* MMX or SSE register to integer */
2264   64,                                   /* size of l1 cache.  */
2265   512,                                  /* size of l2 cache.  */
2266   64,                                   /* size of prefetch block */
2267   6,                                    /* number of parallel prefetches */
2268   /* FIXME perhaps more appropriate value is 5.  */
2269   3,                                    /* Branch cost */
2270   COSTS_N_INSNS (8),                    /* cost of FADD and FSUB insns.  */
2271   COSTS_N_INSNS (8),                    /* cost of FMUL instruction.  */
2272   COSTS_N_INSNS (20),                   /* cost of FDIV instruction.  */
2273   COSTS_N_INSNS (8),                    /* cost of FABS instruction.  */
2274   COSTS_N_INSNS (8),                    /* cost of FCHS instruction.  */
2275   COSTS_N_INSNS (40),                   /* cost of FSQRT instruction.  */
2276
2277   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2278   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
2279   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
2280   COSTS_N_INSNS (18),                   /* cost of DIVSS instruction.  */
2281   COSTS_N_INSNS (32),                   /* cost of DIVSD instruction.  */
2282   COSTS_N_INSNS (30),                   /* cost of SQRTSS instruction.  */
2283   COSTS_N_INSNS (58),                   /* cost of SQRTSD instruction.  */
2284   1, 4, 2, 2,                           /* reassoc int, fp, vec_int, vec_fp.  */
2285   core_memcpy,
2286   core_memset,
2287   1,                                    /* scalar_stmt_cost.  */
2288   1,                                    /* scalar_load_cost.  */
2289   1,                                    /* scalar_store_cost.  */
2290   1,                                    /* vec_stmt_cost.  */
2291   1,                                    /* vec_to_scalar_cost.  */
2292   1,                                    /* scalar_to_vec_cost.  */
2293   1,                                    /* vec_align_load_cost.  */
2294   2,                                    /* vec_unalign_load_cost.  */
2295   1,                                    /* vec_store_cost.  */
2296   3,                                    /* cond_taken_branch_cost.  */
2297   1,                                    /* cond_not_taken_branch_cost.  */
2298 };
2299