Add -march=iamcu to optimize for IA MCU

author H.J. Lu <hjl@gcc.gnu.org>

Mon, 6 Jul 2015 15:17:44 +0000 (08:17 -0700)

committer H.J. Lu <hjl@gcc.gnu.org>

Mon, 6 Jul 2015 15:17:44 +0000 (08:17 -0700)
author H.J. Lu <hjl@gcc.gnu.org>
Mon, 6 Jul 2015 15:17:44 +0000 (08:17 -0700)
committer H.J. Lu <hjl@gcc.gnu.org>
Mon, 6 Jul 2015 15:17:44 +0000 (08:17 -0700)
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c

index 0a5dfd3554ddd95cab0ab6207247ca9efa133d69..9fb8db613a6916b215ae0f0a695aa8d97f7a113e 100644 (file)
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -426,6 +426,74 @@ struct processor_costs pentium_cost = {
    1,                                   /* cond_not_taken_branch_cost.  */
  };
  
+static const
+struct processor_costs iamcu_cost = {  /* Cost table that the "iamcu" entry of processor_target_table points at.  */
+  COSTS_N_INSNS (1),                   /* cost of an add instruction */
+  COSTS_N_INSNS (1) + 1,               /* cost of a lea instruction */
+  COSTS_N_INSNS (4),                   /* variable shift costs */
+  COSTS_N_INSNS (1),                   /* constant shift costs */
+  {COSTS_N_INSNS (11),                 /* cost of starting multiply for QI */
+   COSTS_N_INSNS (11),                 /*                               HI */
+   COSTS_N_INSNS (11),                 /*                               SI */
+   COSTS_N_INSNS (11),                 /*                               DI */
+   COSTS_N_INSNS (11)},                /*                            other */
+  0,                                   /* cost of multiply per each bit set */
+  {COSTS_N_INSNS (25),                 /* cost of a divide/mod for QI */
+   COSTS_N_INSNS (25),                 /*                          HI */
+   COSTS_N_INSNS (25),                 /*                          SI */
+   COSTS_N_INSNS (25),                 /*                          DI */
+   COSTS_N_INSNS (25)},                /*                          other */
+  COSTS_N_INSNS (3),                   /* cost of movsx */
+  COSTS_N_INSNS (2),                   /* cost of movzx */
+  8,                                   /* "large" insn */
+  6,                                   /* MOVE_RATIO */
+  6,                                /* cost for loading QImode using movzbl */
+  {2, 4, 2},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {2, 4, 2},                           /* cost of storing integer registers */
+  2,                                   /* cost of reg,reg fld/fst */
+  {2, 2, 6},                           /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {4, 4, 6},                           /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  8,                                   /* cost of moving MMX register */
+  {8, 8},                              /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {8, 8},                              /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2,                                   /* cost of moving SSE register */
+  {4, 8, 16},                          /* cost of loading SSE registers
+                                          in SImode, DImode and TImode */
+  {4, 8, 16},                          /* cost of storing SSE registers
+                                          in SImode, DImode and TImode */
+  3,                                   /* MMX or SSE register to integer */
+  8,                                   /* size of l1 cache.  */
+  8,                                   /* size of l2 cache  */
+  0,                                   /* size of prefetch block */
+  0,                                   /* number of parallel prefetches */
+  2,                                   /* Branch cost */
+  COSTS_N_INSNS (3),                   /* cost of FADD and FSUB insns.  */
+  COSTS_N_INSNS (3),                   /* cost of FMUL instruction.  */
+  COSTS_N_INSNS (39),                  /* cost of FDIV instruction.  */
+  COSTS_N_INSNS (1),                   /* cost of FABS instruction.  */
+  COSTS_N_INSNS (1),                   /* cost of FCHS instruction.  */
+  COSTS_N_INSNS (70),                  /* cost of FSQRT instruction.  */
+  pentium_memcpy,                      /* reuses the pentium_* table; presumably the memcpy strategy -- confirm.  */
+  pentium_memset,                      /* reuses the pentium_* table; presumably the memset strategy -- confirm.  */
+  1,                                   /* scalar_stmt_cost.  */
+  1,                                   /* scalar_load_cost.  */
+  1,                                   /* scalar_store_cost.  */
+  1,                                   /* vec_stmt_cost.  */
+  1,                                   /* vec_to_scalar_cost.  */
+  1,                                   /* scalar_to_vec_cost.  */
+  1,                                   /* vec_align_load_cost.  */
+  2,                                   /* vec_unalign_load_cost.  */
+  1,                                   /* vec_store_cost.  */
+  3,                                   /* cond_taken_branch_cost.  */
+  1,                                   /* cond_not_taken_branch_cost.  */
+};
+
  /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
     (we ensure the alignment).  For small blocks inline loop is still a
     noticeable win, for bigger blocks either rep movsl or rep movsb is
@@ -2027,6 +2095,7 @@ const struct processor_costs *ix86_cost = &pentium_cost;
  #define m_386 (1<<PROCESSOR_I386)
  #define m_486 (1<<PROCESSOR_I486)
  #define m_PENT (1<<PROCESSOR_PENTIUM)
+#define m_IAMCU (1<<PROCESSOR_IAMCU)
  #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
  #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
  #define m_NOCONA (1<<PROCESSOR_NOCONA)
@@ -2086,7 +2155,7 @@ unsigned char ix86_arch_features[X86_ARCH_LAST];
     ix86_arch_features based on the processor mask.  */
  static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
    /* X86_ARCH_CMOV: Conditional move was added for pentiumpro.  */
-  ~(m_386 | m_486 | m_PENT | m_K6),
+  ~(m_386 | m_486 | m_PENT | m_IAMCU | m_K6),
  
    /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486.  */
    ~m_386,
@@ -2497,6 +2566,7 @@ static const struct ptt processor_target_table[PROCESSOR_max] =
    {"i386", &i386_cost, 4, 3, 4, 3, 4},
    {"i486", &i486_cost, 16, 15, 16, 15, 16},
    {"pentium", &pentium_cost, 16, 7, 16, 7, 16},
+  {"iamcu", &iamcu_cost, 16, 7, 16, 7, 16},
    {"pentiumpro", &pentiumpro_cost, 16, 15, 16, 10, 16},
    {"pentium4", &pentium4_cost, 0, 0, 0, 0, 0},
    {"nocona", &nocona_cost, 0, 0, 0, 0, 0},
@@ -3246,6 +3316,7 @@ ix86_option_override_internal (bool main_args_p,
        {"i486", PROCESSOR_I486, CPU_NONE, 0},
        {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
        {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
+      {"iamcu", PROCESSOR_IAMCU, CPU_PENTIUM, 0},
        {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
        {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
        {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
@@ -26139,6 +26210,7 @@ ix86_issue_rate (void)
    switch (ix86_tune)
      {
      case PROCESSOR_PENTIUM:
+    case PROCESSOR_IAMCU:
      case PROCESSOR_BONNELL:
      case PROCESSOR_SILVERMONT:
      case PROCESSOR_KNL:
@@ -26325,6 +26397,7 @@ ix86_adjust_cost (rtx_insn *insn, rtx link, rtx_insn *dep_insn, int cost)
    switch (ix86_tune)
      {
      case PROCESSOR_PENTIUM:
+    case PROCESSOR_IAMCU:
        /* Address Generation Interlock adds a cycle of latency.  */
        if (insn_type == TYPE_LEA)
         {
@@ -26534,6 +26607,7 @@ ia32_multipass_dfa_lookahead (void)
    switch (ix86_tune)
      {
      case PROCESSOR_PENTIUM:
+    case PROCESSOR_IAMCU:
        return 2;
  
      case PROCESSOR_PENTIUMPRO:
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h

index d710b3d2643b8a0b2589806d02b6b2578a687d1c..f357e79d7805f624ec2d6f3838d12ab75ccdaa52 100644 (file)
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -2266,6 +2266,7 @@ enum processor_type
    PROCESSOR_I386,                      /* 80386 */
    PROCESSOR_I486,                      /* 80486DX, 80486SX, 80486DX[24] */
    PROCESSOR_PENTIUM,
+  PROCESSOR_IAMCU,
    PROCESSOR_PENTIUMPRO,
    PROCESSOR_PENTIUM4,
    PROCESSOR_NOCONA,
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def

index bb3209d6fcbe034a210b95e4da4050a9c3c9a0b7..42a560b1e8ddbc35de620ae5c96e73eb54e9d44a 100644 (file)
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -40,8 +40,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  
  /* X86_TUNE_SCHEDULE: Enable scheduling.  */
  DEF_TUNE (X86_TUNE_SCHEDULE, "schedule",
-          m_PENT | m_PPRO | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL 
-         | m_KNL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC)
+          m_PENT | m_IAMCU | m_PPRO | m_CORE_ALL | m_BONNELL | m_SILVERMONT
+         | m_INTEL | m_KNL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC)
  
  /* X86_TUNE_PARTIAL_REG_DEPENDENCY: Enable more register renaming
     on modern chips.  Prefer stores affecting whole integer register
@@ -172,19 +172,21 @@ DEF_TUNE (X86_TUNE_PUSH_MEMORY, "push_memory",
  /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
     over esp subtraction.  */
  DEF_TUNE (X86_TUNE_SINGLE_PUSH, "single_push", m_386 | m_486 | m_PENT
-          | m_K6_GEODE)
+         | m_IAMCU | m_K6_GEODE)
  
  /* X86_TUNE_DOUBLE_PUSH. Enable if double push insn is preferred
     over esp subtraction.  */
-DEF_TUNE (X86_TUNE_DOUBLE_PUSH, "double_push", m_PENT | m_K6_GEODE)
+DEF_TUNE (X86_TUNE_DOUBLE_PUSH, "double_push", m_PENT | m_IAMCU
+         | m_K6_GEODE)
  
  /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
     over esp addition.  */
-DEF_TUNE (X86_TUNE_SINGLE_POP, "single_pop", m_386 | m_486 | m_PENT | m_PPRO)
+DEF_TUNE (X86_TUNE_SINGLE_POP, "single_pop", m_386 | m_486 | m_PENT
+         | m_IAMCU | m_PPRO)
  
  /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
     over esp addition.  */
-DEF_TUNE (X86_TUNE_DOUBLE_POP, "double_pop", m_PENT)
+DEF_TUNE (X86_TUNE_DOUBLE_POP, "double_pop", m_PENT | m_IAMCU)
  
  /*****************************************************************************/
  /* Branch predictor tuning                                                  */
@@ -224,7 +226,7 @@ DEF_TUNE (X86_TUNE_LCP_STALL, "lcp_stall", m_CORE_ALL | m_GENERIC)
  
  /* X86_TUNE_READ_MODIFY: Enable use of read-modify instructions such
     as "add mem, reg".  */
-DEF_TUNE (X86_TUNE_READ_MODIFY, "read_modify", ~(m_PENT | m_PPRO))
+DEF_TUNE (X86_TUNE_READ_MODIFY, "read_modify", ~(m_PENT | m_IAMCU | m_PPRO))
  
  /* X86_TUNE_USE_INCDEC: Enable use of inc/dec instructions.   */
  DEF_TUNE (X86_TUNE_USE_INCDEC, "use_incdec",
@@ -284,7 +286,8 @@ DEF_TUNE (X86_TUNE_USE_SAHF, "use_sahf",
  
  /* X86_TUNE_USE_CLTD: Controls use of CLTD and CQTO instructions.  */
  DEF_TUNE (X86_TUNE_USE_CLTD, "use_cltd",
-         ~(m_PENT | m_BONNELL | m_SILVERMONT | m_KNL | m_INTEL  | m_K6))
+         ~(m_PENT | m_IAMCU | m_BONNELL | m_SILVERMONT | m_KNL | m_INTEL
+           | m_K6))
  
  /* X86_TUNE_USE_BT: Enable use of BT (bit test) instructions.  */
  DEF_TUNE (X86_TUNE_USE_BT, "use_bt",
@@ -304,8 +307,8 @@ DEF_TUNE (X86_TUNE_USE_HIMODE_FIOP, "use_himode_fiop",
  /* X86_TUNE_USE_SIMODE_FIOP: Enables use of x87 instructions with 32bit
     integer operand.  */
  DEF_TUNE (X86_TUNE_USE_SIMODE_FIOP, "use_simode_fiop",
-          ~(m_PENT | m_PPRO | m_CORE_ALL | m_BONNELL | m_SILVERMONT
-           | m_KNL | m_INTEL | m_AMD_MULTIPLE | m_GENERIC))
+          ~(m_PENT | m_IAMCU | m_PPRO | m_CORE_ALL | m_BONNELL
+           | m_SILVERMONT | m_KNL | m_INTEL | m_AMD_MULTIPLE | m_GENERIC))
  
  /* X86_TUNE_USE_FFREEP: Use freep instruction instead of fstp.  */
  DEF_TUNE (X86_TUNE_USE_FFREEP, "use_ffreep", m_AMD_MULTIPLE)
@@ -444,7 +447,8 @@ DEF_TUNE (X86_TUNE_SHIFT1, "shift1", ~m_486)
  
  /* X86_TUNE_ZERO_EXTEND_WITH_AND: Use AND instruction instead
     of movzbl/movzwl.  */
-DEF_TUNE (X86_TUNE_ZERO_EXTEND_WITH_AND, "zero_extend_with_and",  m_486 | m_PENT)
+DEF_TUNE (X86_TUNE_ZERO_EXTEND_WITH_AND, "zero_extend_with_and",
+         m_486 | m_PENT | m_IAMCU)
  
  /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have same latency for HImode
     and SImode multiply, but 386 and 486 do HImode multiply faster.  */
@@ -454,19 +458,21 @@ DEF_TUNE (X86_TUNE_PROMOTE_HIMODE_IMUL, "promote_himode_imul",
  /* X86_TUNE_FAST_PREFIX: Enable demoting some 32bit or 64bit arithmetic
     into 16bit/8bit when resulting sequence is shorter.  For example
     for "and $-65536, reg" to 16bit store of 0.  */
-DEF_TUNE (X86_TUNE_FAST_PREFIX, "fast_prefix", ~(m_386 | m_486 | m_PENT))
+DEF_TUNE (X86_TUNE_FAST_PREFIX, "fast_prefix",
+         ~(m_386 | m_486 | m_PENT | m_IAMCU))
  
  /* X86_TUNE_READ_MODIFY_WRITE: Enable use of read modify write instructions
     such as "add $1, mem".  */
-DEF_TUNE (X86_TUNE_READ_MODIFY_WRITE, "read_modify_write", ~m_PENT)
+DEF_TUNE (X86_TUNE_READ_MODIFY_WRITE, "read_modify_write",
+         ~(m_PENT | m_IAMCU))
  
  /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
     than a MOV.  */
-DEF_TUNE (X86_TUNE_MOVE_M1_VIA_OR, "move_m1_via_or", m_PENT)
+DEF_TUNE (X86_TUNE_MOVE_M1_VIA_OR, "move_m1_via_or", m_PENT | m_IAMCU)
  
  /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
     but one byte longer.  */
-DEF_TUNE (X86_TUNE_NOT_UNPAIRABLE, "not_unpairable", m_PENT)
+DEF_TUNE (X86_TUNE_NOT_UNPAIRABLE, "not_unpairable", m_PENT | m_IAMCU)
  
  /* X86_TUNE_PARTIAL_REG_STALL: Pentium pro, unlike later chips, handled
     use of partial registers by renaming.  This improved performance of 16bit
diff --git a/gcc/testsuite/gcc.target/i386/pr66749.c b/gcc/testsuite/gcc.target/i386/pr66749.c

new file mode 100644 (file)

index 0000000..affda08
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr66749.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target ia32 } */
+/* { dg-require-effective-target nonpic } */
+/* { dg-options "-O2 -miamcu -mtune=iamcu" } */
+
+char a[10], b[10];
+
+int f(int i)
+{
+  return a[i+1] + b[i+1];  /* the dg-final directives scan the assembly for "a+1" and "b+1".  */
+}
+
+/* { dg-final { scan-assembler "a\\+1" } } */
+/* { dg-final { scan-assembler "b\\+1" } } */
author	H.J. Lu <hjl@gcc.gnu.org>
	Mon, 6 Jul 2015 15:17:44 +0000 (08:17 -0700)
committer	H.J. Lu <hjl@gcc.gnu.org>
	Mon, 6 Jul 2015 15:17:44 +0000 (08:17 -0700)
gcc/config/i386/i386.c		patch \| blob \| history
gcc/config/i386/i386.h		patch \| blob \| history
gcc/config/i386/x86-tune.def		patch \| blob \| history
gcc/testsuite/gcc.target/i386/pr66749.c	[new file with mode: 0644]	patch \| blob