+2014-12-10 Ilya Tocar <ilya.tocar@intel.com>
+
+ * config.gcc: Support "knl".
+ * config/i386/driver-i386.c (host_detect_local_cpu): Detect "knl".
+ * config/i386/i386-c.c (ix86_target_macros_internal): Handle
+ PROCESSOR_KNL.
+ * config/i386/i386.c (m_KNL): Define.
+ (processor_target_table): Add "knl".
+ (PTA_KNL): Define.
+ (ix86_issue_rate): Add PROCESSOR_KNL.
+ (ix86_adjust_cost): Ditto.
+ (ia32_multipass_dfa_lookahead): Ditto.
+ (get_builtin_code_for_version): Handle "knl".
+ (fold_builtin_cpu): Ditto.
+ * config/i386/i386.h (TARGET_KNL): Define.
+ (processor_type): Add PROCESSOR_KNL.
+ * config/i386/i386.md (attr "cpu"): Add knl.
+ * config/i386/x86-tune.def: Add m_KNL.
+
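(Illustration only, not part of the patch.)  The fold_builtin_cpu and
get_builtin_code_for_version changes listed above are what make the string
"knl" acceptable to the CPU-detection builtins.  A minimal sketch, assuming a
compiler built with this patch and a libgcc whose run-time CPU model also
recognizes Knights Landing (that runtime side is not shown in this diff):

    #include <stdio.h>

    int
    main (void)
    {
      __builtin_cpu_init ();
      if (__builtin_cpu_is ("knl"))
        printf ("Knights Landing detected\n");
      return 0;
    }

The same name should also become usable as -march=knl / -mtune=knl and in the
arch=knl form of the target attribute, per the processor_alias_table and
get_builtin_code_for_version hunks below.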
2014-12-10 Jan Hubicka <hubicka@ucw.cz>

* doc/invoke.texi (-fdevirtualize-at-ltrans): Document.
x86_64_archs="amdfam10 athlon64 athlon64-sse3 barcelona bdver1 bdver2 \
bdver3 bdver4 btver1 btver2 k8 k8-sse3 opteron opteron-sse3 nocona \
core2 corei7 corei7-avx core-avx-i core-avx2 atom slm nehalem westmere \
-sandybridge ivybridge haswell broadwell bonnell silvermont x86-64 native"
+sandybridge ivybridge haswell broadwell bonnell silvermont knl x86-64 \
+native"
# Additional x86 processors supported by --with-cpu=. Each processor
# MUST be separated by exactly one space.
if (arch)
{
/* This is unknown family 0x6 CPU. */
- if (has_adx)
+ /* Assume Knights Landing. */
+ if (has_avx512f)
+ cpu = "knl";
+ /* Assume Broadwell. */
+ else if (has_adx)
cpu = "broadwell";
else if (has_avx2)
/* Assume Haswell. */
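(Aside, not part of the patch.)  Checking has_avx512f before has_adx matters
because PTA_KNL builds on PTA_BROADWELL, so a KNL part is expected to set the
ADX bit too and would otherwise be misdetected as Broadwell.  The flag itself
presumably reflects the architectural AVX512F bit, CPUID leaf 7, subleaf 0,
EBX bit 16; a hypothetical standalone check (host_has_avx512f is a made-up
name, not the driver's own code) could look like:

    #include <cpuid.h>

    /* Return nonzero if the host CPU advertises AVX512F
       (CPUID.(EAX=7,ECX=0):EBX bit 16).  */
    static int
    host_has_avx512f (void)
    {
      unsigned int eax, ebx, ecx, edx;

      if (__get_cpuid_max (0, NULL) < 7)
        return 0;
      __cpuid_count (7, 0, eax, ebx, ecx, edx);
      return (ebx >> 16) & 1;
    }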
def_or_undef (parse_in, "__silvermont");
def_or_undef (parse_in, "__silvermont__");
break;
+ case PROCESSOR_KNL:
+ def_or_undef (parse_in, "__knl");
+ def_or_undef (parse_in, "__knl__");
+ break;
/* use PROCESSOR_max to not set/unset the arch macro. */
case PROCESSOR_max:
break;
def_or_undef (parse_in, "__tune_slm__");
def_or_undef (parse_in, "__tune_silvermont__");
break;
+ case PROCESSOR_KNL:
+ def_or_undef (parse_in, "__tune_knl__");
+ break;
case PROCESSOR_INTEL:
case PROCESSOR_GENERIC:
break;
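(Illustration only, not part of the patch.)  With the two i386-c.c hunks
above, -march=knl should predefine __knl and __knl__, and -mtune=knl should
predefine __tune_knl__.  A hedged sketch of probing them follows; the macro
names come straight from the hunks, while TARGET_ARCH_NAME is made up for the
example:

    /* Compile with -march=knl (or -mtune=knl for the tuning macro).  */
    #ifdef __knl__
    # define TARGET_ARCH_NAME "knl"
    #else
    # define TARGET_ARCH_NAME "other"
    #endif

    #ifdef __tune_knl__
    /* Tuning, though not necessarily the ISA, targets Knights Landing.  */
    #endif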
#define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_HASWELL)
#define m_BONNELL (1<<PROCESSOR_BONNELL)
#define m_SILVERMONT (1<<PROCESSOR_SILVERMONT)
+#define m_KNL (1<<PROCESSOR_KNL)
#define m_INTEL (1<<PROCESSOR_INTEL)
#define m_GEODE (1<<PROCESSOR_GEODE)
{"haswell", &core_cost, 16, 10, 16, 10, 16},
{"bonnell", &atom_cost, 16, 15, 16, 7, 16},
{"silvermont", &slm_cost, 16, 15, 16, 7, 16},
+ {"knl", &slm_cost, 16, 15, 16, 7, 16},
{"intel", &intel_cost, 16, 15, 16, 7, 16},
{"geode", &geode_cost, 0, 0, 0, 0, 0},
{"k6", &k6_cost, 32, 7, 32, 7, 32},
| PTA_FMA | PTA_MOVBE | PTA_HLE)
#define PTA_BROADWELL \
(PTA_HASWELL | PTA_ADX | PTA_PRFCHW | PTA_RDSEED)
+#define PTA_KNL \
+ (PTA_BROADWELL | PTA_AVX512PF | PTA_AVX512ER | PTA_AVX512F | PTA_AVX512CD)
#define PTA_BONNELL \
(PTA_CORE2 | PTA_MOVBE)
#define PTA_SILVERMONT \
{"atom", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
{"silvermont", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
{"slm", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
+ {"knl", PROCESSOR_KNL, CPU_KNL, PTA_KNL},
{"intel", PROCESSOR_INTEL, CPU_SLM, PTA_NEHALEM},
{"geode", PROCESSOR_GEODE, CPU_GEODE,
PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW},
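(Illustration only, not part of the patch.)  Because PTA_KNL is PTA_BROADWELL
plus the four AVX-512 subsets, and the alias table maps "knl" to PROCESSOR_KNL
with that mask, compiling with -march=knl should predefine the corresponding
ISA macros.  A hedged compile-time check:

    /* Expected to compile cleanly under -march=knl with this patch.  */
    #if !defined (__AVX512F__) || !defined (__AVX512CD__) \
        || !defined (__AVX512ER__) || !defined (__AVX512PF__)
    # error "expected the KNL AVX-512 subsets to be enabled"
    #endif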
case PROCESSOR_PENTIUM:
case PROCESSOR_BONNELL:
case PROCESSOR_SILVERMONT:
+ case PROCESSOR_KNL:
case PROCESSOR_INTEL:
case PROCESSOR_K6:
case PROCESSOR_BTVER2:
break;
case PROCESSOR_SILVERMONT:
+ case PROCESSOR_KNL:
case PROCESSOR_INTEL:
if (!reload_completed)
return cost;
case PROCESSOR_HASWELL:
case PROCESSOR_BONNELL:
case PROCESSOR_SILVERMONT:
+ case PROCESSOR_KNL:
case PROCESSOR_INTEL:
/* Generally, we want haifa-sched:max_issue() to look ahead as far
as the number of instructions that can be executed in one cycle, i.e.,
P_PROC_FMA,
P_AVX2,
P_PROC_AVX2,
- P_AVX512F
+ P_AVX512F,
+ P_PROC_AVX512F
};
enum feature_priority priority = P_ZERO;
arg_str = "bonnell";
priority = P_PROC_SSSE3;
break;
+ case PROCESSOR_KNL:
+ arg_str = "knl";
+ priority = P_PROC_AVX512F;
+ break;
case PROCESSOR_SILVERMONT:
arg_str = "silvermont";
priority = P_PROC_SSE4_2;
M_AMDFAM10H,
M_AMDFAM15H,
M_INTEL_SILVERMONT,
+ M_INTEL_KNL,
M_AMD_BTVER1,
M_AMD_BTVER2,
M_CPU_SUBTYPE_START,
{"haswell", M_INTEL_COREI7_HASWELL},
{"bonnell", M_INTEL_BONNELL},
{"silvermont", M_INTEL_SILVERMONT},
+ {"knl", M_INTEL_KNL},
{"amdfam10h", M_AMDFAM10H},
{"barcelona", M_AMDFAM10H_BARCELONA},
{"shanghai", M_AMDFAM10H_SHANGHAI},
#define TARGET_HASWELL (ix86_tune == PROCESSOR_HASWELL)
#define TARGET_BONNELL (ix86_tune == PROCESSOR_BONNELL)
#define TARGET_SILVERMONT (ix86_tune == PROCESSOR_SILVERMONT)
+#define TARGET_KNL (ix86_tune == PROCESSOR_KNL)
#define TARGET_INTEL (ix86_tune == PROCESSOR_INTEL)
#define TARGET_GENERIC (ix86_tune == PROCESSOR_GENERIC)
#define TARGET_AMDFAM10 (ix86_tune == PROCESSOR_AMDFAM10)
PROCESSOR_HASWELL,
PROCESSOR_BONNELL,
PROCESSOR_SILVERMONT,
+ PROCESSOR_KNL,
PROCESSOR_INTEL,
PROCESSOR_GEODE,
PROCESSOR_K6,
;; Processor type.
(define_attr "cpu" "none,pentium,pentiumpro,geode,k6,athlon,k8,core2,nehalem,
atom,slm,generic,amdfam10,bdver1,bdver2,bdver3,bdver4,
- btver2"
+ btver2,knl"
(const (symbol_ref "ix86_schedule")))
;; A basic instruction type. Refinements due to arguments to be
/* X86_TUNE_SCHEDULE: Enable scheduling. */
DEF_TUNE (X86_TUNE_SCHEDULE, "schedule",
m_PENT | m_PPRO | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL
- | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC)
+ | m_KNL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC)
/* X86_TUNE_PARTIAL_REG_DEPENDENCY: Enable more register renaming
on modern chips. Prefer stores affecting whole integer register
value over movb. */
DEF_TUNE (X86_TUNE_PARTIAL_REG_DEPENDENCY, "partial_reg_dependency",
m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL
- | m_AMD_MULTIPLE | m_GENERIC)
+ | m_KNL | m_AMD_MULTIPLE | m_GENERIC)
/* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: This knob promotes all store
destinations to be 128bit to allow register renaming on 128bit SSE units,
partial dependencies. */
DEF_TUNE (X86_TUNE_MOVX, "movx",
m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
- | m_INTEL | m_GEODE | m_AMD_MULTIPLE | m_GENERIC)
+ | m_KNL | m_INTEL | m_GEODE | m_AMD_MULTIPLE | m_GENERIC)
/* X86_TUNE_MEMORY_MISMATCH_STALL: Avoid partial stores that are followed by
full sized loads. */
DEF_TUNE (X86_TUNE_MEMORY_MISMATCH_STALL, "memory_mismatch_stall",
m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL
- | m_AMD_MULTIPLE | m_GENERIC)
+ | m_KNL | m_AMD_MULTIPLE | m_GENERIC)
/* X86_TUNE_FUSE_CMP_AND_BRANCH_32: Fuse compare with a subsequent
conditional jump instruction for 32 bit TARGET.
/* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
during reassociation of fp computation. */
DEF_TUNE (X86_TUNE_REASSOC_FP_TO_PARALLEL, "reassoc_fp_to_parallel",
- m_BONNELL | m_SILVERMONT | m_HASWELL | m_INTEL | m_BDVER1
+ m_BONNELL | m_SILVERMONT | m_HASWELL | m_KNL | m_INTEL | m_BDVER1
| m_BDVER2 | m_GENERIC)
/*****************************************************************************/
regression on mgrid due to an IRA limitation leading to unnecessary
use of the frame pointer in 32bit mode. */
DEF_TUNE (X86_TUNE_ACCUMULATE_OUTGOING_ARGS, "accumulate_outgoing_args",
- m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_INTEL
+ m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_KNL | m_INTEL
| m_ATHLON_K8)
/* X86_TUNE_PROLOGUE_USING_MOVE: Do not use push/pop in prologues that are
/* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
than 4 branch instructions in the 16 byte window. */
DEF_TUNE (X86_TUNE_FOUR_JUMP_LIMIT, "four_jump_limit",
- m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_INTEL |
+ m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_KNL | m_INTEL |
m_ATHLON_K8 | m_AMDFAM10)
/*****************************************************************************/
/* X86_TUNE_USE_INCDEC: Enable use of inc/dec instructions. */
DEF_TUNE (X86_TUNE_USE_INCDEC, "use_incdec",
~(m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL
- | m_GENERIC))
+ | m_KNL | m_GENERIC))
/* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
for DFmode copies */
DEF_TUNE (X86_TUNE_INTEGER_DFMODE_MOVES, "integer_dfmode_moves",
~(m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
- | m_INTEL | m_GEODE | m_AMD_MULTIPLE | m_GENERIC))
+ | m_KNL | m_INTEL | m_GEODE | m_AMD_MULTIPLE | m_GENERIC))
/* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
will impact LEA instruction selection. */
-DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_BONNELL | m_SILVERMONT | m_INTEL)
+DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_BONNELL | m_SILVERMONT | m_KNL
+ | m_INTEL)
/* X86_TUNE_AVOID_LEA_FOR_ADDR: Avoid lea for address computation. */
DEF_TUNE (X86_TUNE_AVOID_LEA_FOR_ADDR, "avoid_lea_for_addr",
- m_BONNELL | m_SILVERMONT)
+ m_BONNELL | m_SILVERMONT | m_KNL)
/* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
vector path on AMD machines.
/* X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE: Try to avoid memory operands for
a conditional move. */
DEF_TUNE (X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE, "avoid_mem_opnd_for_cmove",
- m_BONNELL | m_SILVERMONT | m_INTEL)
+ m_BONNELL | m_SILVERMONT | m_KNL | m_INTEL)
/* X86_TUNE_SINGLE_STRINGOP: Enable use of single string operations, such
as MOVS and STOS (without a REP prefix) to move/set sequences of bytes. */
/* X86_TUNE_USE_SAHF: Controls use of SAHF. */
DEF_TUNE (X86_TUNE_USE_SAHF, "use_sahf",
m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
- | m_INTEL | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER
- | m_GENERIC)
+ | m_KNL | m_INTEL | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER
+ | m_BTVER | m_GENERIC)
/* X86_TUNE_USE_CLTD: Controls use of CLTD and CTQO instructions. */
DEF_TUNE (X86_TUNE_USE_CLTD, "use_cltd",
- ~(m_PENT | m_BONNELL | m_SILVERMONT | m_INTEL | m_K6))
+ ~(m_PENT | m_BONNELL | m_SILVERMONT | m_KNL | m_INTEL | m_K6))
/* X86_TUNE_USE_BT: Enable use of BT (bit test) instructions. */
DEF_TUNE (X86_TUNE_USE_BT, "use_bt",
- m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL | m_AMD_MULTIPLE
- | m_GENERIC)
+ m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_KNL | m_INTEL
+ | m_AMD_MULTIPLE | m_GENERIC)
/*****************************************************************************/
/* 387 instruction selection tuning */
integer operand. */
DEF_TUNE (X86_TUNE_USE_SIMODE_FIOP, "use_simode_fiop",
~(m_PENT | m_PPRO | m_CORE_ALL | m_BONNELL | m_SILVERMONT
- | m_INTEL | m_AMD_MULTIPLE | m_GENERIC))
+ | m_KNL | m_INTEL | m_AMD_MULTIPLE | m_GENERIC))
/* X86_TUNE_USE_FFREEP: Use freep instruction instead of fstp. */
DEF_TUNE (X86_TUNE_USE_FFREEP, "use_ffreep", m_AMD_MULTIPLE)
/* X86_TUNE_EXT_80387_CONSTANTS: Use fancy 80387 constants, such as PI. */
DEF_TUNE (X86_TUNE_EXT_80387_CONSTANTS, "ext_80387_constants",
m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
- | m_INTEL | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC)
+ | m_KNL | m_INTEL | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC)
/*****************************************************************************/
/* SSE instruction selection tuning */
of a sequence loading registers by parts. */
DEF_TUNE (X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL, "sse_unaligned_load_optimal",
m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_AMDFAM10 | m_BDVER
- | m_BTVER | m_SILVERMONT | m_INTEL | m_GENERIC)
+ | m_BTVER | m_SILVERMONT | m_KNL | m_INTEL | m_GENERIC)
/* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL: Use movups for misaligned stores instead
of a sequence loading registers by parts. */
DEF_TUNE (X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL, "sse_unaligned_store_optimal",
m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_BDVER | m_SILVERMONT
- | m_INTEL | m_GENERIC)
+ | m_KNL | m_INTEL | m_GENERIC)
/* Use packed single precision instructions where possible, i.e. movups instead
of movupd. */
/* X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS: Try to split memory operand for
fp converts to destination register. */
DEF_TUNE (X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS, "split_mem_opnd_for_fp_converts",
- m_SILVERMONT | m_INTEL)
+ m_SILVERMONT | m_KNL | m_INTEL)
/* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
from FP to FP. This form of instructions avoids partial write to the
/* X86_TUNE_SLOW_PSHUFB: Indicates tunings with slow pshufb instruction. */
DEF_TUNE (X86_TUNE_SLOW_PSHUFB, "slow_pshufb",
- m_BONNELL | m_SILVERMONT | m_INTEL)
+ m_BONNELL | m_SILVERMONT | m_KNL | m_INTEL)
/* X86_TUNE_VECTOR_PARALLEL_EXECUTION: Indicates tunings with ability to
execute 2 or more vector instructions in parallel. */
+2014-12-10 Ilya Tocar <ilya.tocar@intel.com>
+
+ * gcc.target/i386/funcspec-5.c: Test avx512f and knl.
+
2014-12-10 Jakub Jelinek <jakub@redhat.com>

PR tree-optimization/62021
extern void test_tbm (void) __attribute__((__target__("tbm")));
extern void test_avx (void) __attribute__((__target__("avx")));
extern void test_avx2 (void) __attribute__((__target__("avx2")));
+extern void test_avx512f (void) __attribute__((__target__("avx512f")));
extern void test_no_abm (void) __attribute__((__target__("no-abm")));
extern void test_no_aes (void) __attribute__((__target__("no-aes")));
extern void test_no_tbm (void) __attribute__((__target__("no-tbm")));
extern void test_no_avx (void) __attribute__((__target__("no-avx")));
extern void test_no_avx2 (void) __attribute__((__target__("no-avx2")));
+extern void test_no_avx512f (void) __attribute__((__target__("no-avx512f")));
extern void test_arch_i386 (void) __attribute__((__target__("arch=i386")));
extern void test_arch_i486 (void) __attribute__((__target__("arch=i486")));
extern void test_arch_corei7 (void) __attribute__((__target__("arch=corei7")));
extern void test_arch_corei7_avx (void) __attribute__((__target__("arch=corei7-avx")));
extern void test_arch_core_avx2 (void) __attribute__((__target__("arch=core-avx2")));
+extern void test_arch_knl (void) __attribute__((__target__("arch=knl")));
extern void test_arch_geode (void) __attribute__((__target__("arch=geode")));
extern void test_arch_k6 (void) __attribute__((__target__("arch=k6")));
extern void test_arch_k6_2 (void) __attribute__((__target__("arch=k6-2")));