+2018-04-08  Monk Chiang  <sh.chiang04@gmail.com>
+
+       * config/nds32/constants.md (unspec_volatile_element): Add values for
+       TLB operation and data prefetch.
+       * config/nds32/nds32-intrinsic.c: Implementation of intrinsic
+       functions for TLB operation and data prefetch.
+       * config/nds32/nds32-intrinsic.md: Likewise.
+       * config/nds32/nds32_intrinsic.h: Likewise.
+       * config/nds32/nds32.c (nds32_dpref_names): Likewise.
+       (nds32_print_operand): Likewise.
+       * config/nds32/nds32.h (nds32_builtins): Likewise.
+
 2018-04-07  Thomas Koenig  <tkoenig@gcc.gnu.org>
        Andrew Pinski <pinskia@gcc.gnu.org>
 
 
   UNSPEC_VOLATILE_CCTL_VA_WBINVAL_LA
   UNSPEC_VOLATILE_CCTL_IDX_WBINVAL
   UNSPEC_VOLATILE_CCTL_VA_LCK
+  UNSPEC_VOLATILE_DPREF_QW
+  UNSPEC_VOLATILE_DPREF_HW
+  UNSPEC_VOLATILE_DPREF_W
+  UNSPEC_VOLATILE_DPREF_DW
+  UNSPEC_VOLATILE_TLBOP_TRD
+  UNSPEC_VOLATILE_TLBOP_TWR
+  UNSPEC_VOLATILE_TLBOP_RWR
+  UNSPEC_VOLATILE_TLBOP_RWLK
+  UNSPEC_VOLATILE_TLBOP_UNLK
+  UNSPEC_VOLATILE_TLBOP_PB
+  UNSPEC_VOLATILE_TLBOP_INV
+  UNSPEC_VOLATILE_TLBOP_FLUA
   UNSPEC_VOLATILE_RELAX_GROUP
   UNSPEC_VOLATILE_POP25_RETURN
 ])
 
   return target;
 }
 
+/* Expand builtins that take three operands and the third is immediate.
+   ICODE is the insn pattern to generate and EXP the CALL_EXPR being
+   expanded.  TARGET receives the result when RETURN_P is true; the insn
+   then has a leading output operand, shifting the inputs up by one slot.
+   NAME is the builtin's user-visible name, passed to the constant-
+   argument check (presumably for its diagnostic).  Returns NULL_RTX if
+   the third argument is not constant or the pattern fails to expand.  */
+static rtx
+nds32_expand_triopimm_builtin (enum insn_code icode, tree exp, rtx target,
+                              bool return_p, const char *name)
+{
+  rtx pat;
+  rtx op0 = nds32_read_argument (exp, 0);
+  rtx op1 = nds32_read_argument (exp, 1);
+  rtx op2 = nds32_read_argument (exp, 2);
+  /* With a return value, insn operand 0 is the output, so the three
+     inputs occupy operand slots 1..3 instead of 0..2.  */
+  int op0_num = return_p ? 1 : 0;
+  int op1_num = return_p ? 2 : 1;
+  int op2_num = return_p ? 3 : 2;
+
+  if (return_p)
+    target = nds32_legitimize_target (icode, target);
+
+  /* The third argument must be a compile-time constant; give up (after
+     the check routine has reported it) otherwise.  */
+  if (!nds32_check_constant_argument (icode, op2_num, op2, name))
+    return NULL_RTX;
+
+  op0 = nds32_legitimize_argument (icode, op0_num, op0);
+  op1 = nds32_legitimize_argument (icode, op1_num, op1);
+  op2 = nds32_legitimize_argument (icode, op2_num, op2);
+
+  /* Emit and return the new instruction.  */
+  if (return_p)
+    pat = GEN_FCN (icode) (target, op0, op1, op2);
+  else
+    pat = GEN_FCN (icode) (op0, op1, op2);
+
+  if (! pat)
+    return NULL_RTX;
+
+  emit_insn (pat);
+  return target;
+}
+
 /* Expand builtins for load.  */
 static rtx
 nds32_expand_builtin_load (enum insn_code icode, tree exp, rtx target)
   NDS32_BUILTIN(clzsi2, "clz", CLZ)
   NDS32_BUILTIN(unspec_clo, "clo", CLO)
   NDS32_BUILTIN(unspec_wsbh, "wsbh", WSBH)
+  NDS32_BUILTIN(unspec_tlbop_pb, "tlbop_pb",TLBOP_PB)
   NDS32_BUILTIN(unaligned_load_hw, "unaligned_load_hw", UALOAD_HW)
   NDS32_BUILTIN(unaligned_loadsi, "unaligned_load_w", UALOAD_W)
   NDS32_BUILTIN(unaligned_loaddi, "unaligned_load_dw", UALOAD_DW)
   NDS32_NO_TARGET_BUILTIN(unspec_jral_ton, "jral_ton", JRAL_TON)
   NDS32_NO_TARGET_BUILTIN(unspec_ret_toff, "ret_toff", RET_TOFF)
   NDS32_NO_TARGET_BUILTIN(unspec_jral_iton, "jral_iton",JRAL_ITON)
+  NDS32_NO_TARGET_BUILTIN(unspec_tlbop_trd, "tlbop_trd", TLBOP_TRD)
+  NDS32_NO_TARGET_BUILTIN(unspec_tlbop_twr, "tlbop_twr", TLBOP_TWR)
+  NDS32_NO_TARGET_BUILTIN(unspec_tlbop_rwr, "tlbop_rwr", TLBOP_RWR)
+  NDS32_NO_TARGET_BUILTIN(unspec_tlbop_rwlk, "tlbop_rwlk", TLBOP_RWLK)
+  NDS32_NO_TARGET_BUILTIN(unspec_tlbop_unlk, "tlbop_unlk", TLBOP_UNLK)
+  NDS32_NO_TARGET_BUILTIN(unspec_tlbop_inv, "tlbop_inv", TLBOP_INV)
   NDS32_NO_TARGET_BUILTIN(unspec_ret_itoff, "ret_itoff", RET_ITOFF)
   NDS32_NO_TARGET_BUILTIN(unspec_set_current_sp,
                          "set_current_sp", SET_CURRENT_SP)
   NDS32_NO_TARGET_BUILTIN(bsp, "bsp", BSP)
 };
 
+/* Three-argument intrinsics with an immediate third argument.
+   These are expanded through nds32_expand_triopimm_builtin, which
+   verifies that the third argument is a compile-time constant.  */
+static struct builtin_description bdesc_3argimm[] =
+{
+  NDS32_NO_TARGET_BUILTIN(prefetch_qw, "prefetch_qw", DPREF_QW)
+  NDS32_NO_TARGET_BUILTIN(prefetch_hw, "prefetch_hw", DPREF_HW)
+  NDS32_NO_TARGET_BUILTIN(prefetch_w, "prefetch_w", DPREF_W)
+  NDS32_NO_TARGET_BUILTIN(prefetch_dw, "prefetch_dw", DPREF_DW)
+};
+
 /* Intrinsics that load a value.  */
 static struct builtin_description bdesc_load[] =
 {
     case NDS32_BUILTIN_SCHE_BARRIER:
       emit_insn (gen_blockage ());
       return target;
+    case NDS32_BUILTIN_TLBOP_FLUA:
+      emit_insn (gen_unspec_tlbop_flua ());
+      return target;
     case NDS32_BUILTIN_SCW:
       return nds32_expand_scw_builtin (CODE_FOR_unspec_volatile_scw,
                                       exp, target);
     if (d->code == fcode)
       return nds32_expand_triop_builtin (d->icode, exp, target, d->return_p);
 
+  for (i = 0, d = bdesc_3argimm; i < ARRAY_SIZE (bdesc_3argimm); i++, d++)
+    if (d->code == fcode)
+      return nds32_expand_triopimm_builtin (d->icode, exp, target,
+                                           d->return_p, d->name);
+
   for (i = 0, d = bdesc_load; i < ARRAY_SIZE (bdesc_load); i++, d++)
     if (d->code == fcode)
       return nds32_expand_builtin_load (d->icode, exp, target);
   ADD_NDS32_BUILTIN3 ("cctl_idx_write", void, integer, unsigned, unsigned,
                      CCTL_IDX_WRITE);
 
+  /* PREFETCH  */
+  ADD_NDS32_BUILTIN3 ("dpref_qw", void, ptr_uchar, unsigned, integer, DPREF_QW);
+  ADD_NDS32_BUILTIN3 ("dpref_hw", void, ptr_ushort, unsigned, integer,
+                     DPREF_HW);
+  ADD_NDS32_BUILTIN3 ("dpref_w", void, ptr_uint, unsigned, integer, DPREF_W);
+  ADD_NDS32_BUILTIN3 ("dpref_dw", void, ptr_ulong, unsigned, integer, DPREF_DW);
+
   /* Performance Extension  */
   ADD_NDS32_BUILTIN1 ("pe_abs", integer, integer, ABS);
   ADD_NDS32_BUILTIN2 ("pe_ave", integer, integer, integer, AVE);
 
   /* Schedule Barrier */
   ADD_NDS32_BUILTIN0 ("schedule_barrier", void, SCHE_BARRIER);
+
+  /* TLBOP  */
+  ADD_NDS32_BUILTIN1 ("tlbop_trd", void, unsigned, TLBOP_TRD);
+  ADD_NDS32_BUILTIN1 ("tlbop_twr", void, unsigned, TLBOP_TWR);
+  ADD_NDS32_BUILTIN1 ("tlbop_rwr", void, unsigned, TLBOP_RWR);
+  ADD_NDS32_BUILTIN1 ("tlbop_rwlk", void, unsigned, TLBOP_RWLK);
+  ADD_NDS32_BUILTIN1 ("tlbop_unlk", void, unsigned, TLBOP_UNLK);
+  ADD_NDS32_BUILTIN1 ("tlbop_pb", unsigned, unsigned, TLBOP_PB);
+  ADD_NDS32_BUILTIN1 ("tlbop_inv", void, unsigned, TLBOP_INV);
+  ADD_NDS32_BUILTIN0 ("tlbop_flua", void, TLBOP_FLUA);
+
   /* Unaligned Load/Store  */
   ADD_NDS32_BUILTIN1 ("unaligned_load_hw", short_unsigned, ptr_ushort,
                      UALOAD_HW);
 
   [(set_attr "type" "mmu")]
 )
 
+;; PREFETCH
+
+;; Data-prefetch hint instructions.  Operand 0 is the base address
+;; register, operand 1 the index (shifted by the access size for the
+;; register forms), and operand 2 an immediate subtype selector printed
+;; by the 'Z' operand modifier (see nds32_dpref_names in nds32.c).
+
+(define_insn "prefetch_qw"
+  [(unspec_volatile:QI [(match_operand:SI 0 "register_operand" "r")
+                       (match_operand:SI 1 "nonmemory_operand" "r")
+                       (match_operand:SI 2 "immediate_operand" "i")] UNSPEC_VOLATILE_DPREF_QW)]
+  ""
+  "dpref\t%Z2, [%0 + %1]"
+  [(set_attr "type" "misc")]
+)
+
+(define_insn "prefetch_hw"
+  [(unspec_volatile:HI [(match_operand:SI 0 "register_operand" "r")
+                       (match_operand:SI 1 "nonmemory_operand" "r")
+                       (match_operand:SI 2 "immediate_operand" "i")] UNSPEC_VOLATILE_DPREF_HW)]
+  ""
+  "dpref\t%Z2, [%0 + (%1<<1)]"
+  [(set_attr "type" "misc")]
+)
+
+;; Word and double-word prefetch have an extra alternative: a small
+;; immediate index (Is15) selects the dprefi.{w,d} immediate form.
+
+(define_insn "prefetch_w"
+  [(unspec_volatile:SI [(match_operand:SI 0 "register_operand" "    r, r")
+                       (match_operand:SI 1 "nonmemory_operand" "Is15, r")
+                       (match_operand:SI 2 "immediate_operand" "   i, i")] UNSPEC_VOLATILE_DPREF_W)]
+  ""
+  "@
+  dprefi.w\t%Z2, [%0 + %1]
+  dpref\t%Z2, [%0 + (%1<<2)]"
+  [(set_attr "type" "misc")]
+)
+
+(define_insn "prefetch_dw"
+  [(unspec_volatile:DI [(match_operand:SI 0 "register_operand"  "   r, r")
+                       (match_operand:SI 1 "nonmemory_operand" "Is15, r")
+                       (match_operand:SI 2 "immediate_operand" "   i, i")] UNSPEC_VOLATILE_DPREF_DW)]
+  ""
+  "@
+  dprefi.d\t%Z2, [%0 + %1]
+  dpref\t%Z2, [%0 + (%1<<3)]"
+  [(set_attr "type" "misc")]
+)
 
 ;; Performance Extension
 
   [(set_attr "type"    "alu")
    (set_attr "length"    "4")]
 )
+
+;; TLBOP Intrinsic
+
+;; TLB maintenance operations.  Each takes a source register (the
+;; address or entry to operate on) except FLUA, which takes no operand;
+;; PB is the only one that produces a result register.
+
+(define_insn "unspec_tlbop_trd"
+  [(unspec_volatile:SI [(match_operand:SI 0 "register_operand" "r")] UNSPEC_VOLATILE_TLBOP_TRD)]
+  ""
+  "tlbop\t%0, TRD"
+  [(set_attr "type" "mmu")]
+)
+
+(define_insn "unspec_tlbop_twr"
+  [(unspec_volatile:SI [(match_operand:SI 0 "register_operand" "r")] UNSPEC_VOLATILE_TLBOP_TWR)]
+  ""
+  "tlbop\t%0, TWR"
+  [(set_attr "type" "mmu")]
+)
+
+(define_insn "unspec_tlbop_rwr"
+  [(unspec_volatile:SI [(match_operand:SI 0 "register_operand" "r")] UNSPEC_VOLATILE_TLBOP_RWR)]
+  ""
+  "tlbop\t%0, RWR"
+  [(set_attr "type" "mmu")]
+)
+
+(define_insn "unspec_tlbop_rwlk"
+  [(unspec_volatile:SI [(match_operand:SI 0 "register_operand" "r")] UNSPEC_VOLATILE_TLBOP_RWLK)]
+  ""
+  "tlbop\t%0, RWLK"
+  [(set_attr "type" "mmu")]
+)
+
+(define_insn "unspec_tlbop_unlk"
+  [(unspec_volatile:SI [(match_operand:SI 0 "register_operand" "r")] UNSPEC_VOLATILE_TLBOP_UNLK)]
+  ""
+  "tlbop\t%0, UNLK"
+  [(set_attr "type" "mmu")]
+)
+
+;; Probe: writes the result into operand 0.
+(define_insn "unspec_tlbop_pb"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+       (unspec_volatile:SI [(match_operand:SI 1 "register_operand" "r")] UNSPEC_VOLATILE_TLBOP_PB))]
+  ""
+  "tlbop\t%0, %1, PB"
+  [(set_attr "type" "mmu")]
+)
+
+(define_insn "unspec_tlbop_inv"
+  [(unspec_volatile:SI [(match_operand:SI 0 "register_operand" "r")] UNSPEC_VOLATILE_TLBOP_INV)]
+  ""
+  "tlbop\t%0, INV"
+  [(set_attr "type" "mmu")]
+)
+
+;; Flush-all: no operands.
+(define_insn "unspec_tlbop_flua"
+  [(unspec_volatile:SI [(const_int 0)] UNSPEC_VOLATILE_TLBOP_FLUA)]
+  ""
+  "tlbop\tFLUA"
+  [(set_attr "type" "mmu")]
+)
+
 ;;Unaligned Load/Store
 
 (define_expand "unaligned_load_hw"
 
   "L1I_IX_WWD"
 };
 
+/* Textual names of the data-prefetch subtypes, indexed by the
+   enum nds32_dpref values from nds32_intrinsic.h and printed by the
+   'Z' operand modifier in nds32_print_operand.  */
+static const char * const nds32_dpref_names[] =
+{
+  "SRD",
+  "MRD",
+  "SWR",
+  "MWR",
+  "PTE",
+  "CLWR"
+};
+
 /* Defining register allocation order for performance.
    We want to allocate callee-saved registers after others.
    It may be used by nds32_adjust_reg_alloc_order().  */
       fprintf (stream, "%s", nds32_cctl_names[op_value + 16]);
       return;
 
+    case 'Z': /* dpref  */
+      fprintf (stream, "%s", nds32_dpref_names[op_value]);
+      return;
+
     default :
       /* Unknown flag.  */
       output_operand_lossage ("invalid operand output code");
 
   NDS32_BUILTIN_CCTL_L1D_INVALALL,
   NDS32_BUILTIN_CCTL_L1D_WBALL_ALVL,
   NDS32_BUILTIN_CCTL_L1D_WBALL_ONE_LVL,
+  NDS32_BUILTIN_DPREF_QW,
+  NDS32_BUILTIN_DPREF_HW,
+  NDS32_BUILTIN_DPREF_W,
+  NDS32_BUILTIN_DPREF_DW,
+  NDS32_BUILTIN_TLBOP_TRD,
+  NDS32_BUILTIN_TLBOP_TWR,
+  NDS32_BUILTIN_TLBOP_RWR,
+  NDS32_BUILTIN_TLBOP_RWLK,
+  NDS32_BUILTIN_TLBOP_UNLK,
+  NDS32_BUILTIN_TLBOP_PB,
+  NDS32_BUILTIN_TLBOP_INV,
+  NDS32_BUILTIN_TLBOP_FLUA,
   NDS32_BUILTIN_UALOAD_HW,
   NDS32_BUILTIN_UALOAD_W,
   NDS32_BUILTIN_UALOAD_DW,
 
   __NDS32_CCTL_L1I_IX_WWD__
 };
 
+/* Subtype selector values for the __nds32__dpref_* data-prefetch
+   intrinsics; must stay in sync with nds32_dpref_names in nds32.c.  */
+enum nds32_dpref
+{
+  __NDS32_DPREF_SRD__,
+  __NDS32_DPREF_MRD__,
+  __NDS32_DPREF_SWR__,
+  __NDS32_DPREF_MWR__,
+  __NDS32_DPREF_PTE__,
+  __NDS32_DPREF_CLWR__
+};
+
 /* ------------------------------------------------------------------------ */
 
 /* Define intrinsic register name macro for compatibility.  */
 #define NDS32_CCTL_L1D_IX_WWD           __NDS32_CCTL_L1D_IX_WWD__
 #define NDS32_CCTL_L1I_IX_WTAG          __NDS32_CCTL_L1I_IX_WTAG__
 #define NDS32_CCTL_L1I_IX_WWD           __NDS32_CCTL_L1I_IX_WWD__
+
+/* Undecorated compatibility aliases for the data-prefetch subtype
+   constants above.  */
+#define NDS32_DPREF_SRD                 __NDS32_DPREF_SRD__
+#define NDS32_DPREF_MRD                 __NDS32_DPREF_MRD__
+#define NDS32_DPREF_SWR                 __NDS32_DPREF_SWR__
+#define NDS32_DPREF_MWR                 __NDS32_DPREF_MWR__
+#define NDS32_DPREF_PTE                 __NDS32_DPREF_PTE__
+#define NDS32_DPREF_CLWR                __NDS32_DPREF_CLWR__
+
 /* ------------------------------------------------------------------------ */
 
 
   (__builtin_nds32_svs ((a), (b)))
 #define __nds32__sva(a, b) \
   (__builtin_nds32_sva ((a), (b)))
+/* Data-prefetch intrinsics: hint at address A + index B.  SUBTYPE must
+   be a compile-time constant, one of the enum nds32_dpref values.  */
+#define __nds32__dpref_qw(a, b, subtype) \
+  (__builtin_nds32_dpref_qw ((a), (b), (subtype)))
+#define __nds32__dpref_hw(a, b, subtype) \
+  (__builtin_nds32_dpref_hw ((a), (b), (subtype)))
+#define __nds32__dpref_w(a, b, subtype) \
+  (__builtin_nds32_dpref_w ((a), (b), (subtype)))
+#define __nds32__dpref_dw(a, b, subtype) \
+  (__builtin_nds32_dpref_dw ((a), (b), (subtype)))
 
 #define __nds32__teqz(a, swid) \
   (__builtin_nds32_teqz ((a), (swid)))
 #define __nds32__fmfcfg() \
   (__builtin_nds32_fmfcfg())
 
+/* TLB operation intrinsics.  All are void except __nds32__tlbop_pb,
+   which returns the probe result.  */
+#define __nds32__tlbop_trd(a) \
+  (__builtin_nds32_tlbop_trd ((a)))
+#define __nds32__tlbop_twr(a) \
+  (__builtin_nds32_tlbop_twr ((a)))
+#define __nds32__tlbop_rwr(a) \
+  (__builtin_nds32_tlbop_rwr ((a)))
+#define __nds32__tlbop_rwlk(a) \
+  (__builtin_nds32_tlbop_rwlk ((a)))
+#define __nds32__tlbop_unlk(a) \
+  (__builtin_nds32_tlbop_unlk ((a)))
+#define __nds32__tlbop_pb(a) \
+  (__builtin_nds32_tlbop_pb ((a)))
+#define __nds32__tlbop_inv(a) \
+  (__builtin_nds32_tlbop_inv ((a)))
+#define __nds32__tlbop_flua() \
+  (__builtin_nds32_tlbop_flua ())
+
 #endif /* nds32_intrinsic.h */