From e22862689667e739d308140d72e8cc85ed116a2f Mon Sep 17 00:00:00 2001
From: Monk Chiang <sh.chiang04@gmail.com>
Date: Fri, 6 Apr 2018 05:51:33 +0000
Subject: [PATCH] [NDS32] Add hard float support.

gcc/
	* config.gcc (nds32*-*-*): Add v2j v3f v3s checking.
	(nds32*-*-*): Add float and fpu_config into supported_defaults.
	* common/config/nds32/nds32-common.c (TARGET_DEFAULT_TARGET_FLAGS):
	Include TARGET_DEFAULT_FPU_ISA and TARGET_DEFAULT_FPU_FMA.
	* config/nds32/constants.md (unspec_element): Add UNSPEC_COPYSIGN,
	UNSPEC_FCPYNSD, UNSPEC_FCPYNSS, UNSPEC_FCPYSD and UNSPEC_FCPYSS.
	* config/nds32/constraints.md: New constraints and checking for hard
	float configuration.
	* config/nds32/iterators.md: New mode iterator and attribute for hard
	float configuration.
	* config/nds32/nds32-doubleword.md: Use hard float alternatives and
	patterns.
	* config/nds32/nds32-fpu.md: New file.
	* config/nds32/nds32-md-auxiliary.c: New functions and checkings to
	deal with hard float code generation.
	* config/nds32/nds32-opts.h (nds32_arch_type): Add ARCH_V3F and
	ARCH_V3S.
	(abi_type, float_reg_number): New enum type.
	* config/nds32/nds32-predicates.c: New predicates for hard float.
	* config/nds32/nds32-protos.h: Declare functions for hard float.
	* config/nds32/nds32.c: Implementation for hard float configuration.
	* config/nds32/nds32.h: Definitions for hard float configuration.
	* config/nds32/nds32.md: Include hard float machine description and
	modify patterns for hard float configuration.
	* config/nds32/nds32.opt: New options for hard float configuration.
	* config/nds32/predicates.md: New predicates for hard float
	configuration.

Co-Authored-By: Chung-Ju Wu <jasonwucj@gmail.com>

From-SVN: r259161
---
 gcc/ChangeLog                          |   31 +
 gcc/common/config/nds32/nds32-common.c |    2 +
 gcc/config.gcc                         |   42 +-
 gcc/config/nds32/constants.md          |    5 +
 gcc/config/nds32/constraints.md        |   40 +-
 gcc/config/nds32/iterators.md          |    6 +-
 gcc/config/nds32/nds32-doubleword.md   |  211 ++---
 gcc/config/nds32/nds32-fpu.md          |  503 ++++++++++++
 gcc/config/nds32/nds32-md-auxiliary.c  |  684 +++++++++++++++-
 gcc/config/nds32/nds32-opts.h          |   24 +-
 gcc/config/nds32/nds32-predicates.c    |   67 ++
 gcc/config/nds32/nds32-protos.h        |   30 +
 gcc/config/nds32/nds32.c               | 1019 +++++++++++++++++++++---
 gcc/config/nds32/nds32.h               |  268 ++++++-
 gcc/config/nds32/nds32.md              |  114 ++-
 gcc/config/nds32/nds32.opt             |   75 ++
 gcc/config/nds32/predicates.md         |   21 +
 17 files changed, 2826 insertions(+), 316 deletions(-)
 create mode 100644 gcc/config/nds32/nds32-fpu.md

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 5c49936fb19..979450f612c 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,34 @@
+2018-04-06  Monk Chiang  <sh.chiang04@gmail.com>
+	    Chung-Ju Wu  <jasonwucj@gmail.com>
+
+	* config.gcc (nds32*-*-*): Add v2j v3f v3s checking.
+	(nds32*-*-*): Add float and fpu_config into supported_defaults.
+	* common/config/nds32/nds32-common.c (TARGET_DEFAULT_TARGET_FLAGS):
+	Include TARGET_DEFAULT_FPU_ISA and TARGET_DEFAULT_FPU_FMA.
+	* config/nds32/constants.md (unspec_element): Add UNSPEC_COPYSIGN,
+	UNSPEC_FCPYNSD, UNSPEC_FCPYNSS, UNSPEC_FCPYSD and UNSPEC_FCPYSS.
+	* config/nds32/constraints.md: New constraints and checking for hard
+	float configuration.
+	* config/nds32/iterators.md: New mode iterator and attribute for hard
+	float configuration.
+	* config/nds32/nds32-doubleword.md: Use hard float alternatives and
+	patterns.
+	* config/nds32/nds32-fpu.md: New file.
+	* config/nds32/nds32-md-auxiliary.c: New functions and checkings to
+	deal with hard float code generation.
+	* config/nds32/nds32-opts.h (nds32_arch_type): Add ARCH_V3F and
+	ARCH_V3S.
+	(abi_type, float_reg_number): New enum type.
+	* config/nds32/nds32-predicates.c: New predicates for hard float.
+	* config/nds32/nds32-protos.h: Declare functions for hard float.
+	* config/nds32/nds32.c: Implementation for hard float configuration.
+	* config/nds32/nds32.h: Definitions for hard float configuration.
+	* config/nds32/nds32.md: Include hard float machine description and
+	modify patterns for hard float configuration.
+	* config/nds32/nds32.opt: New options for hard float configuration.
+	* config/nds32/predicates.md: New predicates for hard float
+	configuration.
+
 2018-04-06  Kuan-Lin Chen  <kuanlinchentw@gmail.com>
 
 	* common/config/nds32/nds32-common.c
diff --git a/gcc/common/config/nds32/nds32-common.c b/gcc/common/config/nds32/nds32-common.c
index 8d3c9b2ea96..dbcc390728e 100644
--- a/gcc/common/config/nds32/nds32-common.c
+++ b/gcc/common/config/nds32/nds32-common.c
@@ -107,6 +107,8 @@ static const struct default_options nds32_option_optimization_table[] =
 #undef TARGET_DEFAULT_TARGET_FLAGS
 #define TARGET_DEFAULT_TARGET_FLAGS		\
   (TARGET_CPU_DEFAULT				\
+   | TARGET_DEFAULT_FPU_ISA			\
+   | TARGET_DEFAULT_FPU_FMA			\
    | MASK_16_BIT				\
    | MASK_EXT_PERF				\
    | MASK_EXT_PERF2				\
diff --git a/gcc/config.gcc b/gcc/config.gcc
index b8a9877b432..75d0ae815ff 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -4278,15 +4278,26 @@ case "${target}" in
 		;;
 
 	nds32*-*-*)
-		supported_defaults="arch cpu nds32_lib"
+		supported_defaults="arch cpu nds32_lib float fpu_config"
 
 		# process --with-arch
 		case "${with_arch}" in
-		"" | v2 | v3 | v3m)
+		"" | v3 )
+			tm_defines="${tm_defines} TARGET_ARCH_DEFAULT=0"
+			;;
+		v2 | v2j | v3m)
 			# OK
+			tm_defines="${tm_defines} TARGET_ARCH_DEFAULT=0"
+			;;
+		v3f)
+			tm_defines="${tm_defines} TARGET_ARCH_DEFAULT=1"
+			;;
+		v3s)
+			tm_defines="${tm_defines} TARGET_ARCH_DEFAULT=2"
+
 			;;
 		*)
-			echo "Cannot accept --with-arch=$with_arch, available values are: v2 v3 v3m" 1>&2
+			echo "Cannot accept --with-arch=$with_arch, available values are: v2 v2j v3 v3m v3f v3s" 1>&2
 			exit 1
 			;;
 		esac
@@ -4321,8 +4332,31 @@ case "${target}" in
 			exit 1
 			;;
 		esac
-		;;
 
+		# process --with-float
+		case "${with_float}" in
+		"" | soft | hard)
+			# OK
+			;;
+		*)
+			echo "Cannot accept --with-float=$with_float, available values are: soft hard" 1>&2
+			exit 1
+			;;
+		esac
+
+		# process --with-config-fpu (only FPU configurations 0-3 are accepted)
+		case "${with_config_fpu}" in
+		"" | 0 | 1 | 2 | 3)
+			# OK
+			;;
+		*)
+			echo "Cannot accept --with-config-fpu=$with_config_fpu, available values are: 0 1 2 3" 1>&2
+			exit 1
+			;;
+		esac
+
+
+		;;
 	nios2*-*-*)
 		supported_defaults="arch"
 			case "$with_arch" in
diff --git a/gcc/config/nds32/constants.md b/gcc/config/nds32/constants.md
index 77fb71c3420..7c706eb7da0 100644
--- a/gcc/config/nds32/constants.md
+++ b/gcc/config/nds32/constants.md
@@ -32,6 +32,11 @@
 
 ;; The unpec operation index.
 (define_c_enum "unspec_element" [
+  UNSPEC_COPYSIGN
+  UNSPEC_FCPYNSD
+  UNSPEC_FCPYNSS
+  UNSPEC_FCPYSD
+  UNSPEC_FCPYSS
   UNSPEC_FFB
   UNSPEC_FFMISM
   UNSPEC_FLMISM
diff --git a/gcc/config/nds32/constraints.md b/gcc/config/nds32/constraints.md
index 7cf18eb5533..7af7769fcbf 100644
--- a/gcc/config/nds32/constraints.md
+++ b/gcc/config/nds32/constraints.md
@@ -53,6 +53,10 @@
 (define_register_constraint "x" "FRAME_POINTER_REG"
   "Frame pointer register $fp")
 
+(define_register_constraint "f"
+  "(TARGET_FPU_SINGLE || TARGET_FPU_DOUBLE) ? FP_REGS : NO_REGS"
+ "The Floating point registers $fs0 ~ $fs31")
+
 (define_constraint "Iv00"
   "Constant value 0"
   (and (match_code "const_int")
@@ -108,6 +112,11 @@
   (and (match_code "const_int")
        (match_test "ival < (1 << 4) && ival >= -(1 << 4)")))
 
+(define_constraint "Cs05"
+  "Signed immediate 5-bit value"
+  (and (match_code "const_double")
+       (match_test "nds32_const_double_range_ok_p (op, SFmode, -(1 << 4), (1 << 4))")))
+
 (define_constraint "Iu05"
   "Unsigned immediate 5-bit value"
   (and (match_code "const_int")
@@ -246,12 +255,21 @@
   (and (match_code "const_int")
        (match_test "ival < (1 << 19) && ival >= -(1 << 19)")))
 
+(define_constraint "Cs20"
+  "Signed immediate 20-bit value"
+  (and (match_code "const_double")
+       (match_test "nds32_const_double_range_ok_p (op, SFmode, -(1 << 19), (1 << 19))")))
 
 (define_constraint "Ihig"
   "The immediate value that can be simply set high 20-bit"
   (and (match_code "const_int")
        (match_test "(ival != 0) && ((ival & 0xfff) == 0)")))
 
+(define_constraint "Chig"
+  "The immediate value that can be simply set high 20-bit"
+  (and (match_code "high")
+       (match_test "GET_CODE (XEXP (op, 0)) == CONST_DOUBLE")))
+
 (define_constraint "Izeb"
   "The immediate value 0xff"
   (and (match_code "const_int")
@@ -296,25 +314,39 @@
   "Memory constraint for 45 format"
   (and (match_code "mem")
        (match_test "(nds32_mem_format (op) == ADDRESS_REG)
-		    && (GET_MODE (op) == SImode)")))
+		    && ((GET_MODE (op) == SImode)
+		       || (GET_MODE (op) == SFmode))")))
 
 (define_memory_constraint "Ufe"
   "Memory constraint for fe format"
   (and (match_code "mem")
        (match_test "nds32_mem_format (op) == ADDRESS_R8_IMM7U
-		    && (GET_MODE (op) == SImode)")))
+		    && (GET_MODE (op) == SImode
+			|| GET_MODE (op) == SFmode)")))
 
 (define_memory_constraint "U37"
   "Memory constraint for 37 format"
   (and (match_code "mem")
        (match_test "(nds32_mem_format (op) == ADDRESS_SP_IMM7U
 		    || nds32_mem_format (op) == ADDRESS_FP_IMM7U)
-		    && (GET_MODE (op) == SImode)")))
-
+		    && (GET_MODE (op) == SImode
+			|| GET_MODE (op) == SFmode)")))
 
 (define_memory_constraint "Umw"
   "Memory constraint for lwm/smw"
   (and (match_code "mem")
        (match_test "nds32_valid_smw_lwm_base_p (op)")))
 
+(define_memory_constraint "Da"
+  "Memory constraint for non-offset loads/stores"
+  (and (match_code "mem")
+       (match_test "REG_P (XEXP (op, 0))
+		    || (GET_CODE (XEXP (op, 0)) == POST_INC)")))
+
+(define_memory_constraint "Q"
+  "Memory constraint for no symbol_ref and const"
+  (and (match_code "mem")
+       (match_test "(TARGET_FPU_SINGLE || TARGET_FPU_DOUBLE)
+		     && nds32_float_mem_operand_p (op)")))
+
 ;; ------------------------------------------------------------------------
diff --git a/gcc/config/nds32/iterators.md b/gcc/config/nds32/iterators.md
index e0798ff812f..c2062de2e97 100644
--- a/gcc/config/nds32/iterators.md
+++ b/gcc/config/nds32/iterators.md
@@ -45,11 +45,15 @@
 (define_mode_iterator VSQIHIDI [V4QI V2HI QI HI DI])
 (define_mode_iterator VQIHIDI [V4QI V2HI DI])
 
+;; A list of the floating-point modes enabled by the FPU configuration.
+(define_mode_iterator ANYF [(SF "TARGET_FPU_SINGLE")
+			    (DF "TARGET_FPU_DOUBLE")])
+
 ;;----------------------------------------------------------------------------
 ;; Mode attributes.
 ;;----------------------------------------------------------------------------
 
-(define_mode_attr size [(QI "b") (HI "h") (SI "w")])
+(define_mode_attr size [(QI "b") (HI "h") (SI "w") (SF "s") (DF "d")])
 
 (define_mode_attr byte [(QI "1") (HI "2") (SI "4") (V4QI "4") (V2HI "4")])
 
diff --git a/gcc/config/nds32/nds32-doubleword.md b/gcc/config/nds32/nds32-doubleword.md
index 0e4b0dc4fb6..7df715a771f 100644
--- a/gcc/config/nds32/nds32-doubleword.md
+++ b/gcc/config/nds32/nds32-doubleword.md
@@ -46,145 +46,77 @@
 
 
 (define_insn "move_<mode>"
-  [(set (match_operand:DIDF 0 "nonimmediate_operand" "=r, r, r, m")
-	(match_operand:DIDF 1 "general_operand"      " r, i, m, r"))]
+  [(set (match_operand:DIDF 0 "nonimmediate_operand" "=r, r,  r, r, Da, m, f, Q, f, *r, *f")
+	(match_operand:DIDF 1 "general_operand"      " r, i, Da, m,  r, r, Q, f, f, *f, *r"))]
   "register_operand(operands[0], <MODE>mode)
    || register_operand(operands[1], <MODE>mode)"
 {
-  rtx addr;
-  rtx otherops[5];
-
   switch (which_alternative)
     {
     case 0:
       return "movd44\t%0, %1";
-
     case 1:
       /* reg <- const_int, we ask gcc to split instruction.  */
       return "#";
-
     case 2:
-      /* Refer to nds32_legitimate_address_p() in nds32.c,
-         we only allow "reg", "symbol_ref", "const", and "reg + const_int"
-         as address rtx for DImode/DFmode memory access.  */
-      addr = XEXP (operands[1], 0);
-
-      otherops[0] = gen_rtx_REG (SImode, REGNO (operands[0]));
-      otherops[1] = gen_rtx_REG (SImode, REGNO (operands[0]) + 1);
-      otherops[2] = addr;
-
-      if (REG_P (addr))
-	{
-	  /* (reg) <- (mem (reg)) */
-	  output_asm_insn ("lmw.bi\t%0, [%2], %1, 0", otherops);
-	}
-      else if (GET_CODE (addr) == PLUS)
-	{
-	  /* (reg) <- (mem (plus (reg) (const_int))) */
-	  rtx op0 = XEXP (addr, 0);
-	  rtx op1 = XEXP (addr, 1);
-
-	  if (REG_P (op0))
-	    {
-	      otherops[2] = op0;
-	      otherops[3] = op1;
-	      otherops[4] = gen_int_mode (INTVAL (op1) + 4, SImode);
-	    }
-	  else
-	    {
-	      otherops[2] = op1;
-	      otherops[3] = op0;
-	      otherops[4] = gen_int_mode (INTVAL (op0) + 4, SImode);
-	    }
-
-	  /* To avoid base overwrite when REGNO(%0) == REGNO(%2).  */
-	  if (REGNO (otherops[0]) != REGNO (otherops[2]))
-	    {
-	      output_asm_insn ("lwi\t%0, [%2 + (%3)]", otherops);
-	      output_asm_insn ("lwi\t%1, [%2 + (%4)]", otherops);
-	    }
-	  else
-	    {
-	      output_asm_insn ("lwi\t%1, [%2 + (%4)]", otherops);
-	      output_asm_insn ("lwi\t%0,[ %2 + (%3)]", otherops);
-	    }
-	}
-      else
-	{
-	  /* (reg) <- (mem (symbol_ref ...))
-	     (reg) <- (mem (const ...)) */
-	  output_asm_insn ("lwi.gp\t%0, [ + %2]", otherops);
-	  output_asm_insn ("lwi.gp\t%1, [ + %2 + 4]", otherops);
-	}
-
-      /* We have already used output_asm_insn() by ourself,
-         so return an empty string.  */
-      return "";
-
+      /* The memory format is (mem (reg)),
+	 we can generate 'lmw.bi' instruction.  */
+      return nds32_output_double (operands, true);
     case 3:
-      /* Refer to nds32_legitimate_address_p() in nds32.c,
-         we only allow "reg", "symbol_ref", "const", and "reg + const_int"
-         as address rtx for DImode/DFmode memory access.  */
-      addr = XEXP (operands[0], 0);
-
-      otherops[0] = gen_rtx_REG (SImode, REGNO (operands[1]));
-      otherops[1] = gen_rtx_REG (SImode, REGNO (operands[1]) + 1);
-      otherops[2] = addr;
-
-      if (REG_P (addr))
-	{
-	  /* (mem (reg)) <- (reg) */
-	  output_asm_insn ("smw.bi\t%0, [%2], %1, 0", otherops);
-	}
-      else if (GET_CODE (addr) == PLUS)
-	{
-	  /* (mem (plus (reg) (const_int))) <- (reg) */
-	  rtx op0 = XEXP (addr, 0);
-	  rtx op1 = XEXP (addr, 1);
-
-	  if (REG_P (op0))
-	    {
-	      otherops[2] = op0;
-	      otherops[3] = op1;
-	      otherops[4] = gen_int_mode (INTVAL (op1) + 4, SImode);
-	    }
-	  else
-	    {
-	      otherops[2] = op1;
-	      otherops[3] = op0;
-	      otherops[4] = gen_int_mode (INTVAL (op0) + 4, SImode);
-	    }
-
-	  /* To avoid base overwrite when REGNO(%0) == REGNO(%2).  */
-	  if (REGNO (otherops[0]) != REGNO (otherops[2]))
-	    {
-	      output_asm_insn ("swi\t%0, [%2 + (%3)]", otherops);
-	      output_asm_insn ("swi\t%1, [%2 + (%4)]", otherops);
-	    }
-	  else
-	    {
-	      output_asm_insn ("swi\t%1, [%2 + (%4)]", otherops);
-	      output_asm_insn ("swi\t%0, [%2 + (%3)]", otherops);
-	    }
-	}
-      else
-	{
-	  /* (mem (symbol_ref ...)) <- (reg)
-	     (mem (const ...))      <- (reg) */
-	  output_asm_insn ("swi.gp\t%0, [ + %2]", otherops);
-	  output_asm_insn ("swi.gp\t%1, [ + %2 + 4]", otherops);
-	}
-
-      /* We have already used output_asm_insn() by ourself,
-         so return an empty string.  */
-      return "";
-
+      /* There is no 64-bit load instruction,
+	 so we split this pattern into two SImode patterns.  */
+      return "#";
+    case 4:
+      /* The memory format is (mem (reg)),
+	 we can generate 'smw.bi' instruction.  */
+      return nds32_output_double (operands, false);
+    case 5:
+      /* There is no 64-bit store instruction,
+	 so we split this pattern into two SImode patterns.  */
+      return "#";
+    case 6:
+      return nds32_output_float_load (operands);
+    case 7:
+      return nds32_output_float_store (operands);
+    case 8:
+      return "fcpysd\t%0, %1, %1";
+    case 9:
+      return "fmfdr\t%0, %1";
+    case 10:
+      return "fmtdr\t%1, %0";
     default:
       gcc_unreachable ();
     }
 }
-  [(set_attr "type"   "alu,alu,alu,alu")
-   (set_attr "length" "  4, 16,  8,  8")])
+  [(set_attr "type"    "alu,alu,load,load,store,store,fload,fstore,fcpy,fmfdr,fmtdr")
+   (set_attr_alternative "length"
+     [
+       ;; Alternative 0
+       (if_then_else (match_test "!TARGET_16_BIT")
+		     (const_int 4)
+		     (const_int 2))
+       ;; Alternative 1
+       (const_int 16)
+       ;; Alternative 2
+       (const_int 4)
+       ;; Alternative 3
+       (const_int 8)
+       ;; Alternative 4
+       (const_int 4)
+       ;; Alternative 5
+       (const_int 8)
+       ;; Alternative 6
+       (const_int 4)
+       ;; Alternative 7
+       (const_int 4)
+       ;; Alternative 8
+       (const_int 4)
+       ;; Alternative 9
+       (const_int 4)
+       ;; Alternative 10
+       (const_int 4)
+     ])
+   (set_attr "feature" " v1, v1,  v1,  v1,   v1,   v1,    fpu,    fpu,    fpu,    fpu,    fpu")])
 
 (define_split
   [(set (match_operand:DIDF 0 "register_operand"     "")
@@ -208,7 +140,12 @@
   /* Actually we would like to create move behavior by ourself.
      So that movsi expander could have chance to split large constant.  */
   emit_move_insn (operands[2], operands[3]);
-  emit_move_insn (operands[4], operands[5]);
+
+  unsigned HOST_WIDE_INT mask = GET_MODE_MASK (SImode);
+  if ((UINTVAL (operands[3]) & mask) == (UINTVAL (operands[5]) & mask))
+    emit_move_insn (operands[4], operands[2]);
+  else
+    emit_move_insn (operands[4], operands[5]);
   DONE;
 })
 
@@ -218,7 +155,9 @@
   [(set (match_operand:DIDF 0 "register_operand" "")
 	(match_operand:DIDF 1 "register_operand" ""))]
   "reload_completed
-   && (TARGET_ISA_V2 || !TARGET_16_BIT)"
+   && (TARGET_ISA_V2 || !TARGET_16_BIT)
+   && NDS32_IS_GPR_REGNUM (REGNO (operands[0]))
+   && NDS32_IS_GPR_REGNUM (REGNO (operands[1]))"
   [(set (match_dup 0) (match_dup 1))
    (set (match_dup 2) (match_dup 3))]
 {
@@ -240,6 +179,28 @@
     }
 })
 
+(define_split
+  [(set (match_operand:DIDF 0 "nds32_general_register_operand" "")
+	(match_operand:DIDF 1 "memory_operand" ""))]
+  "reload_completed
+   && nds32_split_double_word_load_store_p (operands, true)"
+  [(set (match_dup 2) (match_dup 3))
+   (set (match_dup 4) (match_dup 5))]
+{
+  nds32_spilt_doubleword (operands, true);
+})
+
+(define_split
+  [(set (match_operand:DIDF 0  "memory_operand" "")
+	(match_operand:DIDF 1  "nds32_general_register_operand" ""))]
+  "reload_completed
+   && nds32_split_double_word_load_store_p (operands, false)"
+  [(set (match_dup 2) (match_dup 3))
+   (set (match_dup 4) (match_dup 5))]
+{
+  nds32_spilt_doubleword (operands, false);
+})
+
 ;; -------------------------------------------------------------
 ;; Boolean DImode instructions.
 ;; -------------------------------------------------------------
diff --git a/gcc/config/nds32/nds32-fpu.md b/gcc/config/nds32/nds32-fpu.md
new file mode 100644
index 00000000000..719b0428ced
--- /dev/null
+++ b/gcc/config/nds32/nds32-fpu.md
@@ -0,0 +1,503 @@
+;; Machine description of Andes NDS32 cpu for GNU compiler
+;; Copyright (C) 2012-2015 Free Software Foundation, Inc.
+;; Contributed by Andes Technology Corporation.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published
+;; by the Free Software Foundation; either version 3, or (at your
+;; option) any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but WITHOUT
+;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+;; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+;; License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+;;SFmode moves
+
+(define_expand "movsf"
+  [(set (match_operand:SF 0 "general_operand" "")
+	(match_operand:SF 1 "general_operand" ""))]
+  ""
+{
+  /* Need to force register if mem <- !reg.  */
+  if (MEM_P (operands[0]) && !REG_P (operands[1]))
+    operands[1] = force_reg (SFmode, operands[1]);
+  if (CONST_DOUBLE_P (operands[1])
+      && !satisfies_constraint_Cs20 (operands[1]))
+    {
+      const REAL_VALUE_TYPE *r;
+      unsigned long l;
+
+      r = CONST_DOUBLE_REAL_VALUE (operands[1]);
+      REAL_VALUE_TO_TARGET_SINGLE (*r, l);
+
+      emit_move_insn (operands[0], gen_rtx_HIGH (SFmode, operands[1]));
+
+      if ((l & 0xFFF) != 0)
+	emit_insn (gen_movsf_lo (operands[0], operands[0], operands[1]));
+      DONE;
+    }
+})
+
+(define_insn "movsf_lo"
+  [(set (match_operand:SF 0 "register_operand" "=r")
+	(lo_sum:SF (match_operand:SF 1 "register_operand" "r")
+		   (match_operand:SF 2 "immediate_operand" "i")))]
+  ""
+  "ori\t%0, %1, lo12(%2)"
+  [(set_attr "type"   "alu")
+   (set_attr "length"   "4")]
+)
+
+(define_insn "*movsf"
+  [(set (match_operand:SF 0 "nonimmediate_operand" "=r, r, U45, U33, U37, U45, m,   l,   l,   l,   d, r, f, *f, *r, f, Q,   r,   r,    r")
+	(match_operand:SF 1 "general_operand"      " r, r,   l,   l,   l,   d, r, U45, U33, U37, U45, m, f, *r, *f, Q, f,Cs05,Cs20, Chig"))]
+  "(register_operand(operands[0], SFmode)
+    || register_operand(operands[1], SFmode))"
+{
+  switch (which_alternative)
+    {
+    case 0:
+      return "mov55\t%0, %1";
+    case 1:
+      return "ori\t%0, %1, 0";
+    case 2:
+    case 3:
+    case 4:
+    case 5:
+      return nds32_output_16bit_store (operands, 4);
+    case 6:
+      return nds32_output_32bit_store (operands, 4);
+    case 7:
+    case 8:
+    case 9:
+    case 10:
+      return nds32_output_16bit_load (operands, 4);
+    case 11:
+      return nds32_output_32bit_load (operands, 4);
+    case 12:
+      if (TARGET_FPU_SINGLE)
+	return "fcpyss\t%0, %1, %1";
+      else
+	return "#";
+    case 13:
+      return "fmtsr\t%1, %0";
+    case 14:
+      return "fmfsr\t%0, %1";
+    case 15:
+      return nds32_output_float_load (operands);
+    case 16:
+      return nds32_output_float_store (operands);
+    case 17:
+      return "movi55\t%0, %1";
+    case 18:
+      return "movi\t%0, %1";
+    case 19:
+      return "sethi\t%0, %1";
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "type"    "alu,alu,store,store,store,store,store,load,load,load,load,load,fcpy,fmtsr,fmfsr,fload,fstore,alu,alu,alu")
+   (set_attr "length"  "  2,  4,    2,    2,    2,    2,    4,   2,   2,   2,   2,   4,   4,    4,    4,    4,     4,  2,  4,  4")
+   (set_attr "feature" " v1, v1,   v1,   v1,   v1,   v1,   v1,  v1,  v1,  v1,  v1,  v1, fpu,  fpu,  fpu,  fpu,   fpu, v1, v1, v1")])
+
+;; Conditional Move Instructions
+
+(define_expand "mov<mode>cc"
+  [(set (match_operand:ANYF 0 "register_operand" "")
+	(if_then_else:ANYF (match_operand 1 "nds32_float_comparison_operator" "")
+			   (match_operand:ANYF 2 "register_operand" "")
+			   (match_operand:ANYF 3 "register_operand" "")))]
+  ""
+{
+  if (nds32_cond_move_p (operands[1]))
+    {
+      /* nds32_cond_move_p matched operands[1]: per the author's note
+	 this is the UNORDERED/ORDERED case (the exact mode check lives
+	 in nds32_cond_move_p -- TODO confirm).  FAIL back to gcc, since
+	 slt compare is not used to generate UNORDERED and ORDERED.  */
+      FAIL;
+    }
+  else
+    nds32_expand_float_movcc (operands);
+})
+
+(define_insn "fcmov<mode>_eq"
+  [(set (match_operand:ANYF 0 "register_operand" "=f, f")
+	(if_then_else:ANYF (eq (match_operand:SI 1 "register_operand" "f, f")
+			       (const_int 0))
+			   (match_operand:ANYF 2 "register_operand" "f, 0")
+			   (match_operand:ANYF 3 "register_operand" "0, f")))]
+  ""
+  "@
+   fcmovz<size>\t%0,%2,%1
+   fcmovn<size>\t%0,%3,%1"
+  [(set_attr "type"  "fcmov")
+   (set_attr "length" "4")]
+)
+
+(define_insn "fcmov<mode>_ne"
+  [(set (match_operand:ANYF 0 "register_operand" "=f, f")
+	(if_then_else:ANYF (ne (match_operand:SI 1 "register_operand" "f, f")
+			       (const_int 0))
+			   (match_operand:ANYF 2 "register_operand" "f, 0")
+			   (match_operand:ANYF 3 "register_operand" "0, f")))]
+  ""
+  "@
+   fcmovn<size>\t%0,%2,%1
+   fcmovz<size>\t%0,%3,%1"
+  [(set_attr "type"  "fcmov")
+   (set_attr "length" "4")]
+)
+
+;; Arithmetic instructions.
+
+(define_insn "add<mode>3"
+  [(set (match_operand:ANYF 0 "register_operand" "=f")
+	(plus:ANYF (match_operand:ANYF 1 "register_operand" "f")
+		   (match_operand:ANYF 2 "register_operand" "f")))]
+  ""
+  "fadd<size>\t %0, %1, %2"
+  [(set_attr "type"   "falu")
+   (set_attr "length" "4")]
+)
+
+(define_insn "sub<mode>3"
+  [(set (match_operand:ANYF 0 "register_operand" "=f")
+	(minus:ANYF (match_operand:ANYF 1 "register_operand" "f")
+		    (match_operand:ANYF 2 "register_operand" "f")))]
+  ""
+  "fsub<size>\t %0, %1, %2"
+  [(set_attr "type"   "falu")
+   (set_attr "length" "4")]
+)
+
+;; Multiplication insns.
+
+(define_insn "mul<mode>3"
+  [(set (match_operand:ANYF 0 "register_operand" "=f")
+	(mult:ANYF (match_operand:ANYF 1 "register_operand" "f")
+		   (match_operand:ANYF 2 "register_operand" "f")))]
+  ""
+  "fmul<size>\t %0, %1, %2"
+  [(set_attr "type"   "fmul<size>")
+   (set_attr "length" "4")]
+)
+
+(define_insn "fma<mode>4"
+  [(set (match_operand:ANYF 0 "register_operand" "=f")
+	(fma:ANYF (match_operand:ANYF 1 "register_operand" "f")
+		  (match_operand:ANYF 2 "register_operand" "f")
+		  (match_operand:ANYF 3 "register_operand" "0")))]
+  "TARGET_EXT_FPU_FMA"
+  "fmadd<size>\t%0, %1, %2"
+  [(set_attr "type"   "fmac<size>")
+   (set_attr "length" "4")]
+)
+
+(define_insn "fnma<mode>4"
+  [(set (match_operand:ANYF 0 "register_operand" "=f")
+	(fma:ANYF (neg:ANYF (match_operand:ANYF 1 "register_operand" "f"))
+		  (match_operand:ANYF 2 "register_operand" "f")
+		  (match_operand:ANYF 3 "register_operand" "0")))]
+  "TARGET_EXT_FPU_FMA"
+  "fmsub<size>\t%0, %1, %2"
+  [(set_attr "type"   "fmac<size>")
+   (set_attr "length" "4")]
+)
+
+(define_insn "fms<mode>4"
+  [(set (match_operand:ANYF 0 "register_operand" "=f")
+	(fma:ANYF (match_operand:ANYF 1 "register_operand" "f")
+		  (match_operand:ANYF 2 "register_operand" "f")
+		  (neg:ANYF (match_operand:ANYF 3 "register_operand" "0"))))]
+  "TARGET_EXT_FPU_FMA"
+  "fnmsub<size>\t%0, %1, %2"
+  [(set_attr "type"   "fmac<size>")
+   (set_attr "length" "4")]
+)
+
+(define_insn "fnms<mode>4"
+  [(set (match_operand:ANYF 0 "register_operand" "=f")
+	(fma:ANYF (neg:ANYF (match_operand:ANYF 1 "register_operand" "f"))
+		  (match_operand:ANYF 2 "register_operand" "f")
+		  (neg:ANYF (match_operand:ANYF 3 "register_operand" "0"))))]
+  "TARGET_EXT_FPU_FMA"
+  "fnmadd<size>\t%0, %1, %2"
+  [(set_attr "type"   "fmac<size>")
+   (set_attr "length" "4")]
+)
+
+;; Div Instructions.
+
+(define_insn "div<mode>3"
+  [(set (match_operand:ANYF 0 "register_operand" "=f")
+	(div:ANYF (match_operand:ANYF 1 "register_operand" "f")
+		  (match_operand:ANYF 2 "register_operand" "f")))]
+  ""
+  "fdiv<size>\t %0, %1, %2"
+  [(set_attr "type"   "fdiv<size>")
+   (set_attr "length" "4")]
+)
+
+(define_insn "sqrt<mode>2"
+  [(set (match_operand:ANYF 0 "register_operand" "=f")
+	(sqrt:ANYF (match_operand:ANYF 1 "register_operand" "f")))]
+  ""
+  "fsqrt<size>\t %0, %1"
+  [(set_attr "type"   "fsqrt<size>")
+   (set_attr "length" "4")]
+)
+
+;; Conditional Branch patterns
+
+(define_expand "cstore<mode>4"
+  [(set (match_operand:SI 0 "register_operand" "")
+	(match_operator:SI 1 "nds32_float_comparison_operator"
+	 [(match_operand:ANYF 2 "register_operand" "")
+	  (match_operand:ANYF 3 "register_operand" "")]))]
+  ""
+{
+  nds32_expand_float_cstore (operands);
+  DONE;
+})
+
+(define_expand "cbranch<mode>4"
+  [(set (pc)
+	(if_then_else (match_operator 0 "nds32_float_comparison_operator"
+		       [(match_operand:ANYF 1 "register_operand" "")
+			(match_operand:ANYF 2 "register_operand" "")])
+		      (label_ref (match_operand 3 "" ""))
+		      (pc)))]
+  ""
+{
+  nds32_expand_float_cbranch (operands);
+  DONE;
+})
+
+;; Copysign Instructions.
+
+(define_insn "copysignsf3"
+  [(set (match_operand:SF 0 "register_operand" "=f")
+	(unspec:SF [(match_operand:SF 1 "register_operand" "f")
+		    (match_operand:SF 2 "register_operand" "f")]
+		     UNSPEC_COPYSIGN))]
+  "TARGET_FPU_SINGLE"
+  "fcpyss\t%0,%1,%2"
+  [(set_attr "type"   "fcpy")
+   (set_attr "length" "4")]
+)
+
+(define_insn "copysigndf3"
+  [(set (match_operand:DF 0 "register_operand" "=f")
+	(unspec:DF [(match_operand:DF 1 "register_operand" "f")
+		    (match_operand:DF 2 "register_operand" "f")]
+		     UNSPEC_COPYSIGN))]
+  "TARGET_FPU_SINGLE || TARGET_FPU_DOUBLE"
+  "fcpysd\t%0,%1,%2"
+  [(set_attr "type"   "fcpy")
+   (set_attr "length" "4")]
+)
+
+(define_insn "*ncopysign<mode>3"
+  [(set (match_operand:ANYF 0 "register_operand" "=f")
+	(neg:ANYF (unspec:ANYF [(match_operand:ANYF 1 "register_operand" "f")
+				(match_operand:ANYF 2 "register_operand" "f")]
+				UNSPEC_COPYSIGN)))]
+  ""
+  "fcpyns<size>\t%0,%1,%2"
+  [(set_attr "type"   "fcpy")
+   (set_attr "length" "4")]
+)
+
+;; Absolute Instructions
+
+(define_insn "abssf2"
+  [(set (match_operand:SF 0 "register_operand" "=f, r")
+	(abs:SF (match_operand:SF 1 "register_operand" "f, r")))]
+  "TARGET_FPU_SINGLE || TARGET_EXT_PERF"
+  "@
+   fabss\t%0, %1
+   bclr\t%0, %1, 31"
+  [(set_attr "type"    "fabs,alu")
+   (set_attr "length"  "4")
+   (set_attr "feature" "fpu,pe1")]
+)
+
+(define_insn "absdf2"
+  [(set (match_operand:DF 0 "register_operand" "=f")
+	(abs:DF (match_operand:DF 1 "register_operand" "f")))]
+  "TARGET_FPU_DOUBLE"
+  "fabsd\t%0, %1"
+  [(set_attr "type"   "fabs")
+   (set_attr "length" "4")]
+)
+
+;; Negation Instructions
+
+(define_insn "*negsf2"
+  [(set (match_operand:SF 0 "register_operand" "=f, r")
+	(neg:SF (match_operand:SF 1 "register_operand" "f, r")))]
+  "TARGET_FPU_SINGLE || TARGET_EXT_PERF"
+  "@
+   fcpynss\t%0, %1, %1
+   btgl\t%0, %1, 31"
+  [(set_attr "type"    "fcpy,alu")
+   (set_attr "length"  "4")
+   (set_attr "feature" "fpu,pe1")]
+)
+
+(define_insn "*negdf2"
+  [(set (match_operand:DF 0 "register_operand" "=f")
+	(neg:DF (match_operand:DF 1 "register_operand" "f")))]
+  "TARGET_FPU_DOUBLE"
+  "fcpynsd\t%0, %1, %1"
+  [(set_attr "type"   "fcpy")
+   (set_attr "length" "4")]
+)
+
+;; Data Format Conversion Instructions
+
+(define_insn "floatunssi<mode>2"
+  [(set (match_operand:ANYF 0 "register_operand" "=f")
+	(unsigned_float:ANYF (match_operand:SI 1 "register_operand" "f")))]
+  ""
+  "fui2<size>\t %0, %1"
+  [(set_attr "type"   "falu")
+   (set_attr "length" "4")]
+)
+
+(define_insn "floatsi<mode>2"
+  [(set (match_operand:ANYF 0 "register_operand" "=f")
+	(float:ANYF (match_operand:SI 1 "register_operand" "f")))]
+  ""
+  "fsi2<size>\t %0, %1"
+  [(set_attr "type"   "falu")
+   (set_attr "length" "4")]
+)
+
+(define_insn "fixuns_trunc<mode>si2"
+  [(set (match_operand:SI 0 "register_operand" "=f")
+	(unsigned_fix:SI (fix:ANYF (match_operand:ANYF 1 "register_operand" "f"))))]
+  ""
+  "f<size>2ui.z\t %0, %1"
+  [(set_attr "type"   "falu")
+   (set_attr "length" "4")]
+)
+
+(define_insn "fix_trunc<mode>si2"
+  [(set (match_operand:SI 0 "register_operand" "=f")
+	(fix:SI (fix:ANYF (match_operand:ANYF 1 "register_operand" "f"))))]
+  ""
+  "f<size>2si.z\t %0, %1"
+  [(set_attr "type"   "falu")
+   (set_attr "length" "4")]
+)
+
+(define_insn "extendsfdf2"
+  [(set (match_operand:DF 0 "register_operand" "=f")
+	(float_extend:DF (match_operand:SF 1 "register_operand" "f")))]
+  "TARGET_FPU_SINGLE && TARGET_FPU_DOUBLE"
+  "fs2d\t%0, %1"
+  [(set_attr "type"   "falu")
+   (set_attr "length" "4")]
+)
+
+(define_insn "truncdfsf2"
+  [(set (match_operand:SF 0 "register_operand" "=f")
+	(float_truncate:SF (match_operand:DF 1 "register_operand" "f")))]
+  "TARGET_FPU_SINGLE && TARGET_FPU_DOUBLE"
+  "fd2s\t%0, %1"
+  [(set_attr "type"   "falu")
+   (set_attr "length" "4")]
+)
+
+;; Compare Instructions
+
+(define_insn "cmp<mode>_eq"
+  [(set (match_operand:SI 0 "register_operand" "=f")
+	(eq:SI (match_operand:ANYF 1 "register_operand" "f")
+	       (match_operand:ANYF 2 "register_operand" "f")))]
+  ""
+  {
+    if (NDS32_EXT_FPU_DOT_E)
+      return "fcmpeq<size>.e %0, %1, %2";
+    else
+      return "fcmpeq<size>\t%0, %1, %2";
+  }
+  [(set_attr "type"   "fcmp")
+   (set_attr "length" "4")]
+)
+
+(define_insn "cmp<mode>_lt"
+  [(set (match_operand:SI 0 "register_operand" "=f")
+	(lt:SI (match_operand:ANYF 1 "register_operand" "f")
+	       (match_operand:ANYF 2 "register_operand" "f")))]
+  ""
+{
+  if (NDS32_EXT_FPU_DOT_E)
+    return "fcmplt<size>.e %0, %1, %2";
+  else
+    return "fcmplt<size>\t%0, %1, %2";
+}
+  [(set_attr "type"   "fcmp")
+   (set_attr "length" "4")]
+)
+
+(define_insn "cmp<mode>_le"
+  [(set (match_operand:SI 0 "register_operand" "=f")
+	(le:SI (match_operand:ANYF 1 "register_operand" "f")
+	       (match_operand:ANYF 2 "register_operand" "f")))]
+  ""
+{
+  if (NDS32_EXT_FPU_DOT_E)
+    return "fcmple<size>.e %0, %1, %2";
+  else
+    return "fcmple<size>\t%0, %1, %2";
+}
+  [(set_attr "type"   "fcmp")
+   (set_attr "length" "4")]
+)
+
+(define_insn "cmp<mode>_un"
+  [(set (match_operand:SI 0 "register_operand" "=f")
+	(unordered:SI (match_operand:ANYF 1 "register_operand" "f")
+		      (match_operand:ANYF 2 "register_operand" "f")))]
+  ""
+{
+  if (NDS32_EXT_FPU_DOT_E)
+    return "fcmpun<size>.e %0, %1, %2";
+  else
+    return "fcmpun<size>\t%0, %1, %2";
+}
+  [(set_attr "type"   "fcmp")
+   (set_attr "length" "4")]
+)
+
+(define_split
+  [(set (match_operand:SF 0 "register_operand" "")
+	(match_operand:SF 1 "register_operand" ""))]
+  "!TARGET_FPU_SINGLE
+   && NDS32_IS_FPR_REGNUM (REGNO (operands[0]))
+   && NDS32_IS_FPR_REGNUM (REGNO (operands[1]))"
+  [(set (match_dup 2) (match_dup 1))
+   (set (match_dup 0) (match_dup 2))]
+{
+  operands[2] = gen_rtx_REG (SFmode, TA_REGNUM);
+})
+
+(define_split
+  [(set (match_operand:SF 0 "register_operand" "")
+	(match_operand:SF 1 "const_double_operand" ""))]
+  "!satisfies_constraint_Cs20 (operands[1])
+   && !satisfies_constraint_Chig (operands[1])"
+  [(set (match_dup 0) (high:SF (match_dup 1)))
+   (set (match_dup 0) (lo_sum:SF (match_dup 0) (match_dup 1)))])
+;; ----------------------------------------------------------------------------
diff --git a/gcc/config/nds32/nds32-md-auxiliary.c b/gcc/config/nds32/nds32-md-auxiliary.c
index b3673ae72e6..dca1c9f959b 100644
--- a/gcc/config/nds32/nds32-md-auxiliary.c
+++ b/gcc/config/nds32/nds32-md-auxiliary.c
@@ -742,6 +742,146 @@ nds32_expand_cstore (rtx *operands)
     }
 }
 
+void
+nds32_expand_float_cbranch (rtx *operands)
+{
+  enum rtx_code code = GET_CODE (operands[0]);
+  enum rtx_code new_code = code;
+  rtx cmp_op0 = operands[1];
+  rtx cmp_op1 = operands[2];
+  rtx tmp_reg;
+  rtx tmp;
+
+  int reverse = 0;
+
+  /* Expand a float conditional branch as compare + branch on the result.
+     'reverse' selects branch equal (1) or branch not equal (0) against zero.
+     GT, GE: swap condition and swap operands and generate
+     compare instruction(LT, LE) + branch not equal instruction.
+     UNORDERED, LT, LE, EQ: no need to change and generate
+     compare instruction(UNORDERED, LT, LE, EQ) + branch not equal instruction.
+     ORDERED, NE: reverse condition and generate
+     compare instruction(UNORDERED, EQ) + branch equal instruction.
+     UNGT, UNGE, UNLT, UNLE: reverse the maybe-unordered condition (and, for
+     UNLT/UNLE, also swap as for GT/GE) + branch equal instruction.
+     Any other code emits nothing.  */
+
+  switch (code)
+    {
+    case GT:
+    case GE:
+      tmp = cmp_op0;
+      cmp_op0 = cmp_op1;
+      cmp_op1 = tmp;
+      new_code = swap_condition (new_code);
+      break;
+    case UNORDERED:
+    case LT:
+    case LE:
+    case EQ:
+      break;
+    case ORDERED:
+    case NE:
+      new_code = reverse_condition (new_code);
+      reverse = 1;
+      break;
+    case UNGT:
+    case UNGE:
+      new_code = reverse_condition_maybe_unordered (new_code);
+      reverse = 1;
+      break;
+    case UNLT:
+    case UNLE:
+      new_code = reverse_condition_maybe_unordered (new_code);
+      tmp = cmp_op0;
+      cmp_op0 = cmp_op1;
+      cmp_op1 = tmp;
+      new_code = swap_condition (new_code);
+      reverse = 1;
+      break;
+    default:
+      return;
+    }
+
+  tmp_reg = gen_reg_rtx (SImode);
+  emit_insn (gen_rtx_SET (tmp_reg,
+			  gen_rtx_fmt_ee (new_code, SImode,
+					  cmp_op0, cmp_op1)));
+
+  PUT_CODE (operands[0], reverse ? EQ : NE);
+  emit_insn (gen_cbranchsi4 (operands[0], tmp_reg,
+			     const0_rtx, operands[3]));
+}
+
+void
+nds32_expand_float_cstore (rtx *operands)
+{
+  enum rtx_code code = GET_CODE (operands[1]);
+  enum rtx_code new_code = code;
+  machine_mode mode = GET_MODE (operands[2]);
+
+  rtx cmp_op0 = operands[2];
+  rtx cmp_op1 = operands[3];
+  rtx tmp;
+
+  /* Main Goal: Use compare instruction to store value into operands[0].
+
+     For example (any code not listed here emits nothing):
+     GT, GE: swap condition and swap operands.
+       reg_R = (reg_A >  reg_B) --> fcmplt reg_R, reg_B, reg_A
+       reg_R = (reg_A >= reg_B) --> fcmple reg_R, reg_B, reg_A
+
+     UNORDERED, LT, LE, EQ: no need to change, the code is used as is.
+       reg_R = (reg_A <  reg_B) --> fcmplt reg_R, reg_A, reg_B
+       reg_R = (reg_A <= reg_B) --> fcmple reg_R, reg_A, reg_B
+       reg_R = (reg_A == reg_B) --> fcmpeq reg_R, reg_A, reg_B
+
+     ORDERED: reverse condition and use xor instruction to achieve 'ORDERED'.
+       reg_R = ordered (reg_A, reg_B) --> fcmpun reg_R, reg_A, reg_B
+				       xor reg_R, reg_R, const1_rtx
+
+     NE: reverse condition and use xor instruction to achieve 'NE'.
+       reg_R = (reg_A != reg_B) --> fcmpeq reg_R, reg_A, reg_B
+				       xor reg_R, reg_R, const1_rtx */
+  switch (code)
+    {
+    case GT:
+    case GE:
+      tmp = cmp_op0;
+      cmp_op0 = cmp_op1;
+      cmp_op1 =tmp;
+      new_code = swap_condition (new_code);
+      break;
+    case UNORDERED:
+    case LT:
+    case LE:
+    case EQ:
+      break;
+    case ORDERED:
+      if (mode == SFmode)
+	emit_insn (gen_cmpsf_un (operands[0], cmp_op0, cmp_op1));
+      else
+	emit_insn (gen_cmpdf_un (operands[0], cmp_op0, cmp_op1));
+
+      emit_insn (gen_xorsi3 (operands[0], operands[0], const1_rtx));
+      return;
+    case NE:
+      if (mode == SFmode)
+	emit_insn (gen_cmpsf_eq (operands[0], cmp_op0, cmp_op1));
+      else
+	emit_insn (gen_cmpdf_eq (operands[0], cmp_op0, cmp_op1));
+
+      emit_insn (gen_xorsi3 (operands[0], operands[0], const1_rtx));
+      return;
+    default:
+      return;
+    }
+
+  emit_insn (gen_rtx_SET (operands[0],
+			  gen_rtx_fmt_ee (new_code, SImode,
+					  cmp_op0, cmp_op1)));
+}
+
 enum nds32_expand_result_type
 nds32_expand_movcc (rtx *operands)
 {
@@ -759,6 +899,11 @@ nds32_expand_movcc (rtx *operands)
 	 we have gcc generate original template rtx.  */
       return EXPAND_CREATE_TEMPLATE;
     }
+  else if ((TARGET_FPU_SINGLE && cmp0_mode == SFmode)
+	   || (TARGET_FPU_DOUBLE && cmp0_mode == DFmode))
+    {
+      nds32_expand_float_movcc (operands);
+    }
   else
     {
       /* Since there is only 'slt'(Set when Less Than) instruction for
@@ -849,6 +994,203 @@ nds32_expand_movcc (rtx *operands)
   return EXPAND_CREATE_TEMPLATE;
 }
 
+void
+nds32_expand_float_movcc (rtx *operands)
+{
+  if ((GET_CODE (operands[1]) == EQ || GET_CODE (operands[1]) == NE)
+      && GET_MODE (XEXP (operands[1], 0)) == SImode
+      && XEXP (operands[1], 1) == const0_rtx)
+    {
+      /* If the operands[1] rtx is already (eq X 0) or (ne X 0),
+	 we have gcc generate original template rtx.  */
+      return;
+    }
+  else
+    {
+      enum rtx_code code = GET_CODE (operands[1]);
+      enum rtx_code new_code = code;
+      machine_mode cmp0_mode = GET_MODE (XEXP (operands[1], 0));
+      machine_mode cmp1_mode = GET_MODE (XEXP (operands[1], 1));
+      rtx cmp_op0 = XEXP (operands[1], 0);
+      rtx cmp_op1 = XEXP (operands[1], 1);
+      rtx tmp;
+
+      /* Compare instruction operation: (cmp_op0 condition cmp_op1) ? 1 : 0.
+	 'reverse' is 1 when the move must test that result with EQ, not NE.  */
+      int reverse = 0;
+
+      /* Main Goal: Use compare instruction + conditional move instruction.
+	 Strategy : swap condition and swap comparison operands.
+
+	 For example:
+	     a > b ? P : Q   (GT)
+	 --> b < a ? P : Q   (swap condition and comparison operands,
+			      the compare instruction is now 'LT')
+
+	     a >= b ? P : Q  (GE)
+	 --> b <= a ? P : Q  (swap condition and comparison operands,
+			      the compare instruction is now 'LE')
+
+	     a <  b ? P : Q  (LT)
+	 --> (NO NEED TO CHANGE, it is already 'LT')
+
+	     a <= b ? P : Q  (LE)
+	 --> (NO NEED TO CHANGE, it is already 'LE')
+
+	     a == b ? P : Q  (EQ)
+	 --> (NO NEED TO CHANGE, it is already 'EQ') */
+
+      switch (code)
+	{
+	case GT:
+	case GE:
+	  tmp = cmp_op0;
+	  cmp_op0 = cmp_op1;
+	  cmp_op1 =tmp;
+	  new_code = swap_condition (new_code);
+	  break;
+	case UNORDERED:
+	case LT:
+	case LE:
+	case EQ:
+	  break;
+	case ORDERED:
+	case NE:
+	  reverse = 1;
+	  new_code = reverse_condition (new_code);
+	  break;
+	case UNGT:
+	case UNGE:
+	  new_code = reverse_condition_maybe_unordered (new_code);
+	  reverse = 1;
+	  break;
+	case UNLT:
+	case UNLE:
+	  new_code = reverse_condition_maybe_unordered (new_code);
+	  tmp = cmp_op0;
+	  cmp_op0 = cmp_op1;
+	  cmp_op1 = tmp;
+	  new_code = swap_condition (new_code);
+	  reverse = 1;
+	  break;
+	default:
+	  return;
+	}
+
+      /* Use a temporary register to store fcmpxxs result.  */
+      tmp = gen_reg_rtx (SImode);
+
+      /* Create a float compare instruction for SFmode and DFmode;
+	 other modes use cstoresi4 to create the compare instruction.  */
+      if ((cmp0_mode == DFmode || cmp0_mode == SFmode)
+	  && (cmp1_mode == DFmode || cmp1_mode == SFmode))
+	{
+	  /* Emit the corresponding float compare instruction.  */
+	  emit_insn (gen_rtx_SET (tmp,
+				  gen_rtx_fmt_ee (new_code, SImode,
+						  cmp_op0, cmp_op1)));
+	}
+      else
+	{
+	  /* Use cstoresi4 to emit the corresponding
+	     compare instruction.  */
+	  PUT_CODE (operands[1], new_code);
+	  emit_insn (gen_cstoresi4 (tmp, operands[1],
+				    cmp_op0, cmp_op1));
+	}
+      /* Rewrite operands[1] as the (eq/ne tmp 0) condition of the
+	 conditional move instruction, for fcmovzs and fcmovns.  */
+      operands[1] = gen_rtx_fmt_ee (reverse ? EQ : NE,
+				    VOIDmode, tmp, const0_rtx);
+    }
+}
+
+void
+nds32_emit_push_fpr_callee_saved (int base_offset)
+{
+  rtx fpu_insn;
+  rtx reg, mem;
+  unsigned int regno = cfun->machine->callee_saved_first_fpr_regno;
+  unsigned int last_fpr = cfun->machine->callee_saved_last_fpr_regno;
+
+  while (regno <= last_fpr)
+    {
+      /* Save a register pair as one DFmode store (fsdi) at $sp + offset.  */
+      reg = gen_rtx_REG (DFmode, regno);
+      mem = gen_frame_mem (DFmode, plus_constant (Pmode,
+						  stack_pointer_rtx,
+						  base_offset));
+      base_offset += 8;
+      regno += 2;
+      fpu_insn = emit_move_insn (mem, reg);
+      RTX_FRAME_RELATED_P (fpu_insn) = 1;
+    }
+}
+
+void
+nds32_emit_pop_fpr_callee_saved (int gpr_padding_size)
+{
+  rtx fpu_insn;
+  rtx reg, mem, addr;
+  rtx dwarf, adjust_sp_rtx;
+  unsigned int regno = cfun->machine->callee_saved_first_fpr_regno;
+  unsigned int last_fpr = cfun->machine->callee_saved_last_fpr_regno;
+  int padding = 0;
+
+  while (regno <= last_fpr)
+    {
+      /* Load a pair via fldi.bi; the last load also pops the gpr padding.  */
+      if ((regno + 1) >= last_fpr)
+	padding = gpr_padding_size;
+
+      reg = gen_rtx_REG (DFmode, (regno));
+      addr = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx,
+				  gen_rtx_PLUS (Pmode, stack_pointer_rtx,
+						GEN_INT (8 + padding)));
+      mem = gen_frame_mem (DFmode, addr);
+      regno += 2;
+      fpu_insn = emit_move_insn (reg, mem);
+
+      adjust_sp_rtx =
+	gen_rtx_SET (stack_pointer_rtx,
+		     plus_constant (Pmode, stack_pointer_rtx,
+				    8 + padding));
+
+      dwarf = alloc_reg_note (REG_CFA_RESTORE, reg, NULL_RTX);
+      /* Tell gcc we adjust SP in this insn.  */
+      dwarf = alloc_reg_note (REG_CFA_ADJUST_CFA, copy_rtx (adjust_sp_rtx),
+			      dwarf);
+      RTX_FRAME_RELATED_P (fpu_insn) = 1;
+      REG_NOTES (fpu_insn) = dwarf;
+    }
+}
+
+void
+nds32_emit_v3pop_fpr_callee_saved (int base)
+{
+  int fpu_base_addr = base;
+  int regno;
+  rtx fpu_insn;
+  rtx reg, mem;
+  rtx dwarf;
+
+  regno = cfun->machine->callee_saved_first_fpr_regno;
+  while (regno <= cfun->machine->callee_saved_last_fpr_regno)
+    {
+      /* Restore a register pair with one DFmode load (fldi) at $sp + offset.  */
+      reg = gen_rtx_REG (DFmode, regno);
+      mem = gen_frame_mem (DFmode, plus_constant (Pmode,
+						  stack_pointer_rtx,
+						  fpu_base_addr));
+      fpu_base_addr += 8;
+      regno += 2;
+      fpu_insn = emit_move_insn (reg, mem);
+      dwarf = alloc_reg_note (REG_CFA_RESTORE, reg, NULL_RTX);
+      RTX_FRAME_RELATED_P (fpu_insn) = 1;
+      REG_NOTES (fpu_insn) = dwarf;
+    }
+}
+
 /* ------------------------------------------------------------------------ */
 
 /* Function to return memory format.  */
@@ -867,7 +1209,8 @@ nds32_mem_format (rtx op)
   op = XEXP (op, 0);
 
   /* 45 format.  */
-  if (GET_CODE (op) == REG && (mode_test == SImode))
+  if (GET_CODE (op) == REG
+      && ((mode_test == SImode) || (mode_test == SFmode)))
     return ADDRESS_REG;
 
   /* 333 format for QI/HImode.  */
@@ -875,7 +1218,8 @@ nds32_mem_format (rtx op)
     return ADDRESS_LO_REG_IMM3U;
 
   /* post_inc 333 format.  */
-  if ((GET_CODE (op) == POST_INC) && (mode_test == SImode))
+  if ((GET_CODE (op) == POST_INC)
+      && ((mode_test == SImode) || (mode_test == SFmode)))
     {
       regno = REGNO(XEXP (op, 0));
 
@@ -885,7 +1229,7 @@ nds32_mem_format (rtx op)
 
   /* post_inc 333 format.  */
   if ((GET_CODE (op) == POST_MODIFY)
-      && (mode_test == SImode)
+      && ((mode_test == SImode) || (mode_test == SFmode))
       && (REG_P (XEXP (XEXP (op, 1), 0)))
       && (CONST_INT_P (XEXP (XEXP (op, 1), 1))))
     {
@@ -1409,12 +1753,25 @@ nds32_output_stack_push (rtx par_rtx)
 	 otherwise, generate 'push25 Re,0'.  */
       sp_adjust = cfun->machine->local_size
 		  + cfun->machine->out_args_size
-		  + cfun->machine->callee_saved_area_gpr_padding_bytes;
+		  + cfun->machine->callee_saved_area_gpr_padding_bytes
+		  + cfun->machine->callee_saved_fpr_regs_size;
       if (satisfies_constraint_Iu08 (GEN_INT (sp_adjust))
 	  && NDS32_DOUBLE_WORD_ALIGN_P (sp_adjust))
 	operands[1] = GEN_INT (sp_adjust);
       else
-	operands[1] = GEN_INT (0);
+	{
+	  /* Allocate callee saved fpr space.  */
+	  if (cfun->machine->callee_saved_first_fpr_regno != SP_REGNUM)
+	    {
+	      sp_adjust = cfun->machine->callee_saved_area_gpr_padding_bytes
+			  + cfun->machine->callee_saved_fpr_regs_size;
+	      operands[1] = GEN_INT (sp_adjust);
+	    }
+	  else
+	    {
+	      operands[1] = GEN_INT (0);
+	    }
+	}
 
       /* Create assembly code pattern.  */
       snprintf (pattern, sizeof (pattern), "push25\t%%0, %%1");
@@ -1507,13 +1864,28 @@ nds32_output_stack_pop (rtx par_rtx ATTRIBUTE_UNUSED)
 	 and then use 'pop25 Re,0'.  */
       sp_adjust = cfun->machine->local_size
 		  + cfun->machine->out_args_size
-		  + cfun->machine->callee_saved_area_gpr_padding_bytes;
+		  + cfun->machine->callee_saved_area_gpr_padding_bytes
+		  + cfun->machine->callee_saved_fpr_regs_size;
       if (satisfies_constraint_Iu08 (GEN_INT (sp_adjust))
 	  && NDS32_DOUBLE_WORD_ALIGN_P (sp_adjust)
 	  && !cfun->calls_alloca)
 	operands[1] = GEN_INT (sp_adjust);
       else
-	operands[1] = GEN_INT (0);
+	{
+	  if (cfun->machine->callee_saved_first_fpr_regno != SP_REGNUM)
+	    {
+	      /* If fprs need to be restored, $sp is at the callee saved fpr
+		 position, so we need to consider gpr padding bytes and
+		 callee saved fpr size.  */
+	      sp_adjust = cfun->machine->callee_saved_area_gpr_padding_bytes
+			  + cfun->machine->callee_saved_fpr_regs_size;
+	      operands[1] = GEN_INT (sp_adjust);
+	    }
+	  else
+	    {
+	      operands[1] = GEN_INT (0);
+	    }
+	}
 
       /* Create assembly code pattern.  */
       snprintf (pattern, sizeof (pattern), "pop25\t%%0, %%1");
@@ -1638,6 +2010,162 @@ nds32_output_casesi_pc_relative (rtx *operands)
     return "jr\t$ta";
 }
 
+/* Output the FPU load of operands[1] into operands[0]; always returns "".  */
+const char *
+nds32_output_float_load (rtx *operands)
+{
+  char buff[100];
+  const char *pattern;
+  rtx addr, addr_op0, addr_op1;
+  int dp = GET_MODE_SIZE (GET_MODE (operands[0])) == 8;
+  addr = XEXP (operands[1], 0);
+  switch (GET_CODE (addr))
+    {
+    case REG:
+      pattern = "fl%ci\t%%0, %%1";
+      break;
+
+    case PLUS:
+      addr_op0 = XEXP (addr, 0);
+      addr_op1 = XEXP (addr, 1);
+
+      if (REG_P (addr_op0) && REG_P (addr_op1))
+	pattern = "fl%c\t%%0, %%1";
+      else if (REG_P (addr_op0) && CONST_INT_P (addr_op1))
+	pattern = "fl%ci\t%%0, %%1";
+      else if (GET_CODE (addr_op0) == MULT && REG_P (addr_op1)
+	       && REG_P (XEXP (addr_op0, 0))
+	       && CONST_INT_P (XEXP (addr_op0, 1)))
+	pattern = "fl%c\t%%0, %%1";
+      else
+	gcc_unreachable ();
+      break;
+
+    case POST_MODIFY:
+      addr_op0 = XEXP (addr, 0);
+      addr_op1 = XEXP (addr, 1);
+
+      if (REG_P (addr_op0) && GET_CODE (addr_op1) == PLUS
+	  && REG_P (XEXP (addr_op1, 1)))
+	pattern = "fl%c.bi\t%%0, %%1";
+      else if (REG_P (addr_op0) && GET_CODE (addr_op1) == PLUS
+	       && CONST_INT_P (XEXP (addr_op1, 1)))
+	pattern = "fl%ci.bi\t%%0, %%1";
+      else
+	gcc_unreachable ();
+      break;
+
+    case POST_INC:
+      if (REG_P (XEXP (addr, 0)))
+	{
+	  if (dp)
+	    pattern = "fl%ci.bi\t%%0, %%1, 8";
+	  else
+	    pattern = "fl%ci.bi\t%%0, %%1, 4";
+	}
+      else
+	gcc_unreachable ();
+      break;
+
+    case POST_DEC:
+      if (REG_P (XEXP (addr, 0)))
+	{
+	  if (dp)
+	    pattern = "fl%ci.bi\t%%0, %%1, -8";
+	  else
+	    pattern = "fl%ci.bi\t%%0, %%1, -4";
+	}
+      else
+	gcc_unreachable ();
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  snprintf (buff, sizeof (buff), pattern, dp ? 'd' : 's');
+  output_asm_insn (buff, operands);
+  return "";
+}
+
+/* Output the FPU store of operands[1] into operands[0]; always returns "".  */
+const char *
+nds32_output_float_store (rtx *operands)
+{
+  char buff[100];
+  const char *pattern;
+  rtx addr, addr_op0, addr_op1;
+  int dp = GET_MODE_SIZE (GET_MODE (operands[0])) == 8;
+  addr = XEXP (operands[0], 0);
+  switch (GET_CODE (addr))
+    {
+    case REG:
+      pattern = "fs%ci\t%%1, %%0";
+      break;
+
+    case PLUS:
+      addr_op0 = XEXP (addr, 0);
+      addr_op1 = XEXP (addr, 1);
+
+      if (REG_P (addr_op0) && REG_P (addr_op1))
+	pattern = "fs%c\t%%1, %%0";
+      else if (REG_P (addr_op0) && CONST_INT_P (addr_op1))
+	pattern = "fs%ci\t%%1, %%0";
+      else if (GET_CODE (addr_op0) == MULT && REG_P (addr_op1)
+	       && REG_P (XEXP (addr_op0, 0))
+	       && CONST_INT_P (XEXP (addr_op0, 1)))
+	pattern = "fs%c\t%%1, %%0";
+      else
+	gcc_unreachable ();
+      break;
+
+    case POST_MODIFY:
+      addr_op0 = XEXP (addr, 0);
+      addr_op1 = XEXP (addr, 1);
+
+      if (REG_P (addr_op0) && GET_CODE (addr_op1) == PLUS
+	  && REG_P (XEXP (addr_op1, 1)))
+	pattern = "fs%c.bi\t%%1, %%0";
+      else if (REG_P (addr_op0) && GET_CODE (addr_op1) == PLUS
+	       && CONST_INT_P (XEXP (addr_op1, 1)))
+	pattern = "fs%ci.bi\t%%1, %%0";
+      else
+	gcc_unreachable ();
+      break;
+
+    case POST_INC:
+      if (REG_P (XEXP (addr, 0)))
+	{
+	  if (dp)
+	    pattern = "fs%ci.bi\t%%1, %%0, 8";
+	  else
+	    pattern = "fs%ci.bi\t%%1, %%0, 4";
+	}
+      else
+	gcc_unreachable ();
+      break;
+
+    case POST_DEC:
+      if (REG_P (XEXP (addr, 0)))
+	{
+	  if (dp)
+	    pattern = "fs%ci.bi\t%%1, %%0, -8";
+	  else
+	    pattern = "fs%ci.bi\t%%1, %%0, -4";
+	}
+      else
+	gcc_unreachable ();
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  snprintf (buff, sizeof (buff), pattern, dp ? 'd' : 's');
+  output_asm_insn (buff, operands);
+  return "";
+}
+
 /* Function to generate normal jump table.  */
 const char *
 nds32_output_casesi (rtx *operands)
@@ -1936,6 +2464,39 @@ nds32_expand_unaligned_store (rtx *operands, enum machine_mode mode)
     }
 }
 
+/* Output a doubleword load (LOAD_P) or store as one lmw/smw instruction.  */
+const char *
+nds32_output_double (rtx *operands, bool load_p)
+{
+  char pattern[100];
+  int reg = load_p ? 0 : 1;
+  int mem = load_p ? 1 : 0;
+  rtx otherops[3];
+  rtx addr = XEXP (operands[mem], 0);
+
+  otherops[0] = gen_rtx_REG (SImode, REGNO (operands[reg]));
+  otherops[1] = gen_rtx_REG (SImode, REGNO (operands[reg]) + 1);
+
+  if (GET_CODE (addr)  == POST_INC)
+    {
+      /* (mem (post_inc (reg))): use the base-updating .bim form.  */
+      otherops[2] = XEXP (addr, 0);
+      snprintf (pattern, sizeof (pattern),
+		"%cmw.bim\t%%0, [%%2], %%1, 0", load_p ? 'l' : 's');
+    }
+  else
+    {
+      /* (mem (reg)): use the .bi form with the address as is.  */
+      otherops[2] = addr;
+      snprintf (pattern, sizeof (pattern),
+		"%cmw.bi\t%%0, [%%2], %%1, 0", load_p ? 'l' : 's');
+
+    }
+
+  output_asm_insn (pattern, otherops);
+  return "";
+}
+
 const char *
 nds32_output_cbranchsi4_equality_zero (rtx_insn *insn, rtx *operands)
 {
@@ -2120,6 +2681,115 @@ nds32_output_cbranchsi4_greater_less_zero (rtx_insn *insn, rtx *operands)
   return "";
 }
 
+/* Split a doubleword load/store into two single word instructions.  */
+void
+nds32_spilt_doubleword (rtx *operands, bool load_p)
+{
+  int reg = load_p ? 0 : 1;
+  int mem = load_p ? 1 : 0;
+  rtx reg_rtx = load_p ? operands[0] : operands[1];
+  rtx mem_rtx = load_p ? operands[1] : operands[0];
+  rtx low_part[2], high_part[2];
+  rtx sub_mem = XEXP (mem_rtx, 0);
+
+  /* Generate low_part and high_part register pattern.
+     i.e. register pattern like:
+     (reg:DI) -> (subreg:SI (reg:DI))
+		 (subreg:SI (reg:DI)) */
+  low_part[reg] = simplify_gen_subreg (SImode, reg_rtx, GET_MODE (reg_rtx), 0);
+  high_part[reg] = simplify_gen_subreg (SImode, reg_rtx, GET_MODE (reg_rtx), 4);
+
+  /* Generate low_part and high_part memory pattern.
+     A (post_dec) memory format will generate:
+       low_part:  lwi.bi reg, [mem], 4
+       high_part: lwi.bi reg, [mem], -12 */
+  if (GET_CODE (sub_mem) == POST_DEC)
+    {
+      /* memory format is (post_dec (reg)),
+	 so that extract (reg) from the (post_dec (reg)) pattern.  */
+      sub_mem = XEXP (sub_mem, 0);
+
+      /* generate low_part and high_part memory format:
+	   low_part:  (post_modify ((reg) (plus (reg) (const 4)))
+	   high_part: (post_modify ((reg) (plus (reg) (const -12))) */
+      low_part[mem] = gen_frame_mem (SImode,
+				     gen_rtx_POST_MODIFY (Pmode, sub_mem,
+							  gen_rtx_PLUS (Pmode,
+							  sub_mem,
+							  GEN_INT (4))));
+      high_part[mem] = gen_frame_mem (SImode,
+				      gen_rtx_POST_MODIFY (Pmode, sub_mem,
+							   gen_rtx_PLUS (Pmode,
+							   sub_mem,
+							   GEN_INT (-12))));
+    }
+  else if (GET_CODE (sub_mem) == POST_MODIFY)
+    {
+      /* Memory format is (post_modify (reg) (plus (reg) (const))),
+	 so that extract (reg) from the post_modify pattern.  */
+      rtx post_mem = XEXP (sub_mem, 0);
+
+      /* Extract (const) from the (post_modify (reg) (plus (reg) (const)))
+	 pattern.  */
+
+      rtx plus_op = XEXP (sub_mem, 1);
+      rtx post_val = XEXP (plus_op, 1);
+
+      /* Generate low_part and high_part memory format:
+	   low_part:  (post_modify ((reg) (plus (reg) (const)))
+	   high_part: ((plus (reg) (const 4))) */
+      low_part[mem] = gen_frame_mem (SImode,
+				     gen_rtx_POST_MODIFY (Pmode, post_mem,
+							  gen_rtx_PLUS (Pmode,
+							  post_mem,
+							  post_val)));
+      high_part[mem] = gen_frame_mem (SImode, plus_constant (Pmode,
+							     post_mem,
+							     4));
+    }
+  else
+    {
+      /* memory format: (symbol_ref), (const), (reg + const_int).  */
+      low_part[mem] = adjust_address (mem_rtx, SImode, 0);
+      high_part[mem] = adjust_address (mem_rtx, SImode, 4);
+    }
+
+  /* After reload completed, we have dependent issue by low part register and
+     high part memory. i.e. we cannot split a sequence
+     like:
+	load $r0, [$r0]
+     split to
+	lw  $r0, [$r0]
+	lwi $r1, [$r0 + 4]
+     swap position
+	lwi $r1, [$r0 + 4]
+	lw  $r0, [$r0]
+     For store instruction we don't have a problem.
+
+     When memory format is [post_modify], we need to emit high part instruction,
+     before low part instruction.
+     example:
+       load $r0, [$r2], post_val
+     split to
+       load $r1, [$r2 + 4]
+       load $r0, [$r2], post_val.  */
+  if ((load_p && reg_overlap_mentioned_p (low_part[0], high_part[1]))
+      || GET_CODE (sub_mem) == POST_MODIFY)
+    {
+      operands[2] = high_part[0];
+      operands[3] = high_part[1];
+      operands[4] = low_part[0];
+      operands[5] = low_part[1];
+    }
+  else
+    {
+      operands[2] = low_part[0];
+      operands[3] = low_part[1];
+      operands[4] = high_part[0];
+      operands[5] = high_part[1];
+    }
+}
+
 /* Return true X is need use long call.  */
 bool
 nds32_long_call_p (rtx symbol)
diff --git a/gcc/config/nds32/nds32-opts.h b/gcc/config/nds32/nds32-opts.h
index 77429abeead..642ec2867c5 100644
--- a/gcc/config/nds32/nds32-opts.h
+++ b/gcc/config/nds32/nds32-opts.h
@@ -29,7 +29,9 @@ enum nds32_arch_type
 {
   ARCH_V2,
   ARCH_V3,
-  ARCH_V3M
+  ARCH_V3M,
+  ARCH_V3F,
+  ARCH_V3S
 };
 
 /* The code model defines the address generation strategy.  */
@@ -46,4 +48,24 @@ enum nds32_cpu_type
   CPU_N9
 };
 
+/* Which ABI to use.  */
+enum abi_type
+{
+  NDS32_ABI_V2,
+  NDS32_ABI_V2_FP_PLUS
+};
+
+/* The various FPU number of registers.  */
+enum float_reg_number
+{
+  NDS32_CONFIG_FPU_0,
+  NDS32_CONFIG_FPU_1,
+  NDS32_CONFIG_FPU_2,
+  NDS32_CONFIG_FPU_3,
+  NDS32_CONFIG_FPU_4,
+  NDS32_CONFIG_FPU_5,
+  NDS32_CONFIG_FPU_6,
+  NDS32_CONFIG_FPU_7
+};
+
 #endif
diff --git a/gcc/config/nds32/nds32-predicates.c b/gcc/config/nds32/nds32-predicates.c
index c313efcb831..5e01430c8e3 100644
--- a/gcc/config/nds32/nds32-predicates.c
+++ b/gcc/config/nds32/nds32-predicates.c
@@ -448,4 +448,71 @@ nds32_symbol_load_store_p (rtx_insn *insn)
 
   return false;
 }
+
+/* Return true if OP is a valid memory operand for FPU loads and stores.  */
+bool
+nds32_float_mem_operand_p (rtx op)
+{
+  machine_mode mode = GET_MODE (op);
+  rtx addr = XEXP (op, 0);
+
+  /* [symbol], [const] and [lo_sum] memory are not supported.  */
+  if (GET_CODE (addr) == SYMBOL_REF
+      || GET_CODE (addr) == CONST
+      || GET_CODE (addr) == LO_SUM)
+    return false;
+
+  if (GET_CODE (addr) == PLUS)
+    {
+      if (GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
+	return false;
+
+      /* Restrict range (imm12s << 2); only aligned offsets are rejected.  */
+      if (GET_CODE (XEXP (addr, 1)) == CONST_INT)
+	{
+	  if ((mode == SImode || mode == SFmode)
+	      && NDS32_SINGLE_WORD_ALIGN_P (INTVAL (XEXP (addr, 1)))
+	      && !satisfies_constraint_Is14 ( XEXP(addr, 1)))
+	    return false;
+
+	  if ((mode == DImode || mode == DFmode)
+	      && NDS32_DOUBLE_WORD_ALIGN_P (INTVAL (XEXP (addr, 1)))
+	      && !satisfies_constraint_Is14 (XEXP (addr, 1)))
+	    return false;
+	}
+    }
+
+  return true;
+}
+
+int
+nds32_cond_move_p (rtx cmp_rtx)
+{
+  machine_mode cmp0_mode = GET_MODE (XEXP (cmp_rtx, 0));
+  machine_mode cmp1_mode = GET_MODE (XEXP (cmp_rtx, 1));
+  enum rtx_code cond = GET_CODE (cmp_rtx);
+
+  if ((cmp0_mode == DFmode || cmp0_mode == SFmode)
+      && (cmp1_mode == DFmode || cmp1_mode == SFmode)
+      && (cond == ORDERED || cond == UNORDERED))
+    return true;
+  return false;
+}
+
+bool
+nds32_const_double_range_ok_p (rtx op, machine_mode mode,
+			       HOST_WIDE_INT lower, HOST_WIDE_INT upper)
+{
+  if (GET_CODE (op) != CONST_DOUBLE
+      || GET_MODE (op) != mode)
+    return false;
+
+  const REAL_VALUE_TYPE *rv;
+  long val;
+
+  rv = CONST_DOUBLE_REAL_VALUE (op);
+  REAL_VALUE_TO_TARGET_SINGLE (*rv, val);
+
+  return val >= lower && val < upper;
+}
 /* ------------------------------------------------------------------------ */
diff --git a/gcc/config/nds32/nds32-protos.h b/gcc/config/nds32/nds32-protos.h
index 42f8dd9d762..aebec3b0b34 100644
--- a/gcc/config/nds32/nds32-protos.h
+++ b/gcc/config/nds32/nds32-protos.h
@@ -58,6 +58,13 @@ extern void nds32_expand_prologue (void);
 extern void nds32_expand_epilogue (bool);
 extern void nds32_expand_prologue_v3push (void);
 extern void nds32_expand_epilogue_v3pop (bool);
+extern void nds32_emit_push_fpr_callee_saved (int);
+extern void nds32_emit_pop_fpr_callee_saved (int);
+extern void nds32_emit_v3pop_fpr_callee_saved (int);
+
+/* Controlling Debugging Information Format.  */
+
+extern unsigned int nds32_dbx_register_number (unsigned int);
 
 /* ------------------------------------------------------------------------ */
 
@@ -101,6 +108,9 @@ extern int nds32_can_use_btgl_p (int);
 
 extern int nds32_can_use_bitci_p (int);
 
+extern bool nds32_const_double_range_ok_p (rtx, machine_mode,
+					   HOST_WIDE_INT, HOST_WIDE_INT);
+
 /* Auxiliary function for 'Computing the Length of an Insn'.  */
 
 extern int nds32_adjust_insn_length (rtx_insn *, int);
@@ -120,19 +130,30 @@ extern const char *nds32_output_casesi (rtx *);
 
 extern enum nds32_expand_result_type nds32_expand_cbranch (rtx *);
 extern enum nds32_expand_result_type nds32_expand_cstore (rtx *);
+extern void nds32_expand_float_cbranch (rtx *);
+extern void nds32_expand_float_cstore (rtx *);
 
 /* Auxiliary functions for conditional move generation.  */
 
 extern enum nds32_expand_result_type nds32_expand_movcc (rtx *);
+extern void nds32_expand_float_movcc (rtx *);
 
 
 /* Auxiliary functions to identify long-call symbol.  */
 extern bool nds32_long_call_p (rtx);
 
+/* Auxiliary functions to identify conditional move comparison operand.  */
+
+extern int nds32_cond_move_p (rtx);
+
 /* Auxiliary functions to identify 16 bit addresing mode.  */
 
 extern enum nds32_16bit_address_type nds32_mem_format (rtx);
 
+/* Auxiliary functions to identify floating-point addressing mode.  */
+
+extern bool nds32_float_mem_operand_p (rtx);
+
 /* Auxiliary functions to output assembly code.  */
 
 extern const char *nds32_output_16bit_store (rtx *, int);
@@ -140,8 +161,11 @@ extern const char *nds32_output_16bit_load (rtx *, int);
 extern const char *nds32_output_32bit_store (rtx *, int);
 extern const char *nds32_output_32bit_load (rtx *, int);
 extern const char *nds32_output_32bit_load_s (rtx *, int);
+extern const char *nds32_output_float_load(rtx *);
+extern const char *nds32_output_float_store(rtx *);
 extern const char *nds32_output_smw_single_word (rtx *);
 extern const char *nds32_output_lmw_single_word (rtx *);
+extern const char *nds32_output_double (rtx *, bool);
 extern const char *nds32_output_cbranchsi4_equality_zero (rtx_insn *, rtx *);
 extern const char *nds32_output_cbranchsi4_equality_reg (rtx_insn *, rtx *);
 extern const char *nds32_output_cbranchsi4_equality_reg_or_const_int (rtx_insn *,
@@ -154,6 +178,10 @@ extern const char *nds32_output_cbranchsi4_greater_less_zero (rtx_insn *, rtx *)
 extern const char *nds32_output_stack_push (rtx);
 extern const char *nds32_output_stack_pop (rtx);
 
+/* Auxiliary functions to split double word RTX pattern.  */
+
+extern void nds32_spilt_doubleword (rtx *, bool);
+
 /* Auxiliary functions to split large constant RTX pattern.  */
 
 extern void nds32_expand_constant (machine_mode,
@@ -190,6 +218,8 @@ extern int nds32_address_cost_impl (rtx, machine_mode, addr_space_t, bool);
 /* Auxiliary functions for pre-define marco.  */
 extern void nds32_cpu_cpp_builtins(struct cpp_reader *);
 
+extern bool nds32_split_double_word_load_store_p (rtx *,bool);
+
 /* Functions for create nds32 specific optimization pass.  */
 extern rtl_opt_pass *make_pass_nds32_relax_opt (gcc::context *);
 
diff --git a/gcc/config/nds32/nds32.c b/gcc/config/nds32/nds32.c
index eedf6f56df1..1070b474d8f 100644
--- a/gcc/config/nds32/nds32.c
+++ b/gcc/config/nds32/nds32.c
@@ -218,6 +218,10 @@ nds32_compute_stack_frame (void)
   cfun->machine->callee_saved_gpr_regs_size = 0;
   cfun->machine->callee_saved_first_gpr_regno = SP_REGNUM;
   cfun->machine->callee_saved_last_gpr_regno  = SP_REGNUM;
+  cfun->machine->callee_saved_fpr_regs_size = 0;
+  cfun->machine->callee_saved_first_fpr_regno = SP_REGNUM;
+  cfun->machine->callee_saved_last_fpr_regno  = SP_REGNUM;
+
   /* Currently, there is no need to check $r28~$r31
      because we will save them in another way.  */
   for (r = 0; r < 28; r++)
@@ -235,6 +239,35 @@ nds32_compute_stack_frame (void)
 	}
     }
 
+  /* Recording fpu callee-saved register.  */
+  if (TARGET_HARD_FLOAT)
+    {
+      for (r = NDS32_FIRST_FPR_REGNUM; r < NDS32_LAST_FPR_REGNUM; r++)
+	{
+	  if (NDS32_REQUIRED_CALLEE_SAVED_P (r))
+	    {
+	      /* Mark the first required callee-saved register.  */
+	      if (cfun->machine->callee_saved_first_fpr_regno == SP_REGNUM)
+		{
+		  /* Make the first callee-saved number even,
+		     because we use doubleword access, and this way
+		     promises 8-byte alignment.  */
+		  if (!NDS32_FPR_REGNO_OK_FOR_DOUBLE (r))
+		    cfun->machine->callee_saved_first_fpr_regno = r - 1;
+		  else
+		    cfun->machine->callee_saved_first_fpr_regno = r;
+		}
+	      cfun->machine->callee_saved_last_fpr_regno = r;
+	    }
+	}
+
+      /* Make the last callee-saved register number odd,
+	 so that the count of callee-saved registers is even.  */
+      int last_fpr = cfun->machine->callee_saved_last_fpr_regno;
+      if (NDS32_FPR_REGNO_OK_FOR_DOUBLE (last_fpr))
+	cfun->machine->callee_saved_last_fpr_regno++;
+    }
+
   /* Check if this function can omit prologue/epilogue code fragment.
      If there is 'naked' attribute in this function,
      we can set 'naked_p' flag to indicate that
@@ -252,6 +285,8 @@ nds32_compute_stack_frame (void)
   if (lookup_attribute ("naked", DECL_ATTRIBUTES (current_function_decl))
       || (cfun->machine->callee_saved_first_gpr_regno == SP_REGNUM
 	  && cfun->machine->callee_saved_last_gpr_regno == SP_REGNUM
+	  && cfun->machine->callee_saved_first_fpr_regno == SP_REGNUM
+	  && cfun->machine->callee_saved_last_fpr_regno == SP_REGNUM
 	  && !df_regs_ever_live_p (FP_REGNUM)
 	  && !df_regs_ever_live_p (LP_REGNUM)
 	  && cfun->machine->local_size == 0))
@@ -340,7 +375,8 @@ nds32_compute_stack_frame (void)
 
   int sp_adjust = cfun->machine->local_size
 		  + cfun->machine->out_args_size
-		  + cfun->machine->callee_saved_area_gpr_padding_bytes;
+		  + cfun->machine->callee_saved_area_gpr_padding_bytes
+		  + cfun->machine->callee_saved_fpr_regs_size;
 
   if (!v3pushpop_p
       && sp_adjust == 0
@@ -385,6 +421,18 @@ nds32_compute_stack_frame (void)
 	       + 1);
     }
 
+  if (TARGET_HARD_FLOAT)
+    {
+      /* Compute size of callee saved floating-point registers.  */
+      if (cfun->machine->callee_saved_last_fpr_regno != SP_REGNUM)
+	{
+	  cfun->machine->callee_saved_fpr_regs_size
+	   = 4 * (cfun->machine->callee_saved_last_fpr_regno
+		  - cfun->machine->callee_saved_first_fpr_regno
+		  + 1);
+	}
+    }
+
   /* Important: We need to make sure that
 		(fp_size + gp_size + lp_size + callee_saved_gpr_regs_size)
 		is 8-byte alignment.
@@ -1130,45 +1178,61 @@ nds32_legitimate_index_p (machine_mode outer_mode,
 
     case CONST_INT:
       /* The alignment of the integer value is determined by 'outer_mode'.  */
-      if (GET_MODE_SIZE (outer_mode) == 1)
+      switch (GET_MODE_SIZE (outer_mode))
 	{
+	case 1:
 	  /* Further check if the value is legal for the 'outer_mode'.  */
-	  if (!satisfies_constraint_Is15 (index))
-	    return false;
+	  if (satisfies_constraint_Is15 (index))
+	    return true;
+	  break;
 
-	  /* Pass all test, the value is valid, return true.  */
-	  return true;
-	}
-      if (GET_MODE_SIZE (outer_mode) == 2
-	  && NDS32_HALF_WORD_ALIGN_P (INTVAL (index)))
-	{
+	case 2:
 	  /* Further check if the value is legal for the 'outer_mode'.  */
-	  if (!satisfies_constraint_Is16 (index))
-	    return false;
+	  if (satisfies_constraint_Is16 (index))
+	    {
+	      /* Make sure address is half word alignment.  */
+	      if (NDS32_HALF_WORD_ALIGN_P (INTVAL (index)))
+		return true;
+	    }
+	  break;
 
-	  /* Pass all test, the value is valid, return true.  */
-	  return true;
-	}
-      if (GET_MODE_SIZE (outer_mode) == 4
-	  && NDS32_SINGLE_WORD_ALIGN_P (INTVAL (index)))
-	{
+	case 4:
 	  /* Further check if the value is legal for the 'outer_mode'.  */
-	  if (!satisfies_constraint_Is17 (index))
-	    return false;
+	  if (satisfies_constraint_Is17 (index))
+	    {
+	      if ((TARGET_FPU_SINGLE || TARGET_FPU_DOUBLE))
+		{
+		  if (!satisfies_constraint_Is14 (index))
+		    return false;
+		}
+
+	      /* Make sure address is word alignment.  */
+	      if (NDS32_SINGLE_WORD_ALIGN_P (INTVAL (index)))
+		return true;
+	    }
+	  break;
 
-	  /* Pass all test, the value is valid, return true.  */
-	  return true;
-	}
-      if (GET_MODE_SIZE (outer_mode) == 8
-	  && NDS32_SINGLE_WORD_ALIGN_P (INTVAL (index)))
-	{
-	  /* Further check if the value is legal for the 'outer_mode'.  */
-	  if (!satisfies_constraint_Is17 (gen_int_mode (INTVAL (index) + 4,
-							SImode)))
-	    return false;
+	case 8:
+	  if (satisfies_constraint_Is17 (gen_int_mode (INTVAL (index) + 4,
+						       SImode)))
+	    {
+	      if ((TARGET_FPU_SINGLE || TARGET_FPU_DOUBLE))
+		{
+		  if (!satisfies_constraint_Is14 (index))
+		    return false;
+		}
+
+	      /* Make sure address is word alignment.
+		Currently we do not have 64-bit load/store yet,
+		so we will use two 32-bit load/store instructions to do
+		memory access and they are single word alignment.  */
+	      if (NDS32_SINGLE_WORD_ALIGN_P (INTVAL (index)))
+		return true;
+	    }
+	  break;
 
-	  /* Pass all test, the value is valid, return true.  */
-	  return true;
+	default:
+	  return false;
 	}
 
       return false;
@@ -1262,6 +1326,39 @@ nds32_register_passes (void)
 /* ------------------------------------------------------------------------ */
 
 /* PART 3: Implement target hook stuff definitions.  */
+
+
+/* Register Usage.  */
+
+static void
+nds32_conditional_register_usage (void)
+{
+  int fpr;
+
+  /* FPRs are only touched when the hard float ABI or an FPU extension
+     is enabled; otherwise the register tables are left as they are.  */
+  if (!TARGET_HARD_FLOAT && !(TARGET_FPU_SINGLE || TARGET_FPU_DOUBLE))
+    return;
+
+  for (fpr = NDS32_FIRST_FPR_REGNUM; fpr <= NDS32_LAST_FPR_REGNUM; fpr++)
+    {
+      int nth = fpr - NDS32_FIRST_FPR_REGNUM;
+
+      /* Every FPR becomes available to the register allocator.  */
+      fixed_regs[fpr] = 0;
+
+      /* Only the hard float ABI decides which FPRs are call-used: the
+	 argument registers and the FPRs at positions 22 to 47.  */
+      if (!TARGET_HARD_FLOAT)
+	continue;
+
+      if (nth < NDS32_MAX_FPR_REGS_FOR_ARGS || (nth >= 22 && nth < 48))
+	call_used_regs[fpr] = 1;
+      else
+	call_used_regs[fpr] = 0;
+    }
+}
+
 
 /* Register Classes.  */
 
@@ -1298,6 +1395,22 @@ nds32_register_priority (int hard_regno)
     }
 }
 
+static bool
+nds32_can_change_mode_class (machine_mode from,
+			     machine_mode to,
+			     reg_class_t rclass)
+{
+  bool has_fpu = TARGET_FPU_SINGLE || TARGET_FPU_DOUBLE;
+  bool same_size = GET_MODE_SIZE (from) == GET_MODE_SIZE (to);
+
+  if (!has_fpu || same_size)
+    return true;
+
+  /* Don't spill a double-precision register to two single-precision
+     registers: a size change is refused for classes overlapping FP_REGS.  */
+  return !reg_classes_intersect_p (rclass, FP_REGS);
+}
+
 
 /* Stack Layout and Calling Conventions.  */
 
@@ -1422,8 +1535,28 @@ nds32_function_arg (cumulative_args_t ca, machine_mode mode,
      are different.  */
   if (TARGET_HARD_FLOAT)
     {
-      /* Currently we have not implemented hard float yet.  */
-      gcc_unreachable ();
+      /* For TARGET_HARD_FLOAT calling convention, we use GPR and FPR
+	 to pass argument.  We have to further check TYPE and MODE so
+	 that we can determine which kind of register we shall use.  */
+
+      /* Note that we need to pass argument entirely in registers under
+	 hard float abi.  */
+      if (GET_MODE_CLASS (mode) == MODE_FLOAT
+	  && NDS32_ARG_ENTIRE_IN_FPR_REG_P (cum->fpr_offset, mode, type))
+	{
+	  /* Pick up the next available FPR register number.  */
+	  regno
+	    = NDS32_AVAILABLE_REGNUM_FOR_FPR_ARG (cum->fpr_offset, mode, type);
+	  return gen_rtx_REG (mode, regno);
+	}
+      else if (GET_MODE_CLASS (mode) != MODE_FLOAT
+	       && NDS32_ARG_ENTIRE_IN_GPR_REG_P (cum->gpr_offset, mode, type))
+	{
+	  /* Pick up the next available GPR register number.  */
+	  regno
+	    = NDS32_AVAILABLE_REGNUM_FOR_GPR_ARG (cum->gpr_offset, mode, type);
+	  return gen_rtx_REG (mode, regno);
+	}
     }
   else
     {
@@ -1506,23 +1639,20 @@ static void
 nds32_function_arg_advance (cumulative_args_t ca, machine_mode mode,
 			    const_tree type, bool named)
 {
-  machine_mode sub_mode;
   CUMULATIVE_ARGS *cum = get_cumulative_args (ca);
 
   if (named)
     {
       /* We need to further check TYPE and MODE so that we can determine
-         which kind of register we shall advance.  */
-      if (type && TREE_CODE (type) == COMPLEX_TYPE)
-	sub_mode = TYPE_MODE (TREE_TYPE (type));
-      else
-	sub_mode = mode;
+	 which kind of register we shall advance.  */
 
       /* Under hard float abi, we may advance FPR registers.  */
-      if (TARGET_HARD_FLOAT && GET_MODE_CLASS (sub_mode) == MODE_FLOAT)
+      if (TARGET_HARD_FLOAT && GET_MODE_CLASS (mode) == MODE_FLOAT)
 	{
-	  /* Currently we have not implemented hard float yet.  */
-	  gcc_unreachable ();
+	  cum->fpr_offset
+	    = NDS32_AVAILABLE_REGNUM_FOR_FPR_ARG (cum->fpr_offset, mode, type)
+	      - NDS32_FPR_ARG_FIRST_REGNUM
+	      + NDS32_NEED_N_REGS_FOR_ARG (mode, type);
 	}
       else
 	{
@@ -1569,22 +1699,62 @@ nds32_function_value (const_tree ret_type,
   mode = TYPE_MODE (ret_type);
   unsignedp = TYPE_UNSIGNED (ret_type);
 
-  mode = promote_mode (ret_type, mode, &unsignedp);
+  if (INTEGRAL_TYPE_P (ret_type))
+    mode = promote_mode (ret_type, mode, &unsignedp);
 
-  return gen_rtx_REG (mode, NDS32_GPR_RET_FIRST_REGNUM);
+  if (TARGET_HARD_FLOAT && (mode == SFmode || mode == DFmode))
+    return gen_rtx_REG (mode, NDS32_FPR_RET_FIRST_REGNUM);
+  else
+    return gen_rtx_REG (mode, NDS32_GPR_RET_FIRST_REGNUM);
 }
 
 static rtx
 nds32_libcall_value (machine_mode mode,
 		     const_rtx fun ATTRIBUTE_UNUSED)
 {
+  /* Hard float ABI returns SFmode/DFmode values in an FPR, not a GPR.  */
+  if (TARGET_HARD_FLOAT && (mode == SFmode || mode == DFmode))
+    return gen_rtx_REG (mode, NDS32_FPR_RET_FIRST_REGNUM);
   return gen_rtx_REG (mode, NDS32_GPR_RET_FIRST_REGNUM);
 }
 
 static bool
 nds32_function_value_regno_p (const unsigned int regno)
 {
-  return (regno == NDS32_GPR_RET_FIRST_REGNUM);
+  /* The GPR return register can always hold a function value; under the
+     hard float ABI the FPR return register can hold one as well.  */
+  if (regno == NDS32_GPR_RET_FIRST_REGNUM)
+    return true;
+
+  return TARGET_HARD_FLOAT && regno == NDS32_FPR_RET_FIRST_REGNUM;
+}
+
+/* -- How Large Values Are Returned.  */
+
+static bool
+nds32_return_in_memory (const_tree type,
+			const_tree fntype ATTRIBUTE_UNUSED)
+{
+  /* int_size_in_bytes yields -1 when the size can vary or is larger
+     than an integer, so a negative size is treated as oversized.  */
+  HOST_WIDE_INT nbytes = int_size_in_bytes (type);
+  bool oversized_p = (nbytes < 0 || nbytes > 2 * UNITS_PER_WORD);
+
+  /* A value that can be held within two registers is never returned
+     in memory.  */
+  if (!oversized_p)
+    return false;
+
+  /* An oversized COMPLEX_TYPE value is returned in memory.  */
+  if (TREE_CODE (type) == COMPLEX_TYPE)
+    return true;
+
+  /* An oversized BLKmode value is returned in memory as well.  */
+  if (TYPE_MODE (type) == BLKmode)
+    return true;
+
+  /* For other cases, having result in memory is unnecessary.  */
+  return false;
 }
 
 /* -- Function Entry and Exit.  */
@@ -1614,7 +1784,7 @@ nds32_asm_function_prologue (FILE *file)
   /* Use df_regs_ever_live_p() to detect if the register
      is ever used in the current function.  */
   fprintf (file, "\t! registers ever_live: ");
-  for (r = 0; r < 32; r++)
+  for (r = 0; r < 65; r++)
     {
       if (df_regs_ever_live_p (r))
 	fprintf (file, "%s, ", reg_names[r]);
@@ -2013,6 +2183,43 @@ nds32_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
 static bool
 nds32_legitimate_address_p (machine_mode mode, rtx x, bool strict)
 {
+  if (TARGET_FPU_SINGLE || TARGET_FPU_DOUBLE)
+    {
+     /* When using floating-point instructions,
+	we don't allow 'addr' to be [symbol_ref], [CONST] pattern.  */
+      if ((mode == DFmode || mode == SFmode)
+	  && (GET_CODE (x) == SYMBOL_REF
+	  || GET_CODE(x) == CONST))
+	return false;
+
+      /* Allow [post_modify] addressing mode, when using FPU instructions.  */
+      if (GET_CODE (x) == POST_MODIFY
+	  && mode == DFmode)
+	{
+	  if (GET_CODE (XEXP (x, 0)) == REG
+	      && GET_CODE (XEXP (x, 1)) == PLUS)
+	    {
+	      rtx plus_op = XEXP (x, 1);
+	      rtx op0 = XEXP (plus_op, 0);
+	      rtx op1 = XEXP (plus_op, 1);
+
+	      if (nds32_address_register_rtx_p (op0, strict)
+		  && CONST_INT_P (op1))
+		{
+		  if (satisfies_constraint_Is14 (op1))
+		    {
+		      /* Make sure address is word alignment.
+			Currently we do not have 64-bit load/store yet,
+			so we will use two 32-bit load/store instructions to do
+			memory access and they are single word alignment.  */
+		      if (NDS32_SINGLE_WORD_ALIGN_P (INTVAL (op1)))
+			return true;
+		    }
+		}
+	    }
+	}
+    }
+
   /* For (mem:DI addr) or (mem:DF addr) case,
      we only allow 'addr' to be [reg], [symbol_ref],
 				[const], or [reg + const_int] pattern.  */
@@ -2031,6 +2238,13 @@ nds32_legitimate_address_p (machine_mode mode, rtx x, bool strict)
 	    return true;
 	}
 
+      /* Allow [post_inc] and [post_dec] addressing mode.  */
+      if (GET_CODE (x) == POST_INC || GET_CODE (x) == POST_DEC)
+	{
+	  if (nds32_address_register_rtx_p (XEXP (x, 0), strict))
+	    return true;
+	}
+
       /* Now check [reg], [symbol_ref], and [const].  */
       if (GET_CODE (x) != REG
 	  && GET_CODE (x) != SYMBOL_REF
@@ -2216,10 +2430,13 @@ nds32_register_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
 			  reg_class_t from,
 			  reg_class_t to)
 {
-  if (from == HIGH_REGS || to == HIGH_REGS)
-    return 6;
-
-  return 2;
+  if ((from == FP_REGS && to != FP_REGS)
+      || (from != FP_REGS && to == FP_REGS))
+    return 9;
+  else if (from == HIGH_REGS || to == HIGH_REGS)
+    return optimize_size ? 6 : 2;
+  else
+    return 2;
 }
 
 static int
@@ -2305,7 +2522,10 @@ nds32_asm_file_start (void)
 
   /* Tell assembler which ABI we are using.  */
   fprintf (asm_out_file, "\t! ABI version\n");
-  fprintf (asm_out_file, "\t.abi_2\n");
+  if (TARGET_HARD_FLOAT)
+    fprintf (asm_out_file, "\t.abi_2fp_plus\n");
+  else
+    fprintf (asm_out_file, "\t.abi_2\n");
 
   /* Tell assembler that this asm code is generated by compiler.  */
   fprintf (asm_out_file, "\t! This asm file is generated by compiler\n");
@@ -2334,6 +2554,15 @@ nds32_asm_file_start (void)
   fprintf (asm_out_file, "\t! Endian setting\t: %s\n",
 			 ((TARGET_BIG_ENDIAN) ? "big-endian"
 					      : "little-endian"));
+  fprintf (asm_out_file, "\t! Use SP floating-point instruction\t: %s\n",
+			 ((TARGET_FPU_SINGLE) ? "Yes"
+					      : "No"));
+  fprintf (asm_out_file, "\t! Use DP floating-point instruction\t: %s\n",
+			 ((TARGET_FPU_DOUBLE) ? "Yes"
+					      : "No"));
+  fprintf (asm_out_file, "\t! ABI version\t\t: %s\n",
+			 ((TARGET_HARD_FLOAT) ? "ABI2FP+"
+					      : "ABI2"));
 
   fprintf (asm_out_file, "\t! ------------------------------------\n");
 
@@ -2404,6 +2633,10 @@ nds32_print_operand (FILE *stream, rtx x, int code)
 {
   HOST_WIDE_INT one_position;
   HOST_WIDE_INT zero_position;
+  bool pick_lsb_p = false;
+  bool pick_msb_p = false;
+  int regno;
+
   int op_value;
 
   switch (code)
@@ -2440,6 +2673,20 @@ nds32_print_operand (FILE *stream, rtx x, int code)
       /* No need to handle following process, so return immediately.  */
       return;
 
+    case 'L':
+      /* X is supposed to be REG rtx.  */
+      gcc_assert (REG_P (x));
+      /* Claim that we are going to pick LSB part of X.  */
+      pick_lsb_p = true;
+      break;
+
+    case 'H':
+      /* X is supposed to be REG rtx.  */
+      gcc_assert (REG_P (x));
+      /* Claim that we are going to pick MSB part of X.  */
+      pick_msb_p = true;
+      break;
+
     case 'V':
       /* 'x' is supposed to be CONST_INT, get the value.  */
       gcc_assert (CONST_INT_P (x));
@@ -2489,6 +2736,38 @@ nds32_print_operand (FILE *stream, rtx x, int code)
       break;
 
     case REG:
+      /* Print a Double-precision register name.  */
+      if ((GET_MODE (x) == DImode || GET_MODE (x) == DFmode)
+	  && NDS32_IS_FPR_REGNUM (REGNO (x)))
+	{
+	  regno = REGNO (x);
+	  if (!NDS32_FPR_REGNO_OK_FOR_DOUBLE (regno))
+	    {
+	      output_operand_lossage ("invalid operand for code '%c'", code);
+	      break;
+	    }
+	  fprintf (stream, "$fd%d", (regno - NDS32_FIRST_FPR_REGNUM) >> 1);
+	  break;
+	}
+
+      /* Print LSB or MSB part of register pair if the
+	 constraint modifier 'L' or 'H' is specified.  */
+      if ((GET_MODE (x) == DImode || GET_MODE (x) == DFmode)
+	  && NDS32_IS_GPR_REGNUM (REGNO (x)))
+	{
+	  if ((pick_lsb_p && WORDS_BIG_ENDIAN)
+	      || (pick_msb_p && !WORDS_BIG_ENDIAN))
+	    {
+	      /* If we would like to print out LSB register under big-endian,
+		 or print out MSB register under little-endian, we need to
+		 increase register number.  */
+	      regno = REGNO (x);
+	      regno++;
+	      fputs (reg_names[regno], stream);
+	      break;
+	    }
+	}
+
       /* Forbid using static chain register ($r16)
 	 on reduced-set registers configuration.  */
       if (TARGET_REDUCED_REGS
@@ -2503,6 +2782,33 @@ nds32_print_operand (FILE *stream, rtx x, int code)
       output_address (GET_MODE (x), XEXP (x, 0));
       break;
 
+    case HIGH:
+      if (GET_CODE (XEXP (x, 0)) == CONST_DOUBLE)
+	{
+	  const REAL_VALUE_TYPE *rv;
+	  long val;
+	  gcc_assert (GET_MODE (x) == SFmode);
+
+	  rv = CONST_DOUBLE_REAL_VALUE (XEXP (x, 0));
+	  REAL_VALUE_TO_TARGET_SINGLE (*rv, val);
+
+	  fprintf (stream, "hi20(0x%lx)", val);
+	}
+      else
+	gcc_unreachable ();
+      break;
+
+    case CONST_DOUBLE:
+      const REAL_VALUE_TYPE *rv;
+      long val;
+      gcc_assert (GET_MODE (x) == SFmode);
+
+      rv = CONST_DOUBLE_REAL_VALUE (x);
+      REAL_VALUE_TO_TARGET_SINGLE (*rv, val);
+
+      fprintf (stream, "0x%lx", val);
+      break;
+
     case CODE_LABEL:
     case CONST_INT:
     case CONST:
@@ -2687,6 +2993,84 @@ nds32_print_operand_address (FILE *stream, machine_mode /*mode*/, rtx x)
     }
 }
 
+/* -- Assembler Commands for Exception Regions.  */
+
+/* Describe how the hard register REG is laid out for DWARF.  FPRs get an
+   explicit PARALLEL of word_mode pieces; other registers return NULL_RTX.  */
+static rtx
+nds32_dwarf_register_span (rtx reg)
+{
+  rtx dwarf_high, dwarf_low, dwarf_single;
+  machine_mode mode = GET_MODE (reg);
+  int regno = REGNO (reg);
+
+  /* We need to adjust dwarf register information for floating-point registers
+     rather than using default register number mapping.  */
+  if (regno >= NDS32_FIRST_FPR_REGNUM && regno <= NDS32_LAST_FPR_REGNUM)
+    {
+      if (mode == DFmode || mode == SCmode)
+	{
+	  /* By default, GCC maps increasing register numbers to increasing
+	     memory locations, but paired FPRs in NDS32 target are always
+	     big-endian, i.e.:
+
+	       fd0 :  fs0   fs1
+		     (MSB) (LSB)
+
+	     We must return parallel rtx to represent such layout.  */
+	  dwarf_high = gen_rtx_REG (word_mode, regno);
+	  dwarf_low = gen_rtx_REG (word_mode, regno + 1);
+	  return gen_rtx_PARALLEL (VOIDmode,
+				   gen_rtvec (2, dwarf_low, dwarf_high));
+	}
+      else if (mode == DCmode)
+	{
+	  /* A DCmode value spans four consecutive FPRs: the real part
+	     in the first pair and the imaginary part in the second
+	     pair, each pair laid out like a DFmode value above.  */
+	  rtx dwarf_high_re = gen_rtx_REG (word_mode, regno);
+	  rtx dwarf_low_re = gen_rtx_REG (word_mode, regno + 1);
+	  rtx dwarf_high_im = gen_rtx_REG (word_mode, regno + 2);
+	  rtx dwarf_low_im = gen_rtx_REG (word_mode, regno + 3);
+	  return gen_rtx_PARALLEL (VOIDmode,
+				   gen_rtvec (4, dwarf_low_re, dwarf_high_re,
+						 dwarf_low_im, dwarf_high_im));
+	}
+      else if (mode == SFmode || mode == SImode)
+	{
+	  /* Create new dwarf information with adjusted register number.  */
+	  dwarf_single = gen_rtx_REG (word_mode, regno);
+	  return gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, dwarf_single));
+	}
+      else
+	{
+	  /* We should not be here.  */
+	  gcc_unreachable ();
+	}
+    }
+
+  return NULL_RTX;
+}
+
+/* Map internal gcc register numbers to DWARF2 register numbers.  */
+
+unsigned int
+nds32_dbx_register_number (unsigned int regno)
+{
+  /* The nds32 port in GDB keeps a mapping between dwarf register
+     numbers and displayed register names.  For backward compatibility
+     with previous toolchains, GDB still has four registers (d0.l,
+     d0.h, d1.l, and d1.h) between the GPRs and the FPRs, while the
+     compiler does not count those four registers in its register
+     number table.  FPR numbers are therefore shifted by 4 when dwarf
+     information is created.  Hopefully this workaround can be
+     discarded in the future.  */
+  if (!NDS32_IS_FPR_REGNUM (regno))
+    return regno;
+
+  return regno + 4;
+}
+
 
 /* Defining target-specific uses of __attribute__.  */
 
@@ -2894,6 +3278,16 @@ nds32_option_override (void)
       target_flags &= ~MASK_V3PUSH;
     }
 
+  if (TARGET_HARD_FLOAT && !(TARGET_FPU_SINGLE || TARGET_FPU_DOUBLE))
+    {
+      if (nds32_arch_option == ARCH_V3S || nds32_arch_option == ARCH_V3F)
+	error ("Disable FPU ISA, "
+	       "the ABI option must be enable '-mfloat-abi=soft'");
+      else
+	error ("'-mabi=2fp+' option only support when FPU available, "
+	       "must be enable '-mext-fpu-sp' or '-mext-fpu-dp'");
+    }
+
   /* Currently, we don't support PIC code generation yet.  */
   if (flag_pic)
     sorry ("position-independent code not supported");
@@ -2954,6 +3348,11 @@ nds32_cpu_cpp_builtins(struct cpp_reader *pfile)
   builtin_define ("__nds32__");
   builtin_define ("__NDS32__");
 
+  if (TARGET_HARD_FLOAT)
+    builtin_define ("__NDS32_ABI_2FP_PLUS__");
+  else
+    builtin_define ("__NDS32_ABI_2__");
+
   if (TARGET_ISA_V2)
     builtin_define ("__NDS32_ISA_V2__");
   if (TARGET_ISA_V3)
@@ -2961,6 +3360,40 @@ nds32_cpu_cpp_builtins(struct cpp_reader *pfile)
   if (TARGET_ISA_V3M)
     builtin_define ("__NDS32_ISA_V3M__");
 
+  if (TARGET_FPU_SINGLE)
+    builtin_define ("__NDS32_EXT_FPU_SP__");
+  if (TARGET_FPU_DOUBLE)
+    builtin_define ("__NDS32_EXT_FPU_DP__");
+
+  if (TARGET_EXT_FPU_FMA)
+    builtin_define ("__NDS32_EXT_FPU_FMA__");
+  if (NDS32_EXT_FPU_DOT_E)
+    builtin_define ("__NDS32_EXT_FPU_DOT_E__");
+  if (TARGET_FPU_SINGLE || TARGET_FPU_DOUBLE)
+    {
+      switch (nds32_fp_regnum)
+	{
+	case 0:
+	case 4:
+	  builtin_define ("__NDS32_EXT_FPU_CONFIG_0__");
+	  break;
+	case 1:
+	case 5:
+	  builtin_define ("__NDS32_EXT_FPU_CONFIG_1__");
+	  break;
+	case 2:
+	case 6:
+	  builtin_define ("__NDS32_EXT_FPU_CONFIG_2__");
+	  break;
+	case 3:
+	case 7:
+	  builtin_define ("__NDS32_EXT_FPU_CONFIG_3__");
+	  break;
+	default:
+	  abort ();
+	}
+    }
+
   if (TARGET_BIG_ENDIAN)
     builtin_define ("__NDS32_EB__");
   else
@@ -2988,6 +3421,12 @@ nds32_cpu_cpp_builtins(struct cpp_reader *pfile)
 
   builtin_assert ("cpu=nds32");
   builtin_assert ("machine=nds32");
+
+  if (TARGET_HARD_FLOAT)
+    builtin_define ("__NDS32_ABI_2FP_PLUS");
+  else
+    builtin_define ("__NDS32_ABI_2");
+
 #undef builtin_define
 #undef builtin_assert
 }
@@ -3026,16 +3465,38 @@ nds32_adjust_reg_alloc_order (void)
 
 /* -- How Values Fit in Registers.  */
 
+static unsigned
+nds32_hard_regno_nregs (unsigned regno ATTRIBUTE_UNUSED, machine_mode mode)
+{
+  /* Words needed to hold MODE, counting a partial word as a whole one.  */
+  return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
+}
+
 /* Implement TARGET_HARD_REGNO_MODE_OK.  */
 
 static bool
 nds32_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
 {
-  /* Restrict double-word quantities to even register pairs.  */
-  if (targetm.hard_regno_nregs (regno, mode) == 1
-      || !((regno) & 1))
+  /* Pseudo registers start at FIRST_PSEUDO_REGISTER, so include it.  */
+  if (regno >= FIRST_PSEUDO_REGISTER)
     return true;
 
+  if ((TARGET_FPU_SINGLE || TARGET_FPU_DOUBLE) && NDS32_IS_FPR_REGNUM (regno))
+    {
+      /* An extended FPR only accepts DFmode.  */
+      if (NDS32_IS_EXT_FPR_REGNUM (regno))
+	return (NDS32_FPR_REGNO_OK_FOR_DOUBLE (regno) && (mode == DFmode));
+      else if (mode == SFmode || mode == SImode)
+	return NDS32_FPR_REGNO_OK_FOR_SINGLE (regno);
+      else if (mode == DFmode)
+	return NDS32_FPR_REGNO_OK_FOR_DOUBLE (regno);
+      return false;
+    }
+
+  /* Restrict double-word quantities to even register pairs.  */
+  if (regno <= NDS32_LAST_GPR_REGNUM)
+    return (targetm.hard_regno_nregs (regno, mode) == 1 || !((regno) & 1));
+
   return false;
 }
 
@@ -3048,10 +3509,22 @@ nds32_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
 static bool
 nds32_modes_tieable_p (machine_mode mode1, machine_mode mode2)
 {
-  return (GET_MODE_CLASS (mode1) == MODE_INT
-	  && GET_MODE_CLASS (mode2) == MODE_INT
-	  && GET_MODE_SIZE (mode1) <= UNITS_PER_WORD
-	  && GET_MODE_SIZE (mode2) <= UNITS_PER_WORD);
+  /* Integer modes no wider than a word can always be tied.  */
+  if (GET_MODE_CLASS (mode1) == MODE_INT
+      && GET_MODE_CLASS (mode2) == MODE_INT
+      && GET_MODE_SIZE (mode1) <= UNITS_PER_WORD
+      && GET_MODE_SIZE (mode2) <= UNITS_PER_WORD)
+    return true;
+
+  /* Otherwise modes of different sizes are never tieable.  */
+  if (GET_MODE_SIZE (mode1) != GET_MODE_SIZE (mode2))
+    return false;
+
+  /* Same-sized modes are tieable, unless DFmode is involved while only
+     the single-precision FPU is available.  */
+  bool sp_only_fpu = TARGET_FPU_SINGLE && !TARGET_FPU_DOUBLE;
+
+  return !(sp_only_fpu && (mode1 == DFmode || mode2 == DFmode));
 }
 
 #undef TARGET_MODES_TIEABLE_P
@@ -3077,7 +3550,14 @@ nds32_regno_reg_class (int regno)
   else if (regno >= 20 && regno <= 31)
     return HIGH_REGS;
   else if (regno == 32 || regno == 33)
-    return FRAME_REGS;
+    {
+      /* $SFP and $AP are FRAME_REGS in fact.  However, IRA does not
+	 know how to allocate registers for $SFP and $AP, so just tell IRA
+	 they are GENERAL_REGS; the ARM port uses this hack too.  */
+      return GENERAL_REGS;
+    }
+  else if (regno >= 34 && regno <= 97)
+    return FP_REGS;
   else
     return NO_REGS;
 }
@@ -3123,6 +3603,7 @@ nds32_initial_elimination_offset (unsigned int from_reg, unsigned int to_reg)
 		+ cfun->machine->lp_size
 		+ cfun->machine->callee_saved_gpr_regs_size
 		+ cfun->machine->callee_saved_area_gpr_padding_bytes
+		+ cfun->machine->callee_saved_fpr_regs_size
 		+ cfun->machine->local_size
 		+ cfun->machine->out_args_size);
     }
@@ -3143,7 +3624,8 @@ nds32_initial_elimination_offset (unsigned int from_reg, unsigned int to_reg)
 		       + cfun->machine->gp_size
 		       + cfun->machine->lp_size
 		       + cfun->machine->callee_saved_gpr_regs_size
-		       + cfun->machine->callee_saved_area_gpr_padding_bytes);
+		       + cfun->machine->callee_saved_area_gpr_padding_bytes
+		       + cfun->machine->callee_saved_fpr_regs_size);
     }
   else
     {
@@ -3162,10 +3644,11 @@ nds32_init_cumulative_args (CUMULATIVE_ARGS *cum,
 			    tree fndecl ATTRIBUTE_UNUSED,
 			    int n_named_args ATTRIBUTE_UNUSED)
 {
-  /* Initial available registers
-     (in offset, corresponding to NDS32_GPR_ARG_FIRST_REGNUM)
+  /* Initial available registers.  The values are offset against
+     NDS32_GPR_ARG_FIRST_REGNUM and NDS32_FPR_ARG_FIRST_REGNUM
      for passing arguments.  */
   cum->gpr_offset = 0;
+  cum->fpr_offset = 0;
 }
 
 /* -- Function Entry and Exit.  */
@@ -3248,17 +3731,79 @@ nds32_expand_prologue (void)
 			       fp_adjust);
     }
 
-  /* Adjust $sp = $sp - local_size - out_args_size
-                      - callee_saved_area_gpr_padding_bytes.  */
-  sp_adjust = cfun->machine->local_size
-	      + cfun->machine->out_args_size
-	      + cfun->machine->callee_saved_area_gpr_padding_bytes;
-  /* sp_adjust value may be out of range of the addi instruction,
-     create alternative add behavior with TA_REGNUM if necessary,
-     using NEGATIVE value to tell that we are decreasing address.  */
-  nds32_emit_adjust_frame (stack_pointer_rtx,
-			   stack_pointer_rtx,
-			   -1 * sp_adjust);
+  /* Save fpu registers.  */
+  if (cfun->machine->callee_saved_first_fpr_regno != SP_REGNUM)
+    {
+      /* When $sp is moved to the bottom of the stack, we need to check
+	 whether the offset is within the range of the FPU instruction.  */
+      int fpr_offset = cfun->machine->local_size
+		       + cfun->machine->out_args_size
+		       + cfun->machine->callee_saved_fpr_regs_size;
+
+      /* Check FPU instruction offset imm14s.  */
+      if (!satisfies_constraint_Is14 (GEN_INT (fpr_offset)))
+	{
+	  int fpr_space = cfun->machine->callee_saved_area_gpr_padding_bytes
+			  + cfun->machine->callee_saved_fpr_regs_size;
+
+	  /* Save fpu registers, need to allocate stack space
+	     for fpu callee registers.  And now $sp position
+	     on callee saved fpr registers.  */
+	  nds32_emit_adjust_frame (stack_pointer_rtx,
+				   stack_pointer_rtx,
+				   -1 * fpr_space);
+
+	  /* Emit fpu store instruction, using [$sp + offset] store
+	     fpu registers.  */
+	  nds32_emit_push_fpr_callee_saved (0);
+
+          /* Adjust $sp = $sp - local_size - out_args_size.  */
+	  sp_adjust = cfun->machine->local_size
+		      + cfun->machine->out_args_size;
+
+	  /* Allocate stack space for local size and out args size.  */
+	  nds32_emit_adjust_frame (stack_pointer_rtx,
+				   stack_pointer_rtx,
+				   -1 * sp_adjust);
+	}
+      else
+	{
+	  /* Offset range in Is14, so $sp moved to bottom of stack.  */
+
+          /* Adjust $sp = $sp - local_size - out_args_size
+			      - callee_saved_area_gpr_padding_bytes
+			      - callee_saved_fpr_regs_size.  */
+	  sp_adjust = cfun->machine->local_size
+		      + cfun->machine->out_args_size
+		      + cfun->machine->callee_saved_area_gpr_padding_bytes
+		      + cfun->machine->callee_saved_fpr_regs_size;
+
+	  nds32_emit_adjust_frame (stack_pointer_rtx,
+				   stack_pointer_rtx,
+				   -1 * sp_adjust);
+
+	  /* Emit fpu store instruction, using [$sp + offset] store
+	     fpu registers.  */
+	  int fpr_position = cfun->machine->out_args_size
+			     + cfun->machine->local_size;
+	  nds32_emit_push_fpr_callee_saved (fpr_position);
+	}
+    }
+  else
+    {
+      /* Adjust $sp = $sp - local_size - out_args_size
+			  - callee_saved_area_gpr_padding_bytes.  */
+      sp_adjust = cfun->machine->local_size
+		  + cfun->machine->out_args_size
+		  + cfun->machine->callee_saved_area_gpr_padding_bytes;
+
+      /* sp_adjust value may be out of range of the addi instruction,
+	 create alternative add behavior with TA_REGNUM if necessary,
+	 using NEGATIVE value to tell that we are decreasing address.  */
+      nds32_emit_adjust_frame (stack_pointer_rtx,
+			       stack_pointer_rtx,
+			       -1 * sp_adjust);
+    }
 
   /* Prevent the instruction scheduler from
      moving instructions across the boundary.  */
@@ -3310,39 +3855,93 @@ nds32_expand_epilogue (bool sibcall_p)
 
   if (frame_pointer_needed)
     {
-      /* adjust $sp = $fp - ($fp size) - ($gp size) - ($lp size)
-			  - (4 * callee-saved-registers)
-	 Note: No need to adjust
-	       cfun->machine->callee_saved_area_gpr_padding_bytes,
-	       because we want to adjust stack pointer
-	       to the position for pop instruction.  */
-      sp_adjust = cfun->machine->fp_size
-		  + cfun->machine->gp_size
-		  + cfun->machine->lp_size
-		  + cfun->machine->callee_saved_gpr_regs_size;
+      /* Restore fpu registers.  */
+      if (cfun->machine->callee_saved_first_fpr_regno != SP_REGNUM)
+	{
+	  int gpr_padding = cfun->machine->callee_saved_area_gpr_padding_bytes;
+
+	  /* adjust $sp = $fp - ($fp size) - ($gp size) - ($lp size)
+			      - (4 * callee-saved-registers)
+			      - (4 * exception-handling-data-registers)
+			      - (4 * callee-saved-gpr-registers padding byte)
+			      - (4 * callee-saved-fpr-registers)
+	     Note:  we want to adjust stack pointer
+		    to the position for callee-saved fpr register,
+		    And restore fpu register use .bi instruction to adjust $sp
+		    from callee-saved fpr register to pop instruction.  */
+	  sp_adjust = cfun->machine->fp_size
+		      + cfun->machine->gp_size
+		      + cfun->machine->lp_size
+		      + cfun->machine->callee_saved_gpr_regs_size
+		      + cfun->machine->callee_saved_area_gpr_padding_bytes
+		      + cfun->machine->callee_saved_fpr_regs_size;
 
-      nds32_emit_adjust_frame (stack_pointer_rtx,
-			       hard_frame_pointer_rtx,
-			       -1 * sp_adjust);
+	  nds32_emit_adjust_frame (stack_pointer_rtx,
+				   hard_frame_pointer_rtx,
+				   -1 * sp_adjust);
+
+	  /* Emit fpu load instruction, using .bi instruction
+	     load fpu registers.  */
+	  nds32_emit_pop_fpr_callee_saved (gpr_padding);
+	}
+      else
+	{
+	  /* adjust $sp = $fp - ($fp size) - ($gp size) - ($lp size)
+			      - (4 * callee-saved-registers)
+			      - (4 * exception-handling-data-registers)
+	     Note: No need to adjust
+		   cfun->machine->callee_saved_area_gpr_padding_bytes,
+		   because we want to adjust stack pointer
+		   to the position for pop instruction.  */
+	  sp_adjust = cfun->machine->fp_size
+		      + cfun->machine->gp_size
+		      + cfun->machine->lp_size
+		      + cfun->machine->callee_saved_gpr_regs_size;
+
+	  nds32_emit_adjust_frame (stack_pointer_rtx,
+				   hard_frame_pointer_rtx,
+				   -1 * sp_adjust);
+	}
     }
   else
     {
-      /* If frame pointer is NOT needed,
-	 we cannot calculate the sp adjustment from frame pointer.
-	 Instead, we calculate the adjustment by local_size,
-	 out_args_size, and callee_saved_area_gpr_padding_bytes.
-	 Notice that such sp adjustment value may be out of range,
-	 so we have to deal with it as well.  */
+      /* Restore fpu registers.  */
+      if (cfun->machine->callee_saved_first_fpr_regno != SP_REGNUM)
+	{
+	  int gpr_padding = cfun->machine->callee_saved_area_gpr_padding_bytes;
 
-      /* Adjust $sp = $sp + local_size + out_args_size
-	                  + callee_saved_area_gpr_padding_bytes.  */
-      sp_adjust = cfun->machine->local_size
-		  + cfun->machine->out_args_size
-		  + cfun->machine->callee_saved_area_gpr_padding_bytes;
+	  /* Adjust $sp = $sp + local_size + out_args_size.  */
+	  sp_adjust = cfun->machine->local_size
+		      + cfun->machine->out_args_size;
 
-      nds32_emit_adjust_frame (stack_pointer_rtx,
-			       stack_pointer_rtx,
-			       sp_adjust);
+	  nds32_emit_adjust_frame (stack_pointer_rtx,
+				   stack_pointer_rtx,
+				   sp_adjust);
+
+	  /* Emit fpu load instruction, using .bi instruction
+	     load fpu registers, and adjust $sp from callee-saved fpr register
+	     to callee-saved gpr register.  */
+	  nds32_emit_pop_fpr_callee_saved (gpr_padding);
+	}
+      else
+	{
+	  /* If frame pointer is NOT needed,
+	     we cannot calculate the sp adjustment from frame pointer.
+	     Instead, we calculate the adjustment by local_size,
+	     out_args_size, and callee_saved_area_gpr_padding_bytes.
+	     Notice that such sp adjustment value may be out of range,
+	     so we have to deal with it as well.  */
+
+	  /* Adjust $sp = $sp + local_size + out_args_size
+			      + callee_saved_area_gpr_padding_bytes.  */
+	  sp_adjust = cfun->machine->local_size
+		      + cfun->machine->out_args_size
+		      + cfun->machine->callee_saved_area_gpr_padding_bytes;
+
+	  nds32_emit_adjust_frame (stack_pointer_rtx,
+				   stack_pointer_rtx,
+				   sp_adjust);
+	}
     }
 
   /* Get callee_first_regno and callee_last_regno.  */
@@ -3389,6 +3988,7 @@ nds32_expand_prologue_v3push (void)
 {
   int fp_adjust;
   int sp_adjust;
+  int fpr_space = 0;
   unsigned Rb, Re;
 
   /* Compute and setup stack frame size.
@@ -3411,7 +4011,8 @@ nds32_expand_prologue_v3push (void)
      where imm8u has to be 8-byte alignment.  */
   sp_adjust = cfun->machine->local_size
 	      + cfun->machine->out_args_size
-	      + cfun->machine->callee_saved_area_gpr_padding_bytes;
+	      + cfun->machine->callee_saved_area_gpr_padding_bytes
+	      + cfun->machine->callee_saved_fpr_regs_size;
 
   if (satisfies_constraint_Iu08 (GEN_INT (sp_adjust))
       && NDS32_DOUBLE_WORD_ALIGN_P (sp_adjust))
@@ -3421,6 +4022,18 @@ nds32_expand_prologue_v3push (void)
       /* nds32_emit_stack_v3push(last_regno, sp_adjust),
 	 the pattern 'stack_v3push' is implemented in nds32.md.  */
       nds32_emit_stack_v3push (Rb, Re, sp_adjust);
+
+      /* Save fpu registers.  */
+      if (cfun->machine->callee_saved_first_fpr_regno != SP_REGNUM)
+	{
+	  /* Calculate fpr position.  */
+	  int fpr_position = cfun->machine->local_size
+			     + cfun->machine->out_args_size;
+	  /* Emit fpu store instruction, using [$sp + offset] store
+	     fpu registers.  */
+	  nds32_emit_push_fpr_callee_saved (fpr_position);
+	}
+
       /* Check frame_pointer_needed to see
 	 if we shall emit fp adjustment instruction.  */
       if (frame_pointer_needed)
@@ -3448,12 +4061,26 @@ nds32_expand_prologue_v3push (void)
     }
   else
     {
-      /* We have to use 'push25 Re,0' and
-	 expand one more instruction to adjust $sp later.  */
+      if (cfun->machine->callee_saved_first_fpr_regno != SP_REGNUM)
+	{
+	  /* Calculate fpr space.  */
+	  fpr_space = cfun->machine->callee_saved_area_gpr_padding_bytes
+		      + cfun->machine->callee_saved_fpr_regs_size;
+
+	  /* We have to use 'push25 Re, fpr_space', to pre-allocate
+	     callee saved fpr registers space.  */
+	  nds32_emit_stack_v3push (Rb, Re, fpr_space);
+	  nds32_emit_push_fpr_callee_saved (0);
+	}
+      else
+	{
+	  /* We have to use 'push25 Re,0' and
+	     expand one more instruction to adjust $sp later.  */
 
-      /* nds32_emit_stack_v3push(last_regno, sp_adjust),
-	 the pattern 'stack_v3push' is implemented in nds32.md.  */
-      nds32_emit_stack_v3push (Rb, Re, 0);
+	  /* nds32_emit_stack_v3push(last_regno, sp_adjust),
+	     the pattern 'stack_v3push' is implemented in nds32.md.  */
+	  nds32_emit_stack_v3push (Rb, Re, 0);
+	}
 
       /* Check frame_pointer_needed to see
 	 if we shall emit fp adjustment instruction.  */
@@ -3472,11 +4099,27 @@ nds32_expand_prologue_v3push (void)
 		      + cfun->machine->lp_size
 		      + cfun->machine->callee_saved_gpr_regs_size;
 
+	  if (cfun->machine->callee_saved_first_fpr_regno != SP_REGNUM)
+	    {
+	      /* We use 'push25 Re, fpr_space', the $sp is
+		 on callee saved fpr position, so need to consider
+		 fpr space.  */
+	      fp_adjust = fp_adjust + fpr_space;
+	    }
+
 	  nds32_emit_adjust_frame (hard_frame_pointer_rtx,
 				   stack_pointer_rtx,
 				   fp_adjust);
 	}
 
+      if (cfun->machine->callee_saved_first_fpr_regno != SP_REGNUM)
+	{
+	  /* We use 'push25 Re, fpr_space',
+	     the $sp is on callee saved fpr position,
+	     no need to consider fpr space.  */
+	  sp_adjust = sp_adjust - fpr_space;
+	}
+
       /* Because we use 'push25 Re,0',
 	 we need to expand one more instruction to adjust $sp.
 	 using NEGATIVE value to tell that we are decreasing address.  */
@@ -3524,7 +4167,8 @@ nds32_expand_epilogue_v3pop (bool sibcall_p)
      where imm8u has to be 8-byte alignment.  */
   sp_adjust = cfun->machine->local_size
 	      + cfun->machine->out_args_size
-	      + cfun->machine->callee_saved_area_gpr_padding_bytes;
+	      + cfun->machine->callee_saved_area_gpr_padding_bytes
+	      + cfun->machine->callee_saved_fpr_regs_size;
 
   /* We have to consider alloca issue as well.
      If the function does call alloca(), the stack pointer is not fixed.
@@ -3537,6 +4181,16 @@ nds32_expand_epilogue_v3pop (bool sibcall_p)
       && NDS32_DOUBLE_WORD_ALIGN_P (sp_adjust)
       && !cfun->calls_alloca)
     {
+      /* Restore fpu registers.  */
+      if (cfun->machine->callee_saved_first_fpr_regno != SP_REGNUM)
+	{
+	  int fpr_position = cfun->machine->local_size
+			     + cfun->machine->out_args_size;
+	  /* Emit fpu load instruction, using [$sp + offset] restore
+	     fpu registers.  */
+	  nds32_emit_v3pop_fpr_callee_saved (fpr_position);
+	}
+
       /* We can use 'pop25 Re,imm8u'.  */
 
       /* nds32_emit_stack_v3pop(last_regno, sp_adjust),
@@ -3563,9 +4217,29 @@ nds32_expand_epilogue_v3pop (bool sibcall_p)
 		      + cfun->machine->lp_size
 		      + cfun->machine->callee_saved_gpr_regs_size;
 
-	  nds32_emit_adjust_frame (stack_pointer_rtx,
-				   hard_frame_pointer_rtx,
-				   -1 * sp_adjust);
+	  /* Restore fpu registers.  */
+	  if (cfun->machine->callee_saved_first_fpr_regno != SP_REGNUM)
+	    {
+	      /* Set $sp to callee saved fpr position, we need to restore
+		 fpr registers.  */
+	      sp_adjust = sp_adjust
+			  + cfun->machine->callee_saved_area_gpr_padding_bytes
+			  + cfun->machine->callee_saved_fpr_regs_size;
+
+	      nds32_emit_adjust_frame (stack_pointer_rtx,
+				       hard_frame_pointer_rtx,
+				       -1 * sp_adjust);
+
+	      /* Emit fpu load instruction, using [$sp + offset] restore
+		 fpu registers.  */
+	      nds32_emit_v3pop_fpr_callee_saved (0);
+	    }
+	  else
+	    {
+	      nds32_emit_adjust_frame (stack_pointer_rtx,
+				       hard_frame_pointer_rtx,
+				       -1 * sp_adjust);
+	    }
 	}
       else
 	{
@@ -3577,24 +4251,57 @@ nds32_expand_epilogue_v3pop (bool sibcall_p)
 	     so we have to deal with it as well.  */
 
 	  /* Adjust $sp = $sp + local_size + out_args_size
-                              + callee_saved_area_gpr_padding_bytes.  */
+			      + callee_saved_area_gpr_padding_bytes
+			      + callee_saved_fpr_regs_size.  */
 	  sp_adjust = cfun->machine->local_size
 		      + cfun->machine->out_args_size
-		      + cfun->machine->callee_saved_area_gpr_padding_bytes;
-	   /* sp_adjust value may be out of range of the addi instruction,
-	      create alternative add behavior with TA_REGNUM if necessary,
-	      using POSITIVE value to tell that we are increasing
-	      address.  */
-	  nds32_emit_adjust_frame (stack_pointer_rtx,
-				   stack_pointer_rtx,
-				   sp_adjust);
+		      + cfun->machine->callee_saved_area_gpr_padding_bytes
+		      + cfun->machine->callee_saved_fpr_regs_size;
+
+	  /* Restore fpu registers.  */
+	  if (cfun->machine->callee_saved_first_fpr_regno != SP_REGNUM)
+	    {
+	      /* Set $sp to callee saved fpr position, we need to restore
+		 fpr registers.  */
+	      sp_adjust = sp_adjust
+			  - cfun->machine->callee_saved_area_gpr_padding_bytes
+			  - cfun->machine->callee_saved_fpr_regs_size;
+
+	      nds32_emit_adjust_frame (stack_pointer_rtx,
+				       stack_pointer_rtx,
+				       sp_adjust);
+
+	      /* Emit fpu load instruction, using [$sp + offset] restore
+		 fpu registers.  */
+	      nds32_emit_v3pop_fpr_callee_saved (0);
+	    }
+	  else
+	    {
+	       /* sp_adjust value may be out of range of the addi instruction,
+		  create alternative add behavior with TA_REGNUM if necessary,
+		  using POSITIVE value to tell that we are increasing
+		  address.  */
+	      nds32_emit_adjust_frame (stack_pointer_rtx,
+				       stack_pointer_rtx,
+				       sp_adjust);
+	    }
 	}
 
-      /* nds32_emit_stack_v3pop(last_regno, sp_adjust),
-	 the pattern 'stack_v3pop' is implementad in nds32.md.  */
-      nds32_emit_stack_v3pop (Rb, Re, 0);
+      if (cfun->machine->callee_saved_first_fpr_regno != SP_REGNUM)
+	{
+	  /* We have fpr need to restore, so $sp is set on callee saved fpr
+	     position.  And we use 'pop25 Re, fpr_space' to adjust $sp.  */
+	  int fpr_space = cfun->machine->callee_saved_area_gpr_padding_bytes
+			  + cfun->machine->callee_saved_fpr_regs_size;
+	  nds32_emit_stack_v3pop (Rb, Re, fpr_space);
+	}
+      else
+	{
+	  /* nds32_emit_stack_v3pop(last_regno, sp_adjust),
+	     the pattern 'stack_v3pop' is implemented in nds32.md.  */
+	  nds32_emit_stack_v3pop (Rb, Re, 0);
+	}
     }
-
   /* Generate return instruction.  */
   emit_jump_insn (gen_pop25return ());
 }
@@ -3605,11 +4312,26 @@ nds32_expand_epilogue_v3pop (bool sibcall_p)
 int
 nds32_can_use_return_insn (void)
 {
+  int sp_adjust;
+
   /* Prior to reloading, we can't tell how many registers must be saved.
      Thus we can not determine whether this function has null epilogue.  */
   if (!reload_completed)
     return 0;
 
+  sp_adjust = cfun->machine->local_size
+	      + cfun->machine->out_args_size
+	      + cfun->machine->callee_saved_area_gpr_padding_bytes
+	      + cfun->machine->callee_saved_fpr_regs_size;
+  if (!cfun->machine->fp_as_gp_p
+      && satisfies_constraint_Iu08 (GEN_INT (sp_adjust))
+      && NDS32_DOUBLE_WORD_ALIGN_P (sp_adjust)
+      && !cfun->calls_alloca
+      && NDS32_V3PUSH_AVAILABLE_P
+      && !(TARGET_HARD_FLOAT
+	   && (cfun->machine->callee_saved_first_fpr_regno != SP_REGNUM)))
+    return 1;
+
   /* If no stack was created, two conditions must be satisfied:
      1. This is a naked function.
 	So there is no callee-saved, local size, or outgoing size.
@@ -3673,6 +4395,36 @@ nds32_adjust_insn_length (rtx_insn *insn, int length)
   return length;
 }
 
+/* Return true if the double-word load (LOAD_P) or store in OPERANDS is
+   to be split now: its memory operand (operands[1] for a load, operands[0]
+   for a store) fails constraint Da or is volatile, and the pass allows it.  */
+bool
+nds32_split_double_word_load_store_p (rtx *operands, bool load_p)
+{
+  rtx mem = load_p ? operands[1] : operands[0];
+  /* Do split at split2 if -O0 or schedule 2 is not enabled.  */
+  if (optimize == 0 || !flag_schedule_insns_after_reload)
+    return !satisfies_constraint_Da (mem) || MEM_VOLATILE_P (mem);
+
+  /* Otherwise split only in split4/split5, after copy propagation.  */
+  const char *pass_name = current_pass ? current_pass->name : NULL;
+  if (pass_name == NULL
+      || (strcmp (pass_name, "split4") != 0
+	  && strcmp (pass_name, "split5") != 0))
+    return false;
+  return !satisfies_constraint_Da (mem) || MEM_VOLATILE_P (mem);
+}
+
+/* Implement TARGET_USE_BLOCKS_FOR_CONSTANT_P.  Return true for SFmode and
+   DFmode constants when TARGET_FPU_SINGLE or TARGET_FPU_DOUBLE is set.  */
+static bool
+nds32_use_blocks_for_constant_p (machine_mode mode,
+				 const_rtx x ATTRIBUTE_UNUSED)
+{
+  bool fpu_enabled_p = TARGET_FPU_SINGLE || TARGET_FPU_DOUBLE;
+  bool float_mode_p = (mode == SFmode || mode == DFmode);
+  return fpu_enabled_p && float_mode_p;
+}
 
 /* Return align 2 (log base 2) if the next instruction of LABEL is 4 byte.  */
 int
@@ -3720,10 +4472,16 @@ nds32_target_alignment (rtx_insn *label)
 
 /* -- Basic Characteristics of Registers.  */
 
+#undef TARGET_CONDITIONAL_REGISTER_USAGE
+#define TARGET_CONDITIONAL_REGISTER_USAGE nds32_conditional_register_usage
+
 /* -- Order of Allocation of Registers.  */
 
 /* -- How Values Fit in Registers.  */
 
+#undef TARGET_HARD_REGNO_NREGS
+#define TARGET_HARD_REGNO_NREGS nds32_hard_regno_nregs
+
 /* -- Handling Leaf Functions.  */
 
 /* -- Registers That Form a Stack.  */
@@ -3737,6 +4495,9 @@ nds32_target_alignment (rtx_insn *label)
 #undef TARGET_REGISTER_PRIORITY
 #define TARGET_REGISTER_PRIORITY nds32_register_priority
 
+#undef TARGET_CAN_CHANGE_MODE_CLASS
+#define TARGET_CAN_CHANGE_MODE_CLASS nds32_can_change_mode_class
+
 
 /* Obsolete Macros for Defining Constraints.  */
 
@@ -3788,6 +4549,9 @@ nds32_target_alignment (rtx_insn *label)
 
 /* -- How Large Values Are Returned.  */
 
+#undef TARGET_RETURN_IN_MEMORY
+#define TARGET_RETURN_IN_MEMORY nds32_return_in_memory
+
 /* -- Caller-Saves Register Allocation.  */
 
 /* -- Function Entry and Exit.  */
@@ -3931,6 +4695,9 @@ nds32_target_alignment (rtx_insn *label)
 
 /* -- Assembler Commands for Exception Regions.  */
 
+#undef TARGET_DWARF_REGISTER_SPAN
+#define TARGET_DWARF_REGISTER_SPAN nds32_dwarf_register_span
+
 /* -- Assembler Commands for Alignment.  */
 
 
@@ -4002,6 +4769,10 @@ nds32_target_alignment (rtx_insn *label)
 #undef TARGET_EXPAND_BUILTIN
 #define TARGET_EXPAND_BUILTIN nds32_expand_builtin
 
+
+#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
+#define TARGET_USE_BLOCKS_FOR_CONSTANT_P nds32_use_blocks_for_constant_p
+
 
 /* ------------------------------------------------------------------------ */
 
diff --git a/gcc/config/nds32/nds32.h b/gcc/config/nds32/nds32.h
index 02b99a17b95..c1d389c7d61 100644
--- a/gcc/config/nds32/nds32.h
+++ b/gcc/config/nds32/nds32.h
@@ -130,12 +130,15 @@ enum nds32_16bit_address_type
 
 /* Define maximum numbers of registers for passing arguments.  */
 #define NDS32_MAX_GPR_REGS_FOR_ARGS 6
+#define NDS32_MAX_FPR_REGS_FOR_ARGS 6
 
 /* Define the register number for first argument.  */
 #define NDS32_GPR_ARG_FIRST_REGNUM 0
+#define NDS32_FPR_ARG_FIRST_REGNUM 34
 
 /* Define the register number for return value.  */
 #define NDS32_GPR_RET_FIRST_REGNUM 0
+#define NDS32_FPR_RET_FIRST_REGNUM 34
 
 /* Define the first integer register number.  */
 #define NDS32_FIRST_GPR_REGNUM 0
@@ -146,6 +149,44 @@ enum nds32_16bit_address_type
 #define NDS32_LAST_CALLEE_SAVE_GPR_REGNUM \
   (TARGET_REDUCED_REGS ? 10 : 14)
 
+/* Define the number of floating-point registers.  */
+#define NDS32_FLOAT_REGISTER_NUMBER                           \
+ (((nds32_fp_regnum == NDS32_CONFIG_FPU_0)              \
+   || (nds32_fp_regnum == NDS32_CONFIG_FPU_4)) ? 8      \
+  : ((nds32_fp_regnum == NDS32_CONFIG_FPU_1)            \
+    || (nds32_fp_regnum == NDS32_CONFIG_FPU_5)) ? 16    \
+  : ((nds32_fp_regnum == NDS32_CONFIG_FPU_2)            \
+    || (nds32_fp_regnum == NDS32_CONFIG_FPU_6)) ? 32    \
+  : ((nds32_fp_regnum == NDS32_CONFIG_FPU_3)            \
+    || (nds32_fp_regnum == NDS32_CONFIG_FPU_7)) ? 64    \
+  : 32)
+
+#define NDS32_EXT_FPU_DOT_E (nds32_fp_regnum >= 4)
+
+/* Define the first floating-point register number.  */
+#define NDS32_FIRST_FPR_REGNUM 34
+/* Define the last floating-point register number.  */
+#define NDS32_LAST_FPR_REGNUM \
+  (NDS32_FIRST_FPR_REGNUM + NDS32_FLOAT_REGISTER_NUMBER - 1)
+
+
+#define NDS32_IS_EXT_FPR_REGNUM(regno) \
+  (((regno) >= NDS32_FIRST_FPR_REGNUM + 32) \
+   && ((regno) < NDS32_FIRST_FPR_REGNUM + 64))
+
+#define NDS32_IS_FPR_REGNUM(regno) \
+  (((regno) >= NDS32_FIRST_FPR_REGNUM) \
+   && ((regno) <= NDS32_LAST_FPR_REGNUM))
+
+#define NDS32_FPR_REGNO_OK_FOR_SINGLE(regno) \
+  ((regno) <= NDS32_LAST_FPR_REGNUM)
+
+#define NDS32_FPR_REGNO_OK_FOR_DOUBLE(regno) \
+  ((((regno) - NDS32_FIRST_FPR_REGNUM) & 1) == 0)
+
+#define NDS32_IS_GPR_REGNUM(regno) \
+  (((regno) <= NDS32_LAST_GPR_REGNUM))
+
 /* Define double word alignment bits.  */
 #define NDS32_DOUBLE_WORD_ALIGNMENT 64
 
@@ -189,7 +230,14 @@ enum nds32_16bit_address_type
       : ((reg_offset) + NDS32_GPR_ARG_FIRST_REGNUM))               \
    : ((reg_offset) + NDS32_GPR_ARG_FIRST_REGNUM))
 
-/* This macro is to check if there are still available registers
+#define NDS32_AVAILABLE_REGNUM_FOR_FPR_ARG(reg_offset, mode, type) \
+  ((NDS32_NEED_N_REGS_FOR_ARG (mode, type) > 1)                    \
+   ? ((NDS32_MODE_TYPE_ALIGN (mode, type) > PARM_BOUNDARY)         \
+      ? (((reg_offset) + NDS32_FPR_ARG_FIRST_REGNUM + 1) & ~1)     \
+      : ((reg_offset) + NDS32_FPR_ARG_FIRST_REGNUM))               \
+   : ((reg_offset) + NDS32_FPR_ARG_FIRST_REGNUM))
+
+/* These two macros are to check if there are still available registers
    for passing argument, which must be entirely in registers.  */
 #define NDS32_ARG_ENTIRE_IN_GPR_REG_P(reg_offset, mode, type)   \
   ((NDS32_AVAILABLE_REGNUM_FOR_GPR_ARG (reg_offset, mode, type) \
@@ -197,13 +245,23 @@ enum nds32_16bit_address_type
    <= (NDS32_GPR_ARG_FIRST_REGNUM                               \
        + NDS32_MAX_GPR_REGS_FOR_ARGS))
 
-/* This macro is to check if there are still available registers
+#define NDS32_ARG_ENTIRE_IN_FPR_REG_P(reg_offset, mode, type)   \
+  ((NDS32_AVAILABLE_REGNUM_FOR_FPR_ARG (reg_offset, mode, type) \
+    + NDS32_NEED_N_REGS_FOR_ARG (mode, type))                   \
+   <= (NDS32_FPR_ARG_FIRST_REGNUM                               \
+       + NDS32_MAX_FPR_REGS_FOR_ARGS))
+
+/* These two macros are to check if there are still available registers
    for passing argument, either entirely in registers or partially
    in registers.  */
 #define NDS32_ARG_PARTIAL_IN_GPR_REG_P(reg_offset, mode, type) \
   (NDS32_AVAILABLE_REGNUM_FOR_GPR_ARG (reg_offset, mode, type) \
    < NDS32_GPR_ARG_FIRST_REGNUM + NDS32_MAX_GPR_REGS_FOR_ARGS)
 
+#define NDS32_ARG_PARTIAL_IN_FPR_REG_P(reg_offset, mode, type) \
+  (NDS32_AVAILABLE_REGNUM_FOR_FPR_ARG (reg_offset, mode, type) \
+   < NDS32_FPR_ARG_FIRST_REGNUM + NDS32_MAX_FPR_REGS_FOR_ARGS)
+
 /* This macro is to check if the register is required to be saved on stack.
    If call_used_regs[regno] == 0, regno is the callee-saved register.
    If df_regs_ever_live_p(regno) == true, it is used in the current function.
@@ -251,6 +309,10 @@ struct GTY(()) machine_function
      callee-saved registers.  */
   int callee_saved_gpr_regs_size;
 
+  /* Number of bytes on the stack for saving floating-point
+     callee-saved registers.  */
+  int callee_saved_fpr_regs_size;
+
   /* The padding bytes in callee-saved area may be required.  */
   int callee_saved_area_gpr_padding_bytes;
 
@@ -259,6 +321,11 @@ struct GTY(()) machine_function
   /* The last required general purpose callee-saved register.  */
   int callee_saved_last_gpr_regno;
 
+  /* The first required floating-point callee-saved register.  */
+  int callee_saved_first_fpr_regno;
+  /* The last required floating-point callee-saved register.  */
+  int callee_saved_last_fpr_regno;
+
   /* The padding bytes in varargs area may be required.  */
   int va_args_area_padding_bytes;
 
@@ -279,6 +346,7 @@ struct GTY(()) machine_function
 typedef struct
 {
   unsigned int gpr_offset;
+  unsigned int fpr_offset;
 } nds32_cumulative_args;
 
 /* ------------------------------------------------------------------------ */
@@ -390,7 +458,11 @@ enum nds32_builtins
 /* ------------------------------------------------------------------------ */
 
 #define TARGET_ISA_V2   (nds32_arch_option == ARCH_V2)
-#define TARGET_ISA_V3   (nds32_arch_option == ARCH_V3)
+
+#define TARGET_ISA_V3 \
+  (nds32_arch_option == ARCH_V3 \
+   || nds32_arch_option == ARCH_V3F \
+   || nds32_arch_option == ARCH_V3S)
 #define TARGET_ISA_V3M  (nds32_arch_option == ARCH_V3M)
 
 #define TARGET_CMODEL_SMALL \
@@ -406,21 +478,60 @@ enum nds32_builtins
    (nds32_cmodel_option == CMODEL_SMALL\
     || nds32_cmodel_option == CMODEL_MEDIUM)
 
-#define TARGET_SOFT_FLOAT 1
-#define TARGET_HARD_FLOAT 0
 
+/* Run-time Target Specification.  */
+#define TARGET_SOFT_FLOAT (nds32_abi == NDS32_ABI_V2)
+/* Use hardware floating point calling convention.  */
+#define TARGET_HARD_FLOAT (nds32_abi == NDS32_ABI_V2_FP_PLUS)
+
+/* Record arch version in TARGET_ARCH_DEFAULT.  0 means soft ABI,
+   1 means hard ABI and using full floating-point instructions,
+   2 means hard ABI and only using single-precision floating-point
+   instructions.  */
+#if TARGET_ARCH_DEFAULT == 1
+#  define TARGET_DEFAULT_ABI NDS32_ABI_V2_FP_PLUS
+#  define TARGET_DEFAULT_FPU_ISA MASK_FPU_DOUBLE | MASK_FPU_SINGLE
+#  define TARGET_DEFAULT_FPU_FMA 0
+#else
+#  if TARGET_ARCH_DEFAULT == 2
+#    define TARGET_DEFAULT_ABI NDS32_ABI_V2_FP_PLUS
+#    define TARGET_DEFAULT_FPU_ISA MASK_FPU_SINGLE
+#    define TARGET_DEFAULT_FPU_FMA 0
+#  else
+#    define TARGET_DEFAULT_ABI NDS32_ABI_V2
+#    define TARGET_DEFAULT_FPU_ISA 0
+#    define TARGET_DEFAULT_FPU_FMA 0
+#  endif
+#endif
+
+#define TARGET_CONFIG_FPU_DEFAULT NDS32_CONFIG_FPU_2
 /* ------------------------------------------------------------------------ */
 
 /* Controlling the Compilation Driver.  */
 
 #define OPTION_DEFAULT_SPECS \
-  {"arch", "%{!march=*:-march=%(VALUE)}" }
+  {"arch", " %{!march=*:-march=%(VALUE)}" \
+	   " %{march=v3f:%{!mfloat-abi=*:-mfloat-abi=hard}" \
+	   " %{!mno-ext-fpu-sp:%{!mext-fpu-sp:-mext-fpu-sp}}" \
+	   " %{!mno-ext-fpu-dp:%{!mext-fpu-dp:-mext-fpu-dp}}}" \
+	   " %{march=v3s:%{!mfloat-abi=*:-mfloat-abi=hard}" \
+	   " %{!mno-ext-fpu-sp:%{!mext-fpu-sp:-mext-fpu-sp}}}" }, \
+  {"float", "%{!mfloat-abi=*:-mfloat-abi=%(VALUE)}" }
 
 #define CC1_SPEC \
   ""
 
 #define ASM_SPEC \
-  " %{mbig-endian:-EB} %{mlittle-endian:-EL}"
+  " %{mbig-endian:-EB} %{mlittle-endian:-EL}" \
+  " %{march=*:-march=%*}" \
+  " %{mabi=*:-mabi=v%*}" \
+  " %{mconfig-fpu=*:-mfpu-freg=%*}" \
+  " %{mext-fpu-mac:-mmac}" \
+  " %{mno-ext-fpu-mac:-mno-mac}" \
+  " %{mext-fpu-sp:-mfpu-sp-ext}" \
+  " %{mno-ext-fpu-sp:-mno-fpu-sp-ext}" \
+  " %{mext-fpu-dp:-mfpu-dp-ext}" \
+  " %{mno-ext-fpu-dp:-mno-fpu-dp-ext}"
 
 /* If user issues -mrelax, we need to pass '--relax' to linker.  */
 #define LINK_SPEC \
@@ -550,8 +661,8 @@ enum nds32_builtins
    $r30 : $lp
    $r31 : $sp
 
-   caller-save registers: $r0 ~ $r5, $r16 ~ $r23
-   callee-save registers: $r6 ~ $r10, $r11 ~ $r14
+   caller-save registers: $r0 ~ $r5, $r16 ~ $r23, $fs0 ~ $fs5, $fs22 ~ $fs47
+   callee-save registers: $r6 ~ $r10, $r11 ~ $r14, $fs6 ~ $fs21, $fs48 ~ $fs63
 
    reserved for assembler : $r15
    reserved for other use : $r24, $r25, $r26, $r27 */
@@ -564,23 +675,23 @@ enum nds32_builtins
       0,   0,   0,   0,   0,   0,   0,   0,     \
   /* r24  r25  r26  r27  r28  r29  r30  r31  */ \
       1,   1,   1,   1,   0,   1,   0,   1,     \
-  /* AP   FP    Reserved.................... */ \
+  /* AP   FP   fs0  fs1  fs2  fs3  fs4  fs5  */ \
       1,   1,   1,   1,   1,   1,   1,   1,     \
-  /* Reserved............................... */ \
+  /* fs6  fs7  fs8  fs9  fs10 fs11 fs12 fs13 */ \
       1,   1,   1,   1,   1,   1,   1,   1,     \
-  /* Reserved............................... */ \
+  /* fs14 fs15 fs16 fs17 fs18 fs19 fs20 fs21 */ \
       1,   1,   1,   1,   1,   1,   1,   1,     \
-  /* Reserved............................... */ \
+  /* fs22 fs23 fs24 fs25 fs26 fs27 fs28 fs29 */ \
       1,   1,   1,   1,   1,   1,   1,   1,     \
-  /* Reserved............................... */ \
+  /* fs30 fs31 fd16      fd17      fd18      */ \
       1,   1,   1,   1,   1,   1,   1,   1,     \
-  /* Reserved............................... */ \
+  /* fd19      fd20      fd21      fd22      */ \
       1,   1,   1,   1,   1,   1,   1,   1,     \
-  /* Reserved............................... */ \
+  /* fd23      fd24      fd25      fd26      */ \
       1,   1,   1,   1,   1,   1,   1,   1,     \
-  /* Reserved............................... */ \
+  /* fd27      fd28      fd29      fd30      */ \
       1,   1,   1,   1,   1,   1,   1,   1,     \
-  /* Reserved............................... */ \
+  /* fd31      Reserved..................... */ \
       1,   1,   1,   1,   1                     \
 }
 
@@ -599,23 +710,23 @@ enum nds32_builtins
       1,   1,   1,   1,   1,   1,   1,   1,     \
   /* r24  r25  r26  r27  r28  r29  r30  r31  */ \
       1,   1,   1,   1,   0,   1,   0,   1,     \
-  /* AP   FP    Reserved.................... */ \
+  /* AP   FP   fs0  fs1  fs2  fs3  fs4  fs5  */ \
       1,   1,   1,   1,   1,   1,   1,   1,     \
-  /* Reserved............................... */ \
+  /* fs6  fs7  fs8  fs9  fs10 fs11 fs12 fs13 */ \
       1,   1,   1,   1,   1,   1,   1,   1,     \
-  /* Reserved............................... */ \
+  /* fs14 fs15 fs16 fs17 fs18 fs19 fs20 fs21 */ \
       1,   1,   1,   1,   1,   1,   1,   1,     \
-  /* Reserved............................... */ \
+  /* fs22 fs23 fs24 fs25 fs26 fs27 fs28 fs29 */ \
       1,   1,   1,   1,   1,   1,   1,   1,     \
-  /* Reserved............................... */ \
+  /* fs30 fs31 fd16      fd17      fd18      */ \
       1,   1,   1,   1,   1,   1,   1,   1,     \
-  /* Reserved............................... */ \
+  /* fd19      fd20      fd21      fd22      */ \
       1,   1,   1,   1,   1,   1,   1,   1,     \
-  /* Reserved............................... */ \
+  /* fd23      fd24      fd25      fd26      */ \
       1,   1,   1,   1,   1,   1,   1,   1,     \
-  /* Reserved............................... */ \
+  /* fd27      fd28      fd29      fd30      */ \
       1,   1,   1,   1,   1,   1,   1,   1,     \
-  /* Reserved............................... */ \
+  /* fd31      Reserved..................... */ \
       1,   1,   1,   1,   1                     \
 }
 
@@ -670,6 +781,7 @@ enum reg_class
   HIGH_REGS,
   GENERAL_REGS,
   FRAME_REGS,
+  FP_REGS,
   ALL_REGS,
   LIM_REG_CLASSES
 };
@@ -689,6 +801,7 @@ enum reg_class
   "HIGH_REGS",          \
   "GENERAL_REGS",       \
   "FRAME_REGS",         \
+  "FP_REGS",            \
   "ALL_REGS"            \
 }
 
@@ -715,6 +828,8 @@ enum reg_class
   {0xffffffff, 0x00000000, 0x00000000, 0x00000000}, \
   /* FRAME_REGS          : 32, 33               */  \
   {0x00000000, 0x00000003, 0x00000000, 0x00000000}, \
+  /* FP_REGS             : 34-97                */  \
+  {0x00000000, 0xfffffffc, 0xffffffff, 0x00000003}, \
   /* ALL_REGS            : 0-100                */  \
   {0xffffffff, 0xffffffff, 0xffffffff, 0x0000001f}  \
 }
@@ -724,13 +839,18 @@ enum reg_class
 #define BASE_REG_CLASS GENERAL_REGS
 #define INDEX_REG_CLASS GENERAL_REGS
 
+#define TEST_REGNO(R, TEST, VALUE) \
+  ((R TEST VALUE) || ((unsigned) reg_renumber[R] TEST VALUE))
+
 /* Return nonzero if it is suitable for use as a
    base register in operand addresses.
    So far, we return nonzero only if "num" is a hard reg
    of the suitable class or a pseudo register which is
    allocated to a suitable hard reg.  */
 #define REGNO_OK_FOR_BASE_P(num) \
-  ((num) < 32 || (unsigned) reg_renumber[num] < 32)
+  (TEST_REGNO (num, <, 32) \
+   || TEST_REGNO (num, ==, FRAME_POINTER_REGNUM) \
+   || TEST_REGNO (num, ==, ARG_POINTER_REGNUM))
 
 /* Return nonzero if it is suitable for use as a
    index register in operand addresses.
@@ -740,7 +860,9 @@ enum reg_class
    The difference between an index register and a base register is that
    the index register may be scaled.  */
 #define REGNO_OK_FOR_INDEX_P(num) \
-  ((num) < 32 || (unsigned) reg_renumber[num] < 32)
+  (TEST_REGNO (num, <, 32) \
+   || TEST_REGNO (num, ==, FRAME_POINTER_REGNUM) \
+   || TEST_REGNO (num, ==, ARG_POINTER_REGNUM))
 
 
 /* Obsolete Macros for Defining Constraints.  */
@@ -768,6 +890,8 @@ enum reg_class
 #define INCOMING_RETURN_ADDR_RTX    gen_rtx_REG (Pmode, LP_REGNUM)
 #define DWARF_FRAME_RETURN_COLUMN   DWARF_FRAME_REGNUM (LP_REGNUM)
 
+#define DBX_REGISTER_NUMBER(REGNO) nds32_dbx_register_number (REGNO)
+
 #define STACK_POINTER_REGNUM SP_REGNUM
 
 #define FRAME_POINTER_REGNUM 33
@@ -796,12 +920,11 @@ enum reg_class
 #define INIT_CUMULATIVE_ARGS(cum, fntype, libname, fndecl, n_named_args) \
   nds32_init_cumulative_args (&cum, fntype, libname, fndecl, n_named_args)
 
-/* The REGNO is an unsigned integer but NDS32_GPR_ARG_FIRST_REGNUM may be 0.
-   We better cast REGNO into signed integer so that we can avoid
-   'comparison of unsigned expression >= 0 is always true' warning.  */
-#define FUNCTION_ARG_REGNO_P(regno)                                        \
-  (((int) regno - NDS32_GPR_ARG_FIRST_REGNUM >= 0)                         \
-   && ((int) regno - NDS32_GPR_ARG_FIRST_REGNUM < NDS32_MAX_GPR_REGS_FOR_ARGS))
+#define FUNCTION_ARG_REGNO_P(regno)                                           \
+ (IN_RANGE ((regno), NDS32_FIRST_GPR_REGNUM, NDS32_MAX_GPR_REGS_FOR_ARGS - 1) \
+  || ((TARGET_FPU_SINGLE || TARGET_FPU_DOUBLE)                                \
+      && IN_RANGE ((regno), NDS32_FPR_ARG_FIRST_REGNUM,                       \
+		   NDS32_FIRST_FPR_REGNUM + NDS32_MAX_FPR_REGS_FOR_ARGS - 1)))
 
 #define DEFAULT_PCC_STRUCT_RETURN 0
 
@@ -944,15 +1067,72 @@ enum reg_class
   "$r8",  "$r9",  "$r10", "$r11", "$r12", "$r13", "$r14", "$ta",  \
   "$r16", "$r17", "$r18", "$r19", "$r20", "$r21", "$r22", "$r23", \
   "$r24", "$r25", "$r26", "$r27", "$fp",  "$gp",  "$lp",  "$sp",  \
-  "$AP",  "$SFP", "NA",   "NA",   "NA",   "NA",   "NA",   "NA",   \
-  "NA",   "NA",   "NA",   "NA",   "NA",   "NA",   "NA",   "NA",   \
-  "NA",   "NA",   "NA",   "NA",   "NA",   "NA",   "NA",   "NA",   \
-  "NA",   "NA",   "NA",   "NA",   "NA",   "NA",   "NA",   "NA",   \
-  "NA",   "NA",   "NA",   "NA",   "NA",   "NA",   "NA",   "NA",   \
-  "NA",   "NA",   "NA",   "NA",   "NA",   "NA",   "NA",   "NA",   \
-  "NA",   "NA",   "NA",   "NA",   "NA",   "NA",   "NA",   "NA",   \
-  "NA",   "NA",   "NA",   "NA",   "NA",   "NA",   "NA",   "NA",   \
-  "NA",   "NA",   "NA",   "NA",   "NA"                            \
+  "$AP",  "$SFP", "$fs0", "$fs1", "$fs2", "$fs3", "$fs4", "$fs5", \
+  "$fs6", "$fs7", "$fs8", "$fs9", "$fs10","$fs11","$fs12","$fs13",\
+  "$fs14","$fs15","$fs16","$fs17","$fs18","$fs19","$fs20","$fs21",\
+  "$fs22","$fs23","$fs24","$fs25","$fs26","$fs27","$fs28","$fs29",\
+  "$fs30","$fs31","$fs32","$fs33","$fs34","$fs35","$fs36","$fs37",\
+  "$fs38","$fs39","$fs40","$fs41","$fs42","$fs43","$fs44","$fs45",\
+  "$fs46","$fs47","$fs48","$fs49","$fs50","$fs51","$fs52","$fs53",\
+  "$fs54","$fs55","$fs56","$fs57","$fs58","$fs59","$fs60","$fs61",\
+  "$fs62","$fs63",   "LB",   "LE",   "LC"                         \
+}
+
+#define ADDITIONAL_REGISTER_NAMES				\
+{								\
+  {"$r15", 15},							\
+  {"$r28", 28},	{"$r29", 29},	{"$r30", 30},	{"$r31", 31},	\
+  {"$a0", 0},	{"$a1", 1},	{"$a2", 2},			\
+  {"$a3", 3},	{"$a4", 4},	{"$a5", 5},			\
+  {"$s0", 6},	{"$s1", 7},	{"$s2", 8},	{"$s3", 9},	\
+  {"$s4", 10},	{"$s5", 11},	{"$s6", 12},	{"$s7", 13},	\
+  {"$s8", 14},							\
+  {"$t0", 16},	{"$t1", 17},	{"$t2", 18},	{"$t3", 19},	\
+  {"$t4", 20},	{"$t5", 21},	{"$t6", 22},	{"$t7", 23},	\
+  {"$t8", 24},	{"$t9", 25},					\
+  {"$p0", 26},	{"$p1", 27},					\
+  {"$h0", 0},	{"$h1", 1},	{"$h2", 2},	{"$h3", 3},	\
+  {"$h4", 4},	{"$h5", 5},	{"$h6", 6},	{"$h7", 7},	\
+  {"$h8", 8},	{"$h9", 9},	{"$h10", 10},	{"$h11", 11},	\
+  {"$h12", 16},	{"$h13", 17},	{"$h14", 18},	{"$h15", 19},	\
+  {"$o0", 0},	{"$o1", 1},	{"$o2", 2},	{"$o3", 3},	\
+  {"$o4", 4},	{"$o5", 5},	{"$o6", 6},	{"$o7", 7},	\
+}
+
+#define OVERLAPPING_REGISTER_NAMES		\
+{						\
+  {"$fd0",  NDS32_FIRST_FPR_REGNUM + 0,  2},	\
+  {"$fd1",  NDS32_FIRST_FPR_REGNUM + 2,  2},	\
+  {"$fd2",  NDS32_FIRST_FPR_REGNUM + 4,  2},	\
+  {"$fd3",  NDS32_FIRST_FPR_REGNUM + 6,  2},	\
+  {"$fd4",  NDS32_FIRST_FPR_REGNUM + 8,  2},	\
+  {"$fd5",  NDS32_FIRST_FPR_REGNUM + 10, 2},	\
+  {"$fd6",  NDS32_FIRST_FPR_REGNUM + 12, 2},	\
+  {"$fd7",  NDS32_FIRST_FPR_REGNUM + 14, 2},	\
+  {"$fd8",  NDS32_FIRST_FPR_REGNUM + 16, 2},	\
+  {"$fd9",  NDS32_FIRST_FPR_REGNUM + 18, 2},	\
+  {"$fd10", NDS32_FIRST_FPR_REGNUM + 20, 2},	\
+  {"$fd11", NDS32_FIRST_FPR_REGNUM + 22, 2},	\
+  {"$fd12", NDS32_FIRST_FPR_REGNUM + 24, 2},	\
+  {"$fd13", NDS32_FIRST_FPR_REGNUM + 26, 2},	\
+  {"$fd14", NDS32_FIRST_FPR_REGNUM + 28, 2},	\
+  {"$fd15", NDS32_FIRST_FPR_REGNUM + 30, 2},	\
+  {"$fd16", NDS32_FIRST_FPR_REGNUM + 32, 2},	\
+  {"$fd17", NDS32_FIRST_FPR_REGNUM + 34, 2},	\
+  {"$fd18", NDS32_FIRST_FPR_REGNUM + 36, 2},	\
+  {"$fd19", NDS32_FIRST_FPR_REGNUM + 38, 2},	\
+  {"$fd20", NDS32_FIRST_FPR_REGNUM + 40, 2},	\
+  {"$fd21", NDS32_FIRST_FPR_REGNUM + 42, 2},	\
+  {"$fd22", NDS32_FIRST_FPR_REGNUM + 44, 2},	\
+  {"$fd23", NDS32_FIRST_FPR_REGNUM + 46, 2},	\
+  {"$fd24", NDS32_FIRST_FPR_REGNUM + 48, 2},	\
+  {"$fd25", NDS32_FIRST_FPR_REGNUM + 50, 2},	\
+  {"$fd26", NDS32_FIRST_FPR_REGNUM + 52, 2},	\
+  {"$fd27", NDS32_FIRST_FPR_REGNUM + 54, 2},	\
+  {"$fd28", NDS32_FIRST_FPR_REGNUM + 56, 2},	\
+  {"$fd29", NDS32_FIRST_FPR_REGNUM + 58, 2},	\
+  {"$fd30", NDS32_FIRST_FPR_REGNUM + 60, 2},	\
+  {"$fd31", NDS32_FIRST_FPR_REGNUM + 62, 2},	\
 }
 
 /* Output normal jump table entry.  */
diff --git a/gcc/config/nds32/nds32.md b/gcc/config/nds32/nds32.md
index dd50f7a62e0..25ae3144a1f 100644
--- a/gcc/config/nds32/nds32.md
+++ b/gcc/config/nds32/nds32.md
@@ -46,13 +46,17 @@
 ;; Include DImode/DFmode operations.
 (include "nds32-doubleword.md")
 
+;; Include floating-point patterns.
+(include "nds32-fpu.md")
+
 ;; Include peephole patterns.
 (include "nds32-peephole2.md")
 
 
 ;; Insn type, it is used to default other attribute values.
 (define_attr "type"
-  "unknown,load,store,load_multiple,store_multiple,alu,alu_shift,mul,mac,div,branch,call,misc"
+  "unknown,load,store,load_multiple,store_multiple,alu,alu_shift,mul,mac,div,branch,call,misc,\
+   falu,fmuls,fmuld,fmacs,fmacd,fdivs,fdivd,fsqrts,fsqrtd,fcmp,fabs,fcpy,fcmov,fmfsr,fmfdr,fmtsr,fmtdr,fload,fstore"
   (const_string "unknown"))
 
 ;; Insn sub-type
@@ -77,7 +81,7 @@
 ;; pe2 : Performance Extension Version 2 Instructions
 ;; se  : String Extension instructions
 (define_attr "feature"
-  "v1,v2,v3m,v3,pe1,pe2,se"
+  "v1,v2,v3m,v3,pe1,pe2,se,fpu"
   (const_string "v1"))
 
 ;; Enabled, which is used to enable/disable insn alternatives.
@@ -107,6 +111,9 @@
 						    (const_string "yes")
 						    (const_string "no"))
 	   (eq_attr "feature" "se")   (if_then_else (match_test "TARGET_EXT_STRING")
+						    (const_string "yes")
+						    (const_string "no"))
+	   (eq_attr "feature" "fpu")  (if_then_else (match_test "TARGET_FPU_SINGLE || TARGET_FPU_DOUBLE")
 						    (const_string "yes")
 						    (const_string "no"))]
 	   (const_string "yes"))))
@@ -193,8 +200,8 @@
 })
 
 (define_insn "*mov<mode>"
-  [(set (match_operand:QIHISI 0 "nonimmediate_operand" "=r, r, U45, U33, U37, U45, m,   l,   l,   l,   d,   d, r,    d,    r,    r,    r")
-	(match_operand:QIHISI 1 "nds32_move_operand"   " r, r,   l,   l,   l,   d, r, U45, U33, U37, U45, Ufe, m, Ip05, Is05, Is20, Ihig"))]
+  [(set (match_operand:QIHISI 0 "nonimmediate_operand" "=r, r,U45,U33,U37,U45, m,  l,  l,  l,  d,  d, r,   d,    r,    r,    r, *f, *f,  r, *f,  Q")
+	(match_operand:QIHISI 1 "nds32_move_operand"   " r, r,  l,  l,  l,  d, r,U45,U33,U37,U45,Ufe, m,Ip05, Is05, Is20, Ihig, *f,  r, *f,  Q, *f"))]
   "register_operand(operands[0], <MODE>mode)
    || register_operand(operands[1], <MODE>mode)"
 {
@@ -227,12 +234,26 @@
       return "movi\t%0, %1";
     case 16:
       return "sethi\t%0, hi20(%1)";
+    case 17:
+      if (TARGET_FPU_SINGLE)
+	return "fcpyss\t%0, %1, %1";
+      else
+	return "#";
+    case 18:
+      return "fmtsr\t%1, %0";
+    case 19:
+      return "fmfsr\t%0, %1";
+    case 20:
+      return nds32_output_float_load (operands);
+    case 21:
+      return nds32_output_float_store (operands);
     default:
       gcc_unreachable ();
     }
 }
-  [(set_attr "type"   "alu,alu,store,store,store,store,store,load,load,load,load,load,load,alu,alu,alu,alu")
-   (set_attr "length" "  2,  4,    2,    2,    2,    2,    4,   2,   2,   2,   2,   2,   4,  2,  2,  4,  4")])
+  [(set_attr "type"    "alu,alu,store,store,store,store,store,load,load,load,load,load,load,alu,alu,alu,alu,fcpy,fmtsr,fmfsr,fload,fstore")
+   (set_attr "length"  "  2,  4,    2,    2,    2,    2,    4,   2,   2,   2,   2,   2,   4,  2,  2,  4,  4,   4,    4,    4,    4,     4")
+   (set_attr "feature" " v1, v1,   v1,   v1,   v1,   v1,   v1,  v1,  v1,  v1,  v1, v3m,  v1, v1, v1, v1, v1, fpu,  fpu,  fpu,  fpu,   fpu")])
 
 
 ;; We use nds32_symbolic_operand to limit that only CONST/SYMBOL_REF/LABEL_REF
@@ -804,6 +825,87 @@
    (set_attr "length"  "  2,  4")
    (set_attr "feature" "v3m, v1")])
 
+(define_expand "negsf2"
+  [(set (match_operand:SF 0 "register_operand" "")
+	(neg:SF (match_operand:SF 1 "register_operand" "")))]
+  ""
+{
+  /* With neither FPU_SINGLE nor EXT_PERF, xor bit 31 of the SImode view.  */
+  if (!(TARGET_FPU_SINGLE || TARGET_EXT_PERF))
+    {
+      rtx si_dst = simplify_gen_subreg (SImode, operands[0], SFmode, 0);
+      rtx si_src = simplify_gen_subreg (SImode, operands[1], SFmode, 0);
+      rtx sign_mask = gen_int_mode (0x80000000, SImode);
+
+      emit_insn (gen_xorsi3 (si_dst, si_src, sign_mask));
+
+      DONE;
+    }
+})
+
+(define_expand "negdf2"
+  [(set (match_operand:DF 0 "register_operand" "")
+	(neg:DF (match_operand:DF 1 "register_operand" "")))]
+  ""
+{ /* No preparation code: the neg:DF template above is emitted as is.  */
+})
+
+;; Negate a DFmode value without double-precision FPU insns: xor the high
+;; SImode word with 0x80000000 and copy the low SImode word unchanged.
+(define_insn_and_split "soft_negdf2"
+  [(set (match_operand:DF 0 "register_operand" "")
+	(neg:DF (match_operand:DF 1 "register_operand" "")))]
+  "!TARGET_FPU_DOUBLE"
+  "#"
+  "!TARGET_FPU_DOUBLE"
+  [(const_int 1)]
+{
+    rtx src = operands[1];
+    rtx dst = operands[0];
+    rtx ori_dst = operands[0];
+    bool need_extra_move_for_dst_p = false;
+
+    /* FPU register can't change mode to SI directly, so we need to create a
+       tmp register to handle it, and FPU register can't do `xor` or btgl.  */
+    if (HARD_REGISTER_P (src)
+	&& TEST_HARD_REG_BIT (reg_class_contents[FP_REGS], REGNO (src)))
+      {
+	rtx tmp = gen_reg_rtx (DFmode);
+	emit_move_insn (tmp, src);
+	src = tmp;
+      }
+
+    if (HARD_REGISTER_P (dst)
+	&& TEST_HARD_REG_BIT (reg_class_contents[FP_REGS], REGNO (dst)))
+      {
+	/* Compute into a fresh pseudo; copied back to ORI_DST at the end.  */
+	need_extra_move_for_dst_p = true;
+	dst = gen_reg_rtx (DFmode);
+      }
+
+    rtx dst_high_part = simplify_gen_subreg (
+			  SImode, dst,
+			  DFmode, subreg_highpart_offset (SImode, DFmode));
+    rtx dst_low_part = simplify_gen_subreg (
+			  SImode, dst,
+			  DFmode, subreg_lowpart_offset (SImode, DFmode));
+    rtx src_high_part = simplify_gen_subreg (
+			  SImode, src,
+			  DFmode, subreg_highpart_offset (SImode, DFmode));
+    rtx src_low_part = simplify_gen_subreg (
+			  SImode, src,
+			  DFmode, subreg_lowpart_offset (SImode, DFmode));
+
+    emit_insn (gen_xorsi3 (dst_high_part, src_high_part,
+			   gen_int_mode (0x80000000, SImode)));
+    emit_move_insn (dst_low_part, src_low_part);
+    if (need_extra_move_for_dst_p)
+      emit_move_insn (ori_dst, dst);
+
+    DONE;
+})
+
+
 ;; ----------------------------------------------------------------------------
 ;; 'ONE_COMPLIMENT' operation
 ;; ----------------------------------------------------------------------------
diff --git a/gcc/config/nds32/nds32.opt b/gcc/config/nds32/nds32.opt
index d6d2f20dbac..bb2bbce18eb 100644
--- a/gcc/config/nds32/nds32.opt
+++ b/gcc/config/nds32/nds32.opt
@@ -32,6 +32,31 @@ EL
 Target RejectNegative Alias(mlittle-endian)
 Generate code in little-endian mode.
 
+
+; ---------------------------------------------------------------
+
+mabi=
+Target RejectNegative Joined Enum(abi_type) Var(nds32_abi) Init(TARGET_DEFAULT_ABI)
+Specify which ABI type to generate code for: 2, 2fp+.
+
+Enum
+Name(abi_type) Type(enum abi_type)
+Known ABIs (for use with the -mabi= option):
+
+EnumValue
+Enum(abi_type) String(2) Value(NDS32_ABI_V2)
+
+EnumValue
+Enum(abi_type) String(2fp+) Value(NDS32_ABI_V2_FP_PLUS)
+
+mfloat-abi=soft
+Target RejectNegative Alias(mabi=, 2)
+Use the soft floating-point ABI; this is an alias for -mabi=2.
+
+mfloat-abi=hard
+Target RejectNegative Alias(mabi=, 2fp+)
+Use the hard floating-point ABI; this is an alias for -mabi=2fp+.
+
 ; ---------------------------------------------------------------
 
 mreduced-regs
@@ -110,6 +135,12 @@ Enum(nds32_arch_type) String(v3) Value(ARCH_V3)
 EnumValue
 Enum(nds32_arch_type) String(v3m) Value(ARCH_V3M)
 
+EnumValue
+Enum(nds32_arch_type) String(v3f) Value(ARCH_V3F)
+
+EnumValue
+Enum(nds32_arch_type) String(v3s) Value(ARCH_V3S)
+
 mcmodel=
 Target RejectNegative Joined Enum(nds32_cmodel_type) Var(nds32_cmodel_option) Init(CMODEL_LARGE)
 Specify the address generation strategy for code model.
@@ -138,6 +169,38 @@ Known cpu types (for use with the -mcpu= option):
 EnumValue
 Enum(nds32_cpu_type) String(n9) Value(CPU_N9)
 
+mconfig-fpu=
+Target RejectNegative Joined Enum(float_reg_number) Var(nds32_fp_regnum) Init(TARGET_CONFIG_FPU_DEFAULT)
+Specify an FPU configuration value from 0 to 7; 0-3 are as the FPU spec says, and 4-7 correspond to 0-3.
+
+Enum
+Name(float_reg_number) Type(enum float_reg_number)
+Known floating-point number of registers (for use with the -mconfig-fpu= option):
+
+EnumValue
+Enum(float_reg_number) String(0) Value(NDS32_CONFIG_FPU_0)
+
+EnumValue
+Enum(float_reg_number) String(1) Value(NDS32_CONFIG_FPU_1)
+
+EnumValue
+Enum(float_reg_number) String(2) Value(NDS32_CONFIG_FPU_2)
+
+EnumValue
+Enum(float_reg_number) String(3) Value(NDS32_CONFIG_FPU_3)
+
+EnumValue
+Enum(float_reg_number) String(4) Value(NDS32_CONFIG_FPU_4)
+
+EnumValue
+Enum(float_reg_number) String(5) Value(NDS32_CONFIG_FPU_5)
+
+EnumValue
+Enum(float_reg_number) String(6) Value(NDS32_CONFIG_FPU_6)
+
+EnumValue
+Enum(float_reg_number) String(7) Value(NDS32_CONFIG_FPU_7)
+
 mctor-dtor
 Target Report
 Enable constructor/destructor feature.
@@ -145,3 +208,15 @@ Enable constructor/destructor feature.
 mrelax
 Target Report
 Guide linker to relax instructions.
+
+mext-fpu-fma
+Target Report Mask(EXT_FPU_FMA)
+Generate floating-point multiply-accumulation instructions.
+
+mext-fpu-sp
+Target Report Mask(FPU_SINGLE)
+Generate single-precision floating-point instructions.
+
+mext-fpu-dp
+Target Report Mask(FPU_DOUBLE)
+Generate double-precision floating-point instructions.
diff --git a/gcc/config/nds32/predicates.md b/gcc/config/nds32/predicates.md
index 066ec3471ea..9eb84685514 100644
--- a/gcc/config/nds32/predicates.md
+++ b/gcc/config/nds32/predicates.md
@@ -24,12 +24,21 @@
 (define_predicate "nds32_greater_less_comparison_operator"
   (match_code "gt,ge,lt,le"))
 
+(define_predicate "nds32_float_comparison_operator" ; incl. unordered codes
+  (match_code "eq,ne,le,lt,ge,gt,ordered,unordered,ungt,unge,unlt,unle"))
+
 (define_predicate "nds32_movecc_comparison_operator"
   (match_code "eq,ne,le,leu,ge,geu"))
 
 (define_special_predicate "nds32_logical_binary_operator"
   (match_code "and,ior,xor"))
 
+(define_special_predicate "nds32_conditional_call_comparison_operator"
+  (match_code "lt,ge")) ; Only the LT and GE codes are accepted.
+
+(define_special_predicate "nds32_have_33_inst_operator"
+  (match_code "mult,and,ior,xor")) ; MULT plus the AND/IOR/XOR codes.
+
 (define_predicate "nds32_symbolic_operand"
   (match_code "const,symbol_ref,label_ref"))
 
@@ -122,6 +131,18 @@
   (and (match_code "mem")
        (match_test "nds32_valid_smw_lwm_base_p (op)")))
 
+(define_predicate "float_even_register_operand" ; REG in FPR range, even regno
+  (and (match_code "reg")
+       (match_test "REGNO (op) >= NDS32_FIRST_FPR_REGNUM")
+       (match_test "REGNO (op) <= NDS32_LAST_FPR_REGNUM")
+       (match_test "(REGNO (op) & 1) == 0")))
+
+(define_predicate "float_odd_register_operand" ; REG in FPR range, odd regno
+  (and (match_code "reg")
+       (match_test "REGNO (op) >= NDS32_FIRST_FPR_REGNUM")
+       (match_test "REGNO (op) <= NDS32_LAST_FPR_REGNUM")
+       (match_test "(REGNO (op) & 1) != 0")))
+
 (define_special_predicate "nds32_load_multiple_operation"
   (match_code "parallel")
 {
-- 
2.30.2