From 312209c6a5ff3828335e698a9d0c872a48411fdf Mon Sep 17 00:00:00 2001
From: Alexandre Oliva <aoliva@redhat.com>
Date: Wed, 28 Jul 2004 09:13:58 +0000
Subject: [PATCH] Introduce sh4a support.

gcc/ChangeLog:
Introduce sh4a support.
* config.gcc: Handle sh4a multilibs and cpu selection.
* config/sh/sh.h: Likewise.  Handle sh4a command line flags.
* config/sh/t-mlib-sh4a: New.
* config/sh/t-mlib-sh4al: New.
* config/sh/t-mlib-sh4a-nofpu: New.
* config/sh/t-mlib-sh4a-single: New.
* config/sh/t-mlib-sh4a-single-only: New.
2004-02-20  DJ Delorie  <dj@redhat.com>
* config/sh/sh.md ("movua"): Change constraint from "m" to "Sua".
* config/sh/sh.h (EXTRA_CONSTRAINT_S): Add "Sua" support.
2003-08-22  Eric Christopher  <echristo@redhat.com>
* config/sh/sh4a.md: Update for chip errata.
2003-08-07  Eric Christopher  <echristo@redhat.com>
* config/sh/sh4a.md: New file. sh4a processor description.
2003-07-08  Alexandre Oliva  <aoliva@redhat.com>
* config/sh/sh.h (TARGET_SWITCHES): Added 4al.  Adjust description
of -m4a-nofpu.
(SH_ASM_SPEC): Pass -dsp for -m4al, not -m4a-nofpu.
* config/sh/t-sh (MULTILIB_MATCHES): Map -m4al to -m4a-nofpu.
* doc/invoke.texi (SH Options): Document -m4al.
2003-07-03  Alexandre Oliva  <aoliva@redhat.com>
* config/sh/sh.c (expand_block_move): Remove commented-out code
checked in by mistake.
(sh_cannot_change_mode_class): Enable SUBREGs to be used to select
single elements from SFmode vectors.
* config/sh/sh.md (fsca): Use VEC_CONCAT to initialize the output
register.
(sinsf2, cossf2, sindf2, cosdf2): Don't emit CLOBBER.
2003-07-01  Alexandre Oliva  <aoliva@redhat.com>
* config/sh/sh.h (sh_fsca_sf2int, sh_fsca_df2int,
sh_fsca_int2sf): Remove variable declarations.
* config/sh/sh.c (sh_fsca_sf2int, sh_fsca_df2int,
sh_fsca_int2sf): New functions.
(sh_fsca_sf2int_rtx, sh_fsca_df2int_rtx,
sh_fsca_int2sf_rtx): New static variables.
* config/sh/sh-protos.h (sh_fsca_sf2int, sh_fsca_df2int,
sh_fsca_int2sf): Declare.
* config/sh/sh.md: Adjust.
* doc/invoke.texi (SH Options): Document new options.
* config/sh/lib1funcs.asm (ic_invalidate): Remove SH4a forward
compatibility from SH4 code.
2003-06-27  Alexandre Oliva  <aoliva@redhat.com>
* config/sh/sh.c (expand_block_move): Don't emit POST_INC too
early.
(memory_movsrc_operand): Renamed to...
(unaligned_load_operand): ... this.  Simplified.
* config/sh/sh.h (PREDICATE_CODES): Adjust.
* config/sh/sh.md (movua, extv, extzv): Likewise.  Change movua's
input operand to SImode, and adjust the others.  Introduce
post-increment by peephole.
* config/sh/sh.c (expand_block_move): Give the target address the
same mode as the temp reg.
* config/sh/sh.c (expand_block_move): Use a temp reg for unaligned
copying.
2003-06-26  Alexandre Oliva  <aoliva@redhat.com>
Introduce support for SH4a.
* config/sh/lib1funcs.asm (ic_invalidate): Use icbi if
__SH4A__.  Emit 4 4kb blocks and touch all of them otherwise.
* config/sh/sh.c (sh_fsca_sf2int, sh_fsca_df2int,
sh_fsca_int2sf): New.
(sh_init_builtins): Initialize them.
(print_operand): Support `d'.
(expand_block_move): Use movua if src is misaligned.
(memory_movsrc_operand): New.
* config/sh/sh.h (TARGET_CPU_CPP_BUILTINS): Define __SH4A__
and one of the SH4 macros.
(SH4A_BIT, TARGET_SH4A_ARCH, TARGET_SH4A_FP,
SELECT_SH4A_NOFPU, SELECT_SH4A_SINGLE_ONLY, SELECT_SH4A,
SELECT_SH4A_SINGLE): New.
(TARGET_NONE): Add SH4A_BIT.
(TARGET_SWITCHES): Add 4a-single-only, 4a-single, 4a-nofpu and 4a.
(SH_ASM_SPEC): Pass -dsp if -m4a-nofpu.
(sh_fsca_sf2int, sh_fsca_df2int, sh_fsca_int2sf): Declare.
(OVERRIDE_OPTIONS): Set cpu to CPU_SH4A when appropriate.
(enum processor_type): Added PROCESSOR_SH4A.
(PREDICATE_CODES): Add memory_movsrc_operand.
* config/sh/sh.md: Removed unused variables.
(attr cpu): Add sh4a.
(attr type): Add movua, fsrra and fsca.
(prefetch): New, for SH4.
(ic_invalidate_line, ic_invalidate_line_sh4a): Use icbi.
(toggle_sz): Set type to fp.
(toggle_pr, rsqrtsf2, fsca, sinsf2, cossf2, sindf2, cosdf2): New.
(movua, extv, extzv): New.
* config/sh/t-sh: Add multilibs for 4a, 4a-nofpu, 4a-single
and 4a-single-only.
gcc/testsuite/ChangeLog:
2003-07-06  Alexandre Oliva  <aoliva@redhat.com>
* gcc.dg/sh4a-memmovua.c: Tweak regular expression.
2003-07-01  Alexandre Oliva  <aoliva@redhat.com>
* gcc.dg/sh4a-bitmovua.c: New.
* gcc.dg/sh4a-cos.c: New.
* gcc.dg/sh4a-cosf.c: New.
* gcc.dg/sh4a-fprun.c: New.
* gcc.dg/sh4a-fsrra.c: New.
* gcc.dg/sh4a-memmovua.c: New.
* gcc.dg/sh4a-sin.c: New.
* gcc.dg/sh4a-sincos.c: New.
* gcc.dg/sh4a-sincosf.c: New.
* gcc.dg/sh4a-sinf.c: New.
libstdc++-v3/ChangeLog:
2003-10-01  Eric Christopher  <echristo@redhat.com>
* config/cpu/sh/atomicity.h (__exchange_and_add): Remove 'm'
constraint.
2003-07-09  Alexandre Oliva  <aoliva@redhat.com>
* config/cpu/sh/atomicity.h: New.  Use movli and movco on SH4a.

From-SVN: r85257
---
 gcc/ChangeLog                          |  90 ++++++++++
 gcc/config.gcc                         |  10 ++
 gcc/config/sh/lib1funcs.asm            |  27 +++
 gcc/config/sh/sh-protos.h              |   3 +
 gcc/config/sh/sh.c                     | 137 +++++++++++++-
 gcc/config/sh/sh.h                     |  80 ++++++++-
 gcc/config/sh/sh.md                    | 214 +++++++++++++++++++++-
 gcc/config/sh/sh4a.md                  | 235 +++++++++++++++++++++++++
 gcc/config/sh/t-mlib-sh4a              |   1 +
 gcc/config/sh/t-mlib-sh4a-nofpu        |   1 +
 gcc/config/sh/t-mlib-sh4a-single       |   1 +
 gcc/config/sh/t-mlib-sh4a-single-only  |   1 +
 gcc/config/sh/t-mlib-sh4al             |   1 +
 gcc/config/sh/t-sh                     |   7 +-
 gcc/doc/invoke.texi                    |  26 +++
 gcc/testsuite/ChangeLog                |  16 ++
 gcc/testsuite/gcc.dg/sh4a-bitmovua.c   |  73 ++++++++
 gcc/testsuite/gcc.dg/sh4a-cos.c        |  13 ++
 gcc/testsuite/gcc.dg/sh4a-cosf.c       |  13 ++
 gcc/testsuite/gcc.dg/sh4a-fprun.c      |  35 ++++
 gcc/testsuite/gcc.dg/sh4a-fsrra.c      |  13 ++
 gcc/testsuite/gcc.dg/sh4a-memmovua.c   |  17 ++
 gcc/testsuite/gcc.dg/sh4a-sin.c        |  13 ++
 gcc/testsuite/gcc.dg/sh4a-sincos.c     |  14 ++
 gcc/testsuite/gcc.dg/sh4a-sincosf.c    |  14 ++
 gcc/testsuite/gcc.dg/sh4a-sinf.c       |  13 ++
 libstdc++-v3/ChangeLog                 |   8 +
 libstdc++-v3/config/cpu/sh/atomicity.h | 123 +++++++++++++
 28 files changed, 1189 insertions(+), 10 deletions(-)
 create mode 100644 gcc/config/sh/sh4a.md
 create mode 100644 gcc/config/sh/t-mlib-sh4a
 create mode 100644 gcc/config/sh/t-mlib-sh4a-nofpu
 create mode 100644 gcc/config/sh/t-mlib-sh4a-single
 create mode 100644 gcc/config/sh/t-mlib-sh4a-single-only
 create mode 100644 gcc/config/sh/t-mlib-sh4al
 create mode 100644 gcc/testsuite/gcc.dg/sh4a-bitmovua.c
 create mode 100644 gcc/testsuite/gcc.dg/sh4a-cos.c
 create mode 100644 gcc/testsuite/gcc.dg/sh4a-cosf.c
 create mode 100644 gcc/testsuite/gcc.dg/sh4a-fprun.c
 create mode 100644 gcc/testsuite/gcc.dg/sh4a-fsrra.c
 create mode 100644 gcc/testsuite/gcc.dg/sh4a-memmovua.c
 create mode 100644 gcc/testsuite/gcc.dg/sh4a-sin.c
 create mode 100644 gcc/testsuite/gcc.dg/sh4a-sincos.c
 create mode 100644 gcc/testsuite/gcc.dg/sh4a-sincosf.c
 create mode 100644 gcc/testsuite/gcc.dg/sh4a-sinf.c
 create mode 100644 libstdc++-v3/config/cpu/sh/atomicity.h

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index a3f30ff9d3d..ddea34766fa 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,93 @@
+2004-07-28  Alexandre Oliva  <aoliva@redhat.com>
+
+	Introduce sh4a support.
+	* config.gcc: Handle sh4a multilibs and cpu selection.
+	* config/sh/sh.h: Likewise.  Handle sh4a command line flags.
+	* config/sh/t-mlib-sh4a: New.
+	* config/sh/t-mlib-sh4al: New.
+	* config/sh/t-mlib-sh4a-nofpu: New.
+	* config/sh/t-mlib-sh4a-single: New.
+	* config/sh/t-mlib-sh4a-single-only: New.
+	2004-02-20  DJ Delorie  <dj@redhat.com>
+	* config/sh/sh.md ("movua"): Change constraint from "m" to "Sua".
+	* config/sh/sh.h (EXTRA_CONSTRAINT_S): Add "Sua" support.
+	2003-08-22  Eric Christopher  <echristo@redhat.com>
+	* config/sh/sh4a.md: Update for chip errata.
+	2003-08-07  Eric Christopher  <echristo@redhat.com>
+	* config/sh/sh4a.md: New file. sh4a processor description.
+	2003-07-08  Alexandre Oliva  <aoliva@redhat.com>
+	* config/sh/sh.h (TARGET_SWITCHES): Added 4al.  Adjust description
+	of -m4a-nofpu.
+	(SH_ASM_SPEC): Pass -dsp for -m4al, not -m4a-nofpu.
+	* config/sh/t-sh (MULTILIB_MATCHES): Map -m4al to -m4a-nofpu.
+	* doc/invoke.texi (SH Options): Document -m4al.
+	2003-07-03  Alexandre Oliva  <aoliva@redhat.com>
+	* config/sh/sh.c (expand_block_move): Remove commented-out code
+	checked in by mistake.
+	(sh_cannot_change_mode_class): Enable SUBREGs to be used to select
+	single elements from SFmode vectors.
+	* config/sh/sh.md (fsca): Use VEC_CONCAT to initialize the output
+	register.
+	(sinsf2, cossf2, sindf2, cosdf2): Don't emit CLOBBER.
+	2003-07-01  Alexandre Oliva  <aoliva@redhat.com>
+	* config/sh/sh.h (sh_fsca_sf2int, sh_fsca_df2int,
+	sh_fsca_int2sf): Remove variable declarations.
+	* config/sh/sh.c (sh_fsca_sf2int, sh_fsca_df2int,
+	sh_fsca_int2sf): New functions.
+	(sh_fsca_sf2int_rtx, sh_fsca_df2int_rtx,
+	sh_fsca_int2sf_rtx): New static variables.
+	* config/sh/sh-protos.h (sh_fsca_sf2int, sh_fsca_df2int,
+	sh_fsca_int2sf): Declare.
+	* config/sh/sh.md: Adjust.
+	* doc/invoke.texi (SH Options): Document new options.
+	* config/sh/lib1funcs.asm (ic_invalidate): Remove SH4a forward
+	compatibility from SH4 code.
+	2003-06-27  Alexandre Oliva  <aoliva@redhat.com>
+	* config/sh/sh.c (expand_block_move): Don't emit POST_INC too
+	early.
+	(memory_movsrc_operand): Renamed to...
+	(unaligned_load_operand): ... this.  Simplified.
+	* config/sh/sh.h (PREDICATE_CODES): Adjust.
+	* config/sh/sh.md (movua, extv, extzv): Likewise.  Change movua's
+	input operand to SImode, and adjust the others.  Introduce
+	post-increment by peephole.
+	* config/sh/sh.c (expand_block_move): Give the target address the
+	same mode as the temp reg.
+	* config/sh/sh.c (expand_block_move): Use a temp reg for unaligned
+	copying.
+	2003-06-26  Alexandre Oliva  <aoliva@redhat.com>
+	Introduce support for SH4a.
+	* config/sh/lib1funcs.asm (ic_invalidate): Use icbi if
+	__SH4A__.  Emit 4 4kb blocks and touch all of them otherwise.
+	* config/sh/sh.c (sh_fsca_sf2int, sh_fsca_df2int,
+	sh_fsca_int2sf): New.
+	(sh_init_builtins): Initialize them.
+	(print_operand): Support `d'.
+	(expand_block_move): Use movua if src is misaligned.
+	(memory_movsrc_operand): New.
+	* config/sh/sh.h (TARGET_CPU_CPP_BUILTINS): Define __SH4A__
+	and one of the SH4 macros.
+	(SH4A_BIT, TARGET_SH4A_ARCH, TARGET_SH4A_FP,
+	SELECT_SH4A_NOFPU, SELECT_SH4A_SINGLE_ONLY, SELECT_SH4A,
+	SELECT_SH4A_SINGLE): New.
+	(TARGET_NONE): Add SH4A_BIT.
+	(TARGET_SWITCHES): Add 4a-single-only, 4a-single, 4a-nofpu and 4a.
+	(SH_ASM_SPEC): Pass -dsp if -m4a-nofpu.
+	(sh_fsca_sf2int, sh_fsca_df2int, sh_fsca_int2sf): Declare.
+	(OVERRIDE_OPTIONS): Set cpu to CPU_SH4A when appropriate.
+	(enum processor_type): Added PROCESSOR_SH4A.
+	(PREDICATE_CODES): Add memory_movsrc_operand.
+	* config/sh/sh.md: Removed unused variables.
+	(attr cpu): Add sh4a.
+	(attr type): Add movua, fsrra and fsca.
+	(prefetch): New, for SH4.
+	(ic_invalidate_line, ic_invalidate_line_sh4a): Use icbi.
+	(toggle_sz): Set type to fp.
+	(toggle_pr, rsqrtsf2, fsca, sinsf2, cossf2, sindf2, cosdf2): New.
+	(movua, extv, extzv): New.
+	* config/sh/t-sh: Add multilibs for 4a, 4a-nofpu, 4a-single
+	and 4a-single-only.
+
 2004-07-28  Diego Novillo  <dnovillo@redhat.com>
 
 	* tree-optimize.c (init_tree_optimization_passes): Schedule
diff --git a/gcc/config.gcc b/gcc/config.gcc
index 8fc8a99fbbc..1360b072a48 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -1746,6 +1746,11 @@ sh-*-symbianelf* | sh[12346l]*-*-symbianelf* | \
 	case `echo ${target} | sed 's/e[lb]-/-/'` in
 	sh64*-*-netbsd*)	sh_cpu_target=sh5-64media ;;
 	sh64* | sh5*-*-netbsd*)	sh_cpu_target=sh5-32media ;;
+	sh4a_single_only*)	sh_cpu_target=sh4a-single-only ;;
+	sh4a_single*)		sh_cpu_target=sh4a-single ;;
+	sh4a_nofpu*)		sh_cpu_target=sh4a-nofpu ;;
+	sh4al)			sh_cpu_target=sh4al ;;
+	sh4a*)			sh_cpu_target=sh4a ;;
 	sh4_single_only*)	sh_cpu_target=sh4-single-only ;;
 	sh4_single*)		sh_cpu_target=sh4-single ;;
 	sh4_nofpu*)		sh_cpu_target=sh4-nofpu ;;
@@ -1760,6 +1765,7 @@ sh-*-symbianelf* | sh[12346l]*-*-symbianelf* | \
 	case $sh_cpu_default in
 	sh5-64media-nofpu | sh5-64media | \
 	  sh5-32media-nofpu | sh5-32media | sh5-compact-nofpu | sh5-compact | \
+	  sh4a-single-only | sh4a-single | sh4a-nofpu | sh4a | sh4al | \
 	  sh4-single-only | sh4-single | sh4-nofpu | sh4 | \
 	  sh3e | sh3 | sh2e | sh2 | sh1) ;;
 	"")	sh_cpu_default=${sh_cpu_target} ;;
@@ -1783,6 +1789,7 @@ sh-*-symbianelf* | sh[12346l]*-*-symbianelf* | \
 		case ${sh_multilib} in
 		sh1 | sh2 | sh2e | sh3 | sh3e | \
 		sh4 | sh4-single | sh4-single-only | sh4-nofpu | \
+		sh4a | sh4a-single | sh4a-single-only | sh4a-nofpu | sh4al | \
 		sh5-64media | sh5-64media-nofpu | \
 		sh5-32media | sh5-32media-nofpu | \
 		sh5-compact | sh5-compact-nofpu)
@@ -2476,9 +2483,12 @@ fi
 		"" | m1 | m2 | m2e | m3 | m3e | m4 | m4-single | m4-single-only | m4-nofpu )
 			# OK
 			;;
+		m4a | m4a-single | m4a-single-only | m4a-nofpu | m4al)
+		        ;;
 		*)
 			echo "Unknown CPU used in --with-cpu=$with_cpu, known values:"  1>&2
 			echo "m1 m2 m2e m3 m3e m4 m4-single m4-single-only m4-nofpu" 1>&2
+			echo "m4a m4a-single m4a-single-only m4a-nofpu m4al" 1>&2
 			exit 1
 			;;
 		esac
diff --git a/gcc/config/sh/lib1funcs.asm b/gcc/config/sh/lib1funcs.asm
index 7619f0c67d7..239438fd06b 100644
--- a/gcc/config/sh/lib1funcs.asm
+++ b/gcc/config/sh/lib1funcs.asm
@@ -2036,7 +2036,34 @@ GLOBAL(ic_invalidate):
 
 	ENDFUNC(GLOBAL(ic_invalidate))
 	ENDFUNC(GLOBAL(init_trampoline))
+#elif defined(__SH4A__)
+	.global GLOBAL(ic_invalidate)
+	FUNC(GLOBAL(ic_invalidate))
+GLOBAL(ic_invalidate):
+	ocbwb	@r4
+	synco
+	rts
+	icbi	@r4
+	ENDFUNC(GLOBAL(ic_invalidate))
 #elif defined(__SH4_SINGLE__) || defined(__SH4__) || defined(__SH4_SINGLE_ONLY__)
+	/* This assumes a direct-mapped cache, which is the case for
+	the first SH4, but not for the second version of SH4, that
+	uses a 2-way set-associative cache, nor SH4a, that is 4-way.
+	SH4a fortunately offers an instruction to invalidate the
+	instruction cache, and we use it above, but SH4 doesn't.
+	However, since the libraries don't contain any nested
+	functions (the only case in which GCC would emit this pattern)
+	and we actually emit the ic_invalidate_line_i pattern for
+	cache invalidation on all SH4 multilibs (even 4-nofpu, that
+	isn't even covered here), and pre-SH4 cores don't have
+	caches, it seems like this code is pointless, unless it's
+	meant for backward binary compatibility or for userland-only
+	cache invalidation for say sh4-*-linux-gnu.  Such a feature
+	should probably be moved into a system call, such that the
+	kernel could do whatever it takes to invalidate a cache line
+	on the core it's actually running on.  I.e., this hideous :-)
+	piece of code should go away at some point.  */
+
 	.global GLOBAL(ic_invalidate)
 	FUNC(GLOBAL(ic_invalidate))
 GLOBAL(ic_invalidate):
diff --git a/gcc/config/sh/sh-protos.h b/gcc/config/sh/sh-protos.h
index 0d1733c3717..d5ddb7b20b7 100644
--- a/gcc/config/sh/sh-protos.h
+++ b/gcc/config/sh/sh-protos.h
@@ -25,6 +25,9 @@ Boston, MA 02111-1307, USA.  */
 #define GCC_SH_PROTOS_H
 
 #ifdef RTX_CODE
+extern rtx sh_fsca_sf2int (void);
+extern rtx sh_fsca_df2int (void);
+extern rtx sh_fsca_int2sf (void);
 extern struct rtx_def *prepare_scc_operands (enum rtx_code);
 
 /* Declare functions defined in sh.c and used in templates.  */
diff --git a/gcc/config/sh/sh.c b/gcc/config/sh/sh.c
index 913bb34d253..010e5dcadc3 100644
--- a/gcc/config/sh/sh.c
+++ b/gcc/config/sh/sh.c
@@ -537,6 +537,7 @@ print_operand_address (FILE *stream, rtx x)
    'T'  print the next word of a dp value - same as 'R' in big endian mode.
    'M'  print an `x' if `m' will print `base,index'.
    'N'  print 'r63' if the operand is (const_int 0).
+   'd'  print a V2SF reg as dN instead of fpN.
    'm'  print a pair `base,offset' or `base,index', for LD and ST.
    'u'  prints the lowest 16 bits of CONST_INT, as an unsigned value.
    'o'  output an operator.  */
@@ -651,6 +652,13 @@ print_operand (FILE *stream, rtx x, int code)
 	}
       break;
 
+    case 'd':
+      if (GET_CODE (x) != REG || GET_MODE (x) != V2SFmode)
+	abort ();
+
+      fprintf ((stream), "d%s", reg_names[REGNO (x)] + 1);
+      break;
+      
     case 'N':
       if (x == CONST0_RTX (GET_MODE (x)))
 	{
@@ -772,9 +780,48 @@ expand_block_move (rtx *operands)
   int constp = (GET_CODE (operands[2]) == CONST_INT);
   int bytes = (constp ? INTVAL (operands[2]) : 0);
 
+  if (! constp)
+    return 0;
+
+  /* If we could use mov.l to move words and dest is word-aligned, we
+     can use movua.l for loads and still generate a relatively short
+     and efficient sequence.  */
+  if (TARGET_SH4A_ARCH && align < 4
+      && MEM_ALIGN (operands[0]) >= 32
+      && can_move_by_pieces (bytes, 32))
+    {
+      rtx dest = copy_rtx (operands[0]);
+      rtx src = copy_rtx (operands[1]);
+      /* We could use different pseudos for each copied word, but
+	 since movua can only load into r0, it's kind of
+	 pointless.  */
+      rtx temp = gen_reg_rtx (SImode);
+      rtx src_addr = copy_addr_to_reg (XEXP (src, 0));
+      int copied = 0;
+
+      while (copied + 4 <= bytes)
+	{
+	  rtx to = adjust_address (dest, SImode, copied);
+	  rtx from = adjust_automodify_address (src, SImode, src_addr, copied);
+
+	  emit_insn (gen_movua (temp, from));
+	  emit_move_insn (src_addr, plus_constant (src_addr, 4));
+	  emit_move_insn (to, temp);
+	  copied += 4;
+	}
+
+      if (copied < bytes)
+	move_by_pieces (adjust_address (dest, BLKmode, copied),
+			adjust_automodify_address (src, BLKmode,
+						   src_addr, copied),
+			bytes - copied, align, 0);
+
+      return 1;
+    }
+
   /* If it isn't a constant number of bytes, or if it doesn't have 4 byte
      alignment, or if it isn't a multiple of 4 bytes, then fail.  */
-  if (! constp || align < 4 || (bytes % 4 != 0))
+  if (align < 4 || (bytes % 4 != 0))
     return 0;
 
   if (TARGET_HARD_SH4)
@@ -9397,6 +9444,11 @@ bool
 sh_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
 			     enum reg_class class)
 {
+  /* We want to enable the use of SUBREGs as a means to
+     VEC_SELECT a single element of a vector.  */
+  if (to == SFmode && VECTOR_MODE_P (from) && GET_MODE_INNER (from) == SFmode)
+    return (reg_classes_intersect_p (GENERAL_REGS, class));
+
   if (GET_MODE_SIZE (from) != GET_MODE_SIZE (to))
     {
       if (TARGET_LITTLE_ENDIAN)
@@ -9821,4 +9873,87 @@ check_use_sfunc_addr (rtx insn, rtx reg)
   abort ();
 }
 
+/* Returns 1 if OP is a MEM that can be source of a simple move operation.  */
+
+int
+unaligned_load_operand (rtx op, enum machine_mode mode)
+{
+  rtx inside;
+
+  if (GET_CODE (op) != MEM || GET_MODE (op) != mode)
+    return 0;
+
+  inside = XEXP (op, 0);
+
+  if (GET_CODE (inside) == POST_INC)
+    inside = XEXP (inside, 0);
+
+  if (GET_CODE (inside) == REG)
+    return 1;
+
+  return 0;
+}
+
+/* This function returns a constant rtx that represents 2**15 / pi in
+   SFmode.  It's used to scale SFmode angles, in radians, to a
+   fixed-point signed 16.16-bit fraction of a full circle (i.e., 2*pi
+   maps to 0x10000).  */
+
+static GTY(()) rtx sh_fsca_sf2int_rtx;
+
+rtx
+sh_fsca_sf2int (void)
+{
+  if (! sh_fsca_sf2int_rtx)
+    {
+      REAL_VALUE_TYPE rv;
+
+      real_from_string (&rv, "10430.378350470453");
+      sh_fsca_sf2int_rtx = const_double_from_real_value (rv, SFmode);
+    }
+
+  return sh_fsca_sf2int_rtx;
+}
+  
+/* This function returns a constant rtx that represents 2**15 / pi in
+   DFmode.  It's used to scale DFmode angles, in radians, to a
+   fixed-point signed 16.16-bit fraction of a full circle (i.e., 2*pi
+   maps to 0x10000).  */
+
+static GTY(()) rtx sh_fsca_df2int_rtx;
+
+rtx
+sh_fsca_df2int (void)
+{
+  if (! sh_fsca_df2int_rtx)
+    {
+      REAL_VALUE_TYPE rv;
+
+      real_from_string (&rv, "10430.378350470453");
+      sh_fsca_df2int_rtx = const_double_from_real_value (rv, DFmode);
+    }
+
+  return sh_fsca_df2int_rtx;
+}
+  
+/* This function returns a constant rtx that represents pi / 2**15 in
+   SFmode.  It's used to scale a fixed-point signed 16.16-bit fraction
+   of a full circle back to a SFmode value (i.e., 0x10000 maps to
+   2*pi).  */
+
+static GTY(()) rtx sh_fsca_int2sf_rtx;
+
+rtx
+sh_fsca_int2sf (void)
+{
+  if (! sh_fsca_int2sf_rtx)
+    {
+      REAL_VALUE_TYPE rv;
+
+      real_from_string (&rv, "9.587379924285257e-5");
+      sh_fsca_int2sf_rtx = const_double_from_real_value (rv, SFmode);
+    }
+
+  return sh_fsca_int2sf_rtx;
+}
 #include "gt-sh.h"
diff --git a/gcc/config/sh/sh.h b/gcc/config/sh/sh.h
index e38361f186a..71546facf9d 100644
--- a/gcc/config/sh/sh.h
+++ b/gcc/config/sh/sh.h
@@ -60,6 +60,13 @@ do { \
     case PROCESSOR_SH4: \
       builtin_define (TARGET_FPU_SINGLE ? "__SH4_SINGLE__" : "__SH4__"); \
       break; \
+    case PROCESSOR_SH4A: \
+      builtin_define ("__SH4A__"); \
+      builtin_define (TARGET_SH4 \
+		      ? (TARGET_FPU_SINGLE ? "__SH4_SINGLE__" : "__SH4__") \
+		      : TARGET_FPU_ANY ? "__SH4_SINGLE_ONLY__" \
+		      : "__SH4_NOFPU__"); \
+      break; \
     case PROCESSOR_SH5: \
       { \
 	builtin_define_with_value ("__SH5__", \
@@ -138,6 +145,7 @@ extern int target_flags;
 #define HARD_SH4_BIT	(1<<5)
 #define FPU_SINGLE_BIT	(1<<7)
 #define SH4_BIT	       	(1<<12)
+#define SH4A_BIT	(1<<3)
 #define FMOVD_BIT	(1<<4)
 #define SH5_BIT		(1<<0)
 #define SPACE_BIT 	(1<<13)
@@ -200,6 +208,14 @@ extern int target_flags;
 /* Nonzero if we should generate code using type 4 insns.  */
 #define TARGET_SH4 ((target_flags & SH4_BIT) && (target_flags & SH1_BIT))
 
+/* Nonzero if we're generating code for the common subset of
+   instructions present on both SH4a and SH4al-dsp.  */
+#define TARGET_SH4A_ARCH (target_flags & SH4A_BIT)
+
+/* Nonzero if we're generating code for SH4a, unless the use of the
+   FPU is disabled (which makes it compatible with SH4al-dsp).  */
+#define TARGET_SH4A_FP (TARGET_SH4A_ARCH && TARGET_FPU_ANY)
+
 /* Nonzero if we should generate code for a SH5 CPU (either ISA).  */
 #define TARGET_SH5 (target_flags & SH5_BIT)
 
@@ -285,6 +301,10 @@ extern int target_flags;
 #define SELECT_SH4_SINGLE_ONLY   (HARD_SH4_BIT | SELECT_SH3E)
 #define SELECT_SH4               (SH4_BIT | SH_E_BIT | HARD_SH4_BIT | SELECT_SH3)
 #define SELECT_SH4_SINGLE        (FPU_SINGLE_BIT | SELECT_SH4)
+#define SELECT_SH4A_NOFPU        (SH4A_BIT | SELECT_SH4_NOFPU)
+#define SELECT_SH4A_SINGLE_ONLY  (SH4A_BIT | SELECT_SH4_SINGLE_ONLY)
+#define SELECT_SH4A              (SH4A_BIT | SELECT_SH4)
+#define SELECT_SH4A_SINGLE       (SH4A_BIT | SELECT_SH4_SINGLE)
 #define SELECT_SH5_64MEDIA       (SH5_BIT | SH4_BIT)
 #define SELECT_SH5_64MEDIA_NOFPU (SH5_BIT)
 #define SELECT_SH5_32MEDIA       (SH5_BIT | SH4_BIT | SH_E_BIT)
@@ -302,6 +322,12 @@ extern int target_flags;
 #ifndef SUPPORT_SH4_NOFPU
 #define TARGET_SWITCH_SH4_NOFPU
 #endif
+#ifndef SUPPORT_SH4A_NOFPU
+#define TARGET_SWITCH_SH4A_NOFPU
+#endif
+#ifndef SUPPORT_SH4AL
+#define TARGET_SWITCH_SH4AL
+#endif
 #endif
 #endif
 #endif
@@ -313,15 +339,24 @@ extern int target_flags;
 #ifndef SUPPORT_SH4_SINGLE_ONLY
 #define TARGET_SWITCH_SH4_SINGLE_ONLY
 #endif
+#ifndef SUPPORT_SH4A_SINGLE_ONLY
+#define TARGET_SWITCH_SH4A_SINGLE_ONLY
+#endif
 #endif
 #endif
 
 #ifndef SUPPORT_SH4
 #define TARGET_SWITCH_SH4
+#ifndef SUPPORT_SH4A
+#define TARGET_SWITCH_SH4A
+#endif
 #endif
 
 #ifndef SUPPORT_SH4_SINGLE
 #define TARGET_SWITCH_SH4_SINGLE
+#ifndef SUPPORT_SH4A_SINGLE
+#define TARGET_SWITCH_SH4A_SINGLE
+#endif
 #endif
 
 #ifndef SUPPORT_SH5_64MEDIA
@@ -342,7 +377,7 @@ extern int target_flags;
 
 /* Reset all target-selection flags.  */
 #define TARGET_NONE -(SH1_BIT | SH2_BIT | SH3_BIT | SH_E_BIT | SH4_BIT \
-		      | HARD_SH4_BIT | FPU_SINGLE_BIT | SH5_BIT)
+		      | SH4A_BIT | HARD_SH4_BIT | FPU_SINGLE_BIT | SH5_BIT)
 
 #ifndef TARGET_SWITCH_SH1
 #define TARGET_SWITCH_SH1 \
@@ -389,6 +424,31 @@ extern int target_flags;
   {"4",		TARGET_NONE, "" }, \
   {"4",		SELECT_SH4, "Generate SH4 code" },
 #endif
+#ifndef TARGET_SWITCH_SH4A
+#define TARGET_SWITCH_SH4A \
+  {"4a",	TARGET_NONE, "" }, \
+  {"4a",	SELECT_SH4A, "Generate SH4a code" },
+#endif
+#ifndef TARGET_SWITCH_SH4A_SINGLE_ONLY
+#define TARGET_SWITCH_SH4A_SINGLE_ONLY \
+  {"4a-single-only",	TARGET_NONE, "" },	\
+  {"4a-single-only",	SELECT_SH4A_SINGLE_ONLY, "Generate only single-precision SH4a code" },
+#endif
+#ifndef TARGET_SWITCH_SH4A_SINGLE
+#define TARGET_SWITCH_SH4A_SINGLE \
+  {"4a-single",	TARGET_NONE, "" },\
+  {"4a-single",	SELECT_SH4A_SINGLE, "Generate default single-precision SH4a code" },
+#endif
+#ifndef TARGET_SWITCH_SH4A_NOFPU
+#define TARGET_SWITCH_SH4A_NOFPU \
+  {"4a-nofpu",	TARGET_NONE, "" },\
+  {"4a-nofpu",	SELECT_SH4A_NOFPU, "Generate SH4a FPU-less code" },
+#endif
+#ifndef TARGET_SWITCH_SH4AL
+#define TARGET_SWITCH_SH4AL \
+  {"4al",	TARGET_NONE, "" },\
+  {"4al",	SELECT_SH4A_NOFPU, "Generate SH4al-dsp code" },
+#endif
 #ifndef TARGET_SWITCH_SH5_64MEDIA
 #define TARGET_SWITCH_SH5_64MEDIA \
   {"5-64media",	TARGET_NONE, "" },		\
@@ -424,6 +484,11 @@ extern int target_flags;
   TARGET_SWITCH_SH4_SINGLE \
   TARGET_SWITCH_SH4_NOFPU \
   TARGET_SWITCH_SH4 \
+  TARGET_SWITCH_SH4A_SINGLE_ONLY \
+  TARGET_SWITCH_SH4A_SINGLE \
+  TARGET_SWITCH_SH4A_NOFPU \
+  TARGET_SWITCH_SH4A \
+  TARGET_SWITCH_SH4AL \
   TARGET_SWITCH_SH5_64MEDIA \
   TARGET_SWITCH_SH5_64MEDIA_NOFPU \
   TARGET_SWITCHES_SH5_32MEDIA \
@@ -497,7 +562,7 @@ extern int target_flags;
 
 #define SH_ASM_SPEC \
  "%(subtarget_asm_endian_spec) %{mrelax:-relax %(subtarget_asm_relax_spec)}\
-%(subtarget_asm_isa_spec)"
+%(subtarget_asm_isa_spec) %{m4al:-dsp}"
 
 #define ASM_SPEC SH_ASM_SPEC
 
@@ -584,6 +649,11 @@ do {									\
       assembler_dialect = 1;						\
       sh_cpu = CPU_SH4;							\
     }									\
+  if (TARGET_SH4A_ARCH)							\
+    {									\
+      assembler_dialect = 1;						\
+      sh_cpu = CPU_SH4A;						\
+    }									\
   if (TARGET_SH5)							\
     {									\
       sh_cpu = CPU_SH5;							\
@@ -2441,8 +2511,12 @@ struct sh_args {
 #define EXTRA_CONSTRAINT_Sr0(OP) \
   (memory_operand((OP), GET_MODE (OP)) \
    && ! refers_to_regno_p (R0_REG, R0_REG + 1, OP, (rtx *)0))
+#define EXTRA_CONSTRAINT_Sua(OP) \
+  (memory_operand((OP), GET_MODE (OP)) \
+   && GET_CODE (XEXP (OP, 0)) != PLUS)
 #define EXTRA_CONSTRAINT_S(OP, STR) \
   ((STR)[1] == 'r' && (STR)[2] == '0' ? EXTRA_CONSTRAINT_Sr0 (OP) \
+   : (STR)[1] == 'u' && (STR)[2] == 'a' ? EXTRA_CONSTRAINT_Sua (OP) \
    : 0)
 
 #define EXTRA_CONSTRAINT_STR(OP, C, STR)		\
@@ -3175,6 +3249,7 @@ enum processor_type {
   PROCESSOR_SH3,
   PROCESSOR_SH3E,
   PROCESSOR_SH4,
+  PROCESSOR_SH4A,
   PROCESSOR_SH5
 };
 
@@ -3245,6 +3320,7 @@ extern int rtx_equal_function_value_matters;
   {"general_extend_operand", {SUBREG, REG, MEM, TRUNCATE}},		\
   {"general_movsrc_operand", {SUBREG, REG, CONST_INT, CONST_DOUBLE, MEM}}, \
   {"general_movdst_operand", {SUBREG, REG, MEM}},			\
+  {"unaligned_load_operand", {MEM}},					\
   {"greater_comparison_operator", {GT,GE,GTU,GEU}},			\
   {"int_gpr_dest", {SUBREG, REG}},					\
   {"inqhi_operand", {TRUNCATE}},					\
diff --git a/gcc/config/sh/sh.md b/gcc/config/sh/sh.md
index 4af9cf9a14b..77ba4d59d74 100644
--- a/gcc/config/sh/sh.md
+++ b/gcc/config/sh/sh.md
@@ -161,7 +161,7 @@
 ;; Target CPU.
 
 (define_attr "cpu"
- "sh1,sh2,sh2e,sh3,sh3e,sh4,sh5"
+ "sh1,sh2,sh2e,sh3,sh3e,sh4,sh4a,sh5"
   (const (symbol_ref "sh_cpu_attr")))
 
 (define_attr "endian" "big,little"
@@ -218,6 +218,9 @@
 ;; ftrc_s	fix_truncsfsi2_i4
 ;; dfdiv	double precision floating point divide (or square root)
 ;; cwb		ic_invalidate_line_i
+;; movua	SH4a unaligned load
+;; fsrra	square root reciprocal approximate
+;; fsca		sine and cosine approximate
 ;; tls_load     load TLS related address
 ;; arith_media	SHmedia arithmetic, logical, and shift instructions
 ;; cbranch_media SHmedia conditional branch instructions
@@ -249,7 +252,7 @@
 ;; nil		no-op move, will be deleted.
 
 (define_attr "type"
- "mt_group,cbranch,jump,jump_ind,arith,arith3,arith3b,dyn_shift,load,load_si,fload,store,move,fmove,smpy,dmpy,return,pload,prset,pstore,prget,pcload,pcload_si,pcfload,rte,sfunc,call,fp,fdiv,ftrc_s,dfp_arith,dfp_cmp,dfp_conv,dfdiv,gp_fpul,fpul_gp,mac_gp,mem_fpscr,gp_fpscr,cwb,tls_load,arith_media,cbranch_media,cmp_media,dfdiv_media,dfmul_media,dfparith_media,dfpconv_media,dmpy_media,fcmp_media,fdiv_media,fload_media,fmove_media,fparith_media,fpconv_media,fstore_media,gettr_media,invalidate_line_media,jump_media,load_media,pt_media,ptabs_media,store_media,mcmp_media,mac_media,d2mpy_media,atrans_media,ustore_media,nil,other"
+ "mt_group,cbranch,jump,jump_ind,arith,arith3,arith3b,dyn_shift,load,load_si,fload,store,move,fmove,smpy,dmpy,return,pload,prset,pstore,prget,pcload,pcload_si,pcfload,rte,sfunc,call,fp,fdiv,ftrc_s,dfp_arith,dfp_cmp,dfp_conv,dfdiv,gp_fpul,fpul_gp,mac_gp,mem_fpscr,gp_fpscr,cwb,movua,fsrra,fsca,tls_load,arith_media,cbranch_media,cmp_media,dfdiv_media,dfmul_media,dfparith_media,dfpconv_media,dmpy_media,fcmp_media,fdiv_media,fload_media,fmove_media,fparith_media,fpconv_media,fstore_media,gettr_media,invalidate_line_media,jump_media,load_media,pt_media,ptabs_media,store_media,mcmp_media,mac_media,d2mpy_media,atrans_media,ustore_media,nil,other"
   (const_string "other"))
 
 ;; We define a new attribute namely "insn_class".We use
@@ -3488,6 +3491,11 @@
       emit_insn (gen_ic_invalidate_line_compact (operands[0], operands[1]));
       DONE;
     }
+  else if (TARGET_SH4A_ARCH)
+    {
+      emit_insn (gen_ic_invalidate_line_sh4a (operands[0]));
+      DONE;
+    }
   operands[0] = force_reg (Pmode, operands[0]);
   operands[1] = force_reg (Pmode, GEN_INT (trunc_int_for_mode (0xf0000008,
 							       Pmode)));
@@ -3508,6 +3516,14 @@
   [(set_attr "length" "8")
    (set_attr "type" "cwb")])
 
+(define_insn "ic_invalidate_line_sh4a"
+  [(unspec_volatile [(match_operand:SI 0 "register_operand" "r")]
+		    UNSPEC_ICACHE)]
+  "TARGET_SH4A_ARCH"
+  "ocbwb\\t@%0\;synco\;icbi\\t@%0"
+  [(set_attr "length" "16")
+   (set_attr "type" "cwb")])
+
 ;; ??? could make arg 0 an offsettable memory operand to allow to save
 ;; an add in the code that calculates the address.
 (define_insn "ic_invalidate_line_media"
@@ -8151,7 +8167,19 @@ mov.l\\t1f,r0\\n\\
 	(xor:PSI (reg:PSI FPSCR_REG) (const_int 1048576)))]
   "TARGET_SH4"
   "fschg"
-  [(set_attr "fp_set" "unknown")])
+  [(set_attr "type" "fp") (set_attr "fp_set" "unknown")])
+
+;; There's no way we can use it today, since optimize mode switching
+;; doesn't enable us to know from which mode we're switching to the
+;; mode it requests, to tell whether we can use a relative mode switch
+;; (like toggle_pr) or an absolute switch (like loading fpscr from
+;; memory).
+(define_insn "toggle_pr"
+  [(set (reg:PSI FPSCR_REG)
+	(xor:PSI (reg:PSI FPSCR_REG) (const_int 524288)))]
+  "TARGET_SH4A_FP && ! TARGET_FPU_SINGLE"
+  "fpchg"
+  [(set_attr "type" "fp")])
 
 (define_expand "addsf3"
   [(set (match_operand:SF 0 "arith_reg_operand" "")
@@ -8650,6 +8678,117 @@ mov.l\\t1f,r0\\n\\
   [(set_attr "type" "fdiv")
    (set_attr "fp_mode" "single")])
 
+(define_insn "rsqrtsf2"
+  [(set (match_operand:SF 0 "register_operand" "=f")
+	(div:SF (match_operand:SF 1 "immediate_operand" "i")
+		(sqrt:SF (match_operand:SF 2 "register_operand" "0"))))
+   (use (match_operand:PSI 3 "fpscr_operand" "c"))]
+  "TARGET_SH4A_FP && flag_unsafe_math_optimizations
+   && operands[1] == CONST1_RTX (SFmode)"
+  "fsrra	%0"
+  [(set_attr "type" "fsrra")
+   (set_attr "fp_mode" "single")])
+
+(define_insn "fsca"
+  [(set (match_operand:V2SF 0 "fp_arith_reg_operand" "=f")
+	(vec_concat:V2SF
+	 (unspec:SF [(mult:SF
+		      (float:SF (match_operand:SI 1 "fpul_operand" "y"))
+		      (match_operand:SF 2 "immediate_operand" "i"))
+		    ] UNSPEC_FSINA)
+	 (unspec:SF [(mult:SF (float:SF (match_dup 1)) (match_dup 2))
+		    ] UNSPEC_FCOSA)))
+   (use (match_operand:PSI 3 "fpscr_operand" "c"))]
+  "TARGET_SH4A_FP && flag_unsafe_math_optimizations
+   && operands[2] == sh_fsca_int2sf ()"
+  "fsca	fpul,%d0"
+  [(set_attr "type" "fsca")
+   (set_attr "fp_mode" "single")])
+
+(define_expand "sinsf2"
+  [(set (match_operand:SF 0 "nonimmediate_operand" "")
+	(unspec:SF [(match_operand:SF 1 "fp_arith_reg_operand" "")]
+		   UNSPEC_FSINA))]
+  "TARGET_SH4A_FP && flag_unsafe_math_optimizations"
+  "
+{
+  rtx scaled = gen_reg_rtx (SFmode);
+  rtx truncated = gen_reg_rtx (SImode);
+  rtx fsca = gen_reg_rtx (V2SFmode);
+  rtx scale_reg = force_reg (SFmode, sh_fsca_sf2int ());
+
+  emit_sf_insn (gen_mulsf3 (scaled, operands[1], scale_reg));
+  emit_sf_insn (gen_fix_truncsfsi2 (truncated, scaled));
+  emit_sf_insn (gen_fsca (fsca, truncated, sh_fsca_int2sf (),
+			  get_fpscr_rtx ()));
+  emit_move_insn (operands[0], gen_rtx_SUBREG (SFmode, fsca, 0));
+  DONE;
+}")
+
+(define_expand "cossf2"
+  [(set (match_operand:SF 0 "nonimmediate_operand" "")
+	(unspec:SF [(match_operand:SF 1 "fp_arith_reg_operand" "")]
+		   UNSPEC_FCOSA))]
+  "TARGET_SH4A_FP && flag_unsafe_math_optimizations"
+  "
+{
+  rtx scaled = gen_reg_rtx (SFmode);
+  rtx truncated = gen_reg_rtx (SImode);
+  rtx fsca = gen_reg_rtx (V2SFmode);
+  rtx scale_reg = force_reg (SFmode, sh_fsca_sf2int ());
+
+  emit_sf_insn (gen_mulsf3 (scaled, operands[1], scale_reg));
+  emit_sf_insn (gen_fix_truncsfsi2 (truncated, scaled));
+  emit_sf_insn (gen_fsca (fsca, truncated, sh_fsca_int2sf (),
+			  get_fpscr_rtx ()));
+  emit_move_insn (operands[0], gen_rtx_SUBREG (SFmode, fsca, 4));
+  DONE;
+}")
+
+(define_expand "sindf2"
+  [(set (match_operand:DF 0 "fp_arith_reg_operand" "")
+	(unspec:DF [(match_operand:DF 1 "fp_arith_reg_operand" "")]
+		   UNSPEC_FSINA))]
+  "TARGET_SH4A_FP && ! TARGET_FPU_SINGLE && flag_unsafe_math_optimizations"
+  "
+{
+  rtx scaled = gen_reg_rtx (DFmode);
+  rtx truncated = gen_reg_rtx (SImode);
+  rtx fsca = gen_reg_rtx (V2SFmode);
+  rtx scale_reg = force_reg (DFmode, sh_fsca_df2int ());
+  rtx sfresult = gen_reg_rtx (SFmode);
+
+  emit_df_insn (gen_muldf3 (scaled, operands[1], scale_reg));
+  emit_df_insn (gen_fix_truncdfsi2 (truncated, scaled));
+  emit_sf_insn (gen_fsca (fsca, truncated, sh_fsca_int2sf (),
+			  get_fpscr_rtx ()));
+  emit_move_insn (sfresult, gen_rtx_SUBREG (SFmode, fsca, 0));
+  emit_df_insn (gen_extendsfdf2 (operands[0], sfresult));
+  DONE;
+}")
+
+(define_expand "cosdf2"
+  [(set (match_operand:DF 0 "fp_arith_reg_operand" "")
+	(unspec:DF [(match_operand:DF 1 "fp_arith_reg_operand" "")]
+		   UNSPEC_FCOSA))]
+  "TARGET_SH4A_FP && ! TARGET_FPU_SINGLE && flag_unsafe_math_optimizations"
+  "
+{
+  rtx scaled = gen_reg_rtx (DFmode);
+  rtx truncated = gen_reg_rtx (SImode);
+  rtx fsca = gen_reg_rtx (V2SFmode);
+  rtx scale_reg = force_reg (DFmode, sh_fsca_df2int ());
+  rtx sfresult = gen_reg_rtx (SFmode);
+
+  emit_df_insn (gen_muldf3 (scaled, operands[1], scale_reg));
+  emit_df_insn (gen_fix_truncdfsi2 (truncated, scaled));
+  emit_sf_insn (gen_fsca (fsca, truncated, sh_fsca_int2sf (),
+			  get_fpscr_rtx ()));
+  emit_move_insn (sfresult, gen_rtx_SUBREG (SFmode, fsca, 4));
+  emit_df_insn (gen_extendsfdf2 (operands[0], sfresult));
+  DONE;
+}")
+
 (define_expand "abssf2"
   [(set (match_operand:SF 0 "fp_arith_reg_operand" "")
 	(abs:SF (match_operand:SF 1 "fp_arith_reg_operand" "")))]
@@ -9188,6 +9327,71 @@ mov.l\\t1f,r0\\n\\
 
   DONE;
 }")
+
+(define_insn "movua"
+  [(set (match_operand:SI 0 "register_operand" "=z")
+	(sign_extract:SI (match_operand:SI 1 "unaligned_load_operand" "Sua>")
+			 (const_int 32) (const_int 0)))]
+  "TARGET_SH4A_ARCH"
+  "movua.l	%1,%0"
+  [(set_attr "type" "movua")])
+
+;; We shouldn't need this, but cse replaces increments with references
+;; to other regs before flow has a chance to create post_inc
+;; addressing modes, and only postreload's cse_move2add brings the
+;; increments back to a usable form.
+(define_peephole2
+  [(set (match_operand:SI 0 "register_operand" "")
+	(sign_extract:SI (mem:SI (match_operand:SI 1 "register_operand" ""))
+			 (const_int 32) (const_int 0)))
+   (set (match_dup 1) (plus:SI (match_dup 1) (const_int 4)))]
+  "TARGET_SH4A_ARCH && REGNO (operands[0]) != REGNO (operands[1])"
+  [(set (match_operand:SI 0 "register_operand" "")
+	(sign_extract:SI (mem:SI (post_inc:SI
+				  (match_operand:SI 1 "register_operand" "")))
+			 (const_int 32) (const_int 0)))]
+  "")
+
+(define_expand "extv"
+  [(set (match_operand:SI 0 "register_operand" "")
+	(sign_extract:SI (match_operand:QI 1 "unaligned_load_operand" "")
+			 (match_operand 2 "const_int_operand" "")
+			 (match_operand 3 "const_int_operand" "")))]
+  ""
+{
+  if (TARGET_SH4A_ARCH
+      && INTVAL (operands[2]) == 32
+      && INTVAL (operands[3]) == -24 * (BITS_BIG_ENDIAN != BYTES_BIG_ENDIAN)
+      && GET_CODE (operands[1]) == MEM && MEM_ALIGN (operands[1]) < 32)
+    {
+      emit_insn (gen_movua (operands[0],
+			    adjust_address (operands[1], SImode, 0)));
+      DONE;
+    }
+
+  FAIL;
+})
+
+(define_expand "extzv"
+  [(set (match_operand:SI 0 "register_operand" "")
+	(zero_extract:SI (match_operand:QI 1 "unaligned_load_operand" "")
+			 (match_operand 2 "const_int_operand" "")
+			 (match_operand 3 "const_int_operand" "")))]
+  ""
+{
+  if (TARGET_SH4A_ARCH
+      && INTVAL (operands[2]) == 32
+      && INTVAL (operands[3]) == -24 * (BITS_BIG_ENDIAN != BYTES_BIG_ENDIAN)
+      && GET_CODE (operands[1]) == MEM && MEM_ALIGN (operands[1]) < 32)
+    {
+      emit_insn (gen_movua (operands[0],
+			    adjust_address (operands[1], SImode, 0)));
+      DONE;
+    }
+
+  FAIL;
+})
+
 
 ;; -------------------------------------------------------------------------
 ;; Peepholes
@@ -10661,9 +10865,11 @@ mov.l\\t1f,r0\\n\\
   [(prefetch (match_operand:QI 0 "address_operand" "p")
              (match_operand:SI 1 "const_int_operand" "n")
              (match_operand:SI 2 "const_int_operand" "n"))]
-  "TARGET_SHMEDIA"
+  "TARGET_SHMEDIA || TARGET_HARD_SH4"
   "*
 {
+  if (TARGET_HARD_SH4)
+    return \"pref @%0\";
   operands[0] = gen_rtx_MEM (QImode, operands[0]);
   output_asm_insn (\"ld%M0.b    %m0,r63\", operands);
   return \"\";
diff --git a/gcc/config/sh/sh4a.md b/gcc/config/sh/sh4a.md
new file mode 100644
index 00000000000..b9bac220559
--- /dev/null
+++ b/gcc/config/sh/sh4a.md
@@ -0,0 +1,235 @@
+;; Scheduling description for Renesas SH4a
+;; Copyright (C) 2003 Free Software Foundation, Inc.
+;;
+;; This file is part of GNU CC.
+;;
+;; GNU CC is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 2, or (at your option)
+;; any later version.
+;;
+;; GNU CC is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GNU CC; see the file COPYING.  If not, write to
+;; the Free Software Foundation, 59 Temple Place - Suite 330,
+;; Boston, MA 02111-1307, USA.
+
+;; The following description models the SH4A pipeline
+;; using the DFA based scheduler.
+
+(define_automaton "sh4a")
+
+(define_cpu_unit "sh4a_ex"   "sh4a")
+(define_cpu_unit "sh4a_ls"   "sh4a")
+(define_cpu_unit "sh4a_fex"  "sh4a")
+(define_cpu_unit "sh4a_fls"  "sh4a")
+(define_cpu_unit "sh4a_mult" "sh4a")
+(define_cpu_unit "sh4a_fdiv" "sh4a")
+
+;; Decoding is done on the integer pipeline like the
+;; sh4. Define issue to be the | of the two pipelines
+;; to control how often instructions are issued.
+(define_reservation "ID_or" "sh4a_ex|sh4a_ls")
+(define_reservation "ID_and" "sh4a_ex+sh4a_ls")
+
+
+;; =======================================================
+;; Locking Descriptions
+
+;; Memory access on the LS pipeline.
+(define_cpu_unit "sh4a_memory" "sh4a")
+
+;; Other access on the LS pipeline.
+(define_cpu_unit "sh4a_load_store" "sh4a")
+
+;;  The address calculator used for branch instructions.
+;; This will be reserved after "issue" of branch instructions
+;; and this is to make sure that no two branch instructions
+;; can be issued in parallel.
+(define_reservation "sh4a_addrcalc" "sh4a_ex")
+
+;; =======================================================
+;; Reservations
+
+;; Branch (BF,BF/S,BT,BT/S,BRA,BSR)
+;; Group: BR
+;; Latency when taken: 2
+(define_insn_reservation "sh4a_branch" 2
+  (and (eq_attr "cpu" "sh4a")
+       (eq_attr "type" "cbranch,jump"))
+  "ID_or+sh4a_addrcalc")
+
+;; Jump (JSR,JMP,RTS)
+;; Group: BR
+;; Latency: 3
+(define_insn_reservation "sh4a_jump" 3
+  (and (eq_attr "cpu" "sh4a")
+       (eq_attr "type" "return,jump_ind"))
+  "ID_or+sh4a_addrcalc")
+
+;; RTE
+;; Group: CO
+;; Latency: 3
+(define_insn_reservation "sh4a_rte" 3
+  (and (eq_attr "cpu" "sh4a")
+       (eq_attr "type" "rte"))
+  "ID_and*4")
+
+;; EX Group Single
+;; Group: EX
+;; Latency: 0
+(define_insn_reservation "sh4a_ex" 0
+  (and (eq_attr "cpu" "sh4a")
+       (eq_attr "insn_class" "ex_group"))
+  "sh4a_ex")
+
+;; MOVA
+;; Group: LS
+;; Latency: 1
+(define_insn_reservation "sh4a_mova" 1
+  (and (eq_attr "cpu" "sh4a")
+       (eq_attr "type" "mova"))
+  "sh4a_ls+sh4a_load_store")
+
+;; MOV
+;; Group: MT
+;; Latency: 0
+(define_insn_reservation "sh4a_mov" 0
+  (and (eq_attr "cpu" "sh4a")
+       (eq_attr "type" "move"))
+  "ID_or")
+
+;; Load
+;; Group: LS
+;; Latency: 3
+(define_insn_reservation "sh4a_load" 3
+  (and (eq_attr "cpu" "sh4a")
+       (eq_attr "type" "load,pcload"))
+  "sh4a_ls+sh4a_memory")
+
+(define_insn_reservation "sh4a_load_si" 3
+  (and (eq_attr "cpu" "sh4a")
+       (eq_attr "type" "load_si,pcload_si"))
+  "sh4a_ls+sh4a_memory")
+
+;; Store
+;; Group: LS
+;; Latency: 0
+(define_insn_reservation "sh4a_store" 0
+  (and (eq_attr "cpu" "sh4a")
+       (eq_attr "type" "store"))
+  "sh4a_ls+sh4a_memory")
+
+;; CWB TYPE
+
+;; MOVUA
+;; Group: LS
+;; Latency: 3
+(define_insn_reservation "sh4a_movua" 3
+  (and (eq_attr "cpu" "sh4a")
+       (eq_attr "type" "movua"))
+  "sh4a_ls+sh4a_memory*2")
+
+;; Fixed point multiplication (single)
+;; Group: CO
+;; Latency: 2
+(define_insn_reservation "sh4a_smult" 2
+  (and (eq_attr "cpu" "sh4a")
+       (eq_attr "type" "smpy"))
+  "ID_or+sh4a_mult")
+
+;; Fixed point multiplication (double)
+;; Group: CO
+;; Latency: 3
+(define_insn_reservation "sh4a_dmult" 3
+  (and (eq_attr "cpu" "sh4a")
+       (eq_attr "type" "dmpy"))
+  "ID_or+sh4a_mult")
+
+(define_insn_reservation "sh4a_mac_gp" 3
+  (and (eq_attr "cpu" "sh4a")
+       (eq_attr "type" "mac_gp"))
+  "ID_and")
+
+;; Other MT  group instructions(1 step operations)
+;; Group:	MT
+;; Latency: 	1
+(define_insn_reservation "sh4a_mt" 1
+  (and (eq_attr "cpu" "sh4a")
+       (eq_attr "type" "mt_group"))
+  "ID_or")
+
+;; Floating point reg move
+;; Group: LS
+;; Latency: 2
+(define_insn_reservation "sh4a_freg_mov" 2
+  (and (eq_attr "cpu" "sh4a")
+       (eq_attr "type" "fmove"))
+  "sh4a_ls,sh4a_fls")
+
+;; Single precision floating point computation FCMP/EQ,
+;; FCMP/GT, FADD, FLOAT, FMAC, FMUL, FSUB, FTRC, FRCHG, FSCHG
+;; Group:	FE
+;; Latency: 	3
+(define_insn_reservation "sh4a_fp_arith"  3
+  (and (eq_attr "cpu" "sh4a")
+       (eq_attr "type" "fp"))
+  "ID_or,sh4a_fex")
+
+(define_insn_reservation "sh4a_fp_arith_ftrc"  3
+  (and (eq_attr "cpu" "sh4a")
+       (eq_attr "type" "ftrc_s"))
+  "ID_or,sh4a_fex")
+
+;; Single-precision FDIV/FSQRT
+;; Group: FE
+;; Latency: 20
+(define_insn_reservation "sh4a_fdiv" 20
+  (and (eq_attr "cpu" "sh4a")
+       (eq_attr "type" "fdiv"))
+  "ID_or,sh4a_fex+sh4a_fdiv,sh4a_fex")
+
+;; Double Precision floating point computation
+;; (FCNVDS, FCNVSD, FLOAT, FTRC)
+;; Group:	FE
+;; Latency: 	3
+(define_insn_reservation "sh4a_dp_float" 3
+  (and (eq_attr "cpu" "sh4a")
+       (eq_attr "type" "dfp_conv"))
+  "ID_or,sh4a_fex")
+
+;; Double-precision floating-point (FADD,FMUL,FSUB)
+;; Group:	FE
+;; Latency: 	5
+(define_insn_reservation "sh4a_fp_double_arith" 5
+  (and (eq_attr "cpu" "sh4a")
+       (eq_attr "type" "dfp_arith"))
+  "ID_or,sh4a_fex*3")
+
+;; Double precision FDIV/SQRT
+;; Group:	FE
+;; Latency: 	36
+(define_insn_reservation "sh4a_dp_div" 36
+  (and (eq_attr "cpu" "sh4a")
+       (eq_attr "type" "dfdiv"))
+  "ID_or,sh4a_fex+sh4a_fdiv,sh4a_fex*2")
+
+;; FSRRA
+;; Group: FE
+;; Latency: 5
+(define_insn_reservation "sh4a_fsrra" 5
+  (and (eq_attr "cpu" "sh4a")
+       (eq_attr "type" "fsrra"))
+  "ID_or,sh4a_fex")
+
+;; FSCA
+;; Group: FE
+;; Latency: 7
+(define_insn_reservation "sh4a_fsca" 7
+  (and (eq_attr "cpu" "sh4a")
+       (eq_attr "type" "fsca"))
+  "ID_or,sh4a_fex*3")
diff --git a/gcc/config/sh/t-mlib-sh4a b/gcc/config/sh/t-mlib-sh4a
new file mode 100644
index 00000000000..788b852962d
--- /dev/null
+++ b/gcc/config/sh/t-mlib-sh4a
@@ -0,0 +1 @@
+ML_sh4a=m4a/
diff --git a/gcc/config/sh/t-mlib-sh4a-nofpu b/gcc/config/sh/t-mlib-sh4a-nofpu
new file mode 100644
index 00000000000..c9dc28bb886
--- /dev/null
+++ b/gcc/config/sh/t-mlib-sh4a-nofpu
@@ -0,0 +1 @@
+ML_sh4a_nofpu=m4a-nofpu/
diff --git a/gcc/config/sh/t-mlib-sh4a-single b/gcc/config/sh/t-mlib-sh4a-single
new file mode 100644
index 00000000000..036a4cc0a36
--- /dev/null
+++ b/gcc/config/sh/t-mlib-sh4a-single
@@ -0,0 +1 @@
+ML_sh4a_single=m4a-single/
diff --git a/gcc/config/sh/t-mlib-sh4a-single-only b/gcc/config/sh/t-mlib-sh4a-single-only
new file mode 100644
index 00000000000..5709e8ef789
--- /dev/null
+++ b/gcc/config/sh/t-mlib-sh4a-single-only
@@ -0,0 +1 @@
+ML_sh4a_single_only=m4a-single-only/
diff --git a/gcc/config/sh/t-mlib-sh4al b/gcc/config/sh/t-mlib-sh4al
new file mode 100644
index 00000000000..e8e36ba5b86
--- /dev/null
+++ b/gcc/config/sh/t-mlib-sh4al
@@ -0,0 +1 @@
+ML_sh4al=m4al/
diff --git a/gcc/config/sh/t-sh b/gcc/config/sh/t-sh
index 8e262fa7e00..97dd99bf101 100644
--- a/gcc/config/sh/t-sh
+++ b/gcc/config/sh/t-sh
@@ -22,15 +22,16 @@ fp-bit.c: $(srcdir)/config/fp-bit.c
 	cat $(srcdir)/config/fp-bit.c >> fp-bit.c
 
 MULTILIB_ENDIAN = ml/mb
-MULTILIB_CPUS= $(ML_sh1)$(ML_sh2e)$(ML_sh2)$(ML_sh3e)$(ML_sh3)$(ML_sh4_nofpu)$(ML_sh4_single_only)$(ML_sh4_single)$(ML_sh4)$(ML_m5_32media)$(ML_m5_32media_nofpu)$(ML_m5_compact)$(ML_m5_compact_nofpu)$(ML_m5_64media)$(ML_m5_64media_nofpu)
+MULTILIB_CPUS= $(ML_sh1)$(ML_sh2e)$(ML_sh2)$(ML_sh3e)$(ML_sh3)$(ML_sh4_nofpu)$(ML_sh4_single_only)$(ML_sh4_single)$(ML_sh4)$(ML_sh4a_nofpu)$(ML_sh4a_single_only)$(ML_sh4a_single)$(ML_sh4a)$(ML_m5_32media)$(ML_m5_32media_nofpu)$(ML_m5_compact)$(ML_m5_compact_nofpu)$(ML_m5_64media)$(ML_m5_64media_nofpu)
 
 MULTILIB_OPTIONS= $(MULTILIB_ENDIAN) $(MULTILIB_CPUS:/=)
 MULTILIB_DIRNAMES= 
 #MULTILIB_MATCHES = m2=m3 m2e=m3e m2=m4-nofpu
 MULTILIB_MATCHES = $(shell \
   multilibs="$(MULTILIB_OPTIONS)" ; \
-  for abi in m1,m2,m3,m4-nofpu \
-             m2e,m3e,m4-single-only \
+  for abi in m1,m2,m3,m4-nofpu,m4al,m4a-nofpu \
+             m2e,m3e,m4-single-only,m4a-single-only \
+             m4-single,m4a-single m4,m4a \
              m5-32media,m5-compact,m5-32media \
              m5-32media-nofpu,m5-compact-nofpu,m5-32media-nofpu; do \
     subst= ; \
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 2dc69698307..204c27bca7b 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -621,6 +621,7 @@ See RS/6000 and PowerPC Options.
 @emph{SH Options}
 @gccoptlist{-m1  -m2  -m2e  -m3  -m3e @gol
 -m4-nofpu  -m4-single-only  -m4-single  -m4 @gol
+-m4a-nofpu -m4a-single-only -m4a-single -m4a -m4al @gol
 -m5-64media  -m5-64media-nofpu @gol
 -m5-32media  -m5-32media-nofpu @gol
 -m5-compact  -m5-compact-nofpu @gol
@@ -10518,6 +10519,31 @@ single-precision mode by default.
 @opindex m4
 Generate code for the SH4.
 
+@item -m4a-nofpu
+@opindex m4a-nofpu
+Generate code for the SH4al-dsp, or for a SH4a in such a way that the
+floating-point unit is not used.
+
+@item -m4a-single-only
+@opindex m4a-single-only
+Generate code for the SH4a, in such a way that no double-precision
+floating point operations are used.
+
+@item -m4a-single
+@opindex m4a-single
+Generate code for the SH4a assuming the floating-point unit is in
+single-precision mode by default.
+
+@item -m4a
+@opindex m4a
+Generate code for the SH4a.
+
+@item -m4al
+@opindex m4al
+Same as @option{-m4a-nofpu}, except that it implicitly passes
+@option{-dsp} to the assembler.  GCC doesn't generate any DSP
+instructions at the moment.
+
 @item -mb
 @opindex mb
 Compile code for the processor in big endian mode.
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 8b99613999f..78fa42055bf 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,19 @@
+2004-07-28  Alexandre Oliva  <aoliva@redhat.com>
+
+	2003-07-06  Alexandre Oliva  <aoliva@redhat.com>
+	* gcc.dg/sh4a-memmovua.c: Tweak regular expression.
+	2003-07-01  Alexandre Oliva  <aoliva@redhat.com>
+	* gcc.dg/sh4a-bitmovua.c: New.
+	* gcc.dg/sh4a-cos.c: New.
+	* gcc.dg/sh4a-cosf.c: New.
+	* gcc.dg/sh4a-fprun.c: New.
+	* gcc.dg/sh4a-fsrra.c: New.
+	* gcc.dg/sh4a-memmovua.c: New.
+	* gcc.dg/sh4a-sin.c: New.
+	* gcc.dg/sh4a-sincos.c: New.
+	* gcc.dg/sh4a-sincosf.c: New.
+	* gcc.dg/sh4a-sinf.c: New.
+
 2004-07-28  Diego Novillo  <dnovillo@redhat.com>
 
 	* gcc.dg/tree-ssa/20030714-2.c: Adjust number of expected
diff --git a/gcc/testsuite/gcc.dg/sh4a-bitmovua.c b/gcc/testsuite/gcc.dg/sh4a-bitmovua.c
new file mode 100644
index 00000000000..b7081bf7186
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/sh4a-bitmovua.c
@@ -0,0 +1,73 @@
+/* Verify that we generate movua to load unaligned 32-bit values.  */
+/* { dg-do compile { target "sh*-*-*" } } */
+/* { dg-options "-O" } */
+/* { dg-final { scan-assembler-times "\tmovua\\.l\t" 6 } } */
+
+#ifdef __SH4A__
+/* Aligned.  */
+struct s0 { long long d : 32; } x0;
+long long f0() {
+  return x0.d;
+}
+
+/* Unaligned load.  */
+struct s1 { long long c : 8; long long d : 32; } x1;
+long long f1() {
+  return x1.d;
+}
+
+/* Unaligned load.  */
+struct s2 { long long c : 16; long long d : 32; } x2;
+long long f2() {
+  return x2.d;
+}
+
+/* Unaligned load.  */
+struct s3 { long long c : 24; long long d : 32; } x3;
+long long f3() {
+  return x3.d;
+}
+
+/* Aligned.  */
+struct s4 { long long c : 32; long long d : 32; } x4;
+long long f4() {
+  return x4.d;
+}
+
+/* Aligned.  */
+struct u0 { unsigned long long d : 32; } y0;
+unsigned long long g0() {
+  return y0.d;
+}
+
+/* Unaligned load.  */
+struct u1 { long long c : 8; unsigned long long d : 32; } y1;
+unsigned long long g1() {
+  return y1.d;
+}
+
+/* Unaligned load.  */
+struct u2 { long long c : 16; unsigned long long d : 32; } y2;
+unsigned long long g2() {
+  return y2.d;
+}
+
+/* Unaligned load.  */
+struct u3 { long long c : 24; unsigned long long d : 32; } y3;
+unsigned long long g3() {
+  return y3.d;
+}
+
+/* Aligned.  */
+struct u4 { long long c : 32; unsigned long long d : 32; } y4;
+unsigned long long g4() {
+  return y4.d;
+}
+#else
+asm ("movua.l\t");
+asm ("movua.l\t");
+asm ("movua.l\t");
+asm ("movua.l\t");
+asm ("movua.l\t");
+asm ("movua.l\t");
+#endif
diff --git a/gcc/testsuite/gcc.dg/sh4a-cos.c b/gcc/testsuite/gcc.dg/sh4a-cos.c
new file mode 100644
index 00000000000..198d41f8675
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/sh4a-cos.c
@@ -0,0 +1,13 @@
+/* Verify that we generate single-precision sine and cosine approximate
+   (fsca) in fast math mode.  */
+/* { dg-do compile { target "sh*-*-*" } } */
+/* { dg-options "-O -ffast-math" } */
+/* { dg-final { scan-assembler "\tfsca\t" } } */
+
+#if defined __SH4A__ && ! defined __SH4_NOFPU__
+#include <math.h>
+
+double test(double f) { return cos(f); }
+#else
+asm ("fsca\t");
+#endif
diff --git a/gcc/testsuite/gcc.dg/sh4a-cosf.c b/gcc/testsuite/gcc.dg/sh4a-cosf.c
new file mode 100644
index 00000000000..f78c140d501
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/sh4a-cosf.c
@@ -0,0 +1,13 @@
+/* Verify that we generate single-precision sine and cosine approximate
+   (fsca) in fast math mode.  */
+/* { dg-do compile { target "sh*-*-*" } } */
+/* { dg-options "-O -ffast-math" } */
+/* { dg-final { scan-assembler "\tfsca\t" } } */
+
+#if defined __SH4A__ && ! defined __SH4_NOFPU__
+#include <math.h>
+
+float test(float f) { return cosf(f); }
+#else
+asm ("fsca\t");
+#endif
diff --git a/gcc/testsuite/gcc.dg/sh4a-fprun.c b/gcc/testsuite/gcc.dg/sh4a-fprun.c
new file mode 100644
index 00000000000..8e26dc170a1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/sh4a-fprun.c
@@ -0,0 +1,35 @@
+/* Verify that fsca and fsrra yield reasonable results.  */
+/* { dg-do run { target "sh*-*-*" } } */
+/* { dg-options "-O -ffast-math" } */
+
+#include <math.h>
+#include <stdlib.h>
+
+float sqrt_arg = 4.0f, sqrt_res = 2.0f;
+float dg2rad_f;
+double dg2rad_d;
+
+void check_f (float res, float expected) {
+  if (res >= expected - 0.001f && res <= expected + 0.001f)
+    return;
+
+  abort ();
+}
+
+void check_d (double res, double expected) {
+  if (res >= expected - 0.001 && res <= expected + 0.001)
+    return;
+
+  abort ();
+}
+
+int main() {
+  check_f (sqrtf(sqrt_arg), sqrt_res);
+  dg2rad_f = dg2rad_d = atan(1) / 45;
+  check_f (sinf(90*dg2rad_f), 1);
+  check_f (cosf(90*dg2rad_f), 0);
+  check_d (sin(-90*dg2rad_d), -1);
+  check_d (cos(180*dg2rad_d), -1);
+  check_d (sin(-45*dg2rad_d) * cosf(135*dg2rad_f), 0.5);
+  exit (0);
+}
diff --git a/gcc/testsuite/gcc.dg/sh4a-fsrra.c b/gcc/testsuite/gcc.dg/sh4a-fsrra.c
new file mode 100644
index 00000000000..c8f04e4d2e2
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/sh4a-fsrra.c
@@ -0,0 +1,13 @@
+/* Verify that we generate single-precision square root reciprocal
+   approximate (fsrra) in fast math mode.  */
+/* { dg-do compile { target "sh*-*-*" } } */
+/* { dg-options "-O -ffast-math" } */
+/* { dg-final { scan-assembler "\tfsrra\t" } } */
+
+#if defined __SH4A__ && ! defined __SH4_NOFPU__
+#include <math.h>
+
+float test(float f) { return 1 / sqrtf(f); }
+#else
+asm ("fsrra\t");
+#endif
diff --git a/gcc/testsuite/gcc.dg/sh4a-memmovua.c b/gcc/testsuite/gcc.dg/sh4a-memmovua.c
new file mode 100644
index 00000000000..68927929854
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/sh4a-memmovua.c
@@ -0,0 +1,17 @@
+/* Verify that we generate movua to copy unaligned memory regions to
+   32-bit-aligned addresses.  */
+/* { dg-do compile { target "sh*-*-*" } } */
+/* { dg-options "-O" } */
+/* { dg-final { scan-assembler-times "\tmovua\\.l\t(.*)+" 2 } } */
+
+#ifdef __SH4A__
+#include <string.h>
+
+struct s { int i; char a[10], b[10]; } x;
+int f() {
+  memcpy(x.a, x.b, 10);
+}
+#else
+asm ("movua.l\t+");
+asm ("movua.l\t+");
+#endif
diff --git a/gcc/testsuite/gcc.dg/sh4a-sin.c b/gcc/testsuite/gcc.dg/sh4a-sin.c
new file mode 100644
index 00000000000..9f46f600763
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/sh4a-sin.c
@@ -0,0 +1,13 @@
+/* Verify that we generate single-precision sine and cosine approximate
+   (fsca) in fast math mode.  */
+/* { dg-do compile { target "sh*-*-*" } } */
+/* { dg-options "-O -ffast-math" } */
+/* { dg-final { scan-assembler "\tfsca\t" } } */
+
+#if defined __SH4A__ && ! defined __SH4_NOFPU__
+#include <math.h>
+
+double test(double f) { return sin(f); }
+#else
+asm ("fsca\t");
+#endif
diff --git a/gcc/testsuite/gcc.dg/sh4a-sincos.c b/gcc/testsuite/gcc.dg/sh4a-sincos.c
new file mode 100644
index 00000000000..f4293797534
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/sh4a-sincos.c
@@ -0,0 +1,14 @@
+/* Verify that we generate a single single-precision sine and cosine
+   approximate (fsca) in fast math mode when a function computes both
+   sine and cosine.  */
+/* { dg-do compile { target "sh*-*-*" } } */
+/* { dg-options "-O -ffast-math" } */
+/* { dg-final { scan-assembler-times "\tfsca\t" 1 } } */
+
+#if defined __SH4A__ && ! defined __SH4_NOFPU__
+#include <math.h>
+
+double test(double f) { return sin(f) + cos(f); }
+#else
+asm ("fsca\t");
+#endif
diff --git a/gcc/testsuite/gcc.dg/sh4a-sincosf.c b/gcc/testsuite/gcc.dg/sh4a-sincosf.c
new file mode 100644
index 00000000000..42913dbd59e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/sh4a-sincosf.c
@@ -0,0 +1,14 @@
+/* Verify that we generate a single single-precision sine and cosine
+   approximate (fsca) in fast math mode when a function computes both
+   sine and cosine.  */
+/* { dg-do compile { target "sh*-*-*" } } */
+/* { dg-options "-O -ffast-math" } */
+/* { dg-final { scan-assembler-times "\tfsca\t" 1 } } */
+
+#if defined __SH4A__ && ! defined __SH4_NOFPU__
+#include <math.h>
+
+float test(float f) { return sinf(f) + cosf(f); }
+#else
+asm ("fsca\t");
+#endif
diff --git a/gcc/testsuite/gcc.dg/sh4a-sinf.c b/gcc/testsuite/gcc.dg/sh4a-sinf.c
new file mode 100644
index 00000000000..2a2343fd73a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/sh4a-sinf.c
@@ -0,0 +1,13 @@
+/* Verify that we generate single-precision sine and cosine approximate
+   (fsca) in fast math mode.  */
+/* { dg-do compile { target "sh*-*-*" } } */
+/* { dg-options "-O -ffast-math" } */
+/* { dg-final { scan-assembler "\tfsca\t" } } */
+
+#if defined __SH4A__ && ! defined __SH4_NOFPU__
+#include <math.h>
+
+float test(float f) { return sinf(f); }
+#else
+asm ("fsca\t");
+#endif
diff --git a/libstdc++-v3/ChangeLog b/libstdc++-v3/ChangeLog
index 3a16fce910e..aed1079e74d 100644
--- a/libstdc++-v3/ChangeLog
+++ b/libstdc++-v3/ChangeLog
@@ -1,3 +1,11 @@
+2004-07-28  Alexandre Oliva  <aoliva@redhat.com>
+
+	2003-10-01  Eric Christopher  <echristo@redhat.com>
+	* config/cpu/sh/atomicity.h (__exchange_and_add): Remove 'm'
+	constraint.
+	2003-07-09  Alexandre Oliva  <aoliva@redhat.com>
+	* config/cpu/sh/atomicity.h: New.  Use movli and movco on SH4a.
+
 2004-07-23  Benjamin Kosnik  <bkoz@redhat.com>
 
 	PR libstdc++/16678
diff --git a/libstdc++-v3/config/cpu/sh/atomicity.h b/libstdc++-v3/config/cpu/sh/atomicity.h
new file mode 100644
index 00000000000..b7d6c605d72
--- /dev/null
+++ b/libstdc++-v3/config/cpu/sh/atomicity.h
@@ -0,0 +1,123 @@
+// Low-level functions for atomic operations: SH version  -*- C++ -*-
+
+// Copyright (C) 1999, 2001, 2002, 2003 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 2, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING.  If not, write to the Free
+// Software Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+// USA.
+
+// As a special exception, you may use this file as part of a free software
+// library without restriction.  Specifically, if other files instantiate
+// templates or use macros or inline functions from this file, or you compile
+// this file and link it with other files to produce an executable, this
+// file does not by itself cause the resulting executable to be covered by
+// the GNU General Public License.  This exception does not however
+// invalidate any other reasons why the executable file might be covered by
+// the GNU General Public License.
+
+#ifndef _BITS_ATOMICITY_H
+#define _BITS_ATOMICITY_H	1
+
+#ifdef __SH4A__
+
+typedef int _Atomic_word;
+
+// Atomically add __val to *__mem and return the value *__mem held before.
+static inline _Atomic_word __attribute__ ((__unused__))
+__exchange_and_add (volatile _Atomic_word* __mem, int __val)
+{
+  _Atomic_word __result;
+  // %1 must be early-clobbered ("=&r"): it is written before %2/%3 are last read.
+  __asm__ __volatile__
+    ("0:\n"
+     "\tmovli.l\t@%2,r0\n"
+     "\tmov\tr0,%1\n"
+     "\tadd\t%3,r0\n"
+     "\tmovco.l\tr0,@%2\n"
+     "\tbf\t0b"
+     : "+m" (*__mem), "=&r" (__result)
+     : "r" (__mem), "rI08" (__val)
+     : "r0");
+
+  return __result;
+}
+
+
+// Atomically add __val to *__mem (movli.l/movco.l loop, retried via bf).
+static inline void __attribute__ ((__unused__))
+__atomic_add (volatile _Atomic_word* __mem, int __val)
+{
+  __asm__ __volatile__ ("0:\n"
+      "\tmovli.l\t@%1,r0\n"
+      "\tadd\t%2,r0\n"
+      "\tmovco.l\tr0,@%1\n"
+      "\tbf\t0b"
+      : "+m" (*__mem)
+      : "r" (__mem), "rI08" (__val)
+      : "r0");
+}
+
+#else
+
+/* This is generic/atomicity.h */
+
+#include <bits/gthr.h>
+
+#define _GLIBCPP_NEED_GENERIC_MUTEX
+
+typedef int _Atomic_word;
+
+namespace __gnu_cxx
+{
+  extern __gthread_mutex_t _Atomic_add_mutex;
+
+#ifndef __GTHREAD_MUTEX_INIT
+  extern __gthread_once_t _Atomic_add_mutex_once;
+  extern void __gthread_atomic_add_mutex_once();
+#endif
+}
+
+static inline _Atomic_word
+__attribute__ ((__unused__))
+__exchange_and_add (volatile _Atomic_word* __mem, int __val)
+{
+#ifndef __GTHREAD_MUTEX_INIT
+  __gthread_once (&__gnu_cxx::_Atomic_add_mutex_once,
+                  __gnu_cxx::__gthread_atomic_add_mutex_once);
+#endif
+
+  _Atomic_word __result;
+
+  __gthread_mutex_lock (&__gnu_cxx::_Atomic_add_mutex);
+
+  __result = *__mem;
+  *__mem += __val;
+
+  __gthread_mutex_unlock (&__gnu_cxx::_Atomic_add_mutex);
+  return __result;
+}
+
+
+static inline void
+__attribute__ ((__unused__))
+__atomic_add (volatile _Atomic_word* __mem, int __val)
+{
+  (void) __exchange_and_add (__mem, __val);
+}
+
+
+#endif
+
+#endif /* atomicity.h */
-- 
2.30.2