+2017-03-27 Michael Meissner <meissner@linux.vnet.ibm.com>
+
+ PR target/78543
+ * config/rs6000/rs6000.md (bswaphi2_extenddi): Combine bswap
+ HImode and SImode with zero extend to DImode to one insn.
+ (bswap<mode>2_extenddi): Likewise.
+ (bswapsi2_extenddi): Likewise.
+ (bswaphi2_extendsi): Likewise.
+ (bswaphi2): Combine bswap HImode and SImode into one insn.
+ Separate memory insns from swapping register.
+ (bswapsi2): Likewise.
+ (bswap<mode>2): Likewise.
+ (bswaphi2_internal): Delete, no longer used.
+ (bswapsi2_internal): Likewise.
+ (bswap<mode>2_load): Split bswap HImode/SImode into separate load,
+ store, and gpr<-gpr swap insns.
+ (bswap<mode>2_store): Likewise.
+ (bswaphi2_reg): Register only splitter, combine with the splitter.
+ (bswaphi2 splitter): Likewise.
+ (bswapsi2_reg): Likewise.
+ (bswapsi2 splitter): Likewise.
+ (bswapdi2): If we have the LDBRX and STDBRX instructions, split
+ the insns into load, store, and register/register insns.
+ (bswapdi2_ldbrx): Likewise.
+ (bswapdi2_load): Likewise.
+ (bswapdi2_store): Likewise.
+ (bswapdi2_reg): Likewise.
+
2017-03-27 Gunther Nikl <gnikl@users.sourceforge.net>
* system.h (HAVE_DESIGNATED_INITIALIZERS): Fix non C++ case.
;; Since the hardware zeros the upper part of the register, save generating the
;; AND immediate if we are converting to unsigned
-(define_insn "*bswaphi2_extenddi"
+(define_insn "*bswap<mode>2_extenddi"
[(set (match_operand:DI 0 "gpc_reg_operand" "=r")
(zero_extend:DI
- (bswap:HI (match_operand:HI 1 "memory_operand" "Z"))))]
+ (bswap:HSI (match_operand:HSI 1 "memory_operand" "Z"))))]
"TARGET_POWERPC64"
- "lhbrx %0,%y1"
+ "l<wd>brx %0,%y1"
[(set_attr "length" "4")
(set_attr "type" "load")])
[(set_attr "length" "4")
(set_attr "type" "load")])
-(define_expand "bswaphi2"
- [(parallel [(set (match_operand:HI 0 "reg_or_mem_operand" "")
- (bswap:HI
- (match_operand:HI 1 "reg_or_mem_operand" "")))
- (clobber (match_scratch:SI 2 ""))])]
+;; Separate the bswap patterns into load, store, and gpr<-gpr. This prevents
+;; the register allocator from converting a gpr<-gpr swap into a store and then
+;; load with byte swap, which can be slower than doing it in the registers. It
+;; also prevents certain failures with the RELOAD register allocator.
+
+(define_expand "bswap<mode>2"
+ [(use (match_operand:HSI 0 "reg_or_mem_operand"))
+ (use (match_operand:HSI 1 "reg_or_mem_operand"))]
""
{
- if (!REG_P (operands[0]) && !REG_P (operands[1]))
- operands[1] = force_reg (HImode, operands[1]);
+ rtx dest = operands[0];
+ rtx src = operands[1];
+
+ if (!REG_P (dest) && !REG_P (src))
+ src = force_reg (<MODE>mode, src);
+
+ if (MEM_P (src))
+ emit_insn (gen_bswap<mode>2_load (dest, src));
+ else if (MEM_P (dest))
+ emit_insn (gen_bswap<mode>2_store (dest, src));
+ else
+ emit_insn (gen_bswap<mode>2_reg (dest, src));
+ DONE;
})
-(define_insn "bswaphi2_internal"
- [(set (match_operand:HI 0 "reg_or_mem_operand" "=r,Z,&r")
- (bswap:HI
- (match_operand:HI 1 "reg_or_mem_operand" "Z,r,r")))
- (clobber (match_scratch:SI 2 "=X,X,&r"))]
+(define_insn "bswap<mode>2_load"
+ [(set (match_operand:HSI 0 "gpc_reg_operand" "=r")
+ (bswap:HSI (match_operand:HSI 1 "memory_operand" "Z")))]
""
- "@
- lhbrx %0,%y1
- sthbrx %1,%y0
- #"
- [(set_attr "length" "4,4,12")
- (set_attr "type" "load,store,*")])
+ "l<wd>brx %0,%y1"
+ [(set_attr "type" "load")])
-(define_split
- [(set (match_operand:HI 0 "gpc_reg_operand" "")
- (bswap:HI (match_operand:HI 1 "gpc_reg_operand" "")))
- (clobber (match_operand:SI 2 "gpc_reg_operand" ""))]
+(define_insn "bswap<mode>2_store"
+ [(set (match_operand:HSI 0 "memory_operand" "=Z")
+ (bswap:HSI (match_operand:HSI 1 "gpc_reg_operand" "r")))]
+ ""
+ "st<wd>brx %1,%y0"
+ [(set_attr "type" "store")])
+
+(define_insn_and_split "bswaphi2_reg"
+ [(set (match_operand:HI 0 "gpc_reg_operand" "=&r")
+ (bswap:HI
+ (match_operand:HI 1 "gpc_reg_operand" "r")))
+ (clobber (match_scratch:SI 2 "=&r"))]
+ ""
+ "#"
"reload_completed"
[(set (match_dup 3)
(and:SI (lshiftrt:SI (match_dup 4)
(set (match_dup 3)
(ior:SI (match_dup 3)
(match_dup 2)))]
- "
{
operands[3] = simplify_gen_subreg (SImode, operands[0], HImode, 0);
operands[4] = simplify_gen_subreg (SImode, operands[1], HImode, 0);
-}")
-
-(define_insn "*bswapsi2_extenddi"
- [(set (match_operand:DI 0 "gpc_reg_operand" "=r")
- (zero_extend:DI
- (bswap:SI (match_operand:SI 1 "memory_operand" "Z"))))]
- "TARGET_POWERPC64"
- "lwbrx %0,%y1"
- [(set_attr "length" "4")
- (set_attr "type" "load")])
-
-(define_expand "bswapsi2"
- [(set (match_operand:SI 0 "reg_or_mem_operand" "")
- (bswap:SI
- (match_operand:SI 1 "reg_or_mem_operand" "")))]
- ""
-{
- if (!REG_P (operands[0]) && !REG_P (operands[1]))
- operands[1] = force_reg (SImode, operands[1]);
-})
-
-(define_insn "*bswapsi2_internal"
- [(set (match_operand:SI 0 "reg_or_mem_operand" "=r,Z,&r")
- (bswap:SI
- (match_operand:SI 1 "reg_or_mem_operand" "Z,r,r")))]
- ""
- "@
- lwbrx %0,%y1
- stwbrx %1,%y0
- #"
- [(set_attr "length" "4,4,12")
- (set_attr "type" "load,store,*")])
+}
+ [(set_attr "length" "12")
+ (set_attr "type" "*")])
;; We are always BITS_BIG_ENDIAN, so the bit positions below in
;; zero_extract insns do not change for -mlittle.
-(define_split
- [(set (match_operand:SI 0 "gpc_reg_operand" "")
- (bswap:SI (match_operand:SI 1 "gpc_reg_operand" "")))]
+(define_insn_and_split "bswapsi2_reg"
+ [(set (match_operand:SI 0 "gpc_reg_operand" "=&r")
+ (bswap:SI
+ (match_operand:SI 1 "gpc_reg_operand" "r")))]
+ ""
+ "#"
"reload_completed"
[(set (match_dup 0) ; DABC
(rotate:SI (match_dup 1)
(const_int 24))
(const_int 255))
(and:SI (match_dup 0)
- (const_int -256))))
-
- ]
+ (const_int -256))))]
"")
+;; On systems with LDBRX/STDBRX generate the loads/stores directly, just like
+;; we do for L{H,W}BRX and ST{H,W}BRX above. If not, we have to generate more
+;; complex code.
+
(define_expand "bswapdi2"
[(parallel [(set (match_operand:DI 0 "reg_or_mem_operand" "")
(bswap:DI
(clobber (match_scratch:DI 3 ""))])]
""
{
- if (!REG_P (operands[0]) && !REG_P (operands[1]))
- operands[1] = force_reg (DImode, operands[1]);
+ rtx dest = operands[0];
+ rtx src = operands[1];
+
+ if (!REG_P (dest) && !REG_P (src))
+ operands[1] = src = force_reg (DImode, src);
+
+ if (TARGET_POWERPC64 && TARGET_LDBRX)
+ {
+ if (MEM_P (src))
+ emit_insn (gen_bswapdi2_load (dest, src));
+ else if (MEM_P (dest))
+ emit_insn (gen_bswapdi2_store (dest, src));
+ else
+ emit_insn (gen_bswapdi2_reg (dest, src));
+ DONE;
+ }
if (!TARGET_POWERPC64)
{
/* 32-bit mode needs fewer scratch registers, but 32-bit addressing mode
that uses 64-bit registers needs the same scratch registers as 64-bit
mode. */
- emit_insn (gen_bswapdi2_32bit (operands[0], operands[1]));
+ emit_insn (gen_bswapdi2_32bit (dest, src));
DONE;
}
})
;; Power7/cell has ldbrx/stdbrx, so use it directly
-(define_insn "*bswapdi2_ldbrx"
- [(set (match_operand:DI 0 "reg_or_mem_operand" "=r,Z,&r")
- (bswap:DI (match_operand:DI 1 "reg_or_mem_operand" "Z,r,r")))
- (clobber (match_scratch:DI 2 "=X,X,&r"))
- (clobber (match_scratch:DI 3 "=X,X,&r"))]
- "TARGET_POWERPC64 && TARGET_LDBRX
- && (REG_P (operands[0]) || REG_P (operands[1]))"
- "@
- ldbrx %0,%y1
- stdbrx %1,%y0
- #"
- [(set_attr "length" "4,4,36")
- (set_attr "type" "load,store,*")])
+(define_insn "bswapdi2_load"
+ [(set (match_operand:DI 0 "gpc_reg_operand" "=r")
+ (bswap:DI (match_operand:DI 1 "memory_operand" "Z")))]
+ "TARGET_POWERPC64 && TARGET_LDBRX"
+ "ldbrx %0,%y1"
+ [(set_attr "type" "load")])
+
+(define_insn "bswapdi2_store"
+ [(set (match_operand:DI 0 "memory_operand" "=Z")
+ (bswap:DI (match_operand:DI 1 "gpc_reg_operand" "r")))]
+ "TARGET_POWERPC64 && TARGET_LDBRX"
+ "stdbrx %1,%y0"
+ [(set_attr "type" "store")])
+
+(define_insn "bswapdi2_reg"
+ [(set (match_operand:DI 0 "gpc_reg_operand" "=&r")
+ (bswap:DI (match_operand:DI 1 "gpc_reg_operand" "r")))
+ (clobber (match_scratch:DI 2 "=&r"))
+ (clobber (match_scratch:DI 3 "=&r"))]
+ "TARGET_POWERPC64 && TARGET_LDBRX"
+ "#"
+ [(set_attr "length" "36")])
;; Non-power7/cell, fall back to use lwbrx/stwbrx
(define_insn "*bswapdi2_64bit"