(UNSPEC_ADDP4 24)
(UNSPEC_PROLOGUE_USE 25)
(UNSPEC_RET_ADDR 26)
+ (UNSPEC_SETF_EXP 27)
+ (UNSPEC_FR_SQRT_RECIP_APPROX 28)
])
(define_constants
operands[10] = CONST1_RTX (XFmode);
}
[(set_attr "predicable" "no")])
+
+;; Inline square root.
+
+(define_insn "*sqrt_approx"
+ [(set (match_operand:XF 0 "fr_register_operand" "=f")
+ (div:XF (const_int 1)
+ (sqrt:XF (match_operand:XF 2 "fr_register_operand" "f"))))
+ (set (match_operand:BI 1 "register_operand" "=c")
+ (unspec:BI [(match_dup 2)] UNSPEC_FR_SQRT_RECIP_APPROX))
+ (use (match_operand:SI 3 "const_int_operand" ""))]
+ ""
+ "frsqrta.s%3 %0, %1 = %2"
+ [(set_attr "itanium_class" "fmisc")
+ (set_attr "predicable" "no")])
+
+(define_insn "*setf_exp_xf"
+ [(set (match_operand:XF 0 "fr_register_operand" "=f")
+ (unspec:XF [(match_operand:DI 1 "register_operand" "r")]
+ UNSPEC_SETF_EXP))]
+ ""
+ "setf.exp %0 = %1"
+ [(set_attr "itanium_class" "frfr")])
+
+(define_expand "sqrtsf2"
+ [(set (match_operand:SF 0 "fr_register_operand" "=&f")
+ (sqrt:SF (match_operand:SF 1 "fr_register_operand" "f")))]
+ "TARGET_INLINE_SQRT"
+{
+ rtx insn;
+ if (TARGET_INLINE_SQRT_LAT)
+#if 0
+ insn = gen_sqrtsf2_internal_lat (operands[0], operands[1]);
+#else
+ abort ();
+#endif
+ else
+ insn = gen_sqrtsf2_internal_thr (operands[0], operands[1]);
+ emit_insn (insn);
+ DONE;
+})
+
+;; Latency-optimized square root.
+;; FIXME: Implement.
+
+;; Throughput-optimized square root.
+
+(define_insn_and_split "sqrtsf2_internal_thr"
+ [(set (match_operand:SF 0 "fr_register_operand" "=&f")
+ (sqrt:SF (match_operand:SF 1 "fr_register_operand" "f")))
+ ;; Register r2 in optimization guide.
+ (clobber (match_scratch:DI 2 "=r"))
+ ;; Register f8 in optimization guide.
+ (clobber (match_scratch:XF 3 "=&f"))
+ ;; Register f9 in optimization guide.
+ (clobber (match_scratch:XF 4 "=&f"))
+ ;; Register f10 in optimization guide.
+ (clobber (match_scratch:XF 5 "=&f"))
+ ;; Register p6 in optimization guide.
+ (clobber (match_scratch:BI 6 "=c"))]
+ "TARGET_INLINE_SQRT_THR"
+ "#"
+ "&& reload_completed"
+ [ ;; exponent of +1/2 in r2
+ (set (match_dup 2) (const_int 65534))
+ ;; +1/2 in f8
+ (set (match_dup 3)
+ (unspec:XF [(match_dup 2)] UNSPEC_SETF_EXP))
+ ;; Step 1
+ ;; y0 = 1/sqrt(a) in f7
+ (parallel [(set (match_dup 7)
+ (div:XF (const_int 1)
+ (sqrt:XF (match_dup 8))))
+ (set (match_dup 6)
+ (unspec:BI [(match_dup 8)]
+ UNSPEC_FR_SQRT_RECIP_APPROX))
+ (use (const_int 0))])
+ ;; Step 2
+ ;; H0 = 1/2 * y0 in f9
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 4)
+ (plus:XF (mult:XF (match_dup 3) (match_dup 7))
+ (match_dup 9)))
+ (use (const_int 1))]))
+ ;; Step 3
+ ;; S0 = a * y0 in f7
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 7)
+ (plus:XF (mult:XF (match_dup 8) (match_dup 7))
+ (match_dup 9)))
+ (use (const_int 1))]))
+ ;; Step 4
+ ;; d = 1/2 - S0 * H0 in f10
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 5)
+ (plus:XF (neg:XF (mult:XF (match_dup 7) (match_dup 4)))
+ (match_dup 3)))
+ (use (const_int 1))]))
+ ;; Step 5
+ ;; d' = d + 1/2 * d in f8
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 3)
+ (plus:XF (mult:XF (match_dup 3) (match_dup 5))
+ (match_dup 5)))
+ (use (const_int 1))]))
+ ;; Step 6
+ ;; e = d + d * d' in f8
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 3)
+ (plus:XF (mult:XF (match_dup 5) (match_dup 3))
+ (match_dup 5)))
+ (use (const_int 1))]))
+ ;; Step 7
+ ;; S1 = S0 + e * S0 in f7
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 0)
+ (float_truncate:SF
+ (plus:XF (mult:XF (match_dup 3) (match_dup 7))
+ (match_dup 7))))
+ (use (const_int 1))]))
+ ;; Step 8
+ ;; H1 = H0 + e * H0 in f8
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 3)
+ (plus:XF (mult:XF (match_dup 3) (match_dup 4))
+ (match_dup 4)))
+ (use (const_int 1))]))
+ ;; Step 9
+ ;; d1 = a - S1 * S1 in f9
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 4)
+ (plus:XF (neg:XF (mult:XF (match_dup 7) (match_dup 7)))
+ (match_dup 8)))
+ (use (const_int 1))]))
+ ;; Step 10
+ ;; S = S1 + d1 * H1 in f7
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 0)
+ (float_truncate:SF
+ (plus:XF (mult:XF (match_dup 4) (match_dup 3))
+ (match_dup 7))))
+ (use (const_int 0))]))]
+{
+ /* Generate 82-bit versions of the input and output operands. */
+ operands[7] = gen_rtx_REG (XFmode, REGNO (operands[0]));
+ operands[8] = gen_rtx_REG (XFmode, REGNO (operands[1]));
+ /* Generate required floating-point constants. */
+ operands[9] = CONST0_RTX (XFmode);
+}
+ [(set_attr "predicable" "no")])
\f
;; ::::::::::::::::::::
;; ::
operands[10] = CONST1_RTX (XFmode);
}
[(set_attr "predicable" "no")])
+
+;; Inline square root.
+
+(define_expand "sqrtdf2"
+ [(set (match_operand:DF 0 "fr_register_operand" "=&f")
+ (sqrt:DF (match_operand:DF 1 "fr_register_operand" "f")))]
+ "TARGET_INLINE_SQRT"
+{
+ rtx insn;
+ if (TARGET_INLINE_SQRT_LAT)
+#if 0
+ insn = gen_sqrtdf2_internal_lat (operands[0], operands[1]);
+#else
+ abort ();
+#endif
+ else
+ insn = gen_sqrtdf2_internal_thr (operands[0], operands[1]);
+ emit_insn (insn);
+ DONE;
+})
+
+;; Latency-optimized square root.
+;; FIXME: Implement.
+
+;; Throughput-optimized square root.
+
+(define_insn_and_split "sqrtdf2_internal_thr"
+ [(set (match_operand:DF 0 "fr_register_operand" "=&f")
+ (sqrt:DF (match_operand:DF 1 "fr_register_operand" "f")))
+ ;; Register r2 in optimization guide.
+ (clobber (match_scratch:DI 2 "=r"))
+ ;; Register f8 in optimization guide.
+ (clobber (match_scratch:XF 3 "=&f"))
+ ;; Register f9 in optimization guide.
+ (clobber (match_scratch:XF 4 "=&f"))
+ ;; Register f10 in optimization guide.
+ (clobber (match_scratch:XF 5 "=&f"))
+ ;; Register p6 in optimization guide.
+ (clobber (match_scratch:BI 6 "=c"))]
+ "TARGET_INLINE_SQRT_THR"
+ "#"
+ "&& reload_completed"
+ [ ;; exponent of +1/2 in r2
+ (set (match_dup 2) (const_int 65534))
+ ;; +1/2 in f10
+ (set (match_dup 5)
+ (unspec:XF [(match_dup 2)] UNSPEC_SETF_EXP))
+ ;; Step 1
+ ;; y0 = 1/sqrt(a) in f7
+ (parallel [(set (match_dup 7)
+ (div:XF (const_int 1)
+ (sqrt:XF (match_dup 8))))
+ (set (match_dup 6)
+ (unspec:BI [(match_dup 8)]
+ UNSPEC_FR_SQRT_RECIP_APPROX))
+ (use (const_int 0))])
+ ;; Step 2
+ ;; H0 = 1/2 * y0 in f8
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 3)
+ (plus:XF (mult:XF (match_dup 5) (match_dup 7))
+ (match_dup 9)))
+ (use (const_int 1))]))
+ ;; Step 3
+ ;; G0 = a * y0 in f7
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 7)
+ (plus:XF (mult:XF (match_dup 8) (match_dup 7))
+ (match_dup 9)))
+ (use (const_int 1))]))
+ ;; Step 4
+ ;; r0 = 1/2 - G0 * H0 in f9
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 4)
+ (plus:XF (neg:XF (mult:XF (match_dup 7) (match_dup 3)))
+ (match_dup 5)))
+ (use (const_int 1))]))
+ ;; Step 5
+ ;; H1 = H0 + r0 * H0 in f8
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 3)
+ (plus:XF (mult:XF (match_dup 4) (match_dup 3))
+ (match_dup 3)))
+ (use (const_int 1))]))
+ ;; Step 6
+ ;; G1 = G0 + r0 * G0 in f7
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 7)
+ (plus:XF (mult:XF (match_dup 4) (match_dup 7))
+ (match_dup 7)))
+ (use (const_int 1))]))
+ ;; Step 7
+ ;; r1 = 1/2 - G1 * H1 in f9
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 4)
+ (plus:XF (neg:XF (mult:XF (match_dup 7) (match_dup 3)))
+ (match_dup 5)))
+ (use (const_int 1))]))
+ ;; Step 8
+ ;; H2 = H1 + r1 * H1 in f8
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 3)
+ (plus:XF (mult:XF (match_dup 4) (match_dup 3))
+ (match_dup 3)))
+ (use (const_int 1))]))
+ ;; Step 9
+ ;; G2 = G1 + r1 * G1 in f7
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 7)
+ (plus:XF (mult:XF (match_dup 4) (match_dup 7))
+ (match_dup 7)))
+ (use (const_int 1))]))
+ ;; Step 10
+ ;; d2 = a - G2 * G2 in f9
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 4)
+ (plus:XF (neg:XF (mult:XF (match_dup 7) (match_dup 7)))
+ (match_dup 8)))
+ (use (const_int 1))]))
+ ;; Step 11
+ ;; G3 = G2 + d2 * H2 in f7
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 7)
+ (plus:XF (mult:XF (match_dup 4) (match_dup 3))
+ (match_dup 7)))
+ (use (const_int 1))]))
+ ;; Step 12
+ ;; d3 = a - G3 * G3 in f9
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 4)
+ (plus:XF (neg:XF (mult:XF (match_dup 7) (match_dup 7)))
+ (match_dup 8)))
+ (use (const_int 1))]))
+ ;; Step 13
+ ;; S = G3 + d3 * H2 in f7
+ (cond_exec (ne (match_dup 6) (const_int 0))
+ (parallel [(set (match_dup 0)
+ (float_truncate:DF
+ (plus:XF (mult:XF (match_dup 4) (match_dup 3))
+ (match_dup 7))))
+ (use (const_int 0))]))]
+{
+ /* Generate 82-bit versions of the input and output operands. */
+ operands[7] = gen_rtx_REG (XFmode, REGNO (operands[0]));
+ operands[8] = gen_rtx_REG (XFmode, REGNO (operands[1]));
+ /* Generate required floating-point constants. */
+ operands[9] = CONST0_RTX (XFmode);
+}
+ [(set_attr "predicable" "no")])
\f
;; ::::::::::::::::::::
;; ::
"fma.s%4 %0 = %F1, %F2, %F3"
[(set_attr "itanium_class" "fmac")])
+(define_insn "*maddxf4_alts_truncsf"
+ [(set (match_operand:SF 0 "fr_register_operand" "=f")
+ (float_truncate:SF
+ (plus:XF (mult:XF (match_operand:XF 1 "xfreg_or_fp01_operand" "fG")
+ (match_operand:XF 2 "xfreg_or_fp01_operand" "fG"))
+ (match_operand:XF 3 "xfreg_or_fp01_operand" "fG"))))
+ (use (match_operand:SI 4 "const_int_operand" ""))]
+ ""
+ "fma.s.s%4 %0 = %F1, %F2, %F3"
+ [(set_attr "itanium_class" "fmac")])
+
(define_insn "*maddxf4_alts_truncdf"
[(set (match_operand:DF 0 "fr_register_operand" "=f")
(float_truncate:DF
"operands[6] = CONST1_RTX (XFmode);"
[(set_attr "predicable" "no")])
+
+;; Inline square root.
+
+(define_expand "sqrtxf2"
+ [(set (match_operand:XF 0 "fr_register_operand" "=&f")
+ (sqrt:XF (match_operand:XF 1 "fr_register_operand" "f")))]
+ "TARGET_INLINE_SQRT"
+{
+ rtx insn;
+ if (TARGET_INLINE_SQRT_LAT)
+#if 0
+ insn = gen_sqrtxf2_internal_lat (operands[0], operands[1]);
+#else
+ abort ();
+#endif
+ else
+ insn = gen_sqrtxf2_internal_thr (operands[0], operands[1]);
+ emit_insn (insn);
+ DONE;
+})
+
+;; Latency-optimized square root.
+;; FIXME: Implement.
+
+;; Throughput-optimized square root.
+
+(define_insn_and_split "sqrtxf2_internal_thr"
+ [(set (match_operand:XF 0 "fr_register_operand" "=&f")
+ (sqrt:XF (match_operand:XF 1 "fr_register_operand" "f")))
+ ;; Register r2 in optimization guide.
+ (clobber (match_scratch:DI 2 "=r"))
+ ;; Register f8 in optimization guide.
+ (clobber (match_scratch:XF 3 "=&f"))
+ ;; Register f9 in optimization guide.
+ (clobber (match_scratch:XF 4 "=&f"))
+ ;; Register f10 in optimization guide.
+ (clobber (match_scratch:XF 5 "=&f"))
+ ;; Register f11 in optimization guide.
+ (clobber (match_scratch:XF 6 "=&f"))
+ ;; Register p6 in optimization guide.
+ (clobber (match_scratch:BI 7 "=c"))]
+ "TARGET_INLINE_SQRT_THR"
+ "#"
+ "&& reload_completed"
+ [ ;; exponent of +1/2 in r2
+ (set (match_dup 2) (const_int 65534))
+ ;; +1/2 in f8. The Intel manual mistakenly specifies f10.
+ (set (match_dup 3)
+ (unspec:XF [(match_dup 2)] UNSPEC_SETF_EXP))
+ ;; Step 1
+ ;; y0 = 1/sqrt(a) in f7
+ (parallel [(set (match_dup 8)
+ (div:XF (const_int 1)
+ (sqrt:XF (match_dup 9))))
+ (set (match_dup 7)
+ (unspec:BI [(match_dup 9)]
+ UNSPEC_FR_SQRT_RECIP_APPROX))
+ (use (const_int 0))])
+ ;; Step 2
+ ;; H0 = 1/2 * y0 in f9
+ (cond_exec (ne (match_dup 7) (const_int 0))
+ (parallel [(set (match_dup 4)
+ (plus:XF (mult:XF (match_dup 3) (match_dup 8))
+ (match_dup 10)))
+ (use (const_int 1))]))
+ ;; Step 3
+ ;; S0 = a * y0 in f7
+ (cond_exec (ne (match_dup 7) (const_int 0))
+ (parallel [(set (match_dup 8)
+ (plus:XF (mult:XF (match_dup 9) (match_dup 8))
+ (match_dup 10)))
+ (use (const_int 1))]))
+ ;; Step 4
+ ;; d0 = 1/2 - S0 * H0 in f10
+ (cond_exec (ne (match_dup 7) (const_int 0))
+ (parallel [(set (match_dup 5)
+ (plus:XF (neg:XF (mult:XF (match_dup 8) (match_dup 4)))
+ (match_dup 3)))
+ (use (const_int 1))]))
+ ;; Step 5
+ ;; H1 = H0 + d0 * H0 in f9
+ (cond_exec (ne (match_dup 7) (const_int 0))
+ (parallel [(set (match_dup 4)
+ (plus:XF (mult:XF (match_dup 5) (match_dup 4))
+ (match_dup 4)))
+ (use (const_int 1))]))
+ ;; Step 6
+ ;; S1 = S0 + d0 * S0 in f7
+ (cond_exec (ne (match_dup 7) (const_int 0))
+ (parallel [(set (match_dup 8)
+ (plus:XF (mult:XF (match_dup 5) (match_dup 8))
+ (match_dup 8)))
+ (use (const_int 1))]))
+ ;; Step 7
+ ;; d1 = 1/2 - S1 * H1 in f10
+ (cond_exec (ne (match_dup 7) (const_int 0))
+ (parallel [(set (match_dup 5)
+ (plus:XF (neg:XF (mult:XF (match_dup 8) (match_dup 4)))
+ (match_dup 3)))
+ (use (const_int 1))]))
+ ;; Step 8
+ ;; H2 = H1 + d1 * H1 in f9
+ (cond_exec (ne (match_dup 7) (const_int 0))
+ (parallel [(set (match_dup 4)
+ (plus:XF (mult:XF (match_dup 5) (match_dup 4))
+ (match_dup 4)))
+ (use (const_int 1))]))
+ ;; Step 9
+ ;; S2 = S1 + d1 * S1 in f7
+ (cond_exec (ne (match_dup 7) (const_int 0))
+ (parallel [(set (match_dup 8)
+ (plus:XF (mult:XF (match_dup 5) (match_dup 8))
+ (match_dup 8)))
+ (use (const_int 1))]))
+ ;; Step 10
+ ;; d2 = 1/2 - S2 * H2 in f10
+ (cond_exec (ne (match_dup 7) (const_int 0))
+ (parallel [(set (match_dup 5)
+ (plus:XF (neg:XF (mult:XF (match_dup 8) (match_dup 4)))
+ (match_dup 3)))
+ (use (const_int 1))]))
+ ;; Step 11
+ ;; e2 = a - S2 * S2 in f8
+ (cond_exec (ne (match_dup 7) (const_int 0))
+ (parallel [(set (match_dup 3)
+ (plus:XF (neg:XF (mult:XF (match_dup 8) (match_dup 8)))
+ (match_dup 9)))
+ (use (const_int 1))]))
+ ;; Step 12
+ ;; S3 = S2 + e2 * H2 in f7
+ (cond_exec (ne (match_dup 7) (const_int 0))
+ (parallel [(set (match_dup 8)
+ (plus:XF (mult:XF (match_dup 3) (match_dup 4))
+ (match_dup 8)))
+ (use (const_int 1))]))
+ ;; Step 13
+ ;; H3 = H2 + d2 * H2 in f9
+ (cond_exec (ne (match_dup 7) (const_int 0))
+ (parallel [(set (match_dup 4)
+ (plus:XF (mult:XF (match_dup 5) (match_dup 4))
+ (match_dup 4)))
+ (use (const_int 1))]))
+ ;; Step 14
+ ;; e3 = a - S3 * S3 in f8
+ (cond_exec (ne (match_dup 7) (const_int 0))
+ (parallel [(set (match_dup 3)
+ (plus:XF (neg:XF (mult:XF (match_dup 8) (match_dup 8)))
+ (match_dup 9)))
+ (use (const_int 1))]))
+ ;; Step 15
+ ;; S = S3 + e3 * H3 in f7
+ (cond_exec (ne (match_dup 7) (const_int 0))
+ (parallel [(set (match_dup 0)
+ (plus:XF (mult:XF (match_dup 3) (match_dup 4))
+ (match_dup 8)))
+ (use (const_int 0))]))]
+{
+ /* Generate 82-bit versions of the input and output operands. */
+ operands[8] = gen_rtx_REG (XFmode, REGNO (operands[0]));
+ operands[9] = gen_rtx_REG (XFmode, REGNO (operands[1]));
+ /* Generate required floating-point constants. */
+ operands[10] = CONST0_RTX (XFmode);
+}
+ [(set_attr "predicable" "no")])
+
;; ??? frcpa works like cmp.foo.unc.
(define_insn "*recip_approx"