#include "lp_bld_debug.h"
#include "lp_bld_arit.h"
+#include "float.h"
#define EXP_POLY_DEGREE 5
*
* x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
*
- * See also:
- * - http://softwarecommunity.intel.com/articles/eng/1818.htm
+ * See also Intel 64 and IA-32 Architectures Optimization Manual.
*/
static INLINE LLVMValueRef
lp_build_rsqrt_refine(struct lp_build_context *bld,
/**
- * Generate 1/sqrt(a)
+ * Generate 1/sqrt(a).
+ * Result is undefined for values < 0, infinity for +0.
*/
LLVMValueRef
lp_build_rsqrt(struct lp_build_context *bld,
assert(type.floating);
- if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
- (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
+ /*
+ * This should be faster but all denormals will end up as infinity.
+ */
+ if (0 && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
+ (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))) {
const unsigned num_iterations = 1;
LLVMValueRef res;
unsigned i;
else {
intrinsic = "llvm.x86.avx.rsqrt.ps.256";
}
+ if (num_iterations) {
+ /*
+ * Newton-Raphson will result in NaN instead of infinity for zero,
+ * and NaN instead of zero for infinity.
+ * Also, need to ensure rsqrt(1.0) == 1.0.
+ * All numbers smaller than FLT_MIN will result in +infinity
+ * (rsqrtps treats all denormals as zero).
+ */
+ /*
+ * Certain non-c99 compilers don't know INFINITY and might not support
+ * hacks to evaluate it at compile time neither.
+ */
+ const unsigned posinf_int = 0x7F800000;
+ LLVMValueRef cmp;
+ LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
+ LLVMValueRef inf = lp_build_const_int_vec(bld->gallivm, type, posinf_int);
- res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
+ inf = LLVMBuildBitCast(builder, inf, lp_build_vec_type(bld->gallivm, type), "");
+ res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
+
+ for (i = 0; i < num_iterations; ++i) {
+ res = lp_build_rsqrt_refine(bld, a, res);
+ }
+ cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
+ res = lp_build_select(bld, cmp, inf, res);
+ cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
+ res = lp_build_select(bld, cmp, bld->zero, res);
+ cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
+ res = lp_build_select(bld, cmp, bld->one, res);
+ }
+ else {
+ /* rsqrt(1.0) != 1.0 here */
+ res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
- for (i = 0; i < num_iterations; ++i) {
- res = lp_build_rsqrt_refine(bld, a, res);
}
return res;
};
+static float rcpf(float x)
+{
+ return 1.0/x;
+}
+
+
+const float rcp_values[] = {
+ -0.0, 0.0,
+ -1.0, 1.0,
+ -1e-007, 1e-007,
+ -4.0, 4.0,
+ -1e+035, -100000,
+ 100000, 1e+035,
+ 5.88e-39f, // denormal
+#if (__STDC_VERSION__ >= 199901L)
+ INFINITY, -INFINITY,
+#endif
+};
+
+
static float rsqrtf(float x)
{
- return 1.0/sqrt(x);
+ return 1.0/(float)sqrt(x);
}
const float rsqrt_values[] = {
- -1, -1e-007,
- 1e-007, 1,
- -4, -1,
- 1, 4,
- -1e+035, -100000,
+ // http://msdn.microsoft.com/en-us/library/windows/desktop/bb147346.aspx
+ 0.0, // must yield infinity
+ 1.0, // must yield 1.0
+ 1e-007, 4.0,
100000, 1e+035,
+ 5.88e-39f, // denormal
+#if (__STDC_VERSION__ >= 199901L)
+ INFINITY,
+#endif
};
{"log2", &lp_build_log2, &log2f, log2_values, Elements(log2_values), 20.0 },
{"exp", &lp_build_exp, &expf, exp2_values, Elements(exp2_values), 18.0 },
{"log", &lp_build_log, &logf, log2_values, Elements(log2_values), 20.0 },
+ {"rcp", &lp_build_rcp, &rcpf, rcp_values, Elements(rcp_values), 20.0 },
{"rsqrt", &lp_build_rsqrt, &rsqrtf, rsqrt_values, Elements(rsqrt_values), 20.0 },
{"sin", &lp_build_sin, &sinf, sincos_values, Elements(sincos_values), 20.0 },
{"cos", &lp_build_cos, &cosf, sincos_values, Elements(sincos_values), 20.0 },
double error, precision;
bool pass;
- error = fabs(out[i] - ref);
+ if (util_inf_sign(ref) && util_inf_sign(out[i]) == util_inf_sign(ref)) {
+ error = 0;
+ } else {
+ error = fabs(out[i] - ref);
+ }
precision = error ? -log2(error/fabs(ref)) : FLT_MANT_DIG;
pass = precision >= test->precision;