+ if (optimise_only)
+ return NULL; /* NOTE(review): presumably "no optimized variant for this case" -- confirm against caller */
+
+ if ((bld->type.norm && bld->type.sign) &&
+ (is_inverse_factor(factor_src) || is_inverse_factor(factor_dst))) { /* snorm + inverse factor: needs widened arithmetic */
+ /*
+ * With snorm blending, the inverse blend factors range from [0,2]
+ * instead of [-1,1], so the ordinary signed normalized arithmetic
+ * doesn't quite work. Unpack must be unsigned, and the add/sub
+ * must be done with a wider type.
+ * (Note that it's not quite obvious what the blend equation w.r.t.
+ * clamping should actually be based on the GL spec in this case, but
+ * really the incoming src values are clamped to [-1,1] (the dst is
+ * always clamped already), and then NO further clamping occurs until
+ * the end.)
+ */
+ struct lp_build_context bldw; /* build context over the doubled-width type */
+ struct lp_type wide_type = lp_wider_type(bld->type);
+ LLVMValueRef src_terml, src_termh, dst_terml, dst_termh; /* presumably low/high halves of the widened terms -- confirm below */
+ LLVMValueRef resl, resh;
+
+ /*
+ * We don't need saturate math for the sub/add, since we have
+ * x+1 bit numbers in x*2 wide type (result is x+2 bits).
+ * (Doesn't really matter on x86 sse2 though as we use saturated
+ * intrinsics.)
+ */
+ wide_type.norm = 0; /* plain (non-saturating) integer math in the wide type, per comment above */
+ lp_build_context_init(&bldw, bld->gallivm, wide_type);
+
+ /*
+ * XXX This is a bit hackish. Note that -128 really should
+ * be -1.0, the same as -127. However, we did not actually clamp
+ * things anywhere (relying on pack intrinsics instead) therefore
+ * we will get -128, and the inverted factor then 255. But the mul
+ * can overflow in this case (rather the rounding fixups for the mul,
+ * -128*255 will be positive).
+ * So we clamp the src and dst up here but only when necessary (we
+ * should do this before calculating blend factors but it's enough
+ * for avoiding overflow).
+ */
+ if (is_inverse_factor(factor_src)) {
+ src = lp_build_max(bld, src,
+ lp_build_const_vec(bld->gallivm, bld->type, -1.0)); /* clamp -128 up to -1.0 equivalent (-127) */
+ }
+ if (is_inverse_factor(factor_dst)) {
+ dst = lp_build_max(bld, dst,
+ lp_build_const_vec(bld->gallivm, bld->type, -1.0)); /* same clamp for dst */
+ }