+ else {
+ depth_mode = LATE_DEPTH_TEST | LATE_DEPTH_WRITE;
+ }
+
+ if (!(key->depth.enabled && key->depth.writemask) &&
+ !(key->stencil[0].enabled && key->stencil[0].writemask))
+ depth_mode &= ~(LATE_DEPTH_WRITE | EARLY_DEPTH_WRITE);
+ }
+ else {
+ depth_mode = 0;
+ }
+
+
+ stencil_refs[0] = lp_jit_context_stencil_ref_front_value(gallivm, context_ptr);
+ stencil_refs[1] = lp_jit_context_stencil_ref_back_value(gallivm, context_ptr);
+
+ vec_type = lp_build_vec_type(gallivm, type);
+
+ consts_ptr = lp_jit_context_constants(gallivm, context_ptr);
+
+ lp_build_for_loop_begin(&loop_state, gallivm,
+ lp_build_const_int32(gallivm, 0),
+ LLVMIntULT,
+ num_loop,
+ lp_build_const_int32(gallivm, 1));
+
+ mask_ptr = LLVMBuildGEP(builder, mask_store,
+ &loop_state.counter, 1, "mask_ptr");
+ mask_val = LLVMBuildLoad(builder, mask_ptr, "");
+
+ depth_offset = LLVMBuildMul(builder, loop_state.counter,
+ lp_build_const_int32(gallivm, depth_bits * type.length),
+ "");
+
+ depth_ptr_i = LLVMBuildGEP(builder, depth_ptr, &depth_offset, 1, "");
+
+ memset(outputs, 0, sizeof outputs);
+
+ for(cbuf = 0; cbuf < key->nr_cbufs; cbuf++) {
+ for(chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
+ out_color[cbuf][chan] = lp_build_array_alloca(gallivm,
+ lp_build_vec_type(gallivm,
+ type),
+ num_loop, "color");
+ }
+ }
+
+
+
+ /* 'mask' will control execution based on quad's pixel alive/killed state */
+ lp_build_mask_begin(&mask, gallivm, type, mask_val);
+
+ if (!(depth_mode & EARLY_DEPTH_TEST) && !simple_shader)
+ lp_build_mask_check(&mask);
+
+ lp_build_interp_soa_update_pos_dyn(interp, gallivm, loop_state.counter);
+ z = interp->pos[2];
+
+ if (depth_mode & EARLY_DEPTH_TEST) {
+ lp_build_depth_stencil_test(gallivm,
+ &key->depth,
+ key->stencil,
+ type,
+ zs_format_desc,
+ &mask,
+ stencil_refs,
+ z,
+ depth_ptr_i, facing,
+ &zs_value,
+ !simple_shader);
+
+ if (depth_mode & EARLY_DEPTH_WRITE) {
+ lp_build_depth_write(builder, zs_format_desc, depth_ptr_i, zs_value);
+ }
+ }
+
+ lp_build_interp_soa_update_inputs_dyn(interp, gallivm, loop_state.counter);
+
+ /* Build the actual shader */
+ lp_build_tgsi_soa(gallivm, tokens, type, &mask,
+ consts_ptr, &system_values,
+ interp->pos, interp->inputs,
+ outputs, sampler, &shader->info.base);
+
+ /* Alpha test */
+ if (key->alpha.enabled) {
+ int color0 = find_output_by_semantic(&shader->info.base,
+ TGSI_SEMANTIC_COLOR,
+ 0);
+
+ if (color0 != -1 && outputs[color0][3]) {
+ const struct util_format_description *cbuf_format_desc;
+ LLVMValueRef alpha = LLVMBuildLoad(builder, outputs[color0][3], "alpha");
+ LLVMValueRef alpha_ref_value;
+
+ alpha_ref_value = lp_jit_context_alpha_ref_value(gallivm, context_ptr);
+ alpha_ref_value = lp_build_broadcast(gallivm, vec_type, alpha_ref_value);
+
+ cbuf_format_desc = util_format_description(key->cbuf_format[0]);
+
+ lp_build_alpha_test(gallivm, key->alpha.func, type, cbuf_format_desc,
+ &mask, alpha, alpha_ref_value,
+ (depth_mode & LATE_DEPTH_TEST) != 0);
+ }
+ }
+
+ /* Late Z test */
+ if (depth_mode & LATE_DEPTH_TEST) {
+ int pos0 = find_output_by_semantic(&shader->info.base,
+ TGSI_SEMANTIC_POSITION,
+ 0);
+
+ if (pos0 != -1 && outputs[pos0][2]) {
+ z = LLVMBuildLoad(builder, outputs[pos0][2], "output.z");
+ }
+
+ lp_build_depth_stencil_test(gallivm,
+ &key->depth,
+ key->stencil,
+ type,
+ zs_format_desc,
+ &mask,
+ stencil_refs,
+ z,
+ depth_ptr_i, facing,
+ &zs_value,
+ !simple_shader);
+ /* Late Z write */
+ if (depth_mode & LATE_DEPTH_WRITE) {
+ lp_build_depth_write(builder, zs_format_desc, depth_ptr_i, zs_value);
+ }
+ }
+ else if ((depth_mode & EARLY_DEPTH_TEST) &&
+ (depth_mode & LATE_DEPTH_WRITE))
+ {
+ /* Need to apply a reduced mask to the depth write. Reload the
+ * depth value, update from zs_value with the new mask value and
+ * write that out.
+ */
+ lp_build_deferred_depth_write(gallivm,
+ type,
+ zs_format_desc,
+ &mask,
+ depth_ptr_i,
+ zs_value);
+ }
+
+
+ /* Color write */
+ for (attrib = 0; attrib < shader->info.base.num_outputs; ++attrib)
+ {
+ if (shader->info.base.output_semantic_name[attrib] == TGSI_SEMANTIC_COLOR &&
+ shader->info.base.output_semantic_index[attrib] < key->nr_cbufs)
+ {
+ unsigned cbuf = shader->info.base.output_semantic_index[attrib];
+ for(chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
+ if(outputs[attrib][chan]) {
+ /* XXX: just initialize outputs to point at colors[] and
+ * skip this.
+ */
+ LLVMValueRef out = LLVMBuildLoad(builder, outputs[attrib][chan], "");
+ LLVMValueRef color_ptr;
+ color_ptr = LLVMBuildGEP(builder, out_color[cbuf][chan],
+ &loop_state.counter, 1, "");
+ lp_build_name(out, "color%u.%c", attrib, "rgba"[chan]);
+ LLVMBuildStore(builder, out, color_ptr);
+ }
+ }
+ }
+ }
+
+ if (key->occlusion_count) {
+ lp_build_name(counter, "counter");
+ lp_build_occlusion_count(gallivm, type,
+ lp_build_mask_value(&mask), counter);
+ }
+
+ mask_val = lp_build_mask_end(&mask);
+ LLVMBuildStore(builder, mask_val, mask_ptr);
+ lp_build_for_loop_end(&loop_state);
+}
+
+
+/**
+ * This function will reorder pixels from the fragment shader SoA to memory layout AoS
+ *
+ * Fragment Shader outputs pixels in small 2x2 blocks
+ * e.g. (0, 0), (1, 0), (0, 1), (1, 1) ; (2, 0) ...
+ *
+ * However in memory pixels are stored in rows
+ * e.g. (0, 0), (1, 0), (2, 0), (3, 0) ; (0, 1) ...
+ *
+ * @param type fragment shader type (4x or 8x float)
+ * @param num_fs number of fs_src
+ * @param dst_channels number of output channels
+ * @param fs_src output from fragment shader
+ * @param dst pointer to store result
+ * @param pad_inline is channel padding inline or at end of row
+ * @return the number of dsts
+ */
+static int
+generate_fs_twiddle(struct gallivm_state *gallivm,
+ struct lp_type type,
+ unsigned num_fs,
+ unsigned dst_channels,
+ LLVMValueRef fs_src[][4],
+ LLVMValueRef* dst,
+ bool pad_inline)
+{
+ LLVMValueRef src[16];
+
+ bool swizzle_pad;
+ bool twiddle;
+ bool split;
+
+ unsigned pixels = num_fs == 4 ? 1 : 2;
+ unsigned reorder_group;
+ unsigned src_channels;
+ unsigned src_count;
+ unsigned i;
+
+ src_channels = dst_channels < 3 ? dst_channels : 4;
+ src_count = num_fs * src_channels;
+
+ assert(pixels == 2 || num_fs == 4);
+ assert(num_fs * src_channels <= Elements(src));
+
+ /*
+ * Transpose from SoA -> AoS
+ */
+ for (i = 0; i < num_fs; ++i) {
+ lp_build_transpose_aos_n(gallivm, type, &fs_src[i][0], src_channels, &src[i * src_channels]);
+ }
+
+ /*
+ * Pick transformation options
+ */
+ swizzle_pad = false;
+ twiddle = false;
+ split = false;
+ reorder_group = 0;
+
+ if (dst_channels == 1) {
+ twiddle = true;
+
+ if (pixels == 2) {
+ split = true;
+ }
+ } else if (dst_channels == 2) {
+ if (pixels == 1) {
+ reorder_group = 1;
+ }
+ } else if (dst_channels > 2) {
+ if (pixels == 1) {
+ reorder_group = 2;
+ } else {
+ twiddle = true;
+ }
+
+ if (!pad_inline && dst_channels == 3 && pixels > 1) {
+ swizzle_pad = true;
+ }
+ }
+
+ /*
+ * Split the src in half
+ */
+ if (split) {
+ for (i = num_fs; i > 0; --i) {
+ src[(i - 1)*2 + 1] = lp_build_extract_range(gallivm, src[i - 1], 4, 4);
+ src[(i - 1)*2 + 0] = lp_build_extract_range(gallivm, src[i - 1], 0, 4);
+ }
+
+ src_count *= 2;
+ type.length = 4;
+ }
+
+ /*
+ * Ensure pixels are in memory order
+ */
+ if (reorder_group) {
+ /* Twiddle pixels by reordering the array, e.g.:
+ *
+ * src_count = 8 -> 0 2 1 3 4 6 5 7
+ * src_count = 16 -> 0 1 4 5 2 3 6 7 8 9 12 13 10 11 14 15
+ */
+ const unsigned reorder_sw[] = { 0, 2, 1, 3 };
+
+ for (i = 0; i < src_count; ++i) {
+ unsigned group = i / reorder_group;
+ unsigned block = (group / 4) * 4 * reorder_group;
+ unsigned j = block + (reorder_sw[group % 4] * reorder_group) + (i % reorder_group);
+ dst[i] = src[j];
+ }
+ } else if (twiddle) {
+ /* Twiddle pixels across elements of array */
+ lp_bld_quad_twiddle(gallivm, type, src, src_count, dst);
+ } else {
+ /* Do nothing */
+ memcpy(dst, src, sizeof(LLVMValueRef) * src_count);
+ }
+
+ /*
+ * Moves any padding between pixels to the end
+ * e.g. RGBXRGBX -> RGBRGBXX
+ */
+ if (swizzle_pad) {
+ unsigned char swizzles[16];
+ unsigned elems = pixels * dst_channels;
+
+ for (i = 0; i < type.length; ++i) {
+ if (i < elems)
+ swizzles[i] = i % dst_channels + (i / dst_channels) * 4;
+ else
+ swizzles[i] = LP_BLD_SWIZZLE_DONTCARE;
+ }
+
+ for (i = 0; i < src_count; ++i) {
+ dst[i] = lp_build_swizzle_aos_n(gallivm, dst[i], swizzles, type.length, type.length);
+ }
+ }
+
+ return src_count;
+}
+
+
+/**
+ * Load an unswizzled block of pixels from memory
+ */
+static void
+load_unswizzled_block(struct gallivm_state *gallivm,
+ LLVMValueRef base_ptr,
+ LLVMValueRef stride,
+ unsigned block_width,
+ unsigned block_height,
+ LLVMValueRef* dst,
+ struct lp_type dst_type,
+ unsigned dst_count,
+ unsigned dst_alignment)
+{
+ LLVMBuilderRef builder = gallivm->builder;
+ unsigned row_size = dst_count / block_height;
+ unsigned i;
+
+ /* Ensure block exactly fits into dst */
+ assert((block_width * block_height) % dst_count == 0);
+
+ for (i = 0; i < dst_count; ++i) {
+ unsigned x = i % row_size;
+ unsigned y = i / row_size;
+
+ LLVMValueRef bx = lp_build_const_int32(gallivm, x * (dst_type.width / 8) * dst_type.length);
+ LLVMValueRef by = LLVMBuildMul(builder, lp_build_const_int32(gallivm, y), stride, "");
+
+ LLVMValueRef gep[2];
+ LLVMValueRef dst_ptr;
+
+ gep[0] = lp_build_const_int32(gallivm, 0);
+ gep[1] = LLVMBuildAdd(builder, bx, by, "");
+
+ dst_ptr = LLVMBuildGEP(builder, base_ptr, gep, 2, "");
+ dst_ptr = LLVMBuildBitCast(builder, dst_ptr, LLVMPointerType(lp_build_vec_type(gallivm, dst_type), 0), "");
+
+ dst[i] = LLVMBuildLoad(builder, dst_ptr, "");
+
+ lp_set_load_alignment(dst[i], dst_alignment);
+ }
+}
+
+
+/**
+ * Store an unswizzled block of pixels to memory
+ */
+static void
+store_unswizzled_block(struct gallivm_state *gallivm,
+ LLVMValueRef base_ptr,
+ LLVMValueRef stride,
+ unsigned block_width,
+ unsigned block_height,
+ LLVMValueRef* src,
+ struct lp_type src_type,
+ unsigned src_count,
+ unsigned src_alignment)
+{
+ LLVMBuilderRef builder = gallivm->builder;
+ unsigned row_size = src_count / block_height;
+ unsigned i;
+
+ /* Ensure src exactly fits into block */
+ assert((block_width * block_height) % src_count == 0);
+
+ for (i = 0; i < src_count; ++i) {
+ unsigned x = i % row_size;
+ unsigned y = i / row_size;
+
+ LLVMValueRef bx = lp_build_const_int32(gallivm, x * (src_type.width / 8) * src_type.length);
+ LLVMValueRef by = LLVMBuildMul(builder, lp_build_const_int32(gallivm, y), stride, "");
+
+ LLVMValueRef gep[2];
+ LLVMValueRef src_ptr;
+
+ gep[0] = lp_build_const_int32(gallivm, 0);
+ gep[1] = LLVMBuildAdd(builder, bx, by, "");
+
+ src_ptr = LLVMBuildGEP(builder, base_ptr, gep, 2, "");
+ src_ptr = LLVMBuildBitCast(builder, src_ptr, LLVMPointerType(lp_build_vec_type(gallivm, src_type), 0), "");
+
+ src_ptr = LLVMBuildStore(builder, src[i], src_ptr);
+
+ lp_set_store_alignment(src_ptr, src_alignment);
+ }
+}
+
+
+/**
+ * Checks if a format description is an arithmetic format
+ *
+ * A format which has irregular channel sizes such as R3_G3_B2 or R5_G6_B5.
+ */
+static INLINE boolean
+is_arithmetic_format(const struct util_format_description *format_desc)
+{
+ boolean arith = false;
+ unsigned i;
+
+ for (i = 0; i < format_desc->nr_channels; ++i) {
+ arith |= format_desc->channel[i].size != format_desc->channel[0].size;
+ arith |= (format_desc->channel[i].size % 8) != 0;
+ }
+
+ return arith;
+}
+
+
+/**
+ * Retrieves the type representing the memory layout for a format
+ *
+ * e.g. RGBA16F = 4x half-float and R3G3B2 = 1x byte
+ */
+static INLINE void
+lp_mem_type_from_format_desc(const struct util_format_description *format_desc,
+ struct lp_type* type)
+{
+ int i;
+
+ memset(type, 0, sizeof(struct lp_type));
+ type->floating = format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT;
+ type->fixed = format_desc->channel[0].type == UTIL_FORMAT_TYPE_FIXED;
+ type->sign = format_desc->channel[0].type != UTIL_FORMAT_TYPE_UNSIGNED;
+ type->norm = format_desc->channel[0].normalized;
+
+ if (is_arithmetic_format(format_desc)) {
+ type->width = 0;
+ type->length = 1;
+
+ for (i = 0; i < format_desc->nr_channels; ++i) {
+ type->width += format_desc->channel[i].size;
+ }
+ } else {
+ type->width = format_desc->channel[0].size;
+ type->length = format_desc->nr_channels;
+ }
+}
+
+
+/**
+ * Retrieves the type for a format which is usable in the blending code.
+ *
+ * e.g. RGBA16F = 4x float, R3G3B2 = 3x byte
+ */
+static INLINE void
+lp_blend_type_from_format_desc(const struct util_format_description *format_desc,
+ struct lp_type* type)
+{
+ int i;
+
+ memset(type, 0, sizeof(struct lp_type));
+ type->floating = format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT;
+ type->fixed = format_desc->channel[0].type == UTIL_FORMAT_TYPE_FIXED;
+ type->sign = format_desc->channel[0].type != UTIL_FORMAT_TYPE_UNSIGNED;
+ type->norm = format_desc->channel[0].normalized;
+ type->width = format_desc->channel[0].size;
+ type->length = format_desc->nr_channels;
+
+ for (i = 1; i < format_desc->nr_channels; ++i) {
+ if (format_desc->channel[i].size > type->width)
+ type->width = format_desc->channel[i].size;
+ }
+
+ if (type->floating) {
+ type->width = 32;
+ } else {
+ if (type->width <= 8) {
+ type->width = 8;
+ } else if (type->width <= 16) {
+ type->width = 16;
+ } else {
+ type->width = 32;
+ }
+ }
+
+ if (is_arithmetic_format(format_desc) && type->length == 3) {
+ type->length = 4;
+ }
+}
+
+
+/**
+ * Scale a normalised value from src_bits to dst_bits
+ */
+static INLINE LLVMValueRef
+scale_bits(struct gallivm_state *gallivm,
+ int src_bits,
+ int dst_bits,
+ LLVMValueRef src,
+ struct lp_type src_type)
+{
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef result = src;
+
+ if (dst_bits < src_bits) {
+ /* Scale down by LShr */
+ result = LLVMBuildLShr(builder,
+ src,
+ lp_build_const_int_vec(gallivm, src_type, src_bits - dst_bits),
+ "");
+ } else if (dst_bits > src_bits) {
+ /* Scale up bits */
+ int db = dst_bits - src_bits;
+
+ /* Shift left by difference in bits */
+ result = LLVMBuildShl(builder,
+ src,
+ lp_build_const_int_vec(gallivm, src_type, db),
+ "");
+
+ if (db < src_bits) {
+ /* Enough bits in src to fill the remainder */
+ LLVMValueRef lower = LLVMBuildLShr(builder,
+ src,
+ lp_build_const_int_vec(gallivm, src_type, src_bits - db),
+ "");
+
+ result = LLVMBuildOr(builder, result, lower, "");
+ } else if (db > src_bits) {
+ /* Need to repeatedely copy src bits to fill remainder in dst */
+ unsigned n;
+
+ for (n = src_bits; n < dst_bits; n *= 2) {
+ LLVMValueRef shuv = lp_build_const_int_vec(gallivm, src_type, n);
+
+ result = LLVMBuildOr(builder,
+ result,
+ LLVMBuildLShr(builder, result, shuv, ""),
+ "");
+ }
+ }
+ }
+
+ return result;
+}
+
+
+/**
+ * Convert from memory format to blending format
+ *
+ * e.g. GL_R3G3B2 is 1 byte in memory but 3 bytes for blending
+ */
+static void
+convert_to_blend_type(struct gallivm_state *gallivm,
+ const struct util_format_description *src_fmt,
+ struct lp_type src_type,
+ struct lp_type dst_type,
+ LLVMValueRef* src, // and dst
+ unsigned num_srcs)
+{
+ LLVMValueRef *dst = src;
+ LLVMBuilderRef builder = gallivm->builder;
+ struct lp_type blend_type;
+ struct lp_type mem_type;
+ unsigned i, j, k;
+ unsigned pixels = 16 / num_srcs;
+ bool is_arith;
+
+ lp_mem_type_from_format_desc(src_fmt, &mem_type);
+ lp_blend_type_from_format_desc(src_fmt, &blend_type);
+
+ /* Is the format arithmetic */
+ is_arith = blend_type.length * blend_type.width != mem_type.width * mem_type.length;
+ is_arith &= !(mem_type.width == 16 && mem_type.floating);
+
+ /* Pad if necessary */
+ if (!is_arith && src_type.length < dst_type.length) {
+ for (i = 0; i < num_srcs; ++i) {
+ dst[i] = lp_build_pad_vector(gallivm, src[i], dst_type.length);
+ }
+
+ src_type.length = dst_type.length;
+ }
+
+ /* Special case for half-floats */
+ if (mem_type.width == 16 && mem_type.floating) {
+ assert(blend_type.width == 32 && blend_type.floating);
+ lp_build_conv_auto(gallivm, src_type, &dst_type, dst, num_srcs, dst);
+ is_arith = false;
+ }
+
+ if (!is_arith) {
+ return;
+ }
+
+ src_type.width = blend_type.width * blend_type.length;
+ blend_type.length *= pixels;
+ src_type.length *= pixels / (src_type.length / mem_type.length);
+
+ for (i = 0; i < num_srcs; ++i) {
+ LLVMValueRef chans[4];
+ LLVMValueRef res;
+ unsigned sa = 0;
+
+ dst[i] = LLVMBuildZExt(builder, src[i], lp_build_vec_type(gallivm, src_type), "");
+
+ for (j = 0; j < src_fmt->nr_channels; ++j) {
+ unsigned mask = 0;
+
+ for (k = 0; k < src_fmt->channel[j].size; ++k) {
+ mask |= 1 << k;
+ }
+
+ /* Extract bits from source */
+ chans[j] = LLVMBuildLShr(builder,
+ dst[i],
+ lp_build_const_int_vec(gallivm, src_type, sa),
+ "");
+
+ chans[j] = LLVMBuildAnd(builder,
+ chans[j],
+ lp_build_const_int_vec(gallivm, src_type, mask),
+ "");
+
+ /* Scale bits */
+ chans[j] = scale_bits(gallivm, src_fmt->channel[j].size, blend_type.width, chans[j], src_type);
+
+ /* Insert bits into correct position */
+ chans[j] = LLVMBuildShl(builder,
+ chans[j],
+ lp_build_const_int_vec(gallivm, src_type, j * blend_type.width),
+ "");
+
+ sa += src_fmt->channel[j].size;
+
+ if (j == 0) {
+ res = chans[j];
+ } else {
+ res = LLVMBuildOr(builder, res, chans[j], "");
+ }
+ }
+
+ dst[i] = LLVMBuildBitCast(builder, res, lp_build_vec_type(gallivm, blend_type), "");
+ }
+}
+
+
+/**
+ * Convert from blending format to memory format
+ *
+ * e.g. GL_R3G3B2 is 3 bytes for blending but 1 byte in memory
+ */
+static void
+convert_from_blend_type(struct gallivm_state *gallivm,
+ const struct util_format_description *src_fmt,
+ struct lp_type src_type,
+ struct lp_type dst_type,
+ LLVMValueRef* src, // and dst
+ unsigned num_srcs)
+{
+ LLVMValueRef* dst = src;
+ unsigned i, j, k;
+ struct lp_type mem_type;
+ struct lp_type blend_type;
+ LLVMBuilderRef builder = gallivm->builder;
+ unsigned pixels = 16 / num_srcs;
+ bool is_arith;
+
+ lp_mem_type_from_format_desc(src_fmt, &mem_type);
+ lp_blend_type_from_format_desc(src_fmt, &blend_type);
+
+ is_arith = (blend_type.length * blend_type.width != mem_type.width * mem_type.length);
+
+ /* Special case for half-floats */
+ if (mem_type.width == 16 && mem_type.floating) {
+ int length = dst_type.length;
+ assert(blend_type.width == 32 && blend_type.floating);
+
+ dst_type.length = src_type.length;
+
+ lp_build_conv_auto(gallivm, src_type, &dst_type, dst, num_srcs, dst);
+
+ dst_type.length = length;
+ is_arith = false;
+ }
+
+ /* Remove any padding */
+ if (!is_arith && (src_type.length % mem_type.length)) {
+ src_type.length -= (src_type.length % mem_type.length);
+
+ for (i = 0; i < num_srcs; ++i) {
+ dst[i] = lp_build_extract_range(gallivm, dst[i], 0, src_type.length);
+ }
+ }
+
+ /* No bit arithmitic to do */
+ if (!is_arith) {
+ return;
+ }
+
+ src_type.length = pixels;
+ src_type.width = blend_type.length * blend_type.width;
+ dst_type.length = pixels;
+
+ for (i = 0; i < num_srcs; ++i) {
+ LLVMValueRef chans[4];
+ LLVMValueRef res;
+ unsigned sa = 0;
+
+ dst[i] = LLVMBuildBitCast(builder, src[i], lp_build_vec_type(gallivm, src_type), "");
+
+ for (j = 0; j < src_fmt->nr_channels; ++j) {
+ unsigned mask = 0;
+
+ assert(blend_type.width > src_fmt->channel[j].size);
+
+ for (k = 0; k < blend_type.width; ++k) {
+ mask |= 1 << k;
+ }
+
+ /* Extract bits */
+ chans[j] = LLVMBuildLShr(builder,
+ dst[i],
+ lp_build_const_int_vec(gallivm, src_type, j * blend_type.width),
+ "");
+
+ chans[j] = LLVMBuildAnd(builder,
+ chans[j],
+ lp_build_const_int_vec(gallivm, src_type, mask),
+ "");
+
+ /* Scale down bits */
+ chans[j] = scale_bits(gallivm, blend_type.width, src_fmt->channel[j].size, chans[j], src_type);
+
+ /* Insert bits */
+ chans[j] = LLVMBuildShl(builder,
+ chans[j],
+ lp_build_const_int_vec(gallivm, src_type, sa),
+ "");
+
+ sa += src_fmt->channel[j].size;
+
+ if (j == 0) {
+ res = chans[j];
+ } else {
+ res = LLVMBuildOr(builder, res, chans[j], "");
+ }
+ }
+
+ assert (dst_type.width != 24);
+
+ dst[i] = LLVMBuildTrunc(builder, res, lp_build_vec_type(gallivm, dst_type), "");
+ }
+}
+
+
+/**
+ * Generates the blend function for unswizzled colour buffers
+ * Also generates the read & write from colour buffer
+ */
+static void
+generate_unswizzled_blend(struct gallivm_state *gallivm,
+ unsigned rt,
+ struct lp_fragment_shader_variant *variant,
+ enum pipe_format out_format,
+ unsigned int num_fs,
+ struct lp_type fs_type,
+ LLVMValueRef* fs_mask,
+ LLVMValueRef fs_out_color[TGSI_NUM_CHANNELS][4],
+ LLVMValueRef context_ptr,
+ LLVMValueRef color_ptr,
+ LLVMValueRef stride,
+ unsigned partial_mask,
+ boolean do_branch)
+{
+ const unsigned alpha_channel = 3;
+ const unsigned block_width = 4;
+ const unsigned block_height = 4;
+ const unsigned block_size = block_width * block_height;
+ const unsigned lp_integer_vector_width = 128;
+
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef fs_src[4][TGSI_NUM_CHANNELS];
+ LLVMValueRef src_alpha[4 * 4];
+ LLVMValueRef src_mask[4 * 4];
+ LLVMValueRef src[4 * 4];
+ LLVMValueRef dst[4 * 4];
+ LLVMValueRef blend_color;
+ LLVMValueRef blend_alpha;
+ LLVMValueRef i32_zero;
+ LLVMValueRef check_mask;
+
+ struct lp_build_mask_context mask_ctx;
+ struct lp_type mask_type;
+ struct lp_type blend_type;
+ struct lp_type alpha_type;
+ struct lp_type row_type;
+ struct lp_type dst_type;
+
+ unsigned char swizzle[TGSI_NUM_CHANNELS];
+ unsigned vector_width;
+ unsigned dst_channels;
+ unsigned src_channels;
+ unsigned dst_count;
+ unsigned src_count;
+ unsigned i, j;
+
+ const struct util_format_description* out_format_desc = util_format_description(out_format);
+
+ unsigned dst_alignment;
+
+ bool pad_inline = is_arithmetic_format(out_format_desc);
+ bool has_alpha = false;
+
+ src_channels = TGSI_NUM_CHANNELS;
+ mask_type = lp_int32_vec4_type();
+ mask_type.length = fs_type.length;
+
+ /* Compute the alignment of the destination pointer in bytes */
+#if 0
+ dst_alignment = (block_width * out_format_desc->block.bits + 7)/(out_format_desc->block.width * 8);
+#else
+ /* FIXME -- currently we're fetching pixels one by one, instead of row by row */
+ dst_alignment = (1 * out_format_desc->block.bits + 7)/(out_format_desc->block.width * 8);
+#endif
+ /* Force power-of-two alignment by extracting only the least-significant-bit */
+ dst_alignment = 1 << (ffs(dst_alignment) - 1);
+ /* Resource base and stride pointers are aligned to 16 bytes, so that's the maximum alignment we can guarantee */
+ dst_alignment = MIN2(dst_alignment, 16);
+
+ /* Do not bother executing code when mask is empty.. */
+ if (do_branch) {
+ check_mask = LLVMConstNull(lp_build_int_vec_type(gallivm, mask_type));
+
+ for (i = 0; i < num_fs; ++i) {
+ check_mask = LLVMBuildOr(builder, check_mask, fs_mask[i], "");
+ }
+
+ lp_build_mask_begin(&mask_ctx, gallivm, mask_type, check_mask);
+ lp_build_mask_check(&mask_ctx);
+ }
+
+ partial_mask |= !variant->opaque;
+ i32_zero = lp_build_const_int32(gallivm, 0);
+
+ /* Get type from output format */
+ lp_blend_type_from_format_desc(out_format_desc, &row_type);
+ lp_mem_type_from_format_desc(out_format_desc, &dst_type);
+
+ row_type.length = fs_type.length;
+ vector_width = dst_type.floating ? lp_native_vector_width : lp_integer_vector_width;
+
+ /* Compute correct swizzle and count channels */
+ memset(swizzle, 0xFF, TGSI_NUM_CHANNELS);
+ dst_channels = 0;
+
+ for (i = 0; i < TGSI_NUM_CHANNELS; ++i) {
+ /* Ensure channel is used */
+ if (out_format_desc->swizzle[i] >= TGSI_NUM_CHANNELS) {
+ continue;
+ }
+
+ /* Ensure not already written to (happens in case with GL_ALPHA) */
+ if (swizzle[out_format_desc->swizzle[i]] < TGSI_NUM_CHANNELS) {
+ continue;
+ }
+
+ /* Ensure we haven't already found all channels */
+ if (dst_channels >= out_format_desc->nr_channels) {
+ continue;
+ }
+
+ swizzle[out_format_desc->swizzle[i]] = i;
+ ++dst_channels;
+
+ if (i == alpha_channel) {
+ has_alpha = true;
+ }
+ }
+
+ /* If 3 channels then pad to include alpha for 4 element transpose */
+ if (dst_channels == 3 && !has_alpha) {
+ swizzle[3] = 3;
+
+ if (out_format_desc->nr_channels == 4) {
+ dst_channels = 4;
+ }
+ }
+
+ /*
+ * Load shader output
+ */
+ for (i = 0; i < num_fs; ++i) {
+ /* Always load alpha for use in blending */
+ LLVMValueRef alpha = LLVMBuildLoad(builder, fs_out_color[alpha_channel][i], "");
+
+ /* Load each channel */
+ for (j = 0; j < dst_channels; ++j) {
+ fs_src[i][j] = LLVMBuildLoad(builder, fs_out_color[swizzle[j]][i], "");
+ }
+
+ /* If 3 channels then pad to include alpha for 4 element transpose */
+ if (dst_channels == 3 && !has_alpha) {
+ fs_src[i][3] = alpha;
+ swizzle[3] = 3;