#include "pipe/p_config.h"
#include "pipe/p_compiler.h"
#include "util/u_memory.h"
-#include "util/u_simple_list.h"
+#include "util/u_math.h"
+#include "util/u_format.h"
#include "translate.h"
-#if defined(PIPE_ARCH_X86)
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
#include "rtasm/rtasm_cpu.h"
#include "rtasm/rtasm_x86sse.h"
#define W 3
-typedef void (PIPE_CDECL *run_func)( struct translate *translate,
- unsigned start,
- unsigned count,
- void *output_buffer );
+struct translate_buffer {
+ const void *base_ptr;
+ uintptr_t stride;
+ unsigned max_index;
+};
+
+struct translate_buffer_varient {
+ unsigned buffer_index;
+ unsigned instance_divisor;
+ void *ptr; /* updated either per vertex or per instance */
+};
+
-typedef void (PIPE_CDECL *run_elts_func)( struct translate *translate,
- const unsigned *elts,
- unsigned count,
- void *output_buffer );
+#define ELEMENT_BUFFER_INSTANCE_ID 1001
+#define NUM_CONSTS 7
+
+enum
+{
+ CONST_IDENTITY,
+ CONST_INV_127,
+ CONST_INV_255,
+ CONST_INV_32767,
+ CONST_INV_65535,
+ CONST_INV_2147483647,
+ CONST_255
+};
+#define C(v) {(float)(v), (float)(v), (float)(v), (float)(v)}
+static float consts[NUM_CONSTS][4] = {
+ {0, 0, 0, 1},
+ C(1.0 / 127.0),
+ C(1.0 / 255.0),
+ C(1.0 / 32767.0),
+ C(1.0 / 65535.0),
+ C(1.0 / 2147483647.0),
+ C(255.0)
+};
+#undef C
struct translate_sse {
struct translate translate;
struct x86_function linear_func;
struct x86_function elt_func;
+ struct x86_function elt16_func;
+ struct x86_function elt8_func;
struct x86_function *func;
- boolean loaded_identity;
- boolean loaded_255;
- boolean loaded_inv_255;
+ PIPE_ALIGN_VAR(16) float consts[NUM_CONSTS][4];
+ int8_t reg_to_const[16];
+ int8_t const_to_reg[NUM_CONSTS];
+
+ struct translate_buffer buffer[PIPE_MAX_ATTRIBS];
+ unsigned nr_buffers;
- float identity[4];
- float float_255[4];
- float inv_255[4];
+ /* Multiple buffer varients can map to a single buffer. */
+ struct translate_buffer_varient buffer_varient[PIPE_MAX_ATTRIBS];
+ unsigned nr_buffer_varients;
- struct {
- char *input_ptr;
- unsigned input_stride;
- } attrib[PIPE_MAX_ATTRIBS];
+ /* Multiple elements can map to a single buffer varient. */
+ unsigned element_to_buffer_varient[PIPE_MAX_ATTRIBS];
- run_func gen_run;
- run_elts_func gen_run_elts;
+ boolean use_instancing;
+ unsigned instance_id;
+ /* these are actually known values, but putting them in a struct
+ * like this is helpful to keep them in sync across the file.
+ */
+ struct x86_reg tmp_EAX;
+ struct x86_reg tmp2_EDX;
+ struct x86_reg src_ECX;
+ struct x86_reg idx_ESI; /* either start+i or &elt[i] */
+ struct x86_reg machine_EDI;
+ struct x86_reg outbuf_EBX;
+ struct x86_reg count_EBP; /* decrements to zero */
};
static int get_offset( const void *a, const void *b )
return (const char *)b - (const char *)a;
}
-
-
-static struct x86_reg get_identity( struct translate_sse *p )
+static struct x86_reg get_const( struct translate_sse *p, unsigned id)
{
- struct x86_reg reg = x86_make_reg(file_XMM, 6);
-
- if (!p->loaded_identity) {
- /* Nasty:
- */
- struct x86_reg translateESI = x86_make_reg(file_REG32, reg_SI);
+ struct x86_reg reg;
+ unsigned i;
- p->loaded_identity = TRUE;
- p->identity[0] = 0;
- p->identity[1] = 0;
- p->identity[2] = 0;
- p->identity[3] = 1;
+ if(p->const_to_reg[id] >= 0)
+ return x86_make_reg(file_XMM, p->const_to_reg[id]);
- sse_movups(p->func, reg,
- x86_make_disp(translateESI,
- get_offset(p, &p->identity[0])));
+ for(i = 2; i < 8; ++i)
+ {
+ if(p->reg_to_const[i] < 0)
+ break;
}
- return reg;
-}
+ /* TODO: be smarter here */
+ if(i == 8)
+ --i;
-static struct x86_reg get_255( struct translate_sse *p )
-{
- struct x86_reg reg = x86_make_reg(file_XMM, 6);
+ reg = x86_make_reg(file_XMM, i);
- if (!p->loaded_255) {
- struct x86_reg translateESI = x86_make_reg(file_REG32, reg_SI);
+ if(p->reg_to_const[i] >= 0)
+ p->const_to_reg[p->reg_to_const[i]] = -1;
- p->loaded_255 = TRUE;
- p->float_255[0] =
- p->float_255[1] =
- p->float_255[2] =
- p->float_255[3] = 255.0f;
+ p->reg_to_const[i] = id;
+ p->const_to_reg[id] = i;
- sse_movups(p->func, reg,
- x86_make_disp(translateESI,
- get_offset(p, &p->float_255[0])));
- }
+ /* TODO: this should happen outside the loop, if possible */
+ sse_movaps(p->func, reg,
+ x86_make_disp(p->machine_EDI,
+ get_offset(p, &p->consts[id][0])));
return reg;
- return x86_make_reg(file_XMM, 7);
}
-static struct x86_reg get_inv_255( struct translate_sse *p )
+/* load the data in a SSE2 register, padding with zeros */
+static boolean emit_load_sse2( struct translate_sse *p,
+ struct x86_reg data,
+ struct x86_reg src,
+ unsigned size)
{
- struct x86_reg reg = x86_make_reg(file_XMM, 5);
-
- if (!p->loaded_inv_255) {
- struct x86_reg translateESI = x86_make_reg(file_REG32, reg_SI);
+ struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
+ struct x86_reg tmp = p->tmp_EAX;
+ switch(size)
+ {
+ case 1:
+ x86_movzx8(p->func, tmp, src);
+ sse2_movd(p->func, data, tmp);
+ break;
+ case 2:
+ x86_movzx16(p->func, tmp, src);
+ sse2_movd(p->func, data, tmp);
+ break;
+ case 3:
+ x86_movzx8(p->func, tmp, x86_make_disp(src, 2));
+ x86_shl_imm(p->func, tmp, 16);
+ x86_mov16(p->func, tmp, src);
+ sse2_movd(p->func, data, tmp);
+ break;
+ case 4:
+ sse2_movd(p->func, data, src);
+ break;
+ case 6:
+ sse2_movd(p->func, data, src);
+ x86_movzx16(p->func, tmp, x86_make_disp(src, 4));
+ sse2_movd(p->func, tmpXMM, tmp);
+ sse2_punpckldq(p->func, data, tmpXMM);
+ break;
+ case 8:
+ sse2_movq(p->func, data, src);
+ break;
+ case 12:
+ sse2_movq(p->func, data, src);
+ sse2_movd(p->func, tmpXMM, x86_make_disp(src, 8));
+ sse2_punpcklqdq(p->func, data, tmpXMM);
+ break;
+ case 16:
+ sse2_movdqu(p->func, data, src);
+ break;
+ default:
+ return FALSE;
+ }
+ return TRUE;
+}
- p->loaded_inv_255 = TRUE;
- p->inv_255[0] =
- p->inv_255[1] =
- p->inv_255[2] =
- p->inv_255[3] = 1.0f / 255.0f;
+/* this value can be passed for the out_chans argument */
+#define CHANNELS_0001 5
- sse_movups(p->func, reg,
- x86_make_disp(translateESI,
- get_offset(p, &p->inv_255[0])));
+/* this function will load #chans float values, and will
+ * pad the register with zeroes at least up to out_chans.
+ *
+ * If out_chans is set to CHANNELS_0001, then the fourth
+ * value will be padded with 1. Only pass this value if
+ * chans < 4 or results are undefined.
+ */
+static void emit_load_float32( struct translate_sse *p,
+ struct x86_reg data,
+ struct x86_reg arg0,
+ unsigned out_chans,
+ unsigned chans)
+{
+ switch(chans)
+ {
+ case 1:
+ /* a 0 0 0
+ * a 0 0 1
+ */
+ sse_movss(p->func, data, arg0);
+ if(out_chans == CHANNELS_0001)
+ sse_orps(p->func, data, get_const(p, CONST_IDENTITY) );
+ break;
+ case 2:
+ /* 0 0 0 1
+ * a b 0 1
+ */
+ if(out_chans == CHANNELS_0001)
+ sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) );
+ else if(out_chans > 2)
+ sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY) );
+ sse_movlps(p->func, data, arg0);
+ break;
+ case 3:
+ /* Have to jump through some hoops:
+ *
+ * c 0 0 0
+ * c 0 0 1 if out_chans == CHANNELS_0001
+ * 0 0 c 0/1
+ * a b c 0/1
+ */
+ sse_movss(p->func, data, x86_make_disp(arg0, 8));
+ if(out_chans == CHANNELS_0001)
+ sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X,Y,Z,W) );
+ sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) );
+ sse_movlps(p->func, data, arg0);
+ break;
+ case 4:
+ sse_movups(p->func, data, arg0);
+ break;
}
-
- return reg;
}
+/* this function behaves like emit_load_float32, but loads
+ 64-bit floating point numbers, converting them to 32-bit
+ ones */
+static void emit_load_float64to32( struct translate_sse *p,
+ struct x86_reg data,
+ struct x86_reg arg0,
+ unsigned out_chans,
+ unsigned chans)
+{
+ struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
+ switch(chans)
+ {
+ case 1:
+ sse2_movsd(p->func, data, arg0);
+ if(out_chans > 1)
+ sse2_cvtpd2ps(p->func, data, data);
+ else
+ sse2_cvtsd2ss(p->func, data, data);
+ if(out_chans == CHANNELS_0001)
+ sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) );
+ break;
+ case 2:
+ sse2_movupd(p->func, data, arg0);
+ sse2_cvtpd2ps(p->func, data, data);
+ if(out_chans == CHANNELS_0001)
+ sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) );
+ else if(out_chans > 2)
+ sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY) );
+ break;
+ case 3:
+ sse2_movupd(p->func, data, arg0);
+ sse2_cvtpd2ps(p->func, data, data);
+ sse2_movsd(p->func, tmpXMM, x86_make_disp(arg0, 16));
+ if(out_chans > 3)
+ sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
+ else
+ sse2_cvtsd2ss(p->func, tmpXMM, tmpXMM);
+ sse_movlhps(p->func, data, tmpXMM);
+ if(out_chans == CHANNELS_0001)
+ sse_orps(p->func, data, get_const(p, CONST_IDENTITY) );
+ break;
+ case 4:
+ sse2_movupd(p->func, data, arg0);
+ sse2_cvtpd2ps(p->func, data, data);
+ sse2_movupd(p->func, tmpXMM, x86_make_disp(arg0, 16));
+ sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
+ sse_movlhps(p->func, data, tmpXMM);
+ break;
+ }
+}
-static void emit_load_R32G32B32A32( struct translate_sse *p,
- struct x86_reg data,
- struct x86_reg arg0 )
+static void emit_mov64(struct translate_sse *p, struct x86_reg dst_gpr, struct x86_reg dst_xmm, struct x86_reg src_gpr, struct x86_reg src_xmm)
{
- sse_movups(p->func, data, arg0);
+ if(x86_target(p->func) != X86_32)
+ x64_mov64(p->func, dst_gpr, src_gpr);
+ else
+ {
+ /* TODO: when/on which CPUs is SSE2 actually better than SSE? */
+ if(x86_target_caps(p->func) & X86_SSE2)
+ sse2_movq(p->func, dst_xmm, src_xmm);
+ else
+ sse_movlps(p->func, dst_xmm, src_xmm);
+ }
}
-static void emit_load_R32G32B32( struct translate_sse *p,
- struct x86_reg data,
- struct x86_reg arg0 )
+static void emit_load64(struct translate_sse *p, struct x86_reg dst_gpr, struct x86_reg dst_xmm, struct x86_reg src)
{
- /* Have to jump through some hoops:
- *
- * c 0 0 0
- * c 0 0 1
- * 0 0 c 1
- * a b c 1
- */
- sse_movss(p->func, data, x86_make_disp(arg0, 8));
- sse_shufps(p->func, data, get_identity(p), SHUF(X,Y,Z,W) );
- sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) );
- sse_movlps(p->func, data, arg0);
+ emit_mov64(p, dst_gpr, dst_xmm, src, src);
}
-static void emit_load_R32G32( struct translate_sse *p,
- struct x86_reg data,
- struct x86_reg arg0 )
+static void emit_store64(struct translate_sse *p, struct x86_reg dst, struct x86_reg src_gpr, struct x86_reg src_xmm)
{
- /* 0 0 0 1
- * a b 0 1
- */
- sse_movups(p->func, data, get_identity(p) );
- sse_movlps(p->func, data, arg0);
+ emit_mov64(p, dst, dst, src_gpr, src_xmm);
}
+static void emit_mov128(struct translate_sse *p, struct x86_reg dst, struct x86_reg src)
+{
+ if(x86_target_caps(p->func) & X86_SSE2)
+ sse2_movdqu(p->func, dst, src);
+ else
+ sse_movups(p->func, dst, src);
+}
-static void emit_load_R32( struct translate_sse *p,
- struct x86_reg data,
- struct x86_reg arg0 )
+/* TODO: this uses unaligned accesses liberally, which is great on Nehalem,
+ * but may or may not be good on older processors
+ * TODO: may perhaps want to use non-temporal stores here if possible
+ */
+static void emit_memcpy(struct translate_sse *p, struct x86_reg dst, struct x86_reg src, unsigned size)
{
- /* a 0 0 0
- * a 0 0 1
- */
- sse_movss(p->func, data, arg0);
- sse_orps(p->func, data, get_identity(p) );
+ struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
+ struct x86_reg dataXMM2 = x86_make_reg(file_XMM, 1);
+ struct x86_reg dataGPR = p->tmp_EAX;
+ struct x86_reg dataGPR2 = p->tmp2_EDX;
+
+ if(size < 8)
+ {
+ switch (size)
+ {
+ case 1:
+ x86_mov8(p->func, dataGPR, src);
+ x86_mov8(p->func, dst, dataGPR);
+ break;
+ case 2:
+ x86_mov16(p->func, dataGPR, src);
+ x86_mov16(p->func, dst, dataGPR);
+ break;
+ case 3:
+ x86_mov16(p->func, dataGPR, src);
+ x86_mov8(p->func, dataGPR2, x86_make_disp(src, 2));
+ x86_mov16(p->func, dst, dataGPR);
+ x86_mov8(p->func, x86_make_disp(dst, 2), dataGPR2);
+ break;
+ case 4:
+ x86_mov(p->func, dataGPR, src);
+ x86_mov(p->func, dst, dataGPR);
+ break;
+ case 6:
+ x86_mov(p->func, dataGPR, src);
+ x86_mov16(p->func, dataGPR2, x86_make_disp(src, 4));
+ x86_mov(p->func, dst, dataGPR);
+ x86_mov16(p->func, x86_make_disp(dst, 4), dataGPR2);
+ break;
+ }
+ }
+ else if(!(x86_target_caps(p->func) & X86_SSE))
+ {
+ unsigned i = 0;
+ assert((size & 3) == 0);
+ for(i = 0; i < size; i += 4)
+ {
+ x86_mov(p->func, dataGPR, x86_make_disp(src, i));
+ x86_mov(p->func, x86_make_disp(dst, i), dataGPR);
+ }
+ }
+ else
+ {
+ switch(size)
+ {
+ case 8:
+ emit_load64(p, dataGPR, dataXMM, src);
+ emit_store64(p, dst, dataGPR, dataXMM);
+ break;
+ case 12:
+ emit_load64(p, dataGPR2, dataXMM, src);
+ x86_mov(p->func, dataGPR, x86_make_disp(src, 8));
+ emit_store64(p, dst, dataGPR2, dataXMM);
+ x86_mov(p->func, x86_make_disp(dst, 8), dataGPR);
+ break;
+ case 16:
+ emit_mov128(p, dataXMM, src);
+ emit_mov128(p, dst, dataXMM);
+ break;
+ case 24:
+ emit_mov128(p, dataXMM, src);
+ emit_load64(p, dataGPR, dataXMM2, x86_make_disp(src, 16));
+ emit_mov128(p, dst, dataXMM);
+ emit_store64(p, x86_make_disp(dst, 16), dataGPR, dataXMM2);
+ break;
+ case 32:
+ emit_mov128(p, dataXMM, src);
+ emit_mov128(p, dataXMM2, x86_make_disp(src, 16));
+ emit_mov128(p, dst, dataXMM);
+ emit_mov128(p, x86_make_disp(dst, 16), dataXMM2);
+ break;
+ default:
+ assert(0);
+ }
+ }
}
+static boolean translate_attr_convert( struct translate_sse *p,
+ const struct translate_element *a,
+ struct x86_reg src,
+ struct x86_reg dst)
-static void emit_load_R8G8B8A8_UNORM( struct translate_sse *p,
- struct x86_reg data,
- struct x86_reg src )
{
+ const struct util_format_description* input_desc = util_format_description(a->input_format);
+ const struct util_format_description* output_desc = util_format_description(a->output_format);
+ unsigned i;
+ boolean id_swizzle = TRUE;
+ unsigned swizzle[4] = {UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE};
+ unsigned needed_chans = 0;
+ unsigned imms[2] = {0, 0x3f800000};
- /* Load and unpack twice:
- */
- sse_movss(p->func, data, src);
- sse2_punpcklbw(p->func, data, get_identity(p));
- sse2_punpcklbw(p->func, data, get_identity(p));
+ if(a->output_format == PIPE_FORMAT_NONE || a->input_format == PIPE_FORMAT_NONE)
+ return FALSE;
- /* Convert to float:
- */
- sse2_cvtdq2ps(p->func, data, data);
+ if(input_desc->channel[0].size & 7)
+ return FALSE;
+ if(input_desc->colorspace != output_desc->colorspace)
+ return FALSE;
- /* Scale by 1/255.0
- */
- sse_mulps(p->func, data, get_inv_255(p));
-}
+ for(i = 1; i < input_desc->nr_channels; ++i)
+ {
+ if(memcmp(&input_desc->channel[i], &input_desc->channel[0], sizeof(input_desc->channel[0])))
+ return FALSE;
+ }
+ for(i = 1; i < output_desc->nr_channels; ++i)
+ {
+ if(memcmp(&output_desc->channel[i], &output_desc->channel[0], sizeof(output_desc->channel[0])))
+ return FALSE;
+ }
+ for(i = 0; i < output_desc->nr_channels; ++i)
+ {
+ if(output_desc->swizzle[i] < 4)
+ swizzle[output_desc->swizzle[i]] = input_desc->swizzle[i];
+ }
+ if((x86_target_caps(p->func) & X86_SSE) && (0
+ || a->output_format == PIPE_FORMAT_R32_FLOAT
+ || a->output_format == PIPE_FORMAT_R32G32_FLOAT
+ || a->output_format == PIPE_FORMAT_R32G32B32_FLOAT
+ || a->output_format == PIPE_FORMAT_R32G32B32A32_FLOAT))
+ {
+ struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
+
+ for(i = 0; i < output_desc->nr_channels; ++i)
+ {
+ if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0 && i >= input_desc->nr_channels)
+ swizzle[i] = i;
+ }
-static void emit_store_R32G32B32A32( struct translate_sse *p,
- struct x86_reg dest,
- struct x86_reg dataXMM )
-{
- sse_movups(p->func, dest, dataXMM);
-}
+ for(i = 0; i < output_desc->nr_channels; ++i)
+ {
+ if(swizzle[i] < 4)
+ needed_chans = MAX2(needed_chans, swizzle[i] + 1);
+ if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i)
+ id_swizzle = FALSE;
+ }
-static void emit_store_R32G32B32( struct translate_sse *p,
- struct x86_reg dest,
- struct x86_reg dataXMM )
-{
- /* Emit two, shuffle, emit one.
- */
- sse_movlps(p->func, dest, dataXMM);
- sse_shufps(p->func, dataXMM, dataXMM, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */
- sse_movss(p->func, x86_make_disp(dest,8), dataXMM);
-}
+ if(needed_chans > 0)
+ {
+ switch(input_desc->channel[0].type)
+ {
+ case UTIL_FORMAT_TYPE_UNSIGNED:
+ if(!(x86_target_caps(p->func) & X86_SSE2))
+ return FALSE;
+ emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3);
+
+ /* TODO: add support for SSE4.1 pmovzx */
+ switch(input_desc->channel[0].size)
+ {
+ case 8:
+ /* TODO: this may be inefficient due to get_identity() being used both as a float and integer register */
+ sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
+ sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
+ break;
+ case 16:
+ sse2_punpcklwd(p->func, dataXMM, get_const(p, CONST_IDENTITY));
+ break;
+ case 32: /* we lose precision here */
+ sse2_psrld_imm(p->func, dataXMM, 1);
+ break;
+ default:
+ return FALSE;
+ }
+ sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
+ if(input_desc->channel[0].normalized)
+ {
+ struct x86_reg factor;
+ switch(input_desc->channel[0].size)
+ {
+ case 8:
+ factor = get_const(p, CONST_INV_255);
+ break;
+ case 16:
+ factor = get_const(p, CONST_INV_65535);
+ break;
+ case 32:
+ factor = get_const(p, CONST_INV_2147483647);
+ break;
+ default:
+ assert(0);
+ factor.disp = 0;
+ factor.file = 0;
+ factor.idx = 0;
+ factor.mod = 0;
+ break;
+ }
+ sse_mulps(p->func, dataXMM, factor);
+ }
+ else if(input_desc->channel[0].size == 32)
+ sse_addps(p->func, dataXMM, dataXMM); /* compensate for the bit we threw away to fit u32 into s32 */
+ break;
+ case UTIL_FORMAT_TYPE_SIGNED:
+ if(!(x86_target_caps(p->func) & X86_SSE2))
+ return FALSE;
+ emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3);
+
+ /* TODO: add support for SSE4.1 pmovsx */
+ switch(input_desc->channel[0].size)
+ {
+ case 8:
+ sse2_punpcklbw(p->func, dataXMM, dataXMM);
+ sse2_punpcklbw(p->func, dataXMM, dataXMM);
+ sse2_psrad_imm(p->func, dataXMM, 24);
+ break;
+ case 16:
+ sse2_punpcklwd(p->func, dataXMM, dataXMM);
+ sse2_psrad_imm(p->func, dataXMM, 16);
+ break;
+ case 32: /* we lose precision here */
+ break;
+ default:
+ return FALSE;
+ }
+ sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
+ if(input_desc->channel[0].normalized)
+ {
+ struct x86_reg factor;
+ switch(input_desc->channel[0].size)
+ {
+ case 8:
+ factor = get_const(p, CONST_INV_127);
+ break;
+ case 16:
+ factor = get_const(p, CONST_INV_32767);
+ break;
+ case 32:
+ factor = get_const(p, CONST_INV_2147483647);
+ break;
+ default:
+ assert(0);
+ factor.disp = 0;
+ factor.file = 0;
+ factor.idx = 0;
+ factor.mod = 0;
+ break;
+ }
+ sse_mulps(p->func, dataXMM, factor);
+ }
+ break;
+
+ break;
+ case UTIL_FORMAT_TYPE_FLOAT:
+ if(input_desc->channel[0].size != 32 && input_desc->channel[0].size != 64)
+ return FALSE;
+ if(swizzle[3] == UTIL_FORMAT_SWIZZLE_1 && input_desc->nr_channels <= 3)
+ {
+ swizzle[3] = UTIL_FORMAT_SWIZZLE_W;
+ needed_chans = CHANNELS_0001;
+ }
+ switch(input_desc->channel[0].size)
+ {
+ case 32:
+ emit_load_float32(p, dataXMM, src, needed_chans, input_desc->nr_channels);
+ break;
+ case 64: /* we lose precision here */
+ if(!(x86_target_caps(p->func) & X86_SSE2))
+ return FALSE;
+ emit_load_float64to32(p, dataXMM, src, needed_chans, input_desc->nr_channels);
+ break;
+ default:
+ return FALSE;
+ }
+ break;
+ default:
+ return FALSE;
+ }
+
+ if(!id_swizzle)
+ sse_shufps(p->func, dataXMM, dataXMM, SHUF(swizzle[0], swizzle[1], swizzle[2], swizzle[3]) );
+ }
-static void emit_store_R32G32( struct translate_sse *p,
- struct x86_reg dest,
- struct x86_reg dataXMM )
-{
- sse_movlps(p->func, dest, dataXMM);
-}
+ if(output_desc->nr_channels >= 4
+ && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
+ && swizzle[1] < UTIL_FORMAT_SWIZZLE_0
+ && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
+ && swizzle[3] < UTIL_FORMAT_SWIZZLE_0
+ )
+ sse_movups(p->func, dst, dataXMM);
+ else
+ {
+ if(output_desc->nr_channels >= 2
+ && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
+ && swizzle[1] < UTIL_FORMAT_SWIZZLE_0)
+ sse_movlps(p->func, dst, dataXMM);
+ else
+ {
+ if(swizzle[0] < UTIL_FORMAT_SWIZZLE_0)
+ sse_movss(p->func, dst, dataXMM);
+ else
+ x86_mov_imm(p->func, dst, imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);
+
+ if(output_desc->nr_channels >= 2)
+ {
+ if(swizzle[1] < UTIL_FORMAT_SWIZZLE_0)
+ {
+ sse_shufps(p->func, dataXMM, dataXMM, SHUF(1, 1, 2, 3));
+ sse_movss(p->func, x86_make_disp(dst, 4), dataXMM);
+ }
+ else
+ x86_mov_imm(p->func, x86_make_disp(dst, 4), imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]);
+ }
+ }
+
+ if(output_desc->nr_channels >= 3)
+ {
+ if(output_desc->nr_channels >= 4
+ && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
+ && swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
+ sse_movhps(p->func, x86_make_disp(dst, 8), dataXMM);
+ else
+ {
+ if(swizzle[2] < UTIL_FORMAT_SWIZZLE_0)
+ {
+ sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 2, 2, 3));
+ sse_movss(p->func, x86_make_disp(dst, 8), dataXMM);
+ }
+ else
+ x86_mov_imm(p->func, x86_make_disp(dst, 8), imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]);
+
+ if(output_desc->nr_channels >= 4)
+ {
+ if(swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
+ {
+ sse_shufps(p->func, dataXMM, dataXMM, SHUF(3, 3, 3, 3));
+ sse_movss(p->func, x86_make_disp(dst, 12), dataXMM);
+ }
+ else
+ x86_mov_imm(p->func, x86_make_disp(dst, 12), imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]);
+ }
+ }
+ }
+ }
+ return TRUE;
+ }
+ else if((x86_target_caps(p->func) & X86_SSE2) && input_desc->channel[0].size == 8 && output_desc->channel[0].size == 16
+ && output_desc->channel[0].normalized == input_desc->channel[0].normalized
+ && (0
+ || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED)
+ || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
+ || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
+ ))
+ {
+ struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
+ struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
+ struct x86_reg tmp = p->tmp_EAX;
+ unsigned imms[2] = {0, 1};
+
+ for(i = 0; i < output_desc->nr_channels; ++i)
+ {
+ if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0 && i >= input_desc->nr_channels)
+ swizzle[i] = i;
+ }
-static void emit_store_R32( struct translate_sse *p,
- struct x86_reg dest,
- struct x86_reg dataXMM )
-{
- sse_movss(p->func, dest, dataXMM);
-}
+ for(i = 0; i < output_desc->nr_channels; ++i)
+ {
+ if(swizzle[i] < 4)
+ needed_chans = MAX2(needed_chans, swizzle[i] + 1);
+ if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i)
+ id_swizzle = FALSE;
+ }
+ if(needed_chans > 0)
+ {
+ emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3);
+
+ switch(input_desc->channel[0].type)
+ {
+ case UTIL_FORMAT_TYPE_UNSIGNED:
+ if(input_desc->channel[0].normalized)
+ {
+ sse2_punpcklbw(p->func, dataXMM, dataXMM);
+ if(output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
+ sse2_psrlw_imm(p->func, dataXMM, 1);
+ }
+ else
+ sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
+ break;
+ case UTIL_FORMAT_TYPE_SIGNED:
+ if(input_desc->channel[0].normalized)
+ {
+ sse2_movq(p->func, tmpXMM, get_const(p, CONST_IDENTITY));
+ sse2_punpcklbw(p->func, tmpXMM, dataXMM);
+ sse2_psllw_imm(p->func, dataXMM, 9);
+ sse2_psrlw_imm(p->func, dataXMM, 8);
+ sse2_por(p->func, tmpXMM, dataXMM);
+ sse2_psrlw_imm(p->func, dataXMM, 7);
+ sse2_por(p->func, tmpXMM, dataXMM);
+ {
+ struct x86_reg t = dataXMM;
+ dataXMM = tmpXMM;
+ tmpXMM = t;
+ }
+ }
+ else
+ {
+ sse2_punpcklbw(p->func, dataXMM, dataXMM);
+ sse2_psraw_imm(p->func, dataXMM, 8);
+ }
+ break;
+ default:
+ assert(0);
+ }
+
+ if(output_desc->channel[0].normalized)
+ imms[1] = (output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) ? 0xffff : 0x7ffff;
+
+ if(!id_swizzle)
+ sse2_pshuflw(p->func, dataXMM, dataXMM, (swizzle[0] & 3) | ((swizzle[1] & 3) << 2) | ((swizzle[2] & 3) << 4) | ((swizzle[3] & 3) << 6));
+ }
+ if(output_desc->nr_channels >= 4
+ && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
+ && swizzle[1] < UTIL_FORMAT_SWIZZLE_0
+ && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
+ && swizzle[3] < UTIL_FORMAT_SWIZZLE_0
+ )
+ sse2_movq(p->func, dst, dataXMM);
+ else
+ {
+ if(swizzle[0] < UTIL_FORMAT_SWIZZLE_0)
+ {
+ if(output_desc->nr_channels >= 2 && swizzle[1] < UTIL_FORMAT_SWIZZLE_0)
+ sse2_movd(p->func, dst, dataXMM);
+ else
+ {
+ sse2_movd(p->func, tmp, dataXMM);
+ x86_mov16(p->func, dst, tmp);
+ if(output_desc->nr_channels >= 2)
+ x86_mov16_imm(p->func, x86_make_disp(dst, 2), imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]);
+ }
+ }
+ else
+ {
+ if(output_desc->nr_channels >= 2 && swizzle[1] >= UTIL_FORMAT_SWIZZLE_0)
+ x86_mov_imm(p->func, dst, (imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0] << 16) | imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);
+ else
+ {
+ x86_mov16_imm(p->func, dst, imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);
+ if(output_desc->nr_channels >= 2)
+ {
+ sse2_movd(p->func, tmp, dataXMM);
+ x86_shr_imm(p->func, tmp, 16);
+ x86_mov16(p->func, x86_make_disp(dst, 2), tmp);
+ }
+ }
+ }
+
+ if(output_desc->nr_channels >= 3)
+ {
+ if(swizzle[2] < UTIL_FORMAT_SWIZZLE_0)
+ {
+ if(output_desc->nr_channels >= 4 && swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
+ {
+ sse2_psrlq_imm(p->func, dataXMM, 32);
+ sse2_movd(p->func, x86_make_disp(dst, 4), dataXMM);
+ }
+ else
+ {
+ sse2_psrlq_imm(p->func, dataXMM, 32);
+ sse2_movd(p->func, tmp, dataXMM);
+ x86_mov16(p->func, x86_make_disp(dst, 4), tmp);
+ if(output_desc->nr_channels >= 4)
+ {
+ x86_mov16_imm(p->func, x86_make_disp(dst, 6), imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]);
+ }
+ }
+ }
+ else
+ {
+ if(output_desc->nr_channels >= 4 && swizzle[3] >= UTIL_FORMAT_SWIZZLE_0)
+ x86_mov_imm(p->func, x86_make_disp(dst, 4), (imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0] << 16) | imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]);
+ else
+ {
+ x86_mov16_imm(p->func, x86_make_disp(dst, 4), imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]);
+
+ if(output_desc->nr_channels >= 4)
+ {
+ sse2_psrlq_imm(p->func, dataXMM, 48);
+ sse2_movd(p->func, tmp, dataXMM);
+ x86_mov16(p->func, x86_make_disp(dst, 6), tmp);
+ }
+ }
+ }
+ }
+ }
+ return TRUE;
+ }
+ else if(!memcmp(&output_desc->channel[0], &input_desc->channel[0], sizeof(output_desc->channel[0])))
+ {
+ struct x86_reg tmp = p->tmp_EAX;
+ unsigned i;
+ if(input_desc->channel[0].size == 8 && input_desc->nr_channels == 4 && output_desc->nr_channels == 4
+ && swizzle[0] == UTIL_FORMAT_SWIZZLE_W
+ && swizzle[1] == UTIL_FORMAT_SWIZZLE_Z
+ && swizzle[2] == UTIL_FORMAT_SWIZZLE_Y
+ && swizzle[3] == UTIL_FORMAT_SWIZZLE_X)
+ {
+ /* TODO: support movbe */
+ x86_mov(p->func, tmp, src);
+ x86_bswap(p->func, tmp);
+ x86_mov(p->func, dst, tmp);
+ return TRUE;
+ }
-static void emit_store_R8G8B8A8_UNORM( struct translate_sse *p,
- struct x86_reg dest,
- struct x86_reg dataXMM )
-{
- /* Scale by 255.0
- */
- sse_mulps(p->func, dataXMM, get_255(p));
+ for(i = 0; i < output_desc->nr_channels; ++i)
+ {
+ switch(output_desc->channel[0].size)
+ {
+ case 8:
+ if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
+ {
+ unsigned v = 0;
+ if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
+ {
+ switch(output_desc->channel[0].type)
+ {
+ case UTIL_FORMAT_TYPE_UNSIGNED:
+ v = output_desc->channel[0].normalized ? 0xff : 1;
+ break;
+ case UTIL_FORMAT_TYPE_SIGNED:
+ v = output_desc->channel[0].normalized ? 0x7f : 1;
+ break;
+ default:
+ return FALSE;
+ }
+ }
+ x86_mov8_imm(p->func, x86_make_disp(dst, i * 1), v);
+ }
+ else
+ {
+ x86_mov8(p->func, tmp, x86_make_disp(src, swizzle[i] * 1));
+ x86_mov8(p->func, x86_make_disp(dst, i * 1), tmp);
+ }
+ break;
+ case 16:
+ if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
+ {
+ unsigned v = 0;
+ if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
+ {
+ switch(output_desc->channel[1].type)
+ {
+ case UTIL_FORMAT_TYPE_UNSIGNED:
+ v = output_desc->channel[1].normalized ? 0xffff : 1;
+ break;
+ case UTIL_FORMAT_TYPE_SIGNED:
+ v = output_desc->channel[1].normalized ? 0x7fff : 1;
+ break;
+ case UTIL_FORMAT_TYPE_FLOAT:
+ v = 0x3c00;
+ break;
+ default:
+ return FALSE;
+ }
+ }
+ x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), v);
+ }
+ else if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0)
+ x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), 0);
+ else
+ {
+ x86_mov16(p->func, tmp, x86_make_disp(src, swizzle[i] * 2));
+ x86_mov16(p->func, x86_make_disp(dst, i * 2), tmp);
+ }
+ break;
+ case 32:
+ if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
+ {
+ unsigned v = 0;
+ if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
+ {
+ switch(output_desc->channel[1].type)
+ {
+ case UTIL_FORMAT_TYPE_UNSIGNED:
+ v = output_desc->channel[1].normalized ? 0xffffffff : 1;
+ break;
+ case UTIL_FORMAT_TYPE_SIGNED:
+ v = output_desc->channel[1].normalized ? 0x7fffffff : 1;
+ break;
+ case UTIL_FORMAT_TYPE_FLOAT:
+ v = 0x3f800000;
+ break;
+ default:
+ return FALSE;
+ }
+ }
+ x86_mov_imm(p->func, x86_make_disp(dst, i * 4), v);
+ }
+ else
+ {
+ x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 4));
+ x86_mov(p->func, x86_make_disp(dst, i * 4), tmp);
+ }
+ break;
+ case 64:
+ if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
+ {
+ unsigned l = 0;
+ unsigned h = 0;
+ if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
+ {
+ switch(output_desc->channel[1].type)
+ {
+ case UTIL_FORMAT_TYPE_UNSIGNED:
+ h = output_desc->channel[1].normalized ? 0xffffffff : 0;
+ l = output_desc->channel[1].normalized ? 0xffffffff : 1;
+ break;
+ case UTIL_FORMAT_TYPE_SIGNED:
+ h = output_desc->channel[1].normalized ? 0x7fffffff : 0;
+ l = output_desc->channel[1].normalized ? 0xffffffff : 1;
+ break;
+ case UTIL_FORMAT_TYPE_FLOAT:
+ h = 0x3ff00000;
+ l = 0;
+ break;
+ default:
+ return FALSE;
+ }
+ }
+ x86_mov_imm(p->func, x86_make_disp(dst, i * 8), l);
+ x86_mov_imm(p->func, x86_make_disp(dst, i * 8 + 4), h);
+ }
+ else
+ {
+ if(x86_target_caps(p->func) & X86_SSE)
+ {
+ struct x86_reg tmpXMM = x86_make_reg(file_XMM, 0);
+ emit_load64(p, tmp, tmpXMM, x86_make_disp(src, swizzle[i] * 8));
+ emit_store64(p, x86_make_disp(dst, i * 8), tmp, tmpXMM);
+ }
+ else
+ {
+ x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8));
+ x86_mov(p->func, x86_make_disp(dst, i * 8), tmp);
+ x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8 + 4));
+ x86_mov(p->func, x86_make_disp(dst, i * 8 + 4), tmp);
+ }
+ }
+ break;
+ default:
+ return FALSE;
+ }
+ }
+ return TRUE;
+ }
+ /* special case for draw's EMIT_4UB (RGBA) and EMIT_4UB_BGRA */
+ else if((x86_target_caps(p->func) & X86_SSE2) &&
+ a->input_format == PIPE_FORMAT_R32G32B32A32_FLOAT && (0
+ || a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM
+ || a->output_format == PIPE_FORMAT_R8G8B8A8_UNORM
+ ))
+ {
+ struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
+
+ /* load */
+ sse_movups(p->func, dataXMM, src);
+
+ if (a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM)
+ sse_shufps(p->func, dataXMM, dataXMM, SHUF(2,1,0,3));
+
+ /* scale by 255.0 */
+ sse_mulps(p->func, dataXMM, get_const(p, CONST_255));
+
+ /* pack and emit */
+ sse2_cvtps2dq(p->func, dataXMM, dataXMM);
+ sse2_packssdw(p->func, dataXMM, dataXMM);
+ sse2_packuswb(p->func, dataXMM, dataXMM);
+ sse2_movd(p->func, dst, dataXMM);
+
+ return TRUE;
+ }
- /* Pack and emit:
- */
- sse2_cvtps2dq(p->func, dataXMM, dataXMM);
- sse2_packssdw(p->func, dataXMM, dataXMM);
- sse2_packuswb(p->func, dataXMM, dataXMM);
- sse_movss(p->func, dest, dataXMM);
+ return FALSE;
}
+static boolean translate_attr( struct translate_sse *p,
+ const struct translate_element *a,
+ struct x86_reg src,
+ struct x86_reg dst)
+{
+ if(a->input_format == a->output_format)
+ {
+ emit_memcpy(p, dst, src, util_format_get_stride(a->input_format, 1));
+ return TRUE;
+ }
+
+ return translate_attr_convert(p, a, src, dst);
+}
+static boolean init_inputs( struct translate_sse *p,
+ unsigned index_size )
+{
+ unsigned i;
+ struct x86_reg instance_id = x86_make_disp(p->machine_EDI,
+ get_offset(p, &p->instance_id));
+
+ for (i = 0; i < p->nr_buffer_varients; i++) {
+ struct translate_buffer_varient *varient = &p->buffer_varient[i];
+ struct translate_buffer *buffer = &p->buffer[varient->buffer_index];
+
+ if (!index_size || varient->instance_divisor) {
+ struct x86_reg buf_stride = x86_make_disp(p->machine_EDI,
+ get_offset(p, &buffer->stride));
+ struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI,
+ get_offset(p, &varient->ptr));
+ struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDI,
+ get_offset(p, &buffer->base_ptr));
+ struct x86_reg elt = p->idx_ESI;
+ struct x86_reg tmp_EAX = p->tmp_EAX;
+
+ /* Calculate pointer to first attrib:
+ * base_ptr + stride * index, where index depends on instance divisor
+ */
+ if (varient->instance_divisor) {
+ /* Our index is instance ID divided by instance divisor.
+ */
+ x86_mov(p->func, tmp_EAX, instance_id);
+
+ if (varient->instance_divisor != 1) {
+ struct x86_reg tmp_EDX = p->tmp2_EDX;
+ struct x86_reg tmp_ECX = p->src_ECX;
+
+ /* TODO: Add x86_shr() to rtasm and use it whenever
+ * instance divisor is power of two.
+ */
+
+ x86_xor(p->func, tmp_EDX, tmp_EDX);
+ x86_mov_reg_imm(p->func, tmp_ECX, varient->instance_divisor);
+ x86_div(p->func, tmp_ECX); /* EAX = EDX:EAX / ECX */
+ }
+ } else {
+ x86_mov(p->func, tmp_EAX, elt);
+ }
+
+ /*
+ * TODO: Respect translate_buffer::max_index.
+ */
+
+ x86_imul(p->func, tmp_EAX, buf_stride);
+ x64_rexw(p->func);
+ x86_add(p->func, tmp_EAX, buf_base_ptr);
+
+
+ /* In the linear case, keep the buffer pointer instead of the
+ * index number.
+ */
+ if (!index_size && p->nr_buffer_varients == 1)
+ {
+ x64_rexw(p->func);
+ x86_mov(p->func, elt, tmp_EAX);
+ }
+ else
+ {
+ x64_rexw(p->func);
+ x86_mov(p->func, buf_ptr, tmp_EAX);
+ }
+ }
+ }
+ return TRUE;
+}
-static void get_src_ptr( struct translate_sse *p,
- struct x86_reg srcEAX,
- struct x86_reg translateREG,
- struct x86_reg eltREG,
- unsigned a )
+static struct x86_reg get_buffer_ptr( struct translate_sse *p,
+ unsigned index_size,
+ unsigned var_idx,
+ struct x86_reg elt )
{
- struct x86_reg input_ptr =
- x86_make_disp(translateREG,
- get_offset(p, &p->attrib[a].input_ptr));
+ if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) {
+ return x86_make_disp(p->machine_EDI,
+ get_offset(p, &p->instance_id));
+ }
+ if (!index_size && p->nr_buffer_varients == 1) {
+ return p->idx_ESI;
+ }
+ else if (!index_size || p->buffer_varient[var_idx].instance_divisor) {
+ struct x86_reg ptr = p->src_ECX;
+ struct x86_reg buf_ptr =
+ x86_make_disp(p->machine_EDI,
+ get_offset(p, &p->buffer_varient[var_idx].ptr));
+
+ x64_rexw(p->func);
+ x86_mov(p->func, ptr, buf_ptr);
+ return ptr;
+ }
+ else {
+ struct x86_reg ptr = p->src_ECX;
+ const struct translate_buffer_varient *varient = &p->buffer_varient[var_idx];
- struct x86_reg input_stride =
- x86_make_disp(translateREG,
- get_offset(p, &p->attrib[a].input_stride));
+ struct x86_reg buf_stride =
+ x86_make_disp(p->machine_EDI,
+ get_offset(p, &p->buffer[varient->buffer_index].stride));
- /* Calculate pointer to current attrib:
- */
- x86_mov(p->func, srcEAX, input_stride);
- x86_imul(p->func, srcEAX, eltREG);
- x86_add(p->func, srcEAX, input_ptr);
-}
+ struct x86_reg buf_base_ptr =
+ x86_make_disp(p->machine_EDI,
+ get_offset(p, &p->buffer[varient->buffer_index].base_ptr));
-/* Extended swizzles? Maybe later.
- */
-static void emit_swizzle( struct translate_sse *p,
- struct x86_reg dest,
- struct x86_reg src,
- unsigned char shuffle )
-{
- sse_shufps(p->func, dest, src, shuffle);
+
+ /* Calculate pointer to current attrib:
+ */
+ switch(index_size)
+ {
+ case 1:
+ x86_movzx8(p->func, ptr, elt);
+ break;
+ case 2:
+ x86_movzx16(p->func, ptr, elt);
+ break;
+ case 4:
+ x86_mov(p->func, ptr, elt);
+ break;
+ }
+ x86_imul(p->func, ptr, buf_stride);
+ x64_rexw(p->func);
+ x86_add(p->func, ptr, buf_base_ptr);
+ return ptr;
+ }
}
-static boolean translate_attr( struct translate_sse *p,
- const struct translate_element *a,
- struct x86_reg srcECX,
- struct x86_reg dstEAX)
-{
- struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
- switch (a->input_format) {
- case PIPE_FORMAT_R32_FLOAT:
- emit_load_R32(p, dataXMM, srcECX);
- break;
- case PIPE_FORMAT_R32G32_FLOAT:
- emit_load_R32G32(p, dataXMM, srcECX);
- break;
- case PIPE_FORMAT_R32G32B32_FLOAT:
- emit_load_R32G32B32(p, dataXMM, srcECX);
- break;
- case PIPE_FORMAT_R32G32B32A32_FLOAT:
- emit_load_R32G32B32A32(p, dataXMM, srcECX);
- break;
- case PIPE_FORMAT_B8G8R8A8_UNORM:
- emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX);
- emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W));
- break;
- case PIPE_FORMAT_R8G8B8A8_UNORM:
- emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX);
- break;
- default:
- return FALSE;
+static boolean incr_inputs( struct translate_sse *p,
+ unsigned index_size )
+{
+ if (!index_size && p->nr_buffer_varients == 1) {
+ struct x86_reg stride = x86_make_disp(p->machine_EDI,
+ get_offset(p, &p->buffer[0].stride));
+
+ if (p->buffer_varient[0].instance_divisor == 0) {
+ x64_rexw(p->func);
+ x86_add(p->func, p->idx_ESI, stride);
+ sse_prefetchnta(p->func, x86_make_disp(p->idx_ESI, 192));
+ }
}
+ else if (!index_size) {
+ unsigned i;
- switch (a->output_format) {
- case PIPE_FORMAT_R32_FLOAT:
- emit_store_R32(p, dstEAX, dataXMM);
- break;
- case PIPE_FORMAT_R32G32_FLOAT:
- emit_store_R32G32(p, dstEAX, dataXMM);
- break;
- case PIPE_FORMAT_R32G32B32_FLOAT:
- emit_store_R32G32B32(p, dstEAX, dataXMM);
- break;
- case PIPE_FORMAT_R32G32B32A32_FLOAT:
- emit_store_R32G32B32A32(p, dstEAX, dataXMM);
- break;
- case PIPE_FORMAT_B8G8R8A8_UNORM:
- emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W));
- emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM);
- break;
- case PIPE_FORMAT_R8G8B8A8_UNORM:
- emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM);
- break;
- default:
- return FALSE;
+ /* Is this worthwhile??
+ */
+ for (i = 0; i < p->nr_buffer_varients; i++) {
+ struct translate_buffer_varient *varient = &p->buffer_varient[i];
+ struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI,
+ get_offset(p, &varient->ptr));
+ struct x86_reg buf_stride = x86_make_disp(p->machine_EDI,
+ get_offset(p, &p->buffer[varient->buffer_index].stride));
+
+ if (varient->instance_divisor == 0) {
+ x86_mov(p->func, p->tmp_EAX, buf_stride);
+ x64_rexw(p->func);
+ x86_add(p->func, p->tmp_EAX, buf_ptr);
+ if (i == 0) sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
+ x64_rexw(p->func);
+ x86_mov(p->func, buf_ptr, p->tmp_EAX);
+ }
+ }
+ }
+ else {
+ x64_rexw(p->func);
+ x86_lea(p->func, p->idx_ESI, x86_make_disp(p->idx_ESI, index_size));
}
-
+
return TRUE;
}
-/* Build run( struct translate *translate,
+
+/* Build run( struct translate *machine,
* unsigned start,
* unsigned count,
* void *output_buffer )
* or
- * run_elts( struct translate *translate,
+ * run_elts( struct translate *machine,
* unsigned *elts,
* unsigned count,
* void *output_buffer )
*/
static boolean build_vertex_emit( struct translate_sse *p,
struct x86_function *func,
- boolean linear )
+ unsigned index_size )
{
- struct x86_reg vertexECX = x86_make_reg(file_REG32, reg_AX);
- struct x86_reg idxEBX = x86_make_reg(file_REG32, reg_BX);
- struct x86_reg srcEAX = x86_make_reg(file_REG32, reg_CX);
- struct x86_reg countEBP = x86_make_reg(file_REG32, reg_BP);
- struct x86_reg translateESI = x86_make_reg(file_REG32, reg_SI);
int fixup, label;
unsigned j;
+ memset(p->reg_to_const, 0xff, sizeof(p->reg_to_const));
+ memset(p->const_to_reg, 0xff, sizeof(p->const_to_reg));
+
+ p->tmp_EAX = x86_make_reg(file_REG32, reg_AX);
+ p->idx_ESI = x86_make_reg(file_REG32, reg_SI);
+ p->outbuf_EBX = x86_make_reg(file_REG32, reg_BX);
+ p->machine_EDI = x86_make_reg(file_REG32, reg_DI);
+ p->count_EBP = x86_make_reg(file_REG32, reg_BP);
+ p->tmp2_EDX = x86_make_reg(file_REG32, reg_DX);
+ p->src_ECX = x86_make_reg(file_REG32, reg_CX);
+
p->func = func;
- p->loaded_inv_255 = FALSE;
- p->loaded_255 = FALSE;
- p->loaded_identity = FALSE;
x86_init_func(p->func);
- /* Push a few regs?
- */
- x86_push(p->func, countEBP);
- x86_push(p->func, translateESI);
- x86_push(p->func, idxEBX);
+ if(x86_target(p->func) == X86_64_WIN64_ABI)
+ {
+ /* the ABI guarantees a 16-byte aligned 32-byte "shadow space" above the return address */
+ sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8), x86_make_reg(file_XMM, 6));
+ sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24), x86_make_reg(file_XMM, 7));
+ }
- /* Get vertex count, compare to zero
- */
- x86_xor(p->func, idxEBX, idxEBX);
- x86_mov(p->func, countEBP, x86_fn_arg(p->func, 3));
- x86_cmp(p->func, countEBP, idxEBX);
- fixup = x86_jcc_forward(p->func, cc_E);
+ x86_push(p->func, p->outbuf_EBX);
+ x86_push(p->func, p->count_EBP);
- /* If linear, idx is the current element, otherwise it is a pointer
- * to the current element.
- */
- x86_mov(p->func, idxEBX, x86_fn_arg(p->func, 2));
+/* on non-Win64 x86-64, these are already in the right registers */
+ if(x86_target(p->func) != X86_64_STD_ABI)
+ {
+ x86_push(p->func, p->machine_EDI);
+ x86_push(p->func, p->idx_ESI);
+
+ x86_mov(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));
+ x86_mov(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));
+ }
- /* Initialize destination register.
+ x86_mov(p->func, p->count_EBP, x86_fn_arg(p->func, 3));
+
+ if(x86_target(p->func) != X86_32)
+ x64_mov64(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 5));
+ else
+ x86_mov(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 5));
+
+ /* Load instance ID.
*/
- x86_mov(p->func, vertexECX, x86_fn_arg(p->func, 4));
+ if (p->use_instancing) {
+ x86_mov(p->func,
+ p->tmp_EAX,
+ x86_fn_arg(p->func, 4));
+ x86_mov(p->func,
+ x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)),
+ p->tmp_EAX);
+ }
- /* Move argument 1 (translate_sse pointer) into a reg:
+ /* Get vertex count, compare to zero
*/
- x86_mov(p->func, translateESI, x86_fn_arg(p->func, 1));
+ x86_xor(p->func, p->tmp_EAX, p->tmp_EAX);
+ x86_cmp(p->func, p->count_EBP, p->tmp_EAX);
+ fixup = x86_jcc_forward(p->func, cc_E);
-
/* always load, needed or not:
*/
+ init_inputs(p, index_size);
- /* Note address for loop jump */
+ /* Note address for loop jump
+ */
label = x86_get_label(p->func);
-
-
- for (j = 0; j < p->translate.key.nr_elements; j++) {
- const struct translate_element *a = &p->translate.key.element[j];
-
- struct x86_reg destEAX = x86_make_disp(vertexECX,
- a->output_offset);
-
- /* Figure out source pointer address:
- */
- if (linear) {
- get_src_ptr(p, srcEAX, translateESI, idxEBX, j);
+ {
+ struct x86_reg elt = !index_size ? p->idx_ESI : x86_deref(p->idx_ESI);
+ int last_varient = -1;
+ struct x86_reg vb;
+
+ for (j = 0; j < p->translate.key.nr_elements; j++) {
+ const struct translate_element *a = &p->translate.key.element[j];
+ unsigned varient = p->element_to_buffer_varient[j];
+
+ /* Figure out source pointer address:
+ */
+ if (varient != last_varient) {
+ last_varient = varient;
+ vb = get_buffer_ptr(p, index_size, varient, elt);
+ }
+
+ if (!translate_attr( p, a,
+ x86_make_disp(vb, a->input_offset),
+ x86_make_disp(p->outbuf_EBX, a->output_offset)))
+ return FALSE;
}
- else {
- get_src_ptr(p, srcEAX, translateESI, x86_deref(idxEBX), j);
- }
-
- if (!translate_attr( p, a, x86_deref(srcEAX), destEAX ))
- return FALSE;
- }
-
- /* Next vertex:
- */
- x86_lea(p->func, vertexECX, x86_make_disp(vertexECX, p->translate.key.output_stride));
- /* Incr index
- */
- if (linear) {
- x86_inc(p->func, idxEBX);
- }
- else {
- x86_lea(p->func, idxEBX, x86_make_disp(idxEBX, 4));
+ /* Next output vertex:
+ */
+ x64_rexw(p->func);
+ x86_lea(p->func,
+ p->outbuf_EBX,
+ x86_make_disp(p->outbuf_EBX,
+ p->translate.key.output_stride));
+
+ /* Incr index
+ */
+ incr_inputs( p, index_size );
}
/* decr count, loop if not zero
*/
- x86_dec(p->func, countEBP);
- x86_test(p->func, countEBP, countEBP);
+ x86_dec(p->func, p->count_EBP);
x86_jcc(p->func, cc_NZ, label);
/* Exit mmx state?
/* Pop regs and return
*/
- x86_pop(p->func, idxEBX);
- x86_pop(p->func, translateESI);
- x86_pop(p->func, countEBP);
+ if(x86_target(p->func) != X86_64_STD_ABI)
+ {
+ x86_pop(p->func, p->idx_ESI);
+ x86_pop(p->func, p->machine_EDI);
+ }
+
+ x86_pop(p->func, p->count_EBP);
+ x86_pop(p->func, p->outbuf_EBX);
+
+ if(x86_target(p->func) == X86_64_WIN64_ABI)
+ {
+ sse2_movdqa(p->func, x86_make_reg(file_XMM, 6), x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8));
+ sse2_movdqa(p->func, x86_make_reg(file_XMM, 7), x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24));
+ }
x86_ret(p->func);
return TRUE;
static void translate_sse_set_buffer( struct translate *translate,
unsigned buf,
const void *ptr,
- unsigned stride )
+ unsigned stride,
+ unsigned max_index )
{
struct translate_sse *p = (struct translate_sse *)translate;
- unsigned i;
- for (i = 0; i < p->translate.key.nr_elements; i++) {
- if (p->translate.key.element[i].input_buffer == buf) {
- p->attrib[i].input_ptr = ((char *)ptr +
- p->translate.key.element[i].input_offset);
- p->attrib[i].input_stride = stride;
- }
+ if (buf < p->nr_buffers) {
+ p->buffer[buf].base_ptr = (char *)ptr;
+ p->buffer[buf].stride = stride;
+ p->buffer[buf].max_index = max_index;
}
+
+ if (0) debug_printf("%s %d/%d: %p %d\n",
+ __FUNCTION__, buf,
+ p->nr_buffers,
+ ptr, stride);
}
x86_release_func( &p->linear_func );
x86_release_func( &p->elt_func );
- FREE(p);
-}
-
-static void PIPE_CDECL translate_sse_run_elts( struct translate *translate,
- const unsigned *elts,
- unsigned count,
- void *output_buffer )
-{
- struct translate_sse *p = (struct translate_sse *)translate;
-
- p->gen_run_elts( translate,
- elts,
- count,
- output_buffer );
-}
-
-static void PIPE_CDECL translate_sse_run( struct translate *translate,
- unsigned start,
- unsigned count,
- void *output_buffer )
-{
- struct translate_sse *p = (struct translate_sse *)translate;
-
- p->gen_run( translate,
- start,
- count,
- output_buffer );
+ os_free_aligned(p);
}
struct translate *translate_sse2_create( const struct translate_key *key )
{
struct translate_sse *p = NULL;
+ unsigned i;
- if (!rtasm_cpu_has_sse() || !rtasm_cpu_has_sse2())
+ /* this is misnamed, it actually refers to whether rtasm is enabled or not */
+ if (!rtasm_cpu_has_sse())
goto fail;
- p = CALLOC_STRUCT( translate_sse );
+ p = os_malloc_aligned(sizeof(struct translate_sse), 16);
if (p == NULL)
goto fail;
+ memset(p, 0, sizeof(*p));
+ memcpy(p->consts, consts, sizeof(consts));
p->translate.key = *key;
p->translate.release = translate_sse_release;
p->translate.set_buffer = translate_sse_set_buffer;
- p->translate.run_elts = translate_sse_run_elts;
- p->translate.run = translate_sse_run;
- if (!build_vertex_emit(p, &p->linear_func, TRUE))
+ for (i = 0; i < key->nr_elements; i++) {
+ if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) {
+ unsigned j;
+
+ p->nr_buffers = MAX2(p->nr_buffers, key->element[i].input_buffer + 1);
+
+ if (key->element[i].instance_divisor) {
+ p->use_instancing = TRUE;
+ }
+
+ /*
+ * Map vertex element to vertex buffer varient.
+ */
+ for (j = 0; j < p->nr_buffer_varients; j++) {
+ if (p->buffer_varient[j].buffer_index == key->element[i].input_buffer &&
+ p->buffer_varient[j].instance_divisor == key->element[i].instance_divisor) {
+ break;
+ }
+ }
+ if (j == p->nr_buffer_varients) {
+ p->buffer_varient[j].buffer_index = key->element[i].input_buffer;
+ p->buffer_varient[j].instance_divisor = key->element[i].instance_divisor;
+ p->nr_buffer_varients++;
+ }
+ p->element_to_buffer_varient[i] = j;
+ } else {
+ assert(key->element[i].type == TRANSLATE_ELEMENT_INSTANCE_ID);
+
+ p->element_to_buffer_varient[i] = ELEMENT_BUFFER_INSTANCE_ID;
+ }
+ }
+
+ if (0) debug_printf("nr_buffers: %d\n", p->nr_buffers);
+
+ if (!build_vertex_emit(p, &p->linear_func, 0))
+ goto fail;
+
+ if (!build_vertex_emit(p, &p->elt_func, 4))
+ goto fail;
+
+ if (!build_vertex_emit(p, &p->elt16_func, 2))
+ goto fail;
+
+ if (!build_vertex_emit(p, &p->elt8_func, 1))
+ goto fail;
+
+ p->translate.run = (void*)x86_get_func(&p->linear_func);
+ if (p->translate.run == NULL)
goto fail;
- if (!build_vertex_emit(p, &p->elt_func, FALSE))
+ p->translate.run_elts = (void*)x86_get_func(&p->elt_func);
+ if (p->translate.run_elts == NULL)
goto fail;
- p->gen_run = (run_func)x86_get_func(&p->linear_func);
- if (p->gen_run == NULL)
+ p->translate.run_elts16 = (void*)x86_get_func(&p->elt16_func);
+ if (p->translate.run_elts16 == NULL)
goto fail;
- p->gen_run_elts = (run_elts_func)x86_get_func(&p->elt_func);
- if (p->gen_run_elts == NULL)
+ p->translate.run_elts8 = (void*)x86_get_func(&p->elt8_func);
+ if (p->translate.run_elts8 == NULL)
goto fail;
return &p->translate;