#define ELEMENT_BUFFER_INSTANCE_ID 1001
+#define NUM_CONSTS 7
+
+enum /* indices of the rows of consts[]; order matches its initializer */
+{
+ CONST_IDENTITY, /* row {0, 0, 0, 1} */
+ CONST_INV_127, /* 1/127 in all four floats */
+ CONST_INV_255, /* 1/255 in all four floats */
+ CONST_INV_32767, /* 1/32767 in all four floats */
+ CONST_INV_65535, /* 1/65535 in all four floats */
+ CONST_INV_2147483647, /* 1/2147483647 in all four floats */
+ CONST_255 /* 255 in all four floats; last entry, so NUM_CONSTS must stay at 7 */
+};
+
+#define C(v) {(float)(v), (float)(v), (float)(v), (float)(v)} /* one row with v repeated in all four floats */
+static float consts[NUM_CONSTS][4] = { /* master table; rows are in CONST_* enum order */
+ {0, 0, 0, 1}, /* CONST_IDENTITY */
+ C(1.0 / 127.0), /* CONST_INV_127 */
+ C(1.0 / 255.0), /* CONST_INV_255 */
+ C(1.0 / 32767.0), /* CONST_INV_32767 */
+ C(1.0 / 65535.0), /* CONST_INV_65535 */
+ C(1.0 / 2147483647.0), /* CONST_INV_2147483647 */
+ C(255.0) /* CONST_255 */
+};
+#undef C
struct translate_sse {
struct translate translate;
struct x86_function elt8_func;
struct x86_function *func;
- boolean loaded_identity;
- boolean loaded_const[5];
-
- float identity[4];
- float const_value[5][4];
+ PIPE_ALIGN_VAR(16) float consts[NUM_CONSTS][4];
+ int8_t reg_to_const[16];
+ int8_t const_to_reg[NUM_CONSTS];
struct translate_buffer buffer[PIPE_MAX_ATTRIBS];
unsigned nr_buffers;
*/
struct x86_reg tmp_EAX;
struct x86_reg tmp2_EDX;
- struct x86_reg tmp3_ECX;
+ struct x86_reg src_ECX;
struct x86_reg idx_ESI; /* either start+i or &elt[i] */
struct x86_reg machine_EDI;
struct x86_reg outbuf_EBX;
return (const char *)b - (const char *)a;
}
-
-
-static struct x86_reg get_identity( struct translate_sse *p )
+static struct x86_reg get_const( struct translate_sse *p, unsigned id) /* returns an XMM register holding p->consts[id], emitting a load only when id is not already cached in a register */
{
- struct x86_reg reg = x86_make_reg(file_XMM, 7);
-
- if (!p->loaded_identity) {
- p->loaded_identity = TRUE;
- p->identity[0] = 0;
- p->identity[1] = 0;
- p->identity[2] = 0;
- p->identity[3] = 1;
-
- sse_movups(p->func, reg,
- x86_make_disp(p->machine_EDI,
- get_offset(p, &p->identity[0])));
- }
+ struct x86_reg reg;
+ unsigned i;
- return reg;
-}
+ if(p->const_to_reg[id] >= 0) /* cache hit: a non-negative entry is the XMM index already holding id */
+ return x86_make_reg(file_XMM, p->const_to_reg[id]);
-static struct x86_reg get_const( struct translate_sse *p, unsigned i, float v)
-{
- struct x86_reg reg = x86_make_reg(file_XMM, 2 + i);
-
- if (!p->loaded_const[i]) {
- p->loaded_const[i] = TRUE;
- p->const_value[i][0] =
- p->const_value[i][1] =
- p->const_value[i][2] =
- p->const_value[i][3] = v;
-
- sse_movups(p->func, reg,
- x86_make_disp(p->machine_EDI,
- get_offset(p, &p->const_value[i][0])));
+ for(i = 2; i < 8; ++i) /* search XMM2..XMM7 for a register with no constant assigned */
+ {
+ if(p->reg_to_const[i] < 0) /* negative entry marks a free register */
+ break;
}
- return reg;
-}
+ /* TODO: be smarter here */
+ if(i == 8) /* no free register found: step back to index 7 and reuse XMM7 */
+ --i;
-static struct x86_reg get_inv_127( struct translate_sse *p )
-{
- return get_const(p, 0, 1.0f / 127.0f);
-}
+ reg = x86_make_reg(file_XMM, i);
-static struct x86_reg get_inv_255( struct translate_sse *p )
-{
- return get_const(p, 1, 1.0f / 255.0f);
-}
+ if(p->reg_to_const[i] >= 0) /* evicting: the previous occupant is no longer resident anywhere */
+ p->const_to_reg[p->reg_to_const[i]] = -1;
-static struct x86_reg get_inv_32767( struct translate_sse *p )
-{
- return get_const(p, 2, 1.0f / 32767.0f);
-}
+ p->reg_to_const[i] = id; /* record the new mapping in both directions */
+ p->const_to_reg[id] = i;
-static struct x86_reg get_inv_65535( struct translate_sse *p )
-{
- return get_const(p, 3, 1.0f / 65535.0f);
-}
+ /* TODO: this should happen outside the loop, if possible */
+ sse_movaps(p->func, reg,
+ x86_make_disp(p->machine_EDI,
+ get_offset(p, &p->consts[id][0]))); /* emits a movaps from p->consts[id], addressed relative to machine_EDI; consts is declared PIPE_ALIGN_VAR(16) */
-static struct x86_reg get_inv_2147483647( struct translate_sse *p )
-{
- return get_const(p, 4, 1.0f / 2147483647.0f);
+ return reg;
}
/* load the data in a SSE2 register, padding with zeros */
case 2:
x86_movzx16(p->func, tmp, src);
sse2_movd(p->func, data, tmp);
+ break;
case 3:
x86_movzx8(p->func, tmp, x86_make_disp(src, 2));
x86_shl_imm(p->func, tmp, 16);
x86_mov16(p->func, tmp, src);
sse2_movd(p->func, data, tmp);
+ break;
case 4:
sse2_movd(p->func, data, src);
break;
*/
sse_movss(p->func, data, arg0);
if(out_chans == CHANNELS_0001)
- sse_orps(p->func, data, get_identity(p) );
+ sse_orps(p->func, data, get_const(p, CONST_IDENTITY) );
break;
case 2:
/* 0 0 0 1
* a b 0 1
*/
if(out_chans == CHANNELS_0001)
- sse_shufps(p->func, data, get_identity(p), SHUF(X, Y, Z, W) );
+ sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) );
else if(out_chans > 2)
- sse_movlhps(p->func, data, get_identity(p) );
+ sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY) );
sse_movlps(p->func, data, arg0);
break;
case 3:
*/
sse_movss(p->func, data, x86_make_disp(arg0, 8));
if(out_chans == CHANNELS_0001)
- sse_shufps(p->func, data, get_identity(p), SHUF(X,Y,Z,W) );
+ sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X,Y,Z,W) );
sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) );
sse_movlps(p->func, data, arg0);
break;
else
sse2_cvtsd2ss(p->func, data, data);
if(out_chans == CHANNELS_0001)
- sse_shufps(p->func, data, get_identity(p), SHUF(X, Y, Z, W) );
+ sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) );
break;
case 2:
sse2_movupd(p->func, data, arg0);
sse2_cvtpd2ps(p->func, data, data);
if(out_chans == CHANNELS_0001)
- sse_shufps(p->func, data, get_identity(p), SHUF(X, Y, Z, W) );
+ sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) );
else if(out_chans > 2)
- sse_movlhps(p->func, data, get_identity(p) );
+ sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY) );
break;
case 3:
sse2_movupd(p->func, data, arg0);
sse2_cvtsd2ss(p->func, tmpXMM, tmpXMM);
sse_movlhps(p->func, data, tmpXMM);
if(out_chans == CHANNELS_0001)
- sse_orps(p->func, data, get_identity(p) );
+ sse_orps(p->func, data, get_const(p, CONST_IDENTITY) );
break;
case 4:
sse2_movupd(p->func, data, arg0);
|| a->output_format == PIPE_FORMAT_R32G32B32A32_FLOAT))
{
struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
- struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
for(i = 0; i < output_desc->nr_channels; ++i)
{
{
case 8:
/* TODO: this may be inefficient due to get_identity() being used both as a float and integer register */
- sse2_punpcklbw(p->func, dataXMM, get_identity(p));
- sse2_punpcklbw(p->func, dataXMM, get_identity(p));
+ sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
+ sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
break;
case 16:
- sse2_punpcklwd(p->func, dataXMM, get_identity(p));
+ sse2_punpcklwd(p->func, dataXMM, get_const(p, CONST_IDENTITY));
break;
case 32: /* we lose precision here */
sse2_psrld_imm(p->func, dataXMM, 1);
switch(input_desc->channel[0].size)
{
case 8:
- factor = get_inv_255(p);
+ factor = get_const(p, CONST_INV_255);
break;
case 16:
- factor = get_inv_65535(p);
+ factor = get_const(p, CONST_INV_65535);
break;
case 32:
- factor = get_inv_2147483647(p);
+ factor = get_const(p, CONST_INV_2147483647);
+ break;
+ default:
+ assert(0);
+ factor.disp = 0;
+ factor.file = 0;
+ factor.idx = 0;
+ factor.mod = 0;
break;
}
sse_mulps(p->func, dataXMM, factor);
switch(input_desc->channel[0].size)
{
case 8:
- factor = get_inv_127(p);
+ factor = get_const(p, CONST_INV_127);
break;
case 16:
- factor = get_inv_32767(p);
+ factor = get_const(p, CONST_INV_32767);
break;
case 32:
- factor = get_inv_2147483647(p);
+ factor = get_const(p, CONST_INV_2147483647);
+ break;
+ default:
+ assert(0);
+ factor.disp = 0;
+ factor.file = 0;
+ factor.idx = 0;
+ factor.mod = 0;
break;
}
sse_mulps(p->func, dataXMM, factor);
sse2_psrlw_imm(p->func, dataXMM, 1);
}
else
- sse2_punpcklbw(p->func, dataXMM, get_identity(p));
+ sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
break;
case UTIL_FORMAT_TYPE_SIGNED:
if(input_desc->channel[0].normalized)
{
- sse2_movq(p->func, tmpXMM, get_identity(p));
+ sse2_movq(p->func, tmpXMM, get_const(p, CONST_IDENTITY));
sse2_punpcklbw(p->func, tmpXMM, dataXMM);
sse2_psllw_imm(p->func, dataXMM, 9);
sse2_psrlw_imm(p->func, dataXMM, 8);
else if(!memcmp(&output_desc->channel[0], &input_desc->channel[0], sizeof(output_desc->channel[0])))
{
struct x86_reg tmp = p->tmp_EAX;
+ unsigned i;
if(input_desc->channel[0].size == 8 && input_desc->nr_channels == 4 && output_desc->nr_channels == 4
&& swizzle[0] == UTIL_FORMAT_SWIZZLE_W
&& swizzle[1] == UTIL_FORMAT_SWIZZLE_Z
return TRUE;
}
- for(unsigned i = 0; i < output_desc->nr_channels; ++i)
+ for(i = 0; i < output_desc->nr_channels; ++i)
{
switch(output_desc->channel[0].size)
{
}
return TRUE;
}
+ /* special case for draw's EMIT_4UB (RGBA) and EMIT_4UB_BGRA */
+ else if((x86_target_caps(p->func) & X86_SSE2) &&
+ a->input_format == PIPE_FORMAT_R32G32B32A32_FLOAT && (0
+ || a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM
+ || a->output_format == PIPE_FORMAT_R8G8B8A8_UNORM
+ ))
+ {
+ struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
+
+ /* load */
+ sse_movups(p->func, dataXMM, src);
+
+ if (a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM)
+ sse_shufps(p->func, dataXMM, dataXMM, SHUF(2,1,0,3));
+
+ /* scale by 255.0 */
+ sse_mulps(p->func, dataXMM, get_const(p, CONST_255));
+
+ /* pack and emit */
+ sse2_cvtps2dq(p->func, dataXMM, dataXMM);
+ sse2_packssdw(p->func, dataXMM, dataXMM);
+ sse2_packuswb(p->func, dataXMM, dataXMM);
+ sse2_movd(p->func, dst, dataXMM);
+
+ return TRUE;
+ }
+
return FALSE;
}
if (varient->instance_divisor != 1) {
struct x86_reg tmp_EDX = p->tmp2_EDX;
- struct x86_reg tmp_ECX = p->tmp3_ECX;
+ struct x86_reg tmp_ECX = p->src_ECX;
/* TODO: Add x86_shr() to rtasm and use it whenever
* instance divisor is power of two.
return p->idx_ESI;
}
else if (!index_size || p->buffer_varient[var_idx].instance_divisor) {
- struct x86_reg ptr = p->tmp_EAX;
+ struct x86_reg ptr = p->src_ECX;
struct x86_reg buf_ptr =
x86_make_disp(p->machine_EDI,
get_offset(p, &p->buffer_varient[var_idx].ptr));
return ptr;
}
else {
- struct x86_reg ptr = p->tmp_EAX;
+ struct x86_reg ptr = p->src_ECX;
const struct translate_buffer_varient *varient = &p->buffer_varient[var_idx];
struct x86_reg buf_stride =
}
}
else {
+ x64_rexw(p->func);
x86_lea(p->func, p->idx_ESI, x86_make_disp(p->idx_ESI, index_size));
}
int fixup, label;
unsigned j;
+ memset(p->reg_to_const, 0xff, sizeof(p->reg_to_const));
+ memset(p->const_to_reg, 0xff, sizeof(p->const_to_reg));
+
p->tmp_EAX = x86_make_reg(file_REG32, reg_AX);
p->idx_ESI = x86_make_reg(file_REG32, reg_SI);
p->outbuf_EBX = x86_make_reg(file_REG32, reg_BX);
p->machine_EDI = x86_make_reg(file_REG32, reg_DI);
p->count_EBP = x86_make_reg(file_REG32, reg_BP);
p->tmp2_EDX = x86_make_reg(file_REG32, reg_DX);
- p->tmp3_ECX = x86_make_reg(file_REG32, reg_CX);
+ p->src_ECX = x86_make_reg(file_REG32, reg_CX);
p->func = func;
- memset(&p->loaded_const, 0, sizeof(p->loaded_const));
- p->loaded_identity = FALSE;
x86_init_func(p->func);
x86_release_func( &p->linear_func );
x86_release_func( &p->elt_func );
- FREE(p);
+ os_free_aligned(p);
}
if (!rtasm_cpu_has_sse())
goto fail;
- p = CALLOC_STRUCT( translate_sse );
+ p = os_malloc_aligned(sizeof(struct translate_sse), 16);
if (p == NULL)
goto fail;
+ memset(p, 0, sizeof(*p));
+ memcpy(p->consts, consts, sizeof(consts));
p->translate.key = *key;
p->translate.release = translate_sse_release;
if (!build_vertex_emit(p, &p->elt8_func, 1))
goto fail;
- p->translate.run = (void*)x86_get_func(&p->linear_func);
+ p->translate.run = (run_func) x86_get_func(&p->linear_func);
if (p->translate.run == NULL)
goto fail;
- p->translate.run_elts = (void*)x86_get_func(&p->elt_func);
+ p->translate.run_elts = (run_elts_func) x86_get_func(&p->elt_func);
if (p->translate.run_elts == NULL)
goto fail;
- p->translate.run_elts16 = (void*)x86_get_func(&p->elt16_func);
+ p->translate.run_elts16 = (run_elts16_func) x86_get_func(&p->elt16_func);
if (p->translate.run_elts16 == NULL)
goto fail;
- p->translate.run_elts8 = (void*)x86_get_func(&p->elt8_func);
+ p->translate.run_elts8 = (run_elts8_func) x86_get_func(&p->elt8_func);
if (p->translate.run_elts8 == NULL)
goto fail;