#include "pipe/p_config.h"
#include "pipe/p_compiler.h"
#include "util/u_memory.h"
-#include "util/u_simple_list.h"
+#include "util/u_math.h"
#include "translate.h"
+/* Signature of the generated linear-run code: translates vertices
+ * [start, start+count).  instance_id is new with instancing support.
+ */
typedef void (PIPE_CDECL *run_func)( struct translate *translate,
unsigned start,
unsigned count,
- void *output_buffer );
+ unsigned instance_id,
+ void *output_buffer);
+/* Signature of the generated indexed-run code: translates the count
+ * vertices selected by elts[].
+ */
typedef void (PIPE_CDECL *run_elts_func)( struct translate *translate,
const unsigned *elts,
unsigned count,
- void *output_buffer );
+ unsigned instance_id,
+ void *output_buffer);
+/* One bound source vertex buffer: start address plus the byte stride
+ * between successive vertices (set in translate_sse_set_buffer).
+ */
+struct translate_buffer {
+ const void *base_ptr;
+ unsigned stride;
+};
+
+/* One way of walking a buffer.  Several vertex elements may read the
+ * same buffer with different instance divisors, so each distinct
+ * (buffer_index, instance_divisor) pair gets its own varient with its
+ * own running pointer.
+ */
+struct translate_buffer_varient {
+ unsigned buffer_index;
+ unsigned instance_divisor;
+ void *ptr; /* updated either per vertex or per instance */
+};
+
+
+/* Out-of-band element_to_buffer_varient[] value marking an element
+ * that sources the instance ID rather than any vertex buffer.
+ */
+#define ELEMENT_BUFFER_INSTANCE_ID 1001
struct translate_sse {
float float_255[4];
float inv_255[4];
- struct {
- char *input_ptr;
- unsigned input_stride;
- } attrib[PIPE_MAX_ATTRIBS];
+ /* Bound vertex buffers, indexed by the elements' input_buffer. */
+ struct translate_buffer buffer[PIPE_MAX_ATTRIBS];
+ unsigned nr_buffers;
+
+ /* Multiple buffer varients can map to a single buffer. */
+ struct translate_buffer_varient buffer_varient[PIPE_MAX_ATTRIBS];
+ unsigned nr_buffer_varients;
+
+ /* Multiple elements can map to a single buffer varient. */
+ unsigned element_to_buffer_varient[PIPE_MAX_ATTRIBS];
+
+ /* Set at create time when any element has a non-zero instance
+ * divisor; instance_id is stored here by the generated prologue.
+ */
+ boolean use_instancing;
+ unsigned instance_id;
+ /* Generated machine-code entry points for linear / indexed runs. */
run_func gen_run;
run_elts_func gen_run_elts;
+ /* these are actually known values, but putting them in a struct
+ * like this is helpful to keep them in sync across the file.
+ */
+ struct x86_reg tmp_EAX;
+ struct x86_reg idx_EBX; /* either start+i or &elt[i] */
+ struct x86_reg outbuf_ECX;
+ struct x86_reg machine_EDX;
+ struct x86_reg count_ESI; /* decrements to zero */
};
static int get_offset( const void *a, const void *b )
struct x86_reg reg = x86_make_reg(file_XMM, 6);
if (!p->loaded_identity) {
- /* Nasty:
- */
- struct x86_reg translateESI = x86_make_reg(file_REG32, reg_SI);
-
p->loaded_identity = TRUE;
p->identity[0] = 0;
p->identity[1] = 0;
p->identity[3] = 1;
sse_movups(p->func, reg,
- x86_make_disp(translateESI,
+ x86_make_disp(p->machine_EDX,
get_offset(p, &p->identity[0])));
}
+/* Return the XMM register caching the 255.0f constant, loading it from
+ * the machine struct (addressed through EDX, no longer ESI) on first
+ * use.  Moved from XMM6 to XMM7 so it no longer aliases the identity
+ * constant, and the dead second return is dropped.
+ */
static struct x86_reg get_255( struct translate_sse *p )
{
- struct x86_reg reg = x86_make_reg(file_XMM, 6);
+ struct x86_reg reg = x86_make_reg(file_XMM, 7);
if (!p->loaded_255) {
- struct x86_reg translateESI = x86_make_reg(file_REG32, reg_SI);
-
p->loaded_255 = TRUE;
p->float_255[0] =
p->float_255[1] =
p->float_255[3] = 255.0f;
sse_movups(p->func, reg,
- x86_make_disp(translateESI,
+ x86_make_disp(p->machine_EDX,
get_offset(p, &p->float_255[0])));
}
return reg;
- return x86_make_reg(file_XMM, 7);
}
static struct x86_reg get_inv_255( struct translate_sse *p )
struct x86_reg reg = x86_make_reg(file_XMM, 5);
if (!p->loaded_inv_255) {
- struct x86_reg translateESI = x86_make_reg(file_REG32, reg_SI);
-
p->loaded_inv_255 = TRUE;
p->inv_255[0] =
p->inv_255[1] =
p->inv_255[3] = 1.0f / 255.0f;
sse_movups(p->func, reg,
- x86_make_disp(translateESI,
+ x86_make_disp(p->machine_EDX,
get_offset(p, &p->inv_255[0])));
}
-static void get_src_ptr( struct translate_sse *p,
- struct x86_reg srcEAX,
- struct x86_reg translateREG,
- struct x86_reg eltREG,
- unsigned a )
-{
- struct x86_reg input_ptr =
- x86_make_disp(translateREG,
- get_offset(p, &p->attrib[a].input_ptr));
-
- struct x86_reg input_stride =
- x86_make_disp(translateREG,
- get_offset(p, &p->attrib[a].input_stride));
-
- /* Calculate pointer to current attrib:
- */
- x86_mov(p->func, srcEAX, input_stride);
- x86_imul(p->func, srcEAX, eltREG);
- x86_add(p->func, srcEAX, input_ptr);
-}
-
-
/* Extended swizzles? Maybe later.
*/
static void emit_swizzle( struct translate_sse *p,
return TRUE;
}
-/* Build run( struct translate *translate,
+
+
+/* Emit code run once before the vertex loop that initializes each
+ * buffer varient's source pointer: ptr = base_ptr + stride * index,
+ * where index is the start index (linear case, taken from EBX) or
+ * instance_id / instance_divisor (instanced case).  Indexed,
+ * non-instanced varients need no setup - their pointers are computed
+ * per element in get_buffer_ptr().
+ */
+static boolean init_inputs( struct translate_sse *p,
+ boolean linear )
+{
+ unsigned i;
+ struct x86_reg instance_id = x86_make_disp(p->machine_EDX,
+ get_offset(p, &p->instance_id));
+
+ for (i = 0; i < p->nr_buffer_varients; i++) {
+ struct translate_buffer_varient *varient = &p->buffer_varient[i];
+ struct translate_buffer *buffer = &p->buffer[varient->buffer_index];
+
+ if (linear || varient->instance_divisor) {
+ struct x86_reg buf_stride = x86_make_disp(p->machine_EDX,
+ get_offset(p, &buffer->stride));
+ struct x86_reg buf_ptr = x86_make_disp(p->machine_EDX,
+ get_offset(p, &varient->ptr));
+ struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDX,
+ get_offset(p, &buffer->base_ptr));
+ struct x86_reg elt = p->idx_EBX;
+ struct x86_reg tmp_EAX = p->tmp_EAX;
+
+ /* Calculate pointer to first attrib:
+ * base_ptr + stride * index, where index depends on instance divisor
+ */
+ if (varient->instance_divisor) {
+ /* Our index is instance ID divided by instance divisor.
+ */
+ x86_mov(p->func, tmp_EAX, instance_id);
+
+ if (varient->instance_divisor != 1) {
+ struct x86_reg tmp_EDX = p->machine_EDX;
+ struct x86_reg tmp_ECX = p->outbuf_ECX;
+
+ /* TODO: Add x86_shr() to rtasm and use it whenever
+ * instance divisor is power of two.
+ */
+
+ /* EDX (machine pointer) and ECX (output pointer) are live
+ * but clobbered by the unsigned divide, so save/restore
+ * them around it.
+ */
+ x86_push(p->func, tmp_EDX);
+ x86_push(p->func, tmp_ECX);
+ x86_xor(p->func, tmp_EDX, tmp_EDX);
+ x86_mov_reg_imm(p->func, tmp_ECX, varient->instance_divisor);
+ x86_div(p->func, tmp_ECX); /* EAX = EDX:EAX / ECX */
+ x86_pop(p->func, tmp_ECX);
+ x86_pop(p->func, tmp_EDX);
+ }
+ } else {
+ x86_mov(p->func, tmp_EAX, elt);
+ }
+ x86_imul(p->func, tmp_EAX, buf_stride);
+ x86_add(p->func, tmp_EAX, buf_base_ptr);
+
+
+ /* In the linear case, keep the buffer pointer instead of the
+ * index number.
+ */
+ if (linear && p->nr_buffer_varients == 1)
+ x86_mov(p->func, elt, tmp_EAX);
+ else
+ x86_mov(p->func, buf_ptr, tmp_EAX);
+ }
+ }
+
+ return TRUE;
+}
+
+
+/* Emit code that produces an x86 operand addressing the current input
+ * data for buffer varient var_idx.  Four cases:
+ * - ELEMENT_BUFFER_INSTANCE_ID: a memory operand on the stored
+ * instance_id field itself (the "data" is the instance ID);
+ * - linear with a single varient: EBX already holds the advancing
+ * pointer (see init_inputs/incr_inputs);
+ * - linear or instanced: reload the per-varient pointer that
+ * init_inputs/incr_inputs maintain in the machine struct;
+ * - indexed: compute base_ptr + stride * elt into EAX per element.
+ */
+static struct x86_reg get_buffer_ptr( struct translate_sse *p,
+ boolean linear,
+ unsigned var_idx,
+ struct x86_reg elt )
+{
+ if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) {
+ return x86_make_disp(p->machine_EDX,
+ get_offset(p, &p->instance_id));
+ }
+ if (linear && p->nr_buffer_varients == 1) {
+ return p->idx_EBX;
+ }
+ else if (linear || p->buffer_varient[var_idx].instance_divisor) {
+ struct x86_reg ptr = p->tmp_EAX;
+ struct x86_reg buf_ptr =
+ x86_make_disp(p->machine_EDX,
+ get_offset(p, &p->buffer_varient[var_idx].ptr));
+
+ x86_mov(p->func, ptr, buf_ptr);
+ return ptr;
+ }
+ else {
+ struct x86_reg ptr = p->tmp_EAX;
+ const struct translate_buffer_varient *varient = &p->buffer_varient[var_idx];
+
+ struct x86_reg buf_stride =
+ x86_make_disp(p->machine_EDX,
+ get_offset(p, &p->buffer[varient->buffer_index].stride));
+
+ struct x86_reg buf_base_ptr =
+ x86_make_disp(p->machine_EDX,
+ get_offset(p, &p->buffer[varient->buffer_index].base_ptr));
+
+
+
+ /* Calculate pointer to current attrib:
+ */
+ x86_mov(p->func, ptr, buf_stride);
+ x86_imul(p->func, ptr, elt);
+ x86_add(p->func, ptr, buf_base_ptr);
+ return ptr;
+ }
+}
+
+
+
+/* Emit the per-iteration input advance at the bottom of the vertex
+ * loop: bump the single pointer kept in EBX (linear, one varient),
+ * or each stored varient pointer by its buffer stride (instanced
+ * varients stay fixed for the whole draw), or step the elt pointer
+ * by sizeof(unsigned) for the indexed path.
+ */
+static boolean incr_inputs( struct translate_sse *p,
+ boolean linear )
+{
+ if (linear && p->nr_buffer_varients == 1) {
+ struct x86_reg stride = x86_make_disp(p->machine_EDX,
+ get_offset(p, &p->buffer[0].stride));
+
+ if (p->buffer_varient[0].instance_divisor == 0) {
+ x86_add(p->func, p->idx_EBX, stride);
+ sse_prefetchnta(p->func, x86_make_disp(p->idx_EBX, 192));
+ }
+ }
+ else if (linear) {
+ unsigned i;
+
+ /* Is this worthwhile??
+ */
+ for (i = 0; i < p->nr_buffer_varients; i++) {
+ struct translate_buffer_varient *varient = &p->buffer_varient[i];
+ struct x86_reg buf_ptr = x86_make_disp(p->machine_EDX,
+ get_offset(p, &varient->ptr));
+ struct x86_reg buf_stride = x86_make_disp(p->machine_EDX,
+ get_offset(p, &p->buffer[varient->buffer_index].stride));
+
+ if (varient->instance_divisor == 0) {
+ x86_mov(p->func, p->tmp_EAX, buf_ptr);
+ x86_add(p->func, p->tmp_EAX, buf_stride);
+ if (i == 0) sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
+ x86_mov(p->func, buf_ptr, p->tmp_EAX);
+ }
+ }
+ }
+ else {
+ /* Indexed path: advance to the next element index (4 bytes). */
+ x86_lea(p->func, p->idx_EBX, x86_make_disp(p->idx_EBX, 4));
+ }
+
+ return TRUE;
+}
+
+
+/* Build run( struct translate *machine,
 * unsigned start,
 * unsigned count,
+ * unsigned instance_id,
 * void *output_buffer )
 * or
- * run_elts( struct translate *translate,
+ * run_elts( struct translate *machine,
 * unsigned *elts,
 * unsigned count,
+ * unsigned instance_id,
 * void *output_buffer )
struct x86_function *func,
boolean linear )
{
- struct x86_reg vertexECX = x86_make_reg(file_REG32, reg_AX);
- struct x86_reg idxEBX = x86_make_reg(file_REG32, reg_BX);
- struct x86_reg srcEAX = x86_make_reg(file_REG32, reg_CX);
- struct x86_reg countEBP = x86_make_reg(file_REG32, reg_BP);
- struct x86_reg translateESI = x86_make_reg(file_REG32, reg_SI);
int fixup, label;
unsigned j;
+ p->tmp_EAX = x86_make_reg(file_REG32, reg_AX);
+ p->idx_EBX = x86_make_reg(file_REG32, reg_BX);
+ p->outbuf_ECX = x86_make_reg(file_REG32, reg_CX);
+ p->machine_EDX = x86_make_reg(file_REG32, reg_DX);
+ p->count_ESI = x86_make_reg(file_REG32, reg_SI);
+
p->func = func;
p->loaded_inv_255 = FALSE;
p->loaded_255 = FALSE;
/* Push a few regs?
*/
- x86_push(p->func, countEBP);
- x86_push(p->func, translateESI);
- x86_push(p->func, idxEBX);
+ x86_push(p->func, p->idx_EBX);
+ x86_push(p->func, p->count_ESI);
- /* Get vertex count, compare to zero
+ /* Load arguments into regs:
*/
- x86_xor(p->func, idxEBX, idxEBX);
- x86_mov(p->func, countEBP, x86_fn_arg(p->func, 3));
- x86_cmp(p->func, countEBP, idxEBX);
- fixup = x86_jcc_forward(p->func, cc_E);
+ x86_mov(p->func, p->machine_EDX, x86_fn_arg(p->func, 1));
+ x86_mov(p->func, p->idx_EBX, x86_fn_arg(p->func, 2));
+ x86_mov(p->func, p->count_ESI, x86_fn_arg(p->func, 3));
+ x86_mov(p->func, p->outbuf_ECX, x86_fn_arg(p->func, 5));
- /* If linear, idx is the current element, otherwise it is a pointer
- * to the current element.
+ /* Load instance ID.
*/
- x86_mov(p->func, idxEBX, x86_fn_arg(p->func, 2));
-
- /* Initialize destination register.
- */
- x86_mov(p->func, vertexECX, x86_fn_arg(p->func, 4));
+ if (p->use_instancing) {
+ x86_mov(p->func,
+ p->tmp_EAX,
+ x86_fn_arg(p->func, 4));
+ x86_mov(p->func,
+ x86_make_disp(p->machine_EDX, get_offset(p, &p->instance_id)),
+ p->tmp_EAX);
+ }
- /* Move argument 1 (translate_sse pointer) into a reg:
+ /* Get vertex count, compare to zero
*/
- x86_mov(p->func, translateESI, x86_fn_arg(p->func, 1));
+ x86_xor(p->func, p->tmp_EAX, p->tmp_EAX);
+ x86_cmp(p->func, p->count_ESI, p->tmp_EAX);
+ fixup = x86_jcc_forward(p->func, cc_E);
-
/* always load, needed or not:
*/
+ init_inputs(p, linear);
- /* Note address for loop jump */
+ /* Note address for loop jump
+ */
label = x86_get_label(p->func);
-
-
- for (j = 0; j < p->translate.key.nr_elements; j++) {
- const struct translate_element *a = &p->translate.key.element[j];
-
- struct x86_reg destEAX = x86_make_disp(vertexECX,
- a->output_offset);
-
- /* Figure out source pointer address:
- */
- if (linear) {
- get_src_ptr(p, srcEAX, translateESI, idxEBX, j);
- }
- else {
- get_src_ptr(p, srcEAX, translateESI, x86_deref(idxEBX), j);
+ {
+ struct x86_reg elt = linear ? p->idx_EBX : x86_deref(p->idx_EBX);
+ int last_varient = -1;
+ struct x86_reg vb;
+
+ for (j = 0; j < p->translate.key.nr_elements; j++) {
+ const struct translate_element *a = &p->translate.key.element[j];
+ unsigned varient = p->element_to_buffer_varient[j];
+
+ /* Figure out source pointer address:
+ */
+ if (varient != last_varient) {
+ last_varient = varient;
+ vb = get_buffer_ptr(p, linear, varient, elt);
+ }
+
+ if (!translate_attr( p, a,
+ x86_make_disp(vb, a->input_offset),
+ x86_make_disp(p->outbuf_ECX, a->output_offset)))
+ return FALSE;
}
- if (!translate_attr( p, a, x86_deref(srcEAX), destEAX ))
- return FALSE;
- }
-
- /* Next vertex:
- */
- x86_lea(p->func, vertexECX, x86_make_disp(vertexECX, p->translate.key.output_stride));
-
- /* Incr index
- */
- if (linear) {
- x86_inc(p->func, idxEBX);
- }
- else {
- x86_lea(p->func, idxEBX, x86_make_disp(idxEBX, 4));
+ /* Next output vertex:
+ */
+ x86_lea(p->func,
+ p->outbuf_ECX,
+ x86_make_disp(p->outbuf_ECX,
+ p->translate.key.output_stride));
+
+ /* Incr index
+ */
+ incr_inputs( p, linear );
}
/* decr count, loop if not zero
*/
- x86_dec(p->func, countEBP);
- x86_test(p->func, countEBP, countEBP);
+ x86_dec(p->func, p->count_ESI);
x86_jcc(p->func, cc_NZ, label);
/* Exit mmx state?
/* Pop regs and return
*/
- x86_pop(p->func, idxEBX);
- x86_pop(p->func, translateESI);
- x86_pop(p->func, countEBP);
+ x86_pop(p->func, p->count_ESI);
+ x86_pop(p->func, p->idx_EBX);
x86_ret(p->func);
return TRUE;
unsigned stride )
{
struct translate_sse *p = (struct translate_sse *)translate;
- unsigned i;
- for (i = 0; i < p->translate.key.nr_elements; i++) {
- if (p->translate.key.element[i].input_buffer == buf) {
- p->attrib[i].input_ptr = ((char *)ptr +
- p->translate.key.element[i].input_offset);
- p->attrib[i].input_stride = stride;
- }
+ if (buf < p->nr_buffers) {
+ p->buffer[buf].base_ptr = (char *)ptr;
+ p->buffer[buf].stride = stride;
}
+
+ if (0) debug_printf("%s %d/%d: %p %d\n",
+ __FUNCTION__, buf,
+ p->nr_buffers,
+ ptr, stride);
}
+/* C-callable entry point for indexed runs: forwards all arguments,
+ * now including instance_id, to the generated code.
+ */
static void PIPE_CDECL translate_sse_run_elts( struct translate *translate,
const unsigned *elts,
unsigned count,
+ unsigned instance_id,
void *output_buffer )
{
struct translate_sse *p = (struct translate_sse *)translate;
p->gen_run_elts( translate,
elts,
count,
- output_buffer );
+ instance_id,
+ output_buffer);
}
+/* C-callable entry point for linear runs: forwards all arguments,
+ * now including instance_id, to the generated code.
+ */
static void PIPE_CDECL translate_sse_run( struct translate *translate,
unsigned start,
unsigned count,
+ unsigned instance_id,
void *output_buffer )
{
struct translate_sse *p = (struct translate_sse *)translate;
p->gen_run( translate,
start,
count,
- output_buffer );
+ instance_id,
+ output_buffer);
}
struct translate *translate_sse2_create( const struct translate_key *key )
{
struct translate_sse *p = NULL;
+ unsigned i;
if (!rtasm_cpu_has_sse() || !rtasm_cpu_has_sse2())
goto fail;
p->translate.run_elts = translate_sse_run_elts;
p->translate.run = translate_sse_run;
+ for (i = 0; i < key->nr_elements; i++) {
+ if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) {
+ unsigned j;
+
+ p->nr_buffers = MAX2(p->nr_buffers, key->element[i].input_buffer + 1);
+
+ if (key->element[i].instance_divisor) {
+ p->use_instancing = TRUE;
+ }
+
+ /*
+ * Map vertex element to vertex buffer varient.
+ */
+ for (j = 0; j < p->nr_buffer_varients; j++) {
+ if (p->buffer_varient[j].buffer_index == key->element[i].input_buffer &&
+ p->buffer_varient[j].instance_divisor == key->element[i].instance_divisor) {
+ break;
+ }
+ }
+ if (j == p->nr_buffer_varients) {
+ p->buffer_varient[j].buffer_index = key->element[i].input_buffer;
+ p->buffer_varient[j].instance_divisor = key->element[i].instance_divisor;
+ p->nr_buffer_varients++;
+ }
+ p->element_to_buffer_varient[i] = j;
+ } else {
+ assert(key->element[i].type == TRANSLATE_ELEMENT_INSTANCE_ID);
+
+ p->element_to_buffer_varient[i] = ELEMENT_BUFFER_INSTANCE_ID;
+ }
+ }
+
+ if (0) debug_printf("nr_buffers: %d\n", p->nr_buffers);
+
if (!build_vertex_emit(p, &p->linear_func, TRUE))
goto fail;