*/
#include <spu_mfcio.h>
+#include <transpose_matrix4x4.h>
+
#include "pipe/p_util.h"
#include "pipe/p_state.h"
#include "pipe/p_shader_tokens.h"
}
-void
-spu_transpose_4x4(qword *out, const qword *in)
-{
- static const qword masks[8] = {
- {
- 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
- 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
- },
- {
- 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
- 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
- },
-
- {
- 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
- 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
- },
- {
- 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
- 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
- },
-
- {
- 0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b,
- 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
- },
- {
- 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
- 0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b,
- },
-
- {
- 0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f,
- 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
- },
- {
- 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
- 0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f,
- },
- };
-
- out[0] = si_shufb(in[0], in[1], masks[0]);
- out[0] = si_or(out[0], si_shufb(in[2], in[3], masks[1]));
-
- out[1] = si_shufb(in[0], in[1], masks[2]);
- out[1] = si_or(out[1], si_shufb(in[2], in[3], masks[3]));
-
- out[2] = si_shufb(in[0], in[1], masks[4]);
- out[2] = si_or(out[2], si_shufb(in[2], in[3], masks[5]));
-
- out[3] = si_shufb(in[0], in[1], masks[6]);
- out[3] = si_or(out[3], si_shufb(in[2], in[3], masks[7]));
-}
-
-
/**
* Fetch vertex attributes for 'count' vertices.
*/
* excessive number of fetch functions, but we could at least
* minimize the transpose step:
*/
- spu_transpose_4x4(&machine->Inputs[attr].xyzw[0].q, p);
+ _transpose_matrix4x4(&machine->Inputs[attr].xyzw[0].q, p);
}
}