/*
 * (C) Copyright IBM Corporation 2008
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
#include <stdint.h>

#include "pipe/p_defines.h"
#include "pipe/p_context.h"
#include "pipe/p_format.h"

#include "../auxiliary/draw/draw_context.h"
#include "../auxiliary/draw/draw_private.h"

#include "cell_context.h"
#include "rtasm/rtasm_ppc_spe.h"
/**
 * Emit a 4x4 matrix transpose operation
 *
 * \param p         Function that the transpose operation is to be appended to
 * \param row0      Register containing row 0 of the source matrix
 * \param row1      Register containing row 1 of the source matrix
 * \param row2      Register containing row 2 of the source matrix
 * \param row3      Register containing row 3 of the source matrix
 * \param dest_ptr  Register containing the address of the destination matrix
 * \param shuf_ptr  Register containing the address of the shuffle patterns;
 *                  the quadwords at byte offsets 3*16 and 4*16 are loaded
 * \param count     Number of columns to actually be written to the destination
 *
 * \note
 * This function assumes that the registers named by \c row0, \c row1,
 * \c row2, and \c row3 are scratch and can be modified by the generated code.
 * Furthermore, these registers will be released, via calls to
 * \c spe_release_register, by this function.
 *
 * \note
 * This function requires that four temporary registers are available on
 * entry.
 *
 * NOTE(review): this body was rebuilt from a listing that had lost lines.
 * The register aliasing follows the in-line comments and the release list;
 * the \c count guards and the fall-through store switch are inferred from
 * the \c count contract and the descending store order -- confirm against
 * the upstream file.
 */
static void
emit_matrix_transpose(struct spe_function *p,
                      unsigned row0, unsigned row1, unsigned row2,
                      unsigned row3, unsigned dest_ptr,
                      unsigned shuf_ptr, unsigned count)
{
   int shuf_hi = spe_allocate_available_register(p);
   int shuf_lo = spe_allocate_available_register(p);
   int t1 = spe_allocate_available_register(p);
   int t2 = spe_allocate_available_register(p);
   int t3;
   int t4;
   int col0;
   int col1;
   int col2;
   int col3;


   /* Fetch the two shuffle patterns, then combine rows 0 and 2 with each.
    */
   spe_lqd(p, shuf_hi, shuf_ptr, 3*16);
   spe_lqd(p, shuf_lo, shuf_ptr, 4*16);
   spe_shufb(p, t1, row0, row2, shuf_hi);
   spe_shufb(p, t2, row0, row2, shuf_lo);


   /* row0 and row2 are now no longer needed.  Re-use those registers as
    * temporaries.
    */
   t3 = row0;
   t4 = row2;

   spe_shufb(p, t3, row1, row3, shuf_hi);
   spe_shufb(p, t4, row1, row3, shuf_lo);


   /* row1 and row3 are now no longer needed.  Re-use those registers as
    * temporaries.
    */
   col0 = row1;
   col1 = row3;

   spe_shufb(p, col0, t1, t3, shuf_hi);
   if (count > 1) {
      spe_shufb(p, col1, t1, t3, shuf_lo);
   }


   /* t1 and t3 are now no longer needed.  Re-use those registers as
    * temporaries.
    */
   col2 = t1;
   col3 = t3;

   if (count > 2) {
      spe_shufb(p, col2, t2, t4, shuf_hi);
   }

   if (count > 3) {
      spe_shufb(p, col3, t2, t4, shuf_lo);
   }


   /* Store the results.  The offsets passed here are byte offsets.  The
    * stqd instruction itself is encoded using the qword offset (stand-alone
    * assemblers do the byte-offset to qword-offset conversion for you), so
    * the byte offset has to be divided by 16 before encoding --
    * presumably inside spe_stqd; TODO confirm.
    *
    * Each case deliberately falls through so that exactly the first
    * \c count columns are written.
    */
   switch (count) {
   case 4:
      spe_stqd(p, col3, dest_ptr, 3 * 16);
      /* fallthrough */
   case 3:
      spe_stqd(p, col2, dest_ptr, 2 * 16);
      /* fallthrough */
   case 2:
      spe_stqd(p, col1, dest_ptr, 1 * 16);
      /* fallthrough */
   case 1:
      spe_stqd(p, col0, dest_ptr, 0 * 16);
      break;
   default:
      /* Nothing is stored for any other count. */
      break;
   }


   /* Release all of the temporary registers used.  t1 and t3 do not appear
    * because col2 and col3 alias them; together this list covers the four
    * row registers and the four registers allocated above exactly once.
    */
   spe_release_register(p, col0);
   spe_release_register(p, col1);
   spe_release_register(p, col2);
   spe_release_register(p, col3);
   spe_release_register(p, shuf_hi);
   spe_release_register(p, shuf_lo);
   spe_release_register(p, t2);
   spe_release_register(p, t4);
}
149 /* This appears to not be used currently */
151 emit_fetch(struct spe_function
*p
,
152 unsigned in_ptr
, unsigned *offset
,
153 unsigned out_ptr
, unsigned shuf_ptr
,
154 enum pipe_format format
)
156 const unsigned count
= (pf_size_x(format
) != 0) + (pf_size_y(format
) != 0)
157 + (pf_size_z(format
) != 0) + (pf_size_w(format
) != 0);
158 const unsigned type
= pf_type(format
);
159 const unsigned bytes
= pf_size_x(format
);
161 int v0
= spe_allocate_available_register(p
);
162 int v1
= spe_allocate_available_register(p
);
163 int v2
= spe_allocate_available_register(p
);
164 int v3
= spe_allocate_available_register(p
);
165 int tmp
= spe_allocate_available_register(p
);
168 float scale_signed
= 0.0;
169 float scale_unsigned
= 0.0;
171 spe_lqd(p
, v0
, in_ptr
, (0 + offset
[0]) * 16);
172 spe_lqd(p
, v1
, in_ptr
, (1 + offset
[0]) * 16);
173 spe_lqd(p
, v2
, in_ptr
, (2 + offset
[0]) * 16);
174 spe_lqd(p
, v3
, in_ptr
, (3 + offset
[0]) * 16);
179 scale_signed
= 1.0f
/ 127.0f
;
180 scale_unsigned
= 1.0f
/ 255.0f
;
181 spe_lqd(p
, tmp
, shuf_ptr
, 1 * 16);
182 spe_shufb(p
, v0
, v0
, v0
, tmp
);
183 spe_shufb(p
, v1
, v1
, v1
, tmp
);
184 spe_shufb(p
, v2
, v2
, v2
, tmp
);
185 spe_shufb(p
, v3
, v3
, v3
, tmp
);
188 scale_signed
= 1.0f
/ 32767.0f
;
189 scale_unsigned
= 1.0f
/ 65535.0f
;
190 spe_lqd(p
, tmp
, shuf_ptr
, 2 * 16);
191 spe_shufb(p
, v0
, v0
, v0
, tmp
);
192 spe_shufb(p
, v1
, v1
, v1
, tmp
);
193 spe_shufb(p
, v2
, v2
, v2
, tmp
);
194 spe_shufb(p
, v3
, v3
, v3
, tmp
);
197 scale_signed
= 1.0f
/ 2147483647.0f
;
198 scale_unsigned
= 1.0f
/ 4294967295.0f
;
206 case PIPE_FORMAT_TYPE_FLOAT
:
208 case PIPE_FORMAT_TYPE_UNORM
:
209 spe_ilhu(p
, tmp
, ((unsigned) scale_unsigned
) >> 16);
210 spe_iohl(p
, tmp
, ((unsigned) scale_unsigned
) & 0x0ffff);
211 spe_cuflt(p
, v0
, v0
, 0);
212 spe_fm(p
, v0
, v0
, tmp
);
214 case PIPE_FORMAT_TYPE_SNORM
:
215 spe_ilhu(p
, tmp
, ((unsigned) scale_signed
) >> 16);
216 spe_iohl(p
, tmp
, ((unsigned) scale_signed
) & 0x0ffff);
217 spe_csflt(p
, v0
, v0
, 0);
218 spe_fm(p
, v0
, v0
, tmp
);
220 case PIPE_FORMAT_TYPE_USCALED
:
221 spe_cuflt(p
, v0
, v0
, 0);
223 case PIPE_FORMAT_TYPE_SSCALED
:
224 spe_csflt(p
, v0
, v0
, 0);
230 float_one
= spe_allocate_available_register(p
);
231 spe_il(p
, float_one
, 1);
232 spe_cuflt(p
, float_one
, float_one
, 0);
235 float_zero
= spe_allocate_available_register(p
);
236 spe_il(p
, float_zero
, 0);
240 spe_release_register(p
, tmp
);
242 emit_matrix_transpose(p
, v0
, v1
, v2
, v3
, out_ptr
, shuf_ptr
, count
);
246 spe_stqd(p
, float_zero
, out_ptr
, 1 * 16);
248 spe_stqd(p
, float_zero
, out_ptr
, 2 * 16);
250 spe_stqd(p
, float_one
, out_ptr
, 3 * 16);
253 if (float_zero
!= -1) {
254 spe_release_register(p
, float_zero
);
257 if (float_one
!= -1) {
258 spe_release_register(p
, float_one
);
264 void cell_update_vertex_fetch(struct draw_context
*draw
)
267 struct cell_context
*const cell
=
268 (struct cell_context
*) draw
->driver_private
;
269 struct spe_function
*p
= &cell
->attrib_fetch
;
270 unsigned function_index
[PIPE_MAX_ATTRIBS
];
271 unsigned unique_attr_formats
;
279 /* Determine how many unique input attribute formats there are. At the
280 * same time, store the index of the lowest numbered attribute that has
281 * the same format as any non-unique format.
283 unique_attr_formats
= 1;
284 function_index
[0] = 0;
285 for (i
= 1; i
< draw
->vertex_fetch
.nr_attrs
; i
++) {
286 const enum pipe_format curr_fmt
= draw
->vertex_element
[i
].src_format
;
288 for (j
= 0; j
< i
; j
++) {
289 if (curr_fmt
== draw
->vertex_element
[j
].src_format
) {
295 unique_attr_formats
++;
298 function_index
[i
] = j
;
302 /* Each fetch function can be a maximum of 34 instructions (note: this is
303 * actually a slight over-estimate).
305 spe_init_func(p
, 34 * SPE_INST_SIZE
* unique_attr_formats
);
308 /* Allocate registers for the function's input parameters.
310 out_ptr
= spe_allocate_register(p
, 3);
311 in_ptr
= spe_allocate_register(p
, 4);
312 shuf_ptr
= spe_allocate_register(p
, 5);
315 /* Generate code for the individual attribute fetch functions.
317 for (i
= 0; i
< draw
->vertex_fetch
.nr_attrs
; i
++) {
320 if (function_index
[i
] == i
) {
321 cell
->attrib_fetch_offsets
[i
] = (unsigned) ((void *) p
->csr
322 - (void *) p
->store
);
325 emit_fetch(p
, in_ptr
, &offset
, out_ptr
, shuf_ptr
,
326 draw
->vertex_element
[i
].src_format
);
329 /* Round up to the next 16-byte boundary.
331 if ((((unsigned) p
->store
) & 0x0f) != 0) {
332 const unsigned align
= ((unsigned) p
->store
) & 0x0f;
333 p
->store
= (uint32_t *) (((void *) p
->store
) + align
);
336 /* Use the same function entry-point as a previously seen attribute
337 * with the same format.
339 cell
->attrib_fetch_offsets
[i
] =
340 cell
->attrib_fetch_offsets
[function_index
[i
]];