2 * (C) Copyright IBM Corporation 2008
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
26 #include "pipe/p_defines.h"
27 #include "pipe/p_context.h"
28 #include "pipe/p_format.h"
30 #include "../auxiliary/draw/draw_context.h"
31 #include "../auxiliary/draw/draw_private.h"
33 #include "cell_context.h"
34 #include "ppc/rtasm/spe_asm.h"
/**
 * Bit mask tracking register availability.
 *
 * Bit \c n set means register \c n is available; bit \c n clear means it is
 * currently allocated.  A mask can therefore track at most 64 registers.
 */
typedef uint64_t register_mask;


/**
 * Allocate the lowest-numbered register that is still available.
 *
 * \param m  Register mask.  The bit of the chosen register is cleared.
 *
 * \return
 * The index of the register that was allocated, or -1 if no bit is set in
 * \c m (i.e., every tracked register is already in use).
 */
int allocate_available_register(register_mask *m)
{
   unsigned i;

   for (i = 0; i < 64; i++) {
      const uint64_t mask = (1ULL << i);

      if ((m[0] & mask) != 0) {
         m[0] &= ~mask;
         return (int) i;
      }
   }

   return -1;
}


/**
 * Allocate one specific register.
 *
 * \param m    Register mask.  The bit for \c reg is cleared.
 * \param reg  Register to allocate.  It must be less than 64 and must
 *             currently be available; both conditions are asserted.
 *
 * \return
 * \c reg, so that the call can be used directly as an initializer.
 */
int allocate_register(register_mask *m, unsigned reg)
{
   /* Shifting a 64-bit value by 64 or more is undefined behavior. */
   assert(reg < 64);
   assert((m[0] & (1ULL << reg)) != 0);

   m[0] &= ~(1ULL << reg);
   return (int) reg;
}


/**
 * Return a previously allocated register to the pool.
 *
 * \param m    Register mask.  The bit for \c reg is set.
 * \param reg  Register to release.  It must be less than 64 and must
 *             currently be allocated; both conditions are asserted.
 */
void release_register(register_mask *m, unsigned reg)
{
   /* Shifting a 64-bit value by 64 or more is undefined behavior. */
   assert(reg < 64);
   assert((m[0] & (1ULL << reg)) == 0);

   m[0] |= (1ULL << reg);
}
72 * Emit a 4x4 matrix transpose operation
74 * \param p Function that the transpose operation is to be appended to
75 * \param m Live register mask
76 * \param row0 Register containing row 0 of the source matrix
77 * \param row1 Register containing row 1 of the source matrix
78 * \param row2 Register containing row 2 of the source matrix
79 * \param row3 Register containing row 3 of the source matrix
80 * \param dest_ptr Register containing the address of the destination matrix
81 * \param shuf_ptr Register containing the address of the shuffled data
82 * \param count Number of colums to actually be written to the destination
85 * This function assumes that the registers named by \c row0, \c row1,
86 * \c row2, and \c row3 are scratch and can be modified by the generated code.
87 * Furthermore, these registers will be released, via calls to
88 * \c release_register, by this function.
91 * This function requires that four temporary are available on entry.
94 emit_matrix_transpose(struct spe_function
*p
, register_mask
*m
,
95 unsigned row0
, unsigned row1
, unsigned row2
,
96 unsigned row3
, unsigned dest_ptr
,
97 unsigned shuf_ptr
, unsigned count
)
99 int shuf_hi
= allocate_available_register(m
);
100 int shuf_lo
= allocate_available_register(m
);
101 int t1
= allocate_available_register(m
);
102 int t2
= allocate_available_register(m
);
111 spe_lqd(p
, shuf_hi
, shuf_ptr
, 3);
112 spe_lqd(p
, shuf_lo
, shuf_ptr
, 4);
113 spe_shufb(p
, t1
, row0
, row2
, shuf_hi
);
114 spe_shufb(p
, t2
, row0
, row2
, shuf_lo
);
117 /* row0 and row2 are now no longer needed. Re-use those registers as
123 spe_shufb(p
, t3
, row1
, row3
, shuf_hi
);
124 spe_shufb(p
, t4
, row1
, row3
, shuf_lo
);
127 /* row1 and row3 are now no longer needed. Re-use those registers as
133 spe_shufb(p
, col0
, t1
, t3
, shuf_hi
);
135 spe_shufb(p
, col1
, t1
, t3
, shuf_lo
);
138 /* t1 and t3 are now no longer needed. Re-use those registers as
145 spe_shufb(p
, col2
, t2
, t4
, shuf_hi
);
149 spe_shufb(p
, col3
, t2
, t4
, shuf_lo
);
153 /* Store the results. Remember that the stqd instruction is encoded using
154 * the qword offset (stand-alone assemblers to the byte-offset to
155 * qword-offset conversion for you), so the byte-offset needs be divided by
160 spe_stqd(p
, col3
, dest_ptr
, 3);
162 spe_stqd(p
, col2
, dest_ptr
, 2);
164 spe_stqd(p
, col1
, dest_ptr
, 1);
166 spe_stqd(p
, col0
, dest_ptr
, 0);
170 /* Release all of the temporary registers used.
172 release_register(m
, col0
);
173 release_register(m
, col1
);
174 release_register(m
, col2
);
175 release_register(m
, col3
);
176 release_register(m
, shuf_hi
);
177 release_register(m
, shuf_lo
);
178 release_register(m
, t2
);
179 release_register(m
, t4
);
184 emit_fetch(struct spe_function
*p
, register_mask
*m
,
185 unsigned in_ptr
, unsigned *offset
,
186 unsigned out_ptr
, unsigned shuf_ptr
,
187 enum pipe_format format
)
189 const unsigned count
= (pf_size_x(format
) != 0) + (pf_size_y(format
) != 0)
190 + (pf_size_z(format
) != 0) + (pf_size_w(format
) != 0);
191 const unsigned type
= pf_type(format
);
192 const unsigned bytes
= pf_size_x(format
);
194 int v0
= allocate_available_register(m
);
195 int v1
= allocate_available_register(m
);
196 int v2
= allocate_available_register(m
);
197 int v3
= allocate_available_register(m
);
198 int tmp
= allocate_available_register(m
);
201 float scale_signed
= 0.0;
202 float scale_unsigned
= 0.0;
204 spe_lqd(p
, v0
, in_ptr
, 0 + offset
[0]);
205 spe_lqd(p
, v1
, in_ptr
, 1 + offset
[0]);
206 spe_lqd(p
, v2
, in_ptr
, 2 + offset
[0]);
207 spe_lqd(p
, v3
, in_ptr
, 3 + offset
[0]);
212 scale_signed
= 1.0f
/ 127.0f
;
213 scale_unsigned
= 1.0f
/ 255.0f
;
214 spe_lqd(p
, tmp
, shuf_ptr
, 1);
215 spe_shufb(p
, v0
, v0
, v0
, tmp
);
216 spe_shufb(p
, v1
, v1
, v1
, tmp
);
217 spe_shufb(p
, v2
, v2
, v2
, tmp
);
218 spe_shufb(p
, v3
, v3
, v3
, tmp
);
221 scale_signed
= 1.0f
/ 32767.0f
;
222 scale_unsigned
= 1.0f
/ 65535.0f
;
223 spe_lqd(p
, tmp
, shuf_ptr
, 2);
224 spe_shufb(p
, v0
, v0
, v0
, tmp
);
225 spe_shufb(p
, v1
, v1
, v1
, tmp
);
226 spe_shufb(p
, v2
, v2
, v2
, tmp
);
227 spe_shufb(p
, v3
, v3
, v3
, tmp
);
230 scale_signed
= 1.0f
/ 2147483647.0f
;
231 scale_unsigned
= 1.0f
/ 4294967295.0f
;
239 case PIPE_FORMAT_TYPE_FLOAT
:
241 case PIPE_FORMAT_TYPE_UNORM
:
242 spe_ilhu(p
, tmp
, ((unsigned) scale_unsigned
) >> 16);
243 spe_iohl(p
, tmp
, ((unsigned) scale_unsigned
) & 0x0ffff);
244 spe_cuflt(p
, v0
, v0
, 0);
245 spe_fm(p
, v0
, v0
, tmp
);
247 case PIPE_FORMAT_TYPE_SNORM
:
248 spe_ilhu(p
, tmp
, ((unsigned) scale_signed
) >> 16);
249 spe_iohl(p
, tmp
, ((unsigned) scale_signed
) & 0x0ffff);
250 spe_csflt(p
, v0
, v0
, 0);
251 spe_fm(p
, v0
, v0
, tmp
);
253 case PIPE_FORMAT_TYPE_USCALED
:
254 spe_cuflt(p
, v0
, v0
, 0);
256 case PIPE_FORMAT_TYPE_SSCALED
:
257 spe_csflt(p
, v0
, v0
, 0);
263 float_one
= allocate_available_register(m
);
264 spe_il(p
, float_one
, 1);
265 spe_cuflt(p
, float_one
, float_one
, 0);
268 float_zero
= allocate_available_register(m
);
269 spe_il(p
, float_zero
, 0);
273 release_register(m
, tmp
);
275 emit_matrix_transpose(p
, m
, v0
, v1
, v2
, v3
, out_ptr
, shuf_ptr
, count
);
279 spe_stqd(p
, float_zero
, out_ptr
, 1);
281 spe_stqd(p
, float_zero
, out_ptr
, 2);
283 spe_stqd(p
, float_one
, out_ptr
, 3);
286 if (float_zero
!= -1) {
287 release_register(m
, float_zero
);
290 if (float_one
!= -1) {
291 release_register(m
, float_one
);
296 void cell_update_vertex_fetch(struct draw_context
*draw
)
298 struct cell_context
*const cell
=
299 (struct cell_context
*) draw
->driver_private
;
300 register_mask m
= ~0;
301 struct spe_function
*p
= &cell
->attrib_fetch
;
302 unsigned function_index
[PIPE_ATTRIB_MAX
];
303 unsigned unique_attr_formats
;
311 /* Determine how many unique input attribute formats there are. At the
312 * same time, store the index of the lowest numbered attribute that has
313 * the same format as any non-unique format.
315 unique_attr_formats
= 1;
316 function_index
[0] = 0;
317 for (i
= 1; i
< draw
->vertex_fetch
.nr_attrs
; i
++) {
318 const enum pipe_format curr_fmt
= draw
->vertex_element
[i
].src_format
;
320 for (j
= 0; j
< i
; j
++) {
321 if (curr_fmt
== draw
->vertex_element
[j
].src_format
) {
327 unique_attr_formats
++;
330 function_index
[i
] = j
;
334 /* Each fetch function can be a maximum of 34 instructions (note: this is
335 * actually a slight over-estimate). That means (34 * 4) = 136 bytes
338 spe_init_func(p
, 136 * unique_attr_formats
);
341 /* Registers 0, 1, and 2 are reserved by the ABI.
343 allocate_register(&m
, 0);
344 allocate_register(&m
, 1);
345 allocate_register(&m
, 2);
348 /* Allocate registers for the function's input parameters.
350 out_ptr
= allocate_register(&m
, 3);
351 in_ptr
= allocate_register(&m
, 4);
352 shuf_ptr
= allocate_register(&m
, 5);
355 /* Generate code for the individual attribute fetch functions.
357 for (i
= 0; i
< draw
->vertex_fetch
.nr_attrs
; i
++) {
360 if (function_index
[i
] == i
) {
361 cell
->attrib_fetch_offsets
[i
] = (unsigned) ((void *) p
->csr
362 - (void *) p
->store
);
365 emit_fetch(p
, & m
, in_ptr
, &offset
, out_ptr
, shuf_ptr
,
366 draw
->vertex_element
[i
].src_format
);
369 /* Round up to the next 16-byte boundary.
371 if ((((unsigned) p
->store
) & 0x0f) != 0) {
372 const unsigned align
= ((unsigned) p
->store
) & 0x0f;
373 p
->store
= (uint32_t *) (((void *) p
->store
) + align
);
376 /* Use the same function entry-point as a previously seen attribute
377 * with the same format.
379 cell
->attrib_fetch_offsets
[i
] =
380 cell
->attrib_fetch_offsets
[function_index
[i
]];