Merge branch 'upstream-gallium-0.1' into nouveau-gallium-0.1
[mesa.git] / src / gallium / drivers / cell / ppu / cell_vertex_fetch.c
1 /*
2 * (C) Copyright IBM Corporation 2008
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
23 */
24
25 #include <inttypes.h>
26 #include "pipe/p_defines.h"
27 #include "pipe/p_context.h"
28 #include "pipe/p_format.h"
29
30 #include "../auxiliary/draw/draw_context.h"
31 #include "../auxiliary/draw/draw_private.h"
32
33 #include "cell_context.h"
34 #include "ppc/rtasm/spe_asm.h"
35
/* Bitmask of registers 0-63.  A set bit marks a register that is free to be
 * allocated; a clear bit marks one that is currently in use.
 */
typedef uint64_t register_mask;

/**
 * Claim the lowest-numbered free register.
 *
 * \param m  Register mask.  The bit of the register that is claimed is
 *           cleared.
 *
 * \return
 * The index (0 through 63) of the register that was claimed, or -1 if no
 * bit is set in the mask.
 */
int allocate_available_register(register_mask *m)
{
   unsigned reg = 0;

   while (reg < 64) {
      const uint64_t bit = 1ULL << reg;

      if (*m & bit) {
         *m &= ~bit;
         return (int) reg;
      }

      reg++;
   }

   return -1;
}
52
53
54 int allocate_register(register_mask *m, unsigned reg)
55 {
56 assert((m[0] & (1ULL << reg)) != 0);
57
58 m[0] &= ~(1ULL << reg);
59 return reg;
60 }
61
62
63 void release_register(register_mask *m, unsigned reg)
64 {
65 assert((m[0] & (1ULL << reg)) == 0);
66
67 m[0] |= (1ULL << reg);
68 }
69
70
71 /**
72 * Emit a 4x4 matrix transpose operation
73 *
74 * \param p Function that the transpose operation is to be appended to
75 * \param m Live register mask
76 * \param row0 Register containing row 0 of the source matrix
77 * \param row1 Register containing row 1 of the source matrix
78 * \param row2 Register containing row 2 of the source matrix
79 * \param row3 Register containing row 3 of the source matrix
80 * \param dest_ptr Register containing the address of the destination matrix
81 * \param shuf_ptr Register containing the address of the shuffled data
82 * \param count Number of colums to actually be written to the destination
83 *
84 * \note
85 * This function assumes that the registers named by \c row0, \c row1,
86 * \c row2, and \c row3 are scratch and can be modified by the generated code.
87 * Furthermore, these registers will be released, via calls to
88 * \c release_register, by this function.
89 *
90 * \note
91 * This function requires that four temporary are available on entry.
92 */
93 static void
94 emit_matrix_transpose(struct spe_function *p, register_mask *m,
95 unsigned row0, unsigned row1, unsigned row2,
96 unsigned row3, unsigned dest_ptr,
97 unsigned shuf_ptr, unsigned count)
98 {
99 int shuf_hi = allocate_available_register(m);
100 int shuf_lo = allocate_available_register(m);
101 int t1 = allocate_available_register(m);
102 int t2 = allocate_available_register(m);
103 int t3;
104 int t4;
105 int col0;
106 int col1;
107 int col2;
108 int col3;
109
110
111 spe_lqd(p, shuf_hi, shuf_ptr, 3);
112 spe_lqd(p, shuf_lo, shuf_ptr, 4);
113 spe_shufb(p, t1, row0, row2, shuf_hi);
114 spe_shufb(p, t2, row0, row2, shuf_lo);
115
116
117 /* row0 and row2 are now no longer needed. Re-use those registers as
118 * temporaries.
119 */
120 t3 = row0;
121 t4 = row2;
122
123 spe_shufb(p, t3, row1, row3, shuf_hi);
124 spe_shufb(p, t4, row1, row3, shuf_lo);
125
126
127 /* row1 and row3 are now no longer needed. Re-use those registers as
128 * temporaries.
129 */
130 col0 = row1;
131 col1 = row3;
132
133 spe_shufb(p, col0, t1, t3, shuf_hi);
134 if (count > 1) {
135 spe_shufb(p, col1, t1, t3, shuf_lo);
136 }
137
138 /* t1 and t3 are now no longer needed. Re-use those registers as
139 * temporaries.
140 */
141 col2 = t1;
142 col3 = t3;
143
144 if (count > 2) {
145 spe_shufb(p, col2, t2, t4, shuf_hi);
146 }
147
148 if (count > 3) {
149 spe_shufb(p, col3, t2, t4, shuf_lo);
150 }
151
152
153 /* Store the results. Remember that the stqd instruction is encoded using
154 * the qword offset (stand-alone assemblers to the byte-offset to
155 * qword-offset conversion for you), so the byte-offset needs be divided by
156 * 16.
157 */
158 switch (count) {
159 case 4:
160 spe_stqd(p, col3, dest_ptr, 3);
161 case 3:
162 spe_stqd(p, col2, dest_ptr, 2);
163 case 2:
164 spe_stqd(p, col1, dest_ptr, 1);
165 case 1:
166 spe_stqd(p, col0, dest_ptr, 0);
167 }
168
169
170 /* Release all of the temporary registers used.
171 */
172 release_register(m, col0);
173 release_register(m, col1);
174 release_register(m, col2);
175 release_register(m, col3);
176 release_register(m, shuf_hi);
177 release_register(m, shuf_lo);
178 release_register(m, t2);
179 release_register(m, t4);
180 }
181
182
183 static void
184 emit_fetch(struct spe_function *p, register_mask *m,
185 unsigned in_ptr, unsigned *offset,
186 unsigned out_ptr, unsigned shuf_ptr,
187 enum pipe_format format)
188 {
189 const unsigned count = (pf_size_x(format) != 0) + (pf_size_y(format) != 0)
190 + (pf_size_z(format) != 0) + (pf_size_w(format) != 0);
191 const unsigned type = pf_type(format);
192 const unsigned bytes = pf_size_x(format);
193
194 int v0 = allocate_available_register(m);
195 int v1 = allocate_available_register(m);
196 int v2 = allocate_available_register(m);
197 int v3 = allocate_available_register(m);
198 int tmp = allocate_available_register(m);
199 int float_zero = -1;
200 int float_one = -1;
201 float scale_signed = 0.0;
202 float scale_unsigned = 0.0;
203
204 spe_lqd(p, v0, in_ptr, 0 + offset[0]);
205 spe_lqd(p, v1, in_ptr, 1 + offset[0]);
206 spe_lqd(p, v2, in_ptr, 2 + offset[0]);
207 spe_lqd(p, v3, in_ptr, 3 + offset[0]);
208 offset[0] += 4;
209
210 switch (bytes) {
211 case 1:
212 scale_signed = 1.0f / 127.0f;
213 scale_unsigned = 1.0f / 255.0f;
214 spe_lqd(p, tmp, shuf_ptr, 1);
215 spe_shufb(p, v0, v0, v0, tmp);
216 spe_shufb(p, v1, v1, v1, tmp);
217 spe_shufb(p, v2, v2, v2, tmp);
218 spe_shufb(p, v3, v3, v3, tmp);
219 break;
220 case 2:
221 scale_signed = 1.0f / 32767.0f;
222 scale_unsigned = 1.0f / 65535.0f;
223 spe_lqd(p, tmp, shuf_ptr, 2);
224 spe_shufb(p, v0, v0, v0, tmp);
225 spe_shufb(p, v1, v1, v1, tmp);
226 spe_shufb(p, v2, v2, v2, tmp);
227 spe_shufb(p, v3, v3, v3, tmp);
228 break;
229 case 4:
230 scale_signed = 1.0f / 2147483647.0f;
231 scale_unsigned = 1.0f / 4294967295.0f;
232 break;
233 default:
234 assert(0);
235 break;
236 }
237
238 switch (type) {
239 case PIPE_FORMAT_TYPE_FLOAT:
240 break;
241 case PIPE_FORMAT_TYPE_UNORM:
242 spe_ilhu(p, tmp, ((unsigned) scale_unsigned) >> 16);
243 spe_iohl(p, tmp, ((unsigned) scale_unsigned) & 0x0ffff);
244 spe_cuflt(p, v0, v0, 0);
245 spe_fm(p, v0, v0, tmp);
246 break;
247 case PIPE_FORMAT_TYPE_SNORM:
248 spe_ilhu(p, tmp, ((unsigned) scale_signed) >> 16);
249 spe_iohl(p, tmp, ((unsigned) scale_signed) & 0x0ffff);
250 spe_csflt(p, v0, v0, 0);
251 spe_fm(p, v0, v0, tmp);
252 break;
253 case PIPE_FORMAT_TYPE_USCALED:
254 spe_cuflt(p, v0, v0, 0);
255 break;
256 case PIPE_FORMAT_TYPE_SSCALED:
257 spe_csflt(p, v0, v0, 0);
258 break;
259 }
260
261
262 if (count < 4) {
263 float_one = allocate_available_register(m);
264 spe_il(p, float_one, 1);
265 spe_cuflt(p, float_one, float_one, 0);
266
267 if (count < 3) {
268 float_zero = allocate_available_register(m);
269 spe_il(p, float_zero, 0);
270 }
271 }
272
273 release_register(m, tmp);
274
275 emit_matrix_transpose(p, m, v0, v1, v2, v3, out_ptr, shuf_ptr, count);
276
277 switch (count) {
278 case 1:
279 spe_stqd(p, float_zero, out_ptr, 1);
280 case 2:
281 spe_stqd(p, float_zero, out_ptr, 2);
282 case 3:
283 spe_stqd(p, float_one, out_ptr, 3);
284 }
285
286 if (float_zero != -1) {
287 release_register(m, float_zero);
288 }
289
290 if (float_one != -1) {
291 release_register(m, float_one);
292 }
293 }
294
295
296 void cell_update_vertex_fetch(struct draw_context *draw)
297 {
298 struct cell_context *const cell =
299 (struct cell_context *) draw->driver_private;
300 register_mask m = ~0;
301 struct spe_function *p = &cell->attrib_fetch;
302 unsigned function_index[PIPE_ATTRIB_MAX];
303 unsigned unique_attr_formats;
304 int out_ptr;
305 int in_ptr;
306 int shuf_ptr;
307 unsigned i;
308 unsigned j;
309
310
311 /* Determine how many unique input attribute formats there are. At the
312 * same time, store the index of the lowest numbered attribute that has
313 * the same format as any non-unique format.
314 */
315 unique_attr_formats = 1;
316 function_index[0] = 0;
317 for (i = 1; i < draw->vertex_fetch.nr_attrs; i++) {
318 const enum pipe_format curr_fmt = draw->vertex_element[i].src_format;
319
320 for (j = 0; j < i; j++) {
321 if (curr_fmt == draw->vertex_element[j].src_format) {
322 break;
323 }
324 }
325
326 if (j == i) {
327 unique_attr_formats++;
328 }
329
330 function_index[i] = j;
331 }
332
333
334 /* Each fetch function can be a maximum of 34 instructions (note: this is
335 * actually a slight over-estimate). That means (34 * 4) = 136 bytes
336 * each maximum.
337 */
338 spe_init_func(p, 136 * unique_attr_formats);
339
340
341 /* Registers 0, 1, and 2 are reserved by the ABI.
342 */
343 allocate_register(&m, 0);
344 allocate_register(&m, 1);
345 allocate_register(&m, 2);
346
347
348 /* Allocate registers for the function's input parameters.
349 */
350 out_ptr = allocate_register(&m, 3);
351 in_ptr = allocate_register(&m, 4);
352 shuf_ptr = allocate_register(&m, 5);
353
354
355 /* Generate code for the individual attribute fetch functions.
356 */
357 for (i = 0; i < draw->vertex_fetch.nr_attrs; i++) {
358 unsigned offset;
359
360 if (function_index[i] == i) {
361 cell->attrib_fetch_offsets[i] = (unsigned) ((void *) p->csr
362 - (void *) p->store);
363
364 offset = 0;
365 emit_fetch(p, & m, in_ptr, &offset, out_ptr, shuf_ptr,
366 draw->vertex_element[i].src_format);
367 spe_bi(p, 0, 0, 0);
368
369 /* Round up to the next 16-byte boundary.
370 */
371 if ((((unsigned) p->store) & 0x0f) != 0) {
372 const unsigned align = ((unsigned) p->store) & 0x0f;
373 p->store = (uint32_t *) (((void *) p->store) + align);
374 }
375 } else {
376 /* Use the same function entry-point as a previously seen attribute
377 * with the same format.
378 */
379 cell->attrib_fetch_offsets[i] =
380 cell->attrib_fetch_offsets[function_index[i]];
381 }
382 }
383 }