2ece0250f6f49e8d91d36e28acd41b8c3de15c5c
[mesa.git] / src / gallium / drivers / cell / ppu / cell_vertex_fetch.c
1 /*
2 * (C) Copyright IBM Corporation 2008
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
23 */
24
25 #include <inttypes.h>
26 #include "pipe/p_defines.h"
27 #include "pipe/p_context.h"
28 #include "pipe/p_format.h"
29
30 #include "../auxiliary/draw/draw_context.h"
31 #include "../auxiliary/draw/draw_private.h"
32
33 #include "cell_context.h"
34 #include "rtasm/rtasm_ppc_spe.h"
35
36
/**
 * Generate SPE code that transposes a 4x4 matrix and stores the result.
 *
 * \param p         Function the generated instructions are appended to
 * \param row0      Register holding row 0 of the source matrix
 * \param row1      Register holding row 1 of the source matrix
 * \param row2      Register holding row 2 of the source matrix
 * \param row3      Register holding row 3 of the source matrix
 * \param dest_ptr  Register holding the address of the destination matrix
 * \param shuf_ptr  Register holding the address of the shuffle-pattern table;
 *                  quadwords 3 and 4 of that table are used here
 * \param count     Number of columns actually written to the destination
 *
 * \note
 * The registers named by \c row0, \c row1, \c row2 and \c row3 are treated
 * as scratch: the generated code overwrites them, and this function hands
 * them back with \c spe_release_register before returning.
 *
 * \note
 * Four additional registers are allocated by this function, so four
 * temporaries must be available on entry.
 */
static void
emit_matrix_transpose(struct spe_function *p,
                      unsigned row0, unsigned row1, unsigned row2,
                      unsigned row3, unsigned dest_ptr,
                      unsigned shuf_ptr, unsigned count)
{
   const int pattern_hi = spe_allocate_available_register(p);
   const int pattern_lo = spe_allocate_available_register(p);
   const int mix02_hi = spe_allocate_available_register(p);
   const int mix02_lo = spe_allocate_available_register(p);

   /* Registers are recycled as soon as their previous contents have been
    * consumed: rows 0 and 2 receive the row 1 / row 3 mixes, rows 1 and 3
    * receive output columns 0 and 1, and the first-pass results that feed
    * columns 0 and 1 are later overwritten with columns 2 and 3.
    */
   const int mix13_hi = row0;
   const int mix13_lo = row2;
   const int out0 = row1;
   const int out1 = row3;
   const int out2 = mix02_hi;
   const int out3 = mix13_hi;


   /* Fetch the two shuffle patterns. */
   spe_lqd(p, pattern_hi, shuf_ptr, 3);
   spe_lqd(p, pattern_lo, shuf_ptr, 4);


   /* First pass: combine rows 0 and 2, then rows 1 and 3. */
   spe_shufb(p, mix02_hi, row0, row2, pattern_hi);
   spe_shufb(p, mix02_lo, row0, row2, pattern_lo);
   spe_shufb(p, mix13_hi, row1, row3, pattern_hi);
   spe_shufb(p, mix13_lo, row1, row3, pattern_lo);


   /* Second pass: build the output columns.  Column 0 is always generated;
    * the others only when the caller asked for them.
    */
   spe_shufb(p, out0, mix02_hi, mix13_hi, pattern_hi);

   if (count > 1) {
      spe_shufb(p, out1, mix02_hi, mix13_hi, pattern_lo);
   }

   if (count > 2) {
      spe_shufb(p, out2, mix02_lo, mix13_lo, pattern_hi);
   }

   if (count > 3) {
      spe_shufb(p, out3, mix02_lo, mix13_lo, pattern_lo);
   }


   /* Write the requested columns, highest first.  The stqd instruction is
    * encoded with a quadword offset (stand-alone assemblers do the
    * byte-offset to quadword-offset conversion for you), so the offsets
    * here are byte offsets divided by 16.  Nothing is stored when count is
    * 0 or greater than 4.
    */
   if (count <= 4) {
      if (count == 4) {
         spe_stqd(p, out3, dest_ptr, 3);
      }

      if (count >= 3) {
         spe_stqd(p, out2, dest_ptr, 2);
      }

      if (count >= 2) {
         spe_stqd(p, out1, dest_ptr, 1);
      }

      if (count >= 1) {
         spe_stqd(p, out0, dest_ptr, 0);
      }
   }


   /* Give back every register used: the four caller-supplied rows (under
    * their recycled names) and the four temporaries allocated above.
    */
   spe_release_register(p, out0);
   spe_release_register(p, out1);
   spe_release_register(p, out2);
   spe_release_register(p, out3);
   spe_release_register(p, pattern_hi);
   spe_release_register(p, pattern_lo);
   spe_release_register(p, mix02_lo);
   spe_release_register(p, mix13_lo);
}
146
147
148 static void
149 emit_fetch(struct spe_function *p,
150 unsigned in_ptr, unsigned *offset,
151 unsigned out_ptr, unsigned shuf_ptr,
152 enum pipe_format format)
153 {
154 const unsigned count = (pf_size_x(format) != 0) + (pf_size_y(format) != 0)
155 + (pf_size_z(format) != 0) + (pf_size_w(format) != 0);
156 const unsigned type = pf_type(format);
157 const unsigned bytes = pf_size_x(format);
158
159 int v0 = spe_allocate_available_register(p);
160 int v1 = spe_allocate_available_register(p);
161 int v2 = spe_allocate_available_register(p);
162 int v3 = spe_allocate_available_register(p);
163 int tmp = spe_allocate_available_register(p);
164 int float_zero = -1;
165 int float_one = -1;
166 float scale_signed = 0.0;
167 float scale_unsigned = 0.0;
168
169 spe_lqd(p, v0, in_ptr, 0 + offset[0]);
170 spe_lqd(p, v1, in_ptr, 1 + offset[0]);
171 spe_lqd(p, v2, in_ptr, 2 + offset[0]);
172 spe_lqd(p, v3, in_ptr, 3 + offset[0]);
173 offset[0] += 4;
174
175 switch (bytes) {
176 case 1:
177 scale_signed = 1.0f / 127.0f;
178 scale_unsigned = 1.0f / 255.0f;
179 spe_lqd(p, tmp, shuf_ptr, 1);
180 spe_shufb(p, v0, v0, v0, tmp);
181 spe_shufb(p, v1, v1, v1, tmp);
182 spe_shufb(p, v2, v2, v2, tmp);
183 spe_shufb(p, v3, v3, v3, tmp);
184 break;
185 case 2:
186 scale_signed = 1.0f / 32767.0f;
187 scale_unsigned = 1.0f / 65535.0f;
188 spe_lqd(p, tmp, shuf_ptr, 2);
189 spe_shufb(p, v0, v0, v0, tmp);
190 spe_shufb(p, v1, v1, v1, tmp);
191 spe_shufb(p, v2, v2, v2, tmp);
192 spe_shufb(p, v3, v3, v3, tmp);
193 break;
194 case 4:
195 scale_signed = 1.0f / 2147483647.0f;
196 scale_unsigned = 1.0f / 4294967295.0f;
197 break;
198 default:
199 assert(0);
200 break;
201 }
202
203 switch (type) {
204 case PIPE_FORMAT_TYPE_FLOAT:
205 break;
206 case PIPE_FORMAT_TYPE_UNORM:
207 spe_ilhu(p, tmp, ((unsigned) scale_unsigned) >> 16);
208 spe_iohl(p, tmp, ((unsigned) scale_unsigned) & 0x0ffff);
209 spe_cuflt(p, v0, v0, 0);
210 spe_fm(p, v0, v0, tmp);
211 break;
212 case PIPE_FORMAT_TYPE_SNORM:
213 spe_ilhu(p, tmp, ((unsigned) scale_signed) >> 16);
214 spe_iohl(p, tmp, ((unsigned) scale_signed) & 0x0ffff);
215 spe_csflt(p, v0, v0, 0);
216 spe_fm(p, v0, v0, tmp);
217 break;
218 case PIPE_FORMAT_TYPE_USCALED:
219 spe_cuflt(p, v0, v0, 0);
220 break;
221 case PIPE_FORMAT_TYPE_SSCALED:
222 spe_csflt(p, v0, v0, 0);
223 break;
224 }
225
226
227 if (count < 4) {
228 float_one = spe_allocate_available_register(p);
229 spe_il(p, float_one, 1);
230 spe_cuflt(p, float_one, float_one, 0);
231
232 if (count < 3) {
233 float_zero = spe_allocate_available_register(p);
234 spe_il(p, float_zero, 0);
235 }
236 }
237
238 spe_release_register(p, tmp);
239
240 emit_matrix_transpose(p, v0, v1, v2, v3, out_ptr, shuf_ptr, count);
241
242 switch (count) {
243 case 1:
244 spe_stqd(p, float_zero, out_ptr, 1);
245 case 2:
246 spe_stqd(p, float_zero, out_ptr, 2);
247 case 3:
248 spe_stqd(p, float_one, out_ptr, 3);
249 }
250
251 if (float_zero != -1) {
252 spe_release_register(p, float_zero);
253 }
254
255 if (float_one != -1) {
256 spe_release_register(p, float_one);
257 }
258 }
259
260
/**
 * Regenerate the SPE attribute-fetch functions for the current vertex
 * elements.
 *
 * The implementation is currently compiled out (\c "#if 0"); as built, this
 * function only asserts.  The disabled body emits one fetch function per
 * distinct source format into \c cell->attrib_fetch and records, for each
 * attribute, the byte offset of its function in
 * \c cell->attrib_fetch_offsets.
 *
 * \param draw  Draw context whose vertex element formats are read
 */
void cell_update_vertex_fetch(struct draw_context *draw)
{
#if 0
   struct cell_context *const cell =
      (struct cell_context *) draw->driver_private;
   struct spe_function *p = &cell->attrib_fetch;
   unsigned function_index[PIPE_MAX_ATTRIBS];
   unsigned unique_attr_formats;
   int out_ptr;
   int in_ptr;
   int shuf_ptr;
   unsigned i;
   unsigned j;


   /* Determine how many unique input attribute formats there are.  At the
    * same time, store for every attribute the index of the lowest numbered
    * attribute that has the same format.
    */
   unique_attr_formats = 1;
   function_index[0] = 0;
   for (i = 1; i < draw->vertex_fetch.nr_attrs; i++) {
      const enum pipe_format curr_fmt = draw->vertex_element[i].src_format;

      for (j = 0; j < i; j++) {
         if (curr_fmt == draw->vertex_element[j].src_format) {
            break;
         }
      }

      if (j == i) {
         unique_attr_formats++;
      }

      function_index[i] = j;
   }


   /* Each fetch function can be a maximum of 34 instructions (note: this is
    * actually a slight over-estimate).  That means (34 * 4) = 136 bytes
    * each maximum.
    */
   spe_init_func(p, 136 * unique_attr_formats);


   /* Allocate registers for the function's input parameters.
    */
   out_ptr = spe_allocate_register(p, 3);
   in_ptr = spe_allocate_register(p, 4);
   shuf_ptr = spe_allocate_register(p, 5);


   /* Generate code for the individual attribute fetch functions.
    */
   for (i = 0; i < draw->vertex_fetch.nr_attrs; i++) {
      unsigned offset;

      if (function_index[i] == i) {
         /* Entry point of this function, in bytes from the start of the
          * instruction buffer.
          */
         cell->attrib_fetch_offsets[i] =
            (unsigned) ((char *) p->csr - (char *) p->store);

         offset = 0;
         emit_fetch(p, in_ptr, &offset, out_ptr, shuf_ptr,
                    draw->vertex_element[i].src_format);
         spe_bi(p, 0, 0, 0);

         /* Round the write position up to the next 16-byte boundary so
          * the following function starts aligned.  It is the write
          * position (csr) that must move: the buffer base (store) is what
          * the offsets above are measured from.
          */
         {
            const uintptr_t pos = (uintptr_t) p->csr;

            p->csr = (uint32_t *) ((pos + 15) & ~(uintptr_t) 15);
         }
      } else {
         /* Use the same function entry-point as a previously seen attribute
          * with the same format.
          */
         cell->attrib_fetch_offsets[i] =
            cell->attrib_fetch_offsets[function_index[i]];
      }
   }
#else
   assert(0);
#endif
}