draw: don't keep refetching constant inputs
[mesa.git] / src / gallium / auxiliary / draw / draw_vs_aos_io.c
1 /**************************************************************************
2 *
3 * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 #include "util/u_memory.h"
30 #include "pipe/p_shader_tokens.h"
31 #include "tgsi/tgsi_parse.h"
32 #include "tgsi/tgsi_util.h"
33 #include "tgsi/tgsi_exec.h"
34 #include "draw_vs.h"
35 #include "draw_vs_aos.h"
36 #include "draw_vertex.h"
37
38 #include "rtasm/rtasm_x86sse.h"
39
40 #ifdef PIPE_ARCH_X86
41
42 /* Note - don't yet have to worry about interacting with the code in
43 * draw_vs_aos.c as there is no intermingling of generated code...
44 * That may have to change, we'll see.
45 */
46 static void emit_load_R32G32B32A32( struct aos_compilation *cp,
47 struct x86_reg data,
48 struct x86_reg src_ptr )
49 {
50 sse_movups(cp->func, data, src_ptr);
51 }
52
/* Fetch a 3-float attribute into an xmm register, synthesizing the
 * missing W component as 1.0 from the IMM_IDENTITY constant.
 *
 * The load is split (scalar Z first, then a 64-bit XY load),
 * presumably to avoid a 16-byte read past the end of the vertex
 * buffer -- TODO confirm.
 */
static void emit_load_R32G32B32( struct aos_compilation *cp,
                                 struct x86_reg data,
                                 struct x86_reg src_ptr )
{
   sse_movss(cp->func, data, x86_make_disp(src_ptr, 8));
   /* data = z ? ? ? */
   sse_shufps(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY ), SHUF(X,Y,Z,W) );
   /* data = z ? 0 1 */
   sse_shufps(cp->func, data, data, SHUF(Y,Z,X,W) );
   /* data = ? 0 z 1 */
   sse_movlps(cp->func, data, src_ptr);
   /* data = x y z 1 */
}
66
67 static void emit_load_R32G32( struct aos_compilation *cp,
68 struct x86_reg data,
69 struct x86_reg src_ptr )
70 {
71 sse_movups(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY ) );
72 sse_movlps(cp->func, data, src_ptr);
73 }
74
75
76 static void emit_load_R32( struct aos_compilation *cp,
77 struct x86_reg data,
78 struct x86_reg src_ptr )
79 {
80 sse_movss(cp->func, data, src_ptr);
81 sse_orps(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY ) );
82 }
83
84
/* Fetch a 4 x unorm8 attribute and convert it to floats in [0,1].
 *
 * NOTE(review): the punpcklbw steps use the IMM_IDENTITY xmm as the
 * interleave source; this relies on the low 8 bytes of that constant
 * being zero (two 0.0f floats) so the unpacks zero-extend the packed
 * bytes up to dwords -- confirm against the IMM_IDENTITY definition.
 */
static void emit_load_R8G8B8A8_UNORM( struct aos_compilation *cp,
                                      struct x86_reg data,
                                      struct x86_reg src_ptr )
{
   sse_movss(cp->func, data, src_ptr);                                  /* 4 bytes in low dword */
   sse2_punpcklbw(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY )); /* bytes -> words */
   sse2_punpcklbw(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY )); /* words -> dwords */
   sse2_cvtdq2ps(cp->func, data, data);                                 /* ints -> floats */
   sse_mulps(cp->func, data, aos_get_internal(cp, IMM_INV_255));        /* scale to [0,1] */
}
95
96
97
98 /* Extended swizzles? Maybe later.
99 */
100 static void emit_swizzle( struct aos_compilation *cp,
101 struct x86_reg dest,
102 struct x86_reg src,
103 ubyte shuffle )
104 {
105 sse_shufps(cp->func, dest, src, shuffle);
106 }
107
108
109
/* Emit code computing the pointer to the current vertex in vertex
 * buffer 'buf_idx', leaving the result in 'ptr'.
 *
 * linear:  aos_buffer::ptr holds a running pointer for this buffer.
 *          'ptr' gets the current value; the saved pointer is then
 *          advanced by one stride ('elt' is clobbered as a temporary)
 *          and the next vertex is prefetched.
 * !linear: 'elt' holds the vertex index; ptr = base_ptr + elt * stride.
 *
 * Always returns TRUE.
 */
static boolean get_buffer_ptr( struct aos_compilation *cp,
                               boolean linear,
                               unsigned buf_idx,
                               struct x86_reg elt,
                               struct x86_reg ptr)
{
   struct x86_reg buf = x86_make_disp(aos_get_x86( cp, 0, X86_BUFFERS ),
                                      buf_idx * sizeof(struct aos_buffer));

   struct x86_reg buf_stride = x86_make_disp(buf,
                                             Offset(struct aos_buffer, stride));
   if (linear) {
      struct x86_reg buf_ptr = x86_make_disp(buf,
                                             Offset(struct aos_buffer, ptr));


      /* Calculate pointer to current attrib:
       */
      x86_mov(cp->func, ptr, buf_ptr);          /* ptr = current vertex */
      x86_mov(cp->func, elt, buf_stride);
      x86_add(cp->func, elt, ptr);              /* elt = ptr + stride */
      sse_prefetchnta(cp->func, x86_deref(elt));
      x86_mov(cp->func, buf_ptr, elt);          /* store advanced pointer back */
   }
   else {
      struct x86_reg buf_base_ptr = x86_make_disp(buf,
                                                  Offset(struct aos_buffer, base_ptr));


      /* Calculate pointer to current attrib:
       */
      x86_mov(cp->func, ptr, buf_stride);
      x86_imul(cp->func, ptr, elt);
      x86_add(cp->func, ptr, buf_base_ptr);
   }

   cp->insn_counter++;

   return TRUE;
}
150
151
152 static boolean load_input( struct aos_compilation *cp,
153 unsigned idx,
154 struct x86_reg bufptr )
155 {
156 unsigned format = cp->vaos->base.key.element[idx].in.format;
157 unsigned offset = cp->vaos->base.key.element[idx].in.offset;
158 struct x86_reg dataXMM = aos_get_xmm_reg(cp);
159
160 /* Figure out source pointer address:
161 */
162 struct x86_reg src = x86_make_disp(bufptr, offset);
163
164 aos_adopt_xmm_reg( cp,
165 dataXMM,
166 TGSI_FILE_INPUT,
167 idx,
168 TRUE );
169
170 switch (format) {
171 case PIPE_FORMAT_R32_FLOAT:
172 emit_load_R32(cp, dataXMM, src);
173 break;
174 case PIPE_FORMAT_R32G32_FLOAT:
175 emit_load_R32G32(cp, dataXMM, src);
176 break;
177 case PIPE_FORMAT_R32G32B32_FLOAT:
178 emit_load_R32G32B32(cp, dataXMM, src);
179 break;
180 case PIPE_FORMAT_R32G32B32A32_FLOAT:
181 emit_load_R32G32B32A32(cp, dataXMM, src);
182 break;
183 case PIPE_FORMAT_B8G8R8A8_UNORM:
184 emit_load_R8G8B8A8_UNORM(cp, dataXMM, src);
185 emit_swizzle(cp, dataXMM, dataXMM, SHUF(Z,Y,X,W));
186 break;
187 case PIPE_FORMAT_R8G8B8A8_UNORM:
188 emit_load_R8G8B8A8_UNORM(cp, dataXMM, src);
189 break;
190 default:
191 ERROR(cp, "unhandled input format");
192 return FALSE;
193 }
194
195 return TRUE;
196 }
197
198 static boolean load_inputs( struct aos_compilation *cp,
199 unsigned buffer,
200 struct x86_reg ptr )
201 {
202 unsigned i;
203
204 for (i = 0; i < cp->vaos->base.key.nr_inputs; i++) {
205 if (cp->vaos->base.key.element[i].in.buffer == buffer) {
206
207 if (!load_input( cp, i, ptr ))
208 return FALSE;
209
210 cp->insn_counter++;
211 }
212 }
213
214 return TRUE;
215 }
216
/* Emit per-run setup code for each vertex buffer before the main
 * vertex loop:
 *
 *  - constant buffers: load all their inputs once, here, and spill
 *    them to aos_machine.input[] so the loop body never refetches
 *    them;
 *  - linear (non-indexed) runs: compute the pointer to the first
 *    vertex from the start index in idx_EBX.  With a single buffer
 *    the pointer itself lives in idx_EBX thereafter; with several,
 *    each running pointer is kept in its aos_buffer::ptr slot.
 *  - indexed runs: nothing to set up here.
 *
 * Always returns TRUE.
 */
boolean aos_init_inputs( struct aos_compilation *cp, boolean linear )
{
   unsigned i;
   for (i = 0; i < cp->vaos->nr_vb; i++) {
      struct x86_reg buf = x86_make_disp(aos_get_x86( cp, 0, X86_BUFFERS ),
                                         i * sizeof(struct aos_buffer));

      struct x86_reg buf_base_ptr = x86_make_disp(buf,
                                                  Offset(struct aos_buffer, base_ptr));

      if (cp->vaos->base.key.const_vbuffers & (1<<i)) {
         struct x86_reg ptr = cp->tmp_EAX;

         x86_mov(cp->func, ptr, buf_base_ptr);

         /* Load all inputs for this constant vertex buffer
          */
         load_inputs( cp, i, x86_deref(ptr) );

         /* Then just force them out to aos_machine.input[]
          */
         aos_spill_all( cp );

      }
      else if (linear) {

         struct x86_reg elt = cp->idx_EBX;   /* holds the start index on entry */
         struct x86_reg ptr = cp->tmp_EAX;

         struct x86_reg buf_stride = x86_make_disp(buf,
                                                   Offset(struct aos_buffer, stride));

         struct x86_reg buf_ptr = x86_make_disp(buf,
                                                Offset(struct aos_buffer, ptr));


         /* Calculate pointer to current attrib:
          */
         x86_mov(cp->func, ptr, buf_stride);
         x86_imul(cp->func, ptr, elt);
         x86_add(cp->func, ptr, buf_base_ptr);


         /* In the linear case, keep the buffer pointer instead of the
          * index number.
          */
         if (cp->vaos->nr_vb == 1)
            x86_mov( cp->func, elt, ptr );
         else
            x86_mov( cp->func, buf_ptr, ptr );

         cp->insn_counter++;
      }
   }

   return TRUE;
}
274
275 boolean aos_fetch_inputs( struct aos_compilation *cp, boolean linear )
276 {
277 unsigned j;
278
279 for (j = 0; j < cp->vaos->nr_vb; j++) {
280 if (cp->vaos->base.key.const_vbuffers & (1<<j)) {
281 /* just retreive pre-transformed input */
282 }
283 else if (linear && cp->vaos->nr_vb == 1) {
284 load_inputs( cp, 0, cp->idx_EBX );
285 }
286 else {
287 struct x86_reg elt = linear ? cp->idx_EBX : x86_deref(cp->idx_EBX);
288 struct x86_reg ptr = cp->tmp_EAX;
289
290 if (!get_buffer_ptr( cp, linear, j, elt, ptr ))
291 return FALSE;
292
293 if (!load_inputs( cp, j, ptr ))
294 return FALSE;
295 }
296 }
297
298 return TRUE;
299 }
300
301 boolean aos_incr_inputs( struct aos_compilation *cp, boolean linear )
302 {
303 if (linear && cp->vaos->nr_vb == 1) {
304 struct x86_reg stride = x86_make_disp(aos_get_x86( cp, 0, X86_BUFFERS ),
305 (0 * sizeof(struct aos_buffer) +
306 Offset(struct aos_buffer, stride)));
307
308 x86_add(cp->func, cp->idx_EBX, stride);
309 sse_prefetchnta(cp->func, x86_deref(cp->idx_EBX));
310 }
311 else if (linear) {
312 /* Nothing to do */
313 }
314 else {
315 x86_lea(cp->func, cp->idx_EBX, x86_make_disp(cp->idx_EBX, 4));
316 }
317
318 return TRUE;
319 }
320
321
322
323
324
325
326 static void emit_store_R32G32B32A32( struct aos_compilation *cp,
327 struct x86_reg dst_ptr,
328 struct x86_reg dataXMM )
329 {
330 sse_movups(cp->func, dst_ptr, dataXMM);
331 }
332
333 static void emit_store_R32G32B32( struct aos_compilation *cp,
334 struct x86_reg dst_ptr,
335 struct x86_reg dataXMM )
336 {
337 sse_movlps(cp->func, dst_ptr, dataXMM);
338 sse_shufps(cp->func, dataXMM, dataXMM, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */
339 sse_movss(cp->func, x86_make_disp(dst_ptr,8), dataXMM);
340 }
341
342 static void emit_store_R32G32( struct aos_compilation *cp,
343 struct x86_reg dst_ptr,
344 struct x86_reg dataXMM )
345 {
346 sse_movlps(cp->func, dst_ptr, dataXMM);
347 }
348
349 static void emit_store_R32( struct aos_compilation *cp,
350 struct x86_reg dst_ptr,
351 struct x86_reg dataXMM )
352 {
353 sse_movss(cp->func, dst_ptr, dataXMM);
354 }
355
356
357
358 static void emit_store_R8G8B8A8_UNORM( struct aos_compilation *cp,
359 struct x86_reg dst_ptr,
360 struct x86_reg dataXMM )
361 {
362 sse_mulps(cp->func, dataXMM, aos_get_internal(cp, IMM_255));
363 sse2_cvtps2dq(cp->func, dataXMM, dataXMM);
364 sse2_packssdw(cp->func, dataXMM, dataXMM);
365 sse2_packuswb(cp->func, dataXMM, dataXMM);
366 sse_movss(cp->func, dst_ptr, dataXMM);
367 }
368
369
370
371
372
373 static boolean emit_output( struct aos_compilation *cp,
374 struct x86_reg ptr,
375 struct x86_reg dataXMM,
376 unsigned format )
377 {
378 switch (format) {
379 case EMIT_1F:
380 case EMIT_1F_PSIZE:
381 emit_store_R32(cp, ptr, dataXMM);
382 break;
383 case EMIT_2F:
384 emit_store_R32G32(cp, ptr, dataXMM);
385 break;
386 case EMIT_3F:
387 emit_store_R32G32B32(cp, ptr, dataXMM);
388 break;
389 case EMIT_4F:
390 emit_store_R32G32B32A32(cp, ptr, dataXMM);
391 break;
392 case EMIT_4UB:
393 if (1) {
394 emit_swizzle(cp, dataXMM, dataXMM, SHUF(Z,Y,X,W));
395 emit_store_R8G8B8A8_UNORM(cp, ptr, dataXMM);
396 }
397 else {
398 emit_store_R8G8B8A8_UNORM(cp, ptr, dataXMM);
399 }
400 break;
401 default:
402 ERROR(cp, "unhandled output format");
403 return FALSE;
404 }
405
406 return TRUE;
407 }
408
409
410
/* Emit code storing the shader results out to the hardware vertex
 * layout described by key.element[].out, at outbuf_ECX.
 */
boolean aos_emit_outputs( struct aos_compilation *cp )
{
   unsigned i;

   for (i = 0; i < cp->vaos->base.key.nr_outputs; i++) {
      unsigned format = cp->vaos->base.key.element[i].out.format;
      unsigned offset = cp->vaos->base.key.element[i].out.offset;
      unsigned vs_output = cp->vaos->base.key.element[i].out.vs_output;

      struct x86_reg data;

      if (format == EMIT_1F_PSIZE) {
         /* Point size comes from the IMM_PSIZE constant, not from a
          * shader output register.
          */
         data = aos_get_internal_xmm( cp, IMM_PSIZE );
      }
      else {
         data = aos_get_shader_reg( cp,
                                    TGSI_FILE_OUTPUT,
                                    vs_output );
      }

      /* The store helpers need (and may clobber) an xmm register, so
       * copy memory-resident values into a temporary first.
       */
      if (data.file != file_XMM) {
         struct x86_reg tmp = aos_get_xmm_reg( cp );
         sse_movups(cp->func, tmp, data);
         data = tmp;
      }

      if (!emit_output( cp,
                        x86_make_disp( cp->outbuf_ECX, offset ),
                        data,
                        format ))
         return FALSE;

      /* NOTE(review): this also releases the register when 'data'
       * came straight from aos_get_shader_reg() -- presumably safe
       * because outputs aren't re-read after emission; confirm
       * against aos_release_xmm_reg() semantics.
       */
      aos_release_xmm_reg( cp, data.idx );

      cp->insn_counter++;
   }

   return TRUE;
}
450
451 #endif