got rid of the notify routine as intermediate step
[mesa.git] / src / mesa / tnl / t_vtx_x86_gcc.S
1 /**************************************************************************
2
3 Copyright 2004 Tungsten Graphics Inc., Cedar Park, Texas.
4
5 All Rights Reserved.
6
7 Permission is hereby granted, free of charge, to any person obtaining a
8 copy of this software and associated documentation files (the "Software"),
9 to deal in the Software without restriction, including without limitation
10 on the rights to use, copy, modify, merge, publish, distribute, sub
11 license, and/or sell copies of the Software, and to permit persons to whom
12 the Software is furnished to do so, subject to the following conditions:
13
14 The above copyright notice and this permission notice (including the next
15 paragraph) shall be included in all copies or substantial portions of the
16 Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
21 ATI, TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
22 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
23 OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
24 USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **************************************************************************/
27
28 /*
29 * Authors:
30 * Keith Whitwell <keith@tungstengraphics.com>
31 */
32
// GLOBL(x): declare x as a global symbol and place its label here.
// DJGPP builds prepend an underscore to the symbol name; every other
// target uses the name exactly as written.
33 #if !defined (__DJGPP__)
34
35 #define GLOBL( x ) \
36 .globl x; \
37 x:
38
39 #else /* defined(__DJGPP__) */
40
41 #define GLOBL( x ) \
42 .globl _##x; \
43 _##x:
44
45 #endif /* defined(__DJGPP__) */
46
47 .data
48 .align 4
49
50 // Someone who knew a lot about this sort of thing would use this
51 // macro to note current offsets, etc. in a special region of the
52 // object file & just make everything work out neatly. I don't know
53 // enough to do that...
54
// SUBST(x) expands to 0x10101010 + x, a distinctive 32-bit constant.
// The routines below use SUBST(0), SUBST(1), ... wherever their own
// comments name a run-time value (a pointer, a size, a call target).
// NOTE(review): presumably whoever copies these byte sequences
// overwrites each SUBST() constant with the real value -- confirm on
// the C side before changing any encoding.
55 #define SUBST( x ) (0x10101010 + x)
56
57
58
59 // [dBorca] TODO
60 // Unfold functions for each vertex size?
61 // Build super-specialized MMX/SSE versions?
62 // STDCALL woes (HAVE_NONSTANDARD_GLAPIENTRY):
63 // need separate routine for the non "fv" case,
64 // to clean up the stack (I guess we could codegen
65 // 'ret nn' insn)! Also we need to call notify, then
66 // return, instead of jump!
67
// _tnl_x86_Vertex1fv: store a 1-float position v[0] at the vertex
// write pointer, append the rest of the current vertex, decrement the
// vertex counter and call _tnl_wrap_filled_vertex(ctx) when it hits 0.
//   in:        4(%esp) = v (pointer to the float data)
//   SUBST(0):  memory operand, tnl->vtx.vbptr (loaded, advanced, stored)
//   SUBST(1):  immediate, tnl->vtx.vertex_size - 1 (dwords still to copy)
//   SUBST(2):  immediate, tnl->vtx.vertex + 1 (source of those dwords)
//   SUBST(3):  memory operand, counter
//   SUBST(4):  immediate, ctx
//   SUBST(5):  displacement of the hand-encoded call
//   clobbers:  %eax, %ecx, %edx, flags; %esi/%edi are saved and restored
// NOTE(review): the bytes between this label and the _end label appear
// to be a template whose SUBST() constants are patched elsewhere --
// keep instruction order and encodings unchanged unless that code is
// updated too.
68 GLOBL ( _tnl_x86_Vertex1fv )
69 movl 4(%esp), %ecx // v (read before the pushes move %esp)
70 push %edi // movsl needs %edi/%esi; preserve the caller's values
71 push %esi
72 movl SUBST(0), %edi // 0x0 --> tnl->vtx.vbptr
73 movl (%ecx), %edx // load v[0]
74 movl %edx, (%edi) // tnl->vtx.vbptr[0] = v[0]
75 addl $4, %edi // tnl->vtx.vbptr += 1
76 movl $SUBST(1), %ecx // 0x1 --> (tnl->vtx.vertex_size - 1)
77 movl $SUBST(2), %esi // 0x2 --> (tnl->vtx.vertex + 1)
78 repz // repeat the movsl below %ecx times
79 movsl %ds:(%esi), %es:(%edi) // copy one dword, advance both pointers (no cld here: relies on DF being clear)
80 movl %edi, SUBST(0) // 0x0 --> tnl->vtx.vbptr (now past the whole vertex)
81 movl SUBST(3), %edx // 0x3 --> counter
82 pop %esi
83 pop %edi
84 dec %edx // counter--
85 movl %edx, SUBST(3) // 0x3 --> counter (movl leaves the flags from dec intact)
86 jne .0 // if (counter != 0) return
87 pushl $SUBST(4) // 0x4 --> ctx
88 .byte 0xe8 // call ... (opcode emitted by hand, displacement follows)
89 .long SUBST(5) // ... _tnl_wrap_filled_vertex(ctx)
90 pop %eax // drop the ctx argument pushed above
91 .0:
92 ret // return
93 GLOBL ( _tnl_x86_Vertex1fv_end )
94
95
// _tnl_x86_Vertex2fv: store a 2-float position v[0..1] at the vertex
// write pointer, append the rest of the current vertex, decrement the
// vertex counter and call _tnl_wrap_filled_vertex(ctx) when it hits 0.
//   in:        4(%esp) = v (pointer to the float data)
//   SUBST(0):  memory operand, tnl->vtx.vbptr
//   SUBST(1):  immediate, vertex_size - 2 (dwords still to copy)
//   SUBST(2):  immediate, tnl->vtx.vertex + 2 (source of those dwords)
//   SUBST(3):  memory operand, counter
//   SUBST(4):  immediate, ctx
//   SUBST(5):  displacement of the hand-encoded call
//   clobbers:  %eax, %ecx, %edx, flags; %esi/%edi are saved and restored
// NOTE(review): looks like a byte template patched elsewhere -- keep
// instruction order and encodings unchanged unless that code changes.
96 .align 4
97 GLOBL ( _tnl_x86_Vertex2fv )
98 movl 4(%esp), %ecx // v (read before the pushes move %esp)
99 push %edi // movsl needs %edi/%esi; preserve the caller's values
100 push %esi
101 movl SUBST(0), %edi // load tnl->vtx.vbptr
102 movl (%ecx), %edx // load v[0]
103 movl 4(%ecx), %eax // load v[1]
104 movl %edx, (%edi) // tnl->vtx.vbptr[0] = v[0]
105 movl %eax, 4(%edi) // tnl->vtx.vbptr[1] = v[1]
106 addl $8, %edi // tnl->vtx.vbptr += 2
107 movl $SUBST(1), %ecx // vertex_size - 2
108 movl $SUBST(2), %esi // tnl->vtx.vertex + 2
109 repz // repeat the movsl below %ecx times
110 movsl %ds:(%esi), %es:(%edi) // copy one dword, advance both pointers (relies on DF being clear)
111 movl %edi, SUBST(0) // save tnl->vtx.vbptr
112 movl SUBST(3), %edx // load counter
113 pop %esi
114 pop %edi
115 dec %edx // counter--
116 movl %edx, SUBST(3) // save counter (movl leaves the flags from dec intact)
117 jne .1 // if (counter != 0) return
118 pushl $SUBST(4) // load ctx
119 .byte 0xe8 // call ... (opcode emitted by hand, displacement follows)
120 .long SUBST(5) // ... _tnl_wrap_filled_vertex(ctx)
121 pop %eax // drop the ctx argument pushed above
122 .1:
123 ret // return
124 GLOBL ( _tnl_x86_Vertex2fv_end )
125
// _tnl_x86_Vertex3fv: store a 3-float position v[0..2] at the vertex
// write pointer, append the rest of the current vertex, decrement the
// vertex counter and call _tnl_wrap_filled_vertex(ctx) when it hits 0.
//   in:        4(%esp) = v (pointer to the float data)
//   SUBST(0):  memory operand, tnl->vtx.vbptr
//   SUBST(1):  immediate, vertex_size - 3 (dwords still to copy)
//   SUBST(2):  immediate, tnl->vtx.vertex + 3 (source of those dwords)
//   SUBST(3):  memory operand, counter
//   SUBST(4):  immediate, ctx
//   SUBST(5):  displacement of the hand-encoded call
//   clobbers:  %eax, %ecx, %edx, flags; %esi/%edi are saved and restored
// NOTE(review): looks like a byte template patched elsewhere -- keep
// instruction order and encodings unchanged unless that code changes.
126 .align 4
127 GLOBL ( _tnl_x86_Vertex3fv )
128 movl 4(%esp), %ecx // v (read before the pushes move %esp)
129 push %edi // movsl needs %edi/%esi; preserve the caller's values
130 push %esi
131 movl SUBST(0), %edi // load tnl->vtx.vbptr
132 movl (%ecx), %edx // load v[0]
133 movl 4(%ecx), %eax // load v[1]
134 movl 8(%ecx), %esi // load v[2] (%esi is already saved, so it is free as scratch)
135 movl %edx, (%edi) // tnl->vtx.vbptr[0] = v[0]
136 movl %eax, 4(%edi) // tnl->vtx.vbptr[1] = v[1]
137 movl %esi, 8(%edi) // tnl->vtx.vbptr[2] = v[2]
138 addl $12, %edi // tnl->vtx.vbptr += 3
139 movl $SUBST(1), %ecx // vertex_size - 3
140 movl $SUBST(2), %esi // tnl->vtx.vertex + 3
141 repz // repeat the movsl below %ecx times
142 movsl %ds:(%esi), %es:(%edi) // copy one dword, advance both pointers (relies on DF being clear)
143 movl %edi, SUBST(0) // save tnl->vtx.vbptr
144 movl SUBST(3), %edx // load counter
145 pop %esi
146 pop %edi
147 dec %edx // counter--
148 movl %edx, SUBST(3) // save counter (movl leaves the flags from dec intact)
149 jne .2 // if (counter != 0) return
150 pushl $SUBST(4) // load ctx
151 .byte 0xe8 // call ... (opcode emitted by hand, displacement follows)
152 .long SUBST(5) // ... _tnl_wrap_filled_vertex(ctx)
153 pop %eax // drop the ctx argument pushed above
154 .2:
155 ret // return
156 GLOBL ( _tnl_x86_Vertex3fv_end )
157
158
// _tnl_x86_Vertex4fv: store a 4-float position v[0..3] at the vertex
// write pointer, append the rest of the current vertex, decrement the
// vertex counter and call _tnl_wrap_filled_vertex(ctx) when it hits 0.
//   in:        4(%esp) = v (pointer to the float data)
//   SUBST(0):  memory operand, tnl->vtx.vbptr
//   SUBST(1):  immediate, vertex_size - 4 (dwords still to copy)
//   SUBST(2):  immediate, source of those dwords (tnl->vtx.vertex + 4,
//              by analogy with the 1fv/2fv/3fv variants -- TODO confirm)
//   SUBST(3):  memory operand, counter
//   SUBST(4):  immediate, ctx
//   SUBST(5):  displacement of the hand-encoded call
//   clobbers:  %eax, %ecx, %edx, flags; %esi/%edi are saved and restored
// NOTE(review): looks like a byte template patched elsewhere -- keep
// instruction order and encodings unchanged unless that code changes.
159 .align 4
160 GLOBL ( _tnl_x86_Vertex4fv )
161 movl 4(%esp), %ecx // v (read before the pushes move %esp)
162 push %edi // movsl needs %edi/%esi; preserve the caller's values
163 push %esi
164 movl SUBST(0), %edi // load tnl->vtx.vbptr
165 movl (%ecx), %edx // load v[0]
166 movl 4(%ecx), %eax // load v[1]
167 movl 8(%ecx), %esi // load v[2] (%esi is already saved, so it is free as scratch)
168 movl 12(%ecx), %ecx // load v[3] (last use of v, so %ecx can be overwritten)
169 movl %edx, (%edi) // tnl->vtx.vbptr[0] = v[0]
170 movl %eax, 4(%edi) // tnl->vtx.vbptr[1] = v[1]
171 movl %esi, 8(%edi) // tnl->vtx.vbptr[2] = v[2]
172 movl %ecx, 12(%edi) // tnl->vtx.vbptr[3] = v[3]
173 addl $16, %edi // tnl->vtx.vbptr += 4
174 movl $SUBST(1), %ecx // vertex_size - 4
175 movl $SUBST(2), %esi // tnl->vtx.vertex + 4
176 repz // repeat the movsl below %ecx times
177 movsl %ds:(%esi), %es:(%edi) // copy one dword, advance both pointers (relies on DF being clear)
178 movl %edi, SUBST(0) // save tnl->vtx.vbptr
179 movl SUBST(3), %edx // load counter
180 pop %esi
181 pop %edi
182 dec %edx // counter--
183 movl %edx, SUBST(3) // save counter (movl leaves the flags from dec intact)
184 jne .3 // if (counter != 0) return
185 pushl $SUBST(4) // load ctx
186 .byte 0xe8 // call ... (opcode emitted by hand, displacement follows)
187 .long SUBST(5) // ... _tnl_wrap_filled_vertex(ctx)
188 pop %eax // drop the ctx argument pushed above
189 .3:
190 ret // return
191 GLOBL ( _tnl_x86_Vertex4fv_end )
192
193
194
195 /**
196 * Generic handlers for vector format data.
197 */
198
/* _tnl_x86_Attribute1fv: copy one float from v into the current vertex.
 *   in:        4(%esp) = v (pointer to the float data)
 *   SUBST(0):  memory operand, destination of v[0]
 *   clobbers:  %eax, %ecx
 * NOTE(review): SUBST() operands are placeholders, presumably patched
 * with the real destination address elsewhere -- confirm.
 */
199 GLOBL( _tnl_x86_Attribute1fv)
200 movl 4(%esp), %ecx /* v */
201 movl (%ecx), %eax /* load v[0] */
202 movl %eax, SUBST(0) /* store v[0] to current vertex */
203 ret
204 GLOBL ( _tnl_x86_Attribute1fv_end )
205
/* _tnl_x86_Attribute2fv: copy two floats from v into the current vertex.
 *   in:          4(%esp) = v (pointer to the float data)
 *   SUBST(0..1): memory operands, destinations of v[0] and v[1]
 *   clobbers:    %eax, %ecx, %edx
 * NOTE(review): SUBST() operands are placeholders, presumably patched
 * with the real destination addresses elsewhere -- confirm.
 */
206 GLOBL( _tnl_x86_Attribute2fv)
207 movl 4(%esp), %ecx /* v */
208 movl (%ecx), %eax /* load v[0] */
209 movl 4(%ecx), %edx /* load v[1] */
210 movl %eax, SUBST(0) /* store v[0] to current vertex */
211 movl %edx, SUBST(1) /* store v[1] to current vertex */
212 ret
213 GLOBL ( _tnl_x86_Attribute2fv_end )
214
215
/* _tnl_x86_Attribute3fv: copy three floats from v into the current vertex.
 *   in:          4(%esp) = v (pointer to the float data)
 *   SUBST(0..2): memory operands, destinations of v[0], v[1] and v[2]
 *   clobbers:    %eax, %ecx, %edx
 * NOTE(review): SUBST() operands are placeholders, presumably patched
 * with the real destination addresses elsewhere -- confirm.
 */
216 GLOBL( _tnl_x86_Attribute3fv)
217 movl 4(%esp), %ecx /* v */
218 movl (%ecx), %eax /* load v[0] */
219 movl 4(%ecx), %edx /* load v[1] */
220 movl 8(%ecx), %ecx /* load v[2]; last use of v, so %ecx is reused */
221 movl %eax, SUBST(0) /* store v[0] to current vertex */
222 movl %edx, SUBST(1) /* store v[1] to current vertex */
223 movl %ecx, SUBST(2) /* store v[2] to current vertex */
224 ret
225 GLOBL ( _tnl_x86_Attribute3fv_end )
226
/* _tnl_x86_Attribute4fv: copy four floats from v into the current vertex,
 * two at a time through %eax/%edx.
 *   in:          4(%esp) = v (pointer to the float data)
 *   SUBST(0..3): memory operands, destinations of v[0] .. v[3]
 *   clobbers:    %eax, %ecx, %edx
 * NOTE(review): SUBST() operands are placeholders, presumably patched
 * with the real destination addresses elsewhere -- confirm.
 */
227 GLOBL( _tnl_x86_Attribute4fv)
228 movl 4(%esp), %ecx /* v */
229 movl (%ecx), %eax /* load v[0] */
230 movl 4(%ecx), %edx /* load v[1] */
231 movl %eax, SUBST(0) /* store v[0] to current vertex */
232 movl %edx, SUBST(1) /* store v[1] to current vertex */
233 movl 8(%ecx), %eax /* load v[2] */
234 movl 12(%ecx), %edx /* load v[3] */
235 movl %eax, SUBST(2) /* store v[2] to current vertex */
236 movl %edx, SUBST(3) /* store v[3] to current vertex */
237 ret
238 GLOBL ( _tnl_x86_Attribute4fv_end )
239
240
241 // Choosers:
242
243 // Must generate all of these ahead of first usage. Generate at
244 // compile-time?
245
246
// _tnl_x86_choose_fv: call do_choose(attrib, N) and jump to the
// function pointer it returns in %eax.  The temporary frame is removed
// before the jump, so the target sees the same stack (return address
// and arguments) that this routine was entered with.
//   SUBST(0):  immediate, attrib
//   SUBST(1):  immediate, N
//   SUBST(2):  displacement of the hand-encoded call to do_choose
// NOTE(review): looks like a byte template patched elsewhere -- keep
// instruction order and encodings unchanged unless that code changes.
247 GLOBL( _tnl_x86_choose_fv)
248 subl $12, %esp // gcc does 16 byte alignment of stack frames?
249 movl $SUBST(0), (%esp) // arg 0 - attrib
250 movl $SUBST(1), 4(%esp) // arg 1 - N
251 .byte 0xe8 // call ... (opcode emitted by hand, displacement follows)
252 .long SUBST(2) // ... do_choose
253 add $12, %esp // tear down stack frame
254 jmp *%eax // jump to new func
255 GLOBL ( _tnl_x86_choose_fv_end )
256
257
258
259 // FIRST LEVEL FUNCTIONS -- these are plugged directly into GL dispatch.
260
261
262
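263 // In the 1st level dispatch functions, switch to a different
264 // calling convention -- (const GLfloat *v) in %ecx.
// NOTE(review): the dispatchers in this file hand 'v' over in the
// first stack argument slot (the handlers read it from 4(%esp)), not
// in %ecx -- this paragraph looks stale; confirm and update.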
265 //
266 // As with regular (x86) dispatch, don't create a new stack frame -
267 // just let the 'ret' in the dispatched function return straight
268 // back to the original caller.
269
270
271
272 // Vertex/Normal/Color, etc: the address of the function pointer
273 // is known at codegen time.
274
275
276 // Unfortunately, have to play with the stack in the non-fv case:
277 //
// _tnl_x86_dispatch_attrf: entry point for the non-"fv" form, where
// the floats themselves are the stack arguments.  Builds a 12-byte
// frame, passes the address of the first float as 'v', calls through
// the function pointer stored at SUBST(0), then unwinds and returns.
//   SUBST(0):  memory operand holding the handler pointer (tabfv[attr][n])
//   clobbers:  %edx, plus whatever the handler clobbers
278 GLOBL( _tnl_x86_dispatch_attrf )
279 subl $12, %esp // gcc does 16 byte alignment of stack frames?
280 leal 16(%esp), %edx // address of first float on stack (12-byte frame + 4-byte return address)
281 movl %edx, (%esp) // save as 'v' (the handler's first argument)
282 call *SUBST(0) // 0x0 --> tabfv[attr][n]
283 addl $12, %esp // tear down frame
284 ret // return
285 GLOBL( _tnl_x86_dispatch_attrf_end )
286
287 // The fv case is simpler:
288 //
// _tnl_x86_dispatch_attrfv: entry point for the "fv" form.  The stack
// is left untouched and control jumps through the function pointer
// stored at SUBST(0), so the handler's 'ret' returns to our caller.
//   SUBST(0):  memory operand holding the handler pointer (tabfv[attr][n])
289 GLOBL( _tnl_x86_dispatch_attrfv )
290 jmp *SUBST(0) // 0x0 --> tabfv[attr][n]
291 GLOBL( _tnl_x86_dispatch_attrfv_end )
292
293
294 // MultiTexcoord: the address of the function pointer must be
295 // calculated, but can use the index argument slot to hold 'v', and
296 // avoid setting up a new stack frame.
297 //
298 // [dBorca]
299 // right, this would be the preferred approach, but gcc does not
300 // clean up the stack after each function call when optimizing (-fdefer-pop);
301 // can it make assumptions about what's already on the stack? I dunno,
302 // but in this case, we can't mess with the caller's stack frame, and
303 // we must use a model like `_x86_dispatch_attrfv' above. Caveat emptor!
304
305 // Also, will only need a maximum of four of each of these per context:
306 //
// _tnl_x86_dispatch_multitexcoordf: non-"fv" MultiTexCoord entry point.
// Takes the first stack argument, keeps its low 3 bits as an index,
// overwrites that argument slot with the address of the floats that
// follow it, and jumps through a table based at SUBST(0).
//   in:        4(%esp) = first argument (indexes the table), 8(%esp).. = floats
//   SUBST(0):  base address of the table; entries are selected with a
//              16-byte stride (index << 4)
//   clobbers:  %ecx, %edx, flags, and the caller's first argument slot
// NOTE(review): the stride presumably corresponds to four 4-byte
// function pointers per attribute -- confirm against the table layout.
307 GLOBL( _tnl_x86_dispatch_multitexcoordf )
308 movl 4(%esp), %ecx // first argument
309 leal 8(%esp), %edx // address of the first float on the stack
310 andl $7, %ecx // keep the low 3 bits (index 0..7)
311 movl %edx, 4(%esp) // replace the first argument with 'v'
312 sall $4, %ecx // index * 16 = byte offset into the table
313 jmp *SUBST(0)(%ecx) // 0x0 - tabfv[tex0][n]
314 GLOBL( _tnl_x86_dispatch_multitexcoordf_end )
315
// _tnl_x86_dispatch_multitexcoordfv: "fv" MultiTexCoord entry point.
// Takes the first stack argument, keeps its low 3 bits as an index,
// moves the 'v' pointer from the second argument slot into the first,
// and jumps through a table based at SUBST(0).
//   in:        4(%esp) = first argument (indexes the table), 8(%esp) = v
//   SUBST(0):  base address of the table; entries are selected with a
//              16-byte stride (index << 4)
//   clobbers:  %ecx, %edx, flags, and the caller's first argument slot
// NOTE(review): the stride presumably corresponds to four 4-byte
// function pointers per attribute -- confirm against the table layout.
316 GLOBL( _tnl_x86_dispatch_multitexcoordfv )
317 movl 4(%esp), %ecx // first argument
318 movl 8(%esp), %edx // load 'v'
319 andl $7, %ecx // keep the low 3 bits (index 0..7)
320 movl %edx, 4(%esp) // replace the first argument with 'v'
321 sall $4, %ecx // index * 16 = byte offset into the table
322 jmp *SUBST(0)(%ecx) // 0x0 - tabfv[tex0][n]
323 GLOBL( _tnl_x86_dispatch_multitexcoordfv_end )
324
325 // VertexAttrib: the address of the function pointer must be
326 // calculated.
327
// _tnl_x86_dispatch_vertexattribf: non-"fv" VertexAttrib entry point.
// Clamps the first stack argument (treated as unsigned) to at most 16,
// overwrites that argument slot with the address of the floats that
// follow it, and jumps through a table based at SUBST(0).
//   in:        4(%esp) = index, 8(%esp).. = floats
//   SUBST(0):  base address of the table; entries are selected with a
//              16-byte stride (index << 4)
//   clobbers:  %eax, %ecx, flags, and the caller's first argument slot
// NOTE(review): entry 16 is presumably a catch-all for out-of-range
// indices -- confirm against the table setup.
328 GLOBL( _tnl_x86_dispatch_vertexattribf )
329 movl 4(%esp), %eax // index
330 cmpl $16, %eax
331 jb .8 // index < 16 (unsigned): keep it; a branch is used because cmov is not supported on all CPUs
332 movl $16, %eax // otherwise clamp to 16
333 .8:
334 leal 8(%esp), %ecx // calculate 'v'
335 movl %ecx, 4(%esp) // save in 1st arg slot
336 sall $4, %eax // index * 16 = byte offset into the table
337 jmp *SUBST(0)(%eax) // 0x0 - tabfv[0][n]
338 GLOBL( _tnl_x86_dispatch_vertexattribf_end )
339
// _tnl_x86_dispatch_vertexattribfv: "fv" VertexAttrib entry point.
// Clamps the first stack argument (treated as unsigned) to at most 16,
// moves the 'v' pointer from the second argument slot into the first,
// and jumps through a table based at SUBST(0).
//   in:        4(%esp) = index, 8(%esp) = v
//   SUBST(0):  base address of the table; entries are selected with a
//              16-byte stride (index << 4)
//   clobbers:  %eax, %ecx, flags, and the caller's first argument slot
// NOTE(review): entry 16 is presumably a catch-all for out-of-range
// indices -- confirm against the table setup.
340 GLOBL( _tnl_x86_dispatch_vertexattribfv )
341 movl 4(%esp), %eax // index
342 cmpl $16, %eax
343 jb .9 // index < 16 (unsigned): keep it; a branch is used because cmov is not supported on all CPUs
344 movl $16, %eax // otherwise clamp to 16
345 .9:
346 movl 8(%esp), %ecx // load 'v'
347 movl %ecx, 4(%esp) // save in 1st arg slot
348 sall $4, %eax // index * 16 = byte offset into the table
349 jmp *SUBST(0)(%eax) // 0x0 - tabfv[0][n]
350 GLOBL( _tnl_x86_dispatch_vertexattribfv_end )