radeon: Optimize memory handling for dma operations.
[mesa.git] / src / mesa / drivers / dri / radeon / radeon_dma.c
1 /**************************************************************************
2
3 Copyright (C) 2004 Nicolai Haehnle.
4 Copyright (C) The Weather Channel, Inc. 2002. All Rights Reserved.
5
6 The Weather Channel (TM) funded Tungsten Graphics to develop the
7 initial release of the Radeon 8500 driver under the XFree86 license.
8 This notice must be preserved.
9
10 All Rights Reserved.
11
12 Permission is hereby granted, free of charge, to any person obtaining a
13 copy of this software and associated documentation files (the "Software"),
14 to deal in the Software without restriction, including without limitation
15 on the rights to use, copy, modify, merge, publish, distribute, sub
16 license, and/or sell copies of the Software, and to permit persons to whom
17 the Software is furnished to do so, subject to the following conditions:
18
19 The above copyright notice and this permission notice (including the next
20 paragraph) shall be included in all copies or substantial portions of the
21 Software.
22
23 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25 FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
26 ATI, VA LINUX SYSTEMS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
27 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
28 OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
29 USE OR OTHER DEALINGS IN THE SOFTWARE.
30
31 **************************************************************************/
32
33 #include "radeon_common.h"
34 #include "main/simple_list.h"
35
#if defined(USE_X86_ASM)
/* Copy `nr` 32-bit words from `src` to `dst` using "rep movsl".
 * NOTE(review): the "%c"/"D"/"S" outputs receive ECX/EDI/ESI after the
 * copy; the updated source pointer is deliberately discarded into the
 * same `__tmp` as the count.  `dst` is left advanced past the copied
 * words, matching the C fallback below.  NOTE(review): no "memory"
 * clobber is declared — presumably safe for how callers use this, but
 * worth confirming. */
#define COPY_DWORDS( dst, src, nr )					\
do {									\
	int __tmp;							\
	__asm__ __volatile__( "rep ; movsl"				\
			      : "=%c" (__tmp), "=D" (dst), "=S" (__tmp)	\
			      : "0" (nr),				\
				"D" ((long)dst),			\
				"S" ((long)src) );			\
} while (0)
#else
/* Portable fallback: copy `nr` 32-bit words and advance `dst` by `nr`
 * entries (mirrors the pointer side effect of the asm version). */
#define COPY_DWORDS( dst, src, nr )		\
do {						\
	int j;					\
	for ( j = 0 ; j < nr ; j++ )		\
		dst[j] = ((int *)src)[j];	\
	dst += nr;				\
} while (0)
#endif
55
56 void radeonEmitVec4(uint32_t *out, const GLvoid * data, int stride, int count)
57 {
58 int i;
59
60 if (RADEON_DEBUG & DEBUG_VERTS)
61 fprintf(stderr, "%s count %d stride %d out %p data %p\n",
62 __FUNCTION__, count, stride, (void *)out, (void *)data);
63
64 if (stride == 4)
65 COPY_DWORDS(out, data, count);
66 else
67 for (i = 0; i < count; i++) {
68 out[0] = *(int *)data;
69 out++;
70 data += stride;
71 }
72 }
73
74 void radeonEmitVec8(uint32_t *out, const GLvoid * data, int stride, int count)
75 {
76 int i;
77
78 if (RADEON_DEBUG & DEBUG_VERTS)
79 fprintf(stderr, "%s count %d stride %d out %p data %p\n",
80 __FUNCTION__, count, stride, (void *)out, (void *)data);
81
82 if (stride == 8)
83 COPY_DWORDS(out, data, count * 2);
84 else
85 for (i = 0; i < count; i++) {
86 out[0] = *(int *)data;
87 out[1] = *(int *)(data + 4);
88 out += 2;
89 data += stride;
90 }
91 }
92
93 void radeonEmitVec12(uint32_t *out, const GLvoid * data, int stride, int count)
94 {
95 int i;
96
97 if (RADEON_DEBUG & DEBUG_VERTS)
98 fprintf(stderr, "%s count %d stride %d out %p data %p\n",
99 __FUNCTION__, count, stride, (void *)out, (void *)data);
100
101 if (stride == 12) {
102 COPY_DWORDS(out, data, count * 3);
103 }
104 else
105 for (i = 0; i < count; i++) {
106 out[0] = *(int *)data;
107 out[1] = *(int *)(data + 4);
108 out[2] = *(int *)(data + 8);
109 out += 3;
110 data += stride;
111 }
112 }
113
114 void radeonEmitVec16(uint32_t *out, const GLvoid * data, int stride, int count)
115 {
116 int i;
117
118 if (RADEON_DEBUG & DEBUG_VERTS)
119 fprintf(stderr, "%s count %d stride %d out %p data %p\n",
120 __FUNCTION__, count, stride, (void *)out, (void *)data);
121
122 if (stride == 16)
123 COPY_DWORDS(out, data, count * 4);
124 else
125 for (i = 0; i < count; i++) {
126 out[0] = *(int *)data;
127 out[1] = *(int *)(data + 4);
128 out[2] = *(int *)(data + 8);
129 out[3] = *(int *)(data + 12);
130 out += 4;
131 data += stride;
132 }
133 }
134
135 void rcommon_emit_vector(GLcontext * ctx, struct radeon_aos *aos,
136 const GLvoid * data, int size, int stride, int count)
137 {
138 radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
139 uint32_t *out;
140
141 if (stride == 0) {
142 radeonAllocDmaRegion(rmesa, &aos->bo, &aos->offset, size * 4, 32);
143 count = 1;
144 aos->stride = 0;
145 } else {
146 radeonAllocDmaRegion(rmesa, &aos->bo, &aos->offset, size * count * 4, 32);
147 aos->stride = size;
148 }
149
150 aos->components = size;
151 aos->count = count;
152
153 out = (uint32_t*)((char*)aos->bo->ptr + aos->offset);
154 switch (size) {
155 case 1: radeonEmitVec4(out, data, stride, count); break;
156 case 2: radeonEmitVec8(out, data, stride, count); break;
157 case 3: radeonEmitVec12(out, data, stride, count); break;
158 case 4: radeonEmitVec16(out, data, stride, count); break;
159 default:
160 assert(0);
161 break;
162 }
163 }
164
165 void radeon_init_dma(radeonContextPtr rmesa)
166 {
167 make_empty_list(&rmesa->dma.free);
168 make_empty_list(&rmesa->dma.wait);
169 make_empty_list(&rmesa->dma.reserved);
170 }
171
172 void radeonRefillCurrentDmaRegion(radeonContextPtr rmesa, int size)
173 {
174 size = MAX2(size, MAX_DMA_BUF_SZ);
175
176 if (RADEON_DEBUG & (DEBUG_IOCTL | DEBUG_DMA))
177 fprintf(stderr, "%s\n", __FUNCTION__);
178
179 if (rmesa->dma.flush) {
180 rmesa->dma.flush(rmesa->glCtx);
181 }
182
183 if (is_empty_list(&rmesa->dma.free)) {
184 struct radeon_dma_bo *dma_bo = CALLOC(sizeof(struct radeon_dma_bo));
185 assert(dma_bo);
186
187 again_alloc:
188 dma_bo->bo = radeon_bo_open(rmesa->radeonScreen->bom,
189 0, size, 4, RADEON_GEM_DOMAIN_GTT,
190 0);
191
192 if (!dma_bo->bo) {
193 rcommonFlushCmdBuf(rmesa, __FUNCTION__);
194 goto again_alloc;
195 }
196 insert_at_head(&rmesa->dma.reserved, dma_bo);
197 } else {
198 struct radeon_dma_bo *dma_bo = last_elem(&rmesa->dma.free);
199 assert(dma_bo->bo->cref == 1);
200 remove_from_list(dma_bo);
201 insert_at_head(&rmesa->dma.reserved, dma_bo);
202 }
203
204 rmesa->dma.current_used = 0;
205 rmesa->dma.current_vertexptr = 0;
206
207 if (radeon_cs_space_check_with_bo(rmesa->cmdbuf.cs,
208 first_elem(&rmesa->dma.reserved)->bo,
209 RADEON_GEM_DOMAIN_GTT, 0))
210 fprintf(stderr,"failure to revalidate BOs - badness\n");
211
212 if (is_empty_list(&rmesa->dma.reserved)) {
213 /* Cmd buff have been flushed in radeon_revalidate_bos */
214 goto again_alloc;
215 }
216
217 radeon_bo_map(first_elem(&rmesa->dma.reserved)->bo, 1);
218 }
219
220 /* Allocates a region from rmesa->dma.current. If there isn't enough
221 * space in current, grab a new buffer (and discard what was left of current)
222 */
223 void radeonAllocDmaRegion(radeonContextPtr rmesa,
224 struct radeon_bo **pbo, int *poffset,
225 int bytes, int alignment)
226 {
227 if (RADEON_DEBUG & DEBUG_IOCTL)
228 fprintf(stderr, "%s %d\n", __FUNCTION__, bytes);
229
230 if (rmesa->dma.flush)
231 rmesa->dma.flush(rmesa->glCtx);
232
233 assert(rmesa->dma.current_used == rmesa->dma.current_vertexptr);
234
235 alignment--;
236 rmesa->dma.current_used = (rmesa->dma.current_used + alignment) & ~alignment;
237
238 if (is_empty_list(&rmesa->dma.reserved)
239 || rmesa->dma.current_used + bytes > first_elem(&rmesa->dma.reserved)->bo->size)
240 radeonRefillCurrentDmaRegion(rmesa, (bytes + 15) & ~15);
241
242 *poffset = rmesa->dma.current_used;
243 *pbo = first_elem(&rmesa->dma.reserved)->bo;
244 radeon_bo_ref(*pbo);
245
246 /* Always align to at least 16 bytes */
247 rmesa->dma.current_used = (rmesa->dma.current_used + bytes + 15) & ~15;
248 rmesa->dma.current_vertexptr = rmesa->dma.current_used;
249
250 assert(rmesa->dma.current_used <= first_elem(&rmesa->dma.reserved)->bo->size);
251 }
252
253 void radeonFreeDmaRegions(radeonContextPtr rmesa)
254 {
255 struct radeon_dma_bo *dma_bo;
256 struct radeon_dma_bo *temp;
257 if (RADEON_DEBUG & DEBUG_IOCTL)
258 fprintf(stderr, "%s\n", __FUNCTION__);
259
260 foreach_s(dma_bo, temp, &rmesa->dma.free) {
261 remove_from_list(dma_bo);
262 radeon_bo_unref(dma_bo->bo);
263 FREE(dma_bo);
264 }
265
266 foreach_s(dma_bo, temp, &rmesa->dma.reserved) {
267 remove_from_list(dma_bo);
268 radeon_bo_unmap(dma_bo->bo);
269 radeon_bo_unref(dma_bo->bo);
270 FREE(dma_bo);
271 }
272 }
273
/* Retire the current reserved DMA buffer and age the recycling lists.
 * Buffers flow reserved -> wait -> free -> (reused or destroyed), with a
 * logical clock (expire_counter on the free-list head) ticking once per
 * call; the wait stage gives the GPU DMA_BO_FREE_TIME ticks to finish
 * reading a buffer before it can be reused. */
void radeonReleaseDmaRegions(radeonContextPtr rmesa)
{
	struct radeon_dma_bo *dma_bo;
	struct radeon_dma_bo *temp;
	/* Advance the clock by one tick; buffers retired now are stamped to
	 * expire DMA_BO_FREE_TIME ticks in the future. */
	const int expire_at = ++rmesa->dma.free.expire_counter + DMA_BO_FREE_TIME;
	const int time = rmesa->dma.free.expire_counter;
	if (RADEON_DEBUG & DEBUG_IOCTL)
		fprintf(stderr, "%s\n", __FUNCTION__);

	/* move waiting bos to free list.
	   wait list provides gpu time to handle data before reuse */
	foreach_s(dma_bo, temp, &rmesa->dma.wait) {
		if (dma_bo->expire_counter == time) {
			/* This bo has sat on the wait list a full expiry
			 * period and is still externally referenced below;
			 * drop our reference and give up tracking it. */
			WARN_ONCE("Leaking dma buffer object!\n");
			radeon_bo_unref(dma_bo->bo);
			remove_from_list(dma_bo);
			FREE(dma_bo);
			continue;
		}
		/* Still referenced elsewhere (e.g. queued commands):
		 * keep waiting. */
		if (dma_bo->bo->cref > 1)
			continue;
		remove_from_list(dma_bo);
		dma_bo->expire_counter = expire_at;
		insert_at_tail(&rmesa->dma.free, dma_bo);
	}

	/* unmap the last dma region */
	if (!is_empty_list(&rmesa->dma.reserved))
		radeon_bo_unmap(first_elem(&rmesa->dma.reserved)->bo);
	/* move reserved to wait list */
	foreach_s(dma_bo, temp, &rmesa->dma.reserved) {
		remove_from_list(dma_bo);
		dma_bo->expire_counter = expire_at;
		insert_at_tail(&rmesa->dma.wait, dma_bo);
	}

	/* free bos that have been unused for some time */
	foreach_s(dma_bo, temp, &rmesa->dma.free) {
		/* The free list is tail-appended in stamp order, so once a
		 * not-yet-expired entry is seen the rest are newer too. */
		if (dma_bo->expire_counter != time)
			break;
		remove_from_list(dma_bo);
		radeon_bo_unref(dma_bo->bo);
		FREE(dma_bo);
	}

}
320
321
/* Flush vertices in the current dma region.
 * Emits any software-TCL vertices accumulated since the last flush as a
 * primitive and advances current_used past them.
 */
void rcommon_flush_last_swtcl_prim( GLcontext *ctx )
{
	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
	struct radeon_dma *dma = &rmesa->dma;


	if (RADEON_DEBUG & DEBUG_IOCTL)
		fprintf(stderr, "%s\n", __FUNCTION__);
	/* Clear the flush callback up front so nothing re-enters this
	 * function while we emit. */
	dma->flush = NULL;

	if (!is_empty_list(&dma->reserved)) {
	    GLuint current_offset = dma->current_used;

	    /* Pending vertices occupy exactly the span between
	     * current_used and current_vertexptr. */
	    assert (dma->current_used +
		    rmesa->swtcl.numverts * rmesa->swtcl.vertex_size * 4 ==
		    dma->current_vertexptr);

	    if (dma->current_used != dma->current_vertexptr) {
		    /* Consume the pending span, then emit the primitive
		     * that draws it, starting at the old offset. */
		    dma->current_used = dma->current_vertexptr;

		    rmesa->vtbl.swtcl_flush(ctx, current_offset);
	    }
	    rmesa->swtcl.numverts = 0;
	}
}
/* Alloc space in the current dma region.
 * Reserves room for `nverts` software-TCL vertices of `vsize` bytes each
 * and returns a CPU pointer into the mapped DMA buffer where the caller
 * writes them.  Installs the swtcl flush callback on first use.
 */
void *
rcommonAllocDmaLowVerts( radeonContextPtr rmesa, int nverts, int vsize )
{
	GLuint bytes = vsize * nverts;
	void *head;
restart:
	if (RADEON_DEBUG & DEBUG_IOCTL)
		fprintf(stderr, "%s\n", __FUNCTION__);
	/* Refill when no buffer is reserved or the request overflows it. */
	if (is_empty_list(&rmesa->dma.reserved)
	    || rmesa->dma.current_vertexptr + bytes > first_elem(&rmesa->dma.reserved)->bo->size) {
		radeonRefillCurrentDmaRegion(rmesa, bytes);
	}

	if (!rmesa->dma.flush) {
		/* make sure we have enough space to use this in cmdbuf */
		rcommonEnsureCmdBufSpace(rmesa,
					 rmesa->hw.max_state_size + (20*sizeof(int)),
					 __FUNCTION__);
		/* if cmdbuf flushed DMA restart */
		if (is_empty_list(&rmesa->dma.reserved))
			goto restart;
		rmesa->glCtx->Driver.NeedFlush |= FLUSH_STORED_VERTICES;
		rmesa->dma.flush = rcommon_flush_last_swtcl_prim;
	}

	ASSERT( vsize == rmesa->swtcl.vertex_size * 4 );
	ASSERT( rmesa->dma.flush == rcommon_flush_last_swtcl_prim );
	/* current_vertexptr must sit exactly numverts vertices past
	 * current_used, i.e. all un-flushed vertices are contiguous. */
	ASSERT( rmesa->dma.current_used +
		rmesa->swtcl.numverts * rmesa->swtcl.vertex_size * 4 ==
		rmesa->dma.current_vertexptr );

	/* Hand back a pointer into the mapped bo and claim the space. */
	head = (first_elem(&rmesa->dma.reserved)->bo->ptr + rmesa->dma.current_vertexptr);
	rmesa->dma.current_vertexptr += bytes;
	rmesa->swtcl.numverts += nverts;
	return head;
}
387
388 void radeonReleaseArrays( GLcontext *ctx, GLuint newinputs )
389 {
390 radeonContextPtr radeon = RADEON_CONTEXT( ctx );
391 int i;
392 if (RADEON_DEBUG & DEBUG_IOCTL)
393 fprintf(stderr, "%s\n", __FUNCTION__);
394
395 if (radeon->dma.flush) {
396 radeon->dma.flush(radeon->glCtx);
397 }
398 for (i = 0; i < radeon->tcl.aos_count; i++) {
399 if (radeon->tcl.aos[i].bo) {
400 radeon_bo_unref(radeon->tcl.aos[i].bo);
401 radeon->tcl.aos[i].bo = NULL;
402
403 }
404 }
405 }