/**************************************************************************
 *
 * Copyright 2003 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/
29 #include "main/bufferobj.h"
30 #include "main/image.h"
31 #include "main/macros.h"
32 #include "main/mtypes.h"
34 #include "main/texobj.h"
35 #include "main/texstore.h"
36 #include "main/texcompress.h"
37 #include "main/enums.h"
38 #include "drivers/common/meta.h"
40 #include "brw_context.h"
41 #include "intel_batchbuffer.h"
42 #include "intel_tex.h"
43 #include "intel_mipmap_tree.h"
44 #include "intel_blit.h"
47 #include <tmmintrin.h>
#define FILE_DEBUG_FLAG DEBUG_TEXTURE

#define ALIGN_DOWN(a, b) ROUND_DOWN_TO(a, b)
#define ALIGN_UP(a, b) ALIGN(a, b)

/* Tile dimensions.
 * Width and span are in bytes, height is in pixels (i.e. unitless).
 * A "span" is the most number of bytes we can copy from linear to tiled
 * without needing to calculate a new destination address.
 */
static const uint32_t xtile_width  = 512;
static const uint32_t xtile_height = 8;
static const uint32_t xtile_span   = 64;
static const uint32_t ytile_width  = 128;
static const uint32_t ytile_height = 32;
static const uint32_t ytile_span   = 16;

/* Signature shared by memcpy and rgba8_copy so either can be used as the
 * per-span copy routine.
 */
typedef void *(*mem_copy_fn)(void *dest, const void *src, size_t n);
70 * Each row from y0 to y1 is copied in three parts: [x0,x1), [x1,x2), [x2,x3).
71 * These ranges are in bytes, i.e. pixels * bytes-per-pixel.
72 * The first and last ranges must be shorter than a "span" (the longest linear
73 * stretch within a tile) and the middle must equal a whole number of spans.
74 * Ranges may be empty. The region copied must land entirely within one tile.
75 * 'dst' is the start of the tile and 'src' is the corresponding
76 * address to copy from, though copying begins at (x0, y0).
77 * To enable swizzling 'swizzle_bit' must be 1<<6, otherwise zero.
78 * Swizzling flips bit 6 in the copy destination offset, when certain other
81 typedef void (*tile_copy_fn
)(uint32_t x0
, uint32_t x1
, uint32_t x2
, uint32_t x3
,
82 uint32_t y0
, uint32_t y1
,
83 char *dst
, const char *src
,
86 mem_copy_fn mem_copy
);
#ifdef __SSSE3__
#include <tmmintrin.h>

/* Shuffle mask that swaps bytes 0 and 2 of every 4-byte pixel. */
static const uint8_t rgba8_permutation[16] =
   { 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 };

/* NOTE: dst must be 16 byte aligned */
#define rgba8_copy_16(dst, src)                      \
   *(__m128i *)(dst) = _mm_shuffle_epi8(             \
      (__m128i) _mm_loadu_ps((float *)(src)),        \
      *(__m128i *) rgba8_permutation)
#endif

/**
 * Copy RGBA to BGRA - swap R and B.
 *
 * Returns 'dst', matching the memcpy contract so it can be used as a
 * mem_copy_fn.  'bytes' is a byte count; only whole 4-byte pixels are
 * converted by the scalar path.
 */
static inline void *
rgba8_copy(void *dst, const void *src, size_t bytes)
{
   uint8_t *d = dst;
   uint8_t const *s = src;

#ifdef __SSSE3__
   /* Fast copying for tile spans.
    *
    * As long as the destination texture is 16 aligned,
    * any 16 or 64 spans we get here should also be 16 aligned.
    */
   if (bytes == 16) {
      assert(!(((uintptr_t)dst) & 0xf));
      rgba8_copy_16(d + 0, s + 0);
      return dst;
   }

   if (bytes == 64) {
      assert(!(((uintptr_t)dst) & 0xf));
      rgba8_copy_16(d +  0, s +  0);
      rgba8_copy_16(d + 16, s + 16);
      rgba8_copy_16(d + 32, s + 32);
      rgba8_copy_16(d + 48, s + 48);
      return dst;
   }
#endif

   /* Scalar fallback: swap the first and third byte of each pixel. */
   while (bytes >= 4) {
      d[0] = s[2];
      d[1] = s[1];
      d[2] = s[0];
      d[3] = s[3];
      d += 4;
      s += 4;
      bytes -= 4;
   }

   return dst;
}
146 * Copy texture data from linear to X tile layout.
148 * \copydoc tile_copy_fn
151 xtile_copy(uint32_t x0
, uint32_t x1
, uint32_t x2
, uint32_t x3
,
152 uint32_t y0
, uint32_t y1
,
153 char *dst
, const char *src
,
155 uint32_t swizzle_bit
,
156 mem_copy_fn mem_copy
)
158 /* The copy destination offset for each range copied is the sum of
159 * an X offset 'x0' or 'xo' and a Y offset 'yo.'
163 src
+= y0
* src_pitch
;
165 for (yo
= y0
* xtile_width
; yo
< y1
* xtile_width
; yo
+= xtile_width
) {
166 /* Bits 9 and 10 of the copy destination offset control swizzling.
167 * Only 'yo' contributes to those bits in the total offset,
168 * so calculate 'swizzle' just once per row.
169 * Move bits 9 and 10 three and four places respectively down
170 * to bit 6 and xor them.
172 uint32_t swizzle
= ((yo
>> 3) ^ (yo
>> 4)) & swizzle_bit
;
174 mem_copy(dst
+ ((x0
+ yo
) ^ swizzle
), src
+ x0
, x1
- x0
);
176 for (xo
= x1
; xo
< x2
; xo
+= xtile_span
) {
177 mem_copy(dst
+ ((xo
+ yo
) ^ swizzle
), src
+ xo
, xtile_span
);
180 mem_copy(dst
+ ((xo
+ yo
) ^ swizzle
), src
+ x2
, x3
- x2
);
187 * Copy texture data from linear to Y tile layout.
189 * \copydoc tile_copy_fn
193 uint32_t x0
, uint32_t x1
, uint32_t x2
, uint32_t x3
,
194 uint32_t y0
, uint32_t y1
,
195 char *dst
, const char *src
,
197 uint32_t swizzle_bit
,
198 mem_copy_fn mem_copy
)
200 /* Y tiles consist of columns that are 'ytile_span' wide (and the same height
201 * as the tile). Thus the destination offset for (x,y) is the sum of:
202 * (x % column_width) // position within column
203 * (x / column_width) * bytes_per_column // column number * bytes per column
206 * The copy destination offset for each range copied is the sum of
207 * an X offset 'xo0' or 'xo' and a Y offset 'yo.'
209 const uint32_t column_width
= ytile_span
;
210 const uint32_t bytes_per_column
= column_width
* ytile_height
;
212 uint32_t xo0
= (x0
% ytile_span
) + (x0
/ ytile_span
) * bytes_per_column
;
213 uint32_t xo1
= (x1
% ytile_span
) + (x1
/ ytile_span
) * bytes_per_column
;
215 /* Bit 9 of the destination offset control swizzling.
216 * Only the X offset contributes to bit 9 of the total offset,
217 * so swizzle can be calculated in advance for these X positions.
218 * Move bit 9 three places down to bit 6.
220 uint32_t swizzle0
= (xo0
>> 3) & swizzle_bit
;
221 uint32_t swizzle1
= (xo1
>> 3) & swizzle_bit
;
225 src
+= y0
* src_pitch
;
227 for (yo
= y0
* column_width
; yo
< y1
* column_width
; yo
+= column_width
) {
229 uint32_t swizzle
= swizzle1
;
231 mem_copy(dst
+ ((xo0
+ yo
) ^ swizzle0
), src
+ x0
, x1
- x0
);
233 /* Step by spans/columns. As it happens, the swizzle bit flips
234 * at each step so we don't need to calculate it explicitly.
236 for (x
= x1
; x
< x2
; x
+= ytile_span
) {
237 mem_copy(dst
+ ((xo
+ yo
) ^ swizzle
), src
+ x
, ytile_span
);
238 xo
+= bytes_per_column
;
239 swizzle
^= swizzle_bit
;
242 mem_copy(dst
+ ((xo
+ yo
) ^ swizzle
), src
+ x2
, x3
- x2
);
249 * Copy texture data from linear to X tile layout, faster.
251 * Same as \ref xtile_copy but faster, because it passes constant parameters
252 * for common cases, allowing the compiler to inline code optimized for those
255 * \copydoc tile_copy_fn
258 xtile_copy_faster(uint32_t x0
, uint32_t x1
, uint32_t x2
, uint32_t x3
,
259 uint32_t y0
, uint32_t y1
,
260 char *dst
, const char *src
,
262 uint32_t swizzle_bit
,
263 mem_copy_fn mem_copy
)
265 if (x0
== 0 && x3
== xtile_width
&& y0
== 0 && y1
== xtile_height
) {
266 if (mem_copy
== memcpy
)
267 return xtile_copy(0, 0, xtile_width
, xtile_width
, 0, xtile_height
,
268 dst
, src
, src_pitch
, swizzle_bit
, memcpy
);
269 else if (mem_copy
== rgba8_copy
)
270 return xtile_copy(0, 0, xtile_width
, xtile_width
, 0, xtile_height
,
271 dst
, src
, src_pitch
, swizzle_bit
, rgba8_copy
);
273 if (mem_copy
== memcpy
)
274 return xtile_copy(x0
, x1
, x2
, x3
, y0
, y1
,
275 dst
, src
, src_pitch
, swizzle_bit
, memcpy
);
276 else if (mem_copy
== rgba8_copy
)
277 return xtile_copy(x0
, x1
, x2
, x3
, y0
, y1
,
278 dst
, src
, src_pitch
, swizzle_bit
, rgba8_copy
);
280 xtile_copy(x0
, x1
, x2
, x3
, y0
, y1
,
281 dst
, src
, src_pitch
, swizzle_bit
, mem_copy
);
285 * Copy texture data from linear to Y tile layout, faster.
287 * Same as \ref ytile_copy but faster, because it passes constant parameters
288 * for common cases, allowing the compiler to inline code optimized for those
291 * \copydoc tile_copy_fn
294 ytile_copy_faster(uint32_t x0
, uint32_t x1
, uint32_t x2
, uint32_t x3
,
295 uint32_t y0
, uint32_t y1
,
296 char *dst
, const char *src
,
298 uint32_t swizzle_bit
,
299 mem_copy_fn mem_copy
)
301 if (x0
== 0 && x3
== ytile_width
&& y0
== 0 && y1
== ytile_height
) {
302 if (mem_copy
== memcpy
)
303 return ytile_copy(0, 0, ytile_width
, ytile_width
, 0, ytile_height
,
304 dst
, src
, src_pitch
, swizzle_bit
, memcpy
);
305 else if (mem_copy
== rgba8_copy
)
306 return ytile_copy(0, 0, ytile_width
, ytile_width
, 0, ytile_height
,
307 dst
, src
, src_pitch
, swizzle_bit
, rgba8_copy
);
309 if (mem_copy
== memcpy
)
310 return ytile_copy(x0
, x1
, x2
, x3
, y0
, y1
,
311 dst
, src
, src_pitch
, swizzle_bit
, memcpy
);
312 else if (mem_copy
== rgba8_copy
)
313 return ytile_copy(x0
, x1
, x2
, x3
, y0
, y1
,
314 dst
, src
, src_pitch
, swizzle_bit
, rgba8_copy
);
316 ytile_copy(x0
, x1
, x2
, x3
, y0
, y1
,
317 dst
, src
, src_pitch
, swizzle_bit
, mem_copy
);
321 * Copy from linear to tiled texture.
323 * Divide the region given by X range [xt1, xt2) and Y range [yt1, yt2) into
324 * pieces that do not cross tile boundaries and copy each piece with a tile
325 * copy function (\ref tile_copy_fn).
326 * The X range is in bytes, i.e. pixels * bytes-per-pixel.
327 * The Y range is in pixels (i.e. unitless).
328 * 'dst' is the start of the texture and 'src' is the corresponding
329 * address to copy from, though copying begins at (xt1, yt1).
332 linear_to_tiled(uint32_t xt1
, uint32_t xt2
,
333 uint32_t yt1
, uint32_t yt2
,
334 char *dst
, const char *src
,
335 uint32_t dst_pitch
, uint32_t src_pitch
,
338 mem_copy_fn mem_copy
)
340 tile_copy_fn tile_copy
;
344 uint32_t tw
, th
, span
;
345 uint32_t swizzle_bit
= has_swizzling
? 1<<6 : 0;
347 if (tiling
== I915_TILING_X
) {
351 tile_copy
= xtile_copy_faster
;
352 } else if (tiling
== I915_TILING_Y
) {
356 tile_copy
= ytile_copy_faster
;
358 unreachable("unsupported tiling");
361 /* Round out to tile boundaries. */
362 xt0
= ALIGN_DOWN(xt1
, tw
);
363 xt3
= ALIGN_UP (xt2
, tw
);
364 yt0
= ALIGN_DOWN(yt1
, th
);
365 yt3
= ALIGN_UP (yt2
, th
);
367 /* Loop over all tiles to which we have something to copy.
368 * 'xt' and 'yt' are the origin of the destination tile, whether copying
369 * copying a full or partial tile.
370 * tile_copy() copies one tile or partial tile.
371 * Looping x inside y is the faster memory access pattern.
373 for (yt
= yt0
; yt
< yt3
; yt
+= th
) {
374 for (xt
= xt0
; xt
< xt3
; xt
+= tw
) {
375 /* The area to update is [x0,x3) x [y0,y1).
376 * May not want the whole tile, hence the min and max.
378 uint32_t x0
= MAX2(xt1
, xt
);
379 uint32_t y0
= MAX2(yt1
, yt
);
380 uint32_t x3
= MIN2(xt2
, xt
+ tw
);
381 uint32_t y1
= MIN2(yt2
, yt
+ th
);
383 /* [x0,x3) is split into [x0,x1), [x1,x2), [x2,x3) such that
384 * the middle interval is the longest span-aligned part.
385 * The sub-ranges could be empty.
388 x1
= ALIGN_UP(x0
, span
);
392 x2
= ALIGN_DOWN(x3
, span
);
394 assert(x0
<= x1
&& x1
<= x2
&& x2
<= x3
);
395 assert(x1
- x0
< span
&& x3
- x2
< span
);
396 assert(x3
- x0
<= tw
);
397 assert((x2
- x1
) % span
== 0);
399 /* Translate by (xt,yt) for single-tile copier. */
400 tile_copy(x0
-xt
, x1
-xt
, x2
-xt
, x3
-xt
,
402 dst
+ (ptrdiff_t) xt
* th
+ (ptrdiff_t) yt
* dst_pitch
,
403 src
+ (ptrdiff_t) xt
+ (ptrdiff_t) yt
* src_pitch
,
412 * \brief A fast path for glTexImage and glTexSubImage.
414 * \param for_glTexImage Was this called from glTexImage or glTexSubImage?
416 * This fast path is taken when the texture format is BGRA, RGBA,
417 * A or L and when the texture memory is X- or Y-tiled. It uploads
418 * the texture data by mapping the texture memory without a GTT fence, thus
419 * acquiring a tiled view of the memory, and then copying sucessive
420 * spans within each tile.
422 * This is a performance win over the conventional texture upload path because
423 * it avoids the performance penalty of writing through the write-combine
424 * buffer. In the conventional texture upload path,
425 * texstore.c:store_texsubimage(), the texture memory is mapped through a GTT
426 * fence, thus acquiring a linear view of the memory, then each row in the
427 * image is memcpy'd. In this fast path, we replace each row's copy with
428 * a sequence of copies over each linear span in tile.
430 * One use case is Google Chrome's paint rectangles. Chrome (as
431 * of version 21) renders each page as a tiling of 256x256 GL_BGRA textures.
432 * Each page's content is initially uploaded with glTexImage2D and damaged
433 * regions are updated with glTexSubImage2D. On some workloads, the
434 * performance gain of this fastpath on Sandybridge is over 5x.
437 intel_texsubimage_tiled_memcpy(struct gl_context
* ctx
,
439 struct gl_texture_image
*texImage
,
440 GLint xoffset
, GLint yoffset
, GLint zoffset
,
441 GLsizei width
, GLsizei height
, GLsizei depth
,
442 GLenum format
, GLenum type
,
443 const GLvoid
*pixels
,
444 const struct gl_pixelstore_attrib
*packing
,
447 struct brw_context
*brw
= brw_context(ctx
);
448 struct intel_texture_image
*image
= intel_texture_image(texImage
);
451 /* The miptree's buffer. */
457 mem_copy_fn mem_copy
= NULL
;
459 /* This fastpath is restricted to specific texture types:
460 * a 2D BGRA, RGBA, L8 or A8 texture. It could be generalized to support
463 * FINISHME: The restrictions below on packing alignment and packing row
464 * length are likely unneeded now because we calculate the source stride
465 * with _mesa_image_row_stride. However, before removing the restrictions
469 !(type
== GL_UNSIGNED_BYTE
|| type
== GL_UNSIGNED_INT_8_8_8_8_REV
) ||
470 texImage
->TexObject
->Target
!= GL_TEXTURE_2D
||
472 _mesa_is_bufferobj(packing
->BufferObj
) ||
473 packing
->Alignment
> 4 ||
474 packing
->SkipPixels
> 0 ||
475 packing
->SkipRows
> 0 ||
476 (packing
->RowLength
!= 0 && packing
->RowLength
!= width
) ||
477 packing
->SwapBytes
||
482 if (type
== GL_UNSIGNED_INT_8_8_8_8_REV
&&
483 !(format
== GL_RGBA
|| format
== GL_BGRA
))
484 return false; /* Invalid type/format combination */
486 if ((texImage
->TexFormat
== MESA_FORMAT_L_UNORM8
&& format
== GL_LUMINANCE
) ||
487 (texImage
->TexFormat
== MESA_FORMAT_A_UNORM8
&& format
== GL_ALPHA
)) {
490 } else if ((texImage
->TexFormat
== MESA_FORMAT_B8G8R8A8_UNORM
) ||
491 (texImage
->TexFormat
== MESA_FORMAT_B8G8R8X8_UNORM
)) {
493 if (format
== GL_BGRA
) {
495 } else if (format
== GL_RGBA
) {
496 mem_copy
= rgba8_copy
;
498 } else if ((texImage
->TexFormat
== MESA_FORMAT_R8G8B8A8_UNORM
) ||
499 (texImage
->TexFormat
== MESA_FORMAT_R8G8B8X8_UNORM
)) {
501 if (format
== GL_BGRA
) {
502 /* Copying from RGBA to BGRA is the same as BGRA to RGBA so we can
503 * use the same function.
505 mem_copy
= rgba8_copy
;
506 } else if (format
== GL_RGBA
) {
513 /* If this is a nontrivial texture view, let another path handle it instead. */
514 if (texImage
->TexObject
->MinLayer
)
518 ctx
->Driver
.AllocTextureImageBuffer(ctx
, texImage
);
521 (image
->mt
->tiling
!= I915_TILING_X
&&
522 image
->mt
->tiling
!= I915_TILING_Y
)) {
523 /* The algorithm is written only for X- or Y-tiled memory. */
527 /* Since we are going to write raw data to the miptree, we need to resolve
528 * any pending fast color clears before we start.
530 intel_miptree_resolve_color(brw
, image
->mt
);
534 if (drm_intel_bo_references(brw
->batch
.bo
, bo
)) {
535 perf_debug("Flushing before mapping a referenced bo.\n");
536 intel_batchbuffer_flush(brw
);
539 error
= brw_bo_map(brw
, bo
, true /* write enable */, "miptree");
540 if (error
|| bo
->virtual == NULL
) {
541 DBG("%s: failed to map bo\n", __FUNCTION__
);
545 src_pitch
= _mesa_image_row_stride(packing
, width
, format
, type
);
547 /* We postponed printing this message until having committed to executing
550 DBG("%s: level=%d offset=(%d,%d) (w,h)=(%d,%d) format=0x%x type=0x%x "
551 "mesa_format=0x%x tiling=%d "
552 "packing=(alignment=%d row_length=%d skip_pixels=%d skip_rows=%d) "
553 "for_glTexImage=%d\n",
554 __FUNCTION__
, texImage
->Level
, xoffset
, yoffset
, width
, height
,
555 format
, type
, texImage
->TexFormat
, image
->mt
->tiling
,
556 packing
->Alignment
, packing
->RowLength
, packing
->SkipPixels
,
557 packing
->SkipRows
, for_glTexImage
);
559 int level
= texImage
->Level
+ texImage
->TexObject
->MinLevel
;
561 /* Adjust x and y offset based on miplevel */
562 xoffset
+= image
->mt
->level
[level
].level_x
;
563 yoffset
+= image
->mt
->level
[level
].level_y
;
566 xoffset
* cpp
, (xoffset
+ width
) * cpp
,
567 yoffset
, yoffset
+ height
,
569 pixels
- (ptrdiff_t) yoffset
* src_pitch
- (ptrdiff_t) xoffset
* cpp
,
570 image
->mt
->pitch
, src_pitch
,
576 drm_intel_bo_unmap(bo
);
581 intelTexSubImage(struct gl_context
* ctx
,
583 struct gl_texture_image
*texImage
,
584 GLint xoffset
, GLint yoffset
, GLint zoffset
,
585 GLsizei width
, GLsizei height
, GLsizei depth
,
586 GLenum format
, GLenum type
,
587 const GLvoid
* pixels
,
588 const struct gl_pixelstore_attrib
*packing
)
590 struct intel_texture_image
*intelImage
= intel_texture_image(texImage
);
593 bool tex_busy
= intelImage
->mt
&& drm_intel_bo_busy(intelImage
->mt
->bo
);
595 DBG("%s mesa_format %s target %s format %s type %s level %d %dx%dx%d\n",
596 __FUNCTION__
, _mesa_get_format_name(texImage
->TexFormat
),
597 _mesa_lookup_enum_by_nr(texImage
->TexObject
->Target
),
598 _mesa_lookup_enum_by_nr(format
), _mesa_lookup_enum_by_nr(type
),
599 texImage
->Level
, texImage
->Width
, texImage
->Height
, texImage
->Depth
);
601 ok
= _mesa_meta_pbo_TexSubImage(ctx
, dims
, texImage
,
602 xoffset
, yoffset
, zoffset
,
603 width
, height
, depth
, format
, type
,
604 pixels
, false, tex_busy
, packing
);
608 ok
= intel_texsubimage_tiled_memcpy(ctx
, dims
, texImage
,
609 xoffset
, yoffset
, zoffset
,
610 width
, height
, depth
,
611 format
, type
, pixels
, packing
,
612 false /*for_glTexImage*/);
616 _mesa_store_texsubimage(ctx
, dims
, texImage
,
617 xoffset
, yoffset
, zoffset
,
618 width
, height
, depth
,
619 format
, type
, pixels
, packing
);
623 intelInitTextureSubImageFuncs(struct dd_function_table
*functions
)
625 functions
->TexSubImage
= intelTexSubImage
;