/*
 * (C) Copyright IBM Corporation 2004
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

/**
 * \file read_rgba_span_x86.S
 * Optimized routines to transfer pixel data from the framebuffer to a
 * buffer in main memory.
 *
 * \author Ian Romanick <idr@us.ibm.com>
 */
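
/* All of the routines below use the cdecl calling convention and take the
 * same three stack arguments: a source pointer, a destination pointer, and
 * a pixel count (see the "source pointer" / "destination pointer" /
 * "number of pixels to copy" argument loads in each routine).  A matching C
 * prototype would look roughly like:
 *
 *     void _generic_read_RGBA_span_BGRA8888_REV_MMX(const unsigned char *src,
 *                                                    unsigned char *dst,
 *                                                    unsigned count);
 */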

        .file   "read_rgba_span_x86.S"
#if !defined(__DJGPP__) && !defined(__MINGW32__) /* this one cries for assyntax.h */
        .section .rodata
        .align  16
        .type   mask, @object
        .size   mask, 32
mask:
        .long   0xff00ff00
        .long   0xff00ff00
        .long   0xff00ff00
        .long   0xff00ff00
        .long   0x00ff0000
        .long   0x00ff0000
        .long   0x00ff0000
        .long   0x00ff0000
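
/* The first four dwords (0xff00ff00) keep the alpha and green bytes of a
 * BGRA8888_REV pixel in place; the second four (0x00ff0000) select the byte
 * lane that the red and blue components are exchanged through in the
 * MMX, SSE, and SSE2 loops below.
 */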


/* I implemented these as macros because they appear in quite a few places,
 * and I've tweaked them a number of times.  I got tired of changing every
 * place they appear. :)
 */

#define DO_ONE_PIXEL() \
        movl    (%ebx), %eax ; \
        addl    $4, %ebx ; \
        bswap   %eax          /* ARGB -> BGRA */ ; \
        rorl    $8, %eax      /* BGRA -> ABGR */ ; \
        movl    %eax, (%ecx)  /* ABGR -> R, G, B, A */ ; \
        addl    $4, %ecx

#define DO_ONE_LAST_PIXEL() \
        movl    (%ebx), %eax ; \
        bswap   %eax          /* ARGB -> BGRA */ ; \
        rorl    $8, %eax      /* BGRA -> ABGR */ ; \
        movl    %eax, (%ecx)  /* ABGR -> R, G, B, A */ ; \

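
/* In C terms, each of these macros converts one pixel roughly as:
 *
 *     out = (in & 0xff00ff00) | ((in >> 16) & 0xff) | ((in & 0xff) << 16);
 *
 * i.e. the red and blue bytes trade places while green and alpha stay put.
 */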

/**
 * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

        .globl  _generic_read_RGBA_span_BGRA8888_REV_MMX
        .type   _generic_read_RGBA_span_BGRA8888_REV_MMX, @function
_generic_read_RGBA_span_BGRA8888_REV_MMX:
        pushl   %ebx

#ifdef USE_INNER_EMMS
        emms
#endif
        movq    mask, %mm1
        movq    mask+16, %mm2

        movl    8(%esp), %ebx   /* source pointer */
        movl    16(%esp), %edx  /* number of pixels to copy */
        movl    12(%esp), %ecx  /* destination pointer */

        testl   %edx, %edx
        je      .L20            /* Bail if there's nothing to do. */

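        /* If the source pointer is 4-byte aligned but not 8-byte aligned,
         * convert a single pixel with the scalar macro first so that the
         * movq loads in the main loop start on an 8-byte boundary.
         */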
        movl    %ebx, %eax

        negl    %eax
        sarl    $2, %eax
        andl    $1, %eax
        je      .L17

        subl    %eax, %edx
        DO_ONE_PIXEL()
.L17:

        /* Would it be faster to unroll this loop once and process 4 pixels
         * per pass, instead of just two?
         */

        movl    %edx, %eax
        shrl    %eax
        jmp     .L18
.L19:
        movq    (%ebx), %mm0
        addl    $8, %ebx

        /* These 9 instructions do what PSHUFB (if there were such an
         * instruction) could do in 1. :(
         */
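
        /* For each 32-bit pixel the sequence computes, in C terms, roughly:
         *
         *     out = (p & 0xff00ff00)
         *         | ((p & 0x00ff0000) >> 16)
         *         | ((p << 16) & 0x00ff0000);
         */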

        movq    %mm0, %mm3
        movq    %mm0, %mm4

        pand    %mm2, %mm3
        psllq   $16, %mm4
        psrlq   $16, %mm3
        pand    %mm2, %mm4

        pand    %mm1, %mm0
        por     %mm4, %mm3
        por     %mm3, %mm0

        movq    %mm0, (%ecx)
        addl    $8, %ecx
        subl    $1, %eax
.L18:
        jne     .L19

#ifdef USE_INNER_EMMS
        emms
#endif

        /* At this point there are either 1 or 0 pixels remaining to be
         * converted.  Convert the last pixel, if needed.
         */

        testl   $1, %edx
        je      .L20

        DO_ONE_LAST_PIXEL()

.L20:
        popl    %ebx
        ret
        .size   _generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX


/**
 * SSE optimized version of the BGRA8888_REV to RGBA copy routine.  SSE
 * instructions are only actually used to read data from the framebuffer.
 * In practice, the speed-up is pretty small.
 *
 * \todo
 * Do some more testing and determine if there's any reason to have this
 * function in addition to the MMX version.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

        .globl  _generic_read_RGBA_span_BGRA8888_REV_SSE
        .type   _generic_read_RGBA_span_BGRA8888_REV_SSE, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE:
        pushl   %esi
        pushl   %ebx
        pushl   %ebp

#ifdef USE_INNER_EMMS
        emms
#endif
        movq    mask, %mm1
        movq    mask+16, %mm2

        movl    16(%esp), %ebx  /* source pointer */
        movl    24(%esp), %edx  /* number of pixels to copy */
        movl    20(%esp), %ecx  /* destination pointer */

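        /* Save the incoming stack pointer in %ebp and carve out a 16-byte
         * aligned scratch slot on the stack.  The inner loop spills %xmm0
         * there with movaps so the data can be reloaded into two MMX
         * registers.
         */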
        movl    %esp, %ebp
        subl    $16, %esp
        andl    $0xfffffff0, %esp

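        /* Work out how many leading pixels (0 to 3) must be converted before
         * the source pointer is 16-byte aligned for movaps, clamp that count
         * to the total pixel count, and subtract it from the number left for
         * the main loop.
         */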
        movl    %ebx, %eax
        movl    %edx, %esi

        negl    %eax
        andl    $15, %eax
        sarl    $2, %eax
        cmpl    %edx, %eax
        cmovle  %eax, %esi

        subl    %esi, %edx

        testl   $1, %esi
        je      .L32

        DO_ONE_PIXEL()
.L32:

        testl   $2, %esi
        je      .L31

        movq    (%ebx), %mm0
        addl    $8, %ebx

        movq    %mm0, %mm3
        movq    %mm0, %mm4

        pand    %mm2, %mm3
        psllq   $16, %mm4
        psrlq   $16, %mm3
        pand    %mm2, %mm4

        pand    %mm1, %mm0
        por     %mm4, %mm3
        por     %mm3, %mm0

        movq    %mm0, (%ecx)
        addl    $8, %ecx
.L31:

        movl    %edx, %eax
        shrl    $2, %eax
        jmp     .L33
.L34:
        movaps  (%ebx), %xmm0
        addl    $16, %ebx

        /* This would be so much better if we could just move directly from
         * an SSE register to an MMX register.  Unfortunately, that
         * functionality wasn't introduced until SSE2 with the MOVDQ2Q
         * instruction.
         */

        movaps  %xmm0, (%esp)
        movq    (%esp), %mm0
        movq    8(%esp), %mm5

        movq    %mm0, %mm3
        movq    %mm0, %mm4
        movq    %mm5, %mm6
        movq    %mm5, %mm7

        pand    %mm2, %mm3
        pand    %mm2, %mm6

        psllq   $16, %mm4
        psllq   $16, %mm7

        psrlq   $16, %mm3
        psrlq   $16, %mm6

        pand    %mm2, %mm4
        pand    %mm2, %mm7

        pand    %mm1, %mm0
        pand    %mm1, %mm5

        por     %mm4, %mm3
        por     %mm7, %mm6

        por     %mm3, %mm0
        por     %mm6, %mm5

        movq    %mm0, (%ecx)
        movq    %mm5, 8(%ecx)
        addl    $16, %ecx

        subl    $1, %eax
.L33:
        jne     .L34

#ifdef USE_INNER_EMMS
        emms
#endif
        movl    %ebp, %esp

        /* At this point there are either [0, 3] pixels remaining to be
         * converted.
         */

        testl   $2, %edx
        je      .L36

        movq    (%ebx), %mm0
        addl    $8, %ebx

        movq    %mm0, %mm3
        movq    %mm0, %mm4

        pand    %mm2, %mm3
        psllq   $16, %mm4
        psrlq   $16, %mm3
        pand    %mm2, %mm4

        pand    %mm1, %mm0
        por     %mm4, %mm3
        por     %mm3, %mm0

        movq    %mm0, (%ecx)
        addl    $8, %ecx
.L36:

        testl   $1, %edx
        je      .L35

        DO_ONE_LAST_PIXEL()
.L35:
        popl    %ebp
        popl    %ebx
        popl    %esi
        ret
        .size   _generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE


/**
 * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
 */

        .text
        .globl  _generic_read_RGBA_span_BGRA8888_REV_SSE2
        .type   _generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE2:
        pushl   %esi
        pushl   %ebx

        movdqa  mask, %xmm1
        movdqa  mask+16, %xmm2

        movl    12(%esp), %ebx  /* source pointer */
        movl    20(%esp), %edx  /* number of pixels to copy */
        movl    16(%esp), %ecx  /* destination pointer */

        movl    %ebx, %eax
        movl    %edx, %esi

        /* If the source pointer isn't a multiple of 16 we have to process
         * a few pixels the "slow" way to get the address aligned for
         * the SSE fetch instructions.
         */

        negl    %eax
        andl    $15, %eax
        sarl    $2, %eax

        cmpl    %edx, %eax
        cmovbe  %eax, %esi
        subl    %esi, %edx

        testl   $1, %esi
        je      .L41

        DO_ONE_PIXEL()
.L41:
        testl   $2, %esi
        je      .L40

        movq    (%ebx), %xmm0
        addl    $8, %ebx

        movdqa  %xmm0, %xmm3
        movdqa  %xmm0, %xmm4
        andps   %xmm1, %xmm0

        andps   %xmm2, %xmm3
        pslldq  $2, %xmm4
        psrldq  $2, %xmm3
        andps   %xmm2, %xmm4

        orps    %xmm4, %xmm3
        orps    %xmm3, %xmm0

        movq    %xmm0, (%ecx)
        addl    $8, %ecx
.L40:

        /* Would it be worth having a specialized version of this loop for
         * the case where the destination is 16-byte aligned?  That version
         * would be identical except that it could use movdqa instead of
         * movdqu.
         */

        movl    %edx, %eax
        shrl    $2, %eax
        jmp     .L42
.L43:
        movdqa  (%ebx), %xmm0
        addl    $16, %ebx

        movdqa  %xmm0, %xmm3
        movdqa  %xmm0, %xmm4
        andps   %xmm1, %xmm0

        andps   %xmm2, %xmm3
        pslldq  $2, %xmm4
        psrldq  $2, %xmm3
        andps   %xmm2, %xmm4

        orps    %xmm4, %xmm3
        orps    %xmm3, %xmm0

        movdqu  %xmm0, (%ecx)
        addl    $16, %ecx
        subl    $1, %eax
.L42:
        jne     .L43


        /* There may be up to 3 pixels remaining to be copied.  Take care
         * of them now.  We do the 2-pixel case first because the data
         * will be aligned.
         */

        testl   $2, %edx
        je      .L47

        movq    (%ebx), %xmm0
        addl    $8, %ebx

        movdqa  %xmm0, %xmm3
        movdqa  %xmm0, %xmm4
        andps   %xmm1, %xmm0

        andps   %xmm2, %xmm3
        pslldq  $2, %xmm4
        psrldq  $2, %xmm3
        andps   %xmm2, %xmm4

        orps    %xmm4, %xmm3
        orps    %xmm3, %xmm0

        movq    %xmm0, (%ecx)
        addl    $8, %ecx
.L47:

        testl   $1, %edx
        je      .L46

        DO_ONE_LAST_PIXEL()
.L46:

        popl    %ebx
        popl    %esi
        ret
        .size   _generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2



        .section .rodata

        .align  16
mask_565:
        .word   0xf800
        .word   0x07e0
        .word   0x001f
        .word   0x0000

/* Setting SCALE_ADJUST to 5 gives a perfect match with the classic C
 * implementation in Mesa.  Setting SCALE_ADJUST to 0 is slightly faster but
 * at a small cost to accuracy.
 */

#define SCALE_ADJUST 5
#if SCALE_ADJUST == 5
prescale:
        .word   0x0001
        .word   0x0010
        .word   0x0200
        .word   0x0000

scale:
        .word   0x20e8          /* (0x00ff0000 / 0x000007c0) + 1 */
        .word   0x40c5          /* (0x00ff0000 / 0x000003f0) + 1 */
        .word   0x839d          /* (0x00ff0000 / 0x000001f0) + 1 */
        .word   0x0000
#elif SCALE_ADJUST == 0
prescale:
        .word   0x0001
        .word   0x0020
        .word   0x0800
        .word   0x0000

scale:
        .word   0x0108          /* (0x00ff0000 / 0x0000f800) + 1 */
        .word   0x0104          /* (0x00ff0000 / 0x0000fc00) + 1 */
        .word   0x0108          /* (0x00ff0000 / 0x0000f800) + 1 */
        .word   0x0000
#else
#error SCALE_ADJUST must either be 5 or 0.
#endif
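
/* Worked example for the red channel with SCALE_ADJUST == 5: the 5-bit red
 * value r sits in bits 15-11, so the masked word is r << 11.  Multiplying by
 * the prescale value 0x0001 and shifting right by 5 leaves r << 6 (at most
 * 0x7c0).  pmulhuw with 0x20e8 then yields ((r << 6) * 0x20e8) >> 16, which
 * maps 0..31 onto 0..255 (the same result as the integer expression
 * r * 255 / 31).  The green and blue channels work the same way with their
 * own prescale and scale values.
 */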


alpha:  .long   0x00000000
        .long   0x00ff0000
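
/* As a little-endian quadword, alpha has 0x00ff in its top (fourth) 16-bit
 * word and zeros elsewhere.  ORing it into a register whose words hold the
 * scaled R, G, B values and a zero alpha word forces that word to 0x00ff,
 * so packuswb produces an alpha byte of 0xff.
 */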

/**
 * MMX optimized version of the RGB565 to RGBA copy routine.
 */

        .text
        .globl  _generic_read_RGBA_span_RGB565_MMX
        .type   _generic_read_RGBA_span_RGB565_MMX, @function

_generic_read_RGBA_span_RGB565_MMX:

#ifdef USE_INNER_EMMS
        emms
#endif

        movl    4(%esp), %eax   /* source pointer */
        movl    8(%esp), %edx   /* destination pointer */
        movl    12(%esp), %ecx  /* number of pixels to copy */

        movq    mask_565, %mm5
        movq    prescale, %mm6
        movq    scale, %mm7

        shrl    $2, %ecx
        jmp     .L02

.L03:
        /* Fetch 4 RGB565 pixels into %mm4.  Distribute the first and
         * second pixels into the four words of %mm0 and %mm2.
         */

        movq    (%eax), %mm4
        addl    $8, %eax

        pshufw  $0x00, %mm4, %mm0
        pshufw  $0x55, %mm4, %mm2


        /* Mask the pixels so that each word of each register contains only
         * one color component.
         */

        pand    %mm5, %mm0
        pand    %mm5, %mm2


        /* Adjust the component values so that they are as small as possible,
         * but large enough so that we can multiply them by an unsigned 16-bit
         * number and get a value as large as 0x00ff0000.
         */

        pmullw  %mm6, %mm0
        pmullw  %mm6, %mm2
#if SCALE_ADJUST > 0
        psrlw   $SCALE_ADJUST, %mm0
        psrlw   $SCALE_ADJUST, %mm2
#endif

        /* Scale the input component values to be in the range
         * [0, 0x00ff0000].  This is the real magic of the whole routine.
         */

        pmulhuw %mm7, %mm0
        pmulhuw %mm7, %mm2


        /* Always set the alpha value to 0xff.
         */

        por     alpha, %mm0
        por     alpha, %mm2


        /* Pack the 16-bit values to 8-bit values and store the converted
         * pixel data.
         */

        packuswb %mm2, %mm0
        movq    %mm0, (%edx)
        addl    $8, %edx


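        /* Distribute the third and fourth pixels into %mm0 and %mm2 and
         * convert them the same way.
         */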
        pshufw  $0xaa, %mm4, %mm0
        pshufw  $0xff, %mm4, %mm2

        pand    %mm5, %mm0
        pand    %mm5, %mm2
        pmullw  %mm6, %mm0
        pmullw  %mm6, %mm2
#if SCALE_ADJUST > 0
        psrlw   $SCALE_ADJUST, %mm0
        psrlw   $SCALE_ADJUST, %mm2
#endif
        pmulhuw %mm7, %mm0
        pmulhuw %mm7, %mm2

        por     alpha, %mm0
        por     alpha, %mm2

        packuswb %mm2, %mm0

        movq    %mm0, (%edx)
        addl    $8, %edx

        subl    $1, %ecx
.L02:
        jne     .L03


        /* At this point there can be at most 3 pixels left to process.  If
         * there are either 2 or 3 left, process 2.
         */

        movl    12(%esp), %ecx
        testl   $0x02, %ecx
        je      .L04

        movd    (%eax), %mm4
        addl    $4, %eax

        pshufw  $0x00, %mm4, %mm0
        pshufw  $0x55, %mm4, %mm2

        pand    %mm5, %mm0
        pand    %mm5, %mm2
        pmullw  %mm6, %mm0
        pmullw  %mm6, %mm2
#if SCALE_ADJUST > 0
        psrlw   $SCALE_ADJUST, %mm0
        psrlw   $SCALE_ADJUST, %mm2
#endif
        pmulhuw %mm7, %mm0
        pmulhuw %mm7, %mm2

        por     alpha, %mm0
        por     alpha, %mm2

        packuswb %mm2, %mm0

        movq    %mm0, (%edx)
        addl    $8, %edx

.L04:
        /* At this point there can be at most 1 pixel left to process.
         * Process it if needed.
         */

        testl   $0x01, %ecx
        je      .L01

        movzwl  (%eax), %ecx
        movd    %ecx, %mm4

        pshufw  $0x00, %mm4, %mm0

        pand    %mm5, %mm0
        pmullw  %mm6, %mm0
#if SCALE_ADJUST > 0
        psrlw   $SCALE_ADJUST, %mm0
#endif
        pmulhuw %mm7, %mm0

        por     alpha, %mm0

        packuswb %mm0, %mm0

        movd    %mm0, (%edx)

.L01:
#ifdef USE_INNER_EMMS
        emms
#endif
        ret
#endif /* !defined(__DJGPP__) && !defined(__MINGW32__) */