Added MMX optimized version of the RGB565 ReadRGBASpan routine.
/*
 * (C) Copyright IBM Corporation 2004
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

/**
 * \file read_rgba_span_x86.S
 * Optimized routines to transfer pixel data from the framebuffer to a
 * buffer in main memory.
 *
 * \author Ian Romanick <idr@us.ibm.com>
 */

	.file	"read_rgba_span_x86.S"
	.section	.rodata
	.align	16
	.type	mask, @object
	.size	mask, 32
mask:
	.long	0xff00ff00
	.long	0xff00ff00
	.long	0xff00ff00
	.long	0xff00ff00
	.long	0x00ff0000
	.long	0x00ff0000
	.long	0x00ff0000
	.long	0x00ff0000

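/* Constants for the BGRA -> RGBA swizzle.  Read as a little-endian dword,
 * a BGRA8888_REV pixel is 0xAARRGGBB.  The first half of the table
 * (0xff00ff00) keeps the alpha and green bytes in place; the second half
 * (0x00ff0000) selects the byte that has to move 16 bits when red and
 * blue trade places.
 */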

/* I implemented these as macros because they appear in quite a few places,
 * and I've tweaked them a number of times.  I got tired of changing every
 * place they appear. :)
 */

#define DO_ONE_PIXEL() \
	movl	(%ebx), %eax ; \
	addl	$4, %ebx ; \
	bswap	%eax          /* ARGB -> BGRA */ ; \
	rorl	$8, %eax      /* BGRA -> ABGR */ ; \
	movl	%eax, (%ecx)  /* ABGR -> R, G, B, A */ ; \
	addl	$4, %ecx

#define DO_ONE_LAST_PIXEL() \
	movl	(%ebx), %eax ; \
	bswap	%eax          /* ARGB -> BGRA */ ; \
	rorl	$8, %eax      /* BGRA -> ABGR */ ; \
	movl	%eax, (%ecx)  /* ABGR -> R, G, B, A */

/**
 * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

	.text
.globl _generic_read_RGBA_span_BGRA8888_REV_MMX
	.type	_generic_read_RGBA_span_BGRA8888_REV_MMX, @function
_generic_read_RGBA_span_BGRA8888_REV_MMX:
	pushl	%ebx

#ifdef USE_INNER_EMMS
	emms
#endif
	movq	mask, %mm1
	movq	mask+16, %mm2

	movl	8(%esp), %ebx	/* source pointer */
	movl	16(%esp), %edx	/* number of pixels to copy */
	movl	12(%esp), %ecx	/* destination pointer */

	testl	%edx, %edx
	je	.L20		/* Bail if there's nothing to do. */

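	/* The loop below reads 8 bytes at a time, so if the source pointer
	 * is only 4-byte aligned, convert one pixel with the scalar macro
	 * first: (-src >> 2) & 1 is nonzero exactly when src % 8 == 4.
	 */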
	movl	%ebx, %eax

	negl	%eax
	sarl	$2, %eax
	andl	$1, %eax
	je	.L17

	subl	%eax, %edx
	DO_ONE_PIXEL()
.L17:

	/* Would it be faster to unroll this loop once and process 4 pixels
	 * per pass, instead of just two?
	 */

	movl	%edx, %eax
	shrl	%eax
	jmp	.L18
.L19:
	movq	(%ebx), %mm0
	addl	$8, %ebx

	/* These 9 instructions do what PSHUFB (if there were such an
	 * instruction) could do in 1. :(
	 */

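	/* An illustrative trace for one dword (one pixel) of %mm0 with the
	 * value 0xAARRGGBB:
	 *
	 *   mm3 = (pixel & 0x00ff0000) >> 16       -> 0x000000RR
	 *   mm4 = (pixel << 16) & 0x00ff0000       -> 0x00BB0000
	 *   mm0 = (pixel & 0xff00ff00) | mm3 | mm4 -> 0xAABBGGRR
	 *
	 * which is R, G, B, A in memory order.  The 64-bit shifts move bits
	 * between the two pixels, but the masks discard anything that
	 * crosses a dword boundary.
	 */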
	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
	subl	$1, %eax
.L18:
	jne	.L19

#ifdef USE_INNER_EMMS
	emms
#endif

	/* At this point there is at most one pixel remaining to be
	 * converted.  Convert it if needed.
	 */

	testl	$1, %edx
	je	.L20

	DO_ONE_LAST_PIXEL()

.L20:
	popl	%ebx
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX


/**
 * SSE optimized version of the BGRA8888_REV to RGBA copy routine.  SSE
 * instructions are only actually used to read data from the framebuffer.
 * In practice, the speed-up is pretty small.
 *
 * \todo
 * Do some more testing and determine if there's any reason to have this
 * function in addition to the MMX version.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_SSE
	.type	_generic_read_RGBA_span_BGRA8888_REV_SSE, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE:
	pushl	%esi
	pushl	%ebx
	pushl	%ebp

#ifdef USE_INNER_EMMS
	emms
#endif
	movq	mask, %mm1
	movq	mask+16, %mm2

	movl	16(%esp), %ebx	/* source pointer */
	movl	24(%esp), %edx	/* number of pixels to copy */
	movl	20(%esp), %ecx	/* destination pointer */

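	/* Carve a 16-byte aligned scratch slot out of the stack.  MOVAPS
	 * below needs an aligned store address, and the slot is how the
	 * data makes the trip from an XMM register to a pair of MMX
	 * registers.
	 */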
	movl	%esp, %ebp
	subl	$16, %esp
	andl	$0xfffffff0, %esp

	movl	%ebx, %eax
	movl	%edx, %esi

	negl	%eax
	andl	$15, %eax
	sarl	$2, %eax
	cmpl	%edx, %eax
	cmovle	%eax, %esi

	subl	%esi, %edx

	testl	$1, %esi
	je	.L32

	DO_ONE_PIXEL()
.L32:

	testl	$2, %esi
	je	.L31

	movq	(%ebx), %mm0
	addl	$8, %ebx

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
.L31:

	movl	%edx, %eax
	shrl	$2, %eax
	jmp	.L33
.L34:
	movaps	(%ebx), %xmm0
	addl	$16, %ebx

	/* This would be so much better if we could just move directly from
	 * an SSE register to an MMX register.  Unfortunately, that
	 * functionality wasn't introduced until SSE2 with the MOVDQ2Q
	 * instruction.
	 */

	movaps	%xmm0, (%esp)
	movq	(%esp), %mm0
	movq	8(%esp), %mm5

	movq	%mm0, %mm3
	movq	%mm0, %mm4
	movq	%mm5, %mm6
	movq	%mm5, %mm7

	pand	%mm2, %mm3
	pand	%mm2, %mm6

	psllq	$16, %mm4
	psllq	$16, %mm7

	psrlq	$16, %mm3
	psrlq	$16, %mm6

	pand	%mm2, %mm4
	pand	%mm2, %mm7

	pand	%mm1, %mm0
	pand	%mm1, %mm5

	por	%mm4, %mm3
	por	%mm7, %mm6

	por	%mm3, %mm0
	por	%mm6, %mm5

	movq	%mm0, (%ecx)
	movq	%mm5, 8(%ecx)
	addl	$16, %ecx

	subl	$1, %eax
.L33:
	jne	.L34

#ifdef USE_INNER_EMMS
	emms
#endif
	movl	%ebp, %esp

	/* At this point there are between 0 and 3 pixels remaining to be
	 * converted.
	 */

	testl	$2, %edx
	je	.L36

	movq	(%ebx), %mm0
	addl	$8, %ebx

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
.L36:

	testl	$1, %edx
	je	.L35

	DO_ONE_LAST_PIXEL()
.L35:
	popl	%ebp
	popl	%ebx
	popl	%esi
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE


/**
 * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
 */

	.text
.globl _generic_read_RGBA_span_BGRA8888_REV_SSE2
	.type	_generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE2:
	pushl	%esi
	pushl	%ebx

	movdqa	mask, %xmm1
	movdqa	mask+16, %xmm2

	movl	12(%esp), %ebx	/* source pointer */
	movl	20(%esp), %edx	/* number of pixels to copy */
	movl	16(%esp), %ecx	/* destination pointer */

	movl	%ebx, %eax
	movl	%edx, %esi

	/* If the source pointer isn't a multiple of 16 we have to process
	 * a few pixels the "slow" way to get the address aligned for
	 * the SSE fetch instructions.
	 */

	negl	%eax
	andl	$15, %eax
	sarl	$2, %eax

	cmpl	%edx, %eax
	cmovbe	%eax, %esi
	subl	%esi, %edx

	testl	$1, %esi
	je	.L41

	DO_ONE_PIXEL()
.L41:
	testl	$2, %esi
	je	.L40

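	/* This is the same mask/shift/or swizzle as the MMX version, except
	 * that SSE2's byte-granular PSLLDQ/PSRLDQ shift by 2 bytes (16 bits)
	 * and the bitwise ANDPS/ORPS cover the full 128-bit register.
	 */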
	movq	(%ebx), %xmm0
	addl	$8, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movq	%xmm0, (%ecx)
	addl	$8, %ecx
.L40:

	/* Would it be worth having a specialized version of this loop for
	 * the case where the destination is 16-byte aligned?  That version
	 * would be identical except that it could use movdqa instead of
	 * movdqu.
	 */

	movl	%edx, %eax
	shrl	$2, %eax
	jmp	.L42
.L43:
	movdqa	(%ebx), %xmm0
	addl	$16, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movdqu	%xmm0, (%ecx)
	addl	$16, %ecx
	subl	$1, %eax
.L42:
	jne	.L43


	/* There may be up to 3 pixels remaining to be copied.  Take care
	 * of them now.  We do the 2 pixel case first because the data
	 * will be aligned.
	 */

	testl	$2, %edx
	je	.L47

	movq	(%ebx), %xmm0

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movq	%xmm0, (%ecx)
.L47:

	testl	$1, %edx
	je	.L46

	DO_ONE_LAST_PIXEL()
.L46:

	popl	%ebx
	popl	%esi
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2



	.section	.rodata

	.align	16
mask_565:
	.word	0xf800
	.word	0x07e0
	.word	0x001f
	.word	0x0000

/* Setting SCALE_ADJUST to 5 gives a perfect match with the classic C
 * implementation in Mesa.  Setting SCALE_ADJUST to 0 is slightly faster
 * but comes at a small cost to accuracy.
 */

#define SCALE_ADJUST 5
#if SCALE_ADJUST == 5
prescale:
	.word	0x0001
	.word	0x0010
	.word	0x0200
	.word	0x0000

scale:
	.word	0x20e8		/* (0x00ff0000 / 0x000007c0) + 1 */
	.word	0x40c5		/* (0x00ff0000 / 0x000003f0) + 1 */
	.word	0x839d		/* (0x00ff0000 / 0x000001f0) + 1 */
	.word	0x0000
#elif SCALE_ADJUST == 0
prescale:
	.word	0x0001
	.word	0x0020
	.word	0x0800
	.word	0x0000

scale:
	.word	0x0108		/* (0x00ff0000 / 0x0000f800) + 1 */
	.word	0x0104		/* (0x00ff0000 / 0x0000fc00) + 1 */
	.word	0x0108		/* (0x00ff0000 / 0x0000f800) + 1 */
	.word	0x0000
#else
#error SCALE_ADJUST must be either 5 or 0.
#endif
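
/* A worked example for the red channel with SCALE_ADJUST == 5: the 5-bit
 * red value r is stored as r << 11.  Multiplying by the prescale word
 * (0x0001) and shifting right by 5 leaves r << 6, whose largest value is
 * 31 << 6 = 0x07c0.  PMULHUW then computes ((r << 6) * 0x20e8) >> 16,
 * which equals (r * 255) / 31 for every r in [0, 31]; e.g. r = 31 gives
 * (0x07c0 * 0x20e8) >> 16 = 0xff.  Roughly, in C:
 *
 *     dst = (src * ((0x00ff0000 / src_max) + 1)) >> 16;
 */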


alpha:	.long	0x00000000
	.long	0x00ff0000
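
/* alpha is OR-ed into the four 16-bit components before packing; it
 * forces the unused fourth word to 0x00ff so that PACKUSWB produces an
 * alpha byte of 0xff.
 */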

/**
 * MMX optimized version of the RGB565 to RGBA copy routine.
 */

	.text
.globl _generic_read_RGBA_span_RGB565_MMX
	.type	_generic_read_RGBA_span_RGB565_MMX, @function

_generic_read_RGBA_span_RGB565_MMX:

#ifdef USE_INNER_EMMS
	emms
#endif

	movl	4(%esp), %eax	/* source pointer */
	movl	8(%esp), %edx	/* destination pointer */
	movl	12(%esp), %ecx	/* number of pixels to copy */

	movq	mask_565, %mm5
	movq	prescale, %mm6
	movq	scale, %mm7

	shrl	$2, %ecx
	jmp	.L02

.L03:
	/* Fetch 4 RGB565 pixels into %mm4.  Broadcast the first pixel to
	 * the four words of %mm0 and the second to %mm2.
	 */

	movq	(%eax), %mm4
	addl	$8, %eax

	pshufw	$0x00, %mm4, %mm0
	pshufw	$0x55, %mm4, %mm2


	/* Mask the pixels so that each word of each register contains only
	 * one color component.
	 */

	pand	%mm5, %mm0
	pand	%mm5, %mm2


	/* Adjust the component values so that they are as small as possible,
	 * but large enough so that we can multiply them by an unsigned 16-bit
	 * number and get a value as large as 0x00ff0000.
	 */

	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif

	/* Scale the input component values to be on the range
	 * [0, 0x00ff0000].  This is the real magic of the whole routine.
	 */

	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2


	/* Always set the alpha value to 0xff.
	 */

	por	alpha, %mm0
	por	alpha, %mm2


	/* Pack the 16-bit values to 8-bit values and store the converted
	 * pixel data.
	 */

	packuswb	%mm2, %mm0
	movq	%mm0, (%edx)
	addl	$8, %edx


	pshufw	$0xaa, %mm4, %mm0
	pshufw	$0xff, %mm4, %mm2

	pand	%mm5, %mm0
	pand	%mm5, %mm2
	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif
	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2

	por	alpha, %mm0
	por	alpha, %mm2

	packuswb	%mm2, %mm0

	movq	%mm0, (%edx)
	addl	$8, %edx

	subl	$1, %ecx
.L02:
	jne	.L03


	/* At this point there can be at most 3 pixels left to process.  If
	 * 2 or 3 are left, process 2 of them now.
	 */

	movl	12(%esp), %ecx
	testl	$0x02, %ecx
	je	.L04

	movd	(%eax), %mm4
	addl	$4, %eax

	pshufw	$0x00, %mm4, %mm0
	pshufw	$0x55, %mm4, %mm2

	pand	%mm5, %mm0
	pand	%mm5, %mm2
	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif
	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2

	por	alpha, %mm0
	por	alpha, %mm2

	packuswb	%mm2, %mm0

	movq	%mm0, (%edx)
	addl	$8, %edx

.L04:
	/* At this point there can be at most 1 pixel left to process.
	 * Process it if needed.
	 */

	testl	$0x01, %ecx
	je	.L01

	movzwl	(%eax), %ecx
	movd	%ecx, %mm4

	pshufw	$0x00, %mm4, %mm0

	pand	%mm5, %mm0
	pmullw	%mm6, %mm0
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
#endif
	pmulhuw	%mm7, %mm0

	por	alpha, %mm0

	packuswb	%mm0, %mm0

	movd	%mm0, (%edx)

.L01:
#ifdef USE_INNER_EMMS
	emms
#endif
	ret