/*
 * (C) Copyright IBM Corporation 2004
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

/**
 * \file read_rgba_span_x86.S
 * Optimized routines to transfer pixel data from the framebuffer to a
 * buffer in main memory.
 *
 * \author Ian Romanick <idr@us.ibm.com>
 */

        .file   "read_rgba_span_x86.S"
#if !defined(__DJGPP__) && !defined(__MINGW32__) /* this one cries for assyntax.h */
        .section .rodata
        .align  16
        .type   mask, @object
        .size   mask, 32
mask:
        .long   0xff00ff00
        .long   0xff00ff00
        .long   0xff00ff00
        .long   0xff00ff00
        .long   0x00ff0000
        .long   0x00ff0000
        .long   0x00ff0000
        .long   0x00ff0000
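
/* The first quadword (0xff00ff00 per pixel) keeps the A and G bytes of each
 * BGRA pixel in place; the second (0x00ff0000 per pixel) selects the byte
 * lane that holds R in the source and B in the destination.  The conversion
 * loops below therefore only have to swap the R and B bytes.
 */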


/* I implemented these as macros because they appear in quite a few places,
 * and I've tweaked them a number of times.  I got tired of changing every
 * place they appear. :)
 */

#define DO_ONE_PIXEL() \
        movl    (%ebx), %eax ; \
        addl    $4, %ebx ; \
        bswap   %eax            /* ARGB -> BGRA */ ; \
        rorl    $8, %eax        /* BGRA -> ABGR */ ; \
        movl    %eax, (%ecx)    /* ABGR -> R, G, B, A */ ; \
        addl    $4, %ecx

#define DO_ONE_LAST_PIXEL() \
        movl    (%ebx), %eax ; \
        bswap   %eax            /* ARGB -> BGRA */ ; \
        rorl    $8, %eax        /* BGRA -> ABGR */ ; \
        movl    %eax, (%ecx)    /* ABGR -> R, G, B, A */

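/* In rough C, each iteration of DO_ONE_PIXEL() does the following (a sketch
 * for reference, with src and dst standing in for %ebx and %ecx):
 *
 *     uint32_t pixel = *src++;               // 0xAARRGGBB in a register
 *     pixel = __builtin_bswap32(pixel);      // bswap:   0xBBGGRRAA
 *     pixel = (pixel >> 8) | (pixel << 24);  // rorl $8: 0xAABBGGRR
 *     *dst++ = pixel;                        // bytes R, G, B, A in memory
 */
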

/**
 * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */
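
/* The C-side declaration (in Mesa's read_rgba_span_x86.h) is, roughly:
 *
 *     void _generic_read_RGBA_span_BGRA8888_REV_MMX(const unsigned char *src,
 *                                                   unsigned char *dest,
 *                                                   unsigned num_pixels);
 *
 * so 8(%esp), 16(%esp), and 12(%esp) below are src, num_pixels, and dest
 * once %ebx has been pushed.  The SSE and SSE2 variants take the same
 * arguments.
 */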

        .globl  _generic_read_RGBA_span_BGRA8888_REV_MMX
        .hidden _generic_read_RGBA_span_BGRA8888_REV_MMX
        .type   _generic_read_RGBA_span_BGRA8888_REV_MMX, @function
_generic_read_RGBA_span_BGRA8888_REV_MMX:
        pushl   %ebx

#ifdef USE_INNER_EMMS
        emms
#endif
        movq    mask, %mm1
        movq    mask+16, %mm2

        movl    8(%esp), %ebx   /* source pointer */
        movl    16(%esp), %edx  /* number of pixels to copy */
        movl    12(%esp), %ecx  /* destination pointer */

        testl   %edx, %edx
        jle     .L20            /* Bail if there's nothing to do. */

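        /* %eax = ((-src) >> 2) & 1, i.e. 1 when the (4-byte aligned)
         * source is not 8-byte aligned.  In that case one pixel is
         * converted separately so the movq loads below start on an
         * 8-byte boundary.
         */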
        movl    %ebx, %eax

        negl    %eax
        sarl    $2, %eax
        andl    $1, %eax
        je      .L17

        subl    %eax, %edx
        DO_ONE_PIXEL()
.L17:

        /* Would it be faster to unroll this loop once and process 4 pixels
         * per pass, instead of just two?
         */

        movl    %edx, %eax
        shrl    %eax
        jmp     .L18
.L19:
        movq    (%ebx), %mm0
        addl    $8, %ebx

        /* These 9 instructions do what PSHUFB (if there were such an
         * instruction) could do in 1. :(
         */
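        /* Per 32-bit pixel x, the sequence below computes
         *
         *   (x & 0xff00ff00) | ((x & 0x00ff0000) >> 16) | ((x << 16) & 0x00ff0000)
         *
         * which swaps the R and B bytes while G and A stay put.
         */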

        movq    %mm0, %mm3
        movq    %mm0, %mm4

        pand    %mm2, %mm3
        psllq   $16, %mm4
        psrlq   $16, %mm3
        pand    %mm2, %mm4

        pand    %mm1, %mm0
        por     %mm4, %mm3
        por     %mm3, %mm0

        movq    %mm0, (%ecx)
        addl    $8, %ecx
        subl    $1, %eax
.L18:
        jne     .L19

#ifdef USE_INNER_EMMS
        emms
#endif

        /* At this point there are either 1 or 0 pixels remaining to be
         * converted.  Convert the last pixel, if needed.
         */

        testl   $1, %edx
        je      .L20

        DO_ONE_LAST_PIXEL()

.L20:
        popl    %ebx
        ret
        .size   _generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX


/**
 * SSE optimized version of the BGRA8888_REV to RGBA copy routine.  SSE
 * instructions are only actually used to read data from the framebuffer.
 * In practice, the speed-up is pretty small.
 *
 * \todo
 * Do some more testing and determine if there's any reason to have this
 * function in addition to the MMX version.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

        .globl  _generic_read_RGBA_span_BGRA8888_REV_SSE
        .hidden _generic_read_RGBA_span_BGRA8888_REV_SSE
        .type   _generic_read_RGBA_span_BGRA8888_REV_SSE, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE:
        pushl   %esi
        pushl   %ebx
        pushl   %ebp

#ifdef USE_INNER_EMMS
        emms
#endif
        movq    mask, %mm1
        movq    mask+16, %mm2

        movl    16(%esp), %ebx  /* source pointer */
        movl    24(%esp), %edx  /* number of pixels to copy */
        movl    20(%esp), %ecx  /* destination pointer */

        testl   %edx, %edx
        jle     .L35            /* Bail if there's nothing to do. */

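        /* Save %esp in %ebp and carve out a 16-byte aligned scratch slot
         * on the stack; the loop below spills %xmm0 there with movaps so
         * its halves can be reloaded into MMX registers.
         */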
        movl    %esp, %ebp
        subl    $16, %esp
        andl    $0xfffffff0, %esp

        movl    %ebx, %eax
        movl    %edx, %esi

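        /* %eax = number of leading pixels (0-3) that must be handled one
         * at a time so the movaps reads start on a 16-byte boundary;
         * %esi becomes min(%eax, pixel count), and those pixels are
         * peeled off via the $1 and $2 bit tests below.
         */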
        negl    %eax
        andl    $15, %eax
        sarl    $2, %eax
        cmpl    %edx, %eax
        cmovle  %eax, %esi

        subl    %esi, %edx

        testl   $1, %esi
        je      .L32

        DO_ONE_PIXEL()
.L32:

        testl   $2, %esi
        je      .L31

        movq    (%ebx), %mm0
        addl    $8, %ebx

        movq    %mm0, %mm3
        movq    %mm0, %mm4

        pand    %mm2, %mm3
        psllq   $16, %mm4
        psrlq   $16, %mm3
        pand    %mm2, %mm4

        pand    %mm1, %mm0
        por     %mm4, %mm3
        por     %mm3, %mm0

        movq    %mm0, (%ecx)
        addl    $8, %ecx
.L31:

        movl    %edx, %eax
        shrl    $2, %eax
        jmp     .L33
.L34:
        movaps  (%ebx), %xmm0
        addl    $16, %ebx

        /* This would be so much better if we could just move directly from
         * an SSE register to an MMX register.  Unfortunately, that
         * functionality wasn't introduced until SSE2 with the MOVDQ2Q
         * instruction.
         */

        movaps  %xmm0, (%esp)
        movq    (%esp), %mm0
        movq    8(%esp), %mm5

        movq    %mm0, %mm3
        movq    %mm0, %mm4
        movq    %mm5, %mm6
        movq    %mm5, %mm7

        pand    %mm2, %mm3
        pand    %mm2, %mm6

        psllq   $16, %mm4
        psllq   $16, %mm7

        psrlq   $16, %mm3
        psrlq   $16, %mm6

        pand    %mm2, %mm4
        pand    %mm2, %mm7

        pand    %mm1, %mm0
        pand    %mm1, %mm5

        por     %mm4, %mm3
        por     %mm7, %mm6

        por     %mm3, %mm0
        por     %mm6, %mm5

        movq    %mm0, (%ecx)
        movq    %mm5, 8(%ecx)
        addl    $16, %ecx

        subl    $1, %eax
.L33:
        jne     .L34

#ifdef USE_INNER_EMMS
        emms
#endif
        movl    %ebp, %esp

        /* At this point there are either [0, 3] pixels remaining to be
         * converted.
         */

        testl   $2, %edx
        je      .L36

        movq    (%ebx), %mm0
        addl    $8, %ebx

        movq    %mm0, %mm3
        movq    %mm0, %mm4

        pand    %mm2, %mm3
        psllq   $16, %mm4
        psrlq   $16, %mm3
        pand    %mm2, %mm4

        pand    %mm1, %mm0
        por     %mm4, %mm3
        por     %mm3, %mm0

        movq    %mm0, (%ecx)
        addl    $8, %ecx
.L36:

        testl   $1, %edx
        je      .L35

        DO_ONE_LAST_PIXEL()
.L35:
        popl    %ebp
        popl    %ebx
        popl    %esi
        ret
        .size   _generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE


/**
 * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
 */
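
/* Unlike the MMX and SSE versions above, this routine touches only XMM
 * registers, so the caller does not need to bracket it with EMMS.
 */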

        .text
        .globl  _generic_read_RGBA_span_BGRA8888_REV_SSE2
        .hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2
        .type   _generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE2:
        pushl   %esi
        pushl   %ebx

        movdqa  mask, %xmm1
        movdqa  mask+16, %xmm2

        movl    12(%esp), %ebx  /* source pointer */
        movl    20(%esp), %edx  /* number of pixels to copy */
        movl    16(%esp), %ecx  /* destination pointer */

        movl    %ebx, %eax
        movl    %edx, %esi

        testl   %edx, %edx
        jle     .L46            /* Bail if there's nothing to do. */

        /* If the source pointer isn't a multiple of 16 we have to process
         * a few pixels the "slow" way to get the address aligned for
         * the SSE fetch instructions.
         */

        negl    %eax
        andl    $15, %eax
        sarl    $2, %eax

        cmpl    %edx, %eax
        cmovbe  %eax, %esi
        subl    %esi, %edx

        testl   $1, %esi
        je      .L41

        DO_ONE_PIXEL()
.L41:
        testl   $2, %esi
        je      .L40

        movq    (%ebx), %xmm0
        addl    $8, %ebx

        movdqa  %xmm0, %xmm3
        movdqa  %xmm0, %xmm4
        andps   %xmm1, %xmm0

        andps   %xmm2, %xmm3
        pslldq  $2, %xmm4
        psrldq  $2, %xmm3
        andps   %xmm2, %xmm4

        orps    %xmm4, %xmm3
        orps    %xmm3, %xmm0

        movq    %xmm0, (%ecx)
        addl    $8, %ecx
.L40:

        /* Would it be worth having a specialized version of this loop for
         * the case where the destination is 16-byte aligned?  That version
         * would be identical except that it could use movdqa instead of
         * movdqu.
         */

        movl    %edx, %eax
        shrl    $2, %eax
        jmp     .L42
.L43:
        movdqa  (%ebx), %xmm0
        addl    $16, %ebx

        movdqa  %xmm0, %xmm3
        movdqa  %xmm0, %xmm4
        andps   %xmm1, %xmm0

        andps   %xmm2, %xmm3
        pslldq  $2, %xmm4
        psrldq  $2, %xmm3
        andps   %xmm2, %xmm4

        orps    %xmm4, %xmm3
        orps    %xmm3, %xmm0

        movdqu  %xmm0, (%ecx)
        addl    $16, %ecx
        subl    $1, %eax
.L42:
        jne     .L43


        /* There may be up to 3 pixels remaining to be copied.  Take care
         * of them now.  We do the 2 pixel case first because the data
         * will be aligned.
         */

        testl   $2, %edx
        je      .L47

        movq    (%ebx), %xmm0
        addl    $8, %ebx        /* step past the two pixels read here ... */

        movdqa  %xmm0, %xmm3
        movdqa  %xmm0, %xmm4
        andps   %xmm1, %xmm0

        andps   %xmm2, %xmm3
        pslldq  $2, %xmm4
        psrldq  $2, %xmm3
        andps   %xmm2, %xmm4

        orps    %xmm4, %xmm3
        orps    %xmm3, %xmm0

        movq    %xmm0, (%ecx)
        addl    $8, %ecx        /* ... so DO_ONE_LAST_PIXEL() uses the right addresses. */
.L47:

        testl   $1, %edx
        je      .L46

        DO_ONE_LAST_PIXEL()
.L46:

        popl    %ebx
        popl    %esi
        ret
        .size   _generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2



        .section .rodata

        .align  16
mask_565:
        .word   0xf800
        .word   0x07e0
        .word   0x001f
        .word   0x0000

/* Setting SCALE_ADJUST to 5 gives a perfect match with the classic C
 * implementation in Mesa.  Setting SCALE_ADJUST to 0 is slightly faster but
 * at a small cost to accuracy.
 */

#define SCALE_ADJUST 5
#if SCALE_ADJUST == 5
prescale:
        .word   0x0001
        .word   0x0010
        .word   0x0200
        .word   0x0000

scale:
        .word   0x20e8          /* (0x00ff0000 / 0x000007c0) + 1 */
        .word   0x40c5          /* (0x00ff0000 / 0x000003f0) + 1 */
        .word   0x839d          /* (0x00ff0000 / 0x000001f0) + 1 */
        .word   0x0000
#elif SCALE_ADJUST == 0
prescale:
        .word   0x0001
        .word   0x0020
        .word   0x0800
        .word   0x0000

scale:
        .word   0x0108          /* (0x00ff0000 / 0x0000f800) + 1 */
        .word   0x0104          /* (0x00ff0000 / 0x0000fc00) + 1 */
        .word   0x0108          /* (0x00ff0000 / 0x0000f800) + 1 */
        .word   0x0000
#else
#error SCALE_ADJUST must either be 5 or 0.
#endif


alpha:  .long   0x00000000
        .long   0x00ff0000
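
/* For reference, the whole 565 -> 888 expansion per component, sketched in
 * rough C (assuming SCALE_ADJUST == 5; `pix' is one RGB565 pixel):
 *
 *     unsigned c = pix & 0xf800;          // pand with mask_565
 *     c = (c * 0x0001) >> 5;              // pmullw prescale, psrlw
 *     unsigned red = (c * 0x20e8) >> 16;  // pmulhuw scale -> [0, 255]
 *
 * and likewise for green (0x07e0, 0x0010, 0x40c5) and blue (0x001f, 0x0200,
 * 0x839d).  The prescale step moves each component into a range where a
 * single unsigned 16-bit scale factor can stretch it to at most 0x00ff0000,
 * whose high word pmulhuw then extracts.
 */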

/**
 * MMX optimized version of the RGB565 to RGBA copy routine.
 */

        .text
        .globl  _generic_read_RGBA_span_RGB565_MMX
        .hidden _generic_read_RGBA_span_RGB565_MMX
        .type   _generic_read_RGBA_span_RGB565_MMX, @function

_generic_read_RGBA_span_RGB565_MMX:

#ifdef USE_INNER_EMMS
        emms
#endif

        movl    4(%esp), %eax   /* source pointer */
        movl    8(%esp), %edx   /* destination pointer */
        movl    12(%esp), %ecx  /* number of pixels to copy */

        movq    mask_565, %mm5
        movq    prescale, %mm6
        movq    scale, %mm7

        sarl    $2, %ecx
        jl      .L01            /* Bail early if the count is negative;
                                 * counts of 1-3 fall through to the
                                 * remainder handling below. */
        jmp     .L02

.L03:
        /* Fetch 4 RGB565 pixels into %mm4.  Distribute the first and
         * second pixels into the four words of %mm0 and %mm2.
         */

        movq    (%eax), %mm4
        addl    $8, %eax

        pshufw  $0x00, %mm4, %mm0
        pshufw  $0x55, %mm4, %mm2


        /* Mask the pixels so that each word of each register contains only
         * one color component.
         */

        pand    %mm5, %mm0
        pand    %mm5, %mm2


        /* Adjust the component values so that they are as small as possible,
         * but large enough so that we can multiply them by an unsigned 16-bit
         * number and get a value as large as 0x00ff0000.
         */

        pmullw  %mm6, %mm0
        pmullw  %mm6, %mm2
#if SCALE_ADJUST > 0
        psrlw   $SCALE_ADJUST, %mm0
        psrlw   $SCALE_ADJUST, %mm2
#endif

        /* Scale the input component values to be on the range
         * [0, 0x00ff0000].  This is the real magic of the whole routine.
         */

        pmulhuw %mm7, %mm0
        pmulhuw %mm7, %mm2


        /* Always set the alpha value to 0xff.
         */

        por     alpha, %mm0
        por     alpha, %mm2


        /* Pack the 16-bit values to 8-bit values and store the converted
         * pixel data.
         */

        packuswb %mm2, %mm0
        movq    %mm0, (%edx)
        addl    $8, %edx


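        /* Same recipe for the third and fourth pixels of the group. */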
        pshufw  $0xaa, %mm4, %mm0
        pshufw  $0xff, %mm4, %mm2

        pand    %mm5, %mm0
        pand    %mm5, %mm2
        pmullw  %mm6, %mm0
        pmullw  %mm6, %mm2
#if SCALE_ADJUST > 0
        psrlw   $SCALE_ADJUST, %mm0
        psrlw   $SCALE_ADJUST, %mm2
#endif
        pmulhuw %mm7, %mm0
        pmulhuw %mm7, %mm2

        por     alpha, %mm0
        por     alpha, %mm2

        packuswb %mm2, %mm0

        movq    %mm0, (%edx)
        addl    $8, %edx

        subl    $1, %ecx
.L02:
        jne     .L03


        /* At this point there can be at most 3 pixels left to process.  If
         * there is either 2 or 3 left, process 2.
         */

        movl    12(%esp), %ecx
        testl   $0x02, %ecx
        je      .L04

        movd    (%eax), %mm4
        addl    $4, %eax

        pshufw  $0x00, %mm4, %mm0
        pshufw  $0x55, %mm4, %mm2

        pand    %mm5, %mm0
        pand    %mm5, %mm2
        pmullw  %mm6, %mm0
        pmullw  %mm6, %mm2
#if SCALE_ADJUST > 0
        psrlw   $SCALE_ADJUST, %mm0
        psrlw   $SCALE_ADJUST, %mm2
#endif
        pmulhuw %mm7, %mm0
        pmulhuw %mm7, %mm2

        por     alpha, %mm0
        por     alpha, %mm2

        packuswb %mm2, %mm0

        movq    %mm0, (%edx)
        addl    $8, %edx

.L04:
        /* At this point there can be at most 1 pixel left to process.
         * Process it if needed.
         */

        testl   $0x01, %ecx
        je      .L01

        movzwl  (%eax), %ecx
        movd    %ecx, %mm4

        pshufw  $0x00, %mm4, %mm0

        pand    %mm5, %mm0
        pmullw  %mm6, %mm0
#if SCALE_ADJUST > 0
        psrlw   $SCALE_ADJUST, %mm0
#endif
        pmulhuw %mm7, %mm0

        por     alpha, %mm0

        packuswb %mm0, %mm0

        movd    %mm0, (%edx)

.L01:
#ifdef USE_INNER_EMMS
        emms
#endif
        ret
#endif /* !defined(__DJGPP__) && !defined(__MINGW32__) */

#if defined (__ELF__) && defined (__linux__)
        .section .note.GNU-stack,"",%progbits
#endif