/* use HIDDEN macro to export fewer symbols (bug 2210)
 * [mesa.git] / src / mesa / x86 / read_rgba_span_x86.S
 */
1 /*
2 * (C) Copyright IBM Corporation 2004
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
23 */
24
25 /**
26 * \file read_rgba_span_x86.S
27 * Optimized routines to transfer pixel data from the framebuffer to a
28 * buffer in main memory.
29 *
30 * \author Ian Romanick <idr@us.ibm.com>
31 */
32
	.file	"read_rgba_span_x86.S"
#if !defined(__DJGPP__) && !defined(__MINGW32__) /* this one cries for assyntax.h */
/* Byte-select masks shared by the BGRA8888_REV -> RGBA copy routines below.
 * With a pixel loaded little-endian into a register (value 0xAARRGGBB):
 *   mask     (0xff00ff00 per pixel): keeps the G and A bytes in place.
 *   mask+16  (0x00ff0000 per pixel): selects the byte that must trade
 *            places with its mate (R <-> B) via the 16-bit shifts in the
 *            conversion loops.
 * 32 bytes total so the SSE2 routine can load each half as one XMM mask.
 */
	.section	.rodata
	.align	16
	.type	mask, @object
	.size	mask, 32
mask:
	.long	0xff00ff00
	.long	0xff00ff00
	.long	0xff00ff00
	.long	0xff00ff00
	.long	0x00ff0000
	.long	0x00ff0000
	.long	0x00ff0000
	.long	0x00ff0000
48
49
/* I implemented these as macros because they appear in quite a few places,
 * and I've tweaked them a number of times.  I got tired of changing every
 * place they appear. :)
 */

/* Convert the BGRA8888_REV pixel at (%ebx) to RGBA, store it at (%ecx),
 * and advance both pointers by 4 bytes.  Clobbers %eax and flags.
 */
#define DO_ONE_PIXEL() \
	movl	(%ebx), %eax ; \
	addl	$4, %ebx ; \
	bswap	%eax /* ARGB -> BGRA */ ; \
	rorl	$8, %eax /* BGRA -> ABGR */ ; \
	movl	%eax, (%ecx) /* ABGR -> R, G, B, A */ ; \
	addl	$4, %ecx

/* Same conversion, but for the final pixel of a span: the source and
 * destination pointers are left unchanged.  Clobbers %eax and flags.
 * Note: the definition must NOT end with a "; \" continuation, or the
 * line following the macro definition gets absorbed into it.
 */
#define DO_ONE_LAST_PIXEL() \
	movl	(%ebx), %eax ; \
	bswap	%eax /* ARGB -> BGRA */ ; \
	rorl	$8, %eax /* BGRA -> ABGR */ ; \
	movl	%eax, (%ecx) /* ABGR -> R, G, B, A */
69
/**
 * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
 *
 * Stack arguments (cdecl): source pointer, destination pointer,
 * number of pixels to copy.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

	.text
.globl _generic_read_RGBA_span_BGRA8888_REV_MMX
	.hidden _generic_read_RGBA_span_BGRA8888_REV_MMX
	.type _generic_read_RGBA_span_BGRA8888_REV_MMX, @function
_generic_read_RGBA_span_BGRA8888_REV_MMX:
	pushl	%ebx

#ifdef USE_INNER_EMMS
	emms
#endif
	movq	mask, %mm1		/* G/A byte-select mask */
	movq	mask+16, %mm2		/* R/B byte-select mask */

	movl	8(%esp), %ebx		/* source pointer */
	movl	16(%esp), %edx		/* number of pixels to copy */
	movl	12(%esp), %ecx		/* destination pointer */

	testl	%edx, %edx
	je	.L20			/* Bail if there's nothing to do. */

	movl	%ebx, %eax

	/* If the (4-byte aligned) source pointer is not 8-byte aligned,
	 * convert one pixel the "slow" way so the movq fetches in the
	 * main loop are aligned.
	 */
	negl	%eax
	sarl	$2, %eax
	andl	$1, %eax
	je	.L17

	subl	%eax, %edx
	DO_ONE_PIXEL()
.L17:

	/* Would it be faster to unroll this loop once and process 4 pixels
	 * per pass, instead of just two?
	 */

	movl	%edx, %eax
	shrl	%eax			/* eax = pixel pairs; ZF feeds the jne at .L18 */
	jmp	.L18
.L19:
	movq	(%ebx), %mm0		/* fetch 2 pixels */
	addl	$8, %ebx

	/* These 9 instructions do what PSHUFB (if there were such an
	 * instruction) could do in 1. :(
	 */

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3		/* isolate the R bytes */
	psllq	$16, %mm4
	psrlq	$16, %mm3		/* R moved into the B position */
	pand	%mm2, %mm4		/* B moved into the R position */

	pand	%mm1, %mm0		/* G and A stay in place */
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)		/* store 2 converted pixels */
	addl	$8, %ecx
	subl	$1, %eax
.L18:
	jne	.L19

#ifdef USE_INNER_EMMS
	emms
#endif

	/* At this point there are either 1 or 0 pixels remaining to be
	 * converted.  Convert the last pixel, if needed.
	 */

	testl	$1, %edx
	je	.L20

	DO_ONE_LAST_PIXEL()

.L20:
	popl	%ebx
	ret
	.size _generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX
158
159
/**
 * SSE optimized version of the BGRA8888_REV to RGBA copy routine.  SSE
 * instructions are only actually used to read data from the framebuffer.
 * In practice, the speed-up is pretty small.
 *
 * Stack arguments (cdecl): source pointer, destination pointer,
 * number of pixels to copy.
 *
 * \todo
 * Do some more testing and determine if there's any reason to have this
 * function in addition to the MMX version.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

	.text
.globl _generic_read_RGBA_span_BGRA8888_REV_SSE
	.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE
	.type _generic_read_RGBA_span_BGRA8888_REV_SSE, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE:
	pushl	%esi
	pushl	%ebx
	pushl	%ebp

#ifdef USE_INNER_EMMS
	emms
#endif
	movq	mask, %mm1		/* G/A byte-select mask */
	movq	mask+16, %mm2		/* R/B byte-select mask */

	movl	16(%esp), %ebx		/* source pointer */
	movl	24(%esp), %edx		/* number of pixels to copy */
	movl	20(%esp), %ecx		/* destination pointer */

	/* Carve a 16-byte aligned spill slot out of the stack for the
	 * movaps bounce in the main loop; %ebp holds the original %esp.
	 */
	movl	%esp, %ebp
	subl	$16, %esp
	andl	$0xfffffff0, %esp

	movl	%ebx, %eax
	movl	%edx, %esi

	/* %eax = pixels (0..3) to convert individually until the source
	 * pointer is 16-byte aligned; %esi = min(%eax, count).
	 */
	negl	%eax
	andl	$15, %eax
	sarl	$2, %eax
	cmpl	%edx, %eax
	cmovle	%eax, %esi

	subl	%esi, %edx

	/* Handle the 1- and 2-pixel parts of the unaligned head. */
	testl	$1, %esi
	je	.L32

	DO_ONE_PIXEL()
.L32:

	testl	$2, %esi
	je	.L31

	/* Convert one pixel pair with the same MMX shuffle as the main
	 * loop of the MMX routine.
	 */
	movq	(%ebx), %mm0
	addl	$8, %ebx

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
.L31:

	movl	%edx, %eax
	shrl	$2, %eax		/* eax = groups of 4 pixels; ZF feeds jne at .L33 */
	jmp	.L33
.L34:
	movaps	(%ebx), %xmm0		/* aligned 16-byte fetch: 4 pixels */
	addl	$16, %ebx

	/* This would be so much better if we could just move directly from
	 * an SSE register to an MMX register.  Unfortunately, that
	 * functionality wasn't introduced until SSE2 with the MOVDQ2Q
	 * instruction.
	 */

	movaps	%xmm0, (%esp)		/* bounce through the aligned spill slot */
	movq	(%esp), %mm0
	movq	8(%esp), %mm5

	/* Shuffle both pixel pairs exactly as in the MMX routine. */
	movq	%mm0, %mm3
	movq	%mm0, %mm4
	movq	%mm5, %mm6
	movq	%mm5, %mm7

	pand	%mm2, %mm3
	pand	%mm2, %mm6

	psllq	$16, %mm4
	psllq	$16, %mm7

	psrlq	$16, %mm3
	psrlq	$16, %mm6

	pand	%mm2, %mm4
	pand	%mm2, %mm7

	pand	%mm1, %mm0
	pand	%mm1, %mm5

	por	%mm4, %mm3
	por	%mm7, %mm6

	por	%mm3, %mm0
	por	%mm6, %mm5

	movq	%mm0, (%ecx)
	movq	%mm5, 8(%ecx)
	addl	$16, %ecx

	subl	$1, %eax
.L33:
	jne	.L34

#ifdef USE_INNER_EMMS
	emms
#endif
	movl	%ebp, %esp		/* release the aligned spill slot */

	/* At this point there are either [0, 3] pixels remaining to be
	 * converted.
	 */

	testl	$2, %edx
	je	.L36

	movq	(%ebx), %mm0
	addl	$8, %ebx

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
.L36:

	testl	$1, %edx
	je	.L35

	DO_ONE_LAST_PIXEL()
.L35:
	popl	%ebp
	popl	%ebx
	popl	%esi
	ret
	.size _generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE
327
328
/**
 * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
 *
 * Stack arguments (cdecl): source pointer, destination pointer,
 * number of pixels to copy.  Only XMM registers are used, so no EMMS
 * bracketing is required here.
 */

	.text
.globl _generic_read_RGBA_span_BGRA8888_REV_SSE2
	.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2
	.type _generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE2:
	pushl	%esi
	pushl	%ebx

	movdqa	mask, %xmm1		/* G/A byte-select mask */
	movdqa	mask+16, %xmm2		/* R/B byte-select mask */

	movl	12(%esp), %ebx		/* source pointer */
	movl	20(%esp), %edx		/* number of pixels to copy */
	movl	16(%esp), %ecx		/* destination pointer */

	movl	%ebx, %eax
	movl	%edx, %esi

	/* If the source pointer isn't a multiple of 16 we have to process
	 * a few pixels the "slow" way to get the address aligned for
	 * the SSE fetch instructions.
	 */

	negl	%eax
	andl	$15, %eax
	sarl	$2, %eax		/* eax = pixels (0..3) until 16-byte alignment */

	cmpl	%edx, %eax
	cmovbe	%eax, %esi		/* esi = min(alignment pixels, count) */
	subl	%esi, %edx

	testl	$1, %esi
	je	.L41

	DO_ONE_PIXEL()
.L41:
	testl	$2, %esi
	je	.L40

	/* Convert one pixel pair (low 8 bytes of xmm0). */
	movq	(%ebx), %xmm0
	addl	$8, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0		/* G and A stay in place */

	andps	%xmm2, %xmm3		/* isolate R bytes */
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3		/* R moved into the B position */
	andps	%xmm2, %xmm4		/* B moved into the R position */

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movq	%xmm0, (%ecx)
	addl	$8, %ecx
.L40:

	/* Would it be worth having a specialized version of this loop for
	 * the case where the destination is 16-byte aligned?  That version
	 * would be identical except that it could use movedqa instead of
	 * movdqu.
	 */

	movl	%edx, %eax
	shrl	$2, %eax		/* eax = groups of 4 pixels; ZF feeds jne at .L42 */
	jmp	.L42
.L43:
	movdqa	(%ebx), %xmm0		/* aligned fetch: 4 pixels */
	addl	$16, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movdqu	%xmm0, (%ecx)		/* destination may be unaligned */
	addl	$16, %ecx
	subl	$1, %eax
.L42:
	jne	.L43


	/* There may be up to 3 pixels remaining to be copied.  Take care
	 * of them now.  We do the 2 pixel case first because the data
	 * will be aligned.
	 */

	testl	$2, %edx
	je	.L47

	movq	(%ebx), %xmm0

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movq	%xmm0, (%ecx)
.L47:

	testl	$1, %edx
	je	.L46

	DO_ONE_LAST_PIXEL()
.L46:

	popl	%ebx
	popl	%esi
	ret
	.size _generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2
458
459
460
	.section	.rodata

	.align	16
/* Per-component word masks for one RGB565 pixel.  The converter first
 * broadcasts a pixel to all four words of an MMX register with PSHUFW,
 * then ANDs with this quad so word 0 holds R, word 1 holds G, word 2
 * holds B (word 3 stays zero and later receives the alpha value).
 */
mask_565:
	.word	0xf800
	.word	0x07e0
	.word	0x001f
	.word	0x0000

/* Setting SCALE_ADJUST to 5 gives a perfect match with the classic C
 * implementation in Mesa.  Setting SCALE_ADJUST to 0 is slightly faster but
 * at a small cost to accuracy.
 */

#define SCALE_ADJUST 5
#if SCALE_ADJUST == 5
/* prescale shifts each masked component up so that, after the
 * SCALE_ADJUST right shift, the unsigned 16-bit multiply by `scale`
 * (PMULHUW) can reach 0x00ff0000 — i.e. 0xff in the high word.
 */
prescale:
	.word	0x0001
	.word	0x0010
	.word	0x0200
	.word	0x0000

scale:
	.word	0x20e8		/* (0x00ff0000 / 0x000007c0) + 1 */
	.word	0x40c5		/* (0x00ff0000 / 0x000003f0) + 1 */
	.word	0x839d		/* (0x00ff0000 / 0x000001f0) + 1 */
	.word	0x0000
#elif SCALE_ADJUST == 0
prescale:
	.word	0x0001
	.word	0x0020
	.word	0x0800
	.word	0x0000

scale:
	.word	0x0108		/* (0x00ff0000 / 0x0000f800) + 1 */
	.word	0x0104		/* (0x00ff0000 / 0x0000fc00) + 1 */
	.word	0x0108		/* (0x00ff0000 / 0x0000f800) + 1 */
	.word	0x0000
#else
#error SCALE_ADJUST must either be 5 or 0.
#endif


/* Quadword OR-mask: word 3 is 0x00ff, so ORing it into a converted
 * pixel forces the alpha component to 0xff before the pack to bytes.
 */
alpha:	.long	0x00000000
	.long	0x00ff0000
507
/**
 * MMX optimized version of the RGB565 to RGBA copy routine.
 *
 * Stack arguments (cdecl): source pointer, destination pointer,
 * number of pixels to copy.
 */

	.text
.globl _generic_read_RGBA_span_RGB565_MMX
	.hidden _generic_read_RGBA_span_RGB565_MMX
	.type _generic_read_RGBA_span_RGB565_MMX, @function

_generic_read_RGBA_span_RGB565_MMX:

#ifdef USE_INNER_EMMS
	emms
#endif

	movl	4(%esp), %eax		/* source pointer */
	movl	8(%esp), %edx		/* destination pointer */
	movl	12(%esp), %ecx		/* number of pixels to copy */

	movq	mask_565, %mm5		/* per-word R/G/B component masks */
	movq	prescale, %mm6		/* per-component pre-multipliers */
	movq	scale, %mm7		/* per-component scale factors */

	shrl	$2, %ecx		/* ecx = groups of 4; ZF feeds jne at .L02 */
	jmp	.L02

.L03:
	/* Fetch 4 RGB565 pixels into %mm4.  Distribute the first and
	 * second pixels into the four words of %mm0 and %mm2.
	 */

	movq	(%eax), %mm4
	addl	$8, %eax

	pshufw	$0x00, %mm4, %mm0
	pshufw	$0x55, %mm4, %mm2


	/* Mask the pixels so that each word of each register contains only
	 * one color component.
	 */

	pand	%mm5, %mm0
	pand	%mm5, %mm2


	/* Adjust the component values so that they are as small as possible,
	 * but large enough so that we can multiply them by an unsigned 16-bit
	 * number and get a value as large as 0x00ff0000.
	 */

	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif

	/* Scale the input component values to be on the range
	 * [0, 0x00ff0000].  This is the real magic of the whole routine.
	 */

	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2


	/* Always set the alpha value to 0xff.
	 */

	por	alpha, %mm0
	por	alpha, %mm2


	/* Pack the 16-bit values to 8-bit values and store the converted
	 * pixel data.
	 */

	packuswb	%mm2, %mm0
	movq	%mm0, (%edx)
	addl	$8, %edx


	/* Same conversion for the third and fourth pixels of the group. */

	pshufw	$0xaa, %mm4, %mm0
	pshufw	$0xff, %mm4, %mm2

	pand	%mm5, %mm0
	pand	%mm5, %mm2
	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif
	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2

	por	alpha, %mm0
	por	alpha, %mm2

	packuswb	%mm2, %mm0

	movq	%mm0, (%edx)
	addl	$8, %edx

	subl	$1, %ecx
.L02:
	jne	.L03


	/* At this point there can be at most 3 pixels left to process.  If
	 * there is either 2 or 3 left, process 2.
	 */

	movl	12(%esp), %ecx		/* reload the original pixel count */
	testl	$0x02, %ecx
	je	.L04

	movd	(%eax), %mm4		/* fetch 2 RGB565 pixels (4 bytes) */
	addl	$4, %eax

	pshufw	$0x00, %mm4, %mm0
	pshufw	$0x55, %mm4, %mm2

	pand	%mm5, %mm0
	pand	%mm5, %mm2
	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif
	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2

	por	alpha, %mm0
	por	alpha, %mm2

	packuswb	%mm2, %mm0

	movq	%mm0, (%edx)
	addl	$8, %edx

.L04:
	/* At this point there can be at most 1 pixel left to process.
	 * Process it if needed.
	 */

	testl	$0x01, %ecx
	je	.L01

	/* movzwl is the standard AT&T mnemonic (the old "movzxw" spelling
	 * is rejected by some assemblers).
	 */
	movzwl	(%eax), %ecx
	movd	%ecx, %mm4

	pshufw	$0x00, %mm4, %mm0

	pand	%mm5, %mm0
	pmullw	%mm6, %mm0
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
#endif
	pmulhuw	%mm7, %mm0

	por	alpha, %mm0

	packuswb	%mm0, %mm0

	movd	%mm0, (%edx)

.L01:
#ifdef USE_INNER_EMMS
	emms
#endif
	ret
682 #endif /* !defined(__DJGPP__) && !defined(__MINGW32__) */