/*
 * (C) Copyright IBM Corporation 2004
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

/**
 * \file read_rgba_span_x86.S
 * Optimized routines to transfer pixel data from the framebuffer to a
 * buffer in main memory.
 *
 * \author Ian Romanick <idr@us.ibm.com>
 */

        .file "read_rgba_span_x86.S"
#if !defined(__DJGPP__) && !defined(__MINGW32__) /* this one cries for assyntax.h */
/* Kevin F. Quinn 2nd July 2006
 * Replaced data segment constants with text-segment instructions.
 */
#define LOAD_MASK(mvins,m1,m2) \
        pushl $0xff00ff00 ;\
        pushl $0xff00ff00 ;\
        pushl $0xff00ff00 ;\
        pushl $0xff00ff00 ;\
        mvins (%esp), m1 ;\
        pushl $0x00ff0000 ;\
        pushl $0x00ff0000 ;\
        pushl $0x00ff0000 ;\
        pushl $0x00ff0000 ;\
        mvins (%esp), m2 ;\
        addl $32, %esp

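/* With LOAD_MASK(movq,%mm1,%mm2) the pushes above leave
 * %mm1 = 0xff00ff00ff00ff00 (the byte lanes that stay in place) and
 * %mm2 = 0x00ff000000ff0000 (the red byte lane, which trades places
 * with blue); the movdqu variant builds the same per-dword masks
 * across a full XMM register.  Building the constants on the stack
 * keeps the code position independent, with no data segment to
 * relocate.
 */
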
/* I implemented these as macros because they appear in several places,
 * and I've tweaked them a number of times. I got tired of changing every
 * place they appear. :)
 */

#define DO_ONE_PIXEL() \
        movl (%ebx), %eax ; \
        addl $4, %ebx ; \
        bswap %eax /* ARGB -> BGRA */ ; \
        rorl $8, %eax /* BGRA -> ABGR */ ; \
        movl %eax, (%ecx) /* ABGR -> R, G, B, A */ ; \
        addl $4, %ecx

#define DO_ONE_LAST_PIXEL() \
        movl (%ebx), %eax ; \
        bswap %eax /* ARGB -> BGRA */ ; \
        rorl $8, %eax /* BGRA -> ABGR */ ; \
        movl %eax, (%ecx) /* ABGR -> R, G, B, A */


/**
 * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

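/* C-level view of the calling convention, inferred from the argument
 * comments below (the authoritative prototype lives in the C headers):
 *
 *     void _generic_read_RGBA_span_BGRA8888_REV_MMX( const unsigned char *src,
 *                                                    unsigned char *dest,
 *                                                    unsigned count );
 *
 * src points at BGRA8888_REV pixels, dest receives RGBA bytes, and
 * count is the number of pixels in the span.
 */
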
        .globl _generic_read_RGBA_span_BGRA8888_REV_MMX
        .hidden _generic_read_RGBA_span_BGRA8888_REV_MMX
        .type _generic_read_RGBA_span_BGRA8888_REV_MMX, @function
_generic_read_RGBA_span_BGRA8888_REV_MMX:
        pushl %ebx

#ifdef USE_INNER_EMMS
        emms
#endif
        LOAD_MASK(movq,%mm1,%mm2)

        movl 8(%esp), %ebx /* source pointer */
        movl 16(%esp), %edx /* number of pixels to copy */
        movl 12(%esp), %ecx /* destination pointer */

        testl %edx, %edx
        jle .L20 /* Bail if there's nothing to do. */

        movl %ebx, %eax

        negl %eax
        sarl $2, %eax
        andl $1, %eax
        je .L17

        subl %eax, %edx
        DO_ONE_PIXEL()
.L17:

/* Would it be faster to unroll this loop once and process 4 pixels
 * per pass, instead of just two?
 */

        movl %edx, %eax
        shrl %eax
        jmp .L18
.L19:
        movq (%ebx), %mm0
        addl $8, %ebx

/* These 9 instructions do what PSHUFB (if there were such an
 * instruction) could do in 1. :(
 */

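/* Each dword of %mm0 holds one pixel as 0xAARRGGBB.  %mm1
 * (0xff00ff00 per pixel) keeps the A and G bytes in place, while
 * %mm2 (0x00ff0000 per pixel) selects the R lane.  The two 16-bit
 * shifts move R down into the B lane and B up into the R lane, so
 * OR-ing the three pieces together yields 0xAABBGGRR per pixel,
 * which stores as R, G, B, A bytes.
 */
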
        movq %mm0, %mm3
        movq %mm0, %mm4

        pand %mm2, %mm3
        psllq $16, %mm4
        psrlq $16, %mm3
        pand %mm2, %mm4

        pand %mm1, %mm0
        por %mm4, %mm3
        por %mm3, %mm0

        movq %mm0, (%ecx)
        addl $8, %ecx
        subl $1, %eax
.L18:
        jne .L19

#ifdef USE_INNER_EMMS
        emms
#endif

/* At this point there are either 1 or 0 pixels remaining to be
 * converted. Convert the last pixel, if needed.
 */

        testl $1, %edx
        je .L20

        DO_ONE_LAST_PIXEL()

.L20:
        popl %ebx
        ret
        .size _generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX


/**
 * SSE optimized version of the BGRA8888_REV to RGBA copy routine. SSE
 * instructions are only actually used to read data from the framebuffer.
 * In practice, the speed-up is pretty small.
 *
 * \todo
 * Do some more testing and determine if there's any reason to have this
 * function in addition to the MMX version.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

        .globl _generic_read_RGBA_span_BGRA8888_REV_SSE
        .hidden _generic_read_RGBA_span_BGRA8888_REV_SSE
        .type _generic_read_RGBA_span_BGRA8888_REV_SSE, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE:
        pushl %esi
        pushl %ebx
        pushl %ebp

#ifdef USE_INNER_EMMS
        emms
#endif

        LOAD_MASK(movq,%mm1,%mm2)

        movl 16(%esp), %ebx /* source pointer */
        movl 24(%esp), %edx /* number of pixels to copy */
        movl 20(%esp), %ecx /* destination pointer */

        testl %edx, %edx
        jle .L35 /* Bail if there's nothing to do. */

        movl %esp, %ebp
        subl $16, %esp
        andl $0xfffffff0, %esp

        movl %ebx, %eax
        movl %edx, %esi

        negl %eax
        andl $15, %eax
        sarl $2, %eax
        cmpl %edx, %eax
        cmovle %eax, %esi

        subl %esi, %edx

        testl $1, %esi
        je .L32

        DO_ONE_PIXEL()
.L32:

        testl $2, %esi
        je .L31

        movq (%ebx), %mm0
        addl $8, %ebx

        movq %mm0, %mm3
        movq %mm0, %mm4

        pand %mm2, %mm3
        psllq $16, %mm4
        psrlq $16, %mm3
        pand %mm2, %mm4

        pand %mm1, %mm0
        por %mm4, %mm3
        por %mm3, %mm0

        movq %mm0, (%ecx)
        addl $8, %ecx
.L31:

        movl %edx, %eax
        shrl $2, %eax
        jmp .L33
.L34:
        movaps (%ebx), %xmm0
        addl $16, %ebx

/* This would be so much better if we could just move directly from
 * an SSE register to an MMX register. Unfortunately, that
 * functionality wasn't introduced until SSE2 with the MOVDQ2Q
 * instruction.
 */

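/* Bounce the pixels through the 16-byte aligned scratch area that was
 * carved out of the stack at the top of the function (the
 * movl %esp, %ebp / subl / andl sequence), then finish the swizzle
 * with the same MMX sequence the plain MMX routine uses, two pixels
 * per register.
 */
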
        movaps %xmm0, (%esp)
        movq (%esp), %mm0
        movq 8(%esp), %mm5

        movq %mm0, %mm3
        movq %mm0, %mm4
        movq %mm5, %mm6
        movq %mm5, %mm7

        pand %mm2, %mm3
        pand %mm2, %mm6

        psllq $16, %mm4
        psllq $16, %mm7

        psrlq $16, %mm3
        psrlq $16, %mm6

        pand %mm2, %mm4
        pand %mm2, %mm7

        pand %mm1, %mm0
        pand %mm1, %mm5

        por %mm4, %mm3
        por %mm7, %mm6

        por %mm3, %mm0
        por %mm6, %mm5

        movq %mm0, (%ecx)
        movq %mm5, 8(%ecx)
        addl $16, %ecx

        subl $1, %eax
.L33:
        jne .L34

#ifdef USE_INNER_EMMS
        emms
#endif
        movl %ebp, %esp

/* At this point there are either [0, 3] pixels remaining to be
 * converted.
 */

        testl $2, %edx
        je .L36

        movq (%ebx), %mm0
        addl $8, %ebx

        movq %mm0, %mm3
        movq %mm0, %mm4

        pand %mm2, %mm3
        psllq $16, %mm4
        psrlq $16, %mm3
        pand %mm2, %mm4

        pand %mm1, %mm0
        por %mm4, %mm3
        por %mm3, %mm0

        movq %mm0, (%ecx)
        addl $8, %ecx
.L36:

        testl $1, %edx
        je .L35

        DO_ONE_LAST_PIXEL()
.L35:
        popl %ebp
        popl %ebx
        popl %esi
        ret
        .size _generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE


/**
 * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
 */

        .text
        .globl _generic_read_RGBA_span_BGRA8888_REV_SSE2
        .hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2
        .type _generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE2:
        pushl %esi
        pushl %ebx

        LOAD_MASK(movdqu,%xmm1,%xmm2)

        movl 12(%esp), %ebx /* source pointer */
        movl 20(%esp), %edx /* number of pixels to copy */
        movl 16(%esp), %ecx /* destination pointer */

        movl %ebx, %eax
        movl %edx, %esi

        testl %edx, %edx
        jle .L46 /* Bail if there's nothing to do. */

/* If the source pointer isn't a multiple of 16 we have to process
 * a few pixels the "slow" way to get the address aligned for
 * the SSE fetch instructions.
 */
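/* %eax ends up holding (-src & 15) >> 2, i.e. the number of leading
 * pixels to convert separately.  For example, a source pointer ending
 * in 0x4 gives %eax = 3, so three pixels go through the one- and
 * two-pixel paths below before the aligned 16-byte loads start.
 */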

        negl %eax
        andl $15, %eax
        sarl $2, %eax

        cmpl %edx, %eax
        cmovbe %eax, %esi
        subl %esi, %edx

        testl $1, %esi
        je .L41

        DO_ONE_PIXEL()
.L41:
        testl $2, %esi
        je .L40

        movq (%ebx), %xmm0
        addl $8, %ebx

        movdqa %xmm0, %xmm3
        movdqa %xmm0, %xmm4
        andps %xmm1, %xmm0

        andps %xmm2, %xmm3
        pslldq $2, %xmm4
        psrldq $2, %xmm3
        andps %xmm2, %xmm4

        orps %xmm4, %xmm3
        orps %xmm3, %xmm0

        movq %xmm0, (%ecx)
        addl $8, %ecx
.L40:

/* Would it be worth having a specialized version of this loop for
 * the case where the destination is 16-byte aligned? That version
 * would be identical except that it could use movdqa instead of
 * movdqu.
 */

        movl %edx, %eax
        shrl $2, %eax
        jmp .L42
.L43:
        movdqa (%ebx), %xmm0
        addl $16, %ebx

        movdqa %xmm0, %xmm3
        movdqa %xmm0, %xmm4
        andps %xmm1, %xmm0

        andps %xmm2, %xmm3
        pslldq $2, %xmm4
        psrldq $2, %xmm3
        andps %xmm2, %xmm4

        orps %xmm4, %xmm3
        orps %xmm3, %xmm0

        movdqu %xmm0, (%ecx)
        addl $16, %ecx
        subl $1, %eax
.L42:
        jne .L43


/* There may be up to 3 pixels remaining to be copied. Take care
 * of them now. We do the 2-pixel case first because the data
 * will be aligned.
 */

        testl $2, %edx
        je .L47

        movq (%ebx), %xmm0

        movdqa %xmm0, %xmm3
        movdqa %xmm0, %xmm4
        andps %xmm1, %xmm0

        andps %xmm2, %xmm3
        pslldq $2, %xmm4
        psrldq $2, %xmm3
        andps %xmm2, %xmm4

        orps %xmm4, %xmm3
        orps %xmm3, %xmm0

        movq %xmm0, (%ecx)
.L47:

        testl $1, %edx
        je .L46

        DO_ONE_LAST_PIXEL()
.L46:

        popl %ebx
        popl %esi
        ret
        .size _generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2



#define MASK_565_L 0x07e0f800
#define MASK_565_H 0x0000001f
/* Setting SCALE_ADJUST to 5 gives a perfect match with the
 * classic C implementation in Mesa. Setting SCALE_ADJUST
 * to 0 is slightly faster but at a small cost to accuracy.
 */
#define SCALE_ADJUST 5
#if SCALE_ADJUST == 5
#define PRESCALE_L 0x00100001
#define PRESCALE_H 0x00000200
#define SCALE_L 0x40C620E8
#define SCALE_H 0x0000839d
#elif SCALE_ADJUST == 0
#define PRESCALE_L 0x00200001
#define PRESCALE_H 0x00000800
#define SCALE_L 0x01040108
#define SCALE_H 0x00000108
#else
#error SCALE_ADJUST must either be 5 or 0.
#endif
#define ALPHA_L 0x00000000
#define ALPHA_H 0x00ff0000
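/* A sketch of the arithmetic behind these constants (for
 * SCALE_ADJUST == 5):  after the pshufw/pand in the loop below, each
 * 16-bit word holds one masked component still at its RGB565 position
 * (R << 11, G << 5, B).  The pmullw by PRESCALE plus the psrlw by
 * SCALE_ADJUST reposition them as R << 6, G << 4 and B << 4, and the
 * pmulhuw by SCALE then rescales each component to the range [0, 255],
 * which amounts to multiplying by roughly 255/31 (or 255/63 for the
 * 6-bit green field).
 */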

/**
 * MMX optimized version of the RGB565 to RGBA copy routine.
 */

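/* As above, the arguments (inferred from the %esp offsets used below)
 * are assumed to be:
 *
 *     void _generic_read_RGBA_span_RGB565_MMX( const unsigned char *src,
 *                                              unsigned char *dest,
 *                                              unsigned count );
 */
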
        .text
        .globl _generic_read_RGBA_span_RGB565_MMX
        .hidden _generic_read_RGBA_span_RGB565_MMX
        .type _generic_read_RGBA_span_RGB565_MMX, @function

_generic_read_RGBA_span_RGB565_MMX:

#ifdef USE_INNER_EMMS
        emms
#endif

        movl 4(%esp), %eax /* source pointer */
        movl 8(%esp), %edx /* destination pointer */
        movl 12(%esp), %ecx /* number of pixels to copy */

        pushl $MASK_565_H
        pushl $MASK_565_L
        movq (%esp), %mm5
        pushl $PRESCALE_H
        pushl $PRESCALE_L
        movq (%esp), %mm6
        pushl $SCALE_H
        pushl $SCALE_L
        movq (%esp), %mm7
        pushl $ALPHA_H
        pushl $ALPHA_L
        movq (%esp), %mm3
        addl $32,%esp

        sarl $2, %ecx
        jl .L01 /* Bail early if the count is negative. */
        jmp .L02

.L03:
/* Fetch 4 RGB565 pixels into %mm4. Distribute the first and
 * second pixels into the four words of %mm0 and %mm2.
 */

        movq (%eax), %mm4
        addl $8, %eax

        pshufw $0x00, %mm4, %mm0
        pshufw $0x55, %mm4, %mm2


/* Mask the pixels so that each word of each register contains only
 * one color component.
 */

        pand %mm5, %mm0
        pand %mm5, %mm2


/* Adjust the component values so that they are as small as possible,
 * but large enough so that we can multiply them by an unsigned 16-bit
 * number and get a value as large as 0x00ff0000.
 */

        pmullw %mm6, %mm0
        pmullw %mm6, %mm2
#if SCALE_ADJUST > 0
        psrlw $SCALE_ADJUST, %mm0
        psrlw $SCALE_ADJUST, %mm2
#endif

/* Scale the input component values to be on the range
 * [0, 0x00ff0000]. This is the real magic of the whole routine.
 */
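/* pmulhuw keeps only the high 16 bits of each unsigned 32-bit product,
 * so every component lands in the low byte of its word, ready for the
 * packuswb below.  E.g. for red = 0x1f:  (0x1f << 6) * 0x20E8 =
 * 0x00ff0600, whose high word is 0xff.
 */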

        pmulhuw %mm7, %mm0
        pmulhuw %mm7, %mm2


/* Always set the alpha value to 0xff.
 */

        por %mm3, %mm0
        por %mm3, %mm2


/* Pack the 16-bit values to 8-bit values and store the converted
 * pixel data.
 */

        packuswb %mm2, %mm0
        movq %mm0, (%edx)
        addl $8, %edx

        pshufw $0xaa, %mm4, %mm0
        pshufw $0xff, %mm4, %mm2

        pand %mm5, %mm0
        pand %mm5, %mm2
        pmullw %mm6, %mm0
        pmullw %mm6, %mm2
#if SCALE_ADJUST > 0
        psrlw $SCALE_ADJUST, %mm0
        psrlw $SCALE_ADJUST, %mm2
#endif
        pmulhuw %mm7, %mm0
        pmulhuw %mm7, %mm2

        por %mm3, %mm0
        por %mm3, %mm2

        packuswb %mm2, %mm0

        movq %mm0, (%edx)
        addl $8, %edx

        subl $1, %ecx
.L02:
        jne .L03


/* At this point there can be at most 3 pixels left to process. If
 * there are either 2 or 3 left, process 2.
 */

        movl 12(%esp), %ecx
        testl $0x02, %ecx
        je .L04

        movd (%eax), %mm4
        addl $4, %eax

        pshufw $0x00, %mm4, %mm0
        pshufw $0x55, %mm4, %mm2

        pand %mm5, %mm0
        pand %mm5, %mm2
        pmullw %mm6, %mm0
        pmullw %mm6, %mm2
#if SCALE_ADJUST > 0
        psrlw $SCALE_ADJUST, %mm0
        psrlw $SCALE_ADJUST, %mm2
#endif
        pmulhuw %mm7, %mm0
        pmulhuw %mm7, %mm2

        por %mm3, %mm0
        por %mm3, %mm2

        packuswb %mm2, %mm0

        movq %mm0, (%edx)
        addl $8, %edx

.L04:
/* At this point there can be at most 1 pixel left to process.
 * Process it if needed.
 */

        testl $0x01, %ecx
        je .L01

        movzwl (%eax), %ecx
        movd %ecx, %mm4

        pshufw $0x00, %mm4, %mm0

        pand %mm5, %mm0
        pmullw %mm6, %mm0
#if SCALE_ADJUST > 0
        psrlw $SCALE_ADJUST, %mm0
#endif
        pmulhuw %mm7, %mm0

        por %mm3, %mm0

        packuswb %mm0, %mm0

        movd %mm0, (%edx)

.L01:
#ifdef USE_INNER_EMMS
        emms
#endif
        ret
#endif /* !defined(__DJGPP__) && !defined(__MINGW32__) */

#if defined (__ELF__) && defined (__linux__)
        .section .note.GNU-stack,"",%progbits
#endif