/*
 * (C) Copyright IBM Corporation 2004
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * IBM AND/OR THEIR SUPPLIERS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

/**
 * \file read_rgba_span_x86.S
 * Optimized routines to transfer pixel data from the framebuffer to a
 * buffer in main memory.
 *
 * \author Ian Romanick <idr@us.ibm.com>
 */

.file "read_rgba_span_x86.S"
#if !defined(__DJGPP__) && !defined(__MINGW32__) && !defined(__APPLE__) /* this one cries for assyntax.h */
/* Kevin F. Quinn 2nd July 2006
 * Replaced data segment constants with text-segment instructions.
 */
#define LOAD_MASK(mvins,m1,m2) \
	pushl	$0xff00ff00 ;\
	pushl	$0xff00ff00 ;\
	pushl	$0xff00ff00 ;\
	pushl	$0xff00ff00 ;\
	mvins	(%esp), m1 ;\
	pushl	$0x00ff0000 ;\
	pushl	$0x00ff0000 ;\
	pushl	$0x00ff0000 ;\
	pushl	$0x00ff0000 ;\
	mvins	(%esp), m2 ;\
	addl	$32, %esp

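/* LOAD_MASK materializes two constants on the stack and loads them with
 * the supplied move instruction.  A typical use (a sketch of the expansion):
 *
 *     LOAD_MASK(movq,%mm1,%mm2)    // %mm1 = 0xff00ff00ff00ff00
 *                                  // %mm2 = 0x00ff000000ff0000
 */
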
/* I implemented these as macros because they appear in several places,
 * and I've tweaked them a number of times.  I got tired of changing every
 * place they appear. :)
 */

#define DO_ONE_PIXEL() \
	movl	(%ebx), %eax ; \
	addl	$4, %ebx ; \
	bswap	%eax /* ARGB -> BGRA */ ; \
	rorl	$8, %eax /* BGRA -> ABGR */ ; \
	movl	%eax, (%ecx) /* ABGR -> R, G, B, A */ ; \
	addl	$4, %ecx

#define DO_ONE_LAST_PIXEL() \
	movl	(%ebx), %eax ; \
	bswap	%eax /* ARGB -> BGRA */ ; \
	rorl	$8, %eax /* BGRA -> ABGR */ ; \
	movl	%eax, (%ecx) /* ABGR -> R, G, B, A */


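/* A rough C equivalent of the per-pixel swizzle above (a sketch, assuming
 * a little-endian host; src and dst are hypothetical uint32_t pointers):
 *
 *     uint32_t argb = *src++;                      // register value 0xAARRGGBB
 *     uint32_t bgra = __builtin_bswap32(argb);     // 0xBBGGRRAA
 *     uint32_t abgr = (bgra >> 8) | (bgra << 24);  // rotate right 8 -> 0xAABBGGRR
 *     *dst++ = abgr;                               // stored bytes: R, G, B, A
 */
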
/**
 * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_MMX
#ifndef USE_DRICORE
.hidden _generic_read_RGBA_span_BGRA8888_REV_MMX
#endif
.type _generic_read_RGBA_span_BGRA8888_REV_MMX, @function
_generic_read_RGBA_span_BGRA8888_REV_MMX:
	pushl	%ebx

#ifdef USE_INNER_EMMS
	emms
#endif
	LOAD_MASK(movq,%mm1,%mm2)

	movl	8(%esp), %ebx	/* source pointer */
	movl	16(%esp), %edx	/* number of pixels to copy */
	movl	12(%esp), %ecx	/* destination pointer */

	testl	%edx, %edx
	jle	.L20		/* Bail if there's nothing to do. */

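	/* If the source pointer is only 4-byte aligned, convert one leading
	 * pixel the slow way so that the movq loads in the main loop are
	 * 8-byte aligned.
	 */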
	movl	%ebx, %eax

	negl	%eax
	sarl	$2, %eax
	andl	$1, %eax
	je	.L17

	subl	%eax, %edx
	DO_ONE_PIXEL()
.L17:

	/* Would it be faster to unroll this loop once and process 4 pixels
	 * per pass, instead of just two?
	 */

	movl	%edx, %eax
	shrl	%eax
	jmp	.L18
.L19:
	movq	(%ebx), %mm0
	addl	$8, %ebx

	/* These 9 instructions do what PSHUFB (if there were such an
	 * instruction) could do in 1. :(
	 */
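
	/* Per 32-bit pixel, the mask-and-shift sequence computes, in C terms
	 * (a sketch):
	 *
	 *     rgba = (argb & 0xff00ff00)           // A and G stay in place
	 *          | ((argb & 0x00ff0000) >> 16)   // R moves down to byte 0
	 *          | ((argb << 16) & 0x00ff0000);  // B moves up to byte 2
	 */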

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
	subl	$1, %eax
.L18:
	jne	.L19

#ifdef USE_INNER_EMMS
	emms
#endif

	/* At this point there are either 1 or 0 pixels remaining to be
	 * converted.  Convert the last pixel, if needed.
	 */

	testl	$1, %edx
	je	.L20

	DO_ONE_LAST_PIXEL()

.L20:
	popl	%ebx
	ret
	.size _generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX


/**
 * SSE optimized version of the BGRA8888_REV to RGBA copy routine.  SSE
 * instructions are only actually used to read data from the framebuffer.
 * In practice, the speed-up is pretty small.
 *
 * \todo
 * Do some more testing and determine if there's any reason to have this
 * function in addition to the MMX version.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_SSE
#ifndef USE_DRICORE
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE
#endif
.type _generic_read_RGBA_span_BGRA8888_REV_SSE, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE:
	pushl	%esi
	pushl	%ebx
	pushl	%ebp

#ifdef USE_INNER_EMMS
	emms
#endif

	LOAD_MASK(movq,%mm1,%mm2)

	movl	16(%esp), %ebx	/* source pointer */
	movl	24(%esp), %edx	/* number of pixels to copy */
	movl	20(%esp), %ecx	/* destination pointer */

	testl	%edx, %edx
	jle	.L35		/* Bail if there's nothing to do. */

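	/* Carve out a 16-byte-aligned scratch area on the stack; the movaps
	 * store in the main loop below requires 16-byte alignment.  %ebp
	 * preserves the original stack pointer for the epilogue.
	 */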
	movl	%esp, %ebp
	subl	$16, %esp
	andl	$0xfffffff0, %esp

	movl	%ebx, %eax
	movl	%edx, %esi

	negl	%eax
	andl	$15, %eax
	sarl	$2, %eax
	cmpl	%edx, %eax
	cmovle	%eax, %esi

	subl	%esi, %edx

	testl	$1, %esi
	je	.L32

	DO_ONE_PIXEL()
.L32:

	testl	$2, %esi
	je	.L31

	movq	(%ebx), %mm0
	addl	$8, %ebx

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
.L31:

	movl	%edx, %eax
	shrl	$2, %eax
	jmp	.L33
.L34:
	movaps	(%ebx), %xmm0
	addl	$16, %ebx

	/* This would be so much better if we could just move directly from
	 * an SSE register to an MMX register.  Unfortunately, that
	 * functionality wasn't introduced until SSE2 with the MOVDQ2Q
	 * instruction.
	 */

	movaps	%xmm0, (%esp)
	movq	(%esp), %mm0
	movq	8(%esp), %mm5

	movq	%mm0, %mm3
	movq	%mm0, %mm4
	movq	%mm5, %mm6
	movq	%mm5, %mm7

	pand	%mm2, %mm3
	pand	%mm2, %mm6

	psllq	$16, %mm4
	psllq	$16, %mm7

	psrlq	$16, %mm3
	psrlq	$16, %mm6

	pand	%mm2, %mm4
	pand	%mm2, %mm7

	pand	%mm1, %mm0
	pand	%mm1, %mm5

	por	%mm4, %mm3
	por	%mm7, %mm6

	por	%mm3, %mm0
	por	%mm6, %mm5

	movq	%mm0, (%ecx)
	movq	%mm5, 8(%ecx)
	addl	$16, %ecx

	subl	$1, %eax
.L33:
	jne	.L34

#ifdef USE_INNER_EMMS
	emms
#endif
	movl	%ebp, %esp

	/* At this point there are either [0, 3] pixels remaining to be
	 * converted.
	 */

	testl	$2, %edx
	je	.L36

	movq	(%ebx), %mm0
	addl	$8, %ebx

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
.L36:

	testl	$1, %edx
	je	.L35

	DO_ONE_LAST_PIXEL()
.L35:
	popl	%ebp
	popl	%ebx
	popl	%esi
	ret
	.size _generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE


/**
 * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
 */

.text
.globl _generic_read_RGBA_span_BGRA8888_REV_SSE2
#ifndef USE_DRICORE
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2
#endif
.type _generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE2:
	pushl	%esi
	pushl	%ebx

	LOAD_MASK(movdqu,%xmm1,%xmm2)

	movl	12(%esp), %ebx	/* source pointer */
	movl	20(%esp), %edx	/* number of pixels to copy */
	movl	16(%esp), %ecx	/* destination pointer */

	movl	%ebx, %eax
	movl	%edx, %esi

	testl	%edx, %edx
	jle	.L46		/* Bail if there's nothing to do. */

	/* If the source pointer isn't a multiple of 16 we have to process
	 * a few pixels the "slow" way to get the address aligned for
	 * the SSE fetch instructions.
	 */
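
	/* In C terms the leading-pixel count is roughly (a sketch, where src
	 * is the source pointer and n the remaining pixel count):
	 *
	 *     lead = ((0 - (uintptr_t) src) & 15) >> 2;  // dwords to a 16-byte boundary
	 *     if (lead > n) lead = n;
	 */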

	negl	%eax
	andl	$15, %eax
	sarl	$2, %eax

	cmpl	%edx, %eax
	cmovbe	%eax, %esi
	subl	%esi, %edx

	testl	$1, %esi
	je	.L41

	DO_ONE_PIXEL()
.L41:
	testl	$2, %esi
	je	.L40

	movq	(%ebx), %xmm0
	addl	$8, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movq	%xmm0, (%ecx)
	addl	$8, %ecx
.L40:

	/* Would it be worth having a specialized version of this loop for
	 * the case where the destination is 16-byte aligned?  That version
	 * would be identical except that it could use movdqa instead of
	 * movdqu.
	 */

	movl	%edx, %eax
	shrl	$2, %eax
	jmp	.L42
.L43:
	movdqa	(%ebx), %xmm0
	addl	$16, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movdqu	%xmm0, (%ecx)
	addl	$16, %ecx
	subl	$1, %eax
.L42:
	jne	.L43


	/* There may be up to 3 pixels remaining to be copied.  Take care
	 * of them now.  We do the 2 pixel case first because the data
	 * will be aligned.
	 */

	testl	$2, %edx
	je	.L47

	movq	(%ebx), %xmm0
	addl	$8, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movq	%xmm0, (%ecx)
	addl	$8, %ecx
.L47:

	testl	$1, %edx
	je	.L46

	DO_ONE_LAST_PIXEL()
.L46:

	popl	%ebx
	popl	%esi
	ret
	.size _generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2



#define MASK_565_L	0x07e0f800
#define MASK_565_H	0x0000001f
/* Setting SCALE_ADJUST to 5 gives a perfect match with the
 * classic C implementation in Mesa.  Setting SCALE_ADJUST
 * to 0 is slightly faster but at a small cost to accuracy.
 */
#define SCALE_ADJUST	5
#if SCALE_ADJUST == 5
#define PRESCALE_L	0x00100001
#define PRESCALE_H	0x00000200
#define SCALE_L		0x40C620E8
#define SCALE_H		0x0000839d
#elif SCALE_ADJUST == 0
#define PRESCALE_L	0x00200001
#define PRESCALE_H	0x00000800
#define SCALE_L		0x01040108
#define SCALE_H		0x00000108
#else
#error SCALE_ADJUST must either be 5 or 0.
#endif
#define ALPHA_L		0x00000000
#define ALPHA_H		0x00ff0000
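
/* For SCALE_ADJUST == 5, the arithmetic for the red channel of one pixel
 * works out as follows (a sketch; green and blue use the other PRESCALE
 * and SCALE words the same way):
 *
 *     uint16_t r  = pixel & 0xf800;               // red field, i.e. r5 << 11
 *     uint16_t t  = (uint16_t)(r * 0x0001) >> 5;  // pmullw PRESCALE, psrlw SCALE_ADJUST
 *     uint8_t  r8 = ((uint32_t)t * 0x20E8) >> 16; // pmulhuw SCALE
 *
 * which expands the 5-bit component to 8 bits (approximately r5 * 255 / 31)
 * using only multiplies and shifts.
 */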

/**
 * MMX optimized version of the RGB565 to RGBA copy routine.
 */

.text
.globl _generic_read_RGBA_span_RGB565_MMX
#ifndef USE_DRICORE
.hidden _generic_read_RGBA_span_RGB565_MMX
#endif
.type _generic_read_RGBA_span_RGB565_MMX, @function

_generic_read_RGBA_span_RGB565_MMX:

#ifdef USE_INNER_EMMS
	emms
#endif

	movl	4(%esp), %eax	/* source pointer */
	movl	8(%esp), %edx	/* destination pointer */
	movl	12(%esp), %ecx	/* number of pixels to copy */

	pushl	$MASK_565_H
	pushl	$MASK_565_L
	movq	(%esp), %mm5
	pushl	$PRESCALE_H
	pushl	$PRESCALE_L
	movq	(%esp), %mm6
	pushl	$SCALE_H
	pushl	$SCALE_L
	movq	(%esp), %mm7
	pushl	$ALPHA_H
	pushl	$ALPHA_L
	movq	(%esp), %mm3
	addl	$32, %esp

	sarl	$2, %ecx
	jl	.L01		/* Bail early if the count is negative. */
	jmp	.L02

.L03:
	/* Fetch 4 RGB565 pixels into %mm4.  Distribute the first and
	 * second pixels into the four words of %mm0 and %mm2.
	 */

	movq	(%eax), %mm4
	addl	$8, %eax

	pshufw	$0x00, %mm4, %mm0
	pshufw	$0x55, %mm4, %mm2


	/* Mask the pixels so that each word of each register contains only
	 * one color component.
	 */

	pand	%mm5, %mm0
	pand	%mm5, %mm2


	/* Adjust the component values so that they are as small as possible,
	 * but large enough so that we can multiply them by an unsigned 16-bit
	 * number and get a value as large as 0x00ff0000.
	 */

	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif

	/* Scale the input component values to be on the range
	 * [0, 0x00ff0000].  This is the real magic of the whole routine.
	 */

	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2


	/* Always set the alpha value to 0xff.
	 */

	por	%mm3, %mm0
	por	%mm3, %mm2


	/* Pack the 16-bit values to 8-bit values and store the converted
	 * pixel data.
	 */

	packuswb	%mm2, %mm0
	movq	%mm0, (%edx)
	addl	$8, %edx

	pshufw	$0xaa, %mm4, %mm0
	pshufw	$0xff, %mm4, %mm2

	pand	%mm5, %mm0
	pand	%mm5, %mm2
	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif
	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2

	por	%mm3, %mm0
	por	%mm3, %mm2

	packuswb	%mm2, %mm0

	movq	%mm0, (%edx)
	addl	$8, %edx

	subl	$1, %ecx
.L02:
	jne	.L03


	/* At this point there can be at most 3 pixels left to process.  If
	 * there are 2 or 3 left, process 2 of them now.
	 */

	movl	12(%esp), %ecx
	testl	$0x02, %ecx
	je	.L04

	movd	(%eax), %mm4
	addl	$4, %eax

	pshufw	$0x00, %mm4, %mm0
	pshufw	$0x55, %mm4, %mm2

	pand	%mm5, %mm0
	pand	%mm5, %mm2
	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif
	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2

	por	%mm3, %mm0
	por	%mm3, %mm2

	packuswb	%mm2, %mm0

	movq	%mm0, (%edx)
	addl	$8, %edx

.L04:
	/* At this point there can be at most 1 pixel left to process.
	 * Process it if needed.
	 */

	testl	$0x01, %ecx
	je	.L01

	movzwl	(%eax), %ecx
	movd	%ecx, %mm4

	pshufw	$0x00, %mm4, %mm0

	pand	%mm5, %mm0
	pmullw	%mm6, %mm0
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
#endif
	pmulhuw	%mm7, %mm0

	por	%mm3, %mm0

	packuswb	%mm0, %mm0

	movd	%mm0, (%edx)

.L01:
#ifdef USE_INNER_EMMS
	emms
#endif
	ret
#endif /* !defined(__DJGPP__) && !defined(__MINGW32__) && !defined(__APPLE__) */

#if defined (__ELF__) && defined (__linux__)
.section .note.GNU-stack,"",%progbits
#endif