src/gallium/drivers/nvfx/nv04_2d.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2009 Ben Skeggs
   4  * Copyright 2009 Younes Manton
   5  * Copyright 2010 Luca Barbieri
   6  * All Rights Reserved.
   7  *
   8  * Permission is hereby granted, free of charge, to any person obtaining
   9  * a copy of this software and associated documentation files (the
  10  * "Software"), to deal in the Software without restriction, including
  11  * without limitation the rights to use, copy, modify, merge, publish,
  12  * distribute, sub license, and/or sell copies of the Software, and to
  13  * permit persons to whom the Software is furnished to do so, subject to
  14  * the following conditions:
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  17  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
  18  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  19  * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
  20  * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  22  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  23  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  24  *
  25  * The above copyright notice and this permission notice (including the
  26  * next paragraph) shall be included in all copies or substantial portions
  27  * of the Software.
  28  *
  29  **************************************************************************/
  30
  31 /* this code has no Mesa or Gallium dependency and can be reused in the classic Mesa driver or DDX */
  32
  33 #include <stdlib.h>
  34 #include <stdio.h>
  35 #include <stdint.h>
  36 #include <nouveau/nouveau_class.h>
  37 #include <nouveau/nouveau_device.h>
  38 #include <nouveau/nouveau_pushbuf.h>
  39 #include <nouveau/nouveau_channel.h>
  40 #include <nouveau/nouveau_bo.h>
  41 #include <nouveau/nouveau_notifier.h>
  42 #include <nouveau/nouveau_grobj.h>
  43 #include "nv04_2d.h"
  44
  45 /* avoid depending on Mesa/Gallium */
  46 #ifdef __GNUC__
  47 #define likely(x) __builtin_expect(!!(x), 1)
  48 #define unlikely(x) __builtin_expect(!!(x), 0)
  49 #else
  50 #define likely(x) !!(x)
  51 #define unlikely(x) !!(x)
  52 #endif
  53
  54 #define MIN2( A, B )   ( (A)<(B) ? (A) : (B) )
  55 #define MAX2( A, B )   ( (A)>(B) ? (A) : (B) )
  56
  57 struct nv04_2d_context
  58 {
  59         struct nouveau_notifier *ntfy;
  60         struct nouveau_grobj *surf2d;
  61         struct nouveau_grobj *swzsurf;
  62         struct nouveau_grobj *m2mf;
  63         struct nouveau_grobj *rect;
  64         struct nouveau_grobj *sifm;
  65         struct nouveau_grobj *blit;
  66 };
  67
  68 static inline int
  69 align(int value, int alignment)
  70 {
  71    return (value + alignment - 1) & ~(alignment - 1);
  72 }
  73
  74 static inline int
  75 util_is_pot(unsigned x)
  76 {
  77    return (x & (x - 1)) == 0;
  78 }
  79
  80 /* Integer base-2 logarithm, rounded towards zero. */
  81 static inline unsigned log2i(unsigned i)
  82 {
  83         unsigned r = 0;
  84
  85         if (i & 0xffff0000) {
  86                 i >>= 16;
  87                 r += 16;
  88         }
  89         if (i & 0x0000ff00) {
  90                 i >>= 8;
  91                 r += 8;
  92         }
  93         if (i & 0x000000f0) {
  94                 i >>= 4;
  95                 r += 4;
  96         }
  97         if (i & 0x0000000c) {
  98                 i >>= 2;
  99                 r += 2;
 100         }
 101         if (i & 0x00000002) {
 102                 r += 1;
 103         }
 104         return r;
 105 }
 106
 107 //#define NV04_REGION_DEBUG
 108
 109 // Yes, we really want to inline everything, since all the functions are used only once
 110 #if defined(__GNUC__) && !defined(DEBUG)
 111 #define inline __attribute__((always_inline)) inline
 112 #endif
 113
 114 static inline unsigned
 115 nv04_swizzle_bits_square(unsigned x, unsigned y)
 116 {
 117         unsigned u = (x & 0x001) << 0 |
 118                      (x & 0x002) << 1 |
 119                      (x & 0x004) << 2 |
 120                      (x & 0x008) << 3 |
 121                      (x & 0x010) << 4 |
 122                      (x & 0x020) << 5 |
 123                      (x & 0x040) << 6 |
 124                      (x & 0x080) << 7 |
 125                      (x & 0x100) << 8 |
 126                      (x & 0x200) << 9 |
 127                      (x & 0x400) << 10 |
 128                      (x & 0x800) << 11;
 129
 130         unsigned v = (y & 0x001) << 1 |
 131                      (y & 0x002) << 2 |
 132                      (y & 0x004) << 3 |
 133                      (y & 0x008) << 4 |
 134                      (y & 0x010) << 5 |
 135                      (y & 0x020) << 6 |
 136                      (y & 0x040) << 7 |
 137                      (y & 0x080) << 8 |
 138                      (y & 0x100) << 9 |
 139                      (y & 0x200) << 10 |
 140                      (y & 0x400) << 11 |
 141                      (y & 0x800) << 12;
 142         return v | u;
 143 }
 144
 145 /* rectangular swizzled textures are linear concatenations of swizzled square tiles */
 146 static inline unsigned
 147 nv04_swizzle_bits_2d(unsigned x, unsigned y, unsigned w, unsigned h)
 148 {
 149         if(h <= 1)
 150                 return x;
 151         else
 152         {
 153                 unsigned s = MIN2(w, h);
 154                 unsigned m = s - 1;
 155                 return (((x | y) & ~m) * s) | nv04_swizzle_bits_square(x & m, y & m);
 156         }
 157 }
 158
 159 // general 3D texture case
 160 static inline unsigned
 161 nv04_swizzle_bits(unsigned x, unsigned y, unsigned z, unsigned w, unsigned h, unsigned d)
 162 {
 163         if(d <= 1)
 164                 return nv04_swizzle_bits_2d(x, y, w, h);
 165         else
 166         {
 167                 // TODO: autogenerate code for all possible texture sizes (13 * 13 * 13 with dims <= 4096) and do a single indirect call
 168                 unsigned v = 0;
 169                 w >>= 1;
 170                 h >>= 1;
 171                 d >>= 1;
 172                 for(int i = 0;;)
 173                 {
 174                         int oldi = i;
 175                         if(likely(w))
 176                         {
 177                                 v |= (x & 1) << i;
 178                                 x >>= 1;
 179                                 w >>= 1;
 180                                 ++i;
 181                         }
 182
 183                         if(likely(h))
 184                         {
 185                                 v |= (y & 1) << i;
 186                                 y >>= 1;
 187                                 h >>= 1;
 188                                 ++i;
 189                         }
 190
 191                         if(likely(d))
 192                         {
 193                                 v |= (z & 1) << i;
 194                                 z >>= 1;
 195                                 d >>= 1;
 196                                 ++i;
 197                         }
 198
 199                         if(i == oldi)
 200                                 break;
 201                 }
 202                 return v;
 203         }
 204 }
 205
 206 unsigned
 207 nv04_region_begin(struct nv04_region* rgn, unsigned w, unsigned h)
 208 {
 209         if(rgn->pitch)
 210                 return rgn->pitch * rgn->y + (rgn->x << rgn->bpps);
 211         else
 212                 return nv04_swizzle_bits(rgn->x, rgn->y, rgn->z, rgn->w, rgn->h, rgn->d) << rgn->bpps;
 213 }
 214
 215 unsigned
 216 nv04_region_end(struct nv04_region* rgn, unsigned w, unsigned h)
 217 {
 218         if(rgn->pitch)
 219                 return rgn->pitch * (rgn->y + h - 1) + ((rgn->x + w) << rgn->bpps);
 220         else
 221                 return (nv04_swizzle_bits(rgn->x + w - 1, rgn->y + h - 1, rgn->z, rgn->w, rgn->h, rgn->d) + 1) << rgn->bpps;
 222 }
 223
 224 // *pitch = -1 -> use 3D swizzling for (x, y), *pitch = 0 -> use 2D swizzling, other *pitch -> use linear calculations
 225 // returns 2 if pixel order is 3D-swizzled and 1 if subrect is 2D-swizzled
 226 /* *pitch == -1 ret = 0 -> 3D swizzled subrect
 227  * *pitch == 0 ret = 0 -> 2D swizzled subrect
 228  * *pitch > 0 ret = 0 -> linear subrect
 229  * *pitch > 0 ret = 1 -> linear subrect, but with swizzled 3D data inside
 230  */
 231
 232 static inline void
 233 nv04_region_print(struct nv04_region* rgn)
 234 {
 235         fprintf(stderr, "<%i[%i]> ", rgn->bo->handle, rgn->offset);
 236         if(rgn->pitch)
 237                 fprintf(stderr, "lin %i", rgn->pitch);
 238         else
 239                 fprintf(stderr, "swz %ix%ix%i", rgn->w, rgn->h, rgn->d);
 240         fprintf(stderr, " (%i, %i, %i)", rgn->x, rgn->y, rgn->z);
 241 }
 242
 243 static inline void
 244 nv04_region_assert(struct nv04_region* rgn, unsigned w, unsigned h)
 245 {
 246         unsigned end = rgn->offset + nv04_region_end(rgn, w, h);
 247
 248         assert(rgn->offset <= (int)rgn->bo->size);
 249         assert(end <= rgn->bo->size);
 250         (void) end;
 251         if(!rgn->pitch) {
 252                 assert(util_is_pot(rgn->w));
 253                 assert(util_is_pot(rgn->h));
 254         }
 255 }
 256
 257 /* determine if region can be linearized or fake-linearized */
 258 static inline int
 259 nv04_region_is_contiguous(struct nv04_region* rgn, int w, int h)
 260 {
 261         int surf_min;
 262         int rect_min;
 263
 264         if(rgn->pitch)
 265                 return rgn->pitch == w << rgn->bpps;
 266
 267         // redundant, but this is the fast path for the common case
 268         if(w == rgn->w && h == rgn->h && rgn->d <= 1)
 269                 return 1;
 270
 271         // must be POT
 272         if((w & (w - 1)) || (h & (h - 1)))
 273                 return 0;
 274
 275         // must be aligned
 276         if((rgn->x & (w - 1)) || (rgn->y & (h - 1)))
 277                 return 0;
 278
 279         if(rgn->d > 1)
 280                 return 0;
 281
 282         surf_min = MIN2(rgn->w, rgn->h);
 283         rect_min = MIN2(w, h);
 284
 285         if((rect_min == surf_min) || (w == h) || (w == 2 * h))
 286                 return 1;
 287
 288         return 0;
 289 }
 290
 291 // double the pitch until it is larger than the alignment, or the height becomes odd or 1
 292 static inline void
 293 nv04_region_contiguous_shape(struct nv04_region* rgn, int* w, int* h, int align)
 294 {
 295         while(!(*h & 1) && (*w << rgn->bpps) < (1 << align))
 296         {
 297                 *w <<= 1;
 298                 *h >>= 1;
 299         }
 300
 301         while((*w << rgn->bpps) > 16384 && !(*w & 1))
 302         {
 303                 *w >>= 1;
 304                 *h <<= 1;
 305         }
 306
 307 #ifdef NV04_REGION_DEBUG
 308         fprintf(stderr, "\tCONTIGUOUS %ix%i\n", *w, *h);
 309 #endif
 310 }
 311
 312 static inline void
 313 nv04_region_linearize_contiguous(struct nv04_region* rgn, unsigned w, unsigned h)
 314 {
 315         int pos;
 316         if(rgn->pitch)
 317         {
 318                 rgn->offset += rgn->y * rgn->pitch + (rgn->x << rgn->bpps);
 319                 rgn->x = 0;
 320                 rgn->y = 0;
 321         }
 322         else
 323         {
 324                 rgn->offset += (rgn->w * rgn->h * rgn->z) << rgn->bpps;
 325                 pos = nv04_swizzle_bits(rgn->x, rgn->y, rgn->z, rgn->w, rgn->h, rgn->d);
 326                 rgn->x = pos & (w - 1);
 327                 rgn->y = pos / w;
 328         }
 329         rgn->pitch = w << rgn->bpps;
 330
 331 #ifdef NV04_REGION_DEBUG
 332         fprintf(stderr, "\tLINEARIZE ");
 333         nv04_region_print(rgn);
 334         fprintf(stderr, "\n");
 335 #endif
 336 }
 337
 338         /* preserve the offset! */
 339         /*
 340         rgn->pitch = util_format_get_stride(rgn->format, w);
 341         int pos = nv04_swizzle_bits(rgn->x, rgn->y, rgn->z, rgn->w, rgn->h, rgn->d);
 342         rgn->x = pos & (w - 1);
 343         rgn->y = pos & ~(w - 1);
 344         */
 345
 346         /*
 347         rgn->offset +=
 348         rgn->pitch = util_format_get_stride(rgn->format, w);
 349         rgn->x = 0;
 350         rgn->y = 0;
 351         */
 352
 353 /* This code will get used for, and always succeed on:
 354  * - 4x2 1bpp swizzled texture mipmap levels
 355  * - linear regions created by linearization
 356  *
 357  * This code will get used for, and MAY work for:
 358  * - misaligned texture blanket
 359  * - linear surfaces created without wide_pitch (in this case, it will only work if we are lucky)
 360  *
 361  * The general case requires splitting the region in 2.
 362  */
 363 static inline int
 364 nv04_region_do_align_offset(struct nv04_region* rgn, unsigned w, unsigned h, int shift)
 365 {
 366         if(rgn->pitch > 0)
 367         {
 368                 int delta;
 369
 370                 assert(!(rgn->offset & ((1 << rgn->bpps) - 1))); // fatal!
 371                 delta = rgn->offset & ((1 << shift) - 1);
 372
 373                 if(h <= 1)
 374                 {
 375                         rgn->x += delta >> rgn->bpps;
 376                         rgn->offset -= delta;
 377                         rgn->pitch = align((rgn->x + w) << rgn->bpps, 1 << shift);
 378                 }
 379                 else
 380                 {
 381                         int newxo = (rgn->x << rgn->bpps) + delta;
 382                         int dy = newxo / rgn->pitch;
 383                         newxo -= dy * rgn->pitch;
 384                         if((newxo + (w << rgn->bpps)) > rgn->pitch)
 385                         {
 386                                 // TODO: split the region into two rectangles (!) if *really* necessary, unless the hardware actually supports "wrapping" rectangles
 387                                 // this does not happen if the surface is pitch-aligned, which it should always be
 388                                 assert(0);
 389                                 return -1;
 390                         }
 391                         rgn->x = newxo >> rgn->bpps;
 392                         rgn->y += dy;
 393                 }
 394         }
 395         else
 396         {
 397                 int size;
 398                 int min;
 399                 int v;
 400
 401                 // we don't care about the alignment of 3D surfaces since the 2D engine can't use them
 402                 if(rgn->d < 0)
 403                         return -1;
 404
 405                 min = MIN2(rgn->w, rgn->h);
 406                 size = min * min << rgn->bpps;
 407
 408                 // this is unfixable, and should not be happening
 409                 if(rgn->offset & (size - 1))
 410                         return -1;
 411
 412                 v = (rgn->offset & ((1 << shift) - 1)) / size;
 413                 rgn->offset -= v * size;
 414
 415                 if(rgn->h == min)
 416                 {
 417                         unsigned w;
 418                         rgn->x += rgn->h * v;
 419                         w = rgn->w + rgn->h * v;
 420
 421                         while(rgn->w < w)
 422                                 rgn->w += rgn->w;
 423                 }
 424                 else
 425                 {
 426                         unsigned h;
 427                         rgn->y += rgn->w * v;
 428                         h = rgn->h + rgn->w * v;
 429
 430                         while(rgn->h < h)
 431                                 rgn->h += rgn->h;
 432                 }
 433         }
 434
 435 #ifdef NV04_REGION_DEBUG
 436         fprintf(stderr, "\tALIGNED ");
 437         nv04_region_print(rgn);
 438         fprintf(stderr, "\n");
 439 #endif
 440         return 0;
 441 }
 442
 443 // both pitch and shift
 444 // will leave the region unchanged if it fails
 445 static inline int
 446 nv04_region_align(struct nv04_region* rgn, unsigned w, unsigned h, int shift)
 447 {
 448         if(rgn->pitch & ((1 << shift) - 1))
 449         {
 450                 if(h == 1)
 451                         goto do_align; /* this will fix pitch too in this case */
 452                 else
 453                         return -1;
 454         }
 455
 456         if(rgn->offset & ((1 << shift) - 1))
 457         {
 458                 do_align:
 459                 if(nv04_region_do_align_offset(rgn, w, h, shift))
 460                         return -1;
 461         }
 462         return 0;
 463 }
 464
 465 /* this contains 22 different copy loops after preprocessing. unfortunately, it's necessary */
 466 void
 467 nv04_region_copy_cpu(struct nv04_region* dst, struct nv04_region* src, int w, int h)
 468 {
 469         uint8_t* mdst;
 470         uint8_t* msrc;
 471         int size;
 472
 473         if(dst->bo != src->bo)
 474         {
 475                 nouveau_bo_map(dst->bo, NOUVEAU_BO_WR);
 476                 nouveau_bo_map(src->bo, NOUVEAU_BO_RD);
 477         }
 478         else
 479                 nouveau_bo_map(dst->bo, NOUVEAU_BO_WR | NOUVEAU_BO_RD);
 480
 481         mdst = (uint8_t*)dst->bo->map + dst->offset;
 482         msrc = (uint8_t*)src->bo->map + src->offset;
 483
 484         size = w << dst->bpps;
 485
 486         nv04_region_assert(dst, w, h);
 487         nv04_region_assert(src, w, h);
 488
 489 #ifdef NV04_REGION_DEBUG
 490         fprintf(stderr, "\tRGN_COPY_CPU [%i, %i: %i] ", w, h, dst->bpps);
 491         for(int i = 0; i < 2; ++i)
 492         {
 493                 nv04_region_print(i ? src : dst);
 494                 fprintf(stderr, i ? "\n" : " <- ");
 495         }
 496
 497 //      for(int i = 0; i < 16; ++i)
 498 //              fprintf(stderr, "%02x ", msrc[i]);
 499 //      fprintf(stderr, "\n");
 500 #endif
 501
 502         // TODO: support overlapping copies!
 503         if(src->pitch && dst->pitch)
 504         {
 505                 mdst += dst->y * dst->pitch + (dst->x << dst->bpps);
 506                 msrc += src->y * src->pitch + (src->x << src->bpps);
 507                 if(dst->bo != src->bo)
 508                         goto simple;
 509                 else if(mdst < msrc)
 510                 {
 511                         if(mdst + size <= msrc)
 512                         {
 513 simple:
 514                                 for(int iy = 0; iy < h; ++iy)
 515                                 {
 516                                         assert(mdst + size <= (uint8_t*)dst->bo->map + dst->bo->size);
 517                                         assert(msrc + size <= (uint8_t*)src->bo->map + src->bo->size);
 518                                         memcpy(mdst, msrc, size);
 519                                         msrc += src->pitch; mdst += dst->pitch;
 520                                 }
 521                         }
 522                         else
 523                         {
 524                                 for(int iy = 0; iy < h; ++iy)
 525                                 {
 526                                         assert(mdst + size <= (uint8_t*)dst->bo->map + dst->bo->size);
 527                                         assert(msrc + size <= (uint8_t*)src->bo->map + src->bo->size);
 528                                         memmove(mdst, msrc, size);
 529                                         msrc += src->pitch; mdst += dst->pitch;
 530                                 }
 531                         }
 532                 }
 533                 else
 534                 {
 535                         /* copy backwards so we don't destroy data we have to read yet */
 536                         if(msrc + size <= mdst)
 537                         {
 538                                 for(int iy = h - 1; iy >= 0; --iy)
 539                                 {
 540                                         assert(mdst + size <= (uint8_t*)dst->bo->map + dst->bo->size);
 541                                         assert(msrc + size <= (uint8_t*)src->bo->map + src->bo->size);
 542                                         memcpy(mdst, msrc, size);
 543                                         msrc += src->pitch; mdst += dst->pitch;
 544                                 }
 545                         }
 546                         else
 547                         {
 548                                 for(int iy = h - 1; iy >= 0; --iy)
 549                                 {
 550                                         assert(mdst + size <= (uint8_t*)dst->bo->map + dst->bo->size);
 551                                         assert(msrc + size <= (uint8_t*)src->bo->map + src->bo->size);
 552                                         memmove(mdst, msrc, size);
 553                                         msrc += src->pitch; mdst += dst->pitch;
 554                                 }
 555                         }
 556                 }
 557         }
 558         else
 559         {
 560                 int* dswx = NULL;
 561                 int* dswy = NULL;
 562                 int* sswx = NULL;
 563                 int* sswy = NULL;
 564                 int dir;
 565
 566                 if(!dst->pitch)
 567                 {
 568                         dswx = alloca(w * sizeof(int));
 569                         for(int ix = 0; ix < w; ++ix) // we are adding, so z cannot be contributed by both
 570                                 dswx[ix] = nv04_swizzle_bits(dst->x + ix, 0, 0, dst->w, dst->h, dst->d);
 571                         dswy = alloca(h * sizeof(int));
 572                         for(int iy = 0; iy < h; ++iy)
 573                                 dswy[iy] = nv04_swizzle_bits(0, dst->y + iy, dst->z, dst->w, dst->h, dst->d);
 574                 }
 575
 576                 if(!src->pitch)
 577                 {
 578                         sswx = alloca(w * sizeof(int));
 579                         for(int ix = 0; ix < w; ++ix)
 580                                 sswx[ix] = nv04_swizzle_bits(src->x + ix, 0, 0, src->w, src->h, src->d);
 581                         sswy = alloca(h * sizeof(int));
 582                         for(int iy = 0; iy < h; ++iy)
 583                                 sswy[iy] = nv04_swizzle_bits(0, src->y + iy, src->z, src->w, src->h, src->d);
 584                 }
 585
 586                 dir = 1;
 587                 /* do backwards copies for overlapping swizzled surfaces */
 588                 if(dst->pitch == src->pitch && dst->offset == src->offset)
 589                 {
 590                         if(dst->y > src->y || (dst->y == src->y && dst->x > src->x))
 591                                 dir = -1;
 592                 }
 593
 594 #define SWIZZLED_COPY_LOOPS
 595                 if(dir == 1)
 596                 {
 597                         int dir = 1;
 598 #define LOOP_Y for(int iy = 0; iy < h; ++iy)
 599 #define LOOP_X for(int ix = 0; ix < w; ++ix)
 600 #include "nv04_2d_loops.h"
 601 #undef LOOP_X
 602 #undef LOOP_Y
 603                 }
 604                 else
 605                 {
 606                         int dir = -1;
 607 #define LOOP_Y for(int iy = h - 1; iy >= 0; --iy)
 608 #define LOOP_X for(int ix = w - 1; ix >= 0; --ix)
 609 #include "nv04_2d_loops.h"
 610 #undef LOOP_X
 611 #undef LOOP_Y
 612                 }
 613 #undef SWIZZLED_COPY_LOOP
 614         }
 615
 616         if(src->bo != dst->bo)
 617                 nouveau_bo_unmap(src->bo);
 618         nouveau_bo_unmap(dst->bo);
 619 }
 620
 621 /* TODO: if the destination is swizzled, we are doing random writes, which causes write combining to fail
 622  * the alternative is to read, modify and copy back, which may or may not be faster
 623  * loading 3D textures is a common case that hits this and could probably benefit from the temporary
 624  */
 625 void
 626 nv04_region_fill_cpu(struct nv04_region* dst, int w, int h, unsigned value)
 627 {
 628         uint8_t* mdst = (nouveau_bo_map(dst->bo, NOUVEAU_BO_WR), (uint8_t*)dst->bo->map + dst->offset);
 629
 630 #ifdef NV04_REGION_DEBUG
 631         fprintf(stderr, "\tRGN_FILL_CPU ");
 632         nv04_region_print(dst);
 633         fprintf(stderr, "\n");
 634 #endif
 635
 636         nv04_region_assert(dst, w, h);
 637
 638         if(dst->pitch)
 639         {
 640                 unsigned size = w << dst->bpps;
 641
 642 #define FILL(T) do { \
 643                         for(int iy = 0; iy < h; ++iy) \
 644                         { \
 645                                 assert((char*)((T*)mdst + w) <= (char*)dst->bo->map + dst->bo->size); \
 646                                 for(int ix = 0; ix < w; ++ix) \
 647                                         ((T*)mdst)[ix] = (T)value; \
 648                                 mdst += dst->pitch; \
 649                         } \
 650                 } while(0)
 651
 652                 mdst += dst->y * dst->pitch + (dst->x << dst->bpps);
 653
 654                 if(dst->bpps == 0)
 655                 {
 656 ms:
 657                         assert(mdst + size * h <= (uint8_t*)dst->bo->map + dst->bo->size);
 658                         if(size == dst->pitch)
 659                                 memset(mdst, (uint8_t)value, size * h);
 660                         else
 661                         {
 662                                 for(int iy = 0; iy < h; ++iy)
 663                                 {
 664                                         assert(mdst + size <= (uint8_t*)dst->bo->map + dst->bo->size);
 665                                         memset(mdst, (uint8_t)value, size);
 666                                         mdst += dst->pitch;
 667                                 }
 668                         }
 669                 }
 670                 else if(dst->bpps == 1)
 671                 {
 672                         if(!((uint8_t)value ^ (uint8_t)(value >> 8)))
 673                                 goto ms;
 674
 675                         FILL(uint16_t);
 676                 }
 677                 else if(dst->bpps == 2)
 678                 {
 679                         if(value == (uint8_t)value * 0x1010101)
 680                                 goto ms;
 681                         FILL(uint32_t);
 682                 }
 683                 else
 684                         assert(0);
 685 #undef FILL
 686         }
 687         else
 688         {
 689                 int* dswx;
 690                 int* dswy;
 691
 692                 dswx = alloca(w * sizeof(int));
 693                 for(int ix = 0; ix < w; ++ix)
 694                         dswx[ix] = nv04_swizzle_bits(dst->x + ix, 0, dst->z, dst->w, dst->h, dst->d);
 695                 dswy = alloca(h * sizeof(int));
 696                 for(int iy = 0; iy < h; ++iy)
 697                         dswy[iy] = nv04_swizzle_bits(0, dst->y + iy, dst->z, dst->w, dst->h, dst->d);
 698
 699 #define FILL(T) do { \
 700                         T tvalue = (T)value; \
 701                         for(int iy = 0; iy < h; ++iy) \
 702                         { \
 703                                 T* pdst = (T*)mdst + dswy[iy]; \
 704                                 for(int ix = 0; ix < w; ++ix) \
 705                                 { \
 706                                         assert((uint8_t*)&pdst[dswx[ix] + 1] <= (uint8_t*)dst->bo->map + dst->bo->size); \
 707                                         pdst[dswx[ix]] = tvalue; \
 708                                 } \
 709                         } \
 710                 } while(0)
 711
 712                 if(dst->bpps == 0)
 713                         FILL(uint8_t);
 714                 else if(dst->bpps == 1)
 715                         FILL(uint16_t);
 716                 else if(dst->bpps == 2)
 717                         FILL(uint32_t);
 718                 else
 719                         assert(0 && "unhandled bpp");
 720 #undef FILL
 721         }
 722
 723         nouveau_bo_unmap(dst->bo);
 724 }
 725
 726 static void
 727 nv04_region_copy_swizzle(struct nv04_2d_context *ctx,
 728                           struct nv04_region* dst,
 729                           struct nv04_region* src,
 730                           int w, int h, int cs2d_format, int sifm_format)
 731 {
 732         struct nouveau_channel *chan = ctx->swzsurf->channel;
 733         struct nouveau_grobj *swzsurf = ctx->swzsurf;
 734         struct nouveau_grobj *sifm = ctx->sifm;
 735         /* Max width & height may not be the same on all HW, but must be POT */
 736         unsigned max_shift = 10;
 737         unsigned cw = 1 << max_shift;
 738         unsigned ch = 1 << max_shift;
 739         unsigned sx = dst->x >> max_shift;
 740         unsigned sy = dst->y >> max_shift;
 741         unsigned ex = (dst->x + w - 1) >> max_shift;
 742         unsigned ey = (dst->y + h - 1) >> max_shift;
 743         unsigned chunks = (ex - sx + 1) * (ey - sy + 1);
 744         unsigned chunk_size;
 745         if(dst->w < cw)
 746                 cw = dst->w;
 747         if(dst->h < ch)
 748                 ch = dst->h;
 749         chunk_size = cw * ch << dst->bpps;
 750
 751 #ifdef NV04_REGION_DEBUG
 752         fprintf(stderr, "\tRGN_COPY_SWIZZLE [%i, %i: %i] ", w, h, dst->bpps);
 753         for(int i = 0; i < 2; ++i)
 754         {
 755                 nv04_region_print(i ? src : dst);
 756                 fprintf(stderr, i ? "\n" : " <- ");
 757         }
 758 #endif
 759
 760         nv04_region_assert(dst, w, h);
 761         nv04_region_assert(src, w, h);
 762
 763         MARK_RING (chan, 8 + chunks * 17, 2 + chunks * 2);
 764
 765         BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_DMA_IMAGE, 1);
 766         OUT_RELOCo(chan, dst->bo,
 767                         NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
 768
 769         BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_FORMAT, 1);
 770         OUT_RING  (chan, cs2d_format |
 771                          log2i(cw) << NV04_SWIZZLED_SURFACE_FORMAT_BASE_SIZE_U_SHIFT |
 772                          log2i(ch) << NV04_SWIZZLED_SURFACE_FORMAT_BASE_SIZE_V_SHIFT);
 773
 774         BEGIN_RING(chan, sifm, NV03_SCALED_IMAGE_FROM_MEMORY_DMA_IMAGE, 1);
 775         OUT_RELOCo(chan, src->bo,
 776                          NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
 777         BEGIN_RING(chan, sifm, NV04_SCALED_IMAGE_FROM_MEMORY_SURFACE, 1);
 778         OUT_RING  (chan, swzsurf->handle);
 779
 780         assert(!(dst->offset & 63));
 781
 782         for (int cy = sy; cy <= ey; ++cy) {
 783           int ry = MAX2(0, (int)(dst->y - ch * cy));
 784           int rh = MIN2((int)ch, (int)(dst->y - ch * cy + h)) - ry;
 785           for (int cx = sx; cx <= ex; ++cx) {
 786             int rx = MAX2(0, (int)(dst->x - cw * cx));
 787             int rw = MIN2((int)cw, (int)(dst->x - cw * cx + w)) - rx;
 788             unsigned dst_offset;
 789             unsigned src_offset;
 790
 791             BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_OFFSET, 1);
 792
 793             dst_offset = dst->offset + (nv04_swizzle_bits_2d(cx * cw, cy * ch, dst->w, dst->h) << dst->bpps);
 794             assert(dst_offset <= dst->bo->size);
 795             assert(dst_offset + chunk_size <= dst->bo->size);
 796             OUT_RELOCl(chan, dst->bo, dst_offset,
 797                             NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
 798
 799             BEGIN_RING(chan, sifm, NV05_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION, 9);
 800             OUT_RING  (chan, NV05_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION_TRUNCATE);
 801             OUT_RING  (chan, sifm_format);
 802             OUT_RING  (chan, NV03_SCALED_IMAGE_FROM_MEMORY_OPERATION_SRCCOPY);
 803             OUT_RING  (chan, rx | (ry << NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_POINT_Y_SHIFT));
 804             OUT_RING  (chan, rh << NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_SIZE_H_SHIFT | rw);
 805             OUT_RING  (chan, rx | (ry << NV03_SCALED_IMAGE_FROM_MEMORY_OUT_POINT_Y_SHIFT));
 806             OUT_RING  (chan, rh << NV03_SCALED_IMAGE_FROM_MEMORY_OUT_SIZE_H_SHIFT | rw);
 807             OUT_RING  (chan, 1 << 20);
 808             OUT_RING  (chan, 1 << 20);
 809
 810             BEGIN_RING(chan, sifm, NV03_SCALED_IMAGE_FROM_MEMORY_SIZE, 4);
 811             OUT_RING  (chan, rh << NV03_SCALED_IMAGE_FROM_MEMORY_SIZE_H_SHIFT | align(rw, 8));
 812             OUT_RING  (chan, src->pitch |
 813                              NV03_SCALED_IMAGE_FROM_MEMORY_FORMAT_ORIGIN_CENTER |
 814                              NV03_SCALED_IMAGE_FROM_MEMORY_FORMAT_FILTER_POINT_SAMPLE);
 815             src_offset = src->offset + (cy * ch + ry + src->y - dst->y) * src->pitch + ((cx * cw + rx + src->x - dst->x) << src->bpps);
 816             assert(src_offset <= src->bo->size);
 817             assert(src_offset + (src->pitch * (rh - 1)) + (rw << src->bpps) <= src->bo->size);
 818             OUT_RELOCl(chan, src->bo, src_offset,
 819                              NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
 820             OUT_RING  (chan, 0);
 821           }
 822         }
 823 }
 824
 825 static inline void
 826 nv04_copy_m2mf_begin(struct nv04_2d_context *ctx, struct nouveau_bo* dstbo, struct nouveau_bo* srcbo, unsigned commands)
 827 {
 828         struct nouveau_channel *chan = ctx->m2mf->channel;
 829         struct nouveau_grobj *m2mf = ctx->m2mf;
 830         MARK_RING (chan, 3 + commands * 9, 2 + commands * 2);
 831         BEGIN_RING(chan, m2mf, NV04_MEMORY_TO_MEMORY_FORMAT_DMA_BUFFER_IN, 2);
 832         OUT_RELOCo(chan, srcbo,
 833                    NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
 834         OUT_RELOCo(chan, dstbo,
 835                    NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
 836 }
 837
 838 static inline void
 839 nv04_copy_m2mf_body(struct nv04_2d_context *ctx, struct nouveau_bo* dstbo, int* pdstoff, unsigned dstpitch, struct nouveau_bo* srcbo, int* psrcoff, unsigned srcpitch, unsigned size, unsigned lines)
 840 {
 841         struct nouveau_channel *chan = ctx->m2mf->channel;
 842         struct nouveau_grobj *m2mf = ctx->m2mf;
 843
 844 #ifdef NV04_REGION_DEBUG
 845         fprintf(stderr, "\t\t\tCOPY_M2MF_BODY [%i, %i] <%i[%u]> lin %u <- <%i[%u]> lin %u\n", size, lines, dstbo->handle, *pdstoff, dstpitch, srcbo->handle, *psrcoff, srcpitch);
 846 #endif
 847
 848         BEGIN_RING(chan, m2mf, NV04_MEMORY_TO_MEMORY_FORMAT_OFFSET_IN, 8);
 849         OUT_RELOCl(chan, srcbo, *psrcoff,
 850                    NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
 851         OUT_RELOCl(chan, dstbo, *pdstoff,
 852                    NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_WR);
 853         OUT_RING  (chan, srcpitch);
 854         OUT_RING  (chan, dstpitch);
 855         OUT_RING  (chan, size);
 856         OUT_RING  (chan, lines);
 857         OUT_RING  (chan, 0x0101);
 858         OUT_RING  (chan, 0);
 859
 860         *psrcoff += srcpitch * lines;
 861         *pdstoff += dstpitch * lines;
 862 }
 863
 864 static void
 865 nv04_copy_m2mf(struct nv04_2d_context *ctx,
 866                 struct nouveau_bo* dstbo, int dstoff, unsigned dstpitch,
 867                 struct nouveau_bo* srcbo, int srcoff, unsigned srcpitch,
 868                 unsigned size, unsigned h)
 869 {
 870         unsigned max_pitch = 32767;
 871         unsigned max_lines = 2047;
 872
 873 #ifdef NV04_REGION_DEBUG
 874         fprintf(stderr, "\t\tCOPY_M2MF [%i, %i] <%i[%i]> lin %u <- <%i[%i]> lin %u\n", size, h, dstbo->handle, dstoff, dstpitch, srcbo->handle, srcoff, srcpitch);
 875 #endif
 876
 877         if(srcpitch <= max_pitch && dstpitch <= max_pitch)
 878         {
 879                 unsigned full_pages = h / max_lines;
 880                 unsigned leftover_lines = h - full_pages * max_lines;
 881
 882                 nv04_copy_m2mf_begin(ctx, dstbo, srcbo, full_pages + !!leftover_lines);
 883
 884                 for(unsigned i = 0; i < full_pages; ++i)
 885                         nv04_copy_m2mf_body(ctx, dstbo, &dstoff, dstpitch, srcbo, &srcoff, srcpitch, size, max_lines);
 886
 887                 if(leftover_lines)
 888                         nv04_copy_m2mf_body(ctx, dstbo, &dstoff, dstpitch, srcbo, &srcoff, srcpitch, size, leftover_lines);
 889         }
 890         else
 891         {
 892                 unsigned lines = size / max_pitch;
 893                 unsigned leftover = size - lines * max_pitch;
 894                 unsigned full_pages = lines / max_lines;
 895                 unsigned leftover_lines = lines - full_pages * max_lines;
 896                 unsigned srcgap = srcpitch - size;
 897                 unsigned dstgap = dstpitch - size;
 898
 899                 nv04_copy_m2mf_begin(ctx, dstbo, srcbo, h * (full_pages + !!leftover_lines + !!leftover));
 900
 901                 for(unsigned i = 0; i < h; ++i)
 902                 {
 903                         for(unsigned j = 0; j < full_pages; ++j)
 904                                 nv04_copy_m2mf_body(ctx, dstbo, &dstoff, max_pitch, srcbo, &srcoff, max_pitch, max_pitch, max_lines);
 905
 906                         if(leftover_lines)
 907                                 nv04_copy_m2mf_body(ctx, dstbo, &dstoff, max_pitch, srcbo, &srcoff, max_pitch, max_pitch, leftover_lines);
 908
 909                         if(leftover)
 910                                 nv04_copy_m2mf_body(ctx, dstbo, &dstoff, leftover, srcbo, &srcoff, leftover, leftover, 1);
 911
 912                         srcoff += srcgap;
 913                         dstoff += dstgap;
 914                 }
 915         }
 916 }
 917
 918 void
 919 nv04_memcpy(struct nv04_2d_context *ctx, struct nouveau_bo* dstbo, int dstoff, struct nouveau_bo* srcbo, int srcoff, unsigned size)
 920 {
 921 #ifdef NV04_REGION_DEBUG
 922         fprintf(stderr, "\tMEMCPY [%i] <%i[%i]> <- <%i[%i]>\n", size, dstbo->handle, dstoff, srcbo->handle, srcoff);
 923 #endif
 924
 925         nv04_copy_m2mf(ctx, dstbo, dstoff, size, srcbo, srcoff, size, size, 1);
 926 }
 927
 928 static void
 929 nv04_region_copy_m2mf(struct nv04_2d_context *ctx, struct nv04_region *dst, struct nv04_region *src, int w, int h)
 930 {
 931 #ifdef NV04_REGION_DEBUG
 932         fprintf(stderr, "\tRGN_COPY_M2MF [%i, %i: %i] ", w, h, dst->bpps);
 933         for(int i = 0; i < 2; ++i)
 934         {
 935                 nv04_region_print(i ? src : dst);
 936                 fprintf(stderr, i ? "\n" : " <- ");
 937         }
 938 #endif
 939
 940         nv04_region_assert(dst, w, h);
 941         nv04_region_assert(src, w, h);
 942         assert(src->pitch);
 943         assert(dst->pitch);
 944
 945         nv04_copy_m2mf(ctx,
 946                         dst->bo, dst->offset + dst->y * dst->pitch + (dst->x << dst->bpps), dst->pitch,
 947                         src->bo, src->offset + src->y * src->pitch + (src->x << src->bpps), src->pitch,
 948                         w << src->bpps, h);
 949 }
 950
 951 static inline void
 952 nv04_region_copy_blit(struct nv04_2d_context *ctx, struct nv04_region* dst, struct nv04_region* src, int w, int h, int format)
 953 {
 954         struct nouveau_channel *chan = ctx->surf2d->channel;
 955         struct nouveau_grobj *surf2d = ctx->surf2d;
 956         struct nouveau_grobj *blit = ctx->blit;
 957
 958 #ifdef NV04_REGION_DEBUG
 959         fprintf(stderr, "\tRGN_COPY_BLIT [%i, %i: %i] ", w, h, dst->bpps);
 960         for(int i = 0; i < 2; ++i)
 961         {
 962                 nv04_region_print(i ? src : dst);
 963                 fprintf(stderr, i ? "\n" : " <- ");
 964         }
 965 #endif
 966
 967         assert(!(src->pitch & 63) && src->pitch);
 968         assert(!(dst->pitch & 63) && dst->pitch);
 969         nv04_region_assert(dst, w, h);
 970         nv04_region_assert(src, w, h);
 971
 972         MARK_RING (chan, 12, 4);
 973         BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2);
 974         OUT_RELOCo(chan, src->bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
 975         OUT_RELOCo(chan, dst->bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
 976         BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_FORMAT, 4);
 977         OUT_RING  (chan, format);
 978         OUT_RING  (chan, (dst->pitch << 16) | src->pitch);
 979         OUT_RELOCl(chan, src->bo, src->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
 980         OUT_RELOCl(chan, dst->bo, dst->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
 981
 982         BEGIN_RING(chan, blit, 0x0300, 3);
 983         OUT_RING  (chan, (src->y << 16) | src->x);
 984         OUT_RING  (chan, (dst->y << 16) | dst->x);
 985         OUT_RING  (chan, ( h << 16) |  w);
 986 }
 987
 988 /* THEOREM: a non-linearizable swizzled destination is always 64 byte aligned, except for 4x2 mipmap levels of swizzled 1bpp surfaces
 989  * HYPOTHESIS:
 990  * 1. The first mipmap level is 64-byte-aligned
 991  * PROOF:
 992  * 1. Thus, all mipmap levels with a parent which is 64-byte or more in size are.
 993  * 2. At 1bpp, the smallest levels with a <= 32-byte parent are either Nx1 or 1xN or size <=8, thus 4x2, 2x2 or 2x4
 994  * 3. Nx1, 1xN, 2x4, 2x2 have all subrects linearizable. 4x2 does not.
 995  * 4. At 2/4bpp or more, the smallest levels with a 32-byte parent are 1xN, Nx1 or 2x2
 996  *
 997  * However, nv04_region_align handles that.
 998  */
 999
1000 // 0 -> done, 1 -> do with 3D engine or CPU, -1 -> do with CPU
1001 // dst and src may be modified, and the possibly modified version should be passed to nv04_region_cpu if necessary
1002 int
1003 nv04_region_copy_2d(struct nv04_2d_context *ctx, struct nv04_region* dst, struct nv04_region* src,
1004                 int w, int h, int cs2d_format, int sifm_format, int dst_to_gpu, int src_on_gpu)
1005 {
1006         assert(src->bpps == dst->bpps);
1007
1008 #ifdef NV04_REGION_DEBUG
1009         fprintf(stderr, "RGN_COPY%s [%i, %i: %i] ", (cs2d_format >= 0) ? "_2D" : "_NO2D", w, h, dst->bpps);
1010         for(int i = 0; i < 2; ++i)
1011         {
1012                 int gpu = i ? src_on_gpu : dst_to_gpu;
1013                 nv04_region_print(i ? src : dst);
1014                 fprintf(stderr, " %s", gpu ? "gpu" : "cpu");
1015                 fprintf(stderr, i ? "\n" : " <- ");
1016         }
1017 #endif
1018
1019         // if they are contiguous and either both swizzled or both linear, reshape
1020         if(!dst->pitch == !src->pitch
1021                 && nv04_region_is_contiguous(dst, w, h)
1022                 && nv04_region_is_contiguous(src, w, h))
1023         {
1024                 nv04_region_contiguous_shape(dst, &w, &h, 6);
1025                 nv04_region_linearize_contiguous(dst, w, h);
1026                 nv04_region_linearize_contiguous(src, w, h);
1027         }
1028
1029 #ifdef NV04_REGION_DEBUG
1030         fprintf(stderr, "\tOPT ");
1031         for(int i = 0; i < 2; ++i)
1032         {
1033                 nv04_region_print(i ? src : dst);
1034                 fprintf(stderr, i ? "\n" : " <- ");
1035         }
1036 #endif
1037
1038         /* if the destination is not for GPU _and_ source is on CPU, use CPU */
1039         /* if the destination is not for GPU _or_ source is on CPU, use CPU only if we think it's faster than the GPU */
1040         /* TODO: benchmark to find out in which cases exactly we should prefer the CPU */
1041          if((!dst_to_gpu && !src_on_gpu)
1042                 || (!dst->pitch && dst->d > 1)
1043                 /* 3D swizzled destination are unwritable by the GPU, and 2D swizzled ones are readable only by the 3D engine */
1044          )
1045                  return -1;
1046         /* there is no known way to read 2D/3D-swizzled surfaces with the 2D engine
1047          * ask the caller to use the 3D engine
1048          * If a format cannot be sampled from the 3D engine there is no point in making it swizzled, so we must not do so
1049          */
1050          else if(!src->pitch)
1051          {
1052 #ifdef NV04_REGION_DEBUG
1053                 fprintf(stderr, "\tCOPY_ENG3D\n");
1054 #endif
1055                  return 1;
1056          }
1057         /* Setup transfer to swizzle the texture to vram if needed */
1058         else
1059         {
1060                 if (!dst->pitch)
1061                 {
1062                         if(cs2d_format < 0 || sifm_format < 0 || !dst_to_gpu)
1063                         {
1064 #ifdef NV04_REGION_DEBUG
1065                                 fprintf(stderr, "\tCOPY_ENG3D\n");
1066 #endif
1067                                 return 1;
1068                         }
1069                         else
1070                         {
1071                                 assert(!nv04_region_align(dst, w, h, 6));
1072
1073                                 nv04_region_copy_swizzle(ctx, dst, src, w, h, cs2d_format, sifm_format);
1074                                 return 0;
1075                         }
1076                 }
1077                 else
1078                 {
1079                         /* NV_CONTEXT_SURFACES_2D has buffer alignment restrictions, fallback
1080                          * to NV_MEMORY_TO_MEMORY_FORMAT in this case.
1081                          * TODO: is this also true for the source? possibly not
1082                          */
1083
1084                         if ((cs2d_format < 0)
1085                                 || !dst_to_gpu
1086                                 || nv04_region_align(src, w, h, 6)
1087                                 || nv04_region_align(dst, w, h, 6)
1088                                 )
1089                                 nv04_region_copy_m2mf(ctx, dst, src, w, h);
1090                         else
1091                                 nv04_region_copy_blit(ctx, dst, src, w, h, cs2d_format);
1092
1093                         return 0;
1094                 }
1095         }
1096 }
1097
1098 static inline void
1099 nv04_region_fill_gdirect(struct nv04_2d_context *ctx, struct nv04_region* dst, int w, int h, unsigned value)
1100 {
1101         struct nouveau_channel *chan = ctx->surf2d->channel;
1102         struct nouveau_grobj *surf2d = ctx->surf2d;
1103         struct nouveau_grobj *rect = ctx->rect;
1104         int cs2d_format, gdirect_format;
1105
1106 #ifdef NV04_REGION_DEBUG
1107         fprintf(stderr, "\tFILL_GDIRECT\n");
1108 #endif
1109
1110         assert(!(dst->pitch & 63) && dst->pitch);
1111         nv04_region_assert(dst, w, h);
1112
1113         if(dst->bpps == 0)
1114         {
1115                 gdirect_format = NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A8R8G8B8;
1116                 cs2d_format = NV04_CONTEXT_SURFACES_2D_FORMAT_Y8;
1117         }
1118         else if(dst->bpps == 1)
1119         {
1120                 gdirect_format = NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A16R5G6B5;
1121                 cs2d_format = NV04_CONTEXT_SURFACES_2D_FORMAT_Y16;
1122         }
1123         else if(dst->bpps == 2)
1124         {
1125                 gdirect_format = NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A8R8G8B8;
1126                 cs2d_format = NV04_CONTEXT_SURFACES_2D_FORMAT_Y32;
1127         }
1128         else
1129         {
1130                 assert(0);
1131                 gdirect_format = 0;
1132                 cs2d_format = 0;
1133         }
1134
1135         MARK_RING (chan, 15, 4);
1136         BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2);
1137         OUT_RELOCo(chan, dst->bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
1138         OUT_RELOCo(chan, dst->bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
1139         BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_FORMAT, 4);
1140         OUT_RING  (chan, cs2d_format);
1141         OUT_RING  (chan, (dst->pitch << 16) | dst->pitch);
1142         OUT_RELOCl(chan, dst->bo, dst->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
1143         OUT_RELOCl(chan, dst->bo, dst->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
1144
1145         BEGIN_RING(chan, rect, NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT, 1);
1146         OUT_RING  (chan, gdirect_format);
1147         BEGIN_RING(chan, rect, NV04_GDI_RECTANGLE_TEXT_COLOR1_A, 1);
1148         OUT_RING  (chan, value);
1149         BEGIN_RING(chan, rect, NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_POINT(0), 2);
1150         OUT_RING  (chan, (dst->x << 16) | dst->y);
1151         OUT_RING  (chan, ( w << 16) |  h);
1152 }
1153
1154 int
1155 nv04_region_fill_2d(struct nv04_2d_context *ctx, struct nv04_region *dst,
1156                   int w, int h, unsigned value)
1157 {
1158         if(!w || !h)
1159                 return 0;
1160
1161 #ifdef NV04_REGION_DEBUG
1162         fprintf(stderr, "FILL [%i, %i: %i] ", w, h, dst->bpps);
1163         nv04_region_print(dst);
1164         fprintf(stderr, " <- 0x%x\n", value);
1165 #endif
1166
1167         if(nv04_region_is_contiguous(dst, w, h))
1168         {
1169                 nv04_region_contiguous_shape(dst, &w, &h, 6);
1170                 nv04_region_linearize_contiguous(dst, w, h);
1171         }
1172
1173         // TODO: maybe do intermediate copies for some cases instead of using the 3D engine/CPU
1174         /* GdiRect doesn't work together with swzsurf, so the 3D engine, or an intermediate copy, is the only option here */
1175         if(!dst->pitch)
1176         {
1177 #ifdef NV04_REGION_DEBUG
1178                 fprintf(stderr, "\tFILL_ENG3D\n");
1179 #endif
1180                 return 1;
1181         }
1182         else if(!nv04_region_align(dst, w, h, 6))
1183         {
1184                 nv04_region_fill_gdirect(ctx, dst, w, h, value);
1185                 return 0;
1186         }
1187         else
1188                 return -1;
1189 }
1190
1191
1192 void
1193 nv04_2d_context_takedown(struct nv04_2d_context *ctx)
1194 {
1195         nouveau_notifier_free(&ctx->ntfy);
1196         nouveau_grobj_free(&ctx->m2mf);
1197         nouveau_grobj_free(&ctx->surf2d);
1198         nouveau_grobj_free(&ctx->swzsurf);
1199         nouveau_grobj_free(&ctx->rect);
1200         nouveau_grobj_free(&ctx->blit);
1201         nouveau_grobj_free(&ctx->sifm);
1202
1203         free(ctx);
1204 }
1205
1206 struct nv04_2d_context *
1207 nv04_2d_context_init(struct nouveau_channel* chan)
1208 {
1209         struct nv04_2d_context *ctx = calloc(1, sizeof(struct nv04_2d_context));
1210         unsigned handle = 0x88000000, class;
1211         int ret;
1212
1213         if (!ctx)
1214                 return NULL;
1215
1216         ret = nouveau_notifier_alloc(chan, handle++, 1, &ctx->ntfy);
1217         if (ret) {
1218                 nv04_2d_context_takedown(ctx);
1219                 return NULL;
1220         }
1221
1222         ret = nouveau_grobj_alloc(chan, handle++, 0x0039, &ctx->m2mf);
1223         if (ret) {
1224                 nv04_2d_context_takedown(ctx);
1225                 return NULL;
1226         }
1227
1228         BEGIN_RING(chan, ctx->m2mf, NV04_MEMORY_TO_MEMORY_FORMAT_DMA_NOTIFY, 1);
1229         OUT_RING  (chan, ctx->ntfy->handle);
1230
1231         if (chan->device->chipset < 0x10)
1232                 class = NV04_CONTEXT_SURFACES_2D;
1233         else
1234                 class = NV10_CONTEXT_SURFACES_2D;
1235
1236         ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->surf2d);
1237         if (ret) {
1238                 nv04_2d_context_takedown(ctx);
1239                 return NULL;
1240         }
1241
1242         BEGIN_RING(chan, ctx->surf2d,
1243                          NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2);
1244         OUT_RING  (chan, chan->vram->handle);
1245         OUT_RING  (chan, chan->vram->handle);
1246
1247         if (chan->device->chipset < 0x10)
1248                 class = NV04_IMAGE_BLIT;
1249         else
1250                 class = NV12_IMAGE_BLIT;
1251
1252         ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->blit);
1253         if (ret) {
1254                 nv04_2d_context_takedown(ctx);
1255                 return NULL;
1256         }
1257
1258         BEGIN_RING(chan, ctx->blit, NV01_IMAGE_BLIT_DMA_NOTIFY, 1);
1259         OUT_RING  (chan, ctx->ntfy->handle);
1260         BEGIN_RING(chan, ctx->blit, NV04_IMAGE_BLIT_SURFACE, 1);
1261         OUT_RING  (chan, ctx->surf2d->handle);
1262         BEGIN_RING(chan, ctx->blit, NV01_IMAGE_BLIT_OPERATION, 1);
1263         OUT_RING  (chan, NV01_IMAGE_BLIT_OPERATION_SRCCOPY);
1264
1265         ret = nouveau_grobj_alloc(chan, handle++, NV04_GDI_RECTANGLE_TEXT,
1266                                   &ctx->rect);
1267         if (ret) {
1268                 nv04_2d_context_takedown(ctx);
1269                 return NULL;
1270         }
1271
1272         BEGIN_RING(chan, ctx->rect, NV04_GDI_RECTANGLE_TEXT_DMA_NOTIFY, 1);
1273         OUT_RING  (chan, ctx->ntfy->handle);
1274         BEGIN_RING(chan, ctx->rect, NV04_GDI_RECTANGLE_TEXT_SURFACE, 1);
1275         OUT_RING  (chan, ctx->surf2d->handle);
1276         BEGIN_RING(chan, ctx->rect, NV04_GDI_RECTANGLE_TEXT_OPERATION, 1);
1277         OUT_RING  (chan, NV04_GDI_RECTANGLE_TEXT_OPERATION_SRCCOPY);
1278         BEGIN_RING(chan, ctx->rect,
1279                          NV04_GDI_RECTANGLE_TEXT_MONOCHROME_FORMAT, 1);
1280         OUT_RING  (chan, NV04_GDI_RECTANGLE_TEXT_MONOCHROME_FORMAT_LE);
1281
1282         switch (chan->device->chipset & 0xf0) {
1283         case 0x00:
1284         case 0x10:
1285                 class = NV04_SWIZZLED_SURFACE;
1286                 break;
1287         case 0x20:
1288                 class = NV20_SWIZZLED_SURFACE;
1289                 break;
1290         case 0x30:
1291                 class = NV30_SWIZZLED_SURFACE;
1292                 break;
1293         case 0x40:
1294         case 0x60:
1295                 class = NV40_SWIZZLED_SURFACE;
1296                 break;
1297         default:
1298                 /* Famous last words: this really can't happen.. */
1299                 assert(0);
1300                 break;
1301         }
1302
1303         ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->swzsurf);
1304         if (ret) {
1305                 nv04_2d_context_takedown(ctx);
1306                 return NULL;
1307         }
1308
1309         /* all the Gallium MARK_RING calculations assume no autobinding, so do that now */
1310         if(ctx->swzsurf->bound == NOUVEAU_GROBJ_UNBOUND)
1311                 nouveau_grobj_autobind(ctx->swzsurf);
1312
1313         switch (chan->device->chipset & 0xf0) {
1314         case 0x10:
1315         case 0x20:
1316                 class = NV10_SCALED_IMAGE_FROM_MEMORY;
1317                 break;
1318         case 0x30:
1319                 class = NV30_SCALED_IMAGE_FROM_MEMORY;
1320                 break;
1321         case 0x40:
1322         case 0x60:
1323                 class = NV40_SCALED_IMAGE_FROM_MEMORY;
1324                 break;
1325         default:
1326                 class = NV04_SCALED_IMAGE_FROM_MEMORY;
1327                 break;
1328         }
1329
1330         ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->sifm);
1331         if (ret) {
1332                 nv04_2d_context_takedown(ctx);
1333                 return NULL;
1334         }
1335
1336         /* all the Gallium MARK_RING calculations assume no autobinding, so do that now */
1337         if(ctx->sifm->bound == NOUVEAU_GROBJ_UNBOUND)
1338                 nouveau_grobj_autobind(ctx->sifm);
1339
1340         return ctx;
1341 }