/*
 * Copyright 2013 Ilia Mirkin
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "nv50/nv84_video.h"

#include "util/u_sse.h"
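
/* Parameter structures handed to the VP firmware. The layouts come from
 * reverse engineering; the trailing hex comments give each field's byte
 * offset, and the unk* fields are simply unknown. */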
struct h264_iparm1 {
   uint8_t scaling_lists_4x4[6][16]; // 00
   uint8_t scaling_lists_8x8[2][64]; // 60
   uint32_t width; // e0
   uint32_t height; // e4
   uint64_t ref1_addrs[16]; // e8
   uint64_t ref2_addrs[16]; // 168
   uint32_t unk1e8;
   uint32_t unk1ec;
   uint32_t w1; // 1f0
   uint32_t w2; // 1f4
   uint32_t w3; // 1f8
   uint32_t h1; // 1fc
   uint32_t h2; // 200
   uint32_t h3; // 204
   uint32_t mb_adaptive_frame_field_flag; // 208
   uint32_t field_pic_flag; // 20c
   uint32_t format; // 210
   uint32_t unk214; // 214
};

struct h264_iparm2 {
   uint32_t width; // 00
   uint32_t height; // 04
   uint32_t mbs; // 08
   uint32_t w1; // 0c
   uint32_t w2; // 10
   uint32_t w3; // 14
   uint32_t h1; // 18
   uint32_t h2; // 1c
   uint32_t h3; // 20
   uint32_t unk24;
   uint32_t mb_adaptive_frame_field_flag; // 28
   uint32_t top; // 2c
   uint32_t bottom; // 30
   uint32_t is_reference; // 34
};

void
nv84_decoder_vp_h264(struct nv84_decoder *dec,
                     struct pipe_h264_picture_desc *desc,
                     struct nv84_video_buffer *dest)
{
   struct h264_iparm1 param1;
   struct h264_iparm2 param2;
   int i, width = align(dest->base.width, 16),
      height = align(dest->base.height, 16);

   struct nouveau_pushbuf *push = dec->vp_pushbuf;
   struct nouveau_pushbuf_refn bo_refs[] = {
      { dest->interlaced, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
      { dest->full, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
      { dec->vpring, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
      { dec->mbring, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
      { dec->vp_params, NOUVEAU_BO_RDWR | NOUVEAU_BO_GART },
      { dec->fence, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
   };
   int num_refs = sizeof(bo_refs)/sizeof(*bo_refs);
   bool is_ref = desc->is_reference;

   STATIC_ASSERT(sizeof(struct h264_iparm1) == 0x218);
   STATIC_ASSERT(sizeof(struct h264_iparm2) == 0x38);

   memset(&param1, 0, sizeof(param1));
   memset(&param2, 0, sizeof(param2));

   memcpy(&param1.scaling_lists_4x4, desc->scaling_lists_4x4,
          sizeof(param1.scaling_lists_4x4));
   memcpy(&param1.scaling_lists_8x8, desc->scaling_lists_8x8,
          sizeof(param1.scaling_lists_8x8));

   param1.width = width;
   param1.w1 = param1.w2 = param1.w3 = align(width, 64);
   param1.height = param1.h2 = height;
   param1.h1 = param1.h3 = align(height, 32);
   param1.format = 0x3231564e; /* 'NV12' */
   param1.mb_adaptive_frame_field_flag = desc->mb_adaptive_frame_field_flag;
   param1.field_pic_flag = desc->field_pic_flag;

   param2.width = width;
   param2.w1 = param2.w2 = param2.w3 = param1.w1;
   if (desc->field_pic_flag)
      param2.height = align(height, 32) / 2;
   else
      param2.height = height;
   param2.h1 = param2.h2 = align(height, 32);
   param2.h3 = height;
   param2.mbs = width * height >> 8;
   if (desc->field_pic_flag) {
      param2.top = desc->bottom_field_flag ? 2 : 1;
      param2.bottom = desc->bottom_field_flag;
   }
   param2.mb_adaptive_frame_field_flag = desc->mb_adaptive_frame_field_flag;
   param2.is_reference = desc->is_reference;

   PUSH_SPACE(push, 5 + 16 + 3 + 2 + 6 + (is_ref ? 2 : 0) + 3 + 2 + 4 + 2);
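   /* The terms above mirror the command groups pushed below, one method
    * header plus its data words each (e.g. 16 = BEGIN_NV04(..., 15) plus 15
    * data words); (is_ref ? 2 : 0) covers the conditional 0x414 write. */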

   struct nouveau_bo *ref2_default = dest->full;

   for (i = 0; i < 16; i++) {
      struct nv84_video_buffer *buf = (struct nv84_video_buffer *)desc->ref[i];
      struct nouveau_bo *bo1, *bo2;
      if (buf) {
         bo1 = buf->interlaced;
         bo2 = buf->full;
         if (i == 0)
            ref2_default = buf->full;
      } else {
         bo1 = dest->interlaced;
         bo2 = ref2_default;
      }
      param1.ref1_addrs[i] = bo1->offset;
      param1.ref2_addrs[i] = bo2->offset;
      struct nouveau_pushbuf_refn bo_refs[] = {
         { bo1, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
         { bo2, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
      };
      nouveau_pushbuf_refn(push, bo_refs, sizeof(bo_refs)/sizeof(bo_refs[0]));
   }

   memcpy(dec->vp_params->map, &param1, sizeof(param1));
   memcpy(dec->vp_params->map + 0x400, &param2, sizeof(param2));

   nouveau_pushbuf_refn(push, bo_refs, num_refs);

   /* Wait for BSP to have completed */
   BEGIN_NV04(push, SUBC_VP(0x10), 4);
   PUSH_DATAh(push, dec->fence->offset);
   PUSH_DATA (push, dec->fence->offset);
   PUSH_DATA (push, 2);
   PUSH_DATA (push, 1); /* wait for sem == 2 */
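   /* The fence is used as a BSP -> VP handoff: BSP bumps it to 2 once the
    * bitstream is parsed, VP waits for that here and resets it to 1 below. */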

   /* VP step 1 */
   BEGIN_NV04(push, SUBC_VP(0x400), 15);
   PUSH_DATA (push, 1);
   PUSH_DATA (push, param2.mbs);
   PUSH_DATA (push, 0x3987654); /* each nibble probably a dma index */
   PUSH_DATA (push, 0x55001); /* constant */
   PUSH_DATA (push, dec->vp_params->offset >> 8);
   PUSH_DATA (push, (dec->vpring->offset + dec->vpring_residual) >> 8);
   PUSH_DATA (push, dec->vpring_ctrl);
   PUSH_DATA (push, dec->vpring->offset >> 8);
   PUSH_DATA (push, dec->bitstream->size / 2 - 0x700);
   PUSH_DATA (push, (dec->mbring->offset + dec->mbring->size - 0x2000) >> 8);
   PUSH_DATA (push, (dec->vpring->offset + dec->vpring_ctrl +
                     dec->vpring_residual + dec->vpring_deblock) >> 8);
   PUSH_DATA (push, 0);
   PUSH_DATA (push, 0x100008);
   PUSH_DATA (push, dest->interlaced->offset >> 8);
   PUSH_DATA (push, 0);

   BEGIN_NV04(push, SUBC_VP(0x620), 2);
   PUSH_DATA (push, 0);
   PUSH_DATA (push, 0);

   BEGIN_NV04(push, SUBC_VP(0x300), 1);
   PUSH_DATA (push, 0);

   /* VP step 2 */
   BEGIN_NV04(push, SUBC_VP(0x400), 5);
   PUSH_DATA (push, 0x54530201);
   PUSH_DATA (push, (dec->vp_params->offset >> 8) + 0x4);
   PUSH_DATA (push, (dec->vpring->offset + dec->vpring_ctrl +
                     dec->vpring_residual) >> 8);
   PUSH_DATA (push, dest->interlaced->offset >> 8);
   PUSH_DATA (push, dest->interlaced->offset >> 8);
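   /* The + 0x4 is in 256-byte units, i.e. it points at param2, which was
    * copied to vp_params + 0x400 above. */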

   if (is_ref) {
      BEGIN_NV04(push, SUBC_VP(0x414), 1);
      PUSH_DATA (push, dest->full->offset >> 8);
   }

   BEGIN_NV04(push, SUBC_VP(0x620), 2);
   PUSH_DATAh(push, dec->vp_fw2_offset);
   PUSH_DATA (push, dec->vp_fw2_offset);

   BEGIN_NV04(push, SUBC_VP(0x300), 1);
   PUSH_DATA (push, 0);

   /* Set the semaphore back to 1 */
   BEGIN_NV04(push, SUBC_VP(0x610), 3);
   PUSH_DATAh(push, dec->fence->offset);
   PUSH_DATA (push, dec->fence->offset);
   PUSH_DATA (push, 1);

   /* Write to the semaphore location, intr */
   BEGIN_NV04(push, SUBC_VP(0x304), 1);
   PUSH_DATA (push, 0x101);
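
   /* Mark the luma and chroma planes as written by the GPU so that later
    * CPU or sampler access knows to synchronize. */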
   for (i = 0; i < 2; i++) {
      struct nv50_miptree *mt = nv50_miptree(dest->resources[i]);
      mt->base.status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING;
   }

   PUSH_KICK (push);
}

static INLINE int16_t inverse_quantize(int16_t val, uint8_t quant, int mpeg1) {
   int16_t ret = val * quant / 16;
   if (mpeg1 && (ret & 1) == 0) {
      /* MPEG-1 mismatch control: round even values toward zero to make
       * them odd. */
      if (ret > 0)
         ret--;
      else if (ret < 0)
         ret++;
   }
   /* Saturate to the legal coefficient range. */
   if (ret < -2048)
      ret = -2048;
   else if (ret > 2047)
      ret = 2047;
   return ret;
}
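
/* For example, in the MPEG-1 case val = 2 with quant = 16 gives ret = 2,
 * which mismatch control rounds toward zero to the odd value 1. */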

struct mpeg12_mb_info {
   uint32_t index;
   uint8_t unk4;
   uint8_t unk5;
   uint16_t coded_block_pattern;
   uint8_t block_counts[6];
   uint16_t PMV[8];
   uint16_t skipped;
};

void
nv84_decoder_vp_mpeg12_mb(struct nv84_decoder *dec,
                          struct pipe_mpeg12_picture_desc *desc,
                          const struct pipe_mpeg12_macroblock *macrob)
{
   STATIC_ASSERT(sizeof(struct mpeg12_mb_info) == 32);

   struct mpeg12_mb_info info = {0};
   int i, sum = 0, mask, block_index, count;
   const int16_t *blocks;
   int intra = macrob->macroblock_type & PIPE_MPEG12_MB_TYPE_INTRA;
   int motion = macrob->macroblock_type &
      (PIPE_MPEG12_MB_TYPE_MOTION_FORWARD | PIPE_MPEG12_MB_TYPE_MOTION_BACKWARD);
   const uint8_t *quant_matrix = intra ? dec->mpeg12_intra_matrix :
      dec->mpeg12_non_intra_matrix;
   int mpeg1 = dec->base.profile == PIPE_VIDEO_PROFILE_MPEG1;

   info.index = macrob->y * mb(dec->base.width) + macrob->x;
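   /* The index above is in raster order; mb() (from nv84_video.h) rounds a
    * pixel dimension up to a count of 16x16 macroblocks. */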
   info.unk4 = motion;
   if (intra)
      info.unk4 |= 1;
   if (macrob->macroblock_modes.bits.dct_type)
      info.unk4 |= 0x80;
   info.unk5 = (macrob->motion_vertical_field_select << 4) |
      (macrob->macroblock_modes.value & 0xf);
   info.coded_block_pattern = macrob->coded_block_pattern;
   if (motion) {
      memcpy(info.PMV, macrob->PMV, sizeof(info.PMV));
   }

   blocks = macrob->blocks;
   for (mask = 0x20, block_index = 0; mask > 0; mask >>= 1, block_index++) {
      if ((macrob->coded_block_pattern & mask) == 0)
         continue;

      count = 0;

      /*
       * The observation here is that there are a lot of 0's, and things go
       * a lot faster if one skips over them.
       */

#if defined(PIPE_ARCH_SSE) && defined(PIPE_ARCH_X86_64)
      /* Note that the SSE implementation is much more tuned to X86_64. As it's
       * not benchmarked on X86_32, disable it there. I suspect that the code
       * needs to be reorganized in terms of 32-bit wide data in order to be
       * more efficient. NV84+ were released well into the 64-bit CPU era, so
       * it should be a minority case.
       */

/* This returns a 16-bit bit-mask, where each pair of bits is both 1 or both 0,
 * depending on whether the corresponding (16-bit) word in blocks is zero or
 * non-zero. */
#define wordmask(blocks, zero) \
   (uint64_t)(_mm_movemask_epi8( \
         _mm_cmpeq_epi16( \
               zero, _mm_load_si128((__m128i *)(blocks)))))
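
/* Example of one wordmask() step: if blocks[0..7] = {3, 0, 0, 0, 0, 0, 0, 0},
 * the compare yields 0xffff for every zero word, so the macro returns 0xfffc;
 * after the inversion below, only the bit pair for block 0 remains set. */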

      __m128i zero = _mm_setzero_si128();

      /* TODO: Look into doing the inverse quantization in terms of SSE
       * operations unconditionally, when necessary. */
      uint64_t bmask0 = wordmask(blocks, zero);
      bmask0 |= wordmask(blocks + 8, zero) << 16;
      bmask0 |= wordmask(blocks + 16, zero) << 32;
      bmask0 |= wordmask(blocks + 24, zero) << 48;
      uint64_t bmask1 = wordmask(blocks + 32, zero);
      bmask1 |= wordmask(blocks + 40, zero) << 16;
      bmask1 |= wordmask(blocks + 48, zero) << 32;
      bmask1 |= wordmask(blocks + 56, zero) << 48;

      /* The wordmask macro returns the inverse of what we want, since it
       * returns a 1 for equal-to-zero. Invert. */
      bmask0 = ~bmask0;
      bmask1 = ~bmask1;

      /* Note that the bitmask is actually sequences of 2 bits for each block
       * index. This is because there is no movemask_epi16. That means that
       * (a) ffs will never return 64, since the prev bit will always be set
       * in that case, and (b) we need to do an extra bit shift. Or'ing the
       * bitmasks together is faster than having a loop that computes them one
       * at a time and processes them, on a Core i7-920. Trying to put bmask
       * into an array and then looping also slows things down.
       */
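      /* For example, with the low bits ...110000000011 set, ffsll returns 1
       * and block 0 is emitted; after the i++ and the extra >>= 2 below,
       * ffsll returns 9, advancing i by (9 - 1) / 2 = 4 to block 5. */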

      /* shift needs to be the same width as i, and unsigned so that / 2
       * becomes a rshift operation */
      unsigned shift;
      int16_t tmp;
      i = 0;

      if (dec->base.entrypoint == PIPE_VIDEO_ENTRYPOINT_BITSTREAM) {
         while ((shift = __builtin_ffsll(bmask0))) {
            i += (shift - 1) / 2;
            bmask0 >>= shift - 1;
            *dec->mpeg12_data++ = dec->zscan[i] * 2;
            tmp = inverse_quantize(blocks[i], quant_matrix[i], mpeg1);
            *dec->mpeg12_data++ = tmp;
            sum += tmp;
            count++;
            i++;
            bmask0 >>= 2;
         }
         i = 32;
         while ((shift = __builtin_ffsll(bmask1))) {
            i += (shift - 1) / 2;
            bmask1 >>= shift - 1;
            *dec->mpeg12_data++ = dec->zscan[i] * 2;
            tmp = inverse_quantize(blocks[i], quant_matrix[i], mpeg1);
            *dec->mpeg12_data++ = tmp;
            sum += tmp;
            count++;
            i++;
            bmask1 >>= 2;
         }
      } else {
         while ((shift = __builtin_ffsll(bmask0))) {
            i += (shift - 1) / 2;
            bmask0 >>= shift - 1;
            *dec->mpeg12_data++ = i * 2;
            *dec->mpeg12_data++ = blocks[i];
            count++;
            i++;
            bmask0 >>= 2;
         }
         i = 32;
         while ((shift = __builtin_ffsll(bmask1))) {
            i += (shift - 1) / 2;
            bmask1 >>= shift - 1;
            *dec->mpeg12_data++ = i * 2;
            *dec->mpeg12_data++ = blocks[i];
            count++;
            i++;
            bmask1 >>= 2;
         }
      }
#else

      /*
       * This loop looks ridiculously written... and it is. I tried a lot of
       * different ways of achieving this scan, and this was the fastest, at
       * least on a Core i7-920. Note that it's not necessary to skip the 0's,
       * the firmware will deal with those just fine. But it's faster to skip
       * them. Note to people trying benchmarks: make sure to use realistic
       * mpeg data, which can often be a single data point first followed by
       * 63 0's, or <data> 7x <0> <data> 7x <0> etc.
       */
      int16_t tmp;
      if (dec->base.entrypoint == PIPE_VIDEO_ENTRYPOINT_BITSTREAM) {
         for (i = 0; i < 64; i++) {
            while (likely(i < 64 && !(tmp = blocks[i]))) i++;
            if (i == 64)
               break;
            *dec->mpeg12_data++ = dec->zscan[i] * 2;
            tmp = inverse_quantize(tmp, quant_matrix[i], mpeg1);
            *dec->mpeg12_data++ = tmp;
            sum += tmp;
            count++;
         }
      } else {
         for (i = 0; i < 64; i++) {
            while (likely(i < 64 && !(tmp = blocks[i]))) i++;
            if (i == 64)
               break;
            *dec->mpeg12_data++ = i * 2;
            *dec->mpeg12_data++ = tmp;
            count++;
         }
      }
#endif

      if (dec->base.entrypoint == PIPE_VIDEO_ENTRYPOINT_BITSTREAM) {
         if (!mpeg1 && (sum & 1) == 0) {
            /* Looks like MPEG-2 mismatch control: the coefficient sum must
             * come out odd. */
            if (count && *(dec->mpeg12_data - 2) == 63 * 2) {
               uint16_t *val = dec->mpeg12_data - 1;
               if (*val & 1) *val -= 1;
               else *val += 1;
            } else {
               *dec->mpeg12_data++ = 63 * 2;
               *dec->mpeg12_data++ = 1;
               count++;
            }
         }
      }

      if (count) {
         /* The low bit of the last index word presumably marks the end of
          * the block. */
         *(dec->mpeg12_data - 2) |= 1;
      } else {
         /* Empty block: emit a single dummy coefficient. */
         *dec->mpeg12_data++ = 1;
         *dec->mpeg12_data++ = 0;
         count = 1;
      }
      info.block_counts[block_index] = count;
      blocks += 64;
   }

   memcpy(dec->mpeg12_mb_info, &info, sizeof(info));
   dec->mpeg12_mb_info += sizeof(info);

   if (macrob->num_skipped_macroblocks) {
      info.index++;
      info.coded_block_pattern = 0;
      info.skipped = macrob->num_skipped_macroblocks - 1;
      memset(info.block_counts, 0, sizeof(info.block_counts));
      memcpy(dec->mpeg12_mb_info, &info, sizeof(info));
      dec->mpeg12_mb_info += sizeof(info);
   }
}

struct mpeg12_header {
   uint32_t luma_top_size; // 00
   uint32_t luma_bottom_size; // 04
   uint32_t chroma_top_size; // 08
   uint32_t mbs; // 0c
   uint32_t mb_info_size; // 10
   uint32_t mb_width_minus1; // 14
   uint32_t mb_height_minus1; // 18
   uint32_t width; // 1c
   uint32_t height; // 20
   uint8_t progressive; // 24
   uint8_t mocomp_only; // 25
   uint8_t frames; // 26
   uint8_t picture_structure; // 27
   uint32_t unk28; // 28 -- 0x50100
   uint32_t unk2c; // 2c
   uint32_t pad[4 * 13];
};

void
nv84_decoder_vp_mpeg12(struct nv84_decoder *dec,
                       struct pipe_mpeg12_picture_desc *desc,
                       struct nv84_video_buffer *dest)
{
   struct nouveau_pushbuf *push = dec->vp_pushbuf;
   struct nv84_video_buffer *ref1 = (struct nv84_video_buffer *)desc->ref[0];
   struct nv84_video_buffer *ref2 = (struct nv84_video_buffer *)desc->ref[1];
   struct nouveau_pushbuf_refn bo_refs[] = {
      { dest->interlaced, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
      { NULL, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
      { NULL, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
      { dec->mpeg12_bo, NOUVEAU_BO_RDWR | NOUVEAU_BO_GART },
   };
   int i, num_refs = sizeof(bo_refs) / sizeof(*bo_refs);
   struct mpeg12_header header = {0};
   struct nv50_miptree *y = nv50_miptree(dest->resources[0]);
   struct nv50_miptree *uv = nv50_miptree(dest->resources[1]);

   STATIC_ASSERT(sizeof(struct mpeg12_header) == 0x100);

   if (!ref1)
      ref1 = dest;
   if (!ref2)
      ref2 = dest;

   bo_refs[1].bo = ref1->interlaced;
   bo_refs[2].bo = ref2->interlaced;

   header.luma_top_size = y->layer_stride;
   header.luma_bottom_size = y->layer_stride;
   header.chroma_top_size = uv->layer_stride;
   header.mbs = mb(dec->base.width) * mb(dec->base.height);
   header.mb_info_size = dec->mpeg12_mb_info - dec->mpeg12_bo->map - 0x100;
   header.mb_width_minus1 = mb(dec->base.width) - 1;
   header.mb_height_minus1 = mb(dec->base.height) - 1;
   header.width = align(dec->base.width, 16);
   header.height = align(dec->base.height, 16);
   header.progressive = desc->frame_pred_frame_dct;
   header.frames = 1 + (desc->ref[0] != NULL) + (desc->ref[1] != NULL);
   header.picture_structure = desc->picture_structure;
   header.unk28 = 0x50100;

   memcpy(dec->mpeg12_bo->map, &header, sizeof(header));
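   /* mpeg12_bo layout: this header at 0, the 32-byte mb_info records at
    * 0x100, and the coefficient data after those, matching the offsets
    * pushed below. */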

   PUSH_SPACE(push, 10 + 3 + 2);

   nouveau_pushbuf_refn(push, bo_refs, num_refs);

   BEGIN_NV04(push, SUBC_VP(0x400), 9);
   PUSH_DATA (push, 0x543210); /* each nibble possibly a dma index */
   PUSH_DATA (push, 0x555001); /* constant */
   PUSH_DATA (push, dec->mpeg12_bo->offset >> 8);
   PUSH_DATA (push, (dec->mpeg12_bo->offset + 0x100) >> 8);
   PUSH_DATA (push, (dec->mpeg12_bo->offset + 0x100 +
                     align(0x20 * mb(dec->base.width) *
                           mb(dec->base.height), 0x100)) >> 8);
   PUSH_DATA (push, dest->interlaced->offset >> 8);
   PUSH_DATA (push, ref1->interlaced->offset >> 8);
   PUSH_DATA (push, ref2->interlaced->offset >> 8);
   PUSH_DATA (push, 6 * 64 * 8 * header.mbs);

   BEGIN_NV04(push, SUBC_VP(0x620), 2);
   PUSH_DATA (push, 0);
   PUSH_DATA (push, 0);

   BEGIN_NV04(push, SUBC_VP(0x300), 1);
   PUSH_DATA (push, 0);

   for (i = 0; i < 2; i++) {
      struct nv50_miptree *mt = nv50_miptree(dest->resources[i]);
      mt->base.status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING;
   }

   PUSH_KICK (push);
}