nvc0: add support for accelerated video decoding through the dedicated engines
author Maarten Lankhorst <m.b.lankhorst@gmail.com>
Sun, 2 Dec 2012 11:07:35 +0000 (12:07 +0100)
committer Maarten Lankhorst <maarten.lankhorst@canonical.com>
Thu, 17 Jan 2013 15:28:57 +0000 (16:28 +0100)
Currently the use of external firmware is required, with kernel and
userspace firmware needed for all Fermi cards except nvd9. Kepler and nvd9
should only require kernel firmware.

configure.ac
src/gallium/drivers/nvc0/Makefile.sources
src/gallium/drivers/nvc0/nvc0_video.c
src/gallium/drivers/nvc0/nvc0_video.h
src/gallium/drivers/nvc0/nvc0_video_bsp.c [new file with mode: 0644]
src/gallium/drivers/nvc0/nvc0_video_ppp.c [new file with mode: 0644]
src/gallium/drivers/nvc0/nvc0_video_vp.c [new file with mode: 0644]

index 99a08fd060f7f6ea902a955a22f4a46eea86e854..ba4c203e5f959eb16bafa4f7158c69992ce55340 100644 (file)
@@ -33,7 +33,7 @@ LIBDRM_REQUIRED=2.4.24
 LIBDRM_RADEON_REQUIRED=2.4.40
 LIBDRM_INTEL_REQUIRED=2.4.38
 LIBDRM_NVVIEUX_REQUIRED=2.4.33
-LIBDRM_NOUVEAU_REQUIRED=2.4.33
+LIBDRM_NOUVEAU_REQUIRED="2.4.33 libdrm >= 2.4.41"
 DRI2PROTO_REQUIRED=2.6
 GLPROTO_REQUIRED=1.4.14
 LIBDRM_XORG_REQUIRED=2.4.24
index 12eedf96231188689bb81126ffb37d12b78559a4..33b90f290fb4bc355b67015c1005102617aed514 100644 (file)
@@ -14,4 +14,7 @@ C_SOURCES := \
        nvc0_program.c \
        nvc0_shader_state.c \
        nvc0_query.c \
-       nvc0_video.c
+       nvc0_video.c \
+       nvc0_video_bsp.c \
+       nvc0_video_vp.c \
+       nvc0_video_ppp.c
index 5cf16e79b2b7b286ae38c4e6c6ea3a4bce67394c..cdb80dba064f7eed2b10bc2eda9187786b660fa6 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright 2011 Maarten Lankhorst
+ * Copyright 2011-2013 Maarten Lankhorst
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -54,6 +54,116 @@ nvc0_screen_get_video_param(struct pipe_screen *pscreen,
    }
 }
 
+/*
+ * pipe_video_decoder::decode_bitstream hook.
+ *
+ * Takes a fresh sequence number from dec->fence_seq and hands the picture to
+ * the three stages in order: BSP, then VP, then PPP.
+ *
+ * vp_caps and is_ref are produced by the BSP stage and consumed by the VP
+ * stage. They start out zeroed, and VP/PPP are skipped when BSP does not
+ * return 2 (the value the assert below treats as success), so nothing
+ * undefined is forwarded when assertions are compiled out.
+ */
+static void
+nvc0_decoder_decode_bitstream(struct pipe_video_decoder *decoder,
+                              struct pipe_video_buffer *video_target,
+                              struct pipe_picture_desc *picture,
+                              unsigned num_buffers,
+                              const void *const *data,
+                              const unsigned *num_bytes)
+{
+   struct nvc0_decoder *dec = (struct nvc0_decoder *)decoder;
+   struct nvc0_video_buffer *target = (struct nvc0_video_buffer *)video_target;
+   uint32_t comm_seq = ++dec->fence_seq;
+   union pipe_desc desc;
+
+   unsigned vp_caps = 0, is_ref = 0, ret;
+   struct nvc0_video_buffer *refs[16] = {};
+
+   desc.base = picture;
+
+   assert(target->base.buffer_format == PIPE_FORMAT_NV12);
+
+   ret = nvc0_decoder_bsp(dec, desc, target, comm_seq,
+                          num_buffers, data, num_bytes,
+                          &vp_caps, &is_ref, refs);
+
+   /* did we decode bitstream correctly? */
+   assert(ret == 2);
+   if (ret != 2)
+      return; /* BSP failed: the later stages have nothing valid to work on */
+
+   nvc0_decoder_vp(dec, desc, target, comm_seq, vp_caps, is_ref, refs);
+   nvc0_decoder_ppp(dec, desc, target, comm_seq);
+}
+
+/* pipe_video_decoder::flush hook; no work is performed here. */
+static void
+nvc0_decoder_flush(struct pipe_video_decoder *decoder)
+{
+   (void)decoder;
+}
+
+static void
+nvc0_decoder_begin_frame(struct pipe_video_decoder *decoder,
+                         struct pipe_video_buffer *target,
+                         struct pipe_picture_desc *picture)
+{ /* installed as base.begin_frame; empty body, no work is done at frame start */
+}
+
+static void
+nvc0_decoder_end_frame(struct pipe_video_decoder *decoder,
+                       struct pipe_video_buffer *target,
+                       struct pipe_picture_desc *picture)
+{ /* installed as base.end_frame; empty body, no work is done at frame end */
+}
+
+/*
+ * pipe_video_decoder::destroy hook; nvc0_create_decoder() also calls it to
+ * unwind a partially constructed decoder.
+ *
+ * Drops every buffer object reference held by the decoder, deletes the three
+ * engine objects, then the pushbuf/channel pairs, and finally frees the
+ * decoder itself.
+ */
+static void
+nvc0_decoder_destroy(struct pipe_video_decoder *decoder)
+{
+   struct nvc0_decoder *dec = (struct nvc0_decoder *)decoder;
+   /* When channel[0] and channel[1] are the same object, a single
+    * channel/pushbuf pair is shared by all slots and must only be
+    * released once; otherwise each of the three slots owns its own pair.
+    */
+   const int num_channels = (dec->channel[0] == dec->channel[1]) ? 1 : 3;
+   int n;
+
+   nouveau_bo_ref(NULL, &dec->ref_bo);
+   nouveau_bo_ref(NULL, &dec->bitplane_bo);
+   for (n = 0; n < 2; ++n)
+      nouveau_bo_ref(NULL, &dec->inter_bo[n]);
+#ifdef NVC0_DEBUG_FENCE
+   nouveau_bo_ref(NULL, &dec->fence_bo);
+#endif
+   nouveau_bo_ref(NULL, &dec->fw_bo);
+
+   for (n = 0; n < NVC0_VIDEO_QDEPTH; ++n)
+      nouveau_bo_ref(NULL, &dec->bsp_bo[n]);
+
+   nouveau_object_del(&dec->bsp);
+   nouveau_object_del(&dec->vp);
+   nouveau_object_del(&dec->ppp);
+
+   for (n = 0; n < num_channels; ++n) {
+      nouveau_pushbuf_del(&dec->pushbuf[n]);
+      nouveau_object_del(&dec->channel[n]);
+   }
+
+   FREE(dec);
+}
+
+/*
+ * Writes the file name of the userspace firmware image for 'profile' into
+ * 'path'.
+ *
+ * 'path' must have room for the longest name produced here; the caller in
+ * this file passes a PATH_MAX sized buffer. For VC-1 the file name suffix is
+ * the distance of 'profile' from PIPE_VIDEO_PROFILE_VC1_SIMPLE.
+ *
+ * An unknown codec trips the assert; with assertions compiled out 'path' is
+ * left as an empty string, so a following open() fails cleanly instead of
+ * reading an uninitialised buffer.
+ */
+static void nvc0_video_getpath(enum pipe_video_profile profile, char *path)
+{
+   switch (u_reduce_video_profile(profile)) {
+      case PIPE_VIDEO_CODEC_MPEG12: {
+         sprintf(path, "/lib/firmware/nouveau/vuc-mpeg12-0");
+         break;
+      }
+      case PIPE_VIDEO_CODEC_MPEG4: {
+         sprintf(path, "/lib/firmware/nouveau/vuc-mpeg4-0");
+         break;
+      }
+      case PIPE_VIDEO_CODEC_VC1: {
+         sprintf(path, "/lib/firmware/nouveau/vuc-vc1-%u", profile - PIPE_VIDEO_PROFILE_VC1_SIMPLE);
+         break;
+      }
+      case PIPE_VIDEO_CODEC_MPEG4_AVC: {
+         sprintf(path, "/lib/firmware/nouveau/vuc-h264-0");
+         break;
+      }
+      default:
+         path[0] = '\0';
+         assert(0);
+   }
+}
+
 struct pipe_video_decoder *
 nvc0_create_decoder(struct pipe_context *context,
                     enum pipe_video_profile profile,
@@ -62,6 +172,20 @@ nvc0_create_decoder(struct pipe_context *context,
                     unsigned width, unsigned height, unsigned max_references,
                     bool chunked_decode)
 {
+   struct nouveau_screen *screen = &((struct nvc0_context *)context)->screen->base;
+   struct nvc0_decoder *dec;
+   struct nouveau_pushbuf **push;
+   union nouveau_bo_config cfg;
+   bool kepler = screen->device->chipset >= 0xe0;
+
+   cfg.nvc0.tile_mode = 0x10;
+   cfg.nvc0.memtype = 0xfe;
+
+   int ret, i;
+   uint32_t codec = 1, ppp_codec = 3;
+   uint32_t timeout;
+   u32 tmp_size = 0;
+
    if (getenv("XVMC_VL"))
        return vl_create_decoder(context, profile, entrypoint,
                                 chroma_format, width, height,
@@ -72,6 +196,307 @@ nvc0_create_decoder(struct pipe_context *context,
       return NULL;
    }
 
+   dec = CALLOC_STRUCT(nvc0_decoder);
+   if (!dec)
+      return NULL;
+   dec->client = screen->client;
+
+   if (!kepler) {
+      dec->bsp_idx = 5;
+      dec->vp_idx = 6;
+      dec->ppp_idx = 7;
+   } else {
+      dec->bsp_idx = 2;
+      dec->vp_idx = 2;
+      dec->ppp_idx = 2;
+   }
+
+   for (i = 0; i < 3; ++i)
+      if (i && !kepler) {
+         dec->channel[i] = dec->channel[0];
+         dec->pushbuf[i] = dec->pushbuf[0];
+      } else {
+         void *data;
+         u32 size;
+         struct nvc0_fifo nvc0_args = {};
+         struct nve0_fifo nve0_args = {};
+
+         if (!kepler) {
+            size = sizeof(nvc0_args);
+            data = &nvc0_args;
+         } else {
+            unsigned engine[] = {
+               NVE0_FIFO_ENGINE_BSP,
+               NVE0_FIFO_ENGINE_VP,
+               NVE0_FIFO_ENGINE_PPP
+            };
+
+            nve0_args.engine = engine[i];
+            size = sizeof(nve0_args);
+            data = &nve0_args;
+         }
+
+         ret = nouveau_object_new(&screen->device->object, 0,
+                                  NOUVEAU_FIFO_CHANNEL_CLASS,
+                                  data, size, &dec->channel[i]);
+
+         if (!ret)
+            ret = nouveau_pushbuf_new(screen->client, dec->channel[i], 4,
+                                   32 * 1024, true, &dec->pushbuf[i]);
+         if (ret)
+            break;
+      }
+   push = dec->pushbuf;
+
+   if (!kepler) {
+      if (!ret)
+         ret = nouveau_object_new(dec->channel[0], 0x390b1, 0x90b1, NULL, 0, &dec->bsp);
+      if (!ret)
+         ret = nouveau_object_new(dec->channel[1], 0x190b2, 0x90b2, NULL, 0, &dec->vp);
+      if (!ret)
+         ret = nouveau_object_new(dec->channel[2], 0x290b3, 0x90b3, NULL, 0, &dec->ppp);
+   } else {
+      if (!ret)
+         ret = nouveau_object_new(dec->channel[0], 0x95b1, 0x95b1, NULL, 0, &dec->bsp);
+      if (!ret)
+         ret = nouveau_object_new(dec->channel[1], 0x95b2, 0x95b2, NULL, 0, &dec->vp);
+      if (!ret)
+         ret = nouveau_object_new(dec->channel[2], 0x90b3, 0x90b3, NULL, 0, &dec->ppp);
+   }
+   if (ret)
+      goto fail;
+
+   BEGIN_NVC0(push[0], SUBC_BSP(NV01_SUBCHAN_OBJECT), 1);
+   PUSH_DATA (push[0], dec->bsp->handle);
+
+   BEGIN_NVC0(push[1], SUBC_VP(NV01_SUBCHAN_OBJECT), 1);
+   PUSH_DATA (push[1], dec->vp->handle);
+
+   BEGIN_NVC0(push[2], SUBC_PPP(NV01_SUBCHAN_OBJECT), 1);
+   PUSH_DATA (push[2], dec->ppp->handle);
+
+   dec->base.context = context;
+   dec->base.profile = profile;
+   dec->base.entrypoint = entrypoint;
+   dec->base.chroma_format = chroma_format;
+   dec->base.width = width;
+   dec->base.height = height;
+   dec->base.max_references = max_references;
+   dec->base.destroy = nvc0_decoder_destroy;
+   dec->base.flush = nvc0_decoder_flush;
+   dec->base.decode_bitstream = nvc0_decoder_decode_bitstream;
+   dec->base.begin_frame = nvc0_decoder_begin_frame;
+   dec->base.end_frame = nvc0_decoder_end_frame;
+
+   for (i = 0; i < NVC0_VIDEO_QDEPTH && !ret; ++i)
+      ret = nouveau_bo_new(screen->device, NOUVEAU_BO_VRAM,
+                           0, 1 << 20, &cfg, &dec->bsp_bo[i]);
+   if (!ret)
+      ret = nouveau_bo_new(screen->device, NOUVEAU_BO_VRAM,
+                           0x100, 4 << 20, &cfg, &dec->inter_bo[0]);
+   if (!ret) {
+      if (!kepler)
+         nouveau_bo_ref(dec->inter_bo[0], &dec->inter_bo[1]);
+      else
+         ret = nouveau_bo_new(screen->device, NOUVEAU_BO_VRAM,
+                              0x100, dec->inter_bo[0]->size, &cfg,
+                              &dec->inter_bo[1]);
+   }
+   if (ret)
+      goto fail;
+
+   switch (u_reduce_video_profile(profile)) {
+   case PIPE_VIDEO_CODEC_MPEG12: {
+      codec = 1;
+      assert(max_references <= 2);
+      break;
+   }
+   case PIPE_VIDEO_CODEC_MPEG4: {
+      codec = 4;
+      tmp_size = mb(height)*16 * mb(width)*16;
+      assert(max_references <= 2);
+      break;
+   }
+   case PIPE_VIDEO_CODEC_VC1: {
+      ppp_codec = codec = 2;
+      tmp_size = mb(height)*16 * mb(width)*16;
+      assert(max_references <= 2);
+      break;
+   }
+   case PIPE_VIDEO_CODEC_MPEG4_AVC: {
+      codec = 3;
+      dec->tmp_stride = 16 * mb_half(width) * nvc0_video_align(height) * 3 / 2;
+      tmp_size = dec->tmp_stride * (max_references + 1);
+      assert(max_references <= 16);
+      break;
+   }
+   default:
+      fprintf(stderr, "invalid codec\n");
+      goto fail;
+   }
+
+   /*
+    * Chipsets below 0xd0 need a firmware image supplied from userspace:
+    * read it from disk into fw_bo and record its two segment sizes in
+    * dec->fw_sizes.
+    */
+   if (screen->device->chipset < 0xd0) {
+      int fd;
+      char path[PATH_MAX];
+      ssize_t r;
+      uint32_t *start, *end, endval;
+
+      ret = nouveau_bo_new(screen->device, NOUVEAU_BO_VRAM, 0,
+                           0x4000, &cfg, &dec->fw_bo);
+      if (!ret)
+         ret = nouveau_bo_map(dec->fw_bo, NOUVEAU_BO_WR, dec->client);
+      if (ret)
+         goto fail;
+
+      nvc0_video_getpath(profile, path);
+
+      fd = open(path, O_RDONLY | O_CLOEXEC);
+      if (fd < 0) {
+         fprintf(stderr, "opening firmware file %s failed: %m\n", path);
+         goto fw_fail;
+      }
+      r = read(fd, dec->fw_bo->map, 0x4000);
+      if (r < 0) {
+         /* report before closing: %m prints errno, which close() may change */
+         fprintf(stderr, "reading firmware file %s failed: %m\n", path);
+         close(fd);
+         goto fw_fail;
+      }
+      /* the descriptor is not needed on any path past this point */
+      close(fd);
+
+      if (r == 0x4000) {
+         fprintf(stderr, "firmware file %s too large!\n", path);
+         goto fw_fail;
+      }
+
+      /* an empty file would place 'end' in front of the mapping below */
+      if (!r || (r & 0xff)) {
+         fprintf(stderr, "firmware file %s wrong size!\n", path);
+         goto fw_fail;
+      }
+
+      /* Strip the trailing run of words equal to the last word read, never
+       * walking past the start of the mapping.
+       */
+      start = (uint32_t *)dec->fw_bo->map;
+      end = (uint32_t *)((char *)dec->fw_bo->map + r - 4);
+      endval = *end;
+      while (end > start && endval == *end)
+         end--;
+      if (endval == *end) {
+         /* every word is identical: there is no payload to size */
+         fprintf(stderr, "firmware file %s wrong size!\n", path);
+         goto fw_fail;
+      }
+
+      r = (intptr_t)end - (intptr_t)dec->fw_bo->map + 4;
+
+      /* fw_sizes: fixed per-codec size in the high half, remainder in the low */
+      switch (u_reduce_video_profile(profile)) {
+      case PIPE_VIDEO_CODEC_MPEG12: {
+         assert((r & 0xff) == 0xe0);
+         dec->fw_sizes = (0x2e0<<16) | (r - 0x2e0);
+         break;
+      }
+      case PIPE_VIDEO_CODEC_MPEG4: {
+         assert((r & 0xff) == 0xe0);
+         dec->fw_sizes = (0x2e0<<16) | (r - 0x2e0);
+         break;
+      }
+      case PIPE_VIDEO_CODEC_VC1: {
+         assert((r & 0xff) == 0xac);
+         dec->fw_sizes = (0x3ac<<16) | (r - 0x3ac);
+         break;
+      }
+      case PIPE_VIDEO_CODEC_MPEG4_AVC: {
+         assert((r & 0xff) == 0x70);
+         dec->fw_sizes = (0x370<<16) | (r - 0x370);
+         break;
+      }
+      default:
+         goto fw_fail;
+      }
+      /* drop the CPU mapping by hand and clear the cached pointer */
+      munmap(dec->fw_bo->map, dec->fw_bo->size);
+      dec->fw_bo->map = NULL;
+   }
+
+   if (codec != 3) {
+      ret = nouveau_bo_new(screen->device, NOUVEAU_BO_VRAM, 0,
+                           0x400, &cfg, &dec->bitplane_bo);
+      if (ret)
+         goto fail;
+   }
+
+   dec->ref_stride = mb(width)*16 * (mb_half(height)*32 + nvc0_video_align(height)/2);
+   ret = nouveau_bo_new(screen->device, NOUVEAU_BO_VRAM, 0,
+                        dec->ref_stride * (max_references+2) + tmp_size,
+                        &cfg, &dec->ref_bo);
+   if (ret)
+      goto fail;
+
+   timeout = 0;
+
+   BEGIN_NVC0(push[0], SUBC_BSP(0x200), 2);
+   PUSH_DATA (push[0], codec);
+   PUSH_DATA (push[0], timeout);
+
+   BEGIN_NVC0(push[1], SUBC_VP(0x200), 2);
+   PUSH_DATA (push[1], codec);
+   PUSH_DATA (push[1], timeout);
+
+   BEGIN_NVC0(push[2], SUBC_PPP(0x200), 2);
+   PUSH_DATA (push[2], ppp_codec);
+   PUSH_DATA (push[2], timeout);
+
+   ++dec->fence_seq;
+
+#if NVC0_DEBUG_FENCE
+   ret = nouveau_bo_new(screen->device, NOUVEAU_BO_GART|NOUVEAU_BO_MAP,
+                        0, 0x1000, &cfg, &dec->fence_bo);
+   if (ret)
+      goto fail;
+
+   nouveau_bo_map(dec->fence_bo, NOUVEAU_BO_RDWR, screen->client);
+   dec->fence_map = dec->fence_bo->map;
+   dec->fence_map[0] = dec->fence_map[4] = dec->fence_map[8] = 0;
+   dec->comm = (struct comm *)(dec->fence_map + (COMM_OFFSET/sizeof(*dec->fence_map)));
+
+   /* So lets test if the fence is working? */
+   BEGIN_NVC0(push[0], SUBC_BSP(0x240), 3);
+   PUSH_DATAh(push[0], dec->fence_bo->offset);
+   PUSH_DATA (push[0], dec->fence_bo->offset);
+   PUSH_DATA (push[0], dec->fence_seq);
+
+   BEGIN_NVC0(push[0], SUBC_BSP(0x304), 1);
+   PUSH_DATA (push[0], 1);
+   PUSH_KICK (push[0]);
+
+   BEGIN_NVC0(push[1], SUBC_VP(0x240), 3);
+   PUSH_DATAh(push[1], (dec->fence_bo->offset + 0x10));
+   PUSH_DATA (push[1], (dec->fence_bo->offset + 0x10));
+   PUSH_DATA (push[1], dec->fence_seq);
+
+   BEGIN_NVC0(push[1], SUBC_VP(0x304), 1);
+   PUSH_DATA (push[1], 1);
+   PUSH_KICK (push[1]);
+
+   BEGIN_NVC0(push[2], SUBC_PPP(0x240), 3);
+   PUSH_DATAh(push[2], (dec->fence_bo->offset + 0x20));
+   PUSH_DATA (push[2], (dec->fence_bo->offset + 0x20));
+   PUSH_DATA (push[2], dec->fence_seq);
+
+   BEGIN_NVC0(push[2], SUBC_PPP(0x304), 1);
+   PUSH_DATA (push[2], 1);
+   PUSH_KICK (push[2]);
+
+   usleep(100);
+   while (dec->fence_seq > dec->fence_map[0] &&
+          dec->fence_seq > dec->fence_map[4] &&
+          dec->fence_seq > dec->fence_map[8]) {
+      debug_printf("%u: %u %u %u\n", dec->fence_seq, dec->fence_map[0], dec->fence_map[4], dec->fence_map[8]);
+      usleep(100);
+   }
+   debug_printf("%u: %u %u %u\n", dec->fence_seq, dec->fence_map[0], dec->fence_map[4], dec->fence_map[8]);
+#endif
+
+   return &dec->base;
+
+fw_fail:
+   debug_printf("Cannot create decoder without firmware..\n");
+   nvc0_decoder_destroy(&dec->base);
+   return NULL;
+
+fail:
+   debug_printf("Creation failed: %s (%i)\n", strerror(-ret), ret);
+   nvc0_decoder_destroy(&dec->base);
    return NULL;
 }
 
index e2cfc3d59029b5af31b153245b3421d604a01380..4cc0ebbb60503e83b39d9ee9451dc92ea4dc88dd 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright 2011 Maarten Lankhorst
+ * Copyright 2011-2013 Maarten Lankhorst
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
 
 #include "util/u_video.h"
 
+#define SLICE_SIZE 0x200
+#define VP_OFFSET 0x200
+#define COMM_OFFSET 0x500
+
+//#define NVC0_DEBUG_FENCE 1
+
+#ifdef NVC0_DEBUG_FENCE
+# define NVC0_VIDEO_QDEPTH 1
+#else
+# define NVC0_VIDEO_QDEPTH 2
+#endif
+
+#define SUBC_BSP(m) dec->bsp_idx, (m)
+#define SUBC_VP(m) dec->vp_idx, (m)
+#define SUBC_PPP(m) dec->ppp_idx, (m)
+
+union pipe_desc { // one picture-description pointer, readable as the base type or as a codec-specific type
+   struct pipe_picture_desc *base; // set by decode_bitstream from its 'picture' argument
+   struct pipe_mpeg12_picture_desc *mpeg12;
+   struct pipe_mpeg4_picture_desc *mpeg4;
+   struct pipe_vc1_picture_desc *vc1;
+   struct pipe_h264_picture_desc *h264;
+};
+
 struct nvc0_video_buffer {
    struct pipe_video_buffer base;
    unsigned num_planes, valid_ref;
@@ -38,6 +62,79 @@ struct nvc0_video_buffer {
    struct pipe_surface *surfaces[VL_NUM_COMPONENTS * 2];
 };
 
+struct nvc0_decoder { // per-decoder state; pipe_video_decoder pointers are cast to this type, so 'base' must stay first
+   struct pipe_video_decoder base;
+   struct nouveau_client *client;
+   struct nouveau_object *channel[3], *bsp, *vp, *ppp; // bsp/vp/ppp objects are created on channel[0]/[1]/[2]
+   struct nouveau_pushbuf *pushbuf[3]; // one per channel slot; slots 1 and 2 alias slot 0 when the channel is shared
+
+#ifdef NVC0_DEBUG_FENCE
+   /* dump fence and comm, as needed.. */
+   unsigned *fence_map;
+   struct comm *comm;
+
+   struct nouveau_bo *fence_bo;
+#endif
+
+   struct nouveau_bo *fw_bo, *bitplane_bo;
+
+   // array size max_references + 2, contains unpostprocessed images
+   // added at the end of ref_bo is a tmp array
+   // tmp is an array for h264, with each member being used for a ref frame or current
+   // target.. size = (((mb(w)*((mb(h)+1)&~1))+3)>>2)<<8 * (max_references+1)
+   // for other codecs, it simply seems that size = w*h is enough
+   // unsure what it's supposed to contain..
+   struct nouveau_bo *ref_bo;
+
+   struct nouveau_bo *inter_bo[2];
+
+   struct nouveau_bo *bsp_bo[NVC0_VIDEO_QDEPTH];
+
+   // bo's used by each cycle:
+
+   // bsp_bo: contains raw bitstream data and parameters for BSP and VP.
+   // inter_bo: contains data shared between BSP and VP
+   // ref_bo: reference image data, used by PPP and VP
+   // bitplane_bo: contain bitplane data (similar to ref_bo), used by BSP only
+   // fw_bo: used by VP only.
+
+   // Needed amount of copies in optimal case:
+   // 2 copies of inter_bo, VP would process the last inter_bo, while BSP is
+   // writing out a new set.
+   // NVC0_VIDEO_QDEPTH copies of bsp_bo. We don't want to block the pipeline ever,
+   // and give shaders a chance to run as well.
+
+   struct {
+      struct nvc0_video_buffer *vidbuf;
+      unsigned last_used;
+      unsigned field_pic_flag : 1;
+      unsigned decoded_top : 1;
+      unsigned decoded_bottom : 1;
+   } refs[17];
+   unsigned fence_seq, fw_sizes, last_frame_num, tmp_stride, ref_stride; // fence_seq is bumped once per decode_bitstream call
+
+   unsigned bsp_idx, vp_idx, ppp_idx; // subchannel indices expanded by SUBC_BSP / SUBC_VP / SUBC_PPP
+};
+
+struct comm { // trailing numbers are byte offsets; with NVC0_DEBUG_FENCE this is read at COMM_OFFSET inside the fence mapping
+       uint32_t bsp_cur_index; // 000
+       uint32_t byte_ofs; // 004
+       uint32_t status[0x10]; // 008
+       uint32_t pos[0x10]; // 048
+       uint8_t pad[0x100 - 0x88]; // 088 padding up to 100; original note: 0a0 bool comm_encrypted
+
+       uint32_t pvp_cur_index; // 100
+       uint32_t acked_byte_ofs; // 104
+       uint32_t status_vp[0x10]; // 108
+       uint16_t mb_y[0x10]; //148
+       uint32_t pvp_stage; // 168 0xeeXX
+       uint16_t parse_endpos_index; // 16c
+       uint16_t irq_index; // 16e
+       uint8_t  irq_470[0x10]; // 170
+       uint32_t irq_pos[0x10]; // 180
+       uint32_t parse_endpos[0x10]; // 1c0
+};
+
 static INLINE uint32_t nvc0_video_align(uint32_t h)
 {
    return ((h+0x3f)&~0x3f);
@@ -52,3 +149,73 @@ static INLINE uint32_t mb_half(uint32_t coord)
 {
    return (coord + 0x1f)>>5;
 }
+
+/*
+ * Returns the address inside ref_bo of the reference slot used for 'target'
+ * (slot index target->valid_ref). With a NULL target the slot at index
+ * max_references + 1 is returned instead.
+ *
+ * The slot offset is computed in 64 bits so that ref_stride * index cannot
+ * wrap in 32-bit unsigned arithmetic before it is widened.
+ */
+static INLINE uint64_t
+nvc0_video_addr(struct nvc0_decoder *dec, struct nvc0_video_buffer *target)
+{
+   uint64_t slot;
+   if (target)
+      slot = target->valid_ref;
+   else
+      slot = dec->base.max_references + 1;
+   return dec->ref_bo->offset + (uint64_t)dec->ref_stride * slot;
+}
+
+/*
+ * Computes the plane offsets inside one reference slot from the decoder's
+ * width and height and stores them in *y2, *cbcr and *cbcr2.
+ *
+ * The values are compared against ref_stride after a shift by 8, i.e. they
+ * are kept in units of 256 bytes. If the resulting layout would not fit in
+ * ref_stride, a diagnostic is printed, all three outputs are reset to 0 and
+ * the assert fires.
+ */
+static INLINE void
+nvc0_decoder_ycbcr_offsets(struct nvc0_decoder *dec, uint32_t *y2,
+                           uint32_t *cbcr, uint32_t *cbcr2)
+{
+   const uint32_t width_mb = mb(dec->base.width);
+   uint32_t total;
+
+   *y2 = mb_half(dec->base.height) * width_mb;
+   *cbcr = *y2 * 2;
+   *cbcr2 = *cbcr + width_mb * (nvc0_video_align(dec->base.height) >> 6);
+
+   total = (2 * (*cbcr2 - *cbcr) + *cbcr) << 8;
+   if (total <= dec->ref_stride)
+      return;
+
+   /* Reaching this point means a bug in the code rather than in the
+    * hardware, so it should never happen.
+    */
+   debug_printf("Overshot ref_stride (%u) with size %u and ofs (%u,%u,%u)\n",
+                dec->ref_stride, total, *y2<<8, *cbcr<<8, *cbcr2<<8);
+   *y2 = *cbcr = *cbcr2 = 0;
+   assert(total <= dec->ref_stride);
+}
+
+/*
+ * Splits inter_bo[0] into three regions and reports their sizes, each
+ * expressed as the byte size shifted right by 8:
+ *   *slice_size  - SLICE_SIZE bytes per slice, for slice_count slices
+ *   *bucket_size - 0 for MPEG1/2, otherwise 3 per macroblock column
+ *   *ring_size   - whatever is left of the buffer after the other two
+ */
+static INLINE void
+nvc0_decoder_inter_sizes(struct nvc0_decoder *dec, uint32_t slice_count,
+                         uint32_t *slice_size, uint32_t *bucket_size,
+                         uint32_t *ring_size)
+{
+   const int is_mpeg12 =
+      u_reduce_video_profile(dec->base.profile) == PIPE_VIDEO_CODEC_MPEG12;
+
+   *slice_size = (SLICE_SIZE * slice_count)>>8;
+   *bucket_size = is_mpeg12 ? 0 : mb(dec->base.width) * 3;
+   *ring_size = (dec->inter_bo[0]->size >> 8) - *bucket_size - *slice_size;
+}
+
+extern unsigned
+nvc0_decoder_bsp(struct nvc0_decoder *dec, union pipe_desc desc,
+                 struct nvc0_video_buffer *target,
+                 unsigned comm_seq, unsigned num_buffers,
+                 const void *const *data, const unsigned *num_bytes,
+                 unsigned *vp_caps, unsigned *is_ref,
+                 struct nvc0_video_buffer *refs[16]);
+
+extern void nvc0_decoder_vp_caps(struct nvc0_decoder *dec,
+                                 union pipe_desc desc,
+                                 struct nvc0_video_buffer *target,
+                                 unsigned comm_seq,
+                                 unsigned *caps, unsigned *is_ref,
+                                 struct nvc0_video_buffer *refs[16]);
+
+extern void
+nvc0_decoder_vp(struct nvc0_decoder *dec, union pipe_desc desc,
+                struct nvc0_video_buffer *target, unsigned comm_seq,
+                unsigned caps, unsigned is_ref,
+                struct nvc0_video_buffer *refs[16]);
+
+extern void
+nvc0_decoder_ppp(struct nvc0_decoder *dec, union pipe_desc desc,
+                 struct nvc0_video_buffer *target, unsigned comm_seq);
diff --git a/src/gallium/drivers/nvc0/nvc0_video_bsp.c b/src/gallium/drivers/nvc0/nvc0_video_bsp.c
new file mode 100644 (file)
index 0000000..798b4bf
--- /dev/null
@@ -0,0 +1,423 @@
+/*
+ * Copyright 2011-2013 Maarten Lankhorst
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "nvc0_video.h"
+
+struct strparm_bsp { // stream parameters; nvc0_decoder_bsp declares a pointer to this as str_bsp
+       uint32_t w0[4]; // bits 0-23 length, bits 24-31 addr_hi
+       uint32_t w1[4]; // bit 8-24 addr_lo
+       uint32_t unk20; // should be idx * 0x8000000, bitstream offset
+       uint32_t do_crypto_crap; // set to 0
+};
+
+struct mpeg12_picparm_bsp { // MPEG1/2 picture parameters, filled in by nvc0_decoder_fill_picparm_mpeg12_bsp
+       uint16_t width;
+       uint16_t height;
+       uint8_t picture_structure;
+       uint8_t picture_coding_type;
+       uint8_t intra_dc_precision;
+       uint8_t frame_pred_frame_dct;
+       uint8_t concealment_motion_vectors;
+       uint8_t intra_vlc_format;
+       uint16_t pad; // written as 0 by the fill function
+       uint8_t f_code[2][2]; // stored as the descriptor's f_code value + 1
+};
+
+struct mpeg4_picparm_bsp { // MPEG4 picture parameters, filled in by nvc0_decoder_fill_picparm_mpeg4_bsp
+       uint16_t width;
+       uint16_t height;
+       uint8_t vop_time_increment_size; // bit count derived from vop_time_increment_resolution - 1, at least 1
+       uint8_t interlaced;
+       uint8_t resync_marker_disable;
+};
+
+struct vc1_picparm_bsp { // VC-1 picture parameters, filled in by nvc0_decoder_fill_picparm_vc1_bsp; trailing numbers are byte offsets
+       uint16_t width;
+       uint16_t height;
+       uint8_t profile; // 04 0 simple, 1 main, 2 advanced
+       uint8_t postprocflag; // 05
+       uint8_t pulldown; // 06
+       uint8_t interlaced; // 07
+       uint8_t tfcntrflag; // 08
+       uint8_t finterpflag; // 09
+       uint8_t psf; // 0a
+       uint8_t pad; // 0b
+       uint8_t multires; // 0c
+       uint8_t syncmarker; // 0d
+       uint8_t rangered; // 0e
+       uint8_t maxbframes; // 0f
+       uint8_t dquant; // 10
+       uint8_t panscan_flag; // 11
+       uint8_t refdist_flag; // 12
+       uint8_t quantizer; // 13
+       uint8_t extended_mv; // 14
+       uint8_t extended_dmv; // 15
+       uint8_t overlap; // 16
+       uint8_t vstransform; // 17
+};
+
+struct h264_picparm_bsp { // H.264 picture parameters, filled in by nvc0_decoder_fill_picparm_h264_bsp
+       // 00
+       uint32_t unk00; // written as 1 by the fill function
+       // 04
+       uint32_t log2_max_frame_num_minus4; // 04 checked
+       uint32_t pic_order_cnt_type; // 08 checked
+       uint32_t log2_max_pic_order_cnt_lsb_minus4; // 0c checked
+       uint32_t delta_pic_order_always_zero_flag; // 10, or unknown
+
+       uint32_t frame_mbs_only_flag; // 14, always 1?
+       uint32_t direct_8x8_inference_flag; // 18, always 1?
+       uint32_t width_mb; // 1c checked
+       uint32_t height_mb; // 20 checked
+       // 24
+       //struct picparm2 (the offsets below restart at 0 and are relative to 0x24)
+               uint32_t entropy_coding_mode_flag; // 00, checked
+               uint32_t pic_order_present_flag; // 04 checked
+               uint32_t unk; // 08 seems to be 0?
+               uint32_t pad1; // 0c seems to be 0?
+               uint32_t pad2; // 10 always 0 ?
+               uint32_t num_ref_idx_l0_active_minus1; // 14 always 0?
+               uint32_t num_ref_idx_l1_active_minus1; // 18 always 0?
+               uint32_t weighted_pred_flag; // 1c checked
+               uint32_t weighted_bipred_idc; // 20 checked
+               uint32_t pic_init_qp_minus26; // 24 checked
+               uint32_t deblocking_filter_control_present_flag; // 28 always 1?
+               uint32_t redundant_pic_cnt_present_flag; // 2c always 0?
+               uint32_t transform_8x8_mode_flag; // 30 checked
+               uint32_t mb_adaptive_frame_field_flag; // 34 checked-ish
+               uint8_t field_pic_flag; // 38 checked
+               uint8_t bottom_field_flag; // 39 checked
+               uint8_t real_pad[0x1b]; // XX why?
+};
+
+/*
+ * Fills the MPEG1/2 picture parameter block at 'map' from the decoder
+ * dimensions and the picture descriptor.
+ *
+ * Returns the caps word: num_slices shifted left by 4, with bit 0 set for
+ * every profile other than PIPE_VIDEO_PROFILE_MPEG1.
+ */
+static uint32_t
+nvc0_decoder_fill_picparm_mpeg12_bsp(struct nvc0_decoder *dec,
+                                     struct pipe_mpeg12_picture_desc *desc,
+                                     char *map)
+{
+   struct mpeg12_picparm_bsp *out = (struct mpeg12_picparm_bsp *)map;
+   const int not_mpeg1 = dec->base.profile != PIPE_VIDEO_PROFILE_MPEG1;
+   int row, col;
+
+   out->width = dec->base.width;
+   out->height = dec->base.height;
+   out->picture_structure = desc->picture_structure;
+   out->picture_coding_type = desc->picture_coding_type;
+   out->intra_dc_precision = desc->intra_dc_precision;
+   out->frame_pred_frame_dct = desc->frame_pred_frame_dct;
+   out->concealment_motion_vectors = desc->concealment_motion_vectors;
+   out->intra_vlc_format = desc->intra_vlc_format;
+   out->pad = 0;
+
+   /* each f_code entry is stored one higher than the descriptor's value */
+   for (row = 0; row < 2; ++row)
+      for (col = 0; col < 2; ++col)
+         out->f_code[row][col] = desc->f_code[row][col] + 1;
+
+   return (desc->num_slices << 4) | not_mpeg1;
+}
+
+/*
+ * Fills the MPEG4 picture parameter block at 'map' from the decoder
+ * dimensions and the picture descriptor.
+ *
+ * vop_time_increment_size is the number of bits needed to represent
+ * vop_time_increment_resolution - 1, with a minimum of 1.
+ *
+ * Always returns 4 as the caps word.
+ */
+static uint32_t
+nvc0_decoder_fill_picparm_mpeg4_bsp(struct nvc0_decoder *dec,
+                                    struct pipe_mpeg4_picture_desc *desc,
+                                    char *map)
+{
+   struct mpeg4_picparm_bsp *pic_bsp = (struct mpeg4_picparm_bsp *)map;
+   uint32_t t, bits = 0;
+   pic_bsp->width = dec->base.width;
+   pic_bsp->height = dec->base.height;
+   assert(desc->vop_time_increment_resolution > 0);
+
+   /* count the significant bits of (resolution - 1) */
+   for (t = desc->vop_time_increment_resolution - 1; t; t /= 2)
+      bits++;
+   if (!bits)
+      bits = 1;
+   pic_bsp->vop_time_increment_size = bits;
+   pic_bsp->interlaced = desc->interlaced;
+   pic_bsp->resync_marker_disable = desc->resync_marker_disable;
+   return 4;
+}
+
+/*
+ * Fills the VC-1 picture parameter block at 'map' from the decoder
+ * dimensions, the decoder profile and the picture descriptor.
+ *
+ * Returns the caps word: slice_count shifted left by 4 and masked to
+ * 0xfff0, with the low bits set to 2.
+ */
+static uint32_t
+nvc0_decoder_fill_picparm_vc1_bsp(struct nvc0_decoder *dec,
+                                  struct pipe_vc1_picture_desc *d,
+                                  char *map)
+{
+   struct vc1_picparm_bsp *pic = (struct vc1_picparm_bsp *)map;
+   const uint32_t slice_bits = (d->slice_count << 4)&0xfff0;
+
+   pic->width = dec->base.width;
+   pic->height = dec->base.height;
+
+   /* 04..07 */
+   pic->profile = dec->base.profile - PIPE_VIDEO_PROFILE_VC1_SIMPLE;
+   pic->postprocflag = d->postprocflag;
+   pic->pulldown = d->pulldown;
+   pic->interlaced = d->interlace;
+
+   /* 08..0b */
+   pic->tfcntrflag = d->tfcntrflag;
+   pic->finterpflag = d->finterpflag;
+   pic->psf = d->psf;
+   pic->pad = 0;
+
+   /* 0c..0f */
+   pic->multires = d->multires;
+   pic->syncmarker = d->syncmarker;
+   pic->rangered = d->rangered;
+   pic->maxbframes = d->maxbframes;
+
+   /* 10..13 */
+   pic->dquant = d->dquant;
+   pic->panscan_flag = d->panscan_flag;
+   pic->refdist_flag = d->refdist_flag;
+   pic->quantizer = d->quantizer;
+
+   /* 14..17 */
+   pic->extended_mv = d->extended_mv;
+   pic->extended_dmv = d->extended_dmv;
+   pic->overlap = d->overlap;
+   pic->vstransform = d->vstransform;
+
+   return slice_bits | 2;
+}
+
+/*
+ * Builds the H.264 picture parameter block in a zeroed local copy and then
+ * stores it at 'map' in a single structure assignment.
+ *
+ * Returns the caps word: slice_count shifted left by 4 and masked to
+ * 0xfff0, optionally bit 20, with the low bits set to 3.
+ *
+ * NOTE(review): the assert restricts slice_count to 12 bits while the test
+ * right after it looks at bit 12, so with assertions enabled that branch
+ * cannot be taken - confirm which of the two is intended.
+ */
+static uint32_t
+nvc0_decoder_fill_picparm_h264_bsp(struct nvc0_decoder *dec,
+                                   struct pipe_h264_picture_desc *d,
+                                   char *map)
+{
+   struct h264_picparm_bsp pic = {};
+   uint32_t caps = (d->slice_count << 4)&0xfff0;
+
+   assert(!(d->slice_count & ~0xfff));
+   if (d->slice_count & 0x1000)
+      caps |= 1 << 20;
+
+   /* the layout is fixed; make sure the compiler agrees with it */
+   assert(offsetof(struct h264_picparm_bsp, bottom_field_flag) == (0x39 + 0x24));
+
+   /* fixed values */
+   pic.unk00 = 1;
+   pic.pad1 = pic.pad2 = 0;
+   pic.unk = 0;
+
+   /* first part, offsets 04..20 */
+   pic.log2_max_frame_num_minus4 = d->log2_max_frame_num_minus4;
+   pic.frame_mbs_only_flag = d->frame_mbs_only_flag;
+   pic.direct_8x8_inference_flag = d->direct_8x8_inference_flag;
+   pic.width_mb = mb(dec->base.width);
+   pic.height_mb = mb(dec->base.height);
+   pic.pic_order_cnt_type = d->pic_order_cnt_type;
+   pic.log2_max_pic_order_cnt_lsb_minus4 = d->log2_max_pic_order_cnt_lsb_minus4;
+   pic.delta_pic_order_always_zero_flag = d->delta_pic_order_always_zero_flag;
+
+   /* second part, starting at offset 24 */
+   pic.entropy_coding_mode_flag = d->entropy_coding_mode_flag;
+   pic.pic_order_present_flag = d->pic_order_present_flag;
+   pic.num_ref_idx_l0_active_minus1 = d->num_ref_idx_l0_active_minus1;
+   pic.num_ref_idx_l1_active_minus1 = d->num_ref_idx_l1_active_minus1;
+   pic.weighted_pred_flag = d->weighted_pred_flag;
+   pic.weighted_bipred_idc = d->weighted_bipred_idc;
+   pic.pic_init_qp_minus26 = d->pic_init_qp_minus26;
+   pic.deblocking_filter_control_present_flag = d->deblocking_filter_control_present_flag;
+   pic.redundant_pic_cnt_present_flag = d->redundant_pic_cnt_present_flag;
+   pic.transform_8x8_mode_flag = d->transform_8x8_mode_flag;
+   pic.mb_adaptive_frame_field_flag = d->mb_adaptive_frame_field_flag;
+   pic.field_pic_flag = d->field_pic_flag;
+   pic.bottom_field_flag = d->bottom_field_flag;
+   memset(pic.real_pad, 0, sizeof(pic.real_pad));
+
+   *(struct h264_picparm_bsp *)map = pic;
+   return caps | 3;
+}
+
+#ifdef NVC0_DEBUG_FENCE
+/*
+ * Debug helper: prints the BSP sequence index and byte offset from the comm
+ * area, followed by the status and position entries selected by the low
+ * four bits of that index.
+ *
+ * Guarded with #ifdef, like every other NVC0_DEBUG_FENCE user in the header,
+ * so that the helper exists exactly when fence_bo and comm do.
+ */
+static void dump_comm_bsp(struct comm *comm)
+{
+   unsigned idx = comm->bsp_cur_index & 0xf;
+   debug_printf("Cur seq: %x, bsp byte ofs: %x\n", comm->bsp_cur_index, comm->byte_ofs);
+   debug_printf("Status: %08x, pos: %08x\n", comm->status[idx], comm->pos[idx]);
+}
+#endif
+
+/**
+ * Stage one picture for the BSP engine and submit it.
+ *
+ * Fills the codec-specific BSP picture parameters, the stream parameters and
+ * the raw bitstream into the BSP buffer object selected by comm_seq, calls
+ * nvc0_decoder_vp_caps() to fill the VP side, then pushes the BSP methods
+ * and kicks the push buffer.
+ *
+ * Layout of the BSP buffer object as written here:
+ *   0x000..0x100: picparm_bsp
+ *   0x100..0x200: strparm_bsp
+ *   0x200..0x500: picparm_vp
+ *   0x500..0x700: comm
+ *   0x700..onward: raw bitstream, followed by a 16-byte end sequence
+ *
+ * @param dec          decoder state (push buffers, buffer objects)
+ * @param desc         picture description for the decoder's codec
+ * @param target       buffer being decoded into (forwarded to vp_caps)
+ * @param comm_seq     sequence number; selects bsp_bo/inter_bo and is pushed
+ *                     to method 0x710
+ * @param num_buffers  number of entries in data/num_bytes
+ * @param data         bitstream chunks, copied back to back
+ * @param num_bytes    size in bytes of each chunk
+ * @param vp_caps      out: filled by nvc0_decoder_vp_caps()
+ * @param is_ref       out: filled by nvc0_decoder_vp_caps()
+ * @param refs         out: filled by nvc0_decoder_vp_caps()
+ *
+ * @return (unsigned)-1 if the bitstream does not fit the buffer object, the
+ *         buffer cannot be mapped, or the codec is unknown.  Otherwise 2,
+ *         or, when NVC0_DEBUG_FENCE is enabled, the comm status word for
+ *         this sequence after waiting for the fence.
+ */
+unsigned
+nvc0_decoder_bsp(struct nvc0_decoder *dec, union pipe_desc desc,
+                 struct nvc0_video_buffer *target,
+                 unsigned comm_seq, unsigned num_buffers,
+                 const void *const *data, const unsigned *num_bytes,
+                 unsigned *vp_caps, unsigned *is_ref,
+                 struct nvc0_video_buffer *refs[16])
+{
+   struct nouveau_pushbuf *push = dec->pushbuf[0];
+   enum pipe_video_codec codec = u_reduce_video_profile(dec->base.profile);
+   char *bsp;
+   uint32_t bsp_addr, comm_addr, inter_addr;
+   uint32_t slice_size, bucket_size, ring_size;
+   uint32_t endmarker, caps;
+   uint64_t bsp_size;
+   struct strparm_bsp *str_bsp;
+   int ret, i;
+   struct nouveau_bo *bsp_bo = dec->bsp_bo[comm_seq % NVC0_VIDEO_QDEPTH];
+   struct nouveau_bo *inter_bo = dec->inter_bo[comm_seq & 1];
+   unsigned fence_extra = 0;
+   /* Use #if (not #ifdef) for NVC0_DEBUG_FENCE throughout, so that a
+    * definition of 0 disables every debug-fence path consistently. */
+   struct nouveau_pushbuf_refn bo_refs[] = {
+      { bsp_bo, NOUVEAU_BO_RD | NOUVEAU_BO_VRAM },
+      { inter_bo, NOUVEAU_BO_WR | NOUVEAU_BO_VRAM },
+#if NVC0_DEBUG_FENCE
+      { dec->fence_bo, NOUVEAU_BO_WR | NOUVEAU_BO_GART },
+#endif
+      /* Must stay last: it is dropped below when there is no bitplane bo. */
+      { dec->bitplane_bo, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
+   };
+   int num_refs = sizeof(bo_refs)/sizeof(*bo_refs);
+
+#if NVC0_DEBUG_FENCE
+   fence_extra = 4;
+#endif
+
+   /* The 0x700-byte header area, the bitstream and the 16-byte end
+    * sequence must all fit into the buffer object before anything is
+    * copied into it. */
+   bsp_size = 0x700 + 16;
+   for (i = 0; i < num_buffers; ++i)
+      bsp_size += num_bytes[i];
+   if (bsp_size > bsp_bo->size) {
+      debug_printf("bitstream too large for bsp buffer: %llu > %llu\n",
+                   (unsigned long long)bsp_size,
+                   (unsigned long long)bsp_bo->size);
+      return -1;
+   }
+
+   ret = nouveau_bo_map(bsp_bo, NOUVEAU_BO_WR, dec->client);
+   if (ret) {
+      debug_printf("map failed: %i %s\n", ret, strerror(-ret));
+      return -1;
+   }
+   bsp = bsp_bo->map;
+
+   /* Codec-specific BSP picture parameters go at offset 0; each codec also
+    * has its own end-of-stream marker word. */
+   switch (codec){
+   case PIPE_VIDEO_CODEC_MPEG12:
+      endmarker = 0xb7010000;
+      caps = nvc0_decoder_fill_picparm_mpeg12_bsp(dec, desc.mpeg12, bsp);
+      break;
+   case PIPE_VIDEO_CODEC_MPEG4:
+      endmarker = 0xb1010000;
+      caps = nvc0_decoder_fill_picparm_mpeg4_bsp(dec, desc.mpeg4, bsp);
+      break;
+   case PIPE_VIDEO_CODEC_VC1: {
+      endmarker = 0x0a010000;
+      caps = nvc0_decoder_fill_picparm_vc1_bsp(dec, desc.vc1, bsp);
+      break;
+   }
+   case PIPE_VIDEO_CODEC_MPEG4_AVC: {
+      endmarker = 0x0b010000;
+      caps = nvc0_decoder_fill_picparm_h264_bsp(dec, desc.h264, bsp);
+      break;
+   }
+   default: assert(0); return -1;
+   }
+
+   nvc0_decoder_vp_caps(dec, desc, target, comm_seq, vp_caps, is_ref, refs);
+
+   /* Space: 0x700 block (1+5), 0x400 block (1+8 for H.264, 1+6 otherwise),
+    * optional fence (1+3), and the final 0x300 method (1+1). */
+   PUSH_SPACE(push, 6 + (codec == PIPE_VIDEO_CODEC_MPEG4_AVC ? 9 : 7) + fence_extra + 2);
+   if (!dec->bitplane_bo)
+      num_refs--;
+   nouveau_pushbuf_refn(push, bo_refs, num_refs);
+
+   caps |= 0 << 16; // reset struct comm if flag is set
+   caps |= 1 << 17; // enable watchdog
+   caps |= 0 << 18; // do not report error to VP, so it can continue decoding what we have
+   caps |= 0 << 19; // if enabled, use crypto crap?
+   bsp += 0x100;
+
+   /* Stream parameters at 0x100; w0[0] accumulates the bitstream length
+    * on top of an initial 16. */
+   str_bsp = (struct strparm_bsp *)bsp;
+   memset(str_bsp, 0, 0x80);
+   str_bsp->w0[0] = 16;
+   str_bsp->w1[0] = 0x1;
+   bsp += 0x100;
+   /* Reserved for picparm_vp */
+   bsp += 0x300;
+   /* Reserved for comm */
+#if !NVC0_DEBUG_FENCE
+   memset(bsp, 0, 0x200);
+#endif
+   bsp += 0x200;
+   for (i = 0; i < num_buffers; ++i) {
+      memcpy(bsp, data[i], num_bytes[i]);
+      bsp += num_bytes[i];
+      str_bsp->w0[0] += num_bytes[i];
+   }
+
+   /* Append end sequence: the end marker followed by a zero word, twice.
+    * The bitstream length is arbitrary, so bsp may not be 4-byte aligned
+    * here; copy the words instead of storing through a uint32_t pointer. */
+   for (i = 0; i < 2; ++i) {
+      const uint32_t tail[2] = { endmarker, 0x00000000 };
+
+      memcpy(bsp, tail, sizeof(tail));
+      bsp += sizeof(tail);
+   }
+
+   /* Addresses are passed to the engine in units of 0x100 bytes. */
+   bsp_addr = bsp_bo->offset >> 8;
+   inter_addr = inter_bo->offset >> 8;
+
+#if NVC0_DEBUG_FENCE
+   memset(dec->comm, 0, 0x200);
+   comm_addr = (dec->fence_bo->offset + COMM_OFFSET) >> 8;
+#else
+   comm_addr = bsp_addr + (COMM_OFFSET>>8);
+#endif
+
+   BEGIN_NVC0(push, SUBC_BSP(0x700), 5);
+   PUSH_DATA (push, caps); // 700 cmd
+   PUSH_DATA (push, bsp_addr + 1); // 704 strparm_bsp
+   PUSH_DATA (push, bsp_addr + 7); // 708 str addr
+   PUSH_DATA (push, comm_addr); // 70c comm
+   PUSH_DATA (push, comm_seq); // 710 seq
+
+   if (codec != PIPE_VIDEO_CODEC_MPEG4_AVC) {
+      u32 bitplane_addr;
+
+      bitplane_addr = dec->bitplane_bo->offset >> 8;
+
+      nvc0_decoder_inter_sizes(dec, 1, &slice_size, &bucket_size, &ring_size);
+      BEGIN_NVC0(push, SUBC_BSP(0x400), 6);
+      PUSH_DATA (push, bsp_addr); // 400 picparm addr
+      PUSH_DATA (push, inter_addr); // 404 interparm addr
+      PUSH_DATA (push, inter_addr + slice_size + bucket_size); // 408 interdata addr
+      PUSH_DATA (push, ring_size << 8); // 40c interdata_size
+      PUSH_DATA (push, bitplane_addr); // 410 BITPLANE_DATA
+      PUSH_DATA (push, 0x400); // 414 BITPLANE_DATA_SIZE
+   } else {
+      nvc0_decoder_inter_sizes(dec, desc.h264->slice_count, &slice_size, &bucket_size, &ring_size);
+      BEGIN_NVC0(push, SUBC_BSP(0x400), 8);
+      PUSH_DATA (push, bsp_addr); // 400 picparm addr
+      PUSH_DATA (push, inter_addr); // 404 interparm addr
+      PUSH_DATA (push, slice_size << 8); // 408 interparm size?
+      PUSH_DATA (push, inter_addr + slice_size + bucket_size); // 40c interdata addr
+      PUSH_DATA (push, ring_size << 8); // 410 interdata size
+      PUSH_DATA (push, inter_addr + slice_size); // 414 bucket?
+      PUSH_DATA (push, bucket_size << 8); // 418 bucket size? unshifted..
+      PUSH_DATA (push, 0); // 41c targets
+      // TODO: Double check 414 / 418 with nvidia trace
+   }
+
+#if NVC0_DEBUG_FENCE
+   BEGIN_NVC0(push, SUBC_BSP(0x240), 3);
+   PUSH_DATAh(push, dec->fence_bo->offset);
+   PUSH_DATA (push, dec->fence_bo->offset);
+   PUSH_DATA (push, dec->fence_seq);
+
+   BEGIN_NVC0(push, SUBC_BSP(0x300), 1);
+   PUSH_DATA (push, 1);
+   PUSH_KICK (push);
+
+   /* Busy-wait until the fence value reaches fence_seq, dumping the comm
+    * state every 256 polls. */
+   {
+      unsigned spin = 0;
+      do {
+         usleep(100);
+         if ((spin++ & 0xff) == 0xff) {
+            debug_printf("%u: %u\n", dec->fence_seq, dec->fence_map[0]);
+            dump_comm_bsp(dec->comm);
+         }
+      } while (dec->fence_seq > dec->fence_map[0]);
+   }
+
+   dump_comm_bsp(dec->comm);
+   return dec->comm->status[comm_seq & 0xf];
+#else
+   BEGIN_NVC0(push, SUBC_BSP(0x300), 1);
+   PUSH_DATA (push, 0);
+   PUSH_KICK (push);
+   return 2;
+#endif
+}
diff --git a/src/gallium/drivers/nvc0/nvc0_video_ppp.c b/src/gallium/drivers/nvc0/nvc0_video_ppp.c
new file mode 100644 (file)
index 0000000..2e99540
--- /dev/null
@@ -0,0 +1,145 @@
+/*
+ * Copyright 2011-2013 Maarten Lankhorst
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "nvc0_video.h"
+
+/*
+ * Emit the common PPP setup (method 0x700, ten data words) for one picture.
+ *
+ * References the two output resources of target, the decoder's ref_bo and,
+ * in debug-fence builds, the fence bo, then pushes the stride/size words,
+ * the four input addresses and two output addresses per resource.
+ *
+ * low700 is OR'ed into the low bits of the first data word; the callers in
+ * this file pass a codec-specific constant.
+ */
+static void
+nvc0_decoder_setup_ppp(struct nvc0_decoder *dec, struct nvc0_video_buffer *target, uint32_t low700) {
+   struct nouveau_pushbuf *push = dec->pushbuf[2];
+
+   uint32_t stride_in = mb(dec->base.width);
+   uint32_t stride_out = mb(target->resources[0]->width0);
+   uint32_t dec_h = mb(dec->base.height);
+   uint32_t dec_w = mb(dec->base.width);
+   uint64_t in_addr;
+   uint32_t y2, cbcr, cbcr2, i;
+   /* The first two entries are filled in below from target->resources[].
+    * Use #if (not #ifdef) so the fence entry follows the same condition as
+    * the fence code in nvc0_decoder_ppp(), even if the macro is defined
+    * as 0. */
+   struct nouveau_pushbuf_refn bo_refs[] = {
+      { NULL, NOUVEAU_BO_WR | NOUVEAU_BO_VRAM },
+      { NULL, NOUVEAU_BO_WR | NOUVEAU_BO_VRAM },
+      { dec->ref_bo, NOUVEAU_BO_RD | NOUVEAU_BO_VRAM },
+#if NVC0_DEBUG_FENCE
+      { dec->fence_bo, NOUVEAU_BO_WR | NOUVEAU_BO_GART },
+#endif
+   };
+   unsigned num_refs = sizeof(bo_refs)/sizeof(*bo_refs);
+
+   for (i = 0; i < 2; ++i) {
+      struct nv50_miptree *mt = (struct nv50_miptree *)target->resources[i];
+      bo_refs[i].bo = mt->base.bo;
+   }
+
+   nouveau_pushbuf_refn(push, bo_refs, num_refs);
+   nvc0_decoder_ycbcr_offsets(dec, &y2, &cbcr, &cbcr2);
+
+   BEGIN_NVC0(push, SUBC_PPP(0x700), 10);
+   in_addr = nvc0_video_addr(dec, target) >> 8;
+
+   PUSH_DATA (push, (stride_out << 24) | (stride_out << 16) | low700); // 700
+   PUSH_DATA (push, (stride_in << 24) | (stride_in << 16) | (dec_h << 8) | dec_w); // 704
+   assert(dec_w == stride_in);
+
+   /* Input: */
+   PUSH_DATA (push, in_addr); // 708
+   PUSH_DATA (push, in_addr + y2); // 70c
+   PUSH_DATA (push, in_addr + cbcr); // 710
+   PUSH_DATA (push, in_addr + cbcr2); // 714
+   assert(target->resources[0]->width0 >= 16 * dec_w);
+   assert(target->resources[0]->height0 >= dec->base.height/2);
+
+   /* Output: two addresses per resource, the second one half of a layer's
+    * size past the first; the resources are marked as being written. */
+   for (i = 0; i < 2; ++i) {
+      struct nv50_miptree *mt = (struct nv50_miptree *)target->resources[i];
+
+      PUSH_DATA (push, mt->base.address >> 8);
+      PUSH_DATA (push, (mt->base.address + mt->total_size/2/mt->base.base.array_size) >> 8);
+      mt->base.status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING;
+   }
+}
+
+/*
+ * Emit the PPP setup for a VC-1 picture: the common setup with 0x1412 as
+ * the low bits of method 0x700, followed by pquant << 11 to method 0x400.
+ * Asserts that deblocking is disabled and that the decoder width and height
+ * are multiples of 16.  Always returns 0x10, which the caller pushes as the
+ * PPP caps word.
+ */
+static uint32_t
+nvc0_decoder_vc1_ppp(struct nvc0_decoder *dec, struct pipe_vc1_picture_desc *desc, struct nvc0_video_buffer *target) {
+   struct nouveau_pushbuf *push = dec->pushbuf[2];
+
+   nvc0_decoder_setup_ppp(dec, target, 0x1412);
+   assert(!desc->deblockEnable);
+   assert(!(dec->base.width & 0xf));
+   assert(!(dec->base.height & 0xf));
+
+   BEGIN_NVC0(push, SUBC_PPP(0x400), 1);
+   PUSH_DATA (push, desc->pquant << 11);
+
+   // 728 = wtf?
+   return 0x10;
+}
+
+/*
+ * Submit the PPP pass for one picture on push buffer 2.
+ *
+ * Runs the codec-specific setup, pushes comm_seq and the caps word to
+ * method 0x734, then kicks the push buffer.  With NVC0_DEBUG_FENCE enabled
+ * it also writes a fence at fence_bo + 0x20 and polls fence_map[8] until it
+ * reaches dec->fence_seq.
+ */
+void
+nvc0_decoder_ppp(struct nvc0_decoder *dec, union pipe_desc desc, struct nvc0_video_buffer *target, unsigned comm_seq) {
+   enum pipe_video_codec codec = u_reduce_video_profile(dec->base.profile);
+   struct nouveau_pushbuf *push = dec->pushbuf[2];
+   unsigned ppp_caps = 0x10;
+   unsigned fence_extra = 0;
+
+#if NVC0_DEBUG_FENCE
+   fence_extra = 4;
+#endif
+
+   /* Space: setup (1+10), VC-1 extra method (1+1), 0x734 (1+2),
+    * optional fence (1+3), final 0x300 (1+1). */
+   PUSH_SPACE(push, 11 + (codec == PIPE_VIDEO_CODEC_VC1 ? 2 : 0) + 3 + fence_extra + 2);
+
+   switch (codec) {
+   case PIPE_VIDEO_CODEC_MPEG12: {
+      /* Low bit distinguishes MPEG-2 (1) from MPEG-1 (0). */
+      unsigned mpeg2 = dec->base.profile != PIPE_VIDEO_PROFILE_MPEG1;
+      nvc0_decoder_setup_ppp(dec, target, 0x1410 | mpeg2);
+      break;
+   }
+   case PIPE_VIDEO_CODEC_MPEG4: nvc0_decoder_setup_ppp(dec, target, 0x1414); break;
+   case PIPE_VIDEO_CODEC_VC1: ppp_caps = nvc0_decoder_vc1_ppp(dec, desc.vc1, target); break;
+   case PIPE_VIDEO_CODEC_MPEG4_AVC: nvc0_decoder_setup_ppp(dec, target, 0x1413); break;
+   default: assert(0);
+   }
+   BEGIN_NVC0(push, SUBC_PPP(0x734), 2);
+   PUSH_DATA (push, comm_seq);
+   PUSH_DATA (push, ppp_caps);
+
+#if NVC0_DEBUG_FENCE
+   BEGIN_NVC0(push, SUBC_PPP(0x240), 3);
+   PUSH_DATAh(push, (dec->fence_bo->offset + 0x20));
+   PUSH_DATA (push, (dec->fence_bo->offset + 0x20));
+   PUSH_DATA (push, dec->fence_seq);
+
+   BEGIN_NVC0(push, SUBC_PPP(0x300), 1);
+   PUSH_DATA (push, 1);
+   PUSH_KICK (push);
+
+   /* Busy-wait for the PPP fence, logging progress every 256 polls. */
+   {
+      unsigned spin = 0;
+
+      do {
+         usleep(100);
+         if ((spin++ & 0xff) == 0xff)
+            debug_printf("ppp%u: %u\n", dec->fence_seq, dec->fence_map[8]);
+      } while (dec->fence_seq > dec->fence_map[8]);
+   }
+#else
+   BEGIN_NVC0(push, SUBC_PPP(0x300), 1);
+   PUSH_DATA (push, 0);
+   PUSH_KICK (push);
+#endif
+}
diff --git a/src/gallium/drivers/nvc0/nvc0_video_vp.c b/src/gallium/drivers/nvc0/nvc0_video_vp.c
new file mode 100644 (file)
index 0000000..84af0d6
--- /dev/null
@@ -0,0 +1,667 @@
+/*
+ * Copyright 2011-2013 Maarten Lankhorst
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "nvc0_video.h"
+#include <sys/mman.h>
+
+/*
+ * MPEG-1/2 picture parameters as copied into the VP parameter area by
+ * nvc0_decoder_fill_picparm_mpeg12_vp().  The trailing comments give the
+ * byte offset of each member.
+ */
+struct mpeg12_picparm_vp {
+       uint16_t width; // 00 in mb units
+       uint16_t height; // 02 in mb units
+
+       uint32_t unk04; // 04 stride for Y?
+       uint32_t unk08; // 08 stride for CbCr?
+
+       uint32_t ofs[6]; // 0c..20 ofs
+       uint32_t bucket_size; // 24
+       uint32_t inter_ring_data_size; // 28
+       uint16_t unk2c; // 2c
+       uint16_t alternate_scan; // 2e
+       uint16_t unk30; // 30 not seen set yet
+       uint16_t picture_structure; // 32
+       uint16_t pad2[3];
+       uint16_t unk3a; // 3a set on I frame?
+
+       uint32_t f_code[4]; // 3c
+       uint32_t picture_coding_type; // 4c
+       uint32_t intra_dc_precision; // 50
+       uint32_t q_scale_type; // 54
+       uint32_t top_field_first; // 58
+       uint32_t full_pel_forward_vector; // 5c
+       uint32_t full_pel_backward_vector; // 60
+       uint8_t intra_quantizer_matrix[0x40]; // 64
+       uint8_t non_intra_quantizer_matrix[0x40]; // a4
+};
+
+/*
+ * MPEG-4 part 2 picture parameters as copied into the VP parameter area by
+ * nvc0_decoder_fill_picparm_mpeg4_vp().  The trailing comments give the
+ * byte offset of each member.
+ */
+struct mpeg4_picparm_vp {
+       uint32_t width; // 00 in normal units
+       uint32_t height; // 04 in normal units
+       uint32_t unk08; // stride 1
+       uint32_t unk0c; // stride 2
+       uint32_t ofs[6]; // 10..24 ofs
+       uint32_t bucket_size; // 28
+       uint32_t pad1; // 2c, pad
+       uint32_t pad2; // 30
+       uint32_t inter_ring_data_size; // 34
+
+       uint32_t trd[2]; // 38, 3c
+       uint32_t trb[2]; // 40, 44
+       uint32_t u48; // XXX codec selection? Should test with different values of VdpDecoderProfile
+       uint16_t f_code_fw; // 4c
+       uint16_t f_code_bw; // 4e
+       uint8_t interlaced; // 50
+
+       uint8_t quant_type; // bool, written to 528
+       uint8_t quarter_sample; // bool, written to 548
+       uint8_t short_video_header; // bool, negated written to 528 shifted by 1
+       uint8_t u54; // bool, written to 0x740
+       uint8_t vop_coding_type; // 55
+       uint8_t rounding_control; // 56
+       uint8_t alternate_vertical_scan_flag; // 57 bool
+       uint8_t top_field_first; // bool, written to vuc
+
+       uint8_t pad4[3]; // 59, 5a, 5b, contains garbage on blob
+       uint32_t pad5[0x10]; // 5c...9c non-inclusive, but WHY?
+
+       uint32_t intra[0x10]; // 9c
+       uint32_t non_intra[0x10]; // dc (0x9c + 0x40 bytes of intra)
+       // As declared the struct ends at 0x11c.
+       // NOTE(review): earlier notes placed non_intra at bc and padding at
+       // dc..ff, which does not match these array sizes -- confirm the
+       // intended layout.
+};
+
+// Full version, with data pumped from BSP
+// VC-1 picture parameters; nvc0_decoder_fill_picparm_vc1_vp() fills these
+// members in place in the VP parameter area.  The trailing comments give
+// the byte offset of each member.
+struct vc1_picparm_vp {
+       uint32_t bucket_size; // 00
+       uint32_t pad; // 04
+
+       uint32_t inter_ring_data_size; // 08
+       uint32_t unk0c; // stride 1
+       uint32_t unk10; // stride 2
+       uint32_t ofs[6]; // 14..28 ofs
+
+       uint16_t width; // 2c
+       uint16_t height; // 2e
+
+       uint8_t profile; // 30 0 = simple, 1 = main, 2 = advanced
+       uint8_t loopfilter; // 31 written into vuc
+       uint8_t fastuvmc; // 32, written into vuc
+       uint8_t dquant; // 33
+
+       uint8_t overlap; // 34
+       uint8_t quantizer; // 35
+       uint8_t u36; // 36, bool
+       uint8_t pad2; // 37, to align to 0x38
+};
+
+/*
+ * H.264 picture parameters as copied into the VP parameter area by
+ * nvc0_decoder_fill_picparm_h264_vp().  The trailing comments give byte
+ * offsets and, for bitfields, the bit range within the containing word.
+ * The fill function asserts that u224 sits at offset 0x224.
+ */
+struct h264_picparm_vp { // 700..a00
+       uint16_t width, height;
+       uint32_t stride1, stride2; // 04 08
+       uint32_t ofs[6]; // 0c..24 in-image offset
+
+       uint32_t u24; // nfi ac8 ?
+       uint32_t bucket_size; // 28 bucket size
+       uint32_t inter_ring_data_size; // 2c
+
+       unsigned f0 : 1; // 0 0x01: into 640 shifted by 3, 540 shifted by 5, half size something?
+       unsigned f1 : 1; // 1 0x02: into vuc ofs 56
+       unsigned weighted_pred_flag : 1; // 2 0x04
+       unsigned f3 : 1; // 3 0x08: into vuc ofs 68
+       unsigned is_reference : 1; // 4
+       unsigned interlace : 1; // 5 field_pic_flag
+       unsigned bottom_field_flag : 1; // 6
+       unsigned f7 : 1; // 7 0x80: nfi yet
+
+       signed log2_max_frame_num_minus4 : 4; // 31 0..3
+       unsigned u31_45 : 2; // 31 4..5
+       unsigned pic_order_cnt_type : 2; // 31 6..7
+       signed pic_init_qp_minus26 : 6; // 32 0..5
+       signed chroma_qp_index_offset : 5; // 32 6..10
+       signed second_chroma_qp_index_offset : 5; // 32 11..15
+
+       unsigned weighted_bipred_idc : 2; // 34 0..1
+       unsigned fifo_dec_index : 7; // 34 2..8
+       unsigned tmp_idx : 5; // 34 9..13
+       unsigned frame_number : 16; // 34 14..29
+       unsigned u34_3030 : 1; // 34 30..30 pp.u34[30:30]
+       unsigned u34_3131 : 1; // 34 31..31 pad?
+
+       uint32_t field_order_cnt[2]; // 38, 3c
+
+       // One 16-byte entry per reference picture.
+       struct { // 40
+               // 0x00223102
+               // nfi (needs: top_is_reference, bottom_is_reference, is_long_term, maybe some other state that was saved..
+               unsigned fifo_idx : 7; // 00 0..6
+               unsigned tmp_idx : 5; // 00 7..11
+               unsigned unk12 : 1; // 00 12 not seen yet, but set, maybe top_is_reference
+               unsigned unk13 : 1; // 00 13 not seen yet, but set, maybe bottom_is_reference?
+               unsigned unk14 : 1; // 00 14 skipped?
+               unsigned notseenyet : 1; // 00 15 pad?
+               unsigned unk16 : 1; // 00 16
+               unsigned unk17 : 4; // 00 17..20
+               unsigned unk21 : 4; // 00 21..24
+               unsigned pad : 7; // 00 25..31
+
+               uint32_t field_order_cnt[2]; // 04,08
+               uint32_t frame_idx; // 0c
+       } refs[0x10];
+
+       uint8_t m4x4[6][16]; // 140
+       uint8_t m8x8[2][64]; // 1a0
+       uint32_t u220; // 220 number of extra reorder_list to append?
+       uint8_t u224[0x20]; // 224..244 reorder_list append ?
+       uint8_t nfi244[0xb0]; // add some pad to make sure nulls are read
+};
+
+/*
+ * Update the decoder's reference slot table for one picture.
+ *
+ * First pass: every non-NULL entry of refs[] whose recorded slot
+ * (valid_ref) still holds that buffer gets its last_used stamped with seq.
+ * If target itself is among refs[], no new slot is allocated and the
+ * function returns after this pass.
+ *
+ * Second pass: pick a slot for target among the max_references + 1 slots
+ * that were not used in this sequence, preferring one with no buffer
+ * attached and otherwise the one with the oldest last_used, then bind
+ * target to it and clear its decoded_top/decoded_bottom flags.
+ */
+static void
+nvc0_decoder_handle_references(struct nvc0_decoder *dec, struct nvc0_video_buffer *refs[16], unsigned seq, struct nvc0_video_buffer *target)
+{
+   unsigned h264 = u_reduce_video_profile(dec->base.profile) == PIPE_VIDEO_CODEC_MPEG4_AVC;
+   /* max_references + 1 doubles as the "no slot chosen yet" marker;
+    * 0 means "target is already referenced, do not allocate". */
+   unsigned i, idx, empty_spot = dec->base.max_references + 1;
+   for (i = 0; i < dec->base.max_references; ++i) {
+      if (!refs[i])
+         continue;
+
+      idx = refs[i]->valid_ref;
+      //debug_printf("ref[%i] %p in slot %i\n", i, refs[i], idx);
+      /* The target may only reference itself for H.264, once, and only
+       * while one of its two fields has not been decoded yet. */
+      assert(target != refs[i] ||
+             (h264 && empty_spot &&
+              (!dec->refs[idx].decoded_bottom || !dec->refs[idx].decoded_top)));
+      if (target == refs[i])
+         empty_spot = 0;
+
+      assert(!h264 ||
+             dec->refs[idx].last_used == seq - 1);
+
+      if (dec->refs[idx].vidbuf != refs[i]) {
+         debug_printf("%p is not a real ref\n", refs[i]);
+         // FIXME: Maybe do m2mf copy here if a application really depends on it?
+         continue;
+      }
+
+      assert(dec->refs[idx].vidbuf == refs[i]);
+      dec->refs[idx].last_used = seq;
+   }
+   if (!empty_spot)
+      return;
+
+   /* Try to find a real empty spot first, there should be one..
+    */
+   for (i = 0; i < dec->base.max_references + 1; ++i) {
+      if (dec->refs[i].last_used < seq) {
+         if (!dec->refs[i].vidbuf) {
+            empty_spot = i;
+            break;
+         }
+         /* Keep the current candidate if it is older than slot i. */
+         if (empty_spot < dec->base.max_references+1 &&
+             dec->refs[empty_spot].last_used < dec->refs[i].last_used)
+            continue;
+         empty_spot = i;
+      }
+   }
+   assert(empty_spot < dec->base.max_references+1);
+   dec->refs[empty_spot].last_used = seq;
+//   debug_printf("Kicked %p to add %p to slot %i\n", dec->refs[empty_spot].vidbuf, target, i);
+   dec->refs[empty_spot].vidbuf = target;
+   dec->refs[empty_spot].decoded_bottom = dec->refs[empty_spot].decoded_top = 0;
+   target->valid_ref = empty_spot;
+}
+
+/*
+ * Release the reference slot recorded in target->valid_ref: detach its
+ * buffer and reset its last-used sequence number to 0.
+ */
+static void
+nvc0_decoder_kick_ref(struct nvc0_decoder *dec, struct nvc0_video_buffer *target)
+{
+   const unsigned slot = target->valid_ref;
+
+   dec->refs[slot].vidbuf = NULL;
+   dec->refs[slot].last_used = 0;
+//   debug_printf("Unreffed %p\n", target);
+}
+
+/*
+ * Build the MPEG-1/2 VP picture parameters from desc, copy them to map and
+ * store the picture's reference buffers in refs[].
+ *
+ * *is_ref is set when picture_coding_type is 1 or 2.  Returns the VP caps
+ * word: 0x01010, with bit 0 set for anything but MPEG-1.
+ */
+static uint32_t
+nvc0_decoder_fill_picparm_mpeg12_vp(struct nvc0_decoder *dec,
+                                    struct pipe_mpeg12_picture_desc *desc,
+                                    struct nvc0_video_buffer *refs[16],
+                                    unsigned *is_ref,
+                                    char *map)
+{
+   struct mpeg12_picparm_vp pic_vp_stub = {}, *pic_vp = &pic_vp_stub;
+   uint32_t i, ret = 0x01010, ring; // !async_shutdown << 16 | watchdog << 12 | irq_record << 4 | unk;
+   assert(!(dec->base.width & 0xf));
+   *is_ref = desc->picture_coding_type <= 2;
+
+   /* MPEG-1 always uses picture_structure 3. */
+   if (dec->base.profile == PIPE_VIDEO_PROFILE_MPEG1)
+      pic_vp->picture_structure = 3;
+   else
+      pic_vp->picture_structure = desc->picture_structure;
+
+   /* NOTE(review): the assert rejects picture_structure 4 in debug builds,
+    * so the branch below only takes effect when asserts are compiled out. */
+   assert(desc->picture_structure != 4);
+   if (desc->picture_structure == 4) // Untested, but should work
+      ret |= 0x100;
+   pic_vp->width = mb(dec->base.width);
+   pic_vp->height = mb(dec->base.height);
+   pic_vp->unk08 = pic_vp->unk04 = (dec->base.width+0xf)&~0xf; // Stride
+
+   nvc0_decoder_ycbcr_offsets(dec, &pic_vp->ofs[1], &pic_vp->ofs[3], &pic_vp->ofs[4]);
+   pic_vp->ofs[5] = pic_vp->ofs[3];
+   pic_vp->ofs[0] = pic_vp->ofs[2] = 0;
+   /* The third argument's result (ring) is not used here. */
+   nvc0_decoder_inter_sizes(dec, 1, &ring, &pic_vp->bucket_size, &pic_vp->inter_ring_data_size);
+
+   pic_vp->alternate_scan = desc->alternate_scan;
+   pic_vp->pad2[0] = pic_vp->pad2[1] = pic_vp->pad2[2] = 0;
+   pic_vp->unk30 = desc->picture_structure < 3 && (desc->picture_structure == 2 - desc->top_field_first);
+   pic_vp->unk3a = (desc->picture_coding_type == 1);
+   for (i = 0; i < 4; ++i)
+      pic_vp->f_code[i] = desc->f_code[i/2][i%2] + 1; // FU
+   pic_vp->picture_coding_type = desc->picture_coding_type;
+   pic_vp->intra_dc_precision = desc->intra_dc_precision;
+   pic_vp->q_scale_type = desc->q_scale_type;
+   pic_vp->top_field_first = desc->top_field_first;
+   pic_vp->full_pel_forward_vector = desc->full_pel_forward_vector;
+   pic_vp->full_pel_backward_vector = desc->full_pel_backward_vector;
+   memcpy(pic_vp->intra_quantizer_matrix, desc->intra_matrix, 0x40);
+   memcpy(pic_vp->non_intra_quantizer_matrix, desc->non_intra_matrix, 0x40);
+   memcpy(map, pic_vp, sizeof(*pic_vp));
+   /* Pack the references: ref[1] lands in refs[1] when ref[0] is set, and
+    * in refs[0] otherwise. */
+   refs[0] = (struct nvc0_video_buffer *)desc->ref[0];
+   refs[!!refs[0]] = (struct nvc0_video_buffer *)desc->ref[1];
+   return ret | (dec->base.profile != PIPE_VIDEO_PROFILE_MPEG1);
+}
+
+/*
+ * Build the MPEG-4 VP picture parameters from desc, copy them to map and
+ * store the picture's reference buffers in refs[].
+ *
+ * *is_ref is set when vop_coding_type is 0 or 1.  Always returns the VP
+ * caps word 0x01014
+ * (!async_shutdown << 16 | watchdog << 12 | irq_record << 4 | unk).
+ */
+static uint32_t
+nvc0_decoder_fill_picparm_mpeg4_vp(struct nvc0_decoder *dec,
+                                   struct pipe_mpeg4_picture_desc *desc,
+                                   struct nvc0_video_buffer *refs[16],
+                                   unsigned *is_ref,
+                                   char *map)
+{
+   struct mpeg4_picparm_vp pp = {};
+   uint32_t unused_size; // third result of nvc0_decoder_inter_sizes, not needed here
+
+   assert(!(dec->base.width & 0xf));
+   *is_ref = desc->vop_coding_type <= 1;
+
+   /* Picture geometry; both strides are the width rounded via mb(). */
+   pp.width = dec->base.width;
+   pp.height = mb(dec->base.height)<<4;
+   pp.unk08 = mb(dec->base.width)<<4; // Stride
+   pp.unk0c = pp.unk08;
+
+   /* In-image offsets and intermediate buffer sizes. */
+   nvc0_decoder_ycbcr_offsets(dec, &pp.ofs[1], &pp.ofs[3], &pp.ofs[4]);
+   pp.ofs[5] = pp.ofs[3];
+   pp.ofs[2] = 0;
+   pp.ofs[0] = 0;
+   pp.pad2 = 0;
+   pp.pad1 = 0;
+   nvc0_decoder_inter_sizes(dec, 1, &unused_size, &pp.bucket_size, &pp.inter_ring_data_size);
+
+   /* Per-picture fields taken straight from the description. */
+   pp.trd[0] = desc->trd[0];
+   pp.trd[1] = desc->trd[1];
+   pp.trb[0] = desc->trb[0];
+   pp.trb[1] = desc->trb[1];
+   pp.u48 = 0; // Codec?
+   pp.f_code_fw = desc->vop_fcode_forward;
+   pp.f_code_bw = desc->vop_fcode_backward;
+   pp.interlaced = desc->interlaced;
+   pp.quant_type = desc->quant_type;
+   pp.quarter_sample = desc->quarter_sample;
+   pp.short_video_header = desc->short_video_header;
+   pp.u54 = 0;
+   pp.vop_coding_type = desc->vop_coding_type;
+   pp.rounding_control = desc->rounding_control;
+   pp.alternate_vertical_scan_flag = desc->alternate_vertical_scan_flag;
+   pp.top_field_first = desc->top_field_first;
+
+   memcpy(pp.intra, desc->intra_matrix, 0x40);
+   memcpy(pp.non_intra, desc->non_intra_matrix, 0x40);
+   memcpy(map, &pp, sizeof(pp));
+
+   /* Pack the references: ref[1] goes to refs[1] when ref[0] is set, and
+    * takes over refs[0] otherwise. */
+   refs[0] = (struct nvc0_video_buffer *)desc->ref[0];
+   if (refs[0])
+      refs[1] = (struct nvc0_video_buffer *)desc->ref[1];
+   else
+      refs[0] = (struct nvc0_video_buffer *)desc->ref[1];
+
+   return 0x01014;
+}
+
+/*
+ * Build the H.264 VP picture parameters from d, copy them to map and store
+ * the picture's reference buffers in refs[].
+ *
+ * *is_ref mirrors d->is_reference.  dec->last_frame_num is updated to
+ * d->frame_num.  tmp_idx for the current picture is left at 0 here and set
+ * afterwards by nvc0_decoder_fill_picparm_h264_vp_refs().  Always returns
+ * the VP caps word 0x1113.
+ */
+static uint32_t
+nvc0_decoder_fill_picparm_h264_vp(struct nvc0_decoder *dec,
+                                  const struct pipe_h264_picture_desc *d,
+                                  struct nvc0_video_buffer *refs[16],
+                                  unsigned *is_ref,
+                                  char *map)
+{
+   struct h264_picparm_vp stub_h = {}, *h = &stub_h;
+   unsigned ring, i, j = 0;
+   assert(offsetof(struct h264_picparm_vp, u224) == 0x224);
+   *is_ref = d->is_reference;
+   /* frame_num must be 0, unchanged, or exactly one past the previous one. */
+   assert(!d->frame_num || dec->last_frame_num + 1 == d->frame_num || dec->last_frame_num == d->frame_num);
+   dec->last_frame_num = d->frame_num;
+
+   h->width = mb(dec->base.width);
+   h->height = mb(dec->base.height);
+   h->stride1 = h->stride2 = mb(dec->base.width)*16;
+   nvc0_decoder_ycbcr_offsets(dec, &h->ofs[1], &h->ofs[3], &h->ofs[4]);
+   h->ofs[5] = h->ofs[3];
+   h->ofs[0] = h->ofs[2] = 0;
+   h->u24 = dec->tmp_stride >> 8;
+   assert(h->u24);
+   /* The third argument's result (ring) is not used here. */
+   nvc0_decoder_inter_sizes(dec, 1, &ring, &h->bucket_size, &h->inter_ring_data_size);
+
+   h->u220 = 0;
+   h->f0 = d->mb_adaptive_frame_field_flag;
+   h->f1 = d->direct_8x8_inference_flag;
+   h->weighted_pred_flag = d->weighted_pred_flag;
+   h->f3 = d->constrained_intra_pred_flag;
+   h->is_reference = d->is_reference;
+   h->interlace = d->field_pic_flag;
+   h->bottom_field_flag = d->bottom_field_flag;
+   h->f7 = 0; // TODO: figure out when set..
+   h->log2_max_frame_num_minus4 = d->log2_max_frame_num_minus4;
+   h->u31_45 = 1;
+
+   h->pic_order_cnt_type = d->pic_order_cnt_type;
+   h->pic_init_qp_minus26 = d->pic_init_qp_minus26;
+   h->chroma_qp_index_offset = d->chroma_qp_index_offset;
+   h->second_chroma_qp_index_offset = d->second_chroma_qp_index_offset;
+   h->weighted_bipred_idc = d->weighted_bipred_idc;
+   h->tmp_idx = 0; // set in h264_vp_refs below
+   h->fifo_dec_index = 0; // always set to 0 to be fifo compatible with other codecs
+   h->frame_number = d->frame_num;
+   h->u34_3030 = h->u34_3131 = 0;
+   h->field_order_cnt[0] = d->field_order_cnt[0];
+   h->field_order_cnt[1] = d->field_order_cnt[1];
+   memset(h->refs, 0, sizeof(h->refs));
+   /* NOTE(review): a single copy fills both m4x4 and m8x8, so it reads past
+    * the end of scaling_lists_4x4; this presumably relies on the 8x8 lists
+    * directly following the 4x4 lists in the picture description -- confirm
+    * against the pipe_h264_picture_desc definition. */
+   memcpy(h->m4x4, d->scaling_lists_4x4, sizeof(h->m4x4) + sizeof(h->m8x8));
+   h->u220 = 0;
+   /* Fill one entry per reference, stopping at the first NULL ref. */
+   for (i = 0; i < d->num_ref_frames; ++i) {
+      if (!d->ref[i])
+         break;
+      refs[j] = (struct nvc0_video_buffer *)d->ref[i];
+      h->refs[j].fifo_idx = j + 1;
+      h->refs[j].tmp_idx = refs[j]->valid_ref;
+      h->refs[j].field_order_cnt[0] = d->field_order_cnt_list[i][0];
+      h->refs[j].field_order_cnt[1] = d->field_order_cnt_list[i][1];
+      h->refs[j].frame_idx = d->frame_num_list[i];
+      if (!dec->refs[refs[j]->valid_ref].field_pic_flag) {
+         h->refs[j].unk12 = d->top_is_reference[i];
+         h->refs[j].unk13 = d->bottom_is_reference[i];
+      }
+      h->refs[j].unk14 = 0;
+      h->refs[j].notseenyet = 0;
+      h->refs[j].unk16 = dec->refs[refs[j]->valid_ref].field_pic_flag;
+      h->refs[j].unk17 = dec->refs[refs[j]->valid_ref].decoded_top &&
+                         d->top_is_reference[i];
+      h->refs[j].unk21 = dec->refs[refs[j]->valid_ref].decoded_bottom &&
+                         d->bottom_is_reference[i];
+      h->refs[j].pad = 0;
+      assert(!d->is_long_term[i]);
+      j++;
+   }
+   /* All remaining refs must be NULL.  This loop always leaves i == 16. */
+   for (; i < 16; ++i)
+      assert(!d->ref[i]);
+   assert(d->num_ref_frames <= dec->base.max_references);
+
+   /* NOTE(review): i is already 16 here, so this loop only runs if
+    * num_ref_frames exceeds 16, and it never advances j.  It looks
+    * unreachable in practice -- confirm the intent before changing it. */
+   for (; i < d->num_ref_frames; ++i)
+      h->refs[j].unk16 = d->field_pic_flag;
+   *(struct h264_picparm_vp *)map = *h;
+
+   return 0x1113;
+}
+
+/*
+ * Finish the H.264 VP picture parameters once target has a reference slot:
+ * write that slot index into tmp_idx of the parameters at map, record the
+ * picture's field_pic_flag on the slot, and mark which field(s) this
+ * picture decodes.  The refs argument is not used.
+ */
+static void
+nvc0_decoder_fill_picparm_h264_vp_refs(struct nvc0_decoder *dec,
+                                       struct pipe_h264_picture_desc *d,
+                                       struct nvc0_video_buffer *refs[16],
+                                       struct nvc0_video_buffer *target,
+                                       char *map)
+{
+   const unsigned slot = target->valid_ref;
+   struct h264_picparm_vp *picparm = (struct h264_picparm_vp *)map;
+
+   assert(dec->refs[slot].vidbuf == target);
+//    debug_printf("Target: %p\n", target);
+
+   picparm->tmp_idx = slot;
+   dec->refs[slot].field_pic_flag = d->field_pic_flag;
+
+   if (!d->field_pic_flag) {
+      /* Frame picture: both fields are decoded. */
+      dec->refs[slot].decoded_bottom = 1;
+      dec->refs[slot].decoded_top = 1;
+   } else if (d->bottom_field_flag) {
+      dec->refs[slot].decoded_bottom = 1;
+   } else {
+      dec->refs[slot].decoded_top = 1;
+   }
+}
+
+/*
+ * Fill the VC-1 VP picture parameters directly in map (no local staging
+ * copy, unlike the other codecs' fill functions) and store the picture's
+ * reference buffers in refs[].
+ *
+ * *is_ref is set when picture_type is 0 or 1.  Asserts that the profile is
+ * not VC-1 simple.  Always returns the VP caps word 0x12.
+ */
+static uint32_t
+nvc0_decoder_fill_picparm_vc1_vp(struct nvc0_decoder *dec,
+                                 struct pipe_vc1_picture_desc *d,
+                                 struct nvc0_video_buffer *refs[16],
+                                 unsigned *is_ref,
+                                 char *map)
+{
+   struct vc1_picparm_vp *vc = (struct vc1_picparm_vp *)map;
+   unsigned ring;
+   assert(dec->base.profile != PIPE_VIDEO_PROFILE_VC1_SIMPLE);
+   *is_ref = d->picture_type <= 1;
+
+   nvc0_decoder_ycbcr_offsets(dec, &vc->ofs[1], &vc->ofs[3], &vc->ofs[4]);
+   vc->ofs[5] = vc->ofs[3];
+   vc->ofs[0] = vc->ofs[2] = 0;
+   vc->width = dec->base.width;
+   vc->height = mb(dec->base.height)<<4;
+   vc->unk0c = vc->unk10 = mb(dec->base.width)<<4; // Stride
+   vc->pad = vc->pad2 = 0;
+   /* The third argument's result (ring) is not used here. */
+   nvc0_decoder_inter_sizes(dec, 1, &ring, &vc->bucket_size, &vc->inter_ring_data_size);
+   /* Profile index relative to simple: 0 = simple, 1 = main, 2 = advanced. */
+   vc->profile = dec->base.profile - PIPE_VIDEO_PROFILE_VC1_SIMPLE;
+   vc->loopfilter = d->loopfilter;
+   vc->fastuvmc = d->fastuvmc;
+   vc->dquant = d->dquant;
+   vc->overlap = d->overlap;
+   vc->quantizer = d->quantizer;
+   vc->u36 = 0; // ? No idea what this one is..
+   /* Pack the references: ref[1] lands in refs[1] when ref[0] is set, and
+    * in refs[0] otherwise. */
+   refs[0] = (struct nvc0_video_buffer *)d->ref[0];
+   refs[!!refs[0]] = (struct nvc0_video_buffer *)d->ref[1];
+   return 0x12;
+}
+
+#if NVC0_DEBUG_FENCE
+/*
+ * Debug helper: print the VP status, stage and mb_y for the current VP
+ * index in comm.  If the status is not 1 and the stage's low byte is not
+ * 0xff, additionally hex-dump the start of inter_bo (byte_ofs + slice_size
+ * bytes) and then assert on the stage.
+ */
+static void dump_comm_vp(struct nvc0_decoder *dec, struct comm *comm, u32 comm_seq,
+                         struct nouveau_bo *inter_bo, unsigned slice_size)
+{
+       unsigned i, idx = comm->pvp_cur_index & 0xf;
+       debug_printf("Status: %08x, stage: %08x\n", comm->status_vp[idx], comm->pvp_stage);
+#if 0
+       debug_printf("Acked byte ofs: %x, bsp byte ofs: %x\n", comm->acked_byte_ofs, comm->byte_ofs);
+       debug_printf("Irq/parse indexes: %i %i\n", comm->irq_index, comm->parse_endpos_index);
+
+       for (i = 0; i != comm->irq_index; ++i)
+               debug_printf("irq[%i] = { @ %08x -> %04x }\n", i, comm->irq_pos[i], comm->irq_470[i]);
+       for (i = 0; i != comm->parse_endpos_index; ++i)
+               debug_printf("parse_endpos[%i] = { @ %08x}\n", i, comm->parse_endpos[i]);
+#endif
+       debug_printf("mb_y = %u\n", comm->mb_y[idx]);
+       if (comm->status_vp[idx] == 1)
+               return;
+
+       if ((comm->pvp_stage & 0xff) != 0xff) {
+               /* Map outside of assert(): an assert argument is not
+                * evaluated when NDEBUG is defined, which would leave
+                * inter_bo unmapped for the dump below. */
+               int ret = nouveau_bo_map(inter_bo, NOUVEAU_BO_RD|NOUVEAU_BO_NOBLOCK, dec->client);
+               assert(ret >= 0);
+               if (ret >= 0) {
+                       unsigned *map = inter_bo->map;
+                       for (i = 0; i < comm->byte_ofs + slice_size; i += 0x10) {
+                               debug_printf("%05x: %08x %08x %08x %08x\n", i, map[i/4], map[i/4+1], map[i/4+2], map[i/4+3]);
+                       }
+                       munmap(inter_bo->map, inter_bo->size);
+                       inter_bo->map = NULL;
+               }
+       }
+       assert((comm->pvp_stage & 0xff) == 0xff);
+}
+#endif
+
+/*
+ * Write the codec-specific VP picture parameters for this submission and
+ * register the reference frames.
+ *
+ * The parameters are written at VP_OFFSET inside the bsp buffer selected by
+ * comm_seq. The matching fill_picparm helper provides *caps (its return
+ * value) and receives is_ref and refs to fill in; afterwards the references
+ * are handed to nvc0_decoder_handle_references(). H.264 additionally fills
+ * its per-reference picparm data once the references have been handled.
+ *
+ * An unknown codec asserts and returns without touching the references.
+ */
+void nvc0_decoder_vp_caps(struct nvc0_decoder *dec, union pipe_desc desc,
+                          struct nvc0_video_buffer *target, unsigned comm_seq,
+                          unsigned *caps, unsigned *is_ref,
+                          struct nvc0_video_buffer *refs[16])
+{
+   enum pipe_video_codec codec = u_reduce_video_profile(dec->base.profile);
+   struct nouveau_bo *bo = dec->bsp_bo[comm_seq % NVC0_VIDEO_QDEPTH];
+   char *picparm = bo->map + VP_OFFSET;
+
+   switch (codec) {
+   case PIPE_VIDEO_CODEC_MPEG12:
+      *caps = nvc0_decoder_fill_picparm_mpeg12_vp(dec, desc.mpeg12, refs, is_ref, picparm);
+      break;
+   case PIPE_VIDEO_CODEC_MPEG4:
+      *caps = nvc0_decoder_fill_picparm_mpeg4_vp(dec, desc.mpeg4, refs, is_ref, picparm);
+      break;
+   case PIPE_VIDEO_CODEC_VC1:
+      *caps = nvc0_decoder_fill_picparm_vc1_vp(dec, desc.vc1, refs, is_ref, picparm);
+      break;
+   case PIPE_VIDEO_CODEC_MPEG4_AVC:
+      *caps = nvc0_decoder_fill_picparm_h264_vp(dec, desc.h264, refs, is_ref, picparm);
+      break;
+   default:
+      assert(0);
+      return;
+   }
+
+   /* Common to every supported codec: runs right after the picparm fill. */
+   nvc0_decoder_handle_references(dec, refs, dec->fence_seq, target);
+
+   /* H.264 reference data is filled only after the references are handled. */
+   if (codec == PIPE_VIDEO_CODEC_MPEG4_AVC)
+      nvc0_decoder_fill_picparm_h264_vp_refs(dec, desc.h264, refs, target, picparm);
+}
+
+/*
+ * Build and kick the VP engine command stream for one picture.
+ *
+ * dec      - decoder; pushbuf[1] is used for the submission
+ * desc     - picture description; only desc.h264->slice_count is read here,
+ *            and only for H.264
+ * target   - buffer being decoded into (its address goes into pic_addr[16])
+ * comm_seq - submission sequence number; selects the bsp and inter buffers
+ *            and is pushed as method 0x704
+ * caps     - value pushed as method 0x700
+ * is_ref   - when zero, nvc0_decoder_kick_ref() is called on target
+ * refs     - reference buffers; the first dec->base.max_references entries
+ *            are read
+ *
+ * With NVC0_DEBUG_FENCE enabled the function additionally emits a fence
+ * write and spins until dec->fence_map[4] reaches dec->fence_seq, dumping
+ * the comm area along the way.
+ */
+void
+nvc0_decoder_vp(struct nvc0_decoder *dec, union pipe_desc desc,
+                struct nvc0_video_buffer *target, unsigned comm_seq,
+                unsigned caps, unsigned is_ref,
+                struct nvc0_video_buffer *refs[16])
+{
+   struct nouveau_pushbuf *push = dec->pushbuf[1];
+   uint32_t bsp_addr, comm_addr, inter_addr, ucode_addr, pic_addr[17], last_addr, null_addr;
+   uint32_t slice_size, bucket_size, ring_size, i;
+   enum pipe_video_codec codec = u_reduce_video_profile(dec->base.profile);
+   struct nouveau_bo *bsp_bo = dec->bsp_bo[comm_seq % NVC0_VIDEO_QDEPTH];
+   struct nouveau_bo *inter_bo = dec->inter_bo[comm_seq & 1];
+   u32 fence_extra = 0, codec_extra = 0;
+   struct nouveau_pushbuf_refn bo_refs[] = {
+      { inter_bo, NOUVEAU_BO_WR | NOUVEAU_BO_VRAM },
+      { dec->ref_bo, NOUVEAU_BO_WR | NOUVEAU_BO_VRAM },
+      { bsp_bo, NOUVEAU_BO_RD | NOUVEAU_BO_VRAM },
+      /* Use #if, like every other debug-fence section in this function, so
+       * that a definition of NVC0_DEBUG_FENCE as 0 leaves fence_bo out. */
+#if NVC0_DEBUG_FENCE
+      { dec->fence_bo, NOUVEAU_BO_WR | NOUVEAU_BO_GART },
+#endif
+      /* fw_bo must stay last: it is dropped from the count when NULL. */
+      { dec->fw_bo, NOUVEAU_BO_RD | NOUVEAU_BO_VRAM },
+   };
+   int num_refs = sizeof(bo_refs)/sizeof(*bo_refs) - !dec->fw_bo;
+
+#if NVC0_DEBUG_FENCE
+   fence_extra = 4;
+#endif
+
+   if (codec == PIPE_VIDEO_CODEC_MPEG4_AVC) {
+      nvc0_decoder_inter_sizes(dec, desc.h264->slice_count, &slice_size, &bucket_size, &ring_size);
+      codec_extra += 2;
+   } else
+      nvc0_decoder_inter_sizes(dec, 1, &slice_size, &bucket_size, &ring_size);
+
+   if (dec->base.max_references > 2)
+      codec_extra += 1 + (dec->base.max_references - 2);
+
+   /* pic_addr[0..15] hold references, pic_addr[16] the target. */
+   assert(dec->base.max_references <= 16);
+
+   pic_addr[16] = nvc0_video_addr(dec, target) >> 8;
+   last_addr = null_addr = nvc0_video_addr(dec, NULL) >> 8;
+
+   /* pic_addr[0] and pic_addr[1] are always pushed below, so give them a
+    * defined value even when max_references < 2. */
+   pic_addr[0] = pic_addr[1] = null_addr;
+
+   for (i = 0; i < dec->base.max_references; ++i) {
+      if (!refs[i])
+         pic_addr[i] = last_addr;
+      else if (dec->refs[refs[i]->valid_ref].vidbuf == refs[i])
+         last_addr = pic_addr[i] = nvc0_video_addr(dec, refs[i]) >> 8;
+      else
+         pic_addr[i] = null_addr;
+   }
+   if (!is_ref)
+      nvc0_decoder_kick_ref(dec, target);
+
+   /* Reserve room for everything emitted below in one go. */
+   PUSH_SPACE(push, 8 + 3 * (codec != PIPE_VIDEO_CODEC_MPEG12) +
+              6 + codec_extra + fence_extra + 2);
+
+   nouveau_pushbuf_refn(push, bo_refs, num_refs);
+
+   bsp_addr = bsp_bo->offset >> 8;
+#if NVC0_DEBUG_FENCE
+   comm_addr = (dec->fence_bo->offset + COMM_OFFSET)>>8;
+#else
+   comm_addr = bsp_addr + (COMM_OFFSET>>8);
+#endif
+   inter_addr = inter_bo->offset >> 8;
+   if (dec->fw_bo)
+      ucode_addr = dec->fw_bo->offset >> 8;
+   else
+      ucode_addr = 0;
+
+   BEGIN_NVC0(push, SUBC_VP(0x700), 7);
+   PUSH_DATA (push, caps); // 700
+   PUSH_DATA (push, comm_seq); // 704
+   PUSH_DATA (push, 0); // 708 fuc targets, ignored for nvc0
+   PUSH_DATA (push, dec->fw_sizes); // 70c
+   PUSH_DATA (push, bsp_addr+(VP_OFFSET>>8)); // 710 picparm_addr
+   PUSH_DATA (push, inter_addr); // 714 inter_parm
+   PUSH_DATA (push, inter_addr + slice_size + bucket_size); // 718 inter_data_ofs
+
+   if (bucket_size) {
+      uint64_t tmpimg_addr = dec->ref_bo->offset + dec->ref_stride * (dec->base.max_references+2);
+
+      BEGIN_NVC0(push, SUBC_VP(0x71c), 2);
+      PUSH_DATA (push, tmpimg_addr >> 8); // 71c
+      PUSH_DATA (push, inter_addr + slice_size); // 720 bucket_ofs
+   }
+
+   /* NOTE(review): the offsets in the last three comments below are not in
+    * ascending order although the values are pushed back to back starting
+    * at 0x724 - confirm which labelling is intended. */
+   BEGIN_NVC0(push, SUBC_VP(0x724), 5);
+   PUSH_DATA (push, comm_addr); // 724
+   PUSH_DATA (push, ucode_addr); // 728
+   PUSH_DATA (push, pic_addr[16]); // 734
+   PUSH_DATA (push, pic_addr[0]); // 72c
+   PUSH_DATA (push, pic_addr[1]); // 730
+
+   if (dec->base.max_references > 2) {
+      /* References beyond the first two go into the 0x400.. range. */
+      BEGIN_NVC0(push, SUBC_VP(0x400), dec->base.max_references - 2);
+      for (i = 2; i < dec->base.max_references; ++i) {
+         assert(0x400 + (i - 2) * 4 < 0x438);
+         PUSH_DATA (push, pic_addr[i]);
+      }
+   }
+
+   if (codec == PIPE_VIDEO_CODEC_MPEG4_AVC) {
+      BEGIN_NVC0(push, SUBC_VP(0x438), 1);
+      PUSH_DATA (push, desc.h264->slice_count);
+   }
+
+   //debug_printf("Decoding %08lx with %08lx and %08lx\n", pic_addr[16], pic_addr[0], pic_addr[1]);
+
+#if NVC0_DEBUG_FENCE
+   BEGIN_NVC0(push, SUBC_VP(0x240), 3);
+   PUSH_DATAh(push, (dec->fence_bo->offset + 0x10));
+   PUSH_DATA (push, (dec->fence_bo->offset + 0x10));
+   PUSH_DATA (push, dec->fence_seq);
+
+   BEGIN_NVC0(push, SUBC_VP(0x300), 1);
+   PUSH_DATA (push, 1);
+   PUSH_KICK(push);
+
+   {
+      /* Poll the fence value; dump the comm state every 256 iterations. */
+      unsigned spin = 0;
+      do {
+         usleep(100);
+         if ((spin++ & 0xff) == 0xff) {
+            debug_printf("vp%u: %u\n", dec->fence_seq, dec->fence_map[4]);
+            dump_comm_vp(dec, dec->comm, comm_seq, inter_bo, slice_size << 8);
+         }
+      } while (dec->fence_seq > dec->fence_map[4]);
+   }
+   dump_comm_vp(dec, dec->comm, comm_seq, inter_bo, slice_size << 8);
+#else
+   BEGIN_NVC0(push, SUBC_VP(0x300), 1);
+   PUSH_DATA (push, 0);
+   PUSH_KICK (push);
+#endif
+}