g3dvl: Some clean ups.
[mesa.git] / src / gallium / state_trackers / g3dvl / vl_r16snorm_mc_buf.c
1 #define VL_INTERNAL
2 #include "vl_r16snorm_mc_buf.h"
3 #include <assert.h>
4 #include <stdlib.h>
5 #include <pipe/p_context.h>
6 #include <pipe/p_winsys.h>
7 #include <pipe/p_screen.h>
8 #include <pipe/p_state.h>
9 #include <pipe/p_util.h>
10 #include <pipe/p_inlines.h>
11 #include <tgsi/tgsi_parse.h>
12 #include <tgsi/tgsi_build.h>
13 #include "vl_render.h"
14 #include "vl_shader_build.h"
15 #include "vl_surface.h"
16 #include "vl_util.h"
17 #include "vl_types.h"
18 #include "vl_defs.h"
19
20 /*
21 * TODO: Dynamically determine number of buf sets to use, based on
22 * video size and available mem, since we can easily run out of memory
23 * for high res videos.
24 * Note: Destroying previous frame's buffers and creating new ones
 * doesn't work, since the buffers are not actually destroyed until their
26 * fence is signalled, and if we render fast enough we will create faster
27 * than we destroy.
28 */
29 #define NUM_BUF_SETS 4 /* Number of rotating buffer sets to use */
30
/*
 * Extended macroblock types: each MPEG-2 prediction type is split by
 * frame/field motion so each combination can be batched and drawn with
 * its own shader pair (see the draw loop in vlFlush).
 */
enum vlMacroBlockTypeEx
{
	vlMacroBlockExTypeIntra,
	vlMacroBlockExTypeFwdPredictedFrame,
	vlMacroBlockExTypeFwdPredictedField,
	vlMacroBlockExTypeBkwdPredictedFrame,
	vlMacroBlockExTypeBkwdPredictedField,
	vlMacroBlockExTypeBiPredictedFrame,
	vlMacroBlockExTypeBiPredictedField,

	vlNumMacroBlockExTypes	/* Count of entries above, used as array size */
};
43
/* Vertex shader constant buffer layout. */
struct vlVertexShaderConsts
{
	/* Destination texture width/height, used to denormalize vertex pos
	   (written each flush in vlFlush) */
	struct vlVertex4f denorm;
};
48
/* Fragment shader constant buffer layout (values in fs_consts below). */
struct vlFragmentShaderConsts
{
	struct vlVertex4f multiplier;	/* Rescales 16-bit snorm samples */
	struct vlVertex4f div;		/* Used for Y % 2 field selection */
};
54
/*
 * Buffered motion-compensation renderer state. Macroblocks are batched per
 * extended type into a rotating set of vertex/texture buffers and rendered
 * in one draw call per type when flushed.
 */
struct vlR16SnormBufferedMC
{
	struct vlRender base;	/* Must stay first: vlRender* is cast to this type */

	unsigned int video_width, video_height;
	enum vlFormat video_format;

	unsigned int cur_buf;				/* Rotating buffer-set counter, indexed mod NUM_BUF_SETS */
	struct vlSurface *buffered_surface;		/* Current destination surface; NULL until first batch */
	struct vlSurface *past_surface, *future_surface;	/* Reference surfaces for P/B prediction */
	struct vlVertex2f surface_tex_inv_size;		/* 1 / destination texture size, for texcoord scaling */
	unsigned int num_macroblocks[vlNumMacroBlockExTypes];	/* Batched macroblock count per extended type */

	struct pipe_context *pipe;
	struct pipe_viewport_state viewport;
	struct pipe_framebuffer_state render_target;
	struct pipe_sampler_state *samplers[5];		/* Y, Cb, Cr, past ref, future ref */
	struct pipe_texture *textures[NUM_BUF_SETS][5];	/* Slots 3 & 4 alias reference surface textures */
	void *i_vs, *p_vs[2], *b_vs[2];			/* Vertex shaders: intra, P [frame,field], B [frame,field] */
	void *i_fs, *p_fs[2], *b_fs[2];			/* Fragment shaders, same layout as above */
	struct pipe_vertex_buffer vertex_bufs[NUM_BUF_SETS][vlNumMacroBlockExTypes][3];
	struct pipe_vertex_element vertex_elems[5];
	struct pipe_constant_buffer vs_const_buf, fs_const_buf;
};
79
/* Begin a rendering batch. This renderer needs no per-batch setup. */
static int vlBegin(struct vlRender *render)
{
	assert(render);
	return 0;
}
89
90 static inline int vlGrabFrameCodedBlock(short *src, short *dst, unsigned int dst_pitch)
91 {
92 unsigned int y;
93
94 for (y = 0; y < VL_BLOCK_HEIGHT; ++y)
95 memcpy
96 (
97 dst + y * dst_pitch,
98 src + y * VL_BLOCK_WIDTH,
99 VL_BLOCK_WIDTH * 2
100 );
101
102 return 0;
103 }
104
/*
 * Copy one field-coded block of 16-bit samples into the destination
 * texture, writing every other destination row (interlaced layout).
 * dst_pitch is in samples, not bytes.
 */
static inline int vlGrabFieldCodedBlock(short *src, short *dst, unsigned int dst_pitch)
{
	unsigned int y;

	/* First half of the block goes to rows 0, 2, 4, ... */
	for (y = 0; y < VL_BLOCK_HEIGHT / 2; ++y)
		memcpy
		(
			dst + y * dst_pitch * 2,
			src + y * VL_BLOCK_WIDTH,
			VL_BLOCK_WIDTH * 2
		);

	/* NOTE(review): 'dst' is advanced by a full block height while 'y'
	 * keeps counting, so the second half lands at rows
	 * VL_BLOCK_HEIGHT + y*2 rather than continuing the interlace at
	 * odd/offset rows — verify this offset against the caller's layout
	 * in vlGrabBlocks (which passes a row-offset dst for field blocks). */
	dst += VL_BLOCK_HEIGHT * dst_pitch;

	for (; y < VL_BLOCK_HEIGHT; ++y)
		memcpy
		(
			dst + y * dst_pitch * 2,
			src + y * VL_BLOCK_WIDTH,
			VL_BLOCK_WIDTH * 2
		);

	return 0;
}
129
130 static inline int vlGrabNoBlock(short *dst, unsigned int dst_pitch)
131 {
132 unsigned int y;
133
134 for (y = 0; y < VL_BLOCK_HEIGHT; ++y)
135 memset
136 (
137 dst + y * dst_pitch,
138 0,
139 VL_BLOCK_WIDTH * 2
140 );
141
142 return 0;
143 }
144
/*
 * Upload one macroblock's coded DCT blocks into the current buffer set's
 * Y/Cb/Cr textures at the macroblock's position. Blocks missing from
 * coded_block_pattern are zero-filled. Assumes 4:2:0 chroma (see TODO).
 */
static inline int vlGrabBlocks
(
	struct vlR16SnormBufferedMC *mc,
	unsigned int mbx,
	unsigned int mby,
	enum vlDCTType dct_type,
	unsigned int coded_block_pattern,
	short *blocks
)
{
	struct pipe_surface *tex_surface;
	short *texels;
	unsigned int tex_pitch;
	unsigned int x, y, tb = 0, sb = 0;	/* tb: texture block index, sb: source block index */
	unsigned int mbpx = mbx * VL_MACROBLOCK_WIDTH, mbpy = mby * VL_MACROBLOCK_HEIGHT;

	assert(mc);
	assert(blocks);

	/* Map the luma texture of the current rotating buffer set for CPU writes */
	tex_surface = mc->pipe->screen->get_tex_surface
	(
		mc->pipe->screen,
		mc->textures[mc->cur_buf % NUM_BUF_SETS][0],
		0, 0, 0, PIPE_BUFFER_USAGE_CPU_WRITE
	);

	texels = pipe_surface_map(tex_surface, PIPE_BUFFER_USAGE_CPU_WRITE);
	tex_pitch = tex_surface->stride / tex_surface->block.size;	/* Pitch in texels, not bytes */

	texels += mbpy * tex_pitch + mbpx;	/* Seek to this macroblock's top-left texel */

	/* Four luma blocks in a 2x2 layout; cbp bits 5..2 select them */
	for (y = 0; y < 2; ++y)
	{
		for (x = 0; x < 2; ++x, ++tb)
		{
			if ((coded_block_pattern >> (5 - tb)) & 1)
			{
				short *cur_block = blocks + sb * VL_BLOCK_WIDTH * VL_BLOCK_HEIGHT;

				if (dct_type == vlDCTTypeFrameCoded)
				{
					vlGrabFrameCodedBlock
					(
						cur_block,
						texels + y * tex_pitch * VL_BLOCK_HEIGHT + x * VL_BLOCK_WIDTH,
						tex_pitch
					);
				}
				else
				{
					/* Field blocks offset by one row per y, interleaved by the callee */
					vlGrabFieldCodedBlock
					(
						cur_block,
						texels + y * tex_pitch + x * VL_BLOCK_WIDTH,
						tex_pitch
					);
				}

				++sb;
			}
			else
				vlGrabNoBlock(texels + y * tex_pitch * VL_BLOCK_HEIGHT + x * VL_BLOCK_WIDTH, tex_pitch);
		}
	}

	pipe_surface_unmap(tex_surface);

	/* TODO: Implement 422, 444 */
	/* 4:2:0 chroma planes are half-size; one Cb and one Cr block, cbp bits 1..0 */
	mbpx >>= 1;
	mbpy >>= 1;

	for (tb = 0; tb < 2; ++tb)
	{
		tex_surface = mc->pipe->screen->get_tex_surface
		(
			mc->pipe->screen,
			mc->textures[mc->cur_buf % NUM_BUF_SETS][tb + 1],
			0, 0, 0, PIPE_BUFFER_USAGE_CPU_WRITE
		);

		texels = pipe_surface_map(tex_surface, PIPE_BUFFER_USAGE_CPU_WRITE);
		tex_pitch = tex_surface->stride / tex_surface->block.size;

		texels += mbpy * tex_pitch + mbpx;

		if ((coded_block_pattern >> (1 - tb)) & 1)
		{
			short *cur_block = blocks + sb * VL_BLOCK_WIDTH * VL_BLOCK_HEIGHT;

			/* Chroma blocks are always frame coded */
			vlGrabFrameCodedBlock
			(
				cur_block,
				texels,
				tex_pitch
			);

			++sb;
		}
		else
			vlGrabNoBlock(texels, tex_pitch);

		pipe_surface_unmap(tex_surface);
	}

	return 0;
}
251
/*
 * Append one macroblock to the batch for its extended type: writes its
 * motion vectors and vertex positions into the per-type vertex buffers
 * and uploads its DCT blocks via vlGrabBlocks.
 *
 * The second switch deliberately falls through: bi-predicted macroblocks
 * fill stream 2 (second motion vec), all predicted macroblocks fill
 * stream 1 (first motion vec), and every macroblock fills stream 0
 * (vertex positions).
 */
static inline int vlGrabMacroBlock
(
	struct vlR16SnormBufferedMC *mc,
	struct vlMpeg2MacroBlock *macroblock
)
{
	/* Macroblock size in normalized destination texcoords */
	const struct vlVertex2f unit =
	{
		mc->surface_tex_inv_size.x * VL_MACROBLOCK_WIDTH,
		mc->surface_tex_inv_size.y * VL_MACROBLOCK_HEIGHT
	};
	const struct vlVertex2f half =
	{
		mc->surface_tex_inv_size.x * (VL_MACROBLOCK_WIDTH / 2),
		mc->surface_tex_inv_size.y * (VL_MACROBLOCK_HEIGHT / 2)
	};

	struct vlVertex2f *vb;
	enum vlMacroBlockTypeEx mb_type_ex;
	struct vlVertex2f mo_vec[2];
	unsigned int i;

	assert(mc);
	assert(macroblock);

	/* Map (mb_type, mo_type) to the extended batch type */
	switch (macroblock->mb_type)
	{
		case vlMacroBlockTypeIntra:
		{
			mb_type_ex = vlMacroBlockExTypeIntra;
			break;
		}
		case vlMacroBlockTypeFwdPredicted:
		{
			mb_type_ex = macroblock->mo_type == vlMotionTypeFrame ?
				vlMacroBlockExTypeFwdPredictedFrame : vlMacroBlockExTypeFwdPredictedField;
			break;
		}
		case vlMacroBlockTypeBkwdPredicted:
		{
			mb_type_ex = macroblock->mo_type == vlMotionTypeFrame ?
				vlMacroBlockExTypeBkwdPredictedFrame : vlMacroBlockExTypeBkwdPredictedField;
			break;
		}
		case vlMacroBlockTypeBiPredicted:
		{
			mb_type_ex = macroblock->mo_type == vlMotionTypeFrame ?
				vlMacroBlockExTypeBiPredictedFrame : vlMacroBlockExTypeBiPredictedField;
			break;
		}
		default:
			/* NOTE(review): with NDEBUG this falls out of the switch
			   leaving mb_type_ex uninitialized */
			assert(0);
	}

	switch (macroblock->mb_type)
	{
		case vlMacroBlockTypeBiPredicted:
		{
			/* Stream 2: second (backward) motion vector, halved to texel
			   units and normalized; 24 verts, 2 attribs each */
			vb = (struct vlVertex2f*)mc->pipe->winsys->buffer_map
			(
				mc->pipe->winsys,
				mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS][mb_type_ex][2].buffer,
				PIPE_BUFFER_USAGE_CPU_WRITE
			) + mc->num_macroblocks[mb_type_ex] * 2 * 24;

			mo_vec[0].x = macroblock->PMV[0][1][0] * 0.5f * mc->surface_tex_inv_size.x;
			mo_vec[0].y = macroblock->PMV[0][1][1] * 0.5f * mc->surface_tex_inv_size.y;

			if (macroblock->mo_type == vlMotionTypeFrame)
			{
				/* Frame motion: same vector for both field attribs */
				for (i = 0; i < 24 * 2; i += 2)
				{
					vb[i].x = mo_vec[0].x;
					vb[i].y = mo_vec[0].y;
				}
			}
			else
			{
				/* Field motion: separate top/bottom field vectors */
				mo_vec[1].x = macroblock->PMV[1][1][0] * 0.5f * mc->surface_tex_inv_size.x;
				mo_vec[1].y = macroblock->PMV[1][1][1] * 0.5f * mc->surface_tex_inv_size.y;

				for (i = 0; i < 24 * 2; i += 2)
				{
					vb[i].x = mo_vec[0].x;
					vb[i].y = mo_vec[0].y;
					vb[i + 1].x = mo_vec[1].x;
					vb[i + 1].y = mo_vec[1].y;
				}
			}

			mc->pipe->winsys->buffer_unmap(mc->pipe->winsys, mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS][mb_type_ex][2].buffer);

			/* fall-through */
		}
		case vlMacroBlockTypeFwdPredicted:
		case vlMacroBlockTypeBkwdPredicted:
		{
			/* Stream 1: first motion vector (forward, or backward for
			   backward-only macroblocks) */
			vb = (struct vlVertex2f*)mc->pipe->winsys->buffer_map
			(
				mc->pipe->winsys,
				mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS][mb_type_ex][1].buffer,
				PIPE_BUFFER_USAGE_CPU_WRITE
			) + mc->num_macroblocks[mb_type_ex] * 2 * 24;

			if (macroblock->mb_type == vlMacroBlockTypeBkwdPredicted)
			{
				mo_vec[0].x = macroblock->PMV[0][1][0] * 0.5f * mc->surface_tex_inv_size.x;
				mo_vec[0].y = macroblock->PMV[0][1][1] * 0.5f * mc->surface_tex_inv_size.y;

				if (macroblock->mo_type == vlMotionTypeField)
				{
					mo_vec[1].x = macroblock->PMV[1][1][0] * 0.5f * mc->surface_tex_inv_size.x;
					mo_vec[1].y = macroblock->PMV[1][1][1] * 0.5f * mc->surface_tex_inv_size.y;
				}
			}
			else
			{
				mo_vec[0].x = macroblock->PMV[0][0][0] * 0.5f * mc->surface_tex_inv_size.x;
				mo_vec[0].y = macroblock->PMV[0][0][1] * 0.5f * mc->surface_tex_inv_size.y;

				if (macroblock->mo_type == vlMotionTypeField)
				{
					mo_vec[1].x = macroblock->PMV[1][0][0] * 0.5f * mc->surface_tex_inv_size.x;
					mo_vec[1].y = macroblock->PMV[1][0][1] * 0.5f * mc->surface_tex_inv_size.y;
				}
			}

			if (macroblock->mo_type == vlMotionTypeFrame)
			{
				for (i = 0; i < 24 * 2; i += 2)
				{
					vb[i].x = mo_vec[0].x;
					vb[i].y = mo_vec[0].y;
				}
			}
			else
			{
				for (i = 0; i < 24 * 2; i += 2)
				{
					vb[i].x = mo_vec[0].x;
					vb[i].y = mo_vec[0].y;
					vb[i + 1].x = mo_vec[1].x;
					vb[i + 1].y = mo_vec[1].y;
				}
			}

			mc->pipe->winsys->buffer_unmap(mc->pipe->winsys, mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS][mb_type_ex][1].buffer);

			/* fall-through */
		}
		case vlMacroBlockTypeIntra:
		{
			/* Stream 0: vertex positions. The macroblock is drawn as
			   four quads (2x2), two triangles each = 24 vertices. */
			vb = (struct vlVertex2f*)mc->pipe->winsys->buffer_map
			(
				mc->pipe->winsys,
				mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS][mb_type_ex][0].buffer,
				PIPE_BUFFER_USAGE_CPU_WRITE
			) + mc->num_macroblocks[mb_type_ex] * 24;

			/* Top-left quad */
			vb[0].x = macroblock->mbx * unit.x; vb[0].y = macroblock->mby * unit.y;
			vb[1].x = macroblock->mbx * unit.x; vb[1].y = macroblock->mby * unit.y + half.y;
			vb[2].x = macroblock->mbx * unit.x + half.x; vb[2].y = macroblock->mby * unit.y;

			vb[3].x = macroblock->mbx * unit.x + half.x; vb[3].y = macroblock->mby * unit.y;
			vb[4].x = macroblock->mbx * unit.x; vb[4].y = macroblock->mby * unit.y + half.y;
			vb[5].x = macroblock->mbx * unit.x + half.x; vb[5].y = macroblock->mby * unit.y + half.y;

			/* Top-right quad */
			vb[6].x = macroblock->mbx * unit.x + half.x; vb[6].y = macroblock->mby * unit.y;
			vb[7].x = macroblock->mbx * unit.x + half.x; vb[7].y = macroblock->mby * unit.y + half.y;
			vb[8].x = macroblock->mbx * unit.x + unit.x; vb[8].y = macroblock->mby * unit.y;

			vb[9].x = macroblock->mbx * unit.x + unit.x; vb[9].y = macroblock->mby * unit.y;
			vb[10].x = macroblock->mbx * unit.x + half.x; vb[10].y = macroblock->mby * unit.y + half.y;
			vb[11].x = macroblock->mbx * unit.x + unit.x; vb[11].y = macroblock->mby * unit.y + half.y;

			/* Bottom-left quad */
			vb[12].x = macroblock->mbx * unit.x; vb[12].y = macroblock->mby * unit.y + half.y;
			vb[13].x = macroblock->mbx * unit.x; vb[13].y = macroblock->mby * unit.y + unit.y;
			vb[14].x = macroblock->mbx * unit.x + half.x; vb[14].y = macroblock->mby * unit.y + half.y;

			vb[15].x = macroblock->mbx * unit.x + half.x; vb[15].y = macroblock->mby * unit.y + half.y;
			vb[16].x = macroblock->mbx * unit.x; vb[16].y = macroblock->mby * unit.y + unit.y;
			vb[17].x = macroblock->mbx * unit.x + half.x; vb[17].y = macroblock->mby * unit.y + unit.y;

			/* Bottom-right quad */
			vb[18].x = macroblock->mbx * unit.x + half.x; vb[18].y = macroblock->mby * unit.y + half.y;
			vb[19].x = macroblock->mbx * unit.x + half.x; vb[19].y = macroblock->mby * unit.y + unit.y;
			vb[20].x = macroblock->mbx * unit.x + unit.x; vb[20].y = macroblock->mby * unit.y + half.y;

			vb[21].x = macroblock->mbx * unit.x + unit.x; vb[21].y = macroblock->mby * unit.y + half.y;
			vb[22].x = macroblock->mbx * unit.x + half.x; vb[22].y = macroblock->mby * unit.y + unit.y;
			vb[23].x = macroblock->mbx * unit.x + unit.x; vb[23].y = macroblock->mby * unit.y + unit.y;

			mc->pipe->winsys->buffer_unmap(mc->pipe->winsys, mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS][mb_type_ex][0].buffer);

			break;
		}
		default:
			assert(0);
	}

	/* Upload the residual DCT blocks for this macroblock */
	vlGrabBlocks
	(
		mc,
		macroblock->mbx,
		macroblock->mby,
		macroblock->dct_type,
		macroblock->cbp,
		macroblock->blocks
	);

	mc->num_macroblocks[mb_type_ex]++;

	return 0;
}
465
466 static int vlFlush
467 (
468 struct vlRender *render
469 )
470 {
471 struct vlR16SnormBufferedMC *mc;
472 struct pipe_context *pipe;
473 struct vlVertexShaderConsts *vs_consts;
474
475 assert(mc);
476
477 mc = (struct vlR16SnormBufferedMC*)render;
478 pipe = mc->pipe;
479
480 mc->render_target.cbufs[0] = pipe->screen->get_tex_surface
481 (
482 pipe->screen,
483 mc->buffered_surface->texture,
484 0, 0, 0, PIPE_BUFFER_USAGE_GPU_READ | PIPE_BUFFER_USAGE_GPU_WRITE
485 );
486
487 pipe->set_framebuffer_state(pipe, &mc->render_target);
488 pipe->set_viewport_state(pipe, &mc->viewport);
489 vs_consts = pipe->winsys->buffer_map
490 (
491 pipe->winsys,
492 mc->vs_const_buf.buffer,
493 PIPE_BUFFER_USAGE_CPU_WRITE
494 );
495
496 vs_consts->denorm.x = mc->buffered_surface->texture->width[0];
497 vs_consts->denorm.y = mc->buffered_surface->texture->height[0];
498
499 pipe->winsys->buffer_unmap(pipe->winsys, mc->vs_const_buf.buffer);
500 pipe->set_constant_buffer(pipe, PIPE_SHADER_FRAGMENT, 0, &mc->fs_const_buf);
501
502 if (mc->num_macroblocks[vlMacroBlockExTypeIntra] > 0)
503 {
504 pipe->set_vertex_buffers(pipe, 1, mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS][vlMacroBlockExTypeIntra]);
505 pipe->set_vertex_elements(pipe, 1, mc->vertex_elems);
506 pipe->set_sampler_textures(pipe, 3, mc->textures[mc->cur_buf % NUM_BUF_SETS]);
507 pipe->bind_sampler_states(pipe, 3, (void**)mc->samplers);
508 pipe->bind_vs_state(pipe, mc->i_vs);
509 pipe->bind_fs_state(pipe, mc->i_fs);
510
511 pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, 0, mc->num_macroblocks[vlMacroBlockExTypeIntra] * 24);
512 }
513
514 if (mc->num_macroblocks[vlMacroBlockExTypeFwdPredictedFrame] > 0)
515 {
516 pipe->set_vertex_buffers(pipe, 2, mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS][vlMacroBlockExTypeFwdPredictedFrame]);
517 pipe->set_vertex_elements(pipe, 3, mc->vertex_elems);
518 mc->textures[mc->cur_buf % NUM_BUF_SETS][3] = mc->past_surface->texture;
519 pipe->set_sampler_textures(pipe, 4, mc->textures[mc->cur_buf % NUM_BUF_SETS]);
520 pipe->bind_sampler_states(pipe, 4, (void**)mc->samplers);
521 pipe->bind_vs_state(pipe, mc->p_vs[0]);
522 pipe->bind_fs_state(pipe, mc->p_fs[0]);
523
524 pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, 0, mc->num_macroblocks[vlMacroBlockExTypeFwdPredictedFrame] * 24);
525 }
526
527 if (mc->num_macroblocks[vlMacroBlockExTypeFwdPredictedField] > 0)
528 {
529 pipe->set_vertex_buffers(pipe, 2, mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS][vlMacroBlockExTypeFwdPredictedField]);
530 pipe->set_vertex_elements(pipe, 3, mc->vertex_elems);
531 mc->textures[mc->cur_buf % NUM_BUF_SETS][3] = mc->past_surface->texture;
532 pipe->set_sampler_textures(pipe, 4, mc->textures[mc->cur_buf % NUM_BUF_SETS]);
533 pipe->bind_sampler_states(pipe, 4, (void**)mc->samplers);
534 pipe->bind_vs_state(pipe, mc->p_vs[1]);
535 pipe->bind_fs_state(pipe, mc->p_fs[1]);
536
537 pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, 0, mc->num_macroblocks[vlMacroBlockExTypeFwdPredictedField] * 24);
538 }
539
540 if (mc->num_macroblocks[vlMacroBlockExTypeBkwdPredictedFrame] > 0)
541 {
542 pipe->set_vertex_buffers(pipe, 2, mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS][vlMacroBlockExTypeBkwdPredictedFrame]);
543 pipe->set_vertex_elements(pipe, 3, mc->vertex_elems);
544 mc->textures[mc->cur_buf % NUM_BUF_SETS][3] = mc->future_surface->texture;
545 pipe->set_sampler_textures(pipe, 4, mc->textures[mc->cur_buf % NUM_BUF_SETS]);
546 pipe->bind_sampler_states(pipe, 4, (void**)mc->samplers);
547 pipe->bind_vs_state(pipe, mc->p_vs[0]);
548 pipe->bind_fs_state(pipe, mc->p_fs[0]);
549
550 pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, 0, mc->num_macroblocks[vlMacroBlockExTypeBkwdPredictedFrame] * 24);
551 }
552
553 if (mc->num_macroblocks[vlMacroBlockExTypeBkwdPredictedField] > 0)
554 {
555 pipe->set_vertex_buffers(pipe, 2, mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS][vlMacroBlockExTypeBkwdPredictedField]);
556 pipe->set_vertex_elements(pipe, 3, mc->vertex_elems);
557 mc->textures[mc->cur_buf % NUM_BUF_SETS][3] = mc->future_surface->texture;
558 pipe->set_sampler_textures(pipe, 4, mc->textures[mc->cur_buf % NUM_BUF_SETS]);
559 pipe->bind_sampler_states(pipe, 4, (void**)mc->samplers);
560 pipe->bind_vs_state(pipe, mc->p_vs[1]);
561 pipe->bind_fs_state(pipe, mc->p_fs[1]);
562
563 pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, 0, mc->num_macroblocks[vlMacroBlockExTypeBkwdPredictedField] * 24);
564 }
565
566 if (mc->num_macroblocks[vlMacroBlockExTypeBiPredictedFrame] > 0)
567 {
568 pipe->set_vertex_buffers(pipe, 3, mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS][vlMacroBlockExTypeBiPredictedFrame]);
569 pipe->set_vertex_elements(pipe, 5, mc->vertex_elems);
570 mc->textures[mc->cur_buf % NUM_BUF_SETS][3] = mc->past_surface->texture;
571 mc->textures[mc->cur_buf % NUM_BUF_SETS][4] = mc->future_surface->texture;
572 pipe->set_sampler_textures(pipe, 5, mc->textures[mc->cur_buf % NUM_BUF_SETS]);
573 pipe->bind_sampler_states(pipe, 5, (void**)mc->samplers);
574 pipe->bind_vs_state(pipe, mc->b_vs[0]);
575 pipe->bind_fs_state(pipe, mc->b_fs[0]);
576
577 pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, 0, mc->num_macroblocks[vlMacroBlockExTypeBiPredictedFrame] * 24);
578 }
579
580 if (mc->num_macroblocks[vlMacroBlockExTypeBiPredictedField] > 0)
581 {
582 pipe->set_vertex_buffers(pipe, 3, mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS][vlMacroBlockExTypeBiPredictedField]);
583 pipe->set_vertex_elements(pipe, 5, mc->vertex_elems);
584 mc->textures[mc->cur_buf % NUM_BUF_SETS][3] = mc->past_surface->texture;
585 mc->textures[mc->cur_buf % NUM_BUF_SETS][4] = mc->future_surface->texture;
586 pipe->set_sampler_textures(pipe, 5, mc->textures[mc->cur_buf % NUM_BUF_SETS]);
587 pipe->bind_sampler_states(pipe, 5, (void**)mc->samplers);
588 pipe->bind_vs_state(pipe, mc->b_vs[1]);
589 pipe->bind_fs_state(pipe, mc->b_fs[1]);
590
591 pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, 0, mc->num_macroblocks[vlMacroBlockExTypeBiPredictedField] * 24);
592 }
593
594 memset(mc->num_macroblocks, 0, sizeof(unsigned int) * vlNumMacroBlockExTypes);
595 mc->cur_buf++;
596
597 return 0;
598 }
599
600 static int vlRenderMacroBlocksMpeg2R16SnormBuffered
601 (
602 struct vlRender *render,
603 struct vlMpeg2MacroBlockBatch *batch,
604 struct vlSurface *surface
605 )
606 {
607 struct vlR16SnormBufferedMC *mc;
608 unsigned int i;
609
610 assert(render);
611
612 mc = (struct vlR16SnormBufferedMC*)render;
613
614 if (mc->buffered_surface)
615 {
616 if
617 (
618 mc->buffered_surface != surface /*||
619 mc->past_surface != batch->past_surface ||
620 mc->future_surface != batch->future_surface*/
621 )
622 {
623 vlFlush(&mc->base);
624 mc->buffered_surface = surface;
625 mc->past_surface = batch->past_surface;
626 mc->future_surface = batch->future_surface;
627 mc->surface_tex_inv_size.x = 1.0f / surface->texture->width[0];
628 mc->surface_tex_inv_size.y = 1.0f / surface->texture->height[0];
629 }
630 }
631 else
632 {
633 mc->buffered_surface = surface;
634 mc->past_surface = batch->past_surface;
635 mc->future_surface = batch->future_surface;
636 mc->surface_tex_inv_size.x = 1.0f / surface->texture->width[0];
637 mc->surface_tex_inv_size.y = 1.0f / surface->texture->height[0];
638 }
639
640 for (i = 0; i < batch->num_macroblocks; ++i)
641 vlGrabMacroBlock(mc, &batch->macroblocks[i]);
642
643 return 0;
644 }
645
/* End a rendering batch. This renderer needs no per-batch teardown. */
static int vlEnd(struct vlRender *render)
{
	assert(render);
	return 0;
}
655
656 static int vlDestroy
657 (
658 struct vlRender *render
659 )
660 {
661 struct vlR16SnormBufferedMC *mc;
662 struct pipe_context *pipe;
663 unsigned int g, h, i;
664
665 assert(render);
666
667 mc = (struct vlR16SnormBufferedMC*)render;
668 pipe = mc->pipe;
669
670 for (i = 0; i < 5; ++i)
671 pipe->delete_sampler_state(pipe, mc->samplers[i]);
672
673 for (g = 0; g < NUM_BUF_SETS; ++g)
674 for (h = 0; h < vlNumMacroBlockExTypes; ++h)
675 for (i = 0; i < 3; ++i)
676 pipe->winsys->buffer_destroy(pipe->winsys, mc->vertex_bufs[g][h][i].buffer);
677
678 /* Textures 3 & 4 are not created directly, no need to release them here */
679 for (i = 0; i < NUM_BUF_SETS; ++i)
680 {
681 pipe_texture_release(&mc->textures[i][0]);
682 pipe_texture_release(&mc->textures[i][1]);
683 pipe_texture_release(&mc->textures[i][2]);
684 }
685
686 pipe->delete_vs_state(pipe, mc->i_vs);
687 pipe->delete_fs_state(pipe, mc->i_fs);
688
689 for (i = 0; i < 2; ++i)
690 {
691 pipe->delete_vs_state(pipe, mc->p_vs[i]);
692 pipe->delete_fs_state(pipe, mc->p_fs[i]);
693 pipe->delete_vs_state(pipe, mc->b_vs[i]);
694 pipe->delete_fs_state(pipe, mc->b_fs[i]);
695 }
696
697 pipe->winsys->buffer_destroy(pipe->winsys, mc->vs_const_buf.buffer);
698 pipe->winsys->buffer_destroy(pipe->winsys, mc->fs_const_buf.buffer);
699
700 free(mc);
701
702 return 0;
703 }
704
/*
 * Multiplier renormalizes block samples from 16 bits to 12 bits.
 * Divider is used when calculating Y % 2 for choosing top or bottom
 * field for P or B macroblocks.
 * TODO: Use immediates.
 */
static const struct vlFragmentShaderConsts fs_consts =
{
	{32767.0f / 255.0f, 32767.0f / 255.0f, 32767.0f / 255.0f, 0.0f},	/* multiplier */
	{0.5f, 2.0f, 0.0f, 0.0f}						/* div */
};
716
717 static int vlCreateVertexShaderIMB
718 (
719 struct vlR16SnormBufferedMC *mc
720 )
721 {
722 const unsigned int max_tokens = 50;
723
724 struct pipe_context *pipe;
725 struct pipe_shader_state vs;
726 struct tgsi_token *tokens;
727 struct tgsi_header *header;
728
729 struct tgsi_full_declaration decl;
730 struct tgsi_full_instruction inst;
731
732 unsigned int ti;
733 unsigned int i;
734
735 assert(mc);
736
737 pipe = mc->pipe;
738 tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
739
740 /* Version */
741 *(struct tgsi_version*)&tokens[0] = tgsi_build_version();
742 /* Header */
743 header = (struct tgsi_header*)&tokens[1];
744 *header = tgsi_build_header();
745 /* Processor */
746 *(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);
747
748 ti = 3;
749
750 /*
751 * decl i0 ; Vertex pos, luma & chroma texcoords
752 */
753 for (i = 0; i < 3; i++)
754 {
755 decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
756 ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
757 }
758
759 /*
760 * decl o0 ; Vertex pos
761 * decl o1 ; Luma/chroma texcoords
762 */
763 for (i = 0; i < 2; i++)
764 {
765 decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
766 ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
767 }
768
769 /*
770 * mov o0, i0 ; Move input vertex pos to output
771 * mov o1, i0 ; Move input luma/chroma texcoords to output
772 */
773 for (i = 0; i < 2; ++i)
774 {
775 inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, 0);
776 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
777 }
778
779 /* end */
780 inst = vl_end();
781 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
782
783 vs.tokens = tokens;
784 mc->i_vs = pipe->create_vs_state(pipe, &vs);
785 free(tokens);
786
787 return 0;
788 }
789
/*
 * Build the fragment shader for intra-coded macroblocks: samples the
 * Y/Cb/Cr textures, packs the three samples into one vector, and rescales
 * by the constant multiplier. Stores the shader in mc->i_fs.
 */
static int vlCreateFragmentShaderIMB
(
	struct vlR16SnormBufferedMC *mc
)
{
	const unsigned int max_tokens = 100;

	struct pipe_context *pipe;
	struct pipe_shader_state fs;
	struct tgsi_token *tokens;
	struct tgsi_header *header;

	struct tgsi_full_declaration decl;
	struct tgsi_full_instruction inst;

	unsigned int ti;
	unsigned int i;

	assert(mc);

	pipe = mc->pipe;
	/* NOTE(review): malloc result is used unchecked here */
	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));

	/* Version */
	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
	/* Header */
	header = (struct tgsi_header*)&tokens[1];
	*header = tgsi_build_header();
	/* Processor */
	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);

	ti = 3;

	/* decl i0 ; Luma/chroma texcoords */
	decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, 1, 0, 0, TGSI_INTERPOLATE_LINEAR);
	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);

	/* decl c0 ; Scaling factor, rescales 16-bit snorm to 9-bit snorm */
	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 0);
	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);

	/* decl o0 ; Fragment color */
	decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);

	/* decl t0, t1 */
	decl = vl_decl_temps(0, 1);
	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);

	/*
	 * decl s0 ; Sampler for luma texture
	 * decl s1 ; Sampler for chroma Cb texture
	 * decl s2 ; Sampler for chroma Cr texture
	 */
	for (i = 0; i < 3; ++i)
	{
		decl = vl_decl_samplers(i, i);
		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
	}

	/*
	 * tex2d t1, i0, s0 ; Read texel from luma texture
	 * mov t0.x, t1.x ; Move luma sample into .x component
	 * tex2d t1, i0, s1 ; Read texel from chroma Cb texture
	 * mov t0.y, t1.x ; Move Cb sample into .y component
	 * tex2d t1, i0, s2 ; Read texel from chroma Cr texture
	 * mov t0.z, t1.x ; Move Cr sample into .z component
	 */
	for (i = 0; i < 3; ++i)
	{
		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, 0, TGSI_FILE_SAMPLER, i);
		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);

		/* Broadcast the sampled .x into the i-th component of t0 */
		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
		inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
		inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
		inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
		inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i;
		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);

	}

	/* mul o0, t0, c0 ; Rescale texel to correct range */
	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0);
	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);

	/* end */
	inst = vl_end();
	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);

	fs.tokens = tokens;
	mc->i_fs = pipe->create_fs_state(pipe, &fs);
	free(tokens);

	return 0;
}
886
/*
 * Build the vertex shader for frame-predicted (P) macroblocks: passes
 * position and texcoords through, and adds the motion vector (stream 1)
 * to the vertex position to form the reference macroblock texcoords.
 * Stores the shader in mc->p_vs[0].
 */
static int vlCreateVertexShaderFramePMB
(
	struct vlR16SnormBufferedMC *mc
)
{
	const unsigned int max_tokens = 100;

	struct pipe_context *pipe;
	struct pipe_shader_state vs;
	struct tgsi_token *tokens;
	struct tgsi_header *header;

	struct tgsi_full_declaration decl;
	struct tgsi_full_instruction inst;

	unsigned int ti;
	unsigned int i;

	assert(mc);

	pipe = mc->pipe;
	/* NOTE(review): malloc result is used unchecked here */
	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));

	/* Version */
	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
	/* Header */
	header = (struct tgsi_header*)&tokens[1];
	*header = tgsi_build_header();
	/* Processor */
	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);

	ti = 3;

	/*
	 * decl i0 ; Vertex pos, luma/chroma texcoords
	 * decl i1 ; Ref surface top field texcoords
	 * decl i2 ; Ref surface bottom field texcoords (unused, packed in the same stream)
	 */
	for (i = 0; i < 3; i++)
	{
		decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
	}

	/*
	 * decl o0 ; Vertex pos
	 * decl o1 ; Luma/chroma texcoords
	 * decl o2 ; Ref macroblock texcoords
	 */
	for (i = 0; i < 3; i++)
	{
		decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
	}

	/*
	 * mov o0, i0 ; Move input vertex pos to output
	 * mov o1, i0 ; Move input luma/chroma texcoords to output
	 */
	for (i = 0; i < 2; ++i)
	{
		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, 0);
		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
	}

	/* add o2, i0, i1 ; Translate vertex pos by motion vec to form ref macroblock texcoords */
	inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 2, TGSI_FILE_INPUT, 0, TGSI_FILE_INPUT, 1);
	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);

	/* end */
	inst = vl_end();
	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);

	vs.tokens = tokens;
	mc->p_vs[0] = pipe->create_vs_state(pipe, &vs);
	free(tokens);

	return 0;
}
966
967 static int vlCreateVertexShaderFieldPMB
968 (
969 struct vlR16SnormBufferedMC *mc
970 )
971 {
972 const unsigned int max_tokens = 100;
973
974 struct pipe_context *pipe;
975 struct pipe_shader_state vs;
976 struct tgsi_token *tokens;
977 struct tgsi_header *header;
978
979 struct tgsi_full_declaration decl;
980 struct tgsi_full_instruction inst;
981
982 unsigned int ti;
983 unsigned int i;
984
985 assert(mc);
986
987 pipe = mc->pipe;
988 tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
989
990 /* Version */
991 *(struct tgsi_version*)&tokens[0] = tgsi_build_version();
992 /* Header */
993 header = (struct tgsi_header*)&tokens[1];
994 *header = tgsi_build_header();
995 /* Processor */
996 *(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);
997
998 ti = 3;
999
1000 /*
1001 * decl i0 ; Vertex pos, luma/chroma texcoords
1002 * decl i1 ; Ref surface top field texcoords
1003 * decl i2 ; Ref surface bottom field texcoords
1004 */
1005 for (i = 0; i < 3; i++)
1006 {
1007 decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
1008 ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
1009 }
1010
1011 /* decl c0 ; Texcoord denorm coefficients */
1012 decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 0);
1013 ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
1014
1015 /*
1016 * decl o0 ; Vertex pos
1017 * decl o1 ; Luma/chroma texcoords
1018 * decl o2 ; Top field ref macroblock texcoords
1019 * decl o3 ; Bottom field ref macroblock texcoords
1020 * decl o4 ; Denormalized vertex pos
1021 */
1022 for (i = 0; i < 5; i++)
1023 {
1024 decl = vl_decl_output((i == 0 || i == 5) ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
1025 ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
1026 }
1027
1028 /*
1029 * mov o0, i0 ; Move input vertex pos to output
1030 * mov o1, i0 ; Move input luma/chroma texcoords to output
1031 */
1032 for (i = 0; i < 3; ++i)
1033 {
1034 inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i == 0 ? 0 : i - 1);
1035 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1036 }
1037
1038 /*
1039 * add o2, i0, i1 ; Translate vertex pos by motion vec to form top field macroblock texcoords
1040 * add o3, i0, i2 ; Translate vertex pos by motion vec to form bottom field macroblock texcoords
1041 */
1042 for (i = 0; i < 2; ++i)
1043 {
1044 inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, i + 2, TGSI_FILE_INPUT, 0, TGSI_FILE_INPUT, i + 1);
1045 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1046 }
1047
1048 /* mul o4, i0, c0 ; Denorm vertex pos */
1049 inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_OUTPUT, 5, TGSI_FILE_INPUT, 0, TGSI_FILE_CONSTANT, 0);
1050 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1051
1052 /* end */
1053 inst = vl_end();
1054 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1055
1056 vs.tokens = tokens;
1057 mc->p_vs[1] = pipe->create_vs_state(pipe, &vs);
1058 free(tokens);
1059
1060 return 0;
1061 }
1062
1063 static int vlCreateFragmentShaderFramePMB
1064 (
1065 struct vlR16SnormBufferedMC *mc
1066 )
1067 {
1068 const unsigned int max_tokens = 100;
1069
1070 struct pipe_context *pipe;
1071 struct pipe_shader_state fs;
1072 struct tgsi_token *tokens;
1073 struct tgsi_header *header;
1074
1075 struct tgsi_full_declaration decl;
1076 struct tgsi_full_instruction inst;
1077
1078 unsigned int ti;
1079 unsigned int i;
1080
1081 assert(mc);
1082
1083 pipe = mc->pipe;
1084 tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
1085
1086 /* Version */
1087 *(struct tgsi_version*)&tokens[0] = tgsi_build_version();
1088 /* Header */
1089 header = (struct tgsi_header*)&tokens[1];
1090 *header = tgsi_build_header();
1091 /* Processor */
1092 *(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
1093
1094 ti = 3;
1095
1096 /*
1097 * decl i0 ; Texcoords for s0, s1, s2
1098 * decl i1 ; Texcoords for s3
1099 */
1100 for (i = 0; i < 2; ++i)
1101 {
1102 decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR);
1103 ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
1104 }
1105
1106 /* decl c0 ; Scaling factor, rescales 16-bit snorm to 9-bit snorm */
1107 decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 0);
1108 ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
1109
1110 /* decl o0 ; Fragment color */
1111 decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
1112 ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
1113
1114 /* decl t0, t1 */
1115 decl = vl_decl_temps(0, 1);
1116 ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
1117
1118 /*
1119 * decl s0 ; Sampler for luma texture
1120 * decl s1 ; Sampler for chroma Cb texture
1121 * decl s2 ; Sampler for chroma Cr texture
1122 * decl s3 ; Sampler for ref surface texture
1123 */
1124 for (i = 0; i < 4; ++i)
1125 {
1126 decl = vl_decl_samplers(i, i);
1127 ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
1128 }
1129
1130 /*
1131 * tex2d t1, i0, s0 ; Read texel from luma texture
1132 * mov t0.x, t1.x ; Move luma sample into .x component
1133 * tex2d t1, i0, s1 ; Read texel from chroma Cb texture
1134 * mov t0.y, t1.x ; Move Cb sample into .y component
1135 * tex2d t1, i0, s2 ; Read texel from chroma Cr texture
1136 * mov t0.z, t1.x ; Move Cr sample into .z component
1137 */
1138 for (i = 0; i < 3; ++i)
1139 {
1140 inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, 0, TGSI_FILE_SAMPLER, i);
1141 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1142
1143 inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
1144 inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
1145 inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
1146 inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
1147 inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i;
1148 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1149
1150 }
1151
1152 /* mul t0, t0, c0 ; Rescale texel to correct range */
1153 inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0);
1154 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1155
1156 /* tex2d t1, i1, s3 ; Read texel from ref macroblock */
1157 inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, 1, TGSI_FILE_SAMPLER, 3);
1158 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1159
1160 /* add o0, t0, t1 ; Add ref and differential to form final output */
1161 inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
1162 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1163
1164 /* end */
1165 inst = vl_end();
1166 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1167
1168 fs.tokens = tokens;
1169 mc->p_fs[0] = pipe->create_fs_state(pipe, &fs);
1170 free(tokens);
1171
1172 return 0;
1173 }
1174
1175 static int vlCreateFragmentShaderFieldPMB
1176 (
1177 struct vlR16SnormBufferedMC *mc
1178 )
1179 {
1180 const unsigned int max_tokens = 200;
1181
1182 struct pipe_context *pipe;
1183 struct pipe_shader_state fs;
1184 struct tgsi_token *tokens;
1185 struct tgsi_header *header;
1186
1187 struct tgsi_full_declaration decl;
1188 struct tgsi_full_instruction inst;
1189
1190 unsigned int ti;
1191 unsigned int i;
1192
1193 assert(mc);
1194
1195 pipe = mc->pipe;
1196 tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
1197
1198 /* Version */
1199 *(struct tgsi_version*)&tokens[0] = tgsi_build_version();
1200 /* Header */
1201 header = (struct tgsi_header*)&tokens[1];
1202 *header = tgsi_build_header();
1203 /* Processor */
1204 *(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
1205
1206 ti = 3;
1207
1208 /*
1209 * decl i0 ; Texcoords for s0, s1, s2
1210 * decl i1 ; Texcoords for s3
1211 * decl i2 ; Texcoords for s3
1212 * decl i3 ; Denormalized vertex pos
1213 */
1214 for (i = 0; i < 4; ++i)
1215 {
1216 decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR);
1217 ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
1218 }
1219
1220 /*
1221 * decl c0 ; Scaling factor, rescales 16-bit snorm to 9-bit snorm
1222 * decl c1 ; Constants 1/2 & 2 in .x, .y channels for Y-mod-2 top/bottom field selection
1223 */
1224 decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 1);
1225 ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
1226
1227 /* decl o0 ; Fragment color */
1228 decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
1229 ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
1230
1231 /* decl t0-t4 */
1232 decl = vl_decl_temps(0, 4);
1233 ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
1234
1235 /*
1236 * decl s0 ; Sampler for luma texture
1237 * decl s1 ; Sampler for chroma Cb texture
1238 * decl s2 ; Sampler for chroma Cr texture
1239 * decl s3 ; Sampler for ref surface texture
1240 */
1241 for (i = 0; i < 4; ++i)
1242 {
1243 decl = vl_decl_samplers(i, i);
1244 ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
1245 }
1246
1247 /*
1248 * tex2d t1, i0, s0 ; Read texel from luma texture
1249 * mov t0.x, t1.x ; Move luma sample into .x component
1250 * tex2d t1, i0, s1 ; Read texel from chroma Cb texture
1251 * mov t0.y, t1.x ; Move Cb sample into .y component
1252 * tex2d t1, i0, s2 ; Read texel from chroma Cr texture
1253 * mov t0.z, t1.x ; Move Cr sample into .z component
1254 */
1255 for (i = 0; i < 3; ++i)
1256 {
1257 inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, 0, TGSI_FILE_SAMPLER, i);
1258 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1259
1260 inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
1261 inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
1262 inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
1263 inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
1264 inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i;
1265 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1266
1267 }
1268
1269 /* mul t0, t0, c0 ; Rescale texel to correct range */
1270 inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0);
1271 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1272
1273 /*
1274 * tex2d t1, i1, s3 ; Read texel from ref macroblock top field
1275 * tex2d t2, i2, s3 ; Read texel from ref macroblock bottom field
1276 */
1277 for (i = 0; i < 2; ++i)
1278 {
1279 inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, i + 1, TGSI_FILE_INPUT, i + 1, TGSI_FILE_SAMPLER, 3);
1280 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1281 }
1282
1283 /* XXX: Pos values off by 0.5? */
1284 /* sub t4, i3.y, c1.x ; Sub 0.5 from denormalized pos */
1285 inst = vl_inst3(TGSI_OPCODE_SUB, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_INPUT, 3, TGSI_FILE_CONSTANT, 1);
1286 inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_Y;
1287 inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_Y;
1288 inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_Y;
1289 inst.FullSrcRegisters[0].SrcRegister.SwizzleW = TGSI_SWIZZLE_Y;
1290 inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
1291 inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
1292 inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
1293 inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_X;
1294 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1295
1296 /* mul t3, t4, c1.x ; Multiply pos Y-coord by 1/2 */
1297 inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_CONSTANT, 1);
1298 inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
1299 inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
1300 inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
1301 inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_X;
1302 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1303
1304 /* floor t3, t3 ; Get rid of fractional part */
1305 inst = vl_inst2(TGSI_OPCODE_FLOOR, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 3);
1306 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1307
1308 /* mul t3, t3, c1.y ; Multiply by 2 */
1309 inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_CONSTANT, 1);
1310 inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_Y;
1311 inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_Y;
1312 inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_Y;
1313 inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_Y;
1314 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1315
1316 /* sub t3, t4, t3 ; Subtract from original Y to get Y % 2 */
1317 inst = vl_inst3(TGSI_OPCODE_SUB, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_TEMPORARY, 3);
1318 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1319
1320 /* TODO: Move to conditional tex fetch on t3 instead of lerp */
1321 /* lerp t1, t3, t1, t2 ; Choose between top and bottom fields based on Y % 2 */
1322 inst = vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 2);
1323 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1324
1325 /* add o0, t0, t1 ; Add ref and differential to form final output */
1326 inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
1327 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1328
1329 /* end */
1330 inst = vl_end();
1331 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1332
1333 fs.tokens = tokens;
1334 mc->p_fs[1] = pipe->create_fs_state(pipe, &fs);
1335 free(tokens);
1336
1337 return 0;
1338 }
1339
1340 static int vlCreateVertexShaderFrameBMB
1341 (
1342 struct vlR16SnormBufferedMC *mc
1343 )
1344 {
1345 const unsigned int max_tokens = 100;
1346
1347 struct pipe_context *pipe;
1348 struct pipe_shader_state vs;
1349 struct tgsi_token *tokens;
1350 struct tgsi_header *header;
1351
1352 struct tgsi_full_declaration decl;
1353 struct tgsi_full_instruction inst;
1354
1355 unsigned int ti;
1356 unsigned int i;
1357
1358 assert(mc);
1359
1360 pipe = mc->pipe;
1361 tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
1362
1363 /* Version */
1364 *(struct tgsi_version*)&tokens[0] = tgsi_build_version();
1365 /* Header */
1366 header = (struct tgsi_header*)&tokens[1];
1367 *header = tgsi_build_header();
1368 /* Processor */
1369 *(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);
1370
1371 ti = 3;
1372
1373 /*
1374 * decl i0 ; Vertex pos, luma/chroma texcoords
1375 * decl i1 ; First ref surface top field texcoords
1376 * decl i2 ; First ref surface bottom field texcoords (unused, packed in the same stream)
1377 * decl i3 ; Second ref surface top field texcoords
1378 * decl i4 ; Second ref surface bottom field texcoords (unused, packed in the same stream)
1379 */
1380 for (i = 0; i < 5; i++)
1381 {
1382 decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
1383 ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
1384 }
1385
1386 /*
1387 * decl o0 ; Vertex pos
1388 * decl o1 ; Luma/chroma texcoords
1389 * decl o2 ; First ref macroblock texcoords
1390 * decl o3 ; Second ref macroblock texcoords
1391 */
1392 for (i = 0; i < 4; i++)
1393 {
1394 decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
1395 ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
1396 }
1397
1398 /*
1399 * mov o0, i0 ; Move input vertex pos to output
1400 * mov o1, i0 ; Move input luma/chroma texcoords to output
1401 */
1402 for (i = 0; i < 2; ++i)
1403 {
1404 inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, 0);
1405 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1406 }
1407
1408 /*
1409 * add o2, i0, i1 ; Translate vertex pos by motion vec to form first ref macroblock texcoords
1410 * add o3, i0, i3 ; Translate vertex pos by motion vec to form second ref macroblock texcoords
1411 */
1412 for (i = 0; i < 2; ++i)
1413 {
1414 inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, i + 2, TGSI_FILE_INPUT, 0, TGSI_FILE_INPUT, i * 2 + 1);
1415 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1416 }
1417
1418 /* end */
1419 inst = vl_end();
1420 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1421
1422 vs.tokens = tokens;
1423 mc->b_vs[0] = pipe->create_vs_state(pipe, &vs);
1424 free(tokens);
1425
1426 return 0;
1427 }
1428
1429 static int vlCreateVertexShaderFieldBMB
1430 (
1431 struct vlR16SnormBufferedMC *mc
1432 )
1433 {
1434 const unsigned int max_tokens = 100;
1435
1436 struct pipe_context *pipe;
1437 struct pipe_shader_state vs;
1438 struct tgsi_token *tokens;
1439 struct tgsi_header *header;
1440
1441 struct tgsi_full_declaration decl;
1442 struct tgsi_full_instruction inst;
1443
1444 unsigned int ti;
1445 unsigned int i;
1446
1447 assert(mc);
1448
1449 pipe = mc->pipe;
1450 tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
1451
1452 /* Version */
1453 *(struct tgsi_version*)&tokens[0] = tgsi_build_version();
1454 /* Header */
1455 header = (struct tgsi_header*)&tokens[1];
1456 *header = tgsi_build_header();
1457 /* Processor */
1458 *(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);
1459
1460 ti = 3;
1461
1462 /*
1463 * decl i0 ; Vertex pos, Luma/chroma texcoords
1464 * decl i1 ; First ref surface top field texcoords
1465 * decl i2 ; First ref surface bottom field texcoords
1466 * decl i3 ; Second ref surface top field texcoords
1467 * decl i4 ; Second ref surface bottom field texcoords
1468 */
1469 for (i = 0; i < 5; i++)
1470 {
1471 decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
1472 ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
1473 }
1474
1475 /* decl c0 ; Denorm coefficients */
1476 decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 6);
1477 ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
1478
1479 /*
1480 * decl o0 ; Vertex pos
1481 * decl o1 ; Luma/chroma texcoords
1482 * decl o2 ; Top field past ref macroblock texcoords
1483 * decl o3 ; Bottom field past ref macroblock texcoords
1484 * decl o4 ; Top field future ref macroblock texcoords
1485 * decl o5 ; Bottom field future ref macroblock texcoords
1486 * decl o6 ; Denormalized vertex pos
1487 */
1488 for (i = 0; i < 7; i++)
1489 {
1490 decl = vl_decl_output((i == 0 || i == 7) ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
1491 ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
1492 }
1493
1494 /* decl t0, t1 */
1495 decl = vl_decl_temps(0, 1);
1496 ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
1497
1498 /*
1499 * mov o0, i0 ; Move input vertex pos to output
1500 * mov o1, i0 ; Move input luma/chroma texcoords to output
1501 * mov o2, i1 ; Move past top field texcoords to output
1502 * mov o3, i2 ; Move past bottom field texcoords to output
1503 * mov o4, i3 ; Move future top field texcoords to output
1504 * mov o5, i4 ; Move future bottom field texcoords to output
1505 */
1506 for (i = 0; i < 6; ++i)
1507 {
1508 inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, 0);
1509 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1510 }
1511
1512 /*
1513 * add o2, i0, i1 ; Translate vertex pos by motion vec to form first top field macroblock texcoords
1514 * add o3, i0, i2 ; Translate vertex pos by motion vec to form first bottom field macroblock texcoords
1515 * add o4, i0, i3 ; Translate vertex pos by motion vec to form second top field macroblock texcoords
1516 * add o5, i0, i4 ; Translate vertex pos by motion vec to form second bottom field macroblock texcoords
1517 */
1518 for (i = 0; i < 4; ++i)
1519 {
1520 inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, i + 2, TGSI_FILE_INPUT, 0, TGSI_FILE_INPUT, i + 1);
1521 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1522 }
1523
1524 /* mul o6, i0, c0 ; Denorm vertex pos */
1525 inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_OUTPUT, 6, TGSI_FILE_INPUT, 0, TGSI_FILE_CONSTANT, 0);
1526 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1527
1528 /* end */
1529 inst = vl_end();
1530 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1531
1532 vs.tokens = tokens;
1533 mc->b_vs[1] = pipe->create_vs_state(pipe, &vs);
1534 free(tokens);
1535
1536 return 0;
1537 }
1538
1539 static int vlCreateFragmentShaderFrameBMB
1540 (
1541 struct vlR16SnormBufferedMC *mc
1542 )
1543 {
1544 const unsigned int max_tokens = 100;
1545
1546 struct pipe_context *pipe;
1547 struct pipe_shader_state fs;
1548 struct tgsi_token *tokens;
1549 struct tgsi_header *header;
1550
1551 struct tgsi_full_declaration decl;
1552 struct tgsi_full_instruction inst;
1553
1554 unsigned int ti;
1555 unsigned int i;
1556
1557 assert(mc);
1558
1559 pipe = mc->pipe;
1560 tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
1561
1562 /* Version */
1563 *(struct tgsi_version*)&tokens[0] = tgsi_build_version();
1564 /* Header */
1565 header = (struct tgsi_header*)&tokens[1];
1566 *header = tgsi_build_header();
1567 /* Processor */
1568 *(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
1569
1570 ti = 3;
1571
1572 /*
1573 * decl i0 ; Texcoords for s0, s1, s2
1574 * decl i1 ; Texcoords for s3
1575 * decl i2 ; Texcoords for s4
1576 */
1577 for (i = 0; i < 3; ++i)
1578 {
1579 decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR);
1580 ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
1581 }
1582
1583 /*
1584 * decl c0 ; Scaling factor, rescales 16-bit snorm to 9-bit snorm
1585 * decl c1 ; Constant 1/2 in .x channel to use as weight to blend past and future texels
1586 */
1587 decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 1);
1588 ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
1589
1590 /* decl o0 ; Fragment color */
1591 decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
1592 ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
1593
1594 /* decl t0-t2 */
1595 decl = vl_decl_temps(0, 2);
1596 ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
1597
1598 /*
1599 * decl s0 ; Sampler for luma texture
1600 * decl s1 ; Sampler for chroma Cb texture
1601 * decl s2 ; Sampler for chroma Cr texture
1602 * decl s3 ; Sampler for past ref surface texture
1603 * decl s4 ; Sampler for future ref surface texture
1604 */
1605 for (i = 0; i < 5; ++i)
1606 {
1607 decl = vl_decl_samplers(i, i);
1608 ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
1609 }
1610
1611 /*
1612 * tex2d t1, i0, s0 ; Read texel from luma texture
1613 * mov t0.x, t1.x ; Move luma sample into .x component
1614 * tex2d t1, i0, s1 ; Read texel from chroma Cb texture
1615 * mov t0.y, t1.x ; Move Cb sample into .y component
1616 * tex2d t1, i0, s2 ; Read texel from chroma Cr texture
1617 * mov t0.z, t1.x ; Move Cr sample into .z component
1618 */
1619 for (i = 0; i < 3; ++i)
1620 {
1621 inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, 0, TGSI_FILE_SAMPLER, i);
1622 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1623
1624 inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
1625 inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
1626 inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
1627 inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
1628 inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i;
1629 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1630
1631 }
1632
1633 /* mul t0, t0, c0 ; Rescale texel to correct range */
1634 inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0);
1635 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1636
1637 /*
1638 * tex2d t1, i1, s3 ; Read texel from past ref macroblock
1639 * tex2d t2, i2, s4 ; Read texel from future ref macroblock
1640 */
1641 for (i = 0; i < 2; ++i)
1642 {
1643 inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, i + 1, TGSI_FILE_INPUT, i + 1, TGSI_FILE_SAMPLER, i + 3);
1644 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1645 }
1646
1647 /* lerp t1, c1.x, t1, t2 ; Blend past and future texels */
1648 inst = vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_CONSTANT, 1, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 2);
1649 inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
1650 inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
1651 inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
1652 inst.FullSrcRegisters[0].SrcRegister.SwizzleW = TGSI_SWIZZLE_X;
1653 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1654
1655 /* add o0, t0, t1 ; Add past/future ref and differential to form final output */
1656 inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
1657 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1658
1659 /* end */
1660 inst = vl_end();
1661 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1662
1663 fs.tokens = tokens;
1664 mc->b_fs[0] = pipe->create_fs_state(pipe, &fs);
1665 free(tokens);
1666
1667 return 0;
1668 }
1669
1670 static int vlCreateFragmentShaderFieldBMB
1671 (
1672 struct vlR16SnormBufferedMC *mc
1673 )
1674 {
1675 const unsigned int max_tokens = 200;
1676
1677 struct pipe_context *pipe;
1678 struct pipe_shader_state fs;
1679 struct tgsi_token *tokens;
1680 struct tgsi_header *header;
1681
1682 struct tgsi_full_declaration decl;
1683 struct tgsi_full_instruction inst;
1684
1685 unsigned int ti;
1686 unsigned int i;
1687
1688 assert(mc);
1689
1690 pipe = mc->pipe;
1691 tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
1692
1693 /* Version */
1694 *(struct tgsi_version*)&tokens[0] = tgsi_build_version();
1695 /* Header */
1696 header = (struct tgsi_header*)&tokens[1];
1697 *header = tgsi_build_header();
1698 /* Processor */
1699 *(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
1700
1701 ti = 3;
1702
1703 /*
1704 * decl i0 ; Texcoords for s0, s1, s2
1705 * decl i1 ; Texcoords for s3
1706 * decl i2 ; Texcoords for s3
1707 * decl i3 ; Texcoords for s4
1708 * decl i4 ; Texcoords for s4
1709 * decl i5 ; Denormalized vertex pos
1710 */
1711 for (i = 0; i < 6; ++i)
1712 {
1713 decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR);
1714 ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
1715 }
1716
1717 /*
1718 * decl c0 ; Scaling factor, rescales 16-bit snorm to 9-bit snorm
1719 * decl c1 ; Constants 1/2 & 2 in .x, .y channels to use as weight to blend past and future texels
1720 * ; and for Y-mod-2 top/bottom field selection
1721 */
1722 decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 1);
1723 ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
1724
1725 /* decl o0 ; Fragment color */
1726 decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
1727 ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
1728
1729 /* decl t0-t5 */
1730 decl = vl_decl_temps(0, 5);
1731 ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
1732
1733 /*
1734 * decl s0 ; Sampler for luma texture
1735 * decl s1 ; Sampler for chroma Cb texture
1736 * decl s2 ; Sampler for chroma Cr texture
1737 * decl s3 ; Sampler for past ref surface texture
1738 * decl s4 ; Sampler for future ref surface texture
1739 */
1740 for (i = 0; i < 5; ++i)
1741 {
1742 decl = vl_decl_samplers(i, i);
1743 ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
1744 }
1745
1746 /*
1747 * tex2d t1, i0, s0 ; Read texel from luma texture
1748 * mov t0.x, t1.x ; Move luma sample into .x component
1749 * tex2d t1, i0, s1 ; Read texel from chroma Cb texture
1750 * mov t0.y, t1.x ; Move Cb sample into .y component
1751 * tex2d t1, i0, s2 ; Read texel from chroma Cr texture
1752 * mov t0.z, t1.x ; Move Cr sample into .z component
1753 */
1754 for (i = 0; i < 3; ++i)
1755 {
1756 inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, 0, TGSI_FILE_SAMPLER, i);
1757 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1758
1759 inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
1760 inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
1761 inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
1762 inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
1763 inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i;
1764 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1765
1766 }
1767
1768 /* mul t0, t0, c0 ; Rescale texel to correct range */
1769 inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0);
1770 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1771
1772 /* XXX: Pos values off by 0.5? */
1773 /* sub t4, i5.y, c1.x ; Sub 0.5 from denormalized pos */
1774 inst = vl_inst3(TGSI_OPCODE_SUB, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_INPUT, 5, TGSI_FILE_CONSTANT, 1);
1775 inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_Y;
1776 inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_Y;
1777 inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_Y;
1778 inst.FullSrcRegisters[0].SrcRegister.SwizzleW = TGSI_SWIZZLE_Y;
1779 inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
1780 inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
1781 inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
1782 inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_X;
1783 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1784
1785 /* mul t3, t4, c1.x ; Multiply pos Y-coord by 1/2 */
1786 inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_CONSTANT, 1);
1787 inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
1788 inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
1789 inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
1790 inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_X;
1791 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1792
1793 /* floor t3, t3 ; Get rid of fractional part */
1794 inst = vl_inst2(TGSI_OPCODE_FLOOR, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 3);
1795 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1796
1797 /* mul t3, t3, c1.y ; Multiply by 2 */
1798 inst = vl_inst3( TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_CONSTANT, 1);
1799 inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_Y;
1800 inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_Y;
1801 inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_Y;
1802 inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_Y;
1803 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1804
1805 /* sub t3, t4, t3 ; Subtract from original Y to get Y % 2 */
1806 inst = vl_inst3(TGSI_OPCODE_SUB, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_TEMPORARY, 3);
1807 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1808
1809 /*
1810 * tex2d t1, i1, s3 ; Read texel from past ref macroblock top field
1811 * tex2d t2, i2, s3 ; Read texel from past ref macroblock bottom field
1812 */
1813 for (i = 0; i < 2; ++i)
1814 {
1815 inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, i + 1, TGSI_FILE_INPUT, i + 1, TGSI_FILE_SAMPLER, 3);
1816 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1817 }
1818
1819 /* TODO: Move to conditional tex fetch on t3 instead of lerp */
1820 /* lerp t1, t3, t1, t2 ; Choose between top and bottom fields based on Y % 2 */
1821 inst = vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 2);
1822 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1823
1824 /*
1825 * tex2d t4, i3, s4 ; Read texel from future ref macroblock top field
1826 * tex2d t5, i4, s4 ; Read texel from future ref macroblock bottom field
1827 */
1828 for (i = 0; i < 2; ++i)
1829 {
1830 inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, i + 4, TGSI_FILE_INPUT, i + 3, TGSI_FILE_SAMPLER, 4);
1831 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1832 }
1833
1834 /* TODO: Move to conditional tex fetch on t3 instead of lerp */
1835 /* lerp t2, t3, t4, t5 ; Choose between top and bottom fields based on Y % 2 */
1836 inst = vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 2, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_TEMPORARY, 5);
1837 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1838
1839 /* lerp t1, c1.x, t1, t2 ; Blend past and future texels */
1840 inst = vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_CONSTANT, 1, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 2);
1841 inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
1842 inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
1843 inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
1844 inst.FullSrcRegisters[0].SrcRegister.SwizzleW = TGSI_SWIZZLE_X;
1845 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1846
1847 /* add o0, t0, t1 ; Add past/future ref and differential to form final output */
1848 inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
1849 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1850
1851 /* end */
1852 inst = vl_end();
1853 ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
1854
1855 fs.tokens = tokens;
1856 mc->b_fs[1] = pipe->create_fs_state(pipe, &fs);
1857 free(tokens);
1858
1859 return 0;
1860 }
1861
1862 static int vlCreateDataBufs
1863 (
1864 struct vlR16SnormBufferedMC *mc
1865 )
1866 {
1867 const unsigned int mbw = align(mc->video_width, VL_MACROBLOCK_WIDTH) / VL_MACROBLOCK_WIDTH;
1868 const unsigned int mbh = align(mc->video_height, VL_MACROBLOCK_HEIGHT) / VL_MACROBLOCK_HEIGHT;
1869 const unsigned int num_mb_per_frame = mbw * mbh;
1870
1871 struct pipe_context *pipe;
1872 unsigned int g, h, i;
1873
1874 assert(mc);
1875
1876 pipe = mc->pipe;
1877
1878 for (g = 0; g < NUM_BUF_SETS; ++g)
1879 {
1880 for (h = 0; h < vlNumMacroBlockExTypes; ++h)
1881 {
1882 /* Create our vertex buffer and vertex buffer element */
1883 mc->vertex_bufs[g][h][0].pitch = sizeof(struct vlVertex2f);
1884 mc->vertex_bufs[g][h][0].max_index = 24 * num_mb_per_frame - 1;
1885 mc->vertex_bufs[g][h][0].buffer_offset = 0;
1886 mc->vertex_bufs[g][h][0].buffer = pipe->winsys->buffer_create
1887 (
1888 pipe->winsys,
1889 1,
1890 PIPE_BUFFER_USAGE_VERTEX,
1891 sizeof(struct vlVertex2f) * 24 * num_mb_per_frame
1892 );
1893 }
1894 }
1895
1896 /* Position & block luma, block chroma texcoord element */
1897 mc->vertex_elems[0].src_offset = 0;
1898 mc->vertex_elems[0].vertex_buffer_index = 0;
1899 mc->vertex_elems[0].nr_components = 2;
1900 mc->vertex_elems[0].src_format = PIPE_FORMAT_R32G32_FLOAT;
1901
1902 for (g = 0; g < NUM_BUF_SETS; ++g)
1903 {
1904 for (h = 0; h < vlNumMacroBlockExTypes; ++h)
1905 {
1906 for (i = 1; i < 3; ++i)
1907 {
1908 mc->vertex_bufs[g][h][i].pitch = sizeof(struct vlVertex2f) * 2;
1909 mc->vertex_bufs[g][h][i].max_index = 24 * num_mb_per_frame - 1;
1910 mc->vertex_bufs[g][h][i].buffer_offset = 0;
1911 mc->vertex_bufs[g][h][i].buffer = pipe->winsys->buffer_create
1912 (
1913 pipe->winsys,
1914 1,
1915 PIPE_BUFFER_USAGE_VERTEX,
1916 sizeof(struct vlVertex2f) * 2 * 24 * num_mb_per_frame
1917 );
1918 }
1919 }
1920 }
1921
1922 /* First ref surface top field texcoord element */
1923 mc->vertex_elems[1].src_offset = 0;
1924 mc->vertex_elems[1].vertex_buffer_index = 1;
1925 mc->vertex_elems[1].nr_components = 2;
1926 mc->vertex_elems[1].src_format = PIPE_FORMAT_R32G32_FLOAT;
1927
1928 /* First ref surface bottom field texcoord element */
1929 mc->vertex_elems[2].src_offset = sizeof(struct vlVertex2f);
1930 mc->vertex_elems[2].vertex_buffer_index = 1;
1931 mc->vertex_elems[2].nr_components = 2;
1932 mc->vertex_elems[2].src_format = PIPE_FORMAT_R32G32_FLOAT;
1933
1934 /* Second ref surface top field texcoord element */
1935 mc->vertex_elems[3].src_offset = 0;
1936 mc->vertex_elems[3].vertex_buffer_index = 2;
1937 mc->vertex_elems[3].nr_components = 2;
1938 mc->vertex_elems[3].src_format = PIPE_FORMAT_R32G32_FLOAT;
1939
1940 /* Second ref surface bottom field texcoord element */
1941 mc->vertex_elems[4].src_offset = sizeof(struct vlVertex2f);
1942 mc->vertex_elems[4].vertex_buffer_index = 2;
1943 mc->vertex_elems[4].nr_components = 2;
1944 mc->vertex_elems[4].src_format = PIPE_FORMAT_R32G32_FLOAT;
1945
1946 /* Create our constant buffer */
1947 mc->vs_const_buf.size = sizeof(struct vlVertexShaderConsts);
1948 mc->vs_const_buf.buffer = pipe->winsys->buffer_create
1949 (
1950 pipe->winsys,
1951 1,
1952 PIPE_BUFFER_USAGE_CONSTANT,
1953 mc->vs_const_buf.size
1954 );
1955
1956 mc->fs_const_buf.size = sizeof(struct vlFragmentShaderConsts);
1957 mc->fs_const_buf.buffer = pipe->winsys->buffer_create
1958 (
1959 pipe->winsys,
1960 1,
1961 PIPE_BUFFER_USAGE_CONSTANT,
1962 mc->fs_const_buf.size
1963 );
1964
1965 memcpy
1966 (
1967 pipe->winsys->buffer_map(pipe->winsys, mc->fs_const_buf.buffer, PIPE_BUFFER_USAGE_CPU_WRITE),
1968 &fs_consts,
1969 sizeof(struct vlFragmentShaderConsts)
1970 );
1971
1972 pipe->winsys->buffer_unmap(pipe->winsys, mc->fs_const_buf.buffer);
1973
1974 return 0;
1975 }
1976
1977 static int vlInit
1978 (
1979 struct vlR16SnormBufferedMC *mc
1980 )
1981 {
1982 struct pipe_context *pipe;
1983 struct pipe_sampler_state sampler;
1984 struct pipe_texture template;
1985 unsigned int filters[5];
1986 unsigned int i;
1987
1988 assert(mc);
1989
1990 pipe = mc->pipe;
1991
1992 /* For MC we render to textures, which are rounded up to nearest POT */
1993 mc->viewport.scale[0] = vlRoundUpPOT(mc->video_width);
1994 mc->viewport.scale[1] = vlRoundUpPOT(mc->video_height);
1995 mc->viewport.scale[2] = 1;
1996 mc->viewport.scale[3] = 1;
1997 mc->viewport.translate[0] = 0;
1998 mc->viewport.translate[1] = 0;
1999 mc->viewport.translate[2] = 0;
2000 mc->viewport.translate[3] = 0;
2001
2002 mc->render_target.width = vlRoundUpPOT(mc->video_width);
2003 mc->render_target.height = vlRoundUpPOT(mc->video_height);
2004 mc->render_target.num_cbufs = 1;
2005 /* FB for MC stage is a vlSurface created by the user, set at render time */
2006 mc->render_target.zsbuf = NULL;
2007
2008 filters[0] = PIPE_TEX_FILTER_NEAREST;
2009 /* FIXME: Linear causes discoloration around block edges */
2010 filters[1] = /*mc->video_format == vlFormatYCbCr444 ?*/ PIPE_TEX_FILTER_NEAREST /*: PIPE_TEX_FILTER_LINEAR*/;
2011 filters[2] = /*mc->video_format == vlFormatYCbCr444 ?*/ PIPE_TEX_FILTER_NEAREST /*: PIPE_TEX_FILTER_LINEAR*/;
2012 filters[3] = PIPE_TEX_FILTER_LINEAR;
2013 filters[4] = PIPE_TEX_FILTER_LINEAR;
2014
2015 for (i = 0; i < 5; ++i)
2016 {
2017 sampler.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
2018 sampler.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
2019 sampler.wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
2020 sampler.min_img_filter = filters[i];
2021 sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
2022 sampler.mag_img_filter = filters[i];
2023 sampler.compare_mode = PIPE_TEX_COMPARE_NONE;
2024 sampler.compare_func = PIPE_FUNC_ALWAYS;
2025 sampler.normalized_coords = 1;
2026 /*sampler.prefilter = ;*/
2027 /*sampler.shadow_ambient = ;*/
2028 /*sampler.lod_bias = ;*/
2029 sampler.min_lod = 0;
2030 /*sampler.max_lod = ;*/
2031 /*sampler.border_color[i] = ;*/
2032 /*sampler.max_anisotropy = ;*/
2033 mc->samplers[i] = pipe->create_sampler_state(pipe, &sampler);
2034 }
2035
2036 memset(&template, 0, sizeof(struct pipe_texture));
2037 template.target = PIPE_TEXTURE_2D;
2038 template.format = PIPE_FORMAT_R16_SNORM;
2039 template.last_level = 0;
2040 template.width[0] = vlRoundUpPOT(mc->video_width);
2041 template.height[0] = vlRoundUpPOT(mc->video_height);
2042 template.depth[0] = 1;
2043 template.compressed = 0;
2044 pf_get_block(template.format, &template.block);
2045
2046 for (i = 0; i < NUM_BUF_SETS; ++i)
2047 mc->textures[i][0] = pipe->screen->texture_create(pipe->screen, &template);
2048
2049 if (mc->video_format == vlFormatYCbCr420)
2050 {
2051 template.width[0] = vlRoundUpPOT(mc->video_width / 2);
2052 template.height[0] = vlRoundUpPOT(mc->video_height / 2);
2053 }
2054 else if (mc->video_format == vlFormatYCbCr422)
2055 template.height[0] = vlRoundUpPOT(mc->video_height / 2);
2056
2057 for (i = 0; i < NUM_BUF_SETS; ++i)
2058 {
2059 mc->textures[i][1] = pipe->screen->texture_create(pipe->screen, &template);
2060 mc->textures[i][2] = pipe->screen->texture_create(pipe->screen, &template);
2061 }
2062
2063 /* textures[3] & textures[4] are assigned from vlSurfaces for P and B macroblocks at render time */
2064
2065 vlCreateVertexShaderIMB(mc);
2066 vlCreateFragmentShaderIMB(mc);
2067 vlCreateVertexShaderFramePMB(mc);
2068 vlCreateVertexShaderFieldPMB(mc);
2069 vlCreateFragmentShaderFramePMB(mc);
2070 vlCreateFragmentShaderFieldPMB(mc);
2071 vlCreateVertexShaderFrameBMB(mc);
2072 vlCreateVertexShaderFieldBMB(mc);
2073 vlCreateFragmentShaderFrameBMB(mc);
2074 vlCreateFragmentShaderFieldBMB(mc);
2075 vlCreateDataBufs(mc);
2076
2077 return 0;
2078 }
2079
2080 int vlCreateR16SNormBufferedMC
2081 (
2082 struct pipe_context *pipe,
2083 unsigned int video_width,
2084 unsigned int video_height,
2085 enum vlFormat video_format,
2086 struct vlRender **render
2087 )
2088 {
2089 struct vlR16SnormBufferedMC *mc;
2090
2091 assert(pipe);
2092 assert(render);
2093
2094 mc = calloc(1, sizeof(struct vlR16SnormBufferedMC));
2095
2096 mc->base.vlBegin = &vlBegin;
2097 mc->base.vlRenderMacroBlocksMpeg2 = &vlRenderMacroBlocksMpeg2R16SnormBuffered;
2098 mc->base.vlEnd = &vlEnd;
2099 mc->base.vlFlush = &vlFlush;
2100 mc->base.vlDestroy = &vlDestroy;
2101 mc->pipe = pipe;
2102 mc->video_width = video_width;
2103 mc->video_height = video_height;
2104
2105 mc->cur_buf = 0;
2106 mc->buffered_surface = NULL;
2107 mc->past_surface = NULL;
2108 mc->future_surface = NULL;
2109 memset(mc->num_macroblocks, 0, sizeof(unsigned int) * vlNumMacroBlockExTypes);
2110
2111 vlInit(mc);
2112
2113 *render = &mc->base;
2114
2115 return 0;
2116 }