/*
 * Copyright 2013 Ilia Mirkin
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "nv50/nv84_video.h"

#include "util/u_sse.h"
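
/* Parameter structures handed to the VP firmware. The layouts come from
 * reverse engineering; the trailing hex comments give each field's byte
 * offset, and the unk* fields are simply unknown. */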
struct h264_iparm1 {
   uint8_t scaling_lists_4x4[6][16]; // 00
   uint8_t scaling_lists_8x8[2][64]; // 60
   uint32_t width; // e0
   uint32_t height; // e4
   uint64_t ref1_addrs[16]; // e8
   uint64_t ref2_addrs[16]; // 168
   uint32_t unk1e8;
   uint32_t unk1ec;
   uint32_t w1; // 1f0
   uint32_t w2; // 1f4
   uint32_t w3; // 1f8
   uint32_t h1; // 1fc
   uint32_t h2; // 200
   uint32_t h3; // 204
   uint32_t mb_adaptive_frame_field_flag; // 208
   uint32_t field_pic_flag; // 20c
   uint32_t format; // 210
   uint32_t unk214; // 214
};

struct h264_iparm2 {
   uint32_t width; // 00
   uint32_t height; // 04
   uint32_t mbs; // 08
   uint32_t w1; // 0c
   uint32_t w2; // 10
   uint32_t w3; // 14
   uint32_t h1; // 18
   uint32_t h2; // 1c
   uint32_t h3; // 20
   uint32_t unk24;
   uint32_t mb_adaptive_frame_field_flag; // 28
   uint32_t top; // 2c
   uint32_t bottom; // 30
   uint32_t is_reference; // 34
};

void
nv84_decoder_vp_h264(struct nv84_decoder *dec,
                     struct pipe_h264_picture_desc *desc,
                     struct nv84_video_buffer *dest)
{
   struct h264_iparm1 param1;
   struct h264_iparm2 param2;
   int i, width = align(dest->base.width, 16),
      height = align(dest->base.height, 16);

   struct nouveau_pushbuf *push = dec->vp_pushbuf;
   struct nouveau_pushbuf_refn bo_refs[] = {
      { dest->interlaced, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
      { dest->full, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
      { dec->vpring, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
      { dec->mbring, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
      { dec->vp_params, NOUVEAU_BO_RDWR | NOUVEAU_BO_GART },
      { dec->fence, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
   };
   int num_refs = sizeof(bo_refs)/sizeof(*bo_refs);
   bool is_ref = desc->is_reference;

   STATIC_ASSERT(sizeof(struct h264_iparm1) == 0x218);
   STATIC_ASSERT(sizeof(struct h264_iparm2) == 0x38);

   memset(&param1, 0, sizeof(param1));
   memset(&param2, 0, sizeof(param2));

   memcpy(&param1.scaling_lists_4x4, desc->scaling_lists_4x4,
          sizeof(param1.scaling_lists_4x4));
   memcpy(&param1.scaling_lists_8x8, desc->scaling_lists_8x8,
          sizeof(param1.scaling_lists_8x8));

   param1.width = width;
   param1.w1 = param1.w2 = param1.w3 = align(width, 64);
   param1.height = param1.h2 = height;
   param1.h1 = param1.h3 = align(height, 32);
   param1.format = 0x3231564e; /* 'NV12' */
   param1.mb_adaptive_frame_field_flag = desc->mb_adaptive_frame_field_flag;
   param1.field_pic_flag = desc->field_pic_flag;

   param2.width = width;
   param2.w1 = param2.w2 = param2.w3 = param1.w1;
   if (desc->field_pic_flag)
      param2.height = align(height, 32) / 2;
   else
      param2.height = height;
   param2.h1 = param2.h2 = align(height, 32);
   param2.h3 = height;
   param2.mbs = width * height >> 8;
   if (desc->field_pic_flag) {
      param2.top = desc->bottom_field_flag ? 2 : 1;
      param2.bottom = desc->bottom_field_flag;
   }
   param2.mb_adaptive_frame_field_flag = desc->mb_adaptive_frame_field_flag;
   param2.is_reference = desc->is_reference;

   PUSH_SPACE(push, 5 + 16 + 3 + 2 + 6 + (is_ref ? 2 : 0) + 3 + 2 + 4 + 2);
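   /* The terms above mirror the command groups pushed below, one method
    * header plus its data words each (e.g. 16 = BEGIN_NV04(..., 15) plus 15
    * data words); (is_ref ? 2 : 0) covers the conditional 0x414 write. */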

   struct nouveau_bo *ref2_default = dest->full;

   for (i = 0; i < 16; i++) {
      struct nv84_video_buffer *buf = (struct nv84_video_buffer *)desc->ref[i];
      struct nouveau_bo *bo1, *bo2;
      if (buf) {
         bo1 = buf->interlaced;
         bo2 = buf->full;
         if (i == 0)
            ref2_default = buf->full;
      } else {
         bo1 = dest->interlaced;
         bo2 = ref2_default;
      }
      param1.ref1_addrs[i] = bo1->offset;
      param1.ref2_addrs[i] = bo2->offset;
      struct nouveau_pushbuf_refn bo_refs[] = {
         { bo1, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
         { bo2, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
      };
      nouveau_pushbuf_refn(push, bo_refs, sizeof(bo_refs)/sizeof(bo_refs[0]));
   }

   memcpy(dec->vp_params->map, &param1, sizeof(param1));
   memcpy(dec->vp_params->map + 0x400, &param2, sizeof(param2));

   nouveau_pushbuf_refn(push, bo_refs, num_refs);

   /* Wait for BSP to have completed */
   BEGIN_NV04(push, SUBC_VP(0x10), 4);
   PUSH_DATAh(push, dec->fence->offset);
   PUSH_DATA (push, dec->fence->offset);
   PUSH_DATA (push, 2);
   PUSH_DATA (push, 1); /* wait for sem == 2 */
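   /* The fence is used as a BSP -> VP handoff: BSP bumps it to 2 once the
    * bitstream is parsed, VP waits for that here and resets it to 1 below. */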

   /* VP step 1 */
   BEGIN_NV04(push, SUBC_VP(0x400), 15);
   PUSH_DATA (push, 1);
   PUSH_DATA (push, param2.mbs);
   PUSH_DATA (push, 0x3987654); /* each nibble probably a dma index */
   PUSH_DATA (push, 0x55001); /* constant */
   PUSH_DATA (push, dec->vp_params->offset >> 8);
   PUSH_DATA (push, (dec->vpring->offset + dec->vpring_residual) >> 8);
   PUSH_DATA (push, dec->vpring_ctrl);
   PUSH_DATA (push, dec->vpring->offset >> 8);
   PUSH_DATA (push, dec->bitstream->size / 2 - 0x700);
   PUSH_DATA (push, (dec->mbring->offset + dec->mbring->size - 0x2000) >> 8);
   PUSH_DATA (push, (dec->vpring->offset + dec->vpring_ctrl +
                     dec->vpring_residual + dec->vpring_deblock) >> 8);
   PUSH_DATA (push, 0);
   PUSH_DATA (push, 0x100008);
   PUSH_DATA (push, dest->interlaced->offset >> 8);
   PUSH_DATA (push, 0);

   BEGIN_NV04(push, SUBC_VP(0x620), 2);
   PUSH_DATA (push, 0);
   PUSH_DATA (push, 0);

   BEGIN_NV04(push, SUBC_VP(0x300), 1);
   PUSH_DATA (push, 0);

   /* VP step 2 */
   BEGIN_NV04(push, SUBC_VP(0x400), 5);
   PUSH_DATA (push, 0x54530201);
   PUSH_DATA (push, (dec->vp_params->offset >> 8) + 0x4);
   PUSH_DATA (push, (dec->vpring->offset + dec->vpring_ctrl +
                     dec->vpring_residual) >> 8);
   PUSH_DATA (push, dest->interlaced->offset >> 8);
   PUSH_DATA (push, dest->interlaced->offset >> 8);
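   /* The + 0x4 is in 256-byte units, i.e. it points at param2, which was
    * copied to vp_params + 0x400 above. */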

   if (is_ref) {
      BEGIN_NV04(push, SUBC_VP(0x414), 1);
      PUSH_DATA (push, dest->full->offset >> 8);
   }

   BEGIN_NV04(push, SUBC_VP(0x620), 2);
   PUSH_DATAh(push, dec->vp_fw2_offset);
   PUSH_DATA (push, dec->vp_fw2_offset);

   BEGIN_NV04(push, SUBC_VP(0x300), 1);
   PUSH_DATA (push, 0);

   /* Set the semaphore back to 1 */
   BEGIN_NV04(push, SUBC_VP(0x610), 3);
   PUSH_DATAh(push, dec->fence->offset);
   PUSH_DATA (push, dec->fence->offset);
   PUSH_DATA (push, 1);

   /* Write to the semaphore location, intr */
   BEGIN_NV04(push, SUBC_VP(0x304), 1);
   PUSH_DATA (push, 0x101);
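
   /* Mark the luma and chroma planes as written by the GPU so that later
    * CPU or sampler access knows to synchronize. */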
   for (i = 0; i < 2; i++) {
      struct nv50_miptree *mt = nv50_miptree(dest->resources[i]);
      mt->base.status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING;
   }

   PUSH_KICK (push);
}

static INLINE int16_t inverse_quantize(int16_t val, uint8_t quant, int mpeg1) {
   int16_t ret = val * quant / 16;
   if (mpeg1 && (ret & 1) == 0) {
      /* MPEG-1 mismatch control: round even values toward zero to make
       * them odd. */
      if (ret > 0)
         ret--;
      else if (ret < 0)
         ret++;
   }
   /* Saturate to the legal coefficient range. */
   if (ret < -2048)
      ret = -2048;
   else if (ret > 2047)
      ret = 2047;
   return ret;
}
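
/* For example, in the MPEG-1 case val = 2 with quant = 16 gives ret = 2,
 * which mismatch control rounds toward zero to the odd value 1. */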

struct mpeg12_mb_info {
   uint32_t index;
   uint8_t unk4;
   uint8_t unk5;
   uint16_t coded_block_pattern;
   uint8_t block_counts[6];
   uint16_t PMV[8];
   uint16_t skipped;
};

void
nv84_decoder_vp_mpeg12_mb(struct nv84_decoder *dec,
                          struct pipe_mpeg12_picture_desc *desc,
                          const struct pipe_mpeg12_macroblock *macrob)
{
   STATIC_ASSERT(sizeof(struct mpeg12_mb_info) == 32);

   struct mpeg12_mb_info info = {0};
   int i, sum = 0, mask, block_index, count;
   const int16_t *blocks;
   int intra = macrob->macroblock_type & PIPE_MPEG12_MB_TYPE_INTRA;
   int motion = macrob->macroblock_type &
      (PIPE_MPEG12_MB_TYPE_MOTION_FORWARD | PIPE_MPEG12_MB_TYPE_MOTION_BACKWARD);
   const uint8_t *quant_matrix = intra ? dec->mpeg12_intra_matrix :
      dec->mpeg12_non_intra_matrix;
   int mpeg1 = dec->base.profile == PIPE_VIDEO_PROFILE_MPEG1;

   info.index = macrob->y * mb(dec->base.width) + macrob->x;
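   /* The index above is in raster order; mb() (from nv84_video.h) rounds a
    * pixel dimension up to a count of 16x16 macroblocks. */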
   info.unk4 = motion;
   if (intra)
      info.unk4 |= 1;
   if (macrob->macroblock_modes.bits.dct_type)
      info.unk4 |= 0x80;
   info.unk5 = (macrob->motion_vertical_field_select << 4) |
      (macrob->macroblock_modes.value & 0xf);
   info.coded_block_pattern = macrob->coded_block_pattern;
   if (motion) {
      memcpy(info.PMV, macrob->PMV, sizeof(info.PMV));
   }

   blocks = macrob->blocks;
   for (mask = 0x20, block_index = 0; mask > 0; mask >>= 1, block_index++) {
      if ((macrob->coded_block_pattern & mask) == 0)
         continue;

      count = 0;

      /*
       * The observation here is that there are a lot of 0's, and things go
       * a lot faster if one skips over them.
       */

#if defined(PIPE_ARCH_SSE) && defined(PIPE_ARCH_X86_64)
      /* Note that the SSE implementation is much more tuned to X86_64. As it's
       * not benchmarked on X86_32, disable it there. I suspect that the code
       * needs to be reorganized in terms of 32-bit wide data in order to be
       * more efficient. NV84+ were released well into the 64-bit CPU era, so
       * it should be a minority case.
       */

/* This returns a 16-bit bit-mask, where each pair of bits is both 1 or both 0,
 * depending on whether the corresponding (16-bit) word in blocks is zero or
 * non-zero. */
#define wordmask(blocks, zero) \
   (uint64_t)(_mm_movemask_epi8( \
         _mm_cmpeq_epi16( \
               zero, _mm_load_si128((__m128i *)(blocks)))))
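
/* Example of one wordmask() step: if blocks[0..7] = {3, 0, 0, 0, 0, 0, 0, 0},
 * the compare yields 0xffff for every zero word, so the macro returns 0xfffc;
 * after the inversion below, only the bit pair for block 0 remains set. */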

      __m128i zero = _mm_setzero_si128();

      /* TODO: Look into doing the inverse quantization in terms of SSE
       * operations unconditionally, when necessary. */
      uint64_t bmask0 = wordmask(blocks, zero);
      bmask0 |= wordmask(blocks + 8, zero) << 16;
      bmask0 |= wordmask(blocks + 16, zero) << 32;
      bmask0 |= wordmask(blocks + 24, zero) << 48;
      uint64_t bmask1 = wordmask(blocks + 32, zero);
      bmask1 |= wordmask(blocks + 40, zero) << 16;
      bmask1 |= wordmask(blocks + 48, zero) << 32;
      bmask1 |= wordmask(blocks + 56, zero) << 48;

      /* The wordmask macro returns the inverse of what we want, since it
       * returns a 1 for equal-to-zero. Invert. */
      bmask0 = ~bmask0;
      bmask1 = ~bmask1;

      /* Note that the bitmask is actually sequences of 2 bits for each block
       * index. This is because there is no movemask_epi16. That means that
       * (a) ffs will never return 64, since the prev bit will always be set
       * in that case, and (b) we need to do an extra bit shift. Or'ing the
       * bitmasks together is faster than having a loop that computes them one
       * at a time and processes them, on a Core i7-920. Trying to put bmask
       * into an array and then looping also slows things down.
       */
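      /* For example, with the low bits ...110000000011 set, ffsll returns 1
       * and block 0 is emitted; after the i++ and the extra >>= 2 below,
       * ffsll returns 9, advancing i by (9 - 1) / 2 = 4 to block 5. */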

      /* shift needs to be the same width as i, and unsigned so that / 2
       * becomes a rshift operation */
      unsigned shift;
      int16_t tmp;
      i = 0;

      if (dec->base.entrypoint == PIPE_VIDEO_ENTRYPOINT_BITSTREAM) {
         while ((shift = __builtin_ffsll(bmask0))) {
            i += (shift - 1) / 2;
            bmask0 >>= shift - 1;
            *dec->mpeg12_data++ = dec->zscan[i] * 2;
            tmp = inverse_quantize(blocks[i], quant_matrix[i], mpeg1);
            *dec->mpeg12_data++ = tmp;
            sum += tmp;
            count++;
            i++;
            bmask0 >>= 2;
         }
         i = 32;
         while ((shift = __builtin_ffsll(bmask1))) {
            i += (shift - 1) / 2;
            bmask1 >>= shift - 1;
            *dec->mpeg12_data++ = dec->zscan[i] * 2;
            tmp = inverse_quantize(blocks[i], quant_matrix[i], mpeg1);
            *dec->mpeg12_data++ = tmp;
            sum += tmp;
            count++;
            i++;
            bmask1 >>= 2;
         }
      } else {
         while ((shift = __builtin_ffsll(bmask0))) {
            i += (shift - 1) / 2;
            bmask0 >>= shift - 1;
            *dec->mpeg12_data++ = i * 2;
            *dec->mpeg12_data++ = blocks[i];
            count++;
            i++;
            bmask0 >>= 2;
         }
         i = 32;
         while ((shift = __builtin_ffsll(bmask1))) {
            i += (shift - 1) / 2;
            bmask1 >>= shift - 1;
            *dec->mpeg12_data++ = i * 2;
            *dec->mpeg12_data++ = blocks[i];
            count++;
            i++;
            bmask1 >>= 2;
         }
      }
#else

      /*
       * This loop looks ridiculously written... and it is. I tried a lot of
       * different ways of achieving this scan, and this was the fastest, at
       * least on a Core i7-920. Note that it's not necessary to skip the 0's,
       * the firmware will deal with those just fine. But it's faster to skip
       * them. Note to people trying benchmarks: make sure to use realistic
       * mpeg data, which can often be a single data point first followed by
       * 63 0's, or <data> 7x <0> <data> 7x <0> etc.
       */
      int16_t tmp;
      if (dec->base.entrypoint == PIPE_VIDEO_ENTRYPOINT_BITSTREAM) {
         for (i = 0; i < 64; i++) {
            while (likely(i < 64 && !(tmp = blocks[i]))) i++;
            if (i == 64)
               break;
            *dec->mpeg12_data++ = dec->zscan[i] * 2;
            tmp = inverse_quantize(tmp, quant_matrix[i], mpeg1);
            *dec->mpeg12_data++ = tmp;
            sum += tmp;
            count++;
         }
      } else {
         for (i = 0; i < 64; i++) {
            while (likely(i < 64 && !(tmp = blocks[i]))) i++;
            if (i == 64)
               break;
            *dec->mpeg12_data++ = i * 2;
            *dec->mpeg12_data++ = tmp;
            count++;
         }
      }
#endif

      if (dec->base.entrypoint == PIPE_VIDEO_ENTRYPOINT_BITSTREAM) {
         if (!mpeg1 && (sum & 1) == 0) {
            /* Looks like MPEG-2 mismatch control: the coefficient sum must
             * come out odd. */
            if (count && *(dec->mpeg12_data - 2) == 63 * 2) {
               uint16_t *val = dec->mpeg12_data - 1;
               if (*val & 1) *val -= 1;
               else *val += 1;
            } else {
               *dec->mpeg12_data++ = 63 * 2;
               *dec->mpeg12_data++ = 1;
               count++;
            }
         }
      }

      if (count) {
         /* The low bit of the last index word presumably marks the end of
          * the block. */
         *(dec->mpeg12_data - 2) |= 1;
      } else {
         /* Empty block: emit a single dummy coefficient. */
         *dec->mpeg12_data++ = 1;
         *dec->mpeg12_data++ = 0;
         count = 1;
      }
      info.block_counts[block_index] = count;
      blocks += 64;
   }

   memcpy(dec->mpeg12_mb_info, &info, sizeof(info));
   dec->mpeg12_mb_info += sizeof(info);

   if (macrob->num_skipped_macroblocks) {
      info.index++;
      info.coded_block_pattern = 0;
      info.skipped = macrob->num_skipped_macroblocks - 1;
      memset(info.block_counts, 0, sizeof(info.block_counts));
      memcpy(dec->mpeg12_mb_info, &info, sizeof(info));
      dec->mpeg12_mb_info += sizeof(info);
   }
}

struct mpeg12_header {
   uint32_t luma_top_size; // 00
   uint32_t luma_bottom_size; // 04
   uint32_t chroma_top_size; // 08
   uint32_t mbs; // 0c
   uint32_t mb_info_size; // 10
   uint32_t mb_width_minus1; // 14
   uint32_t mb_height_minus1; // 18
   uint32_t width; // 1c
   uint32_t height; // 20
   uint8_t progressive; // 24
   uint8_t mocomp_only; // 25
   uint8_t frames; // 26
   uint8_t picture_structure; // 27
   uint32_t unk28; // 28 -- 0x50100
   uint32_t unk2c; // 2c
   uint32_t pad[4 * 13];
};

void
nv84_decoder_vp_mpeg12(struct nv84_decoder *dec,
                       struct pipe_mpeg12_picture_desc *desc,
                       struct nv84_video_buffer *dest)
{
   struct nouveau_pushbuf *push = dec->vp_pushbuf;
   struct nv84_video_buffer *ref1 = (struct nv84_video_buffer *)desc->ref[0];
   struct nv84_video_buffer *ref2 = (struct nv84_video_buffer *)desc->ref[1];
   struct nouveau_pushbuf_refn bo_refs[] = {
      { dest->interlaced, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
      { NULL, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
      { NULL, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
      { dec->mpeg12_bo, NOUVEAU_BO_RDWR | NOUVEAU_BO_GART },
   };
   int i, num_refs = sizeof(bo_refs) / sizeof(*bo_refs);
   struct mpeg12_header header = {0};
   struct nv50_miptree *y = nv50_miptree(dest->resources[0]);
   struct nv50_miptree *uv = nv50_miptree(dest->resources[1]);

   STATIC_ASSERT(sizeof(struct mpeg12_header) == 0x100);

   if (!ref1)
      ref1 = dest;
   if (!ref2)
      ref2 = dest;

   bo_refs[1].bo = ref1->interlaced;
   bo_refs[2].bo = ref2->interlaced;

   header.luma_top_size = y->layer_stride;
   header.luma_bottom_size = y->layer_stride;
   header.chroma_top_size = uv->layer_stride;
   header.mbs = mb(dec->base.width) * mb(dec->base.height);
   header.mb_info_size = dec->mpeg12_mb_info - dec->mpeg12_bo->map - 0x100;
   header.mb_width_minus1 = mb(dec->base.width) - 1;
   header.mb_height_minus1 = mb(dec->base.height) - 1;
   header.width = align(dec->base.width, 16);
   header.height = align(dec->base.height, 16);
   header.progressive = desc->frame_pred_frame_dct;
   header.frames = 1 + (desc->ref[0] != NULL) + (desc->ref[1] != NULL);
   header.picture_structure = desc->picture_structure;
   header.unk28 = 0x50100;

   memcpy(dec->mpeg12_bo->map, &header, sizeof(header));
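   /* mpeg12_bo layout: this header at 0, the 32-byte mb_info records at
    * 0x100, and the coefficient data after those, matching the offsets
    * pushed below. */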

   PUSH_SPACE(push, 10 + 3 + 2);

   nouveau_pushbuf_refn(push, bo_refs, num_refs);

   BEGIN_NV04(push, SUBC_VP(0x400), 9);
   PUSH_DATA (push, 0x543210); /* each nibble possibly a dma index */
   PUSH_DATA (push, 0x555001); /* constant */
   PUSH_DATA (push, dec->mpeg12_bo->offset >> 8);
   PUSH_DATA (push, (dec->mpeg12_bo->offset + 0x100) >> 8);
   PUSH_DATA (push, (dec->mpeg12_bo->offset + 0x100 +
                     align(0x20 * mb(dec->base.width) *
                           mb(dec->base.height), 0x100)) >> 8);
   PUSH_DATA (push, dest->interlaced->offset >> 8);
   PUSH_DATA (push, ref1->interlaced->offset >> 8);
   PUSH_DATA (push, ref2->interlaced->offset >> 8);
   PUSH_DATA (push, 6 * 64 * 8 * header.mbs);

   BEGIN_NV04(push, SUBC_VP(0x620), 2);
   PUSH_DATA (push, 0);
   PUSH_DATA (push, 0);

   BEGIN_NV04(push, SUBC_VP(0x300), 1);
   PUSH_DATA (push, 0);

   for (i = 0; i < 2; i++) {
      struct nv50_miptree *mt = nv50_miptree(dest->resources[i]);
      mt->base.status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING;
   }

   PUSH_KICK (push);
}