## ffmpeg / libavcodec / x86 / h264_intrapred.asm @ dd68d4db

1 | 4af8cdfc | Jason Garrett-Glaser | ;****************************************************************************** |
2 | ;* H.264 intra prediction asm optimizations |
3 | ;* Copyright (c) 2010 Jason Garrett-Glaser |
4 | ;* |
5 | ;* This file is part of FFmpeg. |
6 | ;* |
7 | ;* FFmpeg is free software; you can redistribute it and/or |
8 | ;* modify it under the terms of the GNU Lesser General Public |
9 | ;* License as published by the Free Software Foundation; either |
10 | ;* version 2.1 of the License, or (at your option) any later version. |
11 | ;* |
12 | ;* FFmpeg is distributed in the hope that it will be useful, |
13 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
15 | ;* Lesser General Public License for more details. |
16 | ;* |
17 | ;* You should have received a copy of the GNU Lesser General Public |
18 | ;* License along with FFmpeg; if not, write to the Free Software |
19 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
20 | ;****************************************************************************** |
21 | |||

22 | %include "x86inc.asm" |
23 | |||

24 | SECTION_RODATA |
25 | |||

26 | tm_shuf: times 8 db 0x03, 0x80 |
27 | dd68d4db | Ronald S. Bultje | plane_shuf: db -8, -7, -6, -5, -4, -3, -2, -1 |

28 | db 1, 2, 3, 4, 5, 6, 7, 8 |
29 | plane8_shuf: db -4, -3, -2, -1, 0, 0, 0, 0 |
30 | db 1, 2, 3, 4, 0, 0, 0, 0 |
31 | pw_0to7: dw 0, 1, 2, 3, 4, 5, 6, 7 |
32 | pw_1to8: dw 1, 2, 3, 4, 5, 6, 7, 8 |
33 | pw_m8tom1: dw -8, -7, -6, -5, -4, -3, -2, -1 |
34 | pw_m4to4: dw -4, -3, -2, -1, 1, 2, 3, 4 |
35 | 4af8cdfc | Jason Garrett-Glaser | |

36 | SECTION .text |
37 | |||

38 | bc14f04b | Jason Garrett-Glaser | cextern pb_1 |

39 | 4af8cdfc | Jason Garrett-Glaser | cextern pb_3 |

40 | dd68d4db | Ronald S. Bultje | cextern pw_5 |

41 | cextern pw_16 |
42 | cextern pw_17 |
43 | cextern pw_32 |
44 | 4af8cdfc | Jason Garrett-Glaser | |

45 | ;----------------------------------------------------------------------------- |
46 | ; void pred16x16_vertical(uint8_t *src, int stride) |
||

47 | ;----------------------------------------------------------------------------- |
48 | |||

; pred16x16_vertical, MMX version: replicates the 16-byte row located one
; stride above src into all 16 rows of the block.
; In: r0 = src, r1 = stride (argument order of the prototype above).
; Uses r2 as loop counter and mm0/mm1 as data registers.
49 | cglobal pred16x16_vertical_mmx, 2,3 |
||

; r0 now points at the row above the block; 8 iterations write 2 rows each.
50 | sub r0, r1 |
51 | mov r2, 8 |
; mm0 = bytes 0..7 of the top row, mm1 = bytes 8..15.
52 | movq mm0, [r0+0] |
53 | movq mm1, [r0+8] |
54 | .loop: |
||

; Rows r0+stride and r0+2*stride receive the saved top row.
55 | movq [r0+r1*1+0], mm0 |
56 | movq [r0+r1*1+8], mm1 |
57 | movq [r0+r1*2+0], mm0 |
58 | movq [r0+r1*2+8], mm1 |
; lea leaves the flags alone; jg tests the result of dec.
59 | lea r0, [r0+r1*2] |
60 | dec r2 |
61 | jg .loop |
62 | REP_RET |
63 | |||

; pred16x16_vertical, SSE version: same result as the MMX version but one
; 16-byte register holds the whole top row.
; In: r0 = src, r1 = stride. Uses r2 as loop counter and xmm0 as data.
; movaps is an aligned access, so every row address touched here must be
; 16-byte aligned.
64 | cglobal pred16x16_vertical_sse, 2,3 |
||

; r0 -> row above the block; 4 iterations write 4 rows each.
65 | sub r0, r1 |
66 | mov r2, 4 |
67 | movaps xmm0, [r0] |
68 | .loop: |
||

; Two rows, advance, two more rows, advance.
69 | movaps [r0+r1*1], xmm0 |
70 | movaps [r0+r1*2], xmm0 |
71 | lea r0, [r0+r1*2] |
72 | movaps [r0+r1*1], xmm0 |
73 | movaps [r0+r1*2], xmm0 |
74 | lea r0, [r0+r1*2] |
75 | dec r2 |
76 | jg .loop |
77 | REP_RET |
78 | |||

79 | ;----------------------------------------------------------------------------- |
||

80 | ; void pred16x16_horizontal(uint8_t *src, int stride) |
||

81 | ;----------------------------------------------------------------------------- |
82 | |||

; PRED16x16_H <cpu>: emits pred16x16_horizontal_<cpu>, which fills each of
; the 16 rows with the pixel immediately to the left of that row.
; %1 selects the broadcast method: ssse3 (pshufb), mmxext (pshufw) or
; anything else (plain MMX unpacks).
; In: r0 = src, r1 = stride. Uses r2 as counter and m0-m2.
83 | %macro PRED16x16_H 1 |
||

84 | cglobal pred16x16_horizontal_%1, 2,3 |
; 8 iterations, two rows per iteration.
85 | mov r2, 8 |
86 | %ifidn %1, ssse3 |
||

; Shuffle mask for pshufb; pb_3 is external (presumably every byte = 3, which
; would select byte 3 of the source for every output byte - confirm).
87 | mova m2, [pb_3] |
||

88 | %endif |
89 | .loop: |
||

; Load 4 bytes ending at the left neighbour: byte 3 is the pixel at [row-1].
90 | movd m0, [r0+r1*0-4] |
91 | movd m1, [r0+r1*1-4] |
92 | |||

93 | %ifidn %1, ssse3 |
||

94 | pshufb m0, m2 |
||

95 | pshufb m1, m2 |
||

96 | %else |
; Double every byte so word 3 holds the left pixel twice.
97 | punpcklbw m0, m0 |
98 | punpcklbw m1, m1 |
99 | %ifidn %1, mmxext |
||

; 0xff selects word 3 for all four words.
100 | pshufw m0, m0, 0xff |
101 | pshufw m1, m1, 0xff |
102 | %else |
||

; Without pshufw: replicate the top word with two high unpacks.
103 | punpckhwd m0, m0 |
||

104 | punpckhwd m1, m1 |
105 | punpckhdq m0, m0 |
106 | punpckhdq m1, m1 |
107 | %endif |
||

; Non-ssse3 variants store bytes 8..15 of each row separately.
108 | mova [r0+r1*0+8], m0 |
||

109 | mova [r0+r1*1+8], m1 |
110 | %endif |
111 | |||

; Store common to all variants, at offset 0 of each row.
112 | mova [r0+r1*0], m0 |
||

113 | mova [r0+r1*1], m1 |
114 | lea r0, [r0+r1*2] |
115 | dec r2 |
116 | jg .loop |
117 | REP_RET |
118 | %endmacro |
119 | |||

; mmx and mmxext are instantiated under INIT_MMX, ssse3 under INIT_XMM.
120 | INIT_MMX |
121 | PRED16x16_H mmx |
122 | PRED16x16_H mmxext |
123 | INIT_XMM |
||

124 | PRED16x16_H ssse3 |
125 | |||

126 | ;----------------------------------------------------------------------------- |
||

127 | ; void pred16x16_dc(uint8_t *src, int stride) |
128 | ;----------------------------------------------------------------------------- |
||

129 | |||

130 | 17dc7c7a | Jason Garrett-Glaser | %macro PRED16x16_DC 1 |

131 | 4af8cdfc | Jason Garrett-Glaser | cglobal pred16x16_dc_%1, 2,7 |

132 | mov r4, r0 |
133 | sub r0, r1 |
134 | pxor mm0, mm0 |
135 | pxor mm1, mm1 |
136 | psadbw mm0, [r0+0] |
137 | psadbw mm1, [r0+8] |
138 | dec r0 |
139 | movzx r5d, byte [r0+r1*1] |
140 | paddw mm0, mm1 |
141 | movd r6d, mm0 |
142 | lea r0, [r0+r1*2] |
143 | %rep 7 |
||

144 | movzx r2d, byte [r0+r1*0] |
145 | movzx r3d, byte [r0+r1*1] |
146 | add r5d, r2d |
147 | add r6d, r3d |
148 | lea r0, [r0+r1*2] |
149 | %endrep |
||

150 | movzx r2d, byte [r0+r1*0] |
151 | add r5d, r6d |
152 | lea r2d, [r2+r5+16] |
153 | shr r2d, 5 |
154 | 270a85d2 | Jason Garrett-Glaser | %ifidn %1, mmxext |

155 | 4af8cdfc | Jason Garrett-Glaser | movd m0, r2d |

156 | punpcklbw m0, m0 |
157 | pshufw m0, m0, 0 |
158 | %elifidn %1, sse2 |
||

159 | movd m0, r2d |
160 | punpcklbw m0, m0 |
161 | pshuflw m0, m0, 0 |
162 | punpcklqdq m0, m0 |
163 | %elifidn %1, ssse3 |
||

164 | pxor m1, m1 |
165 | movd m0, r2d |
166 | pshufb m0, m1 |
167 | %endif |
168 | |||

169 | %if mmsize==8 |
||

170 | mov r3d, 8 |
171 | .loop: |
||

172 | 17dc7c7a | Jason Garrett-Glaser | mova [r4+r1*0+0], m0 |

173 | mova [r4+r1*0+8], m0 |
174 | mova [r4+r1*1+0], m0 |
175 | mova [r4+r1*1+8], m0 |
176 | 4af8cdfc | Jason Garrett-Glaser | %else |

177 | mov r3d, 4 |
||

178 | .loop: |
||

179 | 17dc7c7a | Jason Garrett-Glaser | mova [r4+r1*0], m0 |

||

181 | 4af8cdfc | Jason Garrett-Glaser | lea r4, [r4+r1*2] |

182 | 17dc7c7a | Jason Garrett-Glaser | mova [r4+r1*0], m0 |

||

184 | 4af8cdfc | Jason Garrett-Glaser | %endif |

||

||

||

||

189 | %endmacro |
190 | |||

191 | INIT_MMX |
192 | 17dc7c7a | Jason Garrett-Glaser | PRED16x16_DC mmxext |

193 | 4af8cdfc | Jason Garrett-Glaser | INIT_XMM |

194 | 17dc7c7a | Jason Garrett-Glaser | PRED16x16_DC sse2 |

195 | PRED16x16_DC ssse3 |
196 | 4af8cdfc | Jason Garrett-Glaser | |

197 | ;----------------------------------------------------------------------------- |
198 | ; void pred16x16_tm_vp8(uint8_t *src, int stride) |
||

199 | ;----------------------------------------------------------------------------- |
||

200 | |||

201 | %macro PRED16x16_TM_MMX 1 |
||

202 | cglobal pred16x16_tm_vp8_%1, 2,5 |
||

||

||

||

||

||

||

||

||

||

||

||

||

215 | .loop: |
||

||

||

||

219 | %ifidn %1, mmx |
||

||

||

222 | %else |
||

223 | pshufw mm4, mm4, 0 |
224 | %endif |
||

225 | movq mm5, mm4 |
226 | movq mm6, mm4 |
227 | movq mm7, mm4 |
228 | paddw mm4, mm0 |
229 | paddw mm5, mm1 |
230 | paddw mm6, mm2 |
231 | paddw mm7, mm3 |
232 | packuswb mm4, mm5 |
233 | packuswb mm6, mm7 |
234 | movq [r0+r1+0], mm4 |
235 | movq [r0+r1+8], mm6 |
236 | add r0, r1 |
237 | dec r4d |
238 | jg .loop |
239 | REP_RET |
240 | %endmacro |
241 | |||

242 | PRED16x16_TM_MMX mmx |
243 | PRED16x16_TM_MMX mmxext |
244 | |||

245 | cglobal pred16x16_tm_vp8_sse2, 2,6,6 |
||

||

||

||

||

||

||

||

||

254 | .loop: |
||

||

||

||

||

||

||

||

||

||

||

||

||

||

||

||

||

||

||

||

||

||

||

||

||

279 | |||

280 | ;----------------------------------------------------------------------------- |
281 | dd68d4db | Ronald S. Bultje | ; void pred16x16_plane(uint8_t *src, int stride) |

282 | ;----------------------------------------------------------------------------- |
||

283 | |||

284 | %macro H264_PRED16x16_PLANE 3 |
||

285 | cglobal pred16x16_plane_%3_%1, 2, 7, %2 |
||

||

||

288 | |||

289 | movh m0, [r0+r1 -1] |
290 | %if mmsize == 8 |
||

291 | pxor m4, m4 |
||

||

||

294 | movh m3, [r0+r1 +12] |
||

295 | punpcklbw m0, m4 |
296 | punpcklbw m1, m4 |
297 | punpcklbw m2, m4 |
298 | punpcklbw m3, m4 |
299 | pmullw m0, [pw_m8tom1 ] |
300 | pmullw m1, [pw_m8tom1+8] |
301 | pmullw m2, [pw_1to8 ] |
302 | pmullw m3, [pw_1to8 +8] |
303 | paddw m0, m2 |
304 | paddw m1, m3 |
305 | %else ; mmsize == 16 |
||

306 | %ifidn %1, sse2 |
||

307 | pxor m2, m2 |
308 | movh m1, [r0+r1 +8] |
309 | punpcklbw m0, m2 |
310 | punpcklbw m1, m2 |
311 | pmullw m0, [pw_m8tom1] |
312 | pmullw m1, [pw_1to8] |
313 | paddw m0, m1 |
||

314 | %else ; ssse3 |
||

315 | movhps m0, [r0+r1 +8] |
316 | pmaddubsw m0, [plane_shuf] ; H coefficients |
317 | %endif |
318 | movhlps m1, m0 |
||

319 | %endif |
320 | paddw m0, m1 |
321 | %ifidn %1, mmx |
||

322 | mova m1, m0 |
||

323 | psrlq m1, 32 |
324 | %elifidn %1, mmx2 |
||

325 | pshufw m1, m0, 0xE |
326 | %else ; mmsize == 16 |
||

327 | pshuflw m1, m0, 0xE |
328 | %endif |
329 | paddw m0, m1 |
330 | %ifidn %1, mmx |
||

331 | mova m1, m0 |
||

332 | psrlq m1, 16 |
333 | %elifidn %1, mmx2 |
||

334 | pshufw m1, m0, 0x1 |
335 | %else |
336 | pshuflw m1, m0, 0x1 |
337 | %endif |
338 | paddw m0, m1 ; sum of H coefficients |
339 | |||

340 | %ifidn %3, h264 |
||

341 | pmullw m0, [pw_5] |
||

||

||

344 | %elifidn %3, rv40 |
||

345 | pmullw m0, [pw_5] |
346 | psraw m0, 6 |
347 | %elifidn %3, svq3 |
||

348 | movd r3, m0 |
349 | movsx r3, r3w |
350 | test r3, r3 |
351 | lea r4, [r3+3] |
352 | cmovs r3, r4 |
353 | sar r3, 2 ; H/4 |
354 | lea r3, [r3*5] ; 5*(H/4) |
355 | test r3, r3 |
356 | lea r4, [r3+15] |
357 | cmovs r3, r4 |
358 | sar r3, 4 ; (5*(H/4))/16 |
359 | movd m0, r3d |
360 | %endif |
361 | |||

362 | lea r4, [r0+r2*8-1] |
363 | lea r3, [r0+r2*4-1] |
364 | add r4, r2 |
365 | |||

366 | %ifdef ARCH_X86_64 |
||

367 | %define e_reg r11 |
368 | %else |
369 | %define e_reg r0 |
370 | %endif |
371 | |||

372 | movzx e_reg, byte [r3+r2*2 ] |
||

||

||

376 | movzx e_reg, byte [r3+r2 ] |
377 | movzx r6, byte [r4 ] |
378 | sub r6, e_reg |
379 | lea r5, [r5+r6*2] |
380 | |||

381 | movzx e_reg, byte [r3+r1 ] |
||

||

||

||

385 | |||

386 | movzx e_reg, byte [r3 ] |
||

387 | %ifdef ARCH_X86_64 |
388 | movzx r10, byte [r4+r2 ] |
389 | sub r10, e_reg |
390 | %else |
391 | movzx r6, byte [r4+r2 ] |
392 | sub r6, e_reg |
393 | lea r5, [r5+r6*4] |
394 | sub r5, r6 |
395 | %endif |
396 | |||

397 | lea e_reg, [r3+r1*4] |
||

||

399 | |||

400 | movzx r4, byte [e_reg+r2 ] |
401 | movzx r6, byte [r3 ] |
402 | sub r6, r4 |
403 | %ifdef ARCH_X86_64 |
404 | lea r6, [r10+r6*2] |
405 | lea r5, [r5+r6*2] |
406 | add r5, r6 |
407 | %else |
408 | lea r5, [r5+r6*4] |
409 | lea r5, [r5+r6*2] |
410 | %endif |
411 | |||

412 | movzx r4, byte [e_reg ] |
||

||

||

||

||

417 | %else |
||

||

||

||

||

||

423 | |||

424 | movzx r4, byte [e_reg+r1 ] |
||

||

||

427 | %ifdef ARCH_X86_64 |
||

||

||

430 | lea r5, [r5+r6*8] |
431 | |||

432 | movzx r4, byte [e_reg+r2*2] |
||

||

||

||

436 | add r5, r6 ; sum of V coefficients |
437 | |||

438 | %ifndef ARCH_X86_64 |
||

439 | mov r0, r0m |
440 | %endif |
441 | |||

442 | %ifidn %3, h264 |
||

443 | lea r5, [r5*5+32] |
444 | sar r5, 6 |
445 | %elifidn %3, rv40 |
||

446 | lea r5, [r5*5] |
447 | sar r5, 6 |
448 | %elifidn %3, svq3 |
||

449 | test r5, r5 |
450 | lea r6, [r5+3] |
451 | cmovs r5, r6 |
452 | sar r5, 2 ; V/4 |
453 | lea r5, [r5*5] ; 5*(V/4) |
454 | test r5, r5 |
455 | lea r6, [r5+15] |
456 | cmovs r5, r6 |
457 | sar r5, 4 ; (5*(V/4))/16 |
458 | %endif |
||

459 | |||

460 | movzx r4, byte [r0+r1 +15] |
461 | movzx r3, byte [r3+r2*2 ] |
462 | lea r3, [r3+r4+1] |
463 | shl r3, 4 |
464 | movd r1d, m0 |
465 | movsx r1d, r1w |
466 | add r1d, r5d |
467 | add r3d, r1d |
468 | shl r1d, 3 |
469 | sub r3d, r1d ; a |
470 | |||

471 | movd m1, r5d |
472 | movd m3, r3d |
473 | %ifidn %1, mmx |
||

474 | punpcklwd m0, m0 |
475 | punpcklwd m1, m1 |
476 | punpcklwd m3, m3 |
477 | punpckldq m0, m0 |
478 | punpckldq m1, m1 |
479 | punpckldq m3, m3 |
480 | %elifidn %1, mmx2 |
||

481 | pshufw m0, m0, 0x0 |
482 | pshufw m1, m1, 0x0 |
483 | pshufw m3, m3, 0x0 |
484 | %else |
||

485 | pshuflw m0, m0, 0x0 |
486 | pshuflw m1, m1, 0x0 |
487 | pshuflw m3, m3, 0x0 |
488 | punpcklqdq m0, m0 ; splat H (words) |
489 | punpcklqdq m1, m1 ; splat V (words) |
490 | punpcklqdq m3, m3 ; splat a (words) |
491 | %endif |
492 | %ifidn %3, svq3 |
||

493 | SWAP 0, 1 |
||

494 | %endif |
||

495 | mova m2, m0 |
496 | %if mmsize == 8 |
||

497 | mova m5, m0 |
||

498 | %endif |
499 | pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words) |
||

500 | %if mmsize == 16 |
||

501 | psllw m2, 3 |
502 | %else |
503 | psllw m5, 3 |
504 | psllw m2, 2 |
505 | mova m6, m5 |
506 | paddw m6, m2 |
507 | %endif |
508 | paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H |
||

509 | paddw m2, m0 ; a + {8,9,10,11,12,13,14,15}*H |
510 | %if mmsize == 8 |
511 | paddw m5, m0 ; a + {8,9,10,11}*H |
512 | paddw m6, m0 ; a + {12,13,14,15}*H |
513 | %endif |
514 | |||

||

||

||

||

||

||

||

||

||

||

||

||

||

528 | packuswb m3, m4 |
||

529 | mova [r0+8], m3 |
||

530 | %endif |
531 | paddw m0, m1 |
532 | paddw m2, m1 |
533 | %if mmsize == 8 |
||

534 | paddw m5, m1 |
535 | paddw m6, m1 |
536 | %endif |
537 | |||

538 | mova m3, m0 ; b[0..7] |
||

||

||

||

||

||

544 | %if mmsize == 8 |
||

545 | mova m3, m5 ; b[8..11] |
546 | mova m4, m6 ; b[12..15] |
547 | psraw m3, 5 |
548 | psraw m4, 5 |
549 | packuswb m3, m4 |
550 | mova [r0+r2+8], m3 |
551 | %endif |
552 | paddw m0, m1 |
||

553 | paddw m2, m1 |
554 | %if mmsize == 8 |
555 | paddw m5, m1 |
556 | paddw m6, m1 |
557 | %endif |
558 | |||

559 | lea r0, [r0+r2*2] |
560 | dec r4 |
561 | jg .loop |
562 | REP_RET |
563 | %endmacro |
564 | |||

565 | INIT_MMX |
566 | H264_PRED16x16_PLANE mmx, 0, h264 |
567 | H264_PRED16x16_PLANE mmx, 0, rv40 |
568 | H264_PRED16x16_PLANE mmx, 0, svq3 |
569 | H264_PRED16x16_PLANE mmx2, 0, h264 |
570 | H264_PRED16x16_PLANE mmx2, 0, rv40 |
571 | H264_PRED16x16_PLANE mmx2, 0, svq3 |
572 | INIT_XMM |
573 | H264_PRED16x16_PLANE sse2, 8, h264 |
574 | H264_PRED16x16_PLANE sse2, 8, rv40 |
575 | H264_PRED16x16_PLANE sse2, 8, svq3 |
576 | H264_PRED16x16_PLANE ssse3, 8, h264 |
577 | H264_PRED16x16_PLANE ssse3, 8, rv40 |
578 | H264_PRED16x16_PLANE ssse3, 8, svq3 |
579 | |||

580 | ;----------------------------------------------------------------------------- |
||

||

582 | ;----------------------------------------------------------------------------- |
||

583 | |||

584 | %macro H264_PRED8x8_PLANE 2 |
||

585 | cglobal pred8x8_plane_%1, 2, 7, %2 |
||

||

||

588 | |||

589 | movd m0, [r0+r1 -1] |
590 | %if mmsize == 8 |
||

591 | pxor m2, m2 |
592 | movh m1, [r0+r1 +4 ] |
593 | punpcklbw m0, m2 |
594 | punpcklbw m1, m2 |
595 | pmullw m0, [pw_m4to4] |
596 | pmullw m1, [pw_m4to4+8] |
597 | %else ; mmsize == 16 |
||

598 | %ifidn %1, sse2 |
||

599 | pxor m2, m2 |
600 | movd m1, [r0+r1 +4] |
601 | punpckldq m0, m1 |
602 | punpcklbw m0, m2 |
603 | pmullw m0, [pw_m4to4] |
604 | %else ; ssse3 |
||

605 | movhps m0, [r0+r1 +4] ; this reads 4 bytes more than necessary |
||

606 | pmaddubsw m0, [plane8_shuf] ; H coefficients |
607 | %endif |
608 | movhlps m1, m0 |
609 | %endif |
610 | paddw m0, m1 |
611 | |||

612 | %ifnidn %1, ssse3 |
||

613 | %ifidn %1, mmx |
||

614 | mova m1, m0 |
615 | psrlq m1, 32 |
616 | %elifidn %1, mmx2 |
||

617 | pshufw m1, m0, 0xE |
618 | %else ; mmsize == 16 |
||

619 | pshuflw m1, m0, 0xE |
620 | %endif |
621 | paddw m0, m1 |
622 | %endif ; !ssse3 |
||

623 | |||

624 | %ifidn %1, mmx |
||

625 | mova m1, m0 |
626 | psrlq m1, 16 |
627 | %elifidn %1, mmx2 |
||

628 | pshufw m1, m0, 0x1 |
629 | %else |
||

630 | pshuflw m1, m0, 0x1 |
631 | %endif |
632 | paddw m0, m1 ; sum of H coefficients |
633 | |||

634 | pmullw m0, [pw_17] |
635 | paddw m0, [pw_16] |
636 | psraw m0, 5 |
637 | |||

638 | lea r4, [r0+r2*4-1] |
639 | lea r3, [r0 -1] |
640 | add r4, r2 |
641 | |||

642 | %ifdef ARCH_X86_64 |
||

643 | %define e_reg r11 |
644 | %else |
645 | %define e_reg r0 |
646 | %endif |
647 | |||

648 | movzx e_reg, byte [r3+r2*2 ] |
649 | movzx r5, byte [r4+r1 ] |
650 | sub r5, e_reg |
651 | |||

652 | movzx e_reg, byte [r3 ] |
653 | %ifdef ARCH_X86_64 |
||

654 | movzx r10, byte [r4+r2 ] |
655 | sub r10, e_reg |
656 | sub r5, r10 |
657 | %else |
||

658 | movzx r6, byte [r4+r2 ] |
659 | sub r6, e_reg |
660 | lea r5, [r5+r6*4] |
661 | sub r5, r6 |
662 | %endif |
663 | |||

664 | movzx e_reg, byte [r3+r1 ] |
665 | movzx r6, byte [r4+r2*2 ] |
666 | sub r6, e_reg |
667 | %ifdef ARCH_X86_64 |
||

668 | add r6, r10 |
669 | %endif |
670 | lea r5, [r5+r6*4] |
671 | |||

672 | movzx e_reg, byte [r3+r2 ] |
673 | movzx r6, byte [r4 ] |
674 | sub r6, e_reg |
675 | lea r6, [r5+r6*2] |
676 | |||

677 | lea r5, [r6*9+16] |
678 | lea r5, [r5+r6*8] |
679 | sar r5, 5 |
680 | |||

681 | %ifndef ARCH_X86_64 |
682 | mov r0, r0m |
683 | %endif |
684 | |||

685 | movzx r3, byte [r4+r2*2 ] |
686 | movzx r4, byte [r0+r1 +7] |
687 | lea r3, [r3+r4+1] |
688 | shl r3, 4 |
689 | movd r1d, m0 |
690 | movsx r1d, r1w |
691 | add r1d, r5d |
692 | sub r3d, r1d |
693 | add r1d, r1d |
694 | sub r3d, r1d ; a |
695 | |||

696 | movd m1, r5d |
697 | movd m3, r3d |
698 | %ifidn %1, mmx |
||

699 | punpcklwd m0, m0 |
700 | punpcklwd m1, m1 |
701 | punpcklwd m3, m3 |
702 | punpckldq m0, m0 |
703 | punpckldq m1, m1 |
704 | punpckldq m3, m3 |
705 | %elifidn %1, mmx2 |
||

706 | pshufw m0, m0, 0x0 |
707 | pshufw m1, m1, 0x0 |
708 | pshufw m3, m3, 0x0 |
709 | %else |
||

710 | pshuflw m0, m0, 0x0 |
711 | pshuflw m1, m1, 0x0 |
712 | pshuflw m3, m3, 0x0 |
713 | punpcklqdq m0, m0 ; splat H (words) |
714 | punpcklqdq m1, m1 ; splat V (words) |
715 | punpcklqdq m3, m3 ; splat a (words) |
716 | %endif |
717 | %if mmsize == 8 |
||

718 | mova m2, m0 |
719 | %endif |
720 | pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words) |
721 | paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H |
722 | %if mmsize == 8 |
||

723 | psllw m2, 2 |
724 | paddw m2, m0 ; a + {4,5,6,7}*H |
725 | %endif |
726 | |||

727 | mov r4, 4 |
728 | ALIGN 16 |
729 | .loop |
||

730 | %if mmsize == 16 |
||

731 | mova m3, m0 ; b[0..7] |
732 | paddw m0, m1 |
733 | psraw m3, 5 |
734 | mova m4, m0 ; V+b[0..7] |
735 | paddw m0, m1 |
736 | psraw m4, 5 |
737 | packuswb m3, m4 |
738 | movh [r0], m3 |
739 | movhps [r0+r2], m3 |
740 | %else ; mmsize == 8 |
||

741 | mova m3, m0 ; b[0..3] |
742 | mova m4, m2 ; b[4..7] |
743 | paddw m0, m1 |
744 | paddw m2, m1 |
745 | psraw m3, 5 |
746 | psraw m4, 5 |
747 | mova m5, m0 ; V+b[0..3] |
748 | mova m6, m2 ; V+b[4..7] |
749 | paddw m0, m1 |
750 | paddw m2, m1 |
751 | psraw m5, 5 |
752 | psraw m6, 5 |
753 | packuswb m3, m4 |
754 | packuswb m5, m6 |
755 | mova [r0], m3 |
756 | mova [r0+r2], m5 |
757 | %endif |
758 | |||

759 | lea r0, [r0+r2*2] |
760 | dec r4 |
761 | jg .loop |
762 | REP_RET |
763 | %endmacro |
764 | |||

765 | INIT_MMX |
766 | H264_PRED8x8_PLANE mmx, 0 |
767 | H264_PRED8x8_PLANE mmx2, 0 |
768 | INIT_XMM |
769 | H264_PRED8x8_PLANE sse2, 8 |
770 | H264_PRED8x8_PLANE ssse3, 8 |
771 | |||

772 | ;----------------------------------------------------------------------------- |
773 | 4af8cdfc | Jason Garrett-Glaser | ; void pred8x8_vertical(uint8_t *src, int stride) |

774 | ;----------------------------------------------------------------------------- |
775 | |||

776 | cglobal pred8x8_vertical_mmx, 2,2 |
||

||

||

779 | %rep 3 |
||

780 | movq [r0+r1*1], mm0 |
781 | movq [r0+r1*2], mm0 |
782 | lea r0, [r0+r1*2] |
783 | %endrep |
||

784 | movq [r0+r1*1], mm0 |
785 | movq [r0+r1*2], mm0 |
786 | RET |
787 | |||

788 | ;----------------------------------------------------------------------------- |
789 | ; void pred8x8_horizontal(uint8_t *src, int stride) |
||

790 | ;----------------------------------------------------------------------------- |
791 | |||

792 | %macro PRED8x8_H 1 |
||

793 | cglobal pred8x8_horizontal_%1, 2,3 |
||

||

||

796 | mova m2, [pb_3] |
797 | %endif |
798 | .loop: |
||

799 | movd m0, [r0+r1*0-4] |
800 | movd m1, [r0+r1*1-4] |
801 | %ifidn %1, ssse3 |
||

802 | pshufb m0, m2 |
803 | pshufb m1, m2 |
804 | %else |
||

805 | punpcklbw m0, m0 |
806 | punpcklbw m1, m1 |
807 | %ifidn %1, mmxext |
||

808 | pshufw m0, m0, 0xff |
809 | pshufw m1, m1, 0xff |
810 | %else |
||

811 | punpckhwd m0, m0 |
812 | punpckhwd m1, m1 |
813 | punpckhdq m0, m0 |
814 | punpckhdq m1, m1 |
815 | %endif |
816 | %endif |
||

817 | mova [r0+r1*0], m0 |
818 | mova [r0+r1*1], m1 |
819 | lea r0, [r0+r1*2] |
820 | dec r2 |
821 | jg .loop |
822 | REP_RET |
823 | %endmacro |
824 | |||

825 | INIT_MMX |
||

826 | PRED8x8_H mmx |
||

827 | PRED8x8_H mmxext |
||

828 | PRED8x8_H ssse3 |
||

829 | |||

830 | ;----------------------------------------------------------------------------- |
||

831 | ; void pred8x8_dc_rv40(uint8_t *src, int stride) |
||

832 | ;----------------------------------------------------------------------------- |
||

833 | |||

; pred8x8_dc_rv40 (MMXEXT): fills the 8x8 block with
; (sum of 8 top pixels + sum of 8 left pixels + 8) >> 4.
; In: r0 = src, r1 = stride. Scratch: r2-r6, mm0.
834 | 270a85d2 | Jason Garrett-Glaser | cglobal pred8x8_dc_rv40_mmxext, 2,7 |

; r4 keeps the original src for the store loop at the end.
835 | 4af8cdfc | Jason Garrett-Glaser | mov r4, r0 |

836 | sub r0, r1 |
||

; psadbw against zero yields the sum of the 8 top-row bytes.
837 | pxor mm0, mm0 |
||

838 | psadbw mm0, [r0] |
||

; Step one byte left: r0+stride now addresses the left neighbour of row 0.
839 | dec r0 |
||

840 | movzx r5d, byte [r0+r1*1] |
||

; r6d = top-row sum; r5d and r6d are two independent accumulators.
841 | movd r6d, mm0 |
||

842 | lea r0, [r0+r1*2] |
||

; Left neighbours of rows 1..6, two per repetition.
843 | %rep 3 |
||

844 | movzx r2d, byte [r0+r1*0] |
||

845 | movzx r3d, byte [r0+r1*1] |
||

846 | add r5d, r2d |
||

847 | add r6d, r3d |
||

848 | lea r0, [r0+r1*2] |
||

849 | %endrep |
||

; Left neighbour of row 7, then combine everything with rounding.
850 | movzx r2d, byte [r0+r1*0] |
||

851 | add r5d, r6d |
||

852 | lea r2d, [r2+r5+8] |
||

853 | shr r2d, 4 |
||

; Broadcast the DC byte to all 8 bytes of mm0.
854 | movd mm0, r2d |
||

855 | punpcklbw mm0, mm0 |
||

856 | pshufw mm0, mm0, 0 |
||

; 4 iterations, two 8-byte rows each.
857 | mov r3d, 4 |
||

858 | .loop: |
||

859 | movq [r4+r1*0], mm0 |
||

860 | movq [r4+r1*1], mm0 |
||

861 | lea r4, [r4+r1*2] |
||

862 | dec r3d |
||

863 | jg .loop |
||

864 | REP_RET |
||

865 | |||

866 | ;----------------------------------------------------------------------------- |
||

867 | ; void pred8x8_tm_vp8(uint8_t *src, int stride) |
||

868 | ;----------------------------------------------------------------------------- |
||

869 | |||

; PRED8x8_TM_MMX <cpu>: emits pred8x8_tm_vp8_<cpu>. Each output pixel is
; top[x] + left[y] - topleft, packed with unsigned saturation.
; %1 == mmx uses unpacks to broadcast a word, any other value uses pshufw.
; In: r0 = src, r1 = stride. Scratch: r2-r5, mm0-mm5, mm7.
870 | %macro PRED8x8_TM_MMX 1 |
||

871 | cglobal pred8x8_tm_vp8_%1, 2,6 |
||

; r0 -> row above the block.
872 | sub r0, r1 |
||

873 | pxor mm7, mm7 |
||

; mm0 = top[0..3], mm1 = top[4..7], both zero-extended to words.
874 | movq mm0, [r0] |
||

875 | movq mm1, mm0 |
||

876 | punpcklbw mm0, mm7 |
||

877 | punpckhbw mm1, mm7 |
||

; r4d = top-left pixel; r5d = 4 iterations of two rows.
878 | movzx r4d, byte [r0-1] |
||

879 | mov r5d, 4 |
||

880 | .loop: |
||

; Left neighbours of the next two rows, minus top-left.
881 | movzx r2d, byte [r0+r1*1-1] |
||

882 | movzx r3d, byte [r0+r1*2-1] |
||

883 | sub r2d, r4d |
||

884 | sub r3d, r4d |
||

885 | movd mm2, r2d |
||

886 | movd mm4, r3d |
||

; Broadcast the low word of mm2/mm4 to all four words.
887 | %ifidn %1, mmx |
||

888 | punpcklwd mm2, mm2 |
||

889 | punpcklwd mm4, mm4 |
||

890 | punpckldq mm2, mm2 |
||

891 | punpckldq mm4, mm4 |
||

892 | %else |
||

893 | pshufw mm2, mm2, 0 |
||

894 | pshufw mm4, mm4, 0 |
||

895 | %endif |
||

; Copies for the right half of each row, then add the top row.
896 | movq mm3, mm2 |
||

897 | movq mm5, mm4 |
||

898 | paddw mm2, mm0 |
||

899 | paddw mm3, mm1 |
||

900 | paddw mm4, mm0 |
||

901 | paddw mm5, mm1 |
||

; Saturate to bytes: mm2 = first row, mm4 = second row.
902 | packuswb mm2, mm3 |
||

903 | packuswb mm4, mm5 |
||

904 | movq [r0+r1*1], mm2 |
||

905 | movq [r0+r1*2], mm4 |
||

906 | lea r0, [r0+r1*2] |
||

907 | dec r5d |
||

908 | jg .loop |
||

909 | REP_RET |
||

910 | %endmacro |
||

911 | |||

912 | PRED8x8_TM_MMX mmx |
||

913 | PRED8x8_TM_MMX mmxext |
||

914 | |||

; pred8x8_tm_vp8 (SSE2): pixel = top[x] + left[y] - topleft with unsigned
; saturation; one XMM register holds a full 8-pixel row as words.
; In: r0 = src, r1 = stride. Scratch: r2-r5, xmm0-xmm3.
915 | cglobal pred8x8_tm_vp8_sse2, 2,6,4 |
||

; r0 -> row above the block.
916 | sub r0, r1 |
||

917 | pxor xmm1, xmm1 |
||

; xmm0 = top[0..7] zero-extended to words.
918 | movq xmm0, [r0] |
||

919 | punpcklbw xmm0, xmm1 |
||

; r4d = top-left pixel; r5d = 4 iterations of two rows.
920 | movzx r4d, byte [r0-1] |
||

921 | mov r5d, 4 |
||

922 | .loop: |
||

; left[y] - topleft for the next two rows.
923 | movzx r2d, byte [r0+r1*1-1] |
||

924 | movzx r3d, byte [r0+r1*2-1] |
||

925 | sub r2d, r4d |
||

926 | sub r3d, r4d |
||

927 | movd xmm2, r2d |
||

928 | movd xmm3, r3d |
||

; Broadcast the low word to all 8 words (low 4 first, then duplicate qword).
929 | pshuflw xmm2, xmm2, 0 |
||

930 | pshuflw xmm3, xmm3, 0 |
||

931 | punpcklqdq xmm2, xmm2 |
||

932 | punpcklqdq xmm3, xmm3 |
||

933 | paddw xmm2, xmm0 |
||

934 | paddw xmm3, xmm0 |
||

; After packing: low 8 bytes = first row, high 8 bytes = second row.
935 | packuswb xmm2, xmm3 |
||

936 | movq [r0+r1*1], xmm2 |
||

937 | movhps [r0+r1*2], xmm2 |
||

938 | lea r0, [r0+r1*2] |
||

939 | dec r5d |
||

940 | jg .loop |
||

941 | REP_RET |
||

942 | |||

; pred8x8_tm_vp8 (SSSE3): same result as the SSE2 version, but the left and
; top-left pixels are broadcast with pshufb instead of going through GPRs.
; In: r0 = src, r1 = stride. Scratch: r2, xmm0-xmm5.
943 | cglobal pred8x8_tm_vp8_ssse3, 2,3,6 |
||

; r0 -> row above the block.
944 | sub r0, r1 |
||

; tm_shuf is (0x03, 0x80) x 8: each output word = source byte 3, zero-extended.
945 | movdqa xmm4, [tm_shuf] |
||

946 | pxor xmm1, xmm1 |
||

; xmm0 = top[0..7] as words.
947 | movq xmm0, [r0] |
||

948 | punpcklbw xmm0, xmm1 |
||

; The dword at [r0-4] has the top-left pixel in byte 3; xmm5 = it in 8 words.
949 | movd xmm5, [r0-4] |
||

950 | pshufb xmm5, xmm4 |
||

; 4 iterations of two rows.
951 | mov r2d, 4 |
||

952 | .loop: |
||

; Byte 3 of each dword is the left neighbour of the corresponding row.
953 | movd xmm2, [r0+r1*1-4] |
||

954 | movd xmm3, [r0+r1*2-4] |
||

955 | pshufb xmm2, xmm4 |
||

956 | pshufb xmm3, xmm4 |
||

; left - topleft + top, per word.
957 | psubw xmm2, xmm5 |
||

958 | psubw xmm3, xmm5 |
||

959 | paddw xmm2, xmm0 |
||

960 | paddw xmm3, xmm0 |
||

; Low 8 bytes = first row, high 8 bytes = second row.
961 | packuswb xmm2, xmm3 |
||

962 | movq [r0+r1*1], xmm2 |
||

963 | movhps [r0+r1*2], xmm2 |
||

964 | lea r0, [r0+r1*2] |
||

965 | dec r2d |
||

966 | jg .loop |
||

967 | REP_RET |
||

968 | 270a85d2 | Jason Garrett-Glaser | |

969 | 8b746bb4 | Jason Garrett-Glaser | ;----------------------------------------------------------------------------- |

970 | ; void pred4x4_dc_mmxext(uint8_t *src, const uint8_t *topright, int stride) |
||

971 | ;----------------------------------------------------------------------------- |
||

972 | |||

; pred4x4_dc (MMXEXT): fills the 4x4 block with
; (sum of 4 top pixels + sum of 4 left pixels + 4) >> 3.
; In: r0 = src, r1 = topright (never read; r1 is reused as scratch),
;     r2 = stride. Scratch: r3, r4, mm0, mm7.
973 | 270a85d2 | Jason Garrett-Glaser | cglobal pred4x4_dc_mmxext, 3,5 |

974 | pxor mm7, mm7 |
||

; r4 = src (row 0); r0 -> row above the block.
975 | mov r4, r0 |
||

976 | sub r0, r2 |
||

; movd zero-fills the upper half, so psadbw sums exactly the 4 top bytes.
977 | movd mm0, [r0] |
||

978 | psadbw mm0, mm7 |
||

; Add the four left neighbours one by one into r3d.
979 | movzx r1d, byte [r0+r2*1-1] |
||

980 | movd r3d, mm0 |
||

981 | add r3d, r1d |
||

982 | movzx r1d, byte [r0+r2*2-1] |
||

; r0 advances two rows and now addresses row 1 of the block.
983 | lea r0, [r0+r2*2] |
||

984 | add r3d, r1d |
||

985 | movzx r1d, byte [r0+r2*1-1] |
||

986 | add r3d, r1d |
||

987 | movzx r1d, byte [r0+r2*2-1] |
||

988 | add r3d, r1d |
||

; Round and divide by 8.
989 | add r3d, 4 |
||

990 | shr r3d, 3 |
||

; Multiplying by 0x01010101 copies the DC byte into all 4 bytes of r3d.
991 | imul r3d, 0x01010101 |
||

; Rows 0..3: r4 is row 0, r0 is row 1.
992 | mov [r4+r2*0], r3d |
||

993 | mov [r0+r2*0], r3d |
||

994 | mov [r0+r2*1], r3d |
||

995 | mov [r0+r2*2], r3d |
||

996 | RET |
||

997 | fb9927ad | Jason Garrett-Glaser | |

998 | ;----------------------------------------------------------------------------- |
||

999 | ; void pred4x4_tm_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride) |
||

1000 | ;----------------------------------------------------------------------------- |
||

1001 | |||

; PRED4x4_TM_MMX <cpu>: emits pred4x4_tm_vp8_<cpu>. Each output pixel is
; top[x] + left[y] - topleft, packed with unsigned saturation.
; %1 == mmx uses unpacks to broadcast a word, any other value uses pshufw.
; In: r0 = src, r1 = topright (never read; r1 is reused as scratch),
;     r2 = stride. Scratch: r3-r5, mm0, mm2, mm4, mm7.
1002 | %macro PRED4x4_TM_MMX 1 |
||

1003 | cglobal pred4x4_tm_vp8_%1, 3,6 |
||

; r0 -> row above the block.
1004 | sub r0, r2 |
||

1005 | pxor mm7, mm7 |
||

; mm0 = top[0..3] zero-extended to words.
1006 | movd mm0, [r0] |
||

1007 | punpcklbw mm0, mm7 |
||

; r4d = top-left pixel; r5d = 2 iterations of two rows.
1008 | movzx r4d, byte [r0-1] |
||

1009 | mov r5d, 2 |
||

1010 | .loop: |
||

; left[y] - topleft for the next two rows.
1011 | movzx r1d, byte [r0+r2*1-1] |
||

1012 | movzx r3d, byte [r0+r2*2-1] |
||

1013 | sub r1d, r4d |
||

1014 | sub r3d, r4d |
||

1015 | movd mm2, r1d |
||

1016 | movd mm4, r3d |
||

; Broadcast the low word to all four words.
1017 | %ifidn %1, mmx |
||

1018 | punpcklwd mm2, mm2 |
||

1019 | punpcklwd mm4, mm4 |
||

1020 | punpckldq mm2, mm2 |
||

1021 | punpckldq mm4, mm4 |
||

1022 | %else |
||

1023 | pshufw mm2, mm2, 0 |
||

1024 | pshufw mm4, mm4, 0 |
||

1025 | %endif |
||

1026 | paddw mm2, mm0 |
||

1027 | paddw mm4, mm0 |
||

; Saturate to bytes; only the low 4 bytes of each result are stored.
1028 | packuswb mm2, mm2 |
||

1029 | packuswb mm4, mm4 |
||

1030 | movd [r0+r2*1], mm2 |
||

1031 | movd [r0+r2*2], mm4 |
||

1032 | lea r0, [r0+r2*2] |
||

1033 | dec r5d |
||

1034 | jg .loop |
||

1035 | REP_RET |
||

1036 | %endmacro |
||

1037 | |||

1038 | PRED4x4_TM_MMX mmx |
||

1039 | PRED4x4_TM_MMX mmxext |
||

1040 | |||

; pred4x4_tm_vp8 (SSSE3 on MMX registers): fully unrolled; all four rows are
; computed as top[x] + left[y] - topleft with unsigned saturation.
; In: r0 = src, r1 = topright (never read; r1 is overwritten below),
;     r2 = stride. Scratch: mm0-mm7.
1041 | cglobal pred4x4_tm_vp8_ssse3, 3,3 |
||

; r0 -> row above the block.
1042 | sub r0, r2 |
||

; Low 8 bytes of tm_shuf: (0x03, 0x80) x 4, i.e. each output word = source
; byte 3, zero-extended.
1043 | movq mm6, [tm_shuf] |
||

1044 | pxor mm1, mm1 |
||

; mm0 = top[0..3] as words.
1045 | movd mm0, [r0] |
||

1046 | punpcklbw mm0, mm1 |
||

; Byte 3 of the dword at [r0-4] is the top-left pixel; mm7 = it in 4 words.
1047 | movd mm7, [r0-4] |
||

1048 | pshufb mm7, mm6 |
||

; r1 -> two rows below r0, so r1+stride and r1+2*stride are rows 2 and 3.
1049 | lea r1, [r0+r2*2] |
||

; Load the dwords ending at the left neighbour of rows 0..3.
1050 | movd mm2, [r0+r2*1-4] |
||

1051 | movd mm3, [r0+r2*2-4] |
||

1052 | movd mm4, [r1+r2*1-4] |
||

1053 | movd mm5, [r1+r2*2-4] |
||

; Broadcast each left pixel to 4 words.
1054 | pshufb mm2, mm6 |
||

1055 | pshufb mm3, mm6 |
||

1056 | pshufb mm4, mm6 |
||

1057 | pshufb mm5, mm6 |
||

; left - topleft
1058 | psubw mm2, mm7 |
||

1059 | psubw mm3, mm7 |
||

1060 | psubw mm4, mm7 |
||

1061 | psubw mm5, mm7 |
||

; + top
1062 | paddw mm2, mm0 |
||

1063 | paddw mm3, mm0 |
||

1064 | paddw mm4, mm0 |
||

1065 | paddw mm5, mm0 |
||

; Saturate to bytes and store the low 4 bytes of each row.
1066 | packuswb mm2, mm2 |
||

1067 | packuswb mm3, mm3 |
||

1068 | packuswb mm4, mm4 |
||

1069 | packuswb mm5, mm5 |
||

1070 | movd [r0+r2*1], mm2 |
||

1071 | movd [r0+r2*2], mm3 |
||

1072 | movd [r1+r2*1], mm4 |
||

1073 | movd [r1+r2*2], mm5 |
||

1074 | RET |
||

1075 | bc14f04b | Jason Garrett-Glaser | |

1076 | ; dest, left, right, src, tmp |
||

1077 | ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2 |
||

; PRED4x4_LOWPASS dest, left, right, src, tmp
; Per byte: dest = (left + 2*src + right + 2) >> 2, built from two pavgb
; steps with a rounding correction in between.
; Overwrites %1 (result), %2, %3 and %5; %4 is only read. Reads [pb_1].
1078 | %macro PRED4x4_LOWPASS 5 |
||

; Keep the original left value; it is needed after pavgb overwrites %2.
1079 | mova %5, %2 |
||

; %2 = (left + right + 1) >> 1  (pavgb rounds up)
1080 | pavgb %2, %3 |
||

; %3 = left ^ right; after the pand below only bit 0 survives, which is 1
; exactly where pavgb rounded up (pb_1 is external, presumably all bytes = 1).
1081 | pxor %3, %5 |
||

1082 | mova %1, %4 |
||

1083 | pand %3, [pb_1] |
||

; %2 = (left + right) >> 1, i.e. rounded down
1084 | psubusb %2, %3 |
||

; %1 = (src + %2 + 1) >> 1
1085 | pavgb %1, %2 |
||

1086 | %endmacro |
||

1087 | |||

1088 | ;----------------------------------------------------------------------------- |
||

1089 | ; void pred4x4_vertical_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride) |
||

1090 | ;----------------------------------------------------------------------------- |
||

1091 | |||

; pred4x4_vertical_vp8 (MMXEXT): low-pass filters the row above the block
; with PRED4x4_LOWPASS and writes the filtered 4 bytes to all four rows.
; In: r0 = src, r1 = topright (read once, then reused as a row pointer),
;     r2 = stride. Scratch: m0-m4.
1092 | INIT_MMX |
||

1093 | cglobal pred4x4_vertical_vp8_mmxext, 3,3 |
||

; r0 -> row above the block.
1094 | sub r0, r2 |
||

; m1 = top row shifted by one pixel to the left (starts at the top-left pixel).
1095 | movd m1, [r0-1] |
||

1096 | movd m0, [r0] |
||

1097 | mova m2, m0 ;t0 t1 t2 t3 |
||

; Append 4 bytes read through the topright pointer.
1098 | punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7 |
||

; r1 -> two rows below r0, so r1+stride and r1+2*stride are rows 2 and 3.
1099 | lea r1, [r0+r2*2] |
||

1100 | psrlq m0, 8 ;t1 t2 t3 t4 |
||

; m3 = lowpass(left = m1, right = m0, src = m2); m4 is the temporary.
1101 | PRED4x4_LOWPASS m3, m1, m0, m2, m4 |
||

; The same filtered row goes to rows 0..3.
1102 | movd [r0+r2*1], m3 |
||

1103 | movd [r0+r2*2], m3 |
||

1104 | movd [r1+r2*1], m3 |
||

1105 | movd [r1+r2*2], m3 |
||

1106 | RET |
||

1106 | RET |