## ffmpeg / libavcodec / x86 / h264_intrapred.asm @ dd68d4db

History | View | Annotate | Download (26.6 KB)

1 | 4af8cdfc | Jason Garrett-Glaser | ;****************************************************************************** |
---|---|---|---|

2 | ;* H.264 intra prediction asm optimizations |
||

3 | ;* Copyright (c) 2010 Jason Garrett-Glaser |
||

4 | ;* |
||

5 | ;* This file is part of FFmpeg. |
||

6 | ;* |
||

7 | ;* FFmpeg is free software; you can redistribute it and/or |
||

8 | ;* modify it under the terms of the GNU Lesser General Public |
||

9 | ;* License as published by the Free Software Foundation; either |
||

10 | ;* version 2.1 of the License, or (at your option) any later version. |
||

11 | ;* |
||

12 | ;* FFmpeg is distributed in the hope that it will be useful, |
||

13 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||

14 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||

15 | ;* Lesser General Public License for more details. |
||

16 | ;* |
||

17 | ;* You should have received a copy of the GNU Lesser General Public |
||

18 | ;* License along with FFmpeg; if not, write to the Free Software |
||

19 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||

20 | ;****************************************************************************** |
||

21 | |||

22 | %include "x86inc.asm" |
||

23 | |||

24 | SECTION_RODATA |
||

25 | |||

26 | tm_shuf: times 8 db 0x03, 0x80 |
||

27 | dd68d4db | Ronald S. Bultje | plane_shuf: db -8, -7, -6, -5, -4, -3, -2, -1 |

28 | db 1, 2, 3, 4, 5, 6, 7, 8 |
||

29 | plane8_shuf: db -4, -3, -2, -1, 0, 0, 0, 0 |
||

30 | db 1, 2, 3, 4, 0, 0, 0, 0 |
||

31 | pw_0to7: dw 0, 1, 2, 3, 4, 5, 6, 7 |
||

32 | pw_1to8: dw 1, 2, 3, 4, 5, 6, 7, 8 |
||

33 | pw_m8tom1: dw -8, -7, -6, -5, -4, -3, -2, -1 |
||

34 | pw_m4to4: dw -4, -3, -2, -1, 1, 2, 3, 4 |
||

35 | 4af8cdfc | Jason Garrett-Glaser | |

36 | SECTION .text |
||

37 | |||

38 | bc14f04b | Jason Garrett-Glaser | cextern pb_1 |

39 | 4af8cdfc | Jason Garrett-Glaser | cextern pb_3 |

40 | dd68d4db | Ronald S. Bultje | cextern pw_5 |

41 | cextern pw_16 |
||

42 | cextern pw_17 |
||

43 | cextern pw_32 |
||

44 | 4af8cdfc | Jason Garrett-Glaser | |

45 | ;----------------------------------------------------------------------------- |
||

46 | ; void pred16x16_vertical(uint8_t *src, int stride) |
||

47 | ;----------------------------------------------------------------------------- |
||

48 | |||

49 | cglobal pred16x16_vertical_mmx, 2,3 |
||

50 | sub r0, r1 |
||

51 | mov r2, 8 |
||

52 | movq mm0, [r0+0] |
||

53 | movq mm1, [r0+8] |
||

54 | .loop: |
||

55 | movq [r0+r1*1+0], mm0 |
||

56 | movq [r0+r1*1+8], mm1 |
||

57 | movq [r0+r1*2+0], mm0 |
||

58 | movq [r0+r1*2+8], mm1 |
||

59 | lea r0, [r0+r1*2] |
||

60 | dec r2 |
||

61 | jg .loop |
||

62 | REP_RET |
||

63 | |||

64 | cglobal pred16x16_vertical_sse, 2,3 |
||

65 | sub r0, r1 |
||

66 | mov r2, 4 |
||

67 | movaps xmm0, [r0] |
||

68 | .loop: |
||

69 | movaps [r0+r1*1], xmm0 |
||

70 | movaps [r0+r1*2], xmm0 |
||

71 | lea r0, [r0+r1*2] |
||

72 | movaps [r0+r1*1], xmm0 |
||

73 | movaps [r0+r1*2], xmm0 |
||

74 | lea r0, [r0+r1*2] |
||

75 | dec r2 |
||

76 | jg .loop |
||

77 | REP_RET |
||

78 | |||

79 | ;----------------------------------------------------------------------------- |
||

80 | ; void pred16x16_horizontal(uint8_t *src, int stride) |
||

81 | ;----------------------------------------------------------------------------- |
||

82 | |||

83 | %macro PRED16x16_H 1 |
||

84 | cglobal pred16x16_horizontal_%1, 2,3 |
||

85 | mov r2, 8 |
||

86 | %ifidn %1, ssse3 |
||

87 | mova m2, [pb_3] |
||

88 | %endif |
||

89 | .loop: |
||

90 | movd m0, [r0+r1*0-4] |
||

91 | movd m1, [r0+r1*1-4] |
||

92 | |||

93 | %ifidn %1, ssse3 |
||

94 | pshufb m0, m2 |
||

95 | pshufb m1, m2 |
||

96 | %else |
||

97 | punpcklbw m0, m0 |
||

98 | punpcklbw m1, m1 |
||

99 | %ifidn %1, mmxext |
||

100 | pshufw m0, m0, 0xff |
||

101 | pshufw m1, m1, 0xff |
||

102 | %else |
||

103 | punpckhwd m0, m0 |
||

104 | punpckhwd m1, m1 |
||

105 | punpckhdq m0, m0 |
||

106 | punpckhdq m1, m1 |
||

107 | %endif |
||

108 | mova [r0+r1*0+8], m0 |
||

109 | mova [r0+r1*1+8], m1 |
||

110 | %endif |
||

111 | |||

112 | mova [r0+r1*0], m0 |
||

113 | mova [r0+r1*1], m1 |
||

114 | lea r0, [r0+r1*2] |
||

115 | dec r2 |
||

116 | jg .loop |
||

117 | REP_RET |
||

118 | %endmacro |
||

119 | |||

120 | INIT_MMX |
||

121 | PRED16x16_H mmx |
||

122 | PRED16x16_H mmxext |
||

123 | INIT_XMM |
||

124 | PRED16x16_H ssse3 |
||

125 | |||

126 | ;----------------------------------------------------------------------------- |
||

127 | ; void pred16x16_dc(uint8_t *src, int stride) |
||

128 | ;----------------------------------------------------------------------------- |
||

129 | |||

130 | 17dc7c7a | Jason Garrett-Glaser | %macro PRED16x16_DC 1 |

131 | 4af8cdfc | Jason Garrett-Glaser | cglobal pred16x16_dc_%1, 2,7 |

132 | mov r4, r0 |
||

133 | sub r0, r1 |
||

134 | pxor mm0, mm0 |
||

135 | pxor mm1, mm1 |
||

136 | psadbw mm0, [r0+0] |
||

137 | psadbw mm1, [r0+8] |
||

138 | dec r0 |
||

139 | movzx r5d, byte [r0+r1*1] |
||

140 | paddw mm0, mm1 |
||

141 | movd r6d, mm0 |
||

142 | lea r0, [r0+r1*2] |
||

143 | %rep 7 |
||

144 | movzx r2d, byte [r0+r1*0] |
||

145 | movzx r3d, byte [r0+r1*1] |
||

146 | add r5d, r2d |
||

147 | add r6d, r3d |
||

148 | lea r0, [r0+r1*2] |
||

149 | %endrep |
||

150 | movzx r2d, byte [r0+r1*0] |
||

151 | add r5d, r6d |
||

152 | lea r2d, [r2+r5+16] |
||

153 | shr r2d, 5 |
||

154 | 270a85d2 | Jason Garrett-Glaser | %ifidn %1, mmxext |

155 | 4af8cdfc | Jason Garrett-Glaser | movd m0, r2d |

156 | punpcklbw m0, m0 |
||

157 | pshufw m0, m0, 0 |
||

158 | %elifidn %1, sse2 |
||

159 | movd m0, r2d |
||

160 | punpcklbw m0, m0 |
||

161 | pshuflw m0, m0, 0 |
||

162 | punpcklqdq m0, m0 |
||

163 | %elifidn %1, ssse3 |
||

164 | pxor m1, m1 |
||

165 | movd m0, r2d |
||

166 | pshufb m0, m1 |
||

167 | %endif |
||

168 | |||

169 | %if mmsize==8 |
||

170 | mov r3d, 8 |
||

171 | .loop: |
||

172 | 17dc7c7a | Jason Garrett-Glaser | mova [r4+r1*0+0], m0 |

173 | mova [r4+r1*0+8], m0 |
||

174 | mova [r4+r1*1+0], m0 |
||

175 | mova [r4+r1*1+8], m0 |
||

176 | 4af8cdfc | Jason Garrett-Glaser | %else |

177 | mov r3d, 4 |
||

178 | .loop: |
||

179 | 17dc7c7a | Jason Garrett-Glaser | mova [r4+r1*0], m0 |

180 | mova [r4+r1*1], m0 |
||

181 | 4af8cdfc | Jason Garrett-Glaser | lea r4, [r4+r1*2] |

182 | 17dc7c7a | Jason Garrett-Glaser | mova [r4+r1*0], m0 |

183 | mova [r4+r1*1], m0 |
||

184 | 4af8cdfc | Jason Garrett-Glaser | %endif |

185 | lea r4, [r4+r1*2] |
||

186 | dec r3d |
||

187 | jg .loop |
||

188 | REP_RET |
||

189 | %endmacro |
||

190 | |||

191 | INIT_MMX |
||

192 | 17dc7c7a | Jason Garrett-Glaser | PRED16x16_DC mmxext |

193 | 4af8cdfc | Jason Garrett-Glaser | INIT_XMM |

194 | 17dc7c7a | Jason Garrett-Glaser | PRED16x16_DC sse2 |

195 | PRED16x16_DC ssse3 |
||

196 | 4af8cdfc | Jason Garrett-Glaser | |

197 | ;----------------------------------------------------------------------------- |
||

198 | ; void pred16x16_tm_vp8(uint8_t *src, int stride) |
||

199 | ;----------------------------------------------------------------------------- |
||

200 | |||

201 | %macro PRED16x16_TM_MMX 1 |
||

202 | cglobal pred16x16_tm_vp8_%1, 2,5 |
||

203 | sub r0, r1 |
||

204 | pxor mm7, mm7 |
||

205 | movq mm0, [r0+0] |
||

206 | movq mm2, [r0+8] |
||

207 | movq mm1, mm0 |
||

208 | movq mm3, mm2 |
||

209 | punpcklbw mm0, mm7 |
||

210 | punpckhbw mm1, mm7 |
||

211 | punpcklbw mm2, mm7 |
||

212 | punpckhbw mm3, mm7 |
||

213 | movzx r3d, byte [r0-1] |
||

214 | mov r4d, 16 |
||

215 | .loop: |
||

216 | movzx r2d, byte [r0+r1-1] |
||

217 | sub r2d, r3d |
||

218 | movd mm4, r2d |
||

219 | %ifidn %1, mmx |
||

220 | punpcklwd mm4, mm4 |
||

221 | punpckldq mm4, mm4 |
||

222 | %else |
||

223 | pshufw mm4, mm4, 0 |
||

224 | %endif |
||

225 | movq mm5, mm4 |
||

226 | movq mm6, mm4 |
||

227 | movq mm7, mm4 |
||

228 | paddw mm4, mm0 |
||

229 | paddw mm5, mm1 |
||

230 | paddw mm6, mm2 |
||

231 | paddw mm7, mm3 |
||

232 | packuswb mm4, mm5 |
||

233 | packuswb mm6, mm7 |
||

234 | movq [r0+r1+0], mm4 |
||

235 | movq [r0+r1+8], mm6 |
||

236 | add r0, r1 |
||

237 | dec r4d |
||

238 | jg .loop |
||

239 | REP_RET |
||

240 | %endmacro |
||

241 | |||

242 | PRED16x16_TM_MMX mmx |
||

243 | PRED16x16_TM_MMX mmxext |
||

244 | |||

245 | cglobal pred16x16_tm_vp8_sse2, 2,6,6 |
||

246 | sub r0, r1 |
||

247 | pxor xmm2, xmm2 |
||

248 | movdqa xmm0, [r0] |
||

249 | movdqa xmm1, xmm0 |
||

250 | punpcklbw xmm0, xmm2 |
||

251 | punpckhbw xmm1, xmm2 |
||

252 | movzx r4d, byte [r0-1] |
||

253 | mov r5d, 8 |
||

254 | .loop: |
||

255 | movzx r2d, byte [r0+r1*1-1] |
||

256 | movzx r3d, byte [r0+r1*2-1] |
||

257 | sub r2d, r4d |
||

258 | sub r3d, r4d |
||

259 | movd xmm2, r2d |
||

260 | movd xmm4, r3d |
||

261 | pshuflw xmm2, xmm2, 0 |
||

262 | pshuflw xmm4, xmm4, 0 |
||

263 | punpcklqdq xmm2, xmm2 |
||

264 | punpcklqdq xmm4, xmm4 |
||

265 | movdqa xmm3, xmm2 |
||

266 | movdqa xmm5, xmm4 |
||

267 | paddw xmm2, xmm0 |
||

268 | paddw xmm3, xmm1 |
||

269 | paddw xmm4, xmm0 |
||

270 | paddw xmm5, xmm1 |
||

271 | packuswb xmm2, xmm3 |
||

272 | packuswb xmm4, xmm5 |
||

273 | movdqa [r0+r1*1], xmm2 |
||

274 | movdqa [r0+r1*2], xmm4 |
||

275 | lea r0, [r0+r1*2] |
||

276 | dec r5d |
||

277 | jg .loop |
||

278 | REP_RET |
||

279 | |||

280 | ;----------------------------------------------------------------------------- |
||

281 | dd68d4db | Ronald S. Bultje | ; void pred16x16_plane(uint8_t *src, int stride) |

282 | ;----------------------------------------------------------------------------- |
||

283 | |||

284 | %macro H264_PRED16x16_PLANE 3 |
||

285 | cglobal pred16x16_plane_%3_%1, 2, 7, %2 |
||

286 | mov r2, r1 ; +stride |
||

287 | neg r1 ; -stride |
||

288 | |||

289 | movh m0, [r0+r1 -1] |
||

290 | %if mmsize == 8 |
||

291 | pxor m4, m4 |
||

292 | movh m1, [r0+r1 +3 ] |
||

293 | movh m2, [r0+r1 +8 ] |
||

294 | movh m3, [r0+r1 +12] |
||

295 | punpcklbw m0, m4 |
||

296 | punpcklbw m1, m4 |
||

297 | punpcklbw m2, m4 |
||

298 | punpcklbw m3, m4 |
||

299 | pmullw m0, [pw_m8tom1 ] |
||

300 | pmullw m1, [pw_m8tom1+8] |
||

301 | pmullw m2, [pw_1to8 ] |
||

302 | pmullw m3, [pw_1to8 +8] |
||

303 | paddw m0, m2 |
||

304 | paddw m1, m3 |
||

305 | %else ; mmsize == 16 |
||

306 | %ifidn %1, sse2 |
||

307 | pxor m2, m2 |
||

308 | movh m1, [r0+r1 +8] |
||

309 | punpcklbw m0, m2 |
||

310 | punpcklbw m1, m2 |
||

311 | pmullw m0, [pw_m8tom1] |
||

312 | pmullw m1, [pw_1to8] |
||

313 | paddw m0, m1 |
||

314 | %else ; ssse3 |
||

315 | movhps m0, [r0+r1 +8] |
||

316 | pmaddubsw m0, [plane_shuf] ; H coefficients |
||

317 | %endif |
||

318 | movhlps m1, m0 |
||

319 | %endif |
||

320 | paddw m0, m1 |
||

321 | %ifidn %1, mmx |
||

322 | mova m1, m0 |
||

323 | psrlq m1, 32 |
||

324 | %elifidn %1, mmx2 |
||

325 | pshufw m1, m0, 0xE |
||

326 | %else ; mmsize == 16 |
||

327 | pshuflw m1, m0, 0xE |
||

328 | %endif |
||

329 | paddw m0, m1 |
||

330 | %ifidn %1, mmx |
||

331 | mova m1, m0 |
||

332 | psrlq m1, 16 |
||

333 | %elifidn %1, mmx2 |
||

334 | pshufw m1, m0, 0x1 |
||

335 | %else |
||

336 | pshuflw m1, m0, 0x1 |
||

337 | %endif |
||

338 | paddw m0, m1 ; sum of H coefficients |
||

339 | |||

340 | %ifidn %3, h264 |
||

341 | pmullw m0, [pw_5] |
||

342 | paddw m0, [pw_32] |
||

343 | psraw m0, 6 |
||

344 | %elifidn %3, rv40 |
||

345 | pmullw m0, [pw_5] |
||

346 | psraw m0, 6 |
||

347 | %elifidn %3, svq3 |
||

348 | movd r3, m0 |
||

349 | movsx r3, r3w |
||

350 | test r3, r3 |
||

351 | lea r4, [r3+3] |
||

352 | cmovs r3, r4 |
||

353 | sar r3, 2 ; H/4 |
||

354 | lea r3, [r3*5] ; 5*(H/4) |
||

355 | test r3, r3 |
||

356 | lea r4, [r3+15] |
||

357 | cmovs r3, r4 |
||

358 | sar r3, 4 ; (5*(H/4))/16 |
||

359 | movd m0, r3d |
||

360 | %endif |
||

361 | |||

362 | lea r4, [r0+r2*8-1] |
||

363 | lea r3, [r0+r2*4-1] |
||

364 | add r4, r2 |
||

365 | |||

366 | %ifdef ARCH_X86_64 |
||

367 | %define e_reg r11 |
||

368 | %else |
||

369 | %define e_reg r0 |
||

370 | %endif |
||

371 | |||

372 | movzx e_reg, byte [r3+r2*2 ] |
||

373 | movzx r5, byte [r4+r1 ] |
||

374 | sub r5, e_reg |
||

375 | |||

376 | movzx e_reg, byte [r3+r2 ] |
||

377 | movzx r6, byte [r4 ] |
||

378 | sub r6, e_reg |
||

379 | lea r5, [r5+r6*2] |
||

380 | |||

381 | movzx e_reg, byte [r3+r1 ] |
||

382 | movzx r6, byte [r4+r2*2 ] |
||

383 | sub r6, e_reg |
||

384 | lea r5, [r5+r6*4] |
||

385 | |||

386 | movzx e_reg, byte [r3 ] |
||

387 | %ifdef ARCH_X86_64 |
||

388 | movzx r10, byte [r4+r2 ] |
||

389 | sub r10, e_reg |
||

390 | %else |
||

391 | movzx r6, byte [r4+r2 ] |
||

392 | sub r6, e_reg |
||

393 | lea r5, [r5+r6*4] |
||

394 | sub r5, r6 |
||

395 | %endif |
||

396 | |||

397 | lea e_reg, [r3+r1*4] |
||

398 | lea r3, [r4+r2*4] |
||

399 | |||

400 | movzx r4, byte [e_reg+r2 ] |
||

401 | movzx r6, byte [r3 ] |
||

402 | sub r6, r4 |
||

403 | %ifdef ARCH_X86_64 |
||

404 | lea r6, [r10+r6*2] |
||

405 | lea r5, [r5+r6*2] |
||

406 | add r5, r6 |
||

407 | %else |
||

408 | lea r5, [r5+r6*4] |
||

409 | lea r5, [r5+r6*2] |
||

410 | %endif |
||

411 | |||

412 | movzx r4, byte [e_reg ] |
||

413 | %ifdef ARCH_X86_64 |
||

414 | movzx r10, byte [r3 +r2 ] |
||

415 | sub r10, r4 |
||

416 | sub r5, r10 |
||

417 | %else |
||

418 | movzx r6, byte [r3 +r2 ] |
||

419 | sub r6, r4 |
||

420 | lea r5, [r5+r6*8] |
||

421 | sub r5, r6 |
||

422 | %endif |
||

423 | |||

424 | movzx r4, byte [e_reg+r1 ] |
||

425 | movzx r6, byte [r3 +r2*2] |
||

426 | sub r6, r4 |
||

427 | %ifdef ARCH_X86_64 |
||

428 | add r6, r10 |
||

429 | %endif |
||

430 | lea r5, [r5+r6*8] |
||

431 | |||

432 | movzx r4, byte [e_reg+r2*2] |
||

433 | movzx r6, byte [r3 +r1 ] |
||

434 | sub r6, r4 |
||

435 | lea r5, [r5+r6*4] |
||

436 | add r5, r6 ; sum of V coefficients |
||

437 | |||

438 | %ifndef ARCH_X86_64 |
||

439 | mov r0, r0m |
||

440 | %endif |
||

441 | |||

442 | %ifidn %3, h264 |
||

443 | lea r5, [r5*5+32] |
||

444 | sar r5, 6 |
||

445 | %elifidn %3, rv40 |
||

446 | lea r5, [r5*5] |
||

447 | sar r5, 6 |
||

448 | %elifidn %3, svq3 |
||

449 | test r5, r5 |
||

450 | lea r6, [r5+3] |
||

451 | cmovs r5, r6 |
||

452 | sar r5, 2 ; V/4 |
||

453 | lea r5, [r5*5] ; 5*(V/4) |
||

454 | test r5, r5 |
||

455 | lea r6, [r5+15] |
||

456 | cmovs r5, r6 |
||

457 | sar r5, 4 ; (5*(V/4))/16 |
||

458 | %endif |
||

459 | |||

460 | movzx r4, byte [r0+r1 +15] |
||

461 | movzx r3, byte [r3+r2*2 ] |
||

462 | lea r3, [r3+r4+1] |
||

463 | shl r3, 4 |
||

464 | movd r1d, m0 |
||

465 | movsx r1d, r1w |
||

466 | add r1d, r5d |
||

467 | add r3d, r1d |
||

468 | shl r1d, 3 |
||

469 | sub r3d, r1d ; a |
||

470 | |||

471 | movd m1, r5d |
||

472 | movd m3, r3d |
||

473 | %ifidn %1, mmx |
||

474 | punpcklwd m0, m0 |
||

475 | punpcklwd m1, m1 |
||

476 | punpcklwd m3, m3 |
||

477 | punpckldq m0, m0 |
||

478 | punpckldq m1, m1 |
||

479 | punpckldq m3, m3 |
||

480 | %elifidn %1, mmx2 |
||

481 | pshufw m0, m0, 0x0 |
||

482 | pshufw m1, m1, 0x0 |
||

483 | pshufw m3, m3, 0x0 |
||

484 | %else |
||

485 | pshuflw m0, m0, 0x0 |
||

486 | pshuflw m1, m1, 0x0 |
||

487 | pshuflw m3, m3, 0x0 |
||

488 | punpcklqdq m0, m0 ; splat H (words) |
||

489 | punpcklqdq m1, m1 ; splat V (words) |
||

490 | punpcklqdq m3, m3 ; splat a (words) |
||

491 | %endif |
||

492 | %ifidn %3, svq3 |
||

493 | SWAP 0, 1 |
||

494 | %endif |
||

495 | mova m2, m0 |
||

496 | %if mmsize == 8 |
||

497 | mova m5, m0 |
||

498 | %endif |
||

499 | pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words) |
||

500 | %if mmsize == 16 |
||

501 | psllw m2, 3 |
||

502 | %else |
||

503 | psllw m5, 3 |
||

504 | psllw m2, 2 |
||

505 | mova m6, m5 |
||

506 | paddw m6, m2 |
||

507 | %endif |
||

508 | paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H |
||

509 | paddw m2, m0 ; a + {8,9,10,11,12,13,14,15}*H |
||

510 | %if mmsize == 8 |
||

511 | paddw m5, m0 ; a + {8,9,10,11}*H |
||

512 | paddw m6, m0 ; a + {12,13,14,15}*H |
||

513 | %endif |
||

514 | |||

515 | mov r4, 8 |
||

516 | .loop |
||

517 | mova m3, m0 ; b[0..7] |
||

518 | mova m4, m2 ; b[8..15] |
||

519 | psraw m3, 5 |
||

520 | psraw m4, 5 |
||

521 | packuswb m3, m4 |
||

522 | mova [r0], m3 |
||

523 | %if mmsize == 8 |
||

524 | mova m3, m5 ; b[8..11] |
||

525 | mova m4, m6 ; b[12..15] |
||

526 | psraw m3, 5 |
||

527 | psraw m4, 5 |
||

528 | packuswb m3, m4 |
||

529 | mova [r0+8], m3 |
||

530 | %endif |
||

531 | paddw m0, m1 |
||

532 | paddw m2, m1 |
||

533 | %if mmsize == 8 |
||

534 | paddw m5, m1 |
||

535 | paddw m6, m1 |
||

536 | %endif |
||

537 | |||

538 | mova m3, m0 ; b[0..7] |
||

539 | mova m4, m2 ; b[8..15] |
||

540 | psraw m3, 5 |
||

541 | psraw m4, 5 |
||

542 | packuswb m3, m4 |
||

543 | mova [r0+r2], m3 |
||

544 | %if mmsize == 8 |
||

545 | mova m3, m5 ; b[8..11] |
||

546 | mova m4, m6 ; b[12..15] |
||

547 | psraw m3, 5 |
||

548 | psraw m4, 5 |
||

549 | packuswb m3, m4 |
||

550 | mova [r0+r2+8], m3 |
||

551 | %endif |
||

552 | paddw m0, m1 |
||

553 | paddw m2, m1 |
||

554 | %if mmsize == 8 |
||

555 | paddw m5, m1 |
||

556 | paddw m6, m1 |
||

557 | %endif |
||

558 | |||

559 | lea r0, [r0+r2*2] |
||

560 | dec r4 |
||

561 | jg .loop |
||

562 | REP_RET |
||

563 | %endmacro |
||

564 | |||

565 | INIT_MMX |
||

566 | H264_PRED16x16_PLANE mmx, 0, h264 |
||

567 | H264_PRED16x16_PLANE mmx, 0, rv40 |
||

568 | H264_PRED16x16_PLANE mmx, 0, svq3 |
||

569 | H264_PRED16x16_PLANE mmx2, 0, h264 |
||

570 | H264_PRED16x16_PLANE mmx2, 0, rv40 |
||

571 | H264_PRED16x16_PLANE mmx2, 0, svq3 |
||

572 | INIT_XMM |
||

573 | H264_PRED16x16_PLANE sse2, 8, h264 |
||

574 | H264_PRED16x16_PLANE sse2, 8, rv40 |
||

575 | H264_PRED16x16_PLANE sse2, 8, svq3 |
||

576 | H264_PRED16x16_PLANE ssse3, 8, h264 |
||

577 | H264_PRED16x16_PLANE ssse3, 8, rv40 |
||

578 | H264_PRED16x16_PLANE ssse3, 8, svq3 |
||

579 | |||

580 | ;----------------------------------------------------------------------------- |
||

581 | ; void pred8x8_plane(uint8_t *src, int stride) |
||

582 | ;----------------------------------------------------------------------------- |
||

583 | |||

584 | %macro H264_PRED8x8_PLANE 2 |
||

585 | cglobal pred8x8_plane_%1, 2, 7, %2 |
||

586 | mov r2, r1 ; +stride |
||

587 | neg r1 ; -stride |
||

588 | |||

589 | movd m0, [r0+r1 -1] |
||

590 | %if mmsize == 8 |
||

591 | pxor m2, m2 |
||

592 | movh m1, [r0+r1 +4 ] |
||

593 | punpcklbw m0, m2 |
||

594 | punpcklbw m1, m2 |
||

595 | pmullw m0, [pw_m4to4] |
||

596 | pmullw m1, [pw_m4to4+8] |
||

597 | %else ; mmsize == 16 |
||

598 | %ifidn %1, sse2 |
||

599 | pxor m2, m2 |
||

600 | movd m1, [r0+r1 +4] |
||

601 | punpckldq m0, m1 |
||

602 | punpcklbw m0, m2 |
||

603 | pmullw m0, [pw_m4to4] |
||

604 | %else ; ssse3 |
||

605 | movhps m0, [r0+r1 +4] ; this reads 4 bytes more than necessary |
||

606 | pmaddubsw m0, [plane8_shuf] ; H coefficients |
||

607 | %endif |
||

608 | movhlps m1, m0 |
||

609 | %endif |
||

610 | paddw m0, m1 |
||

611 | |||

612 | %ifnidn %1, ssse3 |
||

613 | %ifidn %1, mmx |
||

614 | mova m1, m0 |
||

615 | psrlq m1, 32 |
||

616 | %elifidn %1, mmx2 |
||

617 | pshufw m1, m0, 0xE |
||

618 | %else ; mmsize == 16 |
||

619 | pshuflw m1, m0, 0xE |
||

620 | %endif |
||

621 | paddw m0, m1 |
||

622 | %endif ; !ssse3 |
||

623 | |||

624 | %ifidn %1, mmx |
||

625 | mova m1, m0 |
||

626 | psrlq m1, 16 |
||

627 | %elifidn %1, mmx2 |
||

628 | pshufw m1, m0, 0x1 |
||

629 | %else |
||

630 | pshuflw m1, m0, 0x1 |
||

631 | %endif |
||

632 | paddw m0, m1 ; sum of H coefficients |
||

633 | |||

634 | pmullw m0, [pw_17] |
||

635 | paddw m0, [pw_16] |
||

636 | psraw m0, 5 |
||

637 | |||

638 | lea r4, [r0+r2*4-1] |
||

639 | lea r3, [r0 -1] |
||

640 | add r4, r2 |
||

641 | |||

642 | %ifdef ARCH_X86_64 |
||

643 | %define e_reg r11 |
||

644 | %else |
||

645 | %define e_reg r0 |
||

646 | %endif |
||

647 | |||

648 | movzx e_reg, byte [r3+r2*2 ] |
||

649 | movzx r5, byte [r4+r1 ] |
||

650 | sub r5, e_reg |
||

651 | |||

652 | movzx e_reg, byte [r3 ] |
||

653 | %ifdef ARCH_X86_64 |
||

654 | movzx r10, byte [r4+r2 ] |
||

655 | sub r10, e_reg |
||

656 | sub r5, r10 |
||

657 | %else |
||

658 | movzx r6, byte [r4+r2 ] |
||

659 | sub r6, e_reg |
||

660 | lea r5, [r5+r6*4] |
||

661 | sub r5, r6 |
||

662 | %endif |
||

663 | |||

664 | movzx e_reg, byte [r3+r1 ] |
||

665 | movzx r6, byte [r4+r2*2 ] |
||

666 | sub r6, e_reg |
||

667 | %ifdef ARCH_X86_64 |
||

668 | add r6, r10 |
||

669 | %endif |
||

670 | lea r5, [r5+r6*4] |
||

671 | |||

672 | movzx e_reg, byte [r3+r2 ] |
||

673 | movzx r6, byte [r4 ] |
||

674 | sub r6, e_reg |
||

675 | lea r6, [r5+r6*2] |
||

676 | |||

677 | lea r5, [r6*9+16] |
||

678 | lea r5, [r5+r6*8] |
||

679 | sar r5, 5 |
||

680 | |||

681 | %ifndef ARCH_X86_64 |
||

682 | mov r0, r0m |
||

683 | %endif |
||

684 | |||

685 | movzx r3, byte [r4+r2*2 ] |
||

686 | movzx r4, byte [r0+r1 +7] |
||

687 | lea r3, [r3+r4+1] |
||

688 | shl r3, 4 |
||

689 | movd r1d, m0 |
||

690 | movsx r1d, r1w |
||

691 | add r1d, r5d |
||

692 | sub r3d, r1d |
||

693 | add r1d, r1d |
||

694 | sub r3d, r1d ; a |
||

695 | |||

696 | movd m1, r5d |
||

697 | movd m3, r3d |
||

698 | %ifidn %1, mmx |
||

699 | punpcklwd m0, m0 |
||

700 | punpcklwd m1, m1 |
||

701 | punpcklwd m3, m3 |
||

702 | punpckldq m0, m0 |
||

703 | punpckldq m1, m1 |
||

704 | punpckldq m3, m3 |
||

705 | %elifidn %1, mmx2 |
||

706 | pshufw m0, m0, 0x0 |
||

707 | pshufw m1, m1, 0x0 |
||

708 | pshufw m3, m3, 0x0 |
||

709 | %else |
||

710 | pshuflw m0, m0, 0x0 |
||

711 | pshuflw m1, m1, 0x0 |
||

712 | pshuflw m3, m3, 0x0 |
||

713 | punpcklqdq m0, m0 ; splat H (words) |
||

714 | punpcklqdq m1, m1 ; splat V (words) |
||

715 | punpcklqdq m3, m3 ; splat a (words) |
||

716 | %endif |
||

717 | %if mmsize == 8 |
||

718 | mova m2, m0 |
||

719 | %endif |
||

720 | pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words) |
||

721 | paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H |
||

722 | %if mmsize == 8 |
||

723 | psllw m2, 2 |
||

724 | paddw m2, m0 ; a + {4,5,6,7}*H |
||

725 | %endif |
||

726 | |||

727 | mov r4, 4 |
||

728 | ALIGN 16 |
||

729 | .loop |
||

730 | %if mmsize == 16 |
||

731 | mova m3, m0 ; b[0..7] |
||

732 | paddw m0, m1 |
||

733 | psraw m3, 5 |
||

734 | mova m4, m0 ; V+b[0..7] |
||

735 | paddw m0, m1 |
||

736 | psraw m4, 5 |
||

737 | packuswb m3, m4 |
||

738 | movh [r0], m3 |
||

739 | movhps [r0+r2], m3 |
||

740 | %else ; mmsize == 8 |
||

741 | mova m3, m0 ; b[0..3] |
||

742 | mova m4, m2 ; b[4..7] |
||

743 | paddw m0, m1 |
||

744 | paddw m2, m1 |
||

745 | psraw m3, 5 |
||

746 | psraw m4, 5 |
||

747 | mova m5, m0 ; V+b[0..3] |
||

748 | mova m6, m2 ; V+b[4..7] |
||

749 | paddw m0, m1 |
||

750 | paddw m2, m1 |
||

751 | psraw m5, 5 |
||

752 | psraw m6, 5 |
||

753 | packuswb m3, m4 |
||

754 | packuswb m5, m6 |
||

755 | mova [r0], m3 |
||

756 | mova [r0+r2], m5 |
||

757 | %endif |
||

758 | |||

759 | lea r0, [r0+r2*2] |
||

760 | dec r4 |
||

761 | jg .loop |
||

762 | REP_RET |
||

763 | %endmacro |
||

764 | |||

765 | INIT_MMX |
||

766 | H264_PRED8x8_PLANE mmx, 0 |
||

767 | H264_PRED8x8_PLANE mmx2, 0 |
||

768 | INIT_XMM |
||

769 | H264_PRED8x8_PLANE sse2, 8 |
||

770 | H264_PRED8x8_PLANE ssse3, 8 |
||

771 | |||

772 | ;----------------------------------------------------------------------------- |
||

773 | 4af8cdfc | Jason Garrett-Glaser | ; void pred8x8_vertical(uint8_t *src, int stride) |

774 | ;----------------------------------------------------------------------------- |
||

775 | |||

776 | cglobal pred8x8_vertical_mmx, 2,2 |
||

777 | sub r0, r1 |
||

778 | movq mm0, [r0] |
||

779 | %rep 3 |
||

780 | movq [r0+r1*1], mm0 |
||

781 | movq [r0+r1*2], mm0 |
||

782 | lea r0, [r0+r1*2] |
||

783 | %endrep |
||

784 | movq [r0+r1*1], mm0 |
||

785 | movq [r0+r1*2], mm0 |
||

786 | RET |
||

787 | |||

788 | ;----------------------------------------------------------------------------- |
||

789 | ; void pred8x8_horizontal(uint8_t *src, int stride) |
||

790 | ;----------------------------------------------------------------------------- |
||

791 | |||

792 | %macro PRED8x8_H 1 |
||

793 | cglobal pred8x8_horizontal_%1, 2,3 |
||

794 | mov r2, 4 |
||

795 | %ifidn %1, ssse3 |
||

796 | mova m2, [pb_3] |
||

797 | %endif |
||

798 | .loop: |
||

799 | movd m0, [r0+r1*0-4] |
||

800 | movd m1, [r0+r1*1-4] |
||

801 | %ifidn %1, ssse3 |
||

802 | pshufb m0, m2 |
||

803 | pshufb m1, m2 |
||

804 | %else |
||

805 | punpcklbw m0, m0 |
||

806 | punpcklbw m1, m1 |
||

807 | %ifidn %1, mmxext |
||

808 | pshufw m0, m0, 0xff |
||

809 | pshufw m1, m1, 0xff |
||

810 | %else |
||

811 | punpckhwd m0, m0 |
||

812 | punpckhwd m1, m1 |
||

813 | punpckhdq m0, m0 |
||

814 | punpckhdq m1, m1 |
||

815 | %endif |
||

816 | %endif |
||

817 | mova [r0+r1*0], m0 |
||

818 | mova [r0+r1*1], m1 |
||

819 | lea r0, [r0+r1*2] |
||

820 | dec r2 |
||

821 | jg .loop |
||

822 | REP_RET |
||

823 | %endmacro |
||

824 | |||

825 | INIT_MMX |
||

826 | PRED8x8_H mmx |
||

827 | PRED8x8_H mmxext |
||

828 | PRED8x8_H ssse3 |
||

829 | |||

830 | ;----------------------------------------------------------------------------- |
||

831 | ; void pred8x8_dc_rv40(uint8_t *src, int stride) |
||

832 | ;----------------------------------------------------------------------------- |
||

833 | |||

834 | 270a85d2 | Jason Garrett-Glaser | cglobal pred8x8_dc_rv40_mmxext, 2,7 |

835 | 4af8cdfc | Jason Garrett-Glaser | mov r4, r0 |

836 | sub r0, r1 |
||

837 | pxor mm0, mm0 |
||

838 | psadbw mm0, [r0] |
||

839 | dec r0 |
||

840 | movzx r5d, byte [r0+r1*1] |
||

841 | movd r6d, mm0 |
||

842 | lea r0, [r0+r1*2] |
||

843 | %rep 3 |
||

844 | movzx r2d, byte [r0+r1*0] |
||

845 | movzx r3d, byte [r0+r1*1] |
||

846 | add r5d, r2d |
||

847 | add r6d, r3d |
||

848 | lea r0, [r0+r1*2] |
||

849 | %endrep |
||

850 | movzx r2d, byte [r0+r1*0] |
||

851 | add r5d, r6d |
||

852 | lea r2d, [r2+r5+8] |
||

853 | shr r2d, 4 |
||

854 | movd mm0, r2d |
||

855 | punpcklbw mm0, mm0 |
||

856 | pshufw mm0, mm0, 0 |
||

857 | mov r3d, 4 |
||

858 | .loop: |
||

859 | movq [r4+r1*0], mm0 |
||

860 | movq [r4+r1*1], mm0 |
||

861 | lea r4, [r4+r1*2] |
||

862 | dec r3d |
||

863 | jg .loop |
||

864 | REP_RET |
||

865 | |||

866 | ;----------------------------------------------------------------------------- |
||

867 | ; void pred8x8_tm_vp8(uint8_t *src, int stride) |
||

868 | ;----------------------------------------------------------------------------- |
||

869 | |||

870 | %macro PRED8x8_TM_MMX 1 |
||

871 | cglobal pred8x8_tm_vp8_%1, 2,6 |
||

872 | sub r0, r1 |
||

873 | pxor mm7, mm7 |
||

874 | movq mm0, [r0] |
||

875 | movq mm1, mm0 |
||

876 | punpcklbw mm0, mm7 |
||

877 | punpckhbw mm1, mm7 |
||

878 | movzx r4d, byte [r0-1] |
||

879 | mov r5d, 4 |
||

880 | .loop: |
||

881 | movzx r2d, byte [r0+r1*1-1] |
||

882 | movzx r3d, byte [r0+r1*2-1] |
||

883 | sub r2d, r4d |
||

884 | sub r3d, r4d |
||

885 | movd mm2, r2d |
||

886 | movd mm4, r3d |
||

887 | %ifidn %1, mmx |
||

888 | punpcklwd mm2, mm2 |
||

889 | punpcklwd mm4, mm4 |
||

890 | punpckldq mm2, mm2 |
||

891 | punpckldq mm4, mm4 |
||

892 | %else |
||

893 | pshufw mm2, mm2, 0 |
||

894 | pshufw mm4, mm4, 0 |
||

895 | %endif |
||

896 | movq mm3, mm2 |
||

897 | movq mm5, mm4 |
||

898 | paddw mm2, mm0 |
||

899 | paddw mm3, mm1 |
||

900 | paddw mm4, mm0 |
||

901 | paddw mm5, mm1 |
||

902 | packuswb mm2, mm3 |
||

903 | packuswb mm4, mm5 |
||

904 | movq [r0+r1*1], mm2 |
||

905 | movq [r0+r1*2], mm4 |
||

906 | lea r0, [r0+r1*2] |
||

907 | dec r5d |
||

908 | jg .loop |
||

909 | REP_RET |
||

910 | %endmacro |
||

911 | |||

912 | PRED8x8_TM_MMX mmx |
||

913 | PRED8x8_TM_MMX mmxext |
||

914 | |||

915 | cglobal pred8x8_tm_vp8_sse2, 2,6,4 |
||

916 | sub r0, r1 |
||

917 | pxor xmm1, xmm1 |
||

918 | movq xmm0, [r0] |
||

919 | punpcklbw xmm0, xmm1 |
||

920 | movzx r4d, byte [r0-1] |
||

921 | mov r5d, 4 |
||

922 | .loop: |
||

923 | movzx r2d, byte [r0+r1*1-1] |
||

924 | movzx r3d, byte [r0+r1*2-1] |
||

925 | sub r2d, r4d |
||

926 | sub r3d, r4d |
||

927 | movd xmm2, r2d |
||

928 | movd xmm3, r3d |
||

929 | pshuflw xmm2, xmm2, 0 |
||

930 | pshuflw xmm3, xmm3, 0 |
||

931 | punpcklqdq xmm2, xmm2 |
||

932 | punpcklqdq xmm3, xmm3 |
||

933 | paddw xmm2, xmm0 |
||

934 | paddw xmm3, xmm0 |
||

935 | packuswb xmm2, xmm3 |
||

936 | movq [r0+r1*1], xmm2 |
||

937 | movhps [r0+r1*2], xmm2 |
||

938 | lea r0, [r0+r1*2] |
||

939 | dec r5d |
||

940 | jg .loop |
||

941 | REP_RET |
||

942 | |||

943 | cglobal pred8x8_tm_vp8_ssse3, 2,3,6 |
||

944 | sub r0, r1 |
||

945 | movdqa xmm4, [tm_shuf] |
||

946 | pxor xmm1, xmm1 |
||

947 | movq xmm0, [r0] |
||

948 | punpcklbw xmm0, xmm1 |
||

949 | movd xmm5, [r0-4] |
||

950 | pshufb xmm5, xmm4 |
||

951 | mov r2d, 4 |
||

952 | .loop: |
||

953 | movd xmm2, [r0+r1*1-4] |
||

954 | movd xmm3, [r0+r1*2-4] |
||

955 | pshufb xmm2, xmm4 |
||

956 | pshufb xmm3, xmm4 |
||

957 | psubw xmm2, xmm5 |
||

958 | psubw xmm3, xmm5 |
||

959 | paddw xmm2, xmm0 |
||

960 | paddw xmm3, xmm0 |
||

961 | packuswb xmm2, xmm3 |
||

962 | movq [r0+r1*1], xmm2 |
||

963 | movhps [r0+r1*2], xmm2 |
||

964 | lea r0, [r0+r1*2] |
||

965 | dec r2d |
||

966 | jg .loop |
||

967 | REP_RET |
||

968 | 270a85d2 | Jason Garrett-Glaser | |

969 | 8b746bb4 | Jason Garrett-Glaser | ;----------------------------------------------------------------------------- |

970 | ; void pred4x4_dc_mmxext(uint8_t *src, const uint8_t *topright, int stride) |
||

971 | ;----------------------------------------------------------------------------- |
||

972 | |||

973 | 270a85d2 | Jason Garrett-Glaser | cglobal pred4x4_dc_mmxext, 3,5 |

974 | pxor mm7, mm7 |
||

975 | mov r4, r0 |
||

976 | sub r0, r2 |
||

977 | movd mm0, [r0] |
||

978 | psadbw mm0, mm7 |
||

979 | movzx r1d, byte [r0+r2*1-1] |
||

980 | movd r3d, mm0 |
||

981 | add r3d, r1d |
||

982 | movzx r1d, byte [r0+r2*2-1] |
||

983 | lea r0, [r0+r2*2] |
||

984 | add r3d, r1d |
||

985 | movzx r1d, byte [r0+r2*1-1] |
||

986 | add r3d, r1d |
||

987 | movzx r1d, byte [r0+r2*2-1] |
||

988 | add r3d, r1d |
||

989 | add r3d, 4 |
||

990 | shr r3d, 3 |
||

991 | imul r3d, 0x01010101 |
||

992 | mov [r4+r2*0], r3d |
||

993 | mov [r0+r2*0], r3d |
||

994 | mov [r0+r2*1], r3d |
||

995 | mov [r0+r2*2], r3d |
||

996 | RET |
||

997 | fb9927ad | Jason Garrett-Glaser | |

998 | ;----------------------------------------------------------------------------- |
||

999 | ; void pred4x4_tm_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride) |
||

1000 | ;----------------------------------------------------------------------------- |
||

1001 | |||

1002 | %macro PRED4x4_TM_MMX 1 |
||

1003 | cglobal pred4x4_tm_vp8_%1, 3,6 |
||

1004 | sub r0, r2 |
||

1005 | pxor mm7, mm7 |
||

1006 | movd mm0, [r0] |
||

1007 | punpcklbw mm0, mm7 |
||

1008 | movzx r4d, byte [r0-1] |
||

1009 | mov r5d, 2 |
||

1010 | .loop: |
||

1011 | movzx r1d, byte [r0+r2*1-1] |
||

1012 | movzx r3d, byte [r0+r2*2-1] |
||

1013 | sub r1d, r4d |
||

1014 | sub r3d, r4d |
||

1015 | movd mm2, r1d |
||

1016 | movd mm4, r3d |
||

1017 | %ifidn %1, mmx |
||

1018 | punpcklwd mm2, mm2 |
||

1019 | punpcklwd mm4, mm4 |
||

1020 | punpckldq mm2, mm2 |
||

1021 | punpckldq mm4, mm4 |
||

1022 | %else |
||

1023 | pshufw mm2, mm2, 0 |
||

1024 | pshufw mm4, mm4, 0 |
||

1025 | %endif |
||

1026 | paddw mm2, mm0 |
||

1027 | paddw mm4, mm0 |
||

1028 | packuswb mm2, mm2 |
||

1029 | packuswb mm4, mm4 |
||

1030 | movd [r0+r2*1], mm2 |
||

1031 | movd [r0+r2*2], mm4 |
||

1032 | lea r0, [r0+r2*2] |
||

1033 | dec r5d |
||

1034 | jg .loop |
||

1035 | REP_RET |
||

1036 | %endmacro |
||

1037 | |||

1038 | PRED4x4_TM_MMX mmx |
||

1039 | PRED4x4_TM_MMX mmxext |
||

1040 | |||

1041 | cglobal pred4x4_tm_vp8_ssse3, 3,3 |
||

1042 | sub r0, r2 |
||

1043 | movq mm6, [tm_shuf] |
||

1044 | pxor mm1, mm1 |
||

1045 | movd mm0, [r0] |
||

1046 | punpcklbw mm0, mm1 |
||

1047 | movd mm7, [r0-4] |
||

1048 | pshufb mm7, mm6 |
||

1049 | lea r1, [r0+r2*2] |
||

1050 | movd mm2, [r0+r2*1-4] |
||

1051 | movd mm3, [r0+r2*2-4] |
||

1052 | movd mm4, [r1+r2*1-4] |
||

1053 | movd mm5, [r1+r2*2-4] |
||

1054 | pshufb mm2, mm6 |
||

1055 | pshufb mm3, mm6 |
||

1056 | pshufb mm4, mm6 |
||

1057 | pshufb mm5, mm6 |
||

1058 | psubw mm2, mm7 |
||

1059 | psubw mm3, mm7 |
||

1060 | psubw mm4, mm7 |
||

1061 | psubw mm5, mm7 |
||

1062 | paddw mm2, mm0 |
||

1063 | paddw mm3, mm0 |
||

1064 | paddw mm4, mm0 |
||

1065 | paddw mm5, mm0 |
||

1066 | packuswb mm2, mm2 |
||

1067 | packuswb mm3, mm3 |
||

1068 | packuswb mm4, mm4 |
||

1069 | packuswb mm5, mm5 |
||

1070 | movd [r0+r2*1], mm2 |
||

1071 | movd [r0+r2*2], mm3 |
||

1072 | movd [r1+r2*1], mm4 |
||

1073 | movd [r1+r2*2], mm5 |
||

1074 | RET |
||

1075 | bc14f04b | Jason Garrett-Glaser | |

1076 | ; dest, left, right, src, tmp |
||

1077 | ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2 |
||

1078 | %macro PRED4x4_LOWPASS 5 |
||

1079 | mova %5, %2 |
||

1080 | pavgb %2, %3 |
||

1081 | pxor %3, %5 |
||

1082 | mova %1, %4 |
||

1083 | pand %3, [pb_1] |
||

1084 | psubusb %2, %3 |
||

1085 | pavgb %1, %2 |
||

1086 | %endmacro |
||

1087 | |||

1088 | ;----------------------------------------------------------------------------- |
||

1089 | ; void pred4x4_vertical_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride) |
||

1090 | ;----------------------------------------------------------------------------- |
||

1091 | |||

1092 | INIT_MMX |
||

1093 | cglobal pred4x4_vertical_vp8_mmxext, 3,3 |
||

1094 | sub r0, r2 |
||

1095 | movd m1, [r0-1] |
||

1096 | movd m0, [r0] |
||

1097 | mova m2, m0 ;t0 t1 t2 t3 |
||

1098 | punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7 |
||

1099 | lea r1, [r0+r2*2] |
||

1100 | psrlq m0, 8 ;t1 t2 t3 t4 |
||

1101 | PRED4x4_LOWPASS m3, m1, m0, m2, m4 |
||

1102 | movd [r0+r2*1], m3 |
||

1103 | movd [r0+r2*2], m3 |
||

1104 | movd [r1+r2*1], m3 |
||

1105 | movd [r1+r2*2], m3 |
||

1106 | RET |