This is an automated email from the git hooks/post-receive script.
pierov pushed a commit to branch geckoview-99.0.1-11.0-1 in repository tor-browser.
commit 0c755a24b73056a9381d7cf3cd2b38281dbe4c43
Author:     Jon Bauman <jbauman@mozilla.com>
AuthorDate: Mon Mar 14 17:17:59 2022 +0000
Bug 1758482 - Update dav1d to new version 28a9c46e1c36540d3276299f2e284ece1d2386be from 2022-02-04T23:02:17.000-03:00. r=media-playback-reviewers,padenot a=dmeehan
Normally updatebot would create a revision, allowing me to review it single-handedly, but https://phabricator.services.mozilla.com/D140519 was a failure because of changes updatebot didn't know how to handle, so I just need **someone** to approve the update to get this landed. This has a bit of priority since it's blocking https://bugzilla.mozilla.org/show_bug.cgi?id=1757971, which we want to get uplifted for Fx99.
Differential Revision: https://phabricator.services.mozilla.com/D140921
---
 media/libdav1d/README_MOZILLA | 3 +-
 media/libdav1d/asm/moz.build | 30 +-
 media/libdav1d/moz.build | 6 +-
 media/libdav1d/moz.yaml | 4 +-
 media/libdav1d/vcs_version.h | 2 +-
 media/libdav1d/version.h | 4 +-
 third_party/dav1d/NEWS | 29 +
 third_party/dav1d/README.md | 12 +-
 third_party/dav1d/THANKS.md | 28 +-
 third_party/dav1d/include/dav1d/common.h | 5 +
 third_party/dav1d/include/dav1d/dav1d.h | 13 +
 third_party/dav1d/meson.build | 8 +-
 third_party/dav1d/src/arm/32/filmgrain.S | 2039 +++++++++++++++++++
 third_party/dav1d/src/arm/32/filmgrain16.S | 2137 ++++++++++++++++++++
 third_party/dav1d/src/arm/32/itx.S | 358 ++--
 third_party/dav1d/src/arm/64/filmgrain.S | 2010 ++++++++++++++++++
 third_party/dav1d/src/arm/64/filmgrain16.S | 1997 ++++++++++++++++++
 third_party/dav1d/src/arm/64/itx.S | 282 +--
 third_party/dav1d/src/arm/64/looprestoration.S | 4 +
 third_party/dav1d/src/arm/64/looprestoration16.S | 4 +
 third_party/dav1d/src/arm/asm.S | 115 +-
 ...ilm_grain_init_tmpl.c => filmgrain_init_tmpl.c} | 2 +-
 third_party/dav1d/src/data.c | 13 +-
 third_party/dav1d/src/data.h | 1 +
 third_party/dav1d/src/decode.c | 20 +-
 third_party/dav1d/src/ext/x86/x86inc.asm | 35 +-
 third_party/dav1d/src/fg_apply.h | 25 +-
 third_party/dav1d/src/fg_apply_tmpl.c | 140 +-
 third_party/dav1d/src/filmgrain.h | 86 +
 third_party/dav1d/src/filmgrain_tmpl.c | 433 ++++
 third_party/dav1d/src/internal.h | 53 +-
 third_party/dav1d/src/lib.c | 92 +-
 third_party/dav1d/src/meson.build | 26 +-
 third_party/dav1d/src/obu.c | 44 +-
 third_party/dav1d/src/picture.c | 1 +
 third_party/dav1d/src/tables.c | 2 +-
 third_party/dav1d/src/thread_task.c | 137 +-
 third_party/dav1d/src/thread_task.h | 2 +
 third_party/dav1d/src/x86/cdef16_avx2.asm | 8 -
 third_party/dav1d/src/x86/cdef16_sse.asm | 8 -
 ...{film_grain16_avx2.asm => filmgrain16_avx2.asm} | 31 +-
 .../{film_grain16_sse.asm => filmgrain16_sse.asm} | 31 +-
 .../{film_grain_avx2.asm => filmgrain_avx2.asm} | 48 +-
 third_party/dav1d/src/x86/filmgrain_avx512.asm | 1079 ++++++++++
 third_party/dav1d/src/x86/filmgrain_common.asm | 46 +
 ...ilm_grain_init_tmpl.c => filmgrain_init_tmpl.c} | 53 +-
 .../x86/{film_grain_sse.asm => filmgrain_sse.asm} | 31 +-
 third_party/dav1d/src/x86/ipred16_avx2.asm | 10 +-
 third_party/dav1d/src/x86/ipred16_avx512.asm | 833 ++++++++
 third_party/dav1d/src/x86/ipred16_sse.asm | 8 -
 third_party/dav1d/src/x86/ipred_init_tmpl.c | 2 +-
 third_party/dav1d/src/x86/itx16_avx2.asm | 8 -
 third_party/dav1d/src/x86/itx16_sse.asm | 8 -
 third_party/dav1d/src/x86/itx_avx2.asm | 9 -
 third_party/dav1d/src/x86/itx_avx512.asm | 9 -
 third_party/dav1d/src/x86/itx_init_tmpl.c | 22 +-
 third_party/dav1d/src/x86/itx_sse.asm | 340 ++--
 third_party/dav1d/src/x86/loopfilter16_avx2.asm | 8 -
 third_party/dav1d/src/x86/loopfilter16_sse.asm | 8 -
 .../dav1d/src/x86/looprestoration16_avx2.asm | 8 -
 third_party/dav1d/src/x86/mc16_avx2.asm | 8 -
 third_party/dav1d/src/x86/mc16_avx512.asm | 8 -
 third_party/dav1d/src/x86/mc16_sse.asm | 12 +-
 third_party/dav1d/src/x86/mc_avx512.asm | 8 -
 third_party/dav1d/tests/checkasm/filmgrain.c | 10 +-
 65 files changed, 11853 insertions(+), 1003 deletions(-)
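[Editor's note] Besides the film_grain -> filmgrain renames and the build glue, this update pulls in two new public entry points, declared in the include/dav1d/common.h and include/dav1d/dav1d.h hunks further down: dav1d_data_props_unref() and dav1d_get_decode_error_data_props(). As a rough, hedged sketch of how a client is expected to use them (the helper name and the error reporting are illustrative, not part of the patch):

    #include <errno.h>
    #include <stdio.h>

    #include <dav1d/dav1d.h>

    /* Illustrative helper: after dav1d_send_data() or dav1d_get_picture()
     * returns a real error (anything negative other than EAGAIN), ask dav1d
     * which input packet it was decoding, then release the reference. */
    static void report_decode_error(Dav1dContext *c, int err) {
        if (err >= 0 || err == DAV1D_ERR(EAGAIN))
            return;
        Dav1dDataProps props;
        if (dav1d_get_decode_error_data_props(c, &props) == 0) {
            fprintf(stderr, "decode error %d in packet at offset %lld\n",
                    err, (long long)props.offset);
            dav1d_data_props_unref(&props); /* caller owns the returned reference */
        }
    }
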
diff --git a/media/libdav1d/README_MOZILLA b/media/libdav1d/README_MOZILLA index eba613b2969da..bc5c2708696e1 100644 --- a/media/libdav1d/README_MOZILLA +++ b/media/libdav1d/README_MOZILLA @@ -39,7 +39,8 @@ The rough steps are: moz.build which has a condition on CONFIG['CPU_ARCH']. - Files ending in _tmpl.c may be automatically added to SOURCES, resulting in build failures. To fix this, the file must be moved to the appropriate bitdepth_basenames list where - generate_source.py will generate the templates into buildable source files. + generate_source.py will generate the templates into buildable source files. In general, + all *_tmpl.c files require BITDEPTH to be defined. - Clone the tag from the dav1d repo and build a stand-alone libdav1d following the steps here: https://code.videolan.org/videolan/dav1d#compile - Copy vcs_version.h from the local build/include/vcs_version.h diff --git a/media/libdav1d/asm/moz.build b/media/libdav1d/asm/moz.build index 02647876d4904..7530c087ce30a 100644 --- a/media/libdav1d/asm/moz.build +++ b/media/libdav1d/asm/moz.build @@ -87,8 +87,14 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'): '../../../third_party/dav1d/src/x86/cdef16_avx2.asm', # moved from autovendored '../../../third_party/dav1d/src/x86/cdef_avx2.asm', '../../../third_party/dav1d/src/x86/cdef_avx512.asm', - '../../../third_party/dav1d/src/x86/film_grain16_avx2.asm', - '../../../third_party/dav1d/src/x86/film_grain_avx2.asm', + '../../../third_party/dav1d/src/x86/filmgrain16_avx2.asm', + '../../../third_party/dav1d/src/x86/filmgrain_avx2.asm', + '../../../third_party/dav1d/src/x86/filmgrain_avx512.asm', + # '../../../third_party/dav1d/src/x86/filmgrain_common.asm', + # ^ filmgrain_common.asm must *not* be in SOURCES because it's only + # used as an %include for other .asm files. Trying to assemble it + # will result in a nasm error: parser: instruction expected + '../../../third_party/dav1d/src/x86/ipred16_avx512.asm', '../../../third_party/dav1d/src/x86/ipred_avx2.asm', '../../../third_party/dav1d/src/x86/ipred_avx512.asm', '../../../third_party/dav1d/src/x86/itx16_avx2.asm', @@ -111,8 +117,8 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'): '../../../third_party/dav1d/src/x86/cdef16_sse.asm', '../../../third_party/dav1d/src/x86/cdef_sse.asm', '../../../third_party/dav1d/src/x86/cpuid.asm', - '../../../third_party/dav1d/src/x86/film_grain16_sse.asm', - '../../../third_party/dav1d/src/x86/film_grain_sse.asm', + '../../../third_party/dav1d/src/x86/filmgrain16_sse.asm', + '../../../third_party/dav1d/src/x86/filmgrain_sse.asm', '../../../third_party/dav1d/src/x86/ipred16_avx2.asm', '../../../third_party/dav1d/src/x86/ipred16_sse.asm', '../../../third_party/dav1d/src/x86/ipred_sse.asm', @@ -129,10 +135,12 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'): ]
# BITDEPTH + # All the files here should be *_tmpl.c, and the should not appear in SOURCES, + # since they require BITDEPTH to be defined relative_path = '../../../third_party/dav1d/src/x86/' bitdepth_basenames = [ 'cdef_init_tmpl.c', - 'film_grain_init_tmpl.c', + 'filmgrain_init_tmpl.c', 'ipred_init_tmpl.c', 'itx_init_tmpl.c', 'loopfilter_init_tmpl.c', @@ -167,10 +175,12 @@ elif CONFIG['CPU_ARCH'] == 'arm' or CONFIG['CPU_ARCH'] == 'aarch64': ]
# BITDEPTH c file + # All the files here should be *_tmpl.c, and the should not appear in SOURCES, + # since they require BITDEPTH to be defined relative_path = '../../../third_party/dav1d/src/arm/' bitdepth_basenames = [ 'cdef_init_tmpl.c', - 'film_grain_init_tmpl.c', + 'filmgrain_init_tmpl.c', 'ipred_init_tmpl.c', 'itx_init_tmpl.c', 'loopfilter_init_tmpl.c', @@ -198,8 +208,8 @@ elif CONFIG['CPU_ARCH'] == 'arm' or CONFIG['CPU_ARCH'] == 'aarch64': '../../../third_party/dav1d/src/arm/64/cdef.S', '../../../third_party/dav1d/src/arm/64/cdef16.S', '../../../third_party/dav1d/src/arm/64/cdef_tmpl.S', - '../../../third_party/dav1d/src/arm/64/film_grain.S', - '../../../third_party/dav1d/src/arm/64/film_grain16.S', + '../../../third_party/dav1d/src/arm/64/filmgrain.S', + '../../../third_party/dav1d/src/arm/64/filmgrain16.S', '../../../third_party/dav1d/src/arm/64/ipred.S', '../../../third_party/dav1d/src/arm/64/ipred16.S', # itx.S is used for both 8 and 16 bpc. @@ -221,8 +231,8 @@ elif CONFIG['CPU_ARCH'] == 'arm' or CONFIG['CPU_ARCH'] == 'aarch64': '../../../third_party/dav1d/src/arm/32/cdef.S', '../../../third_party/dav1d/src/arm/32/cdef16.S', '../../../third_party/dav1d/src/arm/32/cdef_tmpl.S', - '../../../third_party/dav1d/src/arm/32/film_grain.S', - '../../../third_party/dav1d/src/arm/32/film_grain16.S', + '../../../third_party/dav1d/src/arm/32/filmgrain.S', + '../../../third_party/dav1d/src/arm/32/filmgrain16.S', '../../../third_party/dav1d/src/arm/32/ipred.S', '../../../third_party/dav1d/src/arm/32/ipred16.S', '../../../third_party/dav1d/src/arm/32/itx.S', diff --git a/media/libdav1d/moz.build b/media/libdav1d/moz.build index c84cef8802b8e..923dca05ebead 100644 --- a/media/libdav1d/moz.build +++ b/media/libdav1d/moz.build @@ -103,7 +103,7 @@ EXPORTS.dav1d.src += [ '../../third_party/dav1d/src/data.h', '../../third_party/dav1d/src/decode.h', '../../third_party/dav1d/src/dequant_tables.h', - '../../third_party/dav1d/src/film_grain.h', + '../../third_party/dav1d/src/filmgrain.h', '../../third_party/dav1d/src/getbits.h', '../../third_party/dav1d/src/intra_edge.h', '../../third_party/dav1d/src/lf_mask.h', @@ -123,12 +123,14 @@ EXPORTS.dav1d.src += [ ]
# common BITDEPTH 8, 16 +# All the files here should be *_tmpl.c, and the should not appear in SOURCES, +# since they require BITDEPTH to be defined relative_path = '../../third_party/dav1d/src/' bitdepth_basenames = [ 'cdef_apply_tmpl.c', 'cdef_tmpl.c', 'fg_apply_tmpl.c', - 'film_grain_tmpl.c', + 'filmgrain_tmpl.c', 'ipred_prepare_tmpl.c', 'ipred_tmpl.c', 'itx_tmpl.c', diff --git a/media/libdav1d/moz.yaml b/media/libdav1d/moz.yaml index 2de4cdc699621..623ce9625d4de 100644 --- a/media/libdav1d/moz.yaml +++ b/media/libdav1d/moz.yaml @@ -20,11 +20,11 @@ origin:
# Human-readable identifier for this version/release # Generally "version NNN", "tag SSS", "bookmark SSS" - release: commit 1f09a9119fb794ab41b1e527d848c2a210ca43d4 (2022-02-04T23:02:17.000-03:00). + release: commit 28a9c46e1c36540d3276299f2e284ece1d2386be (2022-03-10T14:49:23.000+00:00).
# Revision to pull in # Must be a long or short commit SHA (long preferred) - revision: 1f09a9119fb794ab41b1e527d848c2a210ca43d4 + revision: 28a9c46e1c36540d3276299f2e284ece1d2386be
# The package's license, where possible using the mnemonic from # https://spdx.org/licenses/ diff --git a/media/libdav1d/vcs_version.h b/media/libdav1d/vcs_version.h index 980b8d307dcaf..a449a2c0360d7 100644 --- a/media/libdav1d/vcs_version.h +++ b/media/libdav1d/vcs_version.h @@ -1,2 +1,2 @@ /* auto-generated, do not edit */ -#define DAV1D_VERSION "1f09a9119fb794ab41b1e527d848c2a210ca43d4" +#define DAV1D_VERSION "0.9.2-170-g28a9c46" diff --git a/media/libdav1d/version.h b/media/libdav1d/version.h index a797de5df28c5..5619d66bb76a1 100644 --- a/media/libdav1d/version.h +++ b/media/libdav1d/version.h @@ -27,8 +27,8 @@ #ifndef DAV1D_VERSION_H #define DAV1D_VERSION_H
-#define DAV1D_API_VERSION_MAJOR 9 -#define DAV1D_API_VERSION_MINOR 2 +#define DAV1D_API_VERSION_MAJOR 6 +#define DAV1D_API_VERSION_MINOR 6 #define DAV1D_API_VERSION_PATCH 0
#endif /* DAV1D_VERSION_H */ diff --git a/third_party/dav1d/NEWS b/third_party/dav1d/NEWS index 2a4c4d00ca637..6158d780487bd 100644 --- a/third_party/dav1d/NEWS +++ b/third_party/dav1d/NEWS @@ -1,3 +1,32 @@ +Changes for 1.0.0 'Peregrine falcon': +------------------------------------- + +1.0.0 is a major release of dav1d, adding important features and bug fixes. + +It notably changes, in an important way, the way threading works, by adding +an automatic thread management. + +It also adds support for AVX-512 acceleration, and adds speedups to existing x86 +code (from SSE2 to AVX2). + +1.0.0 adds new grain API to ease acceleration on the GPU. + +Finally, 1.0.0 fixes numerous small bugs that were reported since the beginning +of the project to have a proper release. + + .''. + .''. . *''* :_/_: . + :_/_: _(/_ .:.*_/_* : /\ : .'.:.'. + .''.: /\ : ./)\ ':'* /\ * : '..'. -=:o:=- + :_/_:'.:::. ' *''* * '.'/.' _(/_'.':'.' + : /\ : ::::: *_/_* -= o =- /)\ ' * + '..' ':::' * /\ * .'/.'. ' + * *..* : + * : + * 1.0.0 + + + Changes for 0.9.2 'Golden Eagle': ---------------------------------
diff --git a/third_party/dav1d/README.md b/third_party/dav1d/README.md index 34a351e247f21..a8a35e1c6fe2d 100644 --- a/third_party/dav1d/README.md +++ b/third_party/dav1d/README.md @@ -2,11 +2,13 @@
# dav1d
-**dav1d** is a new **AV1** cross-platform **d**ecoder, open-source, and focused on speed and correctness. +**dav1d** is an **AV1** cross-platform **d**ecoder, open-source, and focused on speed and correctness. + +It is now battle-tested and production-ready and can be used everywhere.
The canonical repository URL for this repo is https://code.videolan.org/videolan/dav1d
-This project is partially funded by the *Alliance for Open Media*/**AOM**. +This project was partially funded by the *Alliance for Open Media*/**AOM**.
## Goal and Features
@@ -38,11 +40,11 @@ The plan is the following: 9. Make high bit-depth fast on older mobile, by writing asm for ARMv7 chips, 10. Make high bit-depth fast on desktop, by writing asm for AVX2 chips, 11. Make high bit-depth fast on older desktop, by writing asm for SSSE3+ chips, +12. Improve threading.
### On-going -12. Improve C code base with [various tweaks](https://code.videolan.org/videolan/dav1d/wikis/task-list), -13. Accelerate for less common architectures, like PPC, SSE2 or AVX-512. -14. Improve threading. +13. Improve C code base with [various tweaks](https://code.videolan.org/videolan/dav1d/wikis/task-list), +14. Accelerate for less common architectures, like PPC, SSE2, RISC-V or AVX-512.
### After 15. Use more GPU decoding, when possible. diff --git a/third_party/dav1d/THANKS.md b/third_party/dav1d/THANKS.md index 06d879853c3e6..2791e3dcc61bc 100644 --- a/third_party/dav1d/THANKS.md +++ b/third_party/dav1d/THANKS.md @@ -16,16 +16,18 @@ The Alliance for Open Media (AOM) for funding this project.
And all the dav1d Authors (git shortlog -sn), including:
-Martin Storsjö, Janne Grunau, Henrik Gramner, Ronald S. Bultje, James Almer, -Marvin Scholz, Luc Trudeau, Victorien Le Couviour--Tuffet, Jean-Baptiste Kempf, -Hugo Beauzée-Luyssen, Matthias Dressel, Konstantin Pavlov, David Michael Barr, -Steve Lhomme, Niklas Haas, B Krishnan Iyer, Francois Cartegnie, Liwei Wang, -Nathan E. Egge, Derek Buitenhuis, Michael Bradshaw, Raphaël Zumer, -Xuefeng Jiang, Luca Barbato, Jan Beich, Wan-Teh Chang, Justin Bull, Boyuan Xiao, -Dale Curtis, Kyle Siefring, Raphael Zumer, Rupert Swarbrick, Thierry Foucu, -Thomas Daede, Colin Lee, Emmanuel Gil Peyrot, Lynne, Michail Alvanos, -Nico Weber, SmilingWolf, Tristan Laurent, Vittorio Giovara, Anisse Astier, -Dmitriy Sychov, Ewout ter Hoeven, Fred Barbier, Jean-Yves Avenard, -Mark Shuttleworth, Matthieu Bouron, Nicolas Frattaroli, Pablo Stebler, -Rostislav Pehlivanov, Shiz, Steinar Midtskogen, Sylvestre Ledru, Timo Gurr, -Tristan Matthews, Xavier Claessens, Xu Guangxin, kossh1 and skal. +Martin Storsjö, Henrik Gramner, Ronald S. Bultje, Janne Grunau, James Almer, +Victorien Le Couviour--Tuffet, Matthias Dressel, Marvin Scholz, Luc Trudeau, +Jean-Baptiste Kempf, Hugo Beauzée-Luyssen, Niklas Haas, Konstantin Pavlov, +David Michael Barr, Steve Lhomme, Nathan E. Egge, Kyle Siefring, Raphaël Zumer, +B Krishnan Iyer, Francois Cartegnie, Liwei Wang, Derek Buitenhuis, +Michael Bradshaw, Wan-Teh Chang, Xuefeng Jiang, Luca Barbato, Jan Beich, +Christophe Gisquet, Justin Bull, Boyuan Xiao, Dale Curtis, Emmanuel Gil Peyrot, +Rupert Swarbrick, Thierry Foucu, Thomas Daede, Colin Lee, Jonathan Wright, +Lynne, Michail Alvanos, Nico Weber, Salome Thirot, SmilingWolf, Tristan Laurent, +Vittorio Giovara, Yannis Guyon, André Kempe, Anisse Astier, Anton Mitrofanov, +Dmitriy Sychov, Ewout ter Hoeven, Fred Barbier, Jean-Yves Avenard, Joe Drago, +Mark Shuttleworth, Matthieu Bouron, Mehdi Sabwat, Nicolas Frattaroli, +Pablo Stebler, Rostislav Pehlivanov, Shiz, Steinar Midtskogen, Sylvain BERTRAND, +Sylvestre Ledru, Timo Gurr, Tristan Matthews, Vibhoothi, Xavier Claessens, +Xu Guangxin, kossh1 and skal diff --git a/third_party/dav1d/include/dav1d/common.h b/third_party/dav1d/include/dav1d/common.h index b55e9399f06f6..8685b4f078285 100644 --- a/third_party/dav1d/include/dav1d/common.h +++ b/third_party/dav1d/include/dav1d/common.h @@ -78,4 +78,9 @@ typedef struct Dav1dDataProps { struct Dav1dUserData user_data; ///< user-configurable data, default NULL members } Dav1dDataProps;
+/** + * Release reference to a Dav1dDataProps. + */ +DAV1D_API void dav1d_data_props_unref(Dav1dDataProps *props); + #endif /* DAV1D_COMMON_H */ diff --git a/third_party/dav1d/include/dav1d/dav1d.h b/third_party/dav1d/include/dav1d/dav1d.h index 0938156f759d5..fd3b622505b7e 100644 --- a/third_party/dav1d/include/dav1d/dav1d.h +++ b/third_party/dav1d/include/dav1d/dav1d.h @@ -274,6 +274,19 @@ enum Dav1dEventFlags { */ DAV1D_API int dav1d_get_event_flags(Dav1dContext *c, enum Dav1dEventFlags *flags);
+/** + * Retrieve the user-provided metadata associated with the input data packet + * for the last decoding error reported to the user, i.e. a negative return + * value (not EAGAIN) from dav1d_send_data() or dav1d_get_picture(). + * + * @param c Input decoder instance. + * @param out Output Dav1dDataProps. On success, the caller assumes ownership of + * the returned reference. + * + * @return 0 on success, or < 0 (a negative DAV1D_ERR code) on error. + */ +DAV1D_API int dav1d_get_decode_error_data_props(Dav1dContext *c, Dav1dDataProps *out); + # ifdef __cplusplus } # endif diff --git a/third_party/dav1d/meson.build b/third_party/dav1d/meson.build index 0e63d3bced698..5efb88c5d3fb4 100644 --- a/third_party/dav1d/meson.build +++ b/third_party/dav1d/meson.build @@ -30,7 +30,7 @@ project('dav1d', ['c'], 'b_ndebug=if-release'], meson_version: '>= 0.49.0')
-dav1d_soname_version = '6.4.0' +dav1d_soname_version = '6.6.0' dav1d_api_version_array = dav1d_soname_version.split('.') dav1d_api_version_major = dav1d_api_version_array[0] dav1d_api_version_minor = dav1d_api_version_array[1] @@ -413,11 +413,7 @@ if is_asm_enabled and host_machine.cpu_family().startswith('x86')
# check NASM version if nasm.found() - nasm_r = run_command(nasm, '-v') - - if nasm_r.returncode() != 0 - error('failed running nasm to obtain its version') - endif + nasm_r = run_command(nasm, '-v', check: true)
out = nasm_r.stdout().strip().split() if out[1].to_lower() == 'version' diff --git a/third_party/dav1d/src/arm/32/filmgrain.S b/third_party/dav1d/src/arm/32/filmgrain.S new file mode 100644 index 0000000000000..d1f83efb98e01 --- /dev/null +++ b/third_party/dav1d/src/arm/32/filmgrain.S @@ -0,0 +1,2039 @@ +/* + * Copyright © 2021, VideoLAN and dav1d authors + * Copyright © 2021, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" +#include "src/arm/asm-offsets.h" + +#define GRAIN_WIDTH 82 +#define GRAIN_HEIGHT 73 + +#define SUB_GRAIN_WIDTH 44 +#define SUB_GRAIN_HEIGHT 38 + +.macro increment_seed steps, shift=1 + lsr r11, r2, #3 + lsr r12, r2, #12 + lsr lr, r2, #1 + eor r11, r2, r11 // (r >> 0) ^ (r >> 3) + eor r12, r12, lr // (r >> 12) ^ (r >> 1) + eor r11, r11, r12 // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1) +.if \shift + lsr r2, r2, #\steps +.endif + and r11, r11, #((1 << \steps) - 1) // bit +.if \shift + orr r2, r2, r11, lsl #(16 - \steps) // *state +.else + orr r2, r2, r11, lsl #16 // *state +.endif +.endm + +.macro read_rand dest, bits, age + ubfx \dest, r2, #16 - \bits - \age, #\bits +.endm + +.macro read_shift_rand dest, bits + ubfx \dest, r2, #17 - \bits, #\bits + lsr r2, r2, #1 +.endm + +// special calling convention: +// r2 holds seed +// r3 holds dav1d_gaussian_sequence +// clobbers r11-r12 +// returns in d0-d1 +function get_gaussian_neon + push {r5-r6,lr} + increment_seed 4 + read_rand r5, 11, 3 + read_rand r6, 11, 2 + add r5, r3, r5, lsl #1 + add r6, r3, r6, lsl #1 + vld1.16 {d0[0]}, [r5] + read_rand r5, 11, 1 + vld1.16 {d0[1]}, [r6] + add r5, r3, r5, lsl #1 + read_rand r6, 11, 0 + increment_seed 4 + add r6, r3, r6, lsl #1 + vld1.16 {d0[2]}, [r5] + read_rand r5, 11, 3 + vld1.16 {d0[3]}, [r6] + add r5, r3, r5, lsl #1 + read_rand r6, 11, 2 + vld1.16 {d1[0]}, [r5] + add r6, r3, r6, lsl #1 + read_rand r5, 11, 1 + vld1.16 {d1[1]}, [r6] + read_rand r6, 11, 0 + add r5, r3, r5, lsl #1 + add r6, r3, r6, lsl #1 + vld1.16 {d1[2]}, [r5] + vld1.16 {d1[3]}, [r6] + pop {r5-r6,pc} +endfunc + +.macro get_grain_row r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10 + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + vmovn.i16 \r0, q0 + bl get_gaussian_neon + vrshl.s16 q0, 
q0, q15 + vmovn.i16 \r1, q0 + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + vmovn.i16 \r2, q0 + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + vmovn.i16 \r3, q0 + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + vmovn.i16 \r4, q0 + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + vmovn.i16 \r5, q0 + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + vmovn.i16 \r6, q0 + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + vmovn.i16 \r7, q0 + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + vmovn.i16 \r8, q0 + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + vmovn.i16 \r9, q0 + increment_seed 2 + read_rand r11, 11, 1 + read_rand r12, 11, 0 + add r11, r3, r11, lsl #1 + add r12, r3, r12, lsl #1 + vld1.16 {d0[0]}, [r11] + vld1.16 {d0[1]}, [r12] + vrshl.s16 d0, d0, d30 + vmovn.i16 \r10, q0 +.endm + +.macro store_grain_row r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10 + vst1.16 {\r0, \r1, \r2, \r3}, [r0]! + vst1.16 {\r4, \r5, \r6, \r7}, [r0]! + vst1.16 {\r8, \r9}, [r0]! + vst1.16 {\r10[0]}, [r0]! +.endm + +.macro get_grain_row_44 r0, r1, r2, r3, r4, r5 + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + vmovn.i16 \r0, q0 + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + vmovn.i16 \r1, q0 + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + vmovn.i16 \r2, q0 + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + vmovn.i16 \r3, q0 + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + vmovn.i16 \r4, q0 + increment_seed 4 + read_rand r11, 11, 3 + read_rand r12, 11, 2 + add r11, r3, r11, lsl #1 + add r12, r3, r12, lsl #1 + vld1.16 {d0[]}, [r11] + read_rand r11, 11, 1 + vld1.16 {d0[1]}, [r12] + add r11, r3, r11, lsl #1 + read_rand r12, 11, 0 + vld1.16 {d0[2]}, [r11] + add r12, r3, r12, lsl #1 + vld1.16 {d0[3]}, [r12] + vrshl.s16 d0, d0, d30 + vmovn.i16 \r5, q0 +.endm + +.macro store_grain_row_44 r0, r1, r2, r3, r4, r5 + vst1.16 {\r0, \r1, \r2, \r3}, [r0]! 
+ vst1.16 {\r4, \r5}, [r0] + add r0, r0, #GRAIN_WIDTH-32 +.endm + +function get_grain_2_neon + push {r11,lr} + increment_seed 2 + read_rand r11, 11, 1 + read_rand r12, 11, 0 + add r11, r3, r11, lsl #1 + add r12, r3, r12, lsl #1 + vld1.16 {d0[0]}, [r11] + vld1.16 {d0[1]}, [r12] + vrshl.s16 d0, d0, d30 + vmovn.i16 d0, q0 + pop {r11,pc} +endfunc + +.macro get_grain_2 dst + bl get_grain_2_neon +.ifnc \dst, d0 + vmov \dst, d0 +.endif +.endm + +// r1 holds the number of entries to produce +// r6, r8 and r10 hold the previous output entries +// q0 holds the vector of produced entries +// q1 holds the input vector of sums from above +.macro output_lag n +function output_lag\n()_neon + push {r0, lr} +.if \n == 1 + mov lr, #-128 +.else + mov r0, #1 + mov lr, #1 + sub r7, r7, #1 + sub r9, r9, #1 + lsl r0, r0, r7 + lsl lr, lr, r9 + add r7, r7, #1 + add r9, r9, #1 +.endif +1: + read_shift_rand r12, 11 + vmov.32 r11, d2[0] + lsl r12, r12, #1 + vext.8 q0, q0, q0, #1 + ldrsh r12, [r3, r12] +.if \n == 1 + mla r11, r6, r4, r11 // sum (above) + *coeff * prev output + add r6, r11, r8 // 1 << (ar_coeff_shift - 1) + add r12, r12, r10 + asr r6, r6, r7 // >> ar_coeff_shift + asr r12, r12, r9 // >> (4 + grain_scale_shift) + add r6, r6, r12 + cmp r6, r5 +.elseif \n == 2 + mla r11, r8, r4, r11 // sum (above) + *coeff * prev output 1 + mla r11, r6, r10, r11 // += *coeff * prev output 2 + mov r8, r6 + add r6, r11, r0 // 1 << (ar_coeff_shift - 1) + add r12, r12, lr // 1 << (4 + grain_scale_shift - 1) + asr r6, r6, r7 // >> ar_coeff_shift + asr r12, r12, r9 // >> (4 + grain_scale_shift) + add r6, r6, r12 + push {lr} + cmp r6, r5 + mov lr, #-128 +.else + push {r1-r3} + sbfx r1, r4, #0, #8 + sbfx r2, r4, #8, #8 + sbfx r3, r4, #16, #8 + mla r11, r10, r1, r11 // sum (above) + *coeff * prev output 1 + mla r11, r8, r2, r11 // sum (above) + *coeff * prev output 2 + mla r11, r6, r3, r11 // += *coeff * prev output 3 + pop {r1-r3} + mov r10, r8 + mov r8, r6 + + add r6, r11, r0 // 1 << (ar_coeff_shift - 1) + add r12, r12, lr // 1 << (4 + grain_scale_shift - 1) + asr r6, r6, r7 // >> ar_coeff_shift + asr r12, r12, r9 // >> (4 + grain_scale_shift) + add r6, r6, r12 + push {lr} + cmp r6, r5 + mov lr, #-128 +.endif + it gt + movgt r6, r5 + cmp r6, lr + it lt + movlt r6, lr +.if \n >= 2 + pop {lr} +.endif + subs r1, r1, #1 + vext.8 q1, q1, q1, #4 + vmov.8 d1[7], r6 + bgt 1b + pop {r0, pc} +endfunc +.endm + +output_lag 1 +output_lag 2 +output_lag 3 + + +function sum_lag1_above_neon + vmull.s8 q2, d6, d28 + vmull.s8 q3, d7, d28 + vmull.s8 q4, d0, d27 + vmull.s8 q5, d1, d27 + + vaddl.s16 q0, d4, d8 + vaddl.s16 q2, d5, d9 + vaddl.s16 q4, d6, d10 + vaddl.s16 q5, d7, d11 + + vmull.s8 q3, d3, d29 + vmull.s8 q1, d2, d29 + + vaddw.s16 q4, q4, d6 + vaddw.s16 q5, q5, d7 + vaddw.s16 q3, q2, d3 + vaddw.s16 q2, q0, d2 + bx lr +endfunc + +.macro sum_lag_n_body lag, type, uv_layout, edge, elems, store, uv_coeff +.ifc \lag()_\edge, lag3_left + bl sum_lag3_left_above_neon +.else + bl sum_\lag()_above_neon +.endif +.ifc \type, uv_420 + vpush {q6-q7} + add r12, r11, #GRAIN_WIDTH + vld1.16 {q0, q1}, [r11]! + vld1.16 {q6, q7}, [r12]! + vpaddl.s8 q0, q0 + vpaddl.s8 q1, q1 + vpaddl.s8 q6, q6 + vpaddl.s8 q7, q7 + vadd.i16 q0, q0, q6 + vadd.i16 q1, q1, q7 + vpop {q6-q7} + vrshrn.s16 d0, q0, #2 + vrshrn.s16 d1, q1, #2 +.endif +.ifc \type, uv_422 + vld1.8 {q0, q1}, [r11]! + vpaddl.s8 q0, q0 + vpaddl.s8 q1, q1 + vrshrn.s16 d0, q0, #1 + vrshrn.s16 d1, q1, #1 +.endif +.ifc \type, uv_444 + vld1.8 {q0}, [r11]! 
+.endif +.if \uv_layout +.ifnb \uv_coeff + vdup.8 d13, \uv_coeff +.endif + vmull.s8 q1, d0, d13 + vmull.s8 q0, d1, d13 + vaddw.s16 q2, q2, d2 + vaddw.s16 q3, q3, d3 + vaddw.s16 q4, q4, d0 + vaddw.s16 q5, q5, d1 +.endif +.if \uv_layout && \elems == 16 + b sum_\lag()_y_\edge()_start +.elseif \uv_layout == 444 && \elems == 15 + b sum_\lag()_y_\edge()_start +.elseif \uv_layout == 422 && \elems == 9 + b sum_\lag()_uv_420_\edge()_start +.else +sum_\lag()_\type()_\edge()_start: + push {r11} +.ifc \edge, left + increment_seed 4 + read_rand r11, 11, 3 + read_rand r12, 11, 2 + add r11, r3, r11, lsl #1 + add r12, r3, r12, lsl #1 + vld1.16 {d1[1]}, [r11] + read_rand r11, 11, 1 + vld1.16 {d1[2]}, [r12] + add r11, r3, r11, lsl #1 + vld1.16 {d1[3]}, [r11] + lsl r2, r2, #1 // shift back the state as if we'd done increment_seed with shift=0 + vrshl.s16 d1, d1, d30 + vmovn.i16 d1, q0 + vext.8 q2, q2, q2, #12 +.ifc \lag, lag3 + vmov.s8 r10, d1[5] +.endif +.ifnc \lag, lag1 + vmov.s8 r8, d1[6] +.endif + vmov.s8 r6, d1[7] + + vmov q1, q2 + mov r1, #1 + bl output_\lag()_neon +.else + increment_seed 4, shift=0 + vmov q1, q2 + mov r1, #4 + bl output_\lag()_neon +.endif + + increment_seed 4, shift=0 + vmov q1, q3 + mov r1, #4 + bl output_\lag()_neon + + increment_seed 4, shift=0 + vmov q1, q4 +.if \elems == 9 + mov r1, #1 + bl output_\lag()_neon + lsr r2, r2, #3 + + read_rand r11, 11, 2 + read_rand r12, 11, 1 + add r11, r3, r11, lsl #1 + add r12, r3, r12, lsl #1 + vld1.16 {d2[0]}, [r11] + read_rand r11, 11, 0 + vld1.16 {d2[1]}, [r12] + add r11, r3, r11, lsl #1 + vld1.16 {d2[2]}, [r11] + vrshl.s16 d2, d2, d30 + vmovn.i16 d2, q1 + vext.8 q0, q0, q1, #7 +.else + mov r1, #4 + bl output_\lag()_neon + + increment_seed 4, shift=0 + vmov q1, q5 + +.ifc \edge, right + mov r1, #3 + bl output_\lag()_neon + read_shift_rand r11, 11 + add r11, r3, r11, lsl #1 + vld1.16 {d2[0]}, [r11] + vrshl.s16 d2, d2, d30 + vext.8 q0, q0, q1, #1 +.else + mov r1, #4 + bl output_\lag()_neon +.endif +.endif +.if \store + vst1.8 {q0}, [r0]! 
+.endif + pop {r11} + pop {r1, pc} +.endif +.endm + +.macro sum_lag1_func type, uv_layout, edge, elems=16 +function sum_\type()_lag1_\edge()_neon + push {r1, lr} + sum_lag_n_body lag1, \type, \uv_layout, \edge, \elems, store=0 +endfunc +.endm + +sum_lag1_func y, 0, left +sum_lag1_func y, 0, mid +sum_lag1_func y, 0, right, 15 +sum_lag1_func uv_444, 444, left +sum_lag1_func uv_444, 444, mid +sum_lag1_func uv_444, 444, right, 15 +sum_lag1_func uv_422, 422, left +sum_lag1_func uv_422, 422, mid +sum_lag1_func uv_422, 422, right, 9 +sum_lag1_func uv_420, 420, left +sum_lag1_func uv_420, 420, mid +sum_lag1_func uv_420, 420, right, 9 + +.macro sum_lag1 type, dst, left, mid, right, edge=mid + vmov q3, \mid + vext.8 q0, \left, \mid, #15 + vext.8 q1, \mid, \right, #1 + bl sum_\type()_lag1_\edge()_neon + vmov \dst, q0 +.endm + +.macro sum_y_lag1 dst, left, mid, right, edge=mid + sum_lag1 y, \dst, \left, \mid, \right, \edge +.endm + +.macro sum_uv_444_lag1 dst, left, mid, right, edge=mid + sum_lag1 uv_444, \dst, \left, \mid, \right, \edge +.endm + +.macro sum_uv_422_lag1 dst, left, mid, right, edge=mid + sum_lag1 uv_422, \dst, \left, \mid, \right, \edge +.endm + +.macro sum_uv_420_lag1 dst, left, mid, right, edge=mid + sum_lag1 uv_420, \dst, \left, \mid, \right, \edge +.endm + + +function sum_lag2_above_neon + push {lr} + sub r12, r0, #2*GRAIN_WIDTH - 16 + sub lr, r0, #1*GRAIN_WIDTH - 16 + vld1.8 {q10}, [r12] // load top right + vld1.8 {q13}, [lr] + + vext.8 q6, q8, q9, #14 // top left, top mid + vdup.8 d14, d28[0] + vext.8 q8, q8, q9, #15 + vdup.8 d15, d28[1] + + vmull.s8 q0, d12, d14 + vmull.s8 q1, d13, d14 + vmull.s8 q6, d16, d15 + vmull.s8 q8, d17, d15 + + vaddl.s16 q2, d0, d12 + vaddl.s16 q3, d1, d13 + vaddl.s16 q4, d2, d16 + vaddl.s16 q5, d3, d17 + + vext.8 q6, q9, q10, #1 // top mid, top right + vdup.8 d14, d28[3] + vext.8 q8, q9, q10, #2 + vdup.8 d15, d28[4] + + vmull.s8 q0, d12, d14 + vmull.s8 q1, d13, d14 + vmull.s8 q6, d16, d15 + vmull.s8 q8, d17, d15 + + vaddl.s16 q7, d0, d12 + vaddl.s16 q0, d1, d13 + vaddl.s16 q6, d2, d16 + vaddl.s16 q1, d3, d17 + + vadd.i32 q2, q2, q7 + vadd.i32 q3, q3, q0 + vadd.i32 q4, q4, q6 + vadd.i32 q5, q5, q1 + + vext.8 q6, q11, q12, #14 // top left, top mid + vdup.8 d14, d28[5] + vext.8 q8, q11, q12, #15 + vdup.8 d15, d28[6] + + vmull.s8 q0, d12, d14 + vmull.s8 q1, d13, d14 + vmull.s8 q6, d16, d15 + vmull.s8 q8, d17, d15 + + vaddl.s16 q7, d0, d12 + vaddl.s16 q0, d1, d13 + vaddl.s16 q6, d2, d16 + vaddl.s16 q1, d3, d17 + + vadd.i32 q2, q2, q7 + vadd.i32 q3, q3, q0 + vadd.i32 q4, q4, q6 + vadd.i32 q5, q5, q1 + + vext.8 q6, q12, q13, #1 // top mid, top right + vdup.8 d14, d29[0] + vext.8 q8, q12, q13, #2 + vdup.8 d15, d29[1] + + vmull.s8 q0, d12, d14 + vmull.s8 q1, d13, d14 + vmull.s8 q6, d16, d15 + vmull.s8 q8, d17, d15 + + vaddl.s16 q7, d0, d12 + vaddl.s16 q0, d1, d13 + vaddl.s16 q6, d2, d16 + vaddl.s16 q1, d3, d17 + + vadd.i32 q2, q2, q7 + vadd.i32 q3, q3, q0 + vadd.i32 q4, q4, q6 + vadd.i32 q5, q5, q1 + + vdup.8 d14, d28[2] + vdup.8 d15, d28[7] + + vmull.s8 q0, d18, d14 + vmull.s8 q1, d19, d14 + vmull.s8 q6, d24, d15 + vmull.s8 q8, d25, d15 + + vaddl.s16 q7, d0, d12 + vaddl.s16 q0, d1, d13 + vaddl.s16 q6, d2, d16 + vaddl.s16 q1, d3, d17 + + vmov q8, q9 + vmov q9, q10 + + vadd.i32 q2, q2, q7 + vadd.i32 q3, q3, q0 + vadd.i32 q4, q4, q6 + vadd.i32 q5, q5, q1 + + vmov q11, q12 + vmov q12, q13 + + pop {pc} +endfunc + +.macro sum_lag2_func type, uv_layout, edge, elems=16 +function sum_\type()_lag2_\edge()_neon + push {r1, lr} +.ifc \edge, left + sub r12, r0, 
#2*GRAIN_WIDTH + sub lr, r0, #1*GRAIN_WIDTH + vld1.8 {q9}, [r12] // load the previous block right above + vld1.8 {q12}, [lr] +.endif + sum_lag_n_body lag2, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=d29[4] +endfunc +.endm + +sum_lag2_func y, 0, left +sum_lag2_func y, 0, mid +sum_lag2_func y, 0, right, 15 +sum_lag2_func uv_444, 444, left +sum_lag2_func uv_444, 444, mid +sum_lag2_func uv_444, 444, right, 15 +sum_lag2_func uv_422, 422, left +sum_lag2_func uv_422, 422, mid +sum_lag2_func uv_422, 422, right, 9 +sum_lag2_func uv_420, 420, left +sum_lag2_func uv_420, 420, mid +sum_lag2_func uv_420, 420, right, 9 + + +function sum_lag3_left_above_neon + // A separate codepath for the left edge, to avoid reading outside + // of the edge of the buffer. + sub r12, r0, #3*GRAIN_WIDTH + vld1.8 {q11, q12}, [r12] + vext.8 q12, q11, q12, #13 + vext.8 q11, q11, q11, #13 + b sum_lag3_above_start +endfunc + +function sum_lag3_above_neon + sub r12, r0, #3*GRAIN_WIDTH + 3 + vld1.8 {q11, q12}, [r12] + +sum_lag3_above_start: + vdup.8 d20, d26[0] + vext.8 q9, q11, q12, #1 + vdup.8 d21, d26[1] + + vmull.s8 q0, d22, d20 + vmull.s8 q1, d23, d20 + vmull.s8 q6, d18, d21 + vmull.s8 q7, d19, d21 + + vext.8 q8, q11, q12, #2 + vdup.8 d20, d26[2] + vext.8 q9, q11, q12, #3 + vdup.8 d21, d26[3] + + vaddl.s16 q2, d0, d12 + vaddl.s16 q3, d1, d13 + vaddl.s16 q4, d2, d14 + vaddl.s16 q5, d3, d15 + + vmull.s8 q0, d16, d20 + vmull.s8 q1, d17, d20 + vmull.s8 q6, d18, d21 + vmull.s8 q7, d19, d21 + + vaddl.s16 q8, d0, d12 + vaddl.s16 q9, d1, d13 + vaddl.s16 q0, d2, d14 + vaddl.s16 q1, d3, d15 + + vext.8 q6, q11, q12, #4 + vdup.8 d20, d26[4] + vext.8 q7, q11, q12, #5 + vdup.8 d21, d26[5] + + vadd.i32 q2, q2, q8 + vadd.i32 q3, q3, q9 + vadd.i32 q4, q4, q0 + vadd.i32 q5, q5, q1 + + vmull.s8 q0, d12, d20 + vmull.s8 q1, d13, d20 + vmull.s8 q8, d14, d21 + vmull.s8 q9, d15, d21 + + sub r12, r0, #2*GRAIN_WIDTH + 3 + + vaddl.s16 q6, d0, d16 + vaddl.s16 q7, d1, d17 + vaddl.s16 q0, d2, d18 + vaddl.s16 q1, d3, d19 + + vext.8 q8, q11, q12, #6 + vld1.8 {q11, q12}, [r12] + vdup.8 d20, d26[6] + vdup.8 d21, d26[7] + + vadd.i32 q2, q2, q6 + vadd.i32 q3, q3, q7 + vadd.i32 q4, q4, q0 + vadd.i32 q5, q5, q1 + + vmull.s8 q0, d16, d20 + vmull.s8 q1, d17, d20 + vmull.s8 q6, d22, d21 + vmull.s8 q7, d23, d21 + + vaddl.s16 q8, d0, d12 + vaddl.s16 q9, d1, d13 + vaddl.s16 q0, d2, d14 + vaddl.s16 q1, d3, d15 + + vext.8 q6, q11, q12, #1 + vdup.8 d20, d27[0] + vext.8 q7, q11, q12, #2 + vdup.8 d21, d27[1] + + vadd.i32 q2, q2, q8 + vadd.i32 q3, q3, q9 + vadd.i32 q4, q4, q0 + vadd.i32 q5, q5, q1 + + vmull.s8 q0, d12, d20 + vmull.s8 q1, d13, d20 + vmull.s8 q8, d14, d21 + vmull.s8 q9, d15, d21 + + vaddl.s16 q6, d0, d16 + vaddl.s16 q7, d1, d17 + vaddl.s16 q0, d2, d18 + vaddl.s16 q1, d3, d19 + + vext.8 q8, q11, q12, #3 + vdup.8 d20, d27[2] + vext.8 q9, q11, q12, #4 + vdup.8 d21, d27[3] + + vadd.i32 q2, q2, q6 + vadd.i32 q3, q3, q7 + vadd.i32 q4, q4, q0 + vadd.i32 q5, q5, q1 + + vmull.s8 q0, d16, d20 + vmull.s8 q1, d17, d20 + vmull.s8 q6, d18, d21 + vmull.s8 q7, d19, d21 + + sub r12, r0, #1*GRAIN_WIDTH + 3 + + vaddl.s16 q8, d0, d12 + vaddl.s16 q9, d1, d13 + vaddl.s16 q0, d2, d14 + vaddl.s16 q1, d3, d15 + + vext.8 q6, q11, q12, #5 + vdup.8 d20, d27[4] + vext.8 q7, q11, q12, #6 + vdup.8 d21, d27[5] + + vld1.8 {q11, q12}, [r12] + + vadd.i32 q2, q2, q8 + vadd.i32 q3, q3, q9 + vadd.i32 q4, q4, q0 + vadd.i32 q5, q5, q1 + + vmull.s8 q0, d12, d20 + vmull.s8 q1, d13, d20 + vmull.s8 q8, d14, d21 + vmull.s8 q9, d15, d21 + + vaddl.s16 q6, d0, d16 + vaddl.s16 q7, d1, d17 + 
vaddl.s16 q0, d2, d18 + vaddl.s16 q1, d3, d19 + + vdup.8 d20, d27[6] + vext.8 q9, q11, q12, #1 + vdup.8 d21, d27[7] + + vadd.i32 q2, q2, q6 + vadd.i32 q3, q3, q7 + vadd.i32 q4, q4, q0 + vadd.i32 q5, q5, q1 + + vmull.s8 q0, d22, d20 + vmull.s8 q1, d23, d20 + vmull.s8 q6, d18, d21 + vmull.s8 q7, d19, d21 + + vaddl.s16 q8, d0, d12 + vaddl.s16 q9, d1, d13 + vaddl.s16 q0, d2, d14 + vaddl.s16 q1, d3, d15 + + vext.8 q6, q11, q12, #2 + vdup.8 d20, d28[0] + vext.8 q7, q11, q12, #3 + vdup.8 d21, d28[1] + + vadd.i32 q2, q2, q8 + vadd.i32 q3, q3, q9 + vadd.i32 q4, q4, q0 + vadd.i32 q5, q5, q1 + + vmull.s8 q0, d12, d20 + vmull.s8 q1, d13, d20 + vmull.s8 q8, d14, d21 + vmull.s8 q9, d15, d21 + + vaddl.s16 q6, d0, d16 + vaddl.s16 q7, d1, d17 + vaddl.s16 q0, d2, d18 + vaddl.s16 q1, d3, d19 + + vext.8 q8, q11, q12, #4 + vdup.8 d20, d28[2] + vext.8 q9, q11, q12, #5 + vdup.8 d21, d28[3] + + vadd.i32 q2, q2, q6 + vadd.i32 q3, q3, q7 + vadd.i32 q4, q4, q0 + vadd.i32 q5, q5, q1 + + vmull.s8 q0, d16, d20 + vmull.s8 q1, d17, d20 + vmull.s8 q6, d18, d21 + vmull.s8 q7, d19, d21 + + vaddl.s16 q8, d0, d12 + vaddl.s16 q9, d1, d13 + vaddl.s16 q0, d2, d14 + vaddl.s16 q1, d3, d15 + + vext.8 q6, q11, q12, #6 + vdup.8 d20, d28[4] + + vadd.i32 q2, q2, q8 + vadd.i32 q3, q3, q9 + vadd.i32 q4, q4, q0 + vadd.i32 q5, q5, q1 + + vmull.s8 q0, d12, d20 + vmull.s8 q1, d13, d20 + + vaddw.s16 q2, q2, d0 + vaddw.s16 q3, q3, d1 + vaddw.s16 q4, q4, d2 + vaddw.s16 q5, q5, d3 + + bx lr +endfunc + +.macro sum_lag3_func type, uv_layout, edge, elems=16 +function sum_\type()_lag3_\edge()_neon + push {r1, lr} + sum_lag_n_body lag3, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=d29[0] +endfunc +.endm + +sum_lag3_func y, 0, left +sum_lag3_func y, 0, mid +sum_lag3_func y, 0, right, 15 +sum_lag3_func uv_444, 444, left +sum_lag3_func uv_444, 444, mid +sum_lag3_func uv_444, 444, right, 15 +sum_lag3_func uv_422, 422, left +sum_lag3_func uv_422, 422, mid +sum_lag3_func uv_422, 422, right, 9 +sum_lag3_func uv_420, 420, left +sum_lag3_func uv_420, 420, mid +sum_lag3_func uv_420, 420, right, 9 + +function generate_grain_rows_neon + push {r11,lr} +1: + get_grain_row d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26 + subs r1, r1, #1 + store_grain_row d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26 + bgt 1b + pop {r11,pc} +endfunc + +function generate_grain_rows_44_neon + push {r11,lr} +1: + get_grain_row_44 d16, d17, d18, d19, d20, d21 + subs r1, r1, #1 + store_grain_row_44 d16, d17, d18, d19, d20, d21 + bgt 1b + pop {r11,pc} +endfunc + +function gen_grain_uv_444_lag0_neon + vld1.8 {q3}, [r11]! + push {r11,lr} + bl get_gaussian_neon + vrshl.s16 q8, q0, q15 + bl get_gaussian_neon + vrshl.s16 q9, q0, q15 + vqmovn.s16 d0, q8 + vqmovn.s16 d1, q9 + + vand q3, q3, q1 + vmull.s8 q2, d6, d22 + vmull.s8 q3, d7, d22 + vrshl.s16 q2, q2, q12 + vrshl.s16 q3, q3, q12 + vaddw.s8 q2, q2, d0 + vaddw.s8 q3, q3, d1 + vqmovn.s16 d4, q2 + vqmovn.s16 d5, q3 + vst1.8 {q2}, [r0]! + pop {r11,pc} +endfunc + +function get_grain_row_44_neon + push {r11,lr} + get_grain_row_44 d16, d17, d18, d19, d20, d21 + pop {r11,pc} +endfunc + +function add_uv_420_coeff_lag0_neon + vld1.16 {q2, q3}, [r11]! + vld1.16 {q4, q5}, [r12]! + vpaddl.s8 q2, q2 + vpaddl.s8 q3, q3 + vpaddl.s8 q4, q4 + vpaddl.s8 q5, q5 + vadd.i16 q2, q2, q4 + vadd.i16 q3, q3, q5 + vrshrn.s16 d4, q2, #2 + vrshrn.s16 d5, q3, #2 + b add_coeff_lag0_start +endfunc + +function add_uv_422_coeff_lag0_neon + vld1.16 {q2, q3}, [r11]! 
+ vpaddl.s8 q2, q2 + vpaddl.s8 q3, q3 + vrshrn.s16 d4, q2, #1 + vrshrn.s16 d5, q3, #1 + +add_coeff_lag0_start: + vand q3, q2, q1 + vmull.s8 q2, d6, d22 + vmull.s8 q3, d7, d22 + vrshl.s16 q2, q2, q12 + vrshl.s16 q3, q3, q12 + vaddw.s8 q2, q2, d0 + vaddw.s8 q3, q3, d1 + vqmovn.s16 d4, q2 + vqmovn.s16 d5, q3 + bx lr +endfunc + +.macro gen_grain_82 type +function generate_grain_\type()_8bpc_neon, export=1 + push {r4-r11,lr} + +.ifc \type, uv_444 + mov r12, r3 + mov lr, #28 + add r11, r1, #3*GRAIN_WIDTH + mov r1, r2 + mul r12, r12, lr +.endif + movrel r3, X(gaussian_sequence) + ldr r2, [r1, #FGD_SEED] + ldr r9, [r1, #FGD_GRAIN_SCALE_SHIFT] +.ifc \type, y + add r4, r1, #FGD_AR_COEFFS_Y +.else + add r4, r1, #FGD_AR_COEFFS_UV +.endif + adr r5, L(gen_grain_\type()_tbl) + ldr r6, [r1, #FGD_AR_COEFF_LAG] + add r9, r9, #4 + ldr r6, [r5, r6, lsl #2] + vdup.16 q15, r9 // 4 + data->grain_scale_shift + add r5, r5, r6 + vneg.s16 q15, q15 + +.ifc \type, uv_444 + cmp r12, #0 + movw r10, #0x49d8 + movw lr, #0xb524 + // Intentionally using a separate register instead of moveq with an + // immediate constant, to avoid armv8 deprecated it instruction forms. + it eq + moveq r10, lr + add r4, r4, r12 // Add offset to ar_coeffs_uv[1] + eor r2, r2, r10 +.endif + + ldr r7, [r1, #FGD_AR_COEFF_SHIFT] + mov r8, #1 + mov r10, #1 + lsl r8, r8, r7 // 1 << ar_coeff_shift + lsl r10, r10, r9 // 1 << (4 + data->grain_scale_shift) + lsr r8, r8, #1 // 1 << (ar_coeff_shift - 1) + lsr r10, r10, #1 // 1 << (4 + data->grain_scale_shift - 1) + + bx r5 + + .align 2 +L(gen_grain_\type()_tbl): + .word L(generate_grain_\type()_lag0) - L(gen_grain_\type()_tbl) + CONFIG_THUMB + .word L(generate_grain_\type()_lag1) - L(gen_grain_\type()_tbl) + CONFIG_THUMB + .word L(generate_grain_\type()_lag2) - L(gen_grain_\type()_tbl) + CONFIG_THUMB + .word L(generate_grain_\type()_lag3) - L(gen_grain_\type()_tbl) + CONFIG_THUMB + +L(generate_grain_\type()_lag0): +.ifc \type, y + mov r1, #GRAIN_HEIGHT + bl generate_grain_rows_neon +.else + + mov r1, #3 + bl generate_grain_rows_neon + mov r1, #GRAIN_HEIGHT-3 + + vdup.16 q12, r7 + vld1.8 {d22[]}, [r4] // ar_coeffs_uv[0] + vmov.i8 q0, #0 + vmov.i8 q1, #255 + vext.8 q13, q0, q1, #13 + vext.8 q14, q1, q0, #1 + vneg.s16 q12, q12 + +1: + vmov q1, q13 + bl gen_grain_uv_444_lag0_neon // 16 + vmov.i8 q1, #255 + bl gen_grain_uv_444_lag0_neon // 32 + bl gen_grain_uv_444_lag0_neon // 48 + bl gen_grain_uv_444_lag0_neon // 64 + vmov q1, q14 + bl gen_grain_uv_444_lag0_neon // 80 + get_grain_2 d16 + subs r1, r1, #1 + add r11, r11, #2 + vst1.16 {d16[0]}, [r0]! + bgt 1b +.endif + pop {r4-r11,pc} + +L(generate_grain_\type()_lag1): + vpush {q4-q7} + mov r5, #127 + vld1.8 {d27[]}, [r4]! // ar_coeffs_y[0] + vld1.8 {d28[]}, [r4]! 
// ar_coeffs_y[1] + vld1.8 {d29[]}, [r4] // ar_coeffs_y[2] +.ifc \type, y + ldrsb r4, [r4, #1] // ar_coeffs_y[3] +.else + add r4, r4, #2 +.endif + + mov r1, #3 +.ifc \type, uv_444 + vld1.8 {d13[]}, [r4] // ar_coeffs_uv[4] + ldrsb r4, [r4, #-1] // ar_coeffs_uv[3] +.endif + bl generate_grain_rows_neon + + mov r1, #GRAIN_HEIGHT - 3 +1: + sum_\type()_lag1 q7, q8, q8, q9, left + sum_\type()_lag1 q8, q8, q9, q10 + sum_\type()_lag1 q9, q9, q10, q11 + sum_\type()_lag1 q10, q10, q11, q12 + sum_\type()_lag1 q12, q11, q12, q13, right + get_grain_2 d26 + subs r1, r1, #1 +.ifc \type, uv_444 + add r11, r11, #2 +.endif + store_grain_row d14, d15, d16, d17, d18, d19, d20, d21, d24, d25, d26 + vmov q11, q10 + vmov q10, q9 + vmov q9, q8 + vmov q8, q7 + bgt 1b + + vpop {q4-q7} + pop {r4-r11,pc} + +L(generate_grain_\type()_lag2): + vpush {q4-q7} + mov r5, #127 + vld1.8 {d28,d29}, [r4] // ar_coeffs_y[0-11], ar_coeffs_uv[0-12] + + vmov.s8 r4, d29[2] + vmov.s8 r10, d29[3] + + mov r1, #3 + bl generate_grain_rows_neon + + mov r1, #GRAIN_HEIGHT - 3 +1: + bl sum_\type()_lag2_left_neon + bl sum_\type()_lag2_mid_neon + bl sum_\type()_lag2_mid_neon + bl sum_\type()_lag2_mid_neon + bl sum_\type()_lag2_right_neon + get_grain_2 d16 + subs r1, r1, #1 +.ifc \type, uv_444 + add r11, r11, #2 +.endif + vst1.16 {d16[0]}, [r0]! + bgt 1b + + vpop {q4-q7} + pop {r4-r11,pc} + +L(generate_grain_\type()_lag3): + vpush {q4-q7} + mov r5, #127 + vld1.8 {q13, q14}, [r4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24] + + vmov.u8 r4, d28[5] + vmov.u8 r10, d28[6] + vmov.u8 r12, d28[7] + + orr r4, r4, r10, lsl #8 + orr r4, r4, r12, lsl #16 + + mov r1, #3 + vpush {d26} + bl generate_grain_rows_neon + vpop {d26} + + mov r1, #GRAIN_HEIGHT - 3 +1: + bl sum_\type()_lag3_left_neon + bl sum_\type()_lag3_mid_neon + bl sum_\type()_lag3_mid_neon + bl sum_\type()_lag3_mid_neon + bl sum_\type()_lag3_right_neon + get_grain_2 d16 + subs r1, r1, #1 +.ifc \type, uv_444 + add r11, r11, #2 +.endif + vst1.16 {d16[0]}, [r0]! + bgt 1b + + vpop {q4-q7} + pop {r4-r11,pc} +endfunc +.endm + +gen_grain_82 y +gen_grain_82 uv_444 + +.macro set_height dst, type +.ifc \type, uv_420 + mov \dst, #SUB_GRAIN_HEIGHT-3 +.else + mov \dst, #GRAIN_HEIGHT-3 +.endif +.endm + +.macro increment_y_ptr reg, type +.ifc \type, uv_420 + add \reg, \reg, #2*GRAIN_WIDTH-(3*32) +.else + sub \reg, \reg, #3*32-GRAIN_WIDTH +.endif +.endm + +.macro gen_grain_44 type +function generate_grain_\type()_8bpc_neon, export=1 + push {r4-r11,lr} + + mov r12, r3 + mov lr, #28 + add r11, r1, #3*GRAIN_WIDTH-3 + mov r1, r2 + mul r12, r12, lr + + movrel r3, X(gaussian_sequence) + ldr r2, [r1, #FGD_SEED] + ldr r9, [r1, #FGD_GRAIN_SCALE_SHIFT] + add r4, r1, #FGD_AR_COEFFS_UV + adr r5, L(gen_grain_\type()_tbl) + ldr r6, [r1, #FGD_AR_COEFF_LAG] + add r9, r9, #4 + ldr r6, [r5, r6, lsl #2] + vdup.16 q15, r9 // 4 + data->grain_scale_shift + add r5, r5, r6 + vneg.s16 q15, q15 + + cmp r12, #0 + movw r10, #0x49d8 + movw lr, #0xb524 + // Intentionally using a separate register instead of moveq with an + // immediate constant, to avoid armv8 deprecated it instruction forms. 
+ it eq + moveq r10, lr + add r4, r4, r12 // Add offset to ar_coeffs_uv[1] + eor r2, r2, r10 + + ldr r7, [r1, #FGD_AR_COEFF_SHIFT] + mov r8, #1 + mov r10, #1 + lsl r8, r8, r7 // 1 << ar_coeff_shift + lsl r10, r10, r9 // 1 << (4 + data->grain_scale_shift) + lsr r8, r8, #1 // 1 << (ar_coeff_shift - 1) + lsr r10, r10, #1 // 1 << (4 + data->grain_scale_shift - 1) + bx r5 + + .align 2 +L(gen_grain_\type()_tbl): + .word L(generate_grain_\type()_lag0) - L(gen_grain_\type()_tbl) + CONFIG_THUMB + .word L(generate_grain_\type()_lag1) - L(gen_grain_\type()_tbl) + CONFIG_THUMB + .word L(generate_grain_\type()_lag2) - L(gen_grain_\type()_tbl) + CONFIG_THUMB + .word L(generate_grain_\type()_lag3) - L(gen_grain_\type()_tbl) + CONFIG_THUMB + +L(generate_grain_\type()_lag0): +.ifc \type, uv_420 + vpush {q4-q5} +.endif + mov r1, #3 + bl generate_grain_rows_44_neon + set_height r1, \type + + vdup.16 q12, r7 + vld1.8 {d22[]}, [r4] // ar_coeffs_uv[0] + vmov.i8 q0, #0 + vmov.i8 q1, #255 + vext.8 q13, q0, q1, #13 + vext.8 q14, q1, q0, #7 + vneg.s16 q12, q12 + +1: + bl get_grain_row_44_neon +.ifc \type, uv_420 + add r12, r11, #GRAIN_WIDTH +.endif + vmov q1, q13 + vmov q0, q8 + bl add_\type()_coeff_lag0_neon + vmov.i8 q1, #255 + vmov q0, q9 + vmov q8, q2 + bl add_\type()_coeff_lag0_neon + vmov.i8 q1, q14 + vmov q0, q10 + vmov q9, q2 + bl add_\type()_coeff_lag0_neon + vmov q10, q2 + subs r1, r1, #1 + increment_y_ptr r11, \type + store_grain_row_44 d16, d17, d18, d19, d20, d21 + bgt 1b + +.ifc \type, uv_420 + vpop {q4-q5} +.endif + pop {r4-r11,pc} + +L(generate_grain_\type()_lag1): + vpush {q4-q7} + mov r5, #127 + vld1.8 {d27[]}, [r4]! // ar_coeffs_uv[0] + vld1.8 {d28[]}, [r4]! // ar_coeffs_uv[1] + vld1.8 {d29[]}, [r4] // ar_coeffs_uv[2] + add r4, r4, #2 + + mov r1, #3 + vld1.8 {d13[]}, [r4] // ar_coeffs_uv[4] + ldrsb r4, [r4, #-1] // ar_coeffs_uv[3] + bl generate_grain_rows_44_neon + + set_height r1, \type +1: + sum_\type()_lag1 q7, q8, q8, q9, left + sum_\type()_lag1 q8, q8, q9, q10 + sum_\type()_lag1 q10, q9, q10, q11, right + subs r1, r1, #1 + increment_y_ptr r11, \type + store_grain_row_44 d14, d15, d16, d17, d20, d21 + vmov q9, q8 + vmov q8, q7 + bgt 1b + + vpop {q4-q7} + pop {r4-r11,pc} + +L(generate_grain_\type()_lag2): + vpush {q4-q7} + mov r5, #127 + vld1.8 {d28,d29}, [r4] // ar_coeffs_uv[0-12] + + vmov.s8 r4, d29[2] + vmov.s8 r10, d29[3] + + mov r1, #3 + bl generate_grain_rows_44_neon + + set_height r1, \type +1: + bl sum_\type()_lag2_left_neon + bl sum_\type()_lag2_mid_neon + bl sum_\type()_lag2_right_neon + subs r1, r1, #1 + increment_y_ptr r11, \type + add r0, r0, #GRAIN_WIDTH-48 + bgt 1b + + vpop {q4-q7} + pop {r4-r11,pc} + +L(generate_grain_\type()_lag3): + vpush {q4-q7} + mov r5, #127 + vld1.8 {q13, q14}, [r4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24] + + vmov.u8 r4, d28[5] + vmov.u8 r10, d28[6] + vmov.u8 r12, d28[7] + + orr r4, r4, r10, lsl #8 + orr r4, r4, r12, lsl #16 + + mov r1, #3 + bl generate_grain_rows_44_neon + + set_height r1, \type +1: + bl sum_\type()_lag3_left_neon + bl sum_\type()_lag3_mid_neon + bl sum_\type()_lag3_right_neon + subs r1, r1, #1 + increment_y_ptr r11, \type + add r0, r0, #GRAIN_WIDTH-48 + bgt 1b + + vpop {q4-q7} + pop {r4-r11,pc} +endfunc +.endm + +gen_grain_44 uv_420 +gen_grain_44 uv_422 + +.macro gather_interleaved dst1, dst2, src1, src2, off + vmov.u8 r11, \src1[0+\off] + vmov.u8 r12, \src2[0+\off] + add r11, r11, r3 + vmov.u8 lr, \src1[2+\off] + add r12, r12, r3 + vld1.8 {\dst1[0+\off]}, [r11] + vmov.u8 r11, \src2[2+\off] + add lr, lr, r3 + vld1.8 {\dst2[0+\off]}, 
[r12] + vmov.u8 r12, \src1[4+\off] + add r11, r11, r3 + vld1.8 {\dst1[2+\off]}, [lr] + vmov.u8 lr, \src2[4+\off] + add r12, r12, r3 + vld1.8 {\dst2[2+\off]}, [r11] + vmov.u8 r11, \src1[6+\off] + add lr, lr, r3 + vld1.8 {\dst1[4+\off]}, [r12] + vmov.u8 r12, \src2[6+\off] + add r11, r11, r3 + vld1.8 {\dst2[4+\off]}, [lr] + add r12, r12, r3 + vld1.8 {\dst1[6+\off]}, [r11] + vld1.8 {\dst2[6+\off]}, [r12] +.endm + +.macro gather dst1, dst2, dst3, dst4, src1, src2, src3, src4 + gather_interleaved \dst1, \dst3, \src1, \src3, 0 + gather_interleaved \dst1, \dst3, \src1, \src3, 1 + gather_interleaved \dst2, \dst4, \src2, \src4, 0 + gather_interleaved \dst2, \dst4, \src2, \src4, 1 +.endm + +function gather32_neon + push {r11-r12,lr} + gather d8, d9, d10, d11, d0, d1, d2, d3 + pop {r11-r12,pc} +endfunc + +function gather16_neon + push {r11-r12,lr} + gather_interleaved d8, d9, d0, d1, 0 + gather_interleaved d8, d9, d0, d1, 1 + pop {r11-r12,pc} +endfunc + +const overlap_coeffs_0, align=4 + .byte 27, 17, 0, 0, 0, 0, 0, 0 + .byte 17, 27, 32, 32, 32, 32, 32, 32 +endconst + +const overlap_coeffs_1, align=4 + .byte 23, 0, 0, 0, 0, 0, 0, 0 + .byte 22, 32, 32, 32, 32, 32, 32, 32 +endconst + +.macro calc_offset offx, offy, src, sx, sy + and \offy, \src, #0xF // randval & 0xF + lsr \offx, \src, #4 // randval >> 4 +.if \sy == 0 + add \offy, \offy, \offy // 2 * (randval & 0xF) +.endif +.if \sx == 0 + add \offx, \offx, \offx // 2 * (randval >> 4) +.endif +.endm + +.macro add_offset dst, offx, offy, src, stride + mla \dst, \stride, \offy, \src // grain_lut += grain_stride * offy + add \dst, \dst, \offx // grain_lut += offx +.endm + +// void dav1d_fgy_32x32_8bpc_neon(pixel *const dst, const pixel *const src, +// const ptrdiff_t stride, +// const uint8_t scaling[SCALING_SIZE], +// const int scaling_shift, +// const entry grain_lut[][GRAIN_WIDTH], +// const int offsets[][2], +// const int h, const ptrdiff_t clip, +// const ptrdiff_t type); +function fgy_32x32_8bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] // scaling_shift, grain_lut + ldrd r6, r7, [sp, #108] // offsets, h + ldr r8, [sp, #116] // clip + mov r9, #GRAIN_WIDTH // grain_lut stride + + neg r4, r4 + vdup.16 q13, r4 // -scaling_shift + cmp r8, #0 + + movrel_local r12, overlap_coeffs_0 + + beq 1f + // clip + vmov.i8 q14, #16 + vmov.i8 q15, #235 + b 2f +1: + // no clip + vmov.i8 q14, #0 + vmov.i8 q15, #255 +2: + + vld1.8 {d24, d25}, [r12, :128] // overlap_coeffs + + add r5, r5, #9 // grain_lut += 9 + add r5, r5, r9, lsl #3 // grain_lut += 8 * grain_stride + add r5, r5, r9 // grain_lut += grain_stride + + ldr r10, [r6, #8] // offsets[1][0] + calc_offset r10, r4, r10, 0, 0 + add_offset r4, r10, r4, r5, r9 + ldr r10, [r6, #4] // offsets[0][1] + calc_offset r10, r11, r10, 0, 0 + add_offset r11, r10, r11, r5, r9 + ldr r10, [r6, #12] // offsets[1][1] + calc_offset r10, r8, r10, 0, 0 + add_offset r8, r10, r8, r5, r9 + ldr r6, [r6] // offsets[0][0] + calc_offset r6, lr, r6, 0, 0 + add_offset r5, r6, lr, r5, r9 + + add r4, r4, #32 // grain_lut += BLOCK_SIZE * bx + add r6, r11, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by + + ldr r10, [sp, #120] // type + adr r11, L(fgy_loop_tbl) + + tst r10, #1 + ldr r10, [r11, r10, lsl #2] + + add r8, r8, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by + add r8, r8, #32 // grain_lut += BLOCK_SIZE * bx + + add r11, r11, r10 + + beq 1f + // y overlap + vdup.8 d14, d24[0] + vdup.8 d15, d24[1] + mov r10, r7 // backup actual h + mov r7, #2 +1: + bx r11 +endfunc + +function 
fgy_loop_neon +L(fgy_loop_tbl): + .word L(loop_00) - L(fgy_loop_tbl) + CONFIG_THUMB + .word L(loop_01) - L(fgy_loop_tbl) + CONFIG_THUMB + .word L(loop_10) - L(fgy_loop_tbl) + CONFIG_THUMB + .word L(loop_11) - L(fgy_loop_tbl) + CONFIG_THUMB + +.macro fgy ox, oy +L(loop_\ox\oy): +1: +.if \ox + vld1.8 {d8}, [r4], r9 // grain_lut old +.endif +.if \oy + vld1.8 {q2, q3}, [r6], r9 // grain_lut top +.endif +.if \ox && \oy + vld1.8 {d10}, [r8], r9 // grain_lut top old +.endif + vld1.8 {q0, q1}, [r1, :128], r2 // src + vld1.8 {q10, q11}, [r5], r9 // grain_lut + +.if \ox + vmull.s8 q4, d8, d24 + vmlal.s8 q4, d20, d25 +.endif + +.if \oy +.if \ox + vmull.s8 q5, d10, d24 + vmlal.s8 q5, d4, d25 + vqrshrn.s16 d20, q4, #5 + vqrshrn.s16 d4, q5, #5 +.endif + + vmull.s8 q4, d20, d15 + vmull.s8 q5, d21, d15 + vmull.s8 q8, d22, d15 + vmull.s8 q9, d23, d15 + vmlal.s8 q4, d4, d14 + vmlal.s8 q5, d5, d14 + vmlal.s8 q8, d6, d14 + vmlal.s8 q9, d7, d14 + vqrshrn.s16 d20, q4, #5 + vqrshrn.s16 d21, q5, #5 + vqrshrn.s16 d22, q8, #5 + vqrshrn.s16 d23, q9, #5 +.elseif \ox + vqrshrn.s16 d20, q4, #5 +.endif + + bl gather32_neon + + vmovl.s8 q8, d20 // grain + vmovl.s8 q9, d21 + vmovl.s8 q10, d22 + vmovl.s8 q11, d23 + + vmovl.u8 q2, d8 // scaling + vmovl.u8 q3, d9 + vmovl.u8 q4, d10 + vmovl.u8 q5, d11 + + vmul.i16 q8, q8, q2 // scaling * grain + vmul.i16 q9, q9, q3 + vmul.i16 q10, q10, q4 + vmul.i16 q11, q11, q5 + + vrshl.s16 q8, q8, q13 // round2(scaling * grain, scaling_shift) + vrshl.s16 q9, q9, q13 + vrshl.s16 q10, q10, q13 + vrshl.s16 q11, q11, q13 + + vaddw.u8 q8, q8, d0 // *src + noise + vaddw.u8 q9, q9, d1 + vaddw.u8 q10, q10, d2 + vaddw.u8 q11, q11, d3 + + vqmovun.s16 d0, q8 + vqmovun.s16 d1, q9 + vqmovun.s16 d2, q10 + vqmovun.s16 d3, q11 + + vmax.u8 q0, q0, q14 + vmax.u8 q1, q1, q14 + vmin.u8 q0, q0, q15 + vmin.u8 q1, q1, q15 + + subs r7, r7, #1 +.if \oy + vdup.8 d14, d25[0] + vdup.8 d15, d25[1] +.endif + vst1.8 {q0, q1}, [r0, :128], r2 // dst + bgt 1b + +.if \oy + cmp r10, #2 + sub r7, r10, #2 // restore actual remaining h + bgt L(loop_\ox()0) +.endif + vpop {q4-q7} + pop {r4-r11,pc} +.endm + + fgy 0, 0 + fgy 0, 1 + fgy 1, 0 + fgy 1, 1 +endfunc + +// void dav1d_fguv_32x32_420_8bpc_neon(pixel *const dst, +// const pixel *const src, +// const ptrdiff_t stride, +// const uint8_t scaling[SCALING_SIZE], +// const Dav1dFilmGrainData *const data, +// const entry grain_lut[][GRAIN_WIDTH], +// const pixel *const luma_row, +// const ptrdiff_t luma_stride, +// const int offsets[][2], +// const ptrdiff_t h, const ptrdiff_t uv, +// const ptrdiff_t is_id, +// const ptrdiff_t type); +.macro fguv layout, sx, sy +function fguv_32x32_\layout()_8bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] // data, grain_lut + ldrd r6, r7, [sp, #108] // luma_row, luma_stride + ldrd r8, r9, [sp, #116] // offsets, h + ldrd r10, r11, [sp, #124] // uv, is_id + + // !csfl + add r10, r4, r10, lsl #2 // + 4*uv + add r12, r10, #FGD_UV_LUMA_MULT + add lr, r10, #FGD_UV_MULT + add r10, r10, #FGD_UV_OFFSET + vld1.16 {d4[]}, [r12] // uv_luma_mult + vld1.16 {d4[2]}, [r10] // uv_offset + vld1.16 {d4[1]}, [lr] // uv_mult + + ldr lr, [r4, #FGD_SCALING_SHIFT] + ldr r12, [r4, #FGD_CLIP_TO_RESTRICTED_RANGE] + neg lr, lr // -scaling_shift + + cmp r12, #0 + vdup.16 q13, lr // -scaling_shift + + beq 1f + // clip + cmp r11, #0 + vmov.i8 q14, #16 + vmov.i8 q15, #240 + beq 2f + // is_id + vmov.i8 q15, #235 + b 2f +1: + // no clip + vmov.i8 q14, #0 + vmov.i8 q15, #255 +2: + + mov r10, #GRAIN_WIDTH // grain_lut stride + + add r5, r5, #(3 + (2 
>> \sx)*3) // grain_lut += 9 or 6 +.if \sy + add r5, r5, r10, lsl #2 // grain_lut += 4 * grain_stride + add r5, r5, r10, lsl #1 // grain_lut += 2 * grain_stride +.else + add r5, r5, r10, lsl #3 // grain_lut += 8 * grain_stride + add r5, r5, r10 // grain_lut += grain_stride +.endif + + ldr r12, [r8, #8] // offsets[1][0] + calc_offset r12, r4, r12, \sx, \sy + add_offset r4, r12, r4, r5, r10 + + ldr r12, [r8, #4] // offsets[0][1] + calc_offset r12, lr, r12, \sx, \sy + add_offset lr, r12, lr, r5, r10 + + ldr r12, [r8, #12] // offsets[1][1] + calc_offset r12, r11, r12, \sx, \sy + add_offset r11, r12, r11, r5, r10 + + ldr r8, [r8] // offsets[0][0] + calc_offset r8, r12, r8, \sx, \sy + add_offset r5, r8, r12, r5, r10 + + add r4, r4, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx + add r8, lr, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by + add r11, r11, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by + add r11, r11, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx + + movrel_local r12, overlap_coeffs_\sx + ldr lr, [sp, #132] // type + + vld1.8 {d24, d25}, [r12, :128] // overlap_coeffs + + movrel_local r12, L(fguv_loop_sx\sx()_tbl) +#if CONFIG_THUMB + // This uses movrel_local instead of adr above, because the target + // can be out of range for adr. But movrel_local leaves the thumb bit + // set on COFF (but probably wouldn't if building for thumb on ELF), + // thus try to clear the bit for robustness. + bic r12, r12, #1 +#endif + + tst lr, #1 + ldr lr, [r12, lr, lsl #2] + + add r12, r12, lr + + beq 1f + // y overlap + sub lr, r9, #(2 >> \sy) // backup remaining h + mov r9, #(2 >> \sy) + +1: + +.if \sy + vmov.i8 d6, #23 + vmov.i8 d7, #22 +.else + vmov.i8 d6, #27 + vmov.i8 d7, #17 +.endif + +.if \sy + add r7, r7, r7 // luma_stride *= 2 +.endif + + bx r12 +endfunc +.endm + +fguv 420, 1, 1 +fguv 422, 1, 0 +fguv 444, 0, 0 + +function fguv_loop_sx0_neon +L(fguv_loop_sx0_tbl): + .word L(fguv_loop_sx0_csfl0_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl0_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl0_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl0_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl1_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl1_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl1_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl1_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + +.macro fguv_loop_sx0 csfl, ox, oy +L(fguv_loop_sx0_csfl\csfl()_\ox\oy): +.if \oy + mov r12, lr +.endif +1: +.if \ox + vld1.8 {d8}, [r4], r10 // grain_lut old +.endif +.if \oy + vld1.8 {q8, q9}, [r8], r10 // grain_lut top +.endif +.if \ox && \oy + vld1.8 {d10}, [r11], r10 // grain_lut top old +.endif + vld1.8 {q0, q1}, [r6, :128], r7 // luma + vld1.8 {q10, q11}, [r5], r10 // grain_lut + +.if \ox + vmull.s8 q4, d8, d24 + vmlal.s8 q4, d20, d25 +.endif + +.if \oy +.if \ox + vmull.s8 q5, d10, d24 + vmlal.s8 q5, d16, d25 + vqrshrn.s16 d20, q4, #5 + vqrshrn.s16 d16, q5, #5 +.endif + + vmull.s8 q4, d20, d7 + vmull.s8 q5, d21, d7 + vmull.s8 q6, d22, d7 + vmull.s8 q7, d23, d7 + vmlal.s8 q4, d16, d6 + vmlal.s8 q5, d17, d6 + vmlal.s8 q6, d18, d6 + vmlal.s8 q7, d19, d6 + vqrshrn.s16 d20, q4, #5 + vqrshrn.s16 d21, q5, #5 + vqrshrn.s16 d22, q6, #5 + vqrshrn.s16 d23, q7, #5 +.elseif \ox + vqrshrn.s16 d20, q4, #5 +.endif +.if !\csfl + vld1.8 {q8, q9}, [r1, :128] // src + vmovl.u8 q4, d0 + vmovl.u8 q5, d1 + vmovl.u8 q6, d2 + vmovl.u8 q7, d3 + 
vmovl.u8 q0, d16 + vmovl.u8 q1, d17 + vmovl.u8 q8, d18 + vmovl.u8 q9, d19 + vmul.i16 q4, q4, d4[0] + vmul.i16 q5, q5, d4[0] + vmul.i16 q6, q6, d4[0] + vmul.i16 q7, q7, d4[0] + vmul.i16 q0, q0, d4[1] + vmul.i16 q1, q1, d4[1] + vmul.i16 q8, q8, d4[1] + vmul.i16 q9, q9, d4[1] + vqadd.s16 q4, q4, q0 + vqadd.s16 q5, q5, q1 + vqadd.s16 q6, q6, q8 + vqadd.s16 q7, q7, q9 + vdup.16 q0, d4[2] + vshr.s16 q4, q4, #6 + vshr.s16 q5, q5, #6 + vshr.s16 q6, q6, #6 + vshr.s16 q7, q7, #6 + vadd.i16 q4, q4, q0 + vadd.i16 q5, q5, q0 + vadd.i16 q6, q6, q0 + vadd.i16 q7, q7, q0 + vqmovun.s16 d0, q4 + vqmovun.s16 d1, q5 + vqmovun.s16 d2, q6 + vqmovun.s16 d3, q7 +.endif + + bl gather32_neon + + vld1.8 {q0, q1}, [r1, :128], r2 // src + + vmovl.s8 q8, d20 // grain + vmovl.s8 q9, d21 + vmovl.s8 q10, d22 + vmovl.s8 q11, d23 + + vmovl.u8 q6, d8 // scaling + vmovl.u8 q7, d9 + vmovl.u8 q4, d10 + vmovl.u8 q5, d11 + + vmul.i16 q8, q8, q6 // scaling * grain + vmul.i16 q9, q9, q7 + vmul.i16 q10, q10, q4 + vmul.i16 q11, q11, q5 + + vrshl.s16 q8, q8, q13 // round2(scaling * grain, scaling_shift) + vrshl.s16 q9, q9, q13 + vrshl.s16 q10, q10, q13 + vrshl.s16 q11, q11, q13 + + vaddw.u8 q8, q8, d0 // *src + noise + vaddw.u8 q9, q9, d1 + vaddw.u8 q10, q10, d2 + vaddw.u8 q11, q11, d3 + + vqmovun.s16 d0, q8 + vqmovun.s16 d1, q9 + vqmovun.s16 d2, q10 + vqmovun.s16 d3, q11 + + vmax.u8 q0, q0, q14 + vmax.u8 q1, q1, q14 + vmin.u8 q0, q0, q15 + vmin.u8 q1, q1, q15 + + subs r9, r9, #1 +.if \oy + vdup.8 d6, d25[0] + vdup.8 d7, d25[1] +.endif + + vst1.8 {q0, q1}, [r0, :128], r2 // dst + bgt 1b + +.if \oy + cmp r12, #0 + mov r9, r12 // restore actual remaining h + bgt L(fguv_loop_sx0_csfl\csfl()_\ox()0) +.endif + b 9f +.endm + fguv_loop_sx0 0, 0, 0 + fguv_loop_sx0 0, 0, 1 + fguv_loop_sx0 0, 1, 0 + fguv_loop_sx0 0, 1, 1 + fguv_loop_sx0 1, 0, 0 + fguv_loop_sx0 1, 0, 1 + fguv_loop_sx0 1, 1, 0 + fguv_loop_sx0 1, 1, 1 + +9: + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function fguv_loop_sx1_neon +L(fguv_loop_sx1_tbl): + .word L(fguv_loop_sx1_csfl0_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl0_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl0_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl0_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl1_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl1_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl1_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl1_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + +.macro fguv_loop_sx1 csfl, ox, oy +L(fguv_loop_sx1_csfl\csfl()_\ox\oy): +.if \oy + mov r12, lr +.endif +1: +.if \ox + vld1.8 {d8}, [r4], r10 // grain_lut old +.endif +.if \oy + vld1.8 {q8}, [r8], r10 // grain_lut top +.endif +.if \ox && \oy + vld1.8 {d10}, [r11], r10 // grain_lut top old +.endif + vld1.8 {q0, q1}, [r6, :128], r7 // luma + vld1.8 {q10}, [r5], r10 // grain_lut + vld1.8 {q11}, [r1, :128], r2 // src + +.if \ox + vmull.s8 q4, d8, d24 + vmlal.s8 q4, d20, d25 +.endif + + vpaddl.u8 q0, q0 + vpaddl.u8 q1, q1 +.if \oy +.if \ox + vmull.s8 q5, d10, d24 + vmlal.s8 q5, d16, d25 + vqrshrn.s16 d20, q4, #5 + vqrshrn.s16 d16, q5, #5 +.endif + + vmull.s8 q4, d20, d7 + vmull.s8 q5, d21, d7 + vmlal.s8 q4, d16, d6 + vmlal.s8 q5, d17, d6 + vqrshrn.s16 d20, q4, #5 + vqrshrn.s16 d21, q5, #5 +.elseif \ox + vqrshrn.s16 d20, q4, #5 +.endif +.if \csfl + vrshrn.u16 d0, q0, #1 + vrshrn.u16 d1, q1, #1 +.else + vrshr.u16 q4, q0, #1 + vrshr.u16 q5, q1, #1 + vmovl.u8 q0, d22 + 
vmovl.u8 q1, d23 + vmul.i16 q4, q4, d4[0] + vmul.i16 q5, q5, d4[0] + vmul.i16 q0, q0, d4[1] + vmul.i16 q1, q1, d4[1] + vqadd.s16 q4, q4, q0 + vqadd.s16 q5, q5, q1 + vdup.16 q0, d4[2] + vshr.s16 q4, q4, #6 + vshr.s16 q5, q5, #6 + vadd.i16 q4, q4, q0 + vadd.i16 q5, q5, q0 + vqmovun.s16 d0, q4 + vqmovun.s16 d1, q5 +.endif + + bl gather16_neon + + vmovl.s8 q8, d20 // grain + vmovl.s8 q9, d21 + + vmovl.u8 q6, d8 // scaling + vmovl.u8 q7, d9 + + vmul.i16 q8, q8, q6 // scaling * grain + vmul.i16 q9, q9, q7 + + vrshl.s16 q8, q8, q13 // round2(scaling * grain, scaling_shift) + vrshl.s16 q9, q9, q13 + + vaddw.u8 q8, q8, d22 // *src + noise + vaddw.u8 q9, q9, d23 + + vqmovun.s16 d0, q8 + vqmovun.s16 d1, q9 + + vmax.u8 q0, q0, q14 + vmin.u8 q0, q0, q15 + + subs r9, r9, #1 +.if \oy + vswp d6, d7 +.endif + vst1.8 {q0}, [r0, :128], r2 // dst + bgt 1b + +.if \oy + cmp r12, #0 + mov r9, r12 // restore actual remaining h + bgt L(fguv_loop_sx1_csfl\csfl()_\ox()0) +.endif + + b 9f +.endm + fguv_loop_sx1 0, 0, 0 + fguv_loop_sx1 0, 0, 1 + fguv_loop_sx1 0, 1, 0 + fguv_loop_sx1 0, 1, 1 + fguv_loop_sx1 1, 0, 0 + fguv_loop_sx1 1, 0, 1 + fguv_loop_sx1 1, 1, 0 + fguv_loop_sx1 1, 1, 1 + +9: + vpop {q4-q7} + pop {r4-r11,pc} +endfunc diff --git a/third_party/dav1d/src/arm/32/filmgrain16.S b/third_party/dav1d/src/arm/32/filmgrain16.S new file mode 100644 index 0000000000000..6c36caceae5a5 --- /dev/null +++ b/third_party/dav1d/src/arm/32/filmgrain16.S @@ -0,0 +1,2137 @@ +/* + * Copyright © 2021, VideoLAN and dav1d authors + * Copyright © 2021, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/arm/asm.S" +#include "util.S" +#include "src/arm/asm-offsets.h" + +#define GRAIN_WIDTH 82 +#define GRAIN_HEIGHT 73 + +#define SUB_GRAIN_WIDTH 44 +#define SUB_GRAIN_HEIGHT 38 + +.macro increment_seed steps, shift=1 + lsr r11, r2, #3 + lsr r12, r2, #12 + lsr lr, r2, #1 + eor r11, r2, r11 // (r >> 0) ^ (r >> 3) + eor r12, r12, lr // (r >> 12) ^ (r >> 1) + eor r11, r11, r12 // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1) +.if \shift + lsr r2, r2, #\steps +.endif + and r11, r11, #((1 << \steps) - 1) // bit +.if \shift + orr r2, r2, r11, lsl #(16 - \steps) // *state +.else + orr r2, r2, r11, lsl #16 // *state +.endif +.endm + +.macro read_rand dest, bits, age + ubfx \dest, r2, #16 - \bits - \age, #\bits +.endm + +.macro read_shift_rand dest, bits + ubfx \dest, r2, #17 - \bits, #\bits + lsr r2, r2, #1 +.endm + +// special calling convention: +// r2 holds seed +// r3 holds dav1d_gaussian_sequence +// clobbers r11-r12 +// returns in d0-d1 +function get_gaussian_neon + push {r5-r6,lr} + increment_seed 4 + read_rand r5, 11, 3 + read_rand r6, 11, 2 + add r5, r3, r5, lsl #1 + add r6, r3, r6, lsl #1 + vld1.16 {d0[0]}, [r5] + read_rand r5, 11, 1 + vld1.16 {d0[1]}, [r6] + add r5, r3, r5, lsl #1 + read_rand r6, 11, 0 + increment_seed 4 + add r6, r3, r6, lsl #1 + vld1.16 {d0[2]}, [r5] + read_rand r5, 11, 3 + vld1.16 {d0[3]}, [r6] + add r5, r3, r5, lsl #1 + read_rand r6, 11, 2 + vld1.16 {d1[0]}, [r5] + add r6, r3, r6, lsl #1 + read_rand r5, 11, 1 + vld1.16 {d1[1]}, [r6] + read_rand r6, 11, 0 + add r5, r3, r5, lsl #1 + add r6, r3, r6, lsl #1 + vld1.16 {d1[2]}, [r5] + vld1.16 {d1[3]}, [r6] + pop {r5-r6,pc} +endfunc + +function get_grain_2_neon + push {r11,lr} + increment_seed 2 + read_rand r11, 11, 1 + read_rand r12, 11, 0 + add r11, r3, r11, lsl #1 + add r12, r3, r12, lsl #1 + vld1.16 {d0[0]}, [r11] + vld1.16 {d0[1]}, [r12] + vrshl.s16 d0, d0, d30 + pop {r11,pc} +endfunc + +.macro get_grain_2 dst + bl get_grain_2_neon +.ifnc \dst, d0 + vmov \dst, d0 +.endif +.endm + +function get_grain_4_neon + push {r11,lr} + increment_seed 4 + read_rand r11, 11, 3 + read_rand r12, 11, 2 + add r11, r3, r11, lsl #1 + add r12, r3, r12, lsl #1 + vld1.16 {d0[0]}, [r11] + read_rand r11, 11, 1 + vld1.16 {d0[1]}, [r12] + read_rand r12, 11, 0 + add r11, r3, r11, lsl #1 + add r12, r3, r12, lsl #1 + vld1.16 {d0[2]}, [r11] + vld1.16 {d0[3]}, [r12] + vrshl.s16 d0, d0, d30 + pop {r11,pc} +endfunc + +.macro get_grain_4 dst + bl get_grain_4_neon +.ifnc \dst, d0 + vmov \dst, d0 +.endif +.endm + +// r1 holds the number of entries to produce +// r6, r8 and r10 hold the previous output entries +// q0 holds the vector of produced entries +// q1 holds the input vector of sums from above +.macro output_lag n +function output_lag\n()_neon + push {r0, lr} +.if \n == 1 + mvn lr, r5 // grain_min = ~grain_max +.else + mov r0, #1 + mov lr, #1 + sub r7, r7, #1 + sub r9, r9, #1 + lsl r0, r0, r7 + lsl lr, lr, r9 + add r7, r7, #1 + add r9, r9, #1 +.endif +1: + read_shift_rand r12, 11 + vmov.32 r11, d2[0] + lsl r12, r12, #1 + vext.8 q0, q0, q0, #2 + ldrsh r12, [r3, r12] +.if \n == 1 + mla r11, r6, r4, r11 // sum (above) + *coeff * prev output + add r6, r11, r8 // 1 << (ar_coeff_shift - 1) + add r12, r12, r10 + asr r6, r6, r7 // >> ar_coeff_shift + asr r12, r12, r9 // >> (4 - bitdepth_min_8 + grain_scale_shift) + add r6, r6, r12 + cmp r6, r5 +.elseif \n == 2 + mla r11, r8, r4, r11 // sum (above) + *coeff * prev output 1 + mla r11, r6, r10, r11 // += *coeff * prev output 2 + mov r8, r6 + add r6, r11, r0 // 1 << (ar_coeff_shift - 1) + add 
r12, r12, lr // 1 << (4 - bitdepth_min_8 + grain_scale_shift - 1) + asr r6, r6, r7 // >> ar_coeff_shift + asr r12, r12, r9 // >> (4 - bitdepth_min_8 + grain_scale_shift) + add r6, r6, r12 + push {lr} + cmp r6, r5 + mvn lr, r5 // grain_min = ~grain_max +.else + push {r1-r3} + sbfx r1, r4, #0, #8 + sbfx r2, r4, #8, #8 + sbfx r3, r4, #16, #8 + mla r11, r10, r1, r11 // sum (above) + *coeff * prev output 1 + mla r11, r8, r2, r11 // sum (above) + *coeff * prev output 2 + mla r11, r6, r3, r11 // += *coeff * prev output 3 + pop {r1-r3} + mov r10, r8 + mov r8, r6 + + add r6, r11, r0 // 1 << (ar_coeff_shift - 1) + add r12, r12, lr // 1 << (4 - bitdepth_min_8 + grain_scale_shift - 1) + asr r6, r6, r7 // >> ar_coeff_shift + asr r12, r12, r9 // >> (4 - bitdepth_min_8 + grain_scale_shift) + add r6, r6, r12 + push {lr} + cmp r6, r5 + mvn lr, r5 // grain_min = ~grain_max +.endif + it gt + movgt r6, r5 + cmp r6, lr + it lt + movlt r6, lr +.if \n >= 2 + pop {lr} +.endif + subs r1, r1, #1 + vext.8 q1, q1, q1, #4 + vmov.16 d1[3], r6 + bgt 1b + pop {r0, pc} +endfunc +.endm + +output_lag 1 +output_lag 2 +output_lag 3 + + +function sum_lag1_above_neon + sub r12, r0, #1*GRAIN_WIDTH*2 - 16 + vld1.16 {q10}, [r12] // load top right + + vext.8 q0, q8, q9, #14 // top left, top mid + vext.8 q1, q9, q10, #2 // top left, top mid + + vmull.s16 q2, d18, d28 + vmlal.s16 q2, d0, d27 + vmlal.s16 q2, d2, d29 + vmull.s16 q3, d19, d28 + vmlal.s16 q3, d1, d27 + vmlal.s16 q3, d3, d29 + + vmov q8, q9 + vmov q9, q10 + + bx lr +endfunc + +.macro sum_lag_n_body lag, type, uv_layout, edge, elems, uv_coeff +.ifc \lag()_\edge, lag3_left + bl sum_lag3_left_above_neon +.else + bl sum_\lag()_above_neon +.endif +.ifc \type, uv_420 + vpush {q6-q7} + add r12, r11, #GRAIN_WIDTH*2 + vld1.16 {q0, q1}, [r11]! + vld1.16 {q6, q7}, [r12]! + vpadd.i16 d0, d0, d1 + vpadd.i16 d1, d2, d3 + vpadd.i16 d12, d12, d13 + vpadd.i16 d13, d14, d15 + vadd.i16 q0, q0, q6 + vpop {q6-q7} + vrshr.s16 q0, q0, #2 +.endif +.ifc \type, uv_422 + vld1.16 {q0, q1}, [r11]! + vpadd.i16 d0, d0, d1 + vpadd.i16 d1, d2, d3 + vrshr.s16 q0, q0, #1 +.endif +.ifc \type, uv_444 + vld1.16 {q0}, [r11]! 
+.endif +.if \uv_layout +.ifnb \uv_coeff + vdup.8 d13, \uv_coeff + vmovl.s8 q6, d13 +.endif + vmlal.s16 q2, d0, d13 + vmlal.s16 q3, d1, d13 +.endif +.if \uv_layout && \elems == 8 + b sum_\lag()_y_\edge()_start +.elseif \uv_layout == 444 && \elems == 7 + b sum_\lag()_y_\edge()_start +.elseif \uv_layout == 422 && \elems == 1 + b sum_\lag()_uv_420_\edge()_start +.else +sum_\lag()_\type()_\edge()_start: + push {r11} +.if \elems > 4 +.ifc \edge, left + increment_seed 4 + read_rand r11, 11, 3 + read_rand r12, 11, 2 + add r11, r3, r11, lsl #1 + add r12, r3, r12, lsl #1 + vld1.16 {d1[1]}, [r11] + read_rand r11, 11, 1 + vld1.16 {d1[2]}, [r12] + add r11, r3, r11, lsl #1 + vld1.16 {d1[3]}, [r11] + lsl r2, r2, #1 // shift back the state as if we'd done increment_seed with shift=0 + vrshl.s16 d1, d1, d30 + vext.8 q2, q2, q2, #12 +.ifc \lag, lag3 + vmov.s16 r10, d1[1] +.endif +.ifnc \lag, lag1 + vmov.s16 r8, d1[2] +.endif + vmov.s16 r6, d1[3] + + vmov q1, q2 + mov r1, #1 + bl output_\lag()_neon +.else + increment_seed 4, shift=0 + vmov q1, q2 + mov r1, #4 + bl output_\lag()_neon +.endif + + increment_seed 4, shift=0 + vmov q1, q3 +.ifc \edge, right + mov r1, #3 + bl output_\lag()_neon + read_shift_rand r12, 11 + add r12, r3, r12, lsl #1 + vld1.16 {d2[0]}, [r12] + vrshl.s16 d2, d2, d30 + vext.8 q0, q0, q1, #2 +.else + mov r1, #4 + bl output_\lag()_neon +.endif +.else + // elems == 1 + increment_seed 4, shift=0 + vmov q1, q2 + mov r1, #1 + bl output_\lag()_neon + lsr r2, r2, #3 + + read_rand r11, 11, 2 + read_rand r12, 11, 1 + add r11, r3, r11, lsl #1 + add r12, r3, r12, lsl #1 + vld1.16 {d2[0]}, [r11] + read_rand r11, 11, 0 + vld1.16 {d2[1]}, [r12] + add r11, r3, r11, lsl #1 + vld1.16 {d2[2]}, [r11] + vrshl.s16 d2, d2, d30 + vext.8 q0, q0, q1, #14 +.endif + vst1.16 {q0}, [r0]! 
+ pop {r11} + pop {r1, pc} +.endif +.endm + +.macro sum_lag1_func type, uv_layout, edge, elems=8 +function sum_\type()_lag1_\edge()_neon + push {r1, lr} +.ifc \edge, left + sub r12, r0, #1*GRAIN_WIDTH*2 + vld1.8 {q9}, [r12] // load the previous block right above +.endif + sum_lag_n_body lag1, \type, \uv_layout, \edge, \elems +endfunc +.endm + +sum_lag1_func y, 0, left +sum_lag1_func y, 0, mid +sum_lag1_func y, 0, right, 7 +sum_lag1_func uv_444, 444, left +sum_lag1_func uv_444, 444, mid +sum_lag1_func uv_444, 444, right, 7 +sum_lag1_func uv_422, 422, left +sum_lag1_func uv_422, 422, mid +sum_lag1_func uv_422, 422, right, 1 +sum_lag1_func uv_420, 420, left +sum_lag1_func uv_420, 420, mid +sum_lag1_func uv_420, 420, right, 1 + + +function sum_lag2_above_neon + push {lr} + sub r12, r0, #2*GRAIN_WIDTH*2 - 16 + sub lr, r0, #1*GRAIN_WIDTH*2 - 16 + vld1.16 {q10}, [r12] // load top right + vld1.16 {q13}, [lr] + + vdup.8 d10, d28[0] + vext.8 q0, q8, q9, #12 // top left, top mid + vdup.8 d12, d28[1] + vext.8 q1, q8, q9, #14 + vdup.8 d14, d28[3] + vext.8 q4, q9, q10, #2 // top mid, top right + vmovl.s8 q5, d10 + vmovl.s8 q6, d12 + vmovl.s8 q7, d14 + + vmull.s16 q2, d0, d10 + vmlal.s16 q2, d2, d12 + vmlal.s16 q2, d8, d14 + vmull.s16 q3, d1, d10 + vmlal.s16 q3, d3, d12 + vmlal.s16 q3, d9, d14 + + vdup.8 d10, d28[4] + vext.8 q0, q9, q10, #4 // top mid, top right + vdup.8 d12, d28[5] + vext.8 q1, q11, q12, #12 // top left, top mid + vdup.8 d14, d28[6] + vext.8 q4, q11, q12, #14 + vmovl.s8 q5, d10 + vmovl.s8 q6, d12 + vmovl.s8 q7, d14 + + vmlal.s16 q2, d0, d10 + vmlal.s16 q2, d2, d12 + vmlal.s16 q2, d8, d14 + vmlal.s16 q3, d1, d10 + vmlal.s16 q3, d3, d12 + vmlal.s16 q3, d9, d14 + + vdup.8 d10, d29[0] + vext.8 q0, q12, q13, #2 // top mid, top right + vdup.8 d12, d29[1] + vext.8 q1, q12, q13, #4 + + vdup.8 d14, d28[2] + vdup.8 d8, d28[7] + + vmovl.s8 q5, d10 + vmovl.s8 q6, d12 + vmovl.s8 q7, d14 + vmovl.s8 q4, d8 + + vmlal.s16 q2, d0, d10 + vmlal.s16 q2, d2, d12 + vmlal.s16 q2, d18, d14 + vmlal.s16 q2, d24, d8 + vmlal.s16 q3, d1, d10 + vmlal.s16 q3, d3, d12 + vmlal.s16 q3, d19, d14 + vmlal.s16 q3, d25, d8 + + vmov q8, q9 + vmov q9, q10 + + vmov q11, q12 + vmov q12, q13 + + pop {pc} +endfunc + +.macro sum_lag2_func type, uv_layout, edge, elems=8 +function sum_\type()_lag2_\edge()_neon + push {r1, lr} +.ifc \edge, left + sub r12, r0, #2*GRAIN_WIDTH*2 + sub lr, r0, #1*GRAIN_WIDTH*2 + vld1.16 {q9}, [r12] // load the previous block right above + vld1.16 {q12}, [lr] +.endif + sum_lag_n_body lag2, \type, \uv_layout, \edge, \elems, uv_coeff=d29[4] +endfunc +.endm + +sum_lag2_func y, 0, left +sum_lag2_func y, 0, mid +sum_lag2_func y, 0, right, 7 +sum_lag2_func uv_444, 444, left +sum_lag2_func uv_444, 444, mid +sum_lag2_func uv_444, 444, right, 7 +sum_lag2_func uv_422, 422, left +sum_lag2_func uv_422, 422, mid +sum_lag2_func uv_422, 422, right, 1 +sum_lag2_func uv_420, 420, left +sum_lag2_func uv_420, 420, mid +sum_lag2_func uv_420, 420, right, 1 + + +function sum_lag3_left_above_neon + // A separate codepath for the left edge, to avoid reading outside + // of the edge of the buffer. 
+ sub r12, r0, #3*GRAIN_WIDTH*2 + vld1.8 {q11, q12}, [r12] + vext.8 q12, q11, q12, #10 + vext.8 q11, q11, q11, #10 + b sum_lag3_above_start +endfunc + +function sum_lag3_above_neon + movw r12, #(3*GRAIN_WIDTH + 3)*2 + sub r12, r0, r12 + vld1.8 {q11, q12}, [r12] + +sum_lag3_above_start: + vdup.8 d12, d26[0] + vext.8 q1, q11, q12, #2 + vdup.8 d14, d26[1] + vext.8 q4, q11, q12, #4 + vdup.8 d16, d26[2] + vext.8 q5, q11, q12, #6 + vdup.8 d18, d26[3] + vmovl.s8 q6, d12 + vmovl.s8 q7, d14 + vmovl.s8 q8, d16 + vmovl.s8 q9, d18 + + movw r12, #(2*GRAIN_WIDTH + 3)*2 + sub r12, r0, r12 + + vmull.s16 q2, d22, d12 + vmlal.s16 q2, d2, d14 + vmlal.s16 q2, d8, d16 + vmlal.s16 q2, d10, d18 + vmull.s16 q3, d23, d12 + vmlal.s16 q3, d3, d14 + vmlal.s16 q3, d9, d16 + vmlal.s16 q3, d11, d18 + + vdup.8 d12, d26[4] + vext.8 q0, q11, q12, #8 + vdup.8 d14, d26[5] + vext.8 q1, q11, q12, #10 + vdup.8 d16, d26[6] + vext.8 q4, q11, q12, #12 + vld1.8 {q11, q12}, [r12] + vdup.8 d18, d26[7] + vmovl.s8 q6, d12 + vmovl.s8 q7, d14 + vmovl.s8 q8, d16 + vmovl.s8 q9, d18 + + vmlal.s16 q2, d0, d12 + vmlal.s16 q2, d2, d14 + vmlal.s16 q2, d8, d16 + vmlal.s16 q2, d22, d18 + vmlal.s16 q3, d1, d12 + vmlal.s16 q3, d3, d14 + vmlal.s16 q3, d9, d16 + vmlal.s16 q3, d23, d18 + + vdup.8 d12, d27[0] + vext.8 q0, q11, q12, #2 + vdup.8 d14, d27[1] + vext.8 q1, q11, q12, #4 + vdup.8 d16, d27[2] + vext.8 q4, q11, q12, #6 + vdup.8 d18, d27[3] + vext.8 q5, q11, q12, #8 + vmovl.s8 q6, d12 + vmovl.s8 q7, d14 + vmovl.s8 q8, d16 + vmovl.s8 q9, d18 + + sub r12, r0, #(1*GRAIN_WIDTH + 3)*2 + + vmlal.s16 q2, d0, d12 + vmlal.s16 q2, d2, d14 + vmlal.s16 q2, d8, d16 + vmlal.s16 q2, d10, d18 + vmlal.s16 q3, d1, d12 + vmlal.s16 q3, d3, d14 + vmlal.s16 q3, d9, d16 + vmlal.s16 q3, d11, d18 + + vdup.8 d12, d27[4] + vext.8 q0, q11, q12, #10 + vdup.8 d14, d27[5] + vext.8 q1, q11, q12, #12 + vld1.8 {q11, q12}, [r12] + vdup.8 d16, d27[6] + vdup.8 d18, d27[7] + vmovl.s8 q6, d12 + vmovl.s8 q7, d14 + vext.8 q5, q11, q12, #2 + vmovl.s8 q8, d16 + vmovl.s8 q9, d18 + + vmlal.s16 q2, d0, d12 + vmlal.s16 q2, d2, d14 + vmlal.s16 q2, d22, d16 + vmlal.s16 q2, d10, d18 + vmlal.s16 q3, d1, d12 + vmlal.s16 q3, d3, d14 + vmlal.s16 q3, d23, d16 + vmlal.s16 q3, d11, d18 + + vdup.8 d12, d28[0] + vext.8 q0, q11, q12, #4 + vdup.8 d14, d28[1] + vext.8 q1, q11, q12, #6 + vdup.8 d16, d28[2] + vext.8 q4, q11, q12, #8 + vdup.8 d18, d28[3] + vext.8 q5, q11, q12, #10 + vmovl.s8 q6, d12 + vmovl.s8 q7, d14 + vmovl.s8 q8, d16 + vmovl.s8 q9, d18 + + vmlal.s16 q2, d0, d12 + vmlal.s16 q2, d2, d14 + vmlal.s16 q2, d8, d16 + vmlal.s16 q2, d10, d18 + vmlal.s16 q3, d1, d12 + vmlal.s16 q3, d3, d14 + vmlal.s16 q3, d9, d16 + vmlal.s16 q3, d11, d18 + + vdup.8 d12, d28[4] + vext.8 q0, q11, q12, #12 + vmovl.s8 q6, d12 + + vmlal.s16 q2, d0, d12 + vmlal.s16 q3, d1, d12 + + bx lr +endfunc + +.macro sum_lag3_func type, uv_layout, edge, elems=8 +function sum_\type()_lag3_\edge()_neon + push {r1, lr} + sum_lag_n_body lag3, \type, \uv_layout, \edge, \elems, uv_coeff=d29[0] +endfunc +.endm + +sum_lag3_func y, 0, left +sum_lag3_func y, 0, mid +sum_lag3_func y, 0, right, 7 +sum_lag3_func uv_444, 444, left +sum_lag3_func uv_444, 444, mid +sum_lag3_func uv_444, 444, right, 7 +sum_lag3_func uv_422, 422, left +sum_lag3_func uv_422, 422, mid +sum_lag3_func uv_422, 422, right, 1 +sum_lag3_func uv_420, 420, left +sum_lag3_func uv_420, 420, mid +sum_lag3_func uv_420, 420, right, 1 + +function generate_grain_rows_neon + push {r10-r11,lr} +1: + mov r10, #80 +2: + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + subs r10, r10, #8 + 
vst1.16 {q0}, [r0]! + bgt 2b + get_grain_2 d0 + subs r1, r1, #1 + vst1.32 {d0[0]}, [r0]! + bgt 1b + pop {r10-r11,pc} +endfunc + +function generate_grain_rows_44_neon + push {r10-r11,lr} +1: + mov r10, #40 +2: + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + subs r10, r10, #8 + vst1.16 {q0}, [r0]! + bgt 2b + get_grain_4 d0 + subs r1, r1, #1 + vst1.16 {d0}, [r0] + add r0, r0, #GRAIN_WIDTH*2-80 + bgt 1b + pop {r10-r11,pc} +endfunc + +function gen_grain_uv_444_lag0_neon + vld1.16 {q3}, [r11]! +gen_grain_uv_lag0_8_start: + push {r11,lr} + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 +gen_grain_uv_lag0_8_add: + vand q3, q3, q1 + vmull.s16 q2, d6, d22 + vmull.s16 q3, d7, d22 + vrshl.s32 q2, q2, q12 + vrshl.s32 q3, q3, q12 + vqmovn.s32 d4, q2 + vqmovn.s32 d5, q3 + vqadd.s16 q2, q2, q0 + vmin.s16 q2, q2, q9 + vmax.s16 q2, q2, q10 + vst1.16 {q2}, [r0]! + pop {r11,pc} +endfunc + +function gen_grain_uv_420_lag0_8_neon + add r12, r11, #GRAIN_WIDTH*2 + vld1.16 {q2,q3}, [r11]! + vld1.16 {q4,q5}, [r12] + vpadd.i16 d4, d4, d5 + vpadd.i16 d5, d6, d7 + vpadd.i16 d8, d8, d9 + vpadd.i16 d9, d10, d11 + vadd.i16 q2, q2, q4 + vrshr.s16 q3, q2, #2 + b gen_grain_uv_lag0_8_start +endfunc + +function gen_grain_uv_422_lag0_8_neon + vld1.16 {q2,q3}, [r11]! + vpadd.i16 d4, d4, d5 + vpadd.i16 d5, d6, d7 + vrshr.s16 q3, q2, #1 + b gen_grain_uv_lag0_8_start +endfunc + +function gen_grain_uv_420_lag0_4_neon + add r12, r11, #GRAIN_WIDTH*2 + vld1.16 {q2}, [r11] + vld1.16 {q0}, [r12] + add r11, r11, #32 + vpadd.i16 d4, d4, d5 + vpadd.i16 d0, d0, d1 + vadd.i16 d4, d4, d0 + vrshr.s16 d6, d4, #2 + push {r11,lr} + get_grain_4 d0 + b gen_grain_uv_lag0_8_add +endfunc + +function gen_grain_uv_422_lag0_4_neon + vld1.16 {q2}, [r11] + add r11, r11, #32 + vpadd.i16 d4, d4, d5 + vrshr.s16 d6, d4, #1 + push {r11,lr} + get_grain_4 d0 + b gen_grain_uv_lag0_8_add +endfunc + +.macro gen_grain_82 type +function generate_grain_\type()_16bpc_neon, export=1 + push {r4-r11,lr} + +.ifc \type, uv_444 + ldr r4, [sp, #36] + mov r12, r3 + mov lr, #28 + add r11, r1, #3*GRAIN_WIDTH*2 + mov r1, r2 + mul r12, r12, lr + clz lr, r4 +.else + clz lr, r2 +.endif + movrel r3, X(gaussian_sequence) + sub lr, lr, #24 // -bitdepth_min_8 + ldr r2, [r1, #FGD_SEED] + ldr r9, [r1, #FGD_GRAIN_SCALE_SHIFT] +.ifc \type, y + add r4, r1, #FGD_AR_COEFFS_Y +.else + add r4, r1, #FGD_AR_COEFFS_UV +.endif + add r9, r9, lr // grain_scale_shift - bitdepth_min_8 + adr r5, L(gen_grain_\type()_tbl) + ldr r6, [r1, #FGD_AR_COEFF_LAG] + add r9, r9, #4 + ldr r6, [r5, r6, lsl #2] + vdup.16 q15, r9 // 4 - bitdepth_min_8 + data->grain_scale_shift + add r5, r5, r6 + vneg.s16 q15, q15 + +.ifc \type, uv_444 + push {lr} + cmp r12, #0 + movw r10, #0x49d8 + movw lr, #0xb524 + // Intentionally using a separate register instead of moveq with an + // immediate constant, to avoid armv8 deprecated it instruction forms. 
+ it eq + moveq r10, lr + add r4, r4, r12 // Add offset to ar_coeffs_uv[1] + eor r2, r2, r10 + pop {lr} +.endif + + ldr r7, [r1, #FGD_AR_COEFF_SHIFT] + neg lr, lr // bitdepth_min_8 + mov r8, #1 + mov r10, #1 + lsl r8, r8, r7 // 1 << ar_coeff_shift + lsl r10, r10, r9 // 1 << (4 + data->grain_scale_shift) + lsr r8, r8, #1 // 1 << (ar_coeff_shift - 1) + lsr r10, r10, #1 // 1 << (4 + data->grain_scale_shift - 1) + + bx r5 + + .align 2 +L(gen_grain_\type()_tbl): + .word L(generate_grain_\type()_lag0) - L(gen_grain_\type()_tbl) + CONFIG_THUMB + .word L(generate_grain_\type()_lag1) - L(gen_grain_\type()_tbl) + CONFIG_THUMB + .word L(generate_grain_\type()_lag2) - L(gen_grain_\type()_tbl) + CONFIG_THUMB + .word L(generate_grain_\type()_lag3) - L(gen_grain_\type()_tbl) + CONFIG_THUMB + +L(generate_grain_\type()_lag0): +.ifc \type, y + mov r1, #GRAIN_HEIGHT + bl generate_grain_rows_neon +.else + mov r5, #128 + lsl r5, r5, lr // 128 << bitdepth_min_8 + sub r5, r5, #1 // (128 << bitdepth_min_8) - 1 + mvn r6, r5 // grain_min = ~grain_max + + mov r1, #3 + bl generate_grain_rows_neon + mov r1, #GRAIN_HEIGHT-3 + + vdup.32 q12, r7 + vld1.8 {d22[]}, [r4] // ar_coeffs_uv[0] + vmov.i8 q0, #0 + vmov.i8 q1, #255 + vdup.16 q9, r5 + vdup.16 q10, r6 + vext.8 q13, q0, q1, #10 + vext.8 q14, q1, q0, #2 + vneg.s32 q12, q12 + vmovl.s8 q11, d22 + +1: + vmov q1, q13 + bl gen_grain_uv_444_lag0_neon // 8 + vmov.i8 q1, #255 + bl gen_grain_uv_444_lag0_neon // 16 + bl gen_grain_uv_444_lag0_neon // 24 + bl gen_grain_uv_444_lag0_neon // 32 + bl gen_grain_uv_444_lag0_neon // 40 + bl gen_grain_uv_444_lag0_neon // 48 + bl gen_grain_uv_444_lag0_neon // 56 + bl gen_grain_uv_444_lag0_neon // 64 + bl gen_grain_uv_444_lag0_neon // 72 + vmov q1, q14 + bl gen_grain_uv_444_lag0_neon // 80 + get_grain_2 d16 + subs r1, r1, #1 + add r11, r11, #4 + vst1.32 {d16[0]}, [r0]! + bgt 1b +.endif + pop {r4-r11,pc} + +L(generate_grain_\type()_lag1): + vpush {q4-q7} + mov r5, #128 + lsl r5, r5, lr // 128 << bitdepth_min_8 + sub r5, r5, #1 // (128 << bitdepth_min_8) - 1 + vld1.8 {d27[]}, [r4]! // ar_coeffs_y[0] + vld1.8 {d28[]}, [r4]! // ar_coeffs_y[1] + vld1.8 {d29[]}, [r4] // ar_coeffs_y[2] +.ifc \type, y + ldrsb r4, [r4, #1] // ar_coeffs_y[3] +.else + add r4, r4, #2 +.endif + + mov r1, #3 +.ifc \type, uv_444 + vld1.8 {d13[]}, [r4] // ar_coeffs_uv[4] + ldrsb r4, [r4, #-1] // ar_coeffs_uv[3] +.endif + bl generate_grain_rows_neon + vmovl.s8 q13, d27 + vmovl.s8 q12, d29 + vmovl.s8 q14, d28 + vmov d29, d24 +.ifc \type, uv_444 + vmovl.s8 q6, d13 +.endif + + mov r1, #GRAIN_HEIGHT - 3 +1: + bl sum_\type()_lag1_left_neon // 8 + bl sum_\type()_lag1_mid_neon // 16 + bl sum_\type()_lag1_mid_neon // 24 + bl sum_\type()_lag1_mid_neon // 32 + bl sum_\type()_lag1_mid_neon // 40 + bl sum_\type()_lag1_mid_neon // 48 + bl sum_\type()_lag1_mid_neon // 56 + bl sum_\type()_lag1_mid_neon // 64 + bl sum_\type()_lag1_mid_neon // 72 + bl sum_\type()_lag1_right_neon // 80 + get_grain_2 d16 + subs r1, r1, #1 +.ifc \type, uv_444 + add r11, r11, #4 +.endif + vst1.32 {d16[0]}, [r0]! 
+ bgt 1b + + vpop {q4-q7} + pop {r4-r11,pc} + +L(generate_grain_\type()_lag2): + vpush {q4-q7} + mov r5, #128 + lsl r5, r5, lr // 128 << bitdepth_min_8 + sub r5, r5, #1 // (128 << bitdepth_min_8) - 1 + vld1.8 {d28,d29}, [r4] // ar_coeffs_y[0-11], ar_coeffs_uv[0-12] + + vmov.s8 r4, d29[2] + vmov.s8 r10, d29[3] + + mov r1, #3 + bl generate_grain_rows_neon + + mov r1, #GRAIN_HEIGHT - 3 +1: + bl sum_\type()_lag2_left_neon // 8 + bl sum_\type()_lag2_mid_neon // 16 + bl sum_\type()_lag2_mid_neon // 24 + bl sum_\type()_lag2_mid_neon // 32 + bl sum_\type()_lag2_mid_neon // 40 + bl sum_\type()_lag2_mid_neon // 48 + bl sum_\type()_lag2_mid_neon // 56 + bl sum_\type()_lag2_mid_neon // 64 + bl sum_\type()_lag2_mid_neon // 72 + bl sum_\type()_lag2_right_neon // 80 + get_grain_2 d16 + subs r1, r1, #1 +.ifc \type, uv_444 + add r11, r11, #4 +.endif + vst1.32 {d16[0]}, [r0]! + bgt 1b + + vpop {q4-q7} + pop {r4-r11,pc} + +L(generate_grain_\type()_lag3): + vpush {q4-q7} + mov r5, #128 + lsl r5, r5, lr // 128 << bitdepth_min_8 + sub r5, r5, #1 // (128 << bitdepth_min_8) - 1 + vld1.8 {q13, q14}, [r4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24] + + vmov.u8 r4, d28[5] + vmov.u8 r10, d28[6] + vmov.u8 r12, d28[7] + + orr r4, r4, r10, lsl #8 + orr r4, r4, r12, lsl #16 + + mov r1, #3 + vpush {d26} + bl generate_grain_rows_neon + vpop {d26} + + mov r1, #GRAIN_HEIGHT - 3 +1: + bl sum_\type()_lag3_left_neon // 8 + bl sum_\type()_lag3_mid_neon // 16 + bl sum_\type()_lag3_mid_neon // 24 + bl sum_\type()_lag3_mid_neon // 32 + bl sum_\type()_lag3_mid_neon // 40 + bl sum_\type()_lag3_mid_neon // 48 + bl sum_\type()_lag3_mid_neon // 56 + bl sum_\type()_lag3_mid_neon // 64 + bl sum_\type()_lag3_mid_neon // 72 + bl sum_\type()_lag3_right_neon // 80 + get_grain_2 d16 + subs r1, r1, #1 +.ifc \type, uv_444 + add r11, r11, #4 +.endif + vst1.32 {d16[0]}, [r0]! + bgt 1b + + vpop {q4-q7} + pop {r4-r11,pc} +endfunc +.endm + +gen_grain_82 y +gen_grain_82 uv_444 + +.macro set_height dst, type +.ifc \type, uv_420 + mov \dst, #SUB_GRAIN_HEIGHT-3 +.else + mov \dst, #GRAIN_HEIGHT-3 +.endif +.endm + +.macro increment_y_ptr reg, type +.ifc \type, uv_420 + add \reg, \reg, #2*GRAIN_WIDTH*2-(6*32) +.else + sub \reg, \reg, #6*32-GRAIN_WIDTH*2 +.endif +.endm + +.macro gen_grain_44 type +function generate_grain_\type()_16bpc_neon, export=1 + push {r4-r11,lr} + + ldr r4, [sp, #36] + mov r12, r3 + movw r11, #(3*GRAIN_WIDTH-3)*2 + mov lr, #28 + add r11, r1, r11 + mov r1, r2 + mul r12, r12, lr + clz lr, r4 + + movrel r3, X(gaussian_sequence) + sub lr, lr, #24 // -bitdepth_min_8 + ldr r2, [r1, #FGD_SEED] + ldr r9, [r1, #FGD_GRAIN_SCALE_SHIFT] + add r4, r1, #FGD_AR_COEFFS_UV + add r9, r9, lr // grain_scale_shift - bitdepth_min_8 + adr r5, L(gen_grain_\type()_tbl) + ldr r6, [r1, #FGD_AR_COEFF_LAG] + add r9, r9, #4 + ldr r6, [r5, r6, lsl #2] + vdup.16 q15, r9 // 4 - bitdepth_min_8 + data->grain_scale_shift + add r5, r5, r6 + vneg.s16 q15, q15 + + push {lr} + cmp r12, #0 + movw r10, #0x49d8 + movw lr, #0xb524 + // Intentionally using a separate register instead of moveq with an + // immediate constant, to avoid armv8 deprecated it instruction forms. 
+ it eq + moveq r10, lr + add r4, r4, r12 // Add offset to ar_coeffs_uv[1] + eor r2, r2, r10 + pop {lr} + + ldr r7, [r1, #FGD_AR_COEFF_SHIFT] + neg lr, lr + mov r8, #1 + mov r10, #1 + lsl r8, r8, r7 // 1 << ar_coeff_shift + lsl r10, r10, r9 // 1 << (4 + data->grain_scale_shift) + lsr r8, r8, #1 // 1 << (ar_coeff_shift - 1) + lsr r10, r10, #1 // 1 << (4 + data->grain_scale_shift - 1) + bx r5 + + .align 2 +L(gen_grain_\type()_tbl): + .word L(generate_grain_\type()_lag0) - L(gen_grain_\type()_tbl) + CONFIG_THUMB + .word L(generate_grain_\type()_lag1) - L(gen_grain_\type()_tbl) + CONFIG_THUMB + .word L(generate_grain_\type()_lag2) - L(gen_grain_\type()_tbl) + CONFIG_THUMB + .word L(generate_grain_\type()_lag3) - L(gen_grain_\type()_tbl) + CONFIG_THUMB + +L(generate_grain_\type()_lag0): +.ifc \type, uv_420 + vpush {q4-q5} +.endif + mov r5, #128 + lsl r5, r5, lr // 128 << bitdepth_min_8 + sub r5, r5, #1 // (128 << bitdepth_min_8) - 1 + mvn r6, r5 // grain_min = ~grain_max + + mov r1, #3 + bl generate_grain_rows_44_neon + set_height r1, \type + + vdup.32 q12, r7 + vld1.8 {d22[]}, [r4] // ar_coeffs_uv[0] + vmov.i8 q0, #0 + vmov.i8 q1, #255 + vdup.16 q9, r5 + vdup.16 q10, r6 + vext.8 q13, q0, q1, #10 + vext.8 q14, q1, q0, #14 + vneg.s32 q12, q12 + vmovl.s8 q11, d22 + +1: + vmov q1, q13 + bl gen_grain_\type()_lag0_8_neon // 8 + vmov.i8 q1, #255 + bl gen_grain_\type()_lag0_8_neon // 16 + bl gen_grain_\type()_lag0_8_neon // 24 + bl gen_grain_\type()_lag0_8_neon // 32 + bl gen_grain_\type()_lag0_8_neon // 40 + vmov q1, q14 + bl gen_grain_\type()_lag0_4_neon // 44 + subs r1, r1, #1 + increment_y_ptr r11, \type + add r0, r0, #GRAIN_WIDTH*2-6*16 + bgt 1b + +.ifc \type, uv_420 + vpop {q4-q5} +.endif + pop {r4-r11,pc} + +L(generate_grain_\type()_lag1): + vpush {q4-q7} + mov r5, #128 + lsl r5, r5, lr // 128 << bitdepth_min_8 + sub r5, r5, #1 // (128 << bitdepth_min_8) - 1 + vld1.8 {d27[]}, [r4]! // ar_coeffs_uv[0] + vld1.8 {d28[]}, [r4]! 
// ar_coeffs_uv[1] + vld1.8 {d29[]}, [r4] // ar_coeffs_uv[2] + add r4, r4, #2 + + mov r1, #3 + vld1.8 {d13[]}, [r4] // ar_coeffs_uv[4] + ldrsb r4, [r4, #-1] // ar_coeffs_uv[3] + bl generate_grain_rows_44_neon + vmovl.s8 q13, d27 + vmovl.s8 q12, d29 + vmovl.s8 q14, d28 + vmov d29, d24 + vmovl.s8 q6, d13 + + set_height r1, \type +1: + bl sum_\type()_lag1_left_neon // 8 + bl sum_\type()_lag1_mid_neon // 16 + bl sum_\type()_lag1_mid_neon // 24 + bl sum_\type()_lag1_mid_neon // 32 + bl sum_\type()_lag1_mid_neon // 40 + bl sum_\type()_lag1_right_neon // 44 + subs r1, r1, #1 + increment_y_ptr r11, \type + add r0, r0, #GRAIN_WIDTH*2-6*16 + bgt 1b + + vpop {q4-q7} + pop {r4-r11,pc} + +L(generate_grain_\type()_lag2): + vpush {q4-q7} + mov r5, #128 + lsl r5, r5, lr // 128 << bitdepth_min_8 + sub r5, r5, #1 // (128 << bitdepth_min_8) - 1 + vld1.8 {d28,d29}, [r4] // ar_coeffs_uv[0-12] + + vmov.s8 r4, d29[2] + vmov.s8 r10, d29[3] + + mov r1, #3 + bl generate_grain_rows_44_neon + + set_height r1, \type +1: + bl sum_\type()_lag2_left_neon // 8 + bl sum_\type()_lag2_mid_neon // 16 + bl sum_\type()_lag2_mid_neon // 24 + bl sum_\type()_lag2_mid_neon // 32 + bl sum_\type()_lag2_mid_neon // 40 + bl sum_\type()_lag2_right_neon // 44 + subs r1, r1, #1 + increment_y_ptr r11, \type + add r0, r0, #GRAIN_WIDTH*2-6*16 + bgt 1b + + vpop {q4-q7} + pop {r4-r11,pc} + +L(generate_grain_\type()_lag3): + vpush {q4-q7} + mov r5, #128 + lsl r5, r5, lr // 128 << bitdepth_min_8 + sub r5, r5, #1 // (128 << bitdepth_min_8) - 1 + vld1.8 {q13, q14}, [r4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24] + + vmov.u8 r4, d28[5] + vmov.u8 r10, d28[6] + vmov.u8 r12, d28[7] + + orr r4, r4, r10, lsl #8 + orr r4, r4, r12, lsl #16 + + mov r1, #3 + bl generate_grain_rows_44_neon + + set_height r1, \type +1: + bl sum_\type()_lag3_left_neon // 8 + bl sum_\type()_lag3_mid_neon // 16 + bl sum_\type()_lag3_mid_neon // 24 + bl sum_\type()_lag3_mid_neon // 32 + bl sum_\type()_lag3_mid_neon // 40 + bl sum_\type()_lag3_right_neon // 44 + subs r1, r1, #1 + increment_y_ptr r11, \type + add r0, r0, #GRAIN_WIDTH*2-6*16 + bgt 1b + + vpop {q4-q7} + pop {r4-r11,pc} +endfunc +.endm + +gen_grain_44 uv_420 +gen_grain_44 uv_422 + +.macro gather_interleaved dst1, dst2, src1, src2, src3, src4, off + vmov.u16 r11, \src1[0+\off] + vmov.u16 r12, \src3[0+\off] + add r11, r11, r3 + vmov.u16 lr, \src1[2+\off] + add r12, r12, r3 + vld1.8 {\dst1[0+\off]}, [r11] + vmov.u16 r11, \src3[2+\off] + add lr, lr, r3 + vld1.8 {\dst2[0+\off]}, [r12] + vmov.u16 r12, \src2[0+\off] + add r11, r11, r3 + vld1.8 {\dst1[2+\off]}, [lr] + vmov.u16 lr, \src4[0+\off] + add r12, r12, r3 + vld1.8 {\dst2[2+\off]}, [r11] + vmov.u16 r11, \src2[2+\off] + add lr, lr, r3 + vld1.8 {\dst1[4+\off]}, [r12] + vmov.u16 r12, \src4[2+\off] + add r11, r11, r3 + vld1.8 {\dst2[4+\off]}, [lr] + add r12, r12, r3 + vld1.8 {\dst1[6+\off]}, [r11] + vld1.8 {\dst2[6+\off]}, [r12] +.endm + +.macro gather dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, src7, src8 + gather_interleaved \dst1, \dst3, \src1, \src2, \src5, \src6, 0 + gather_interleaved \dst1, \dst3, \src1, \src2, \src5, \src6, 1 + gather_interleaved \dst2, \dst4, \src3, \src4, \src7, \src8, 0 + gather_interleaved \dst2, \dst4, \src3, \src4, \src7, \src8, 1 +.endm + +function gather32_neon + push {r11-r12,lr} + gather d8, d9, d10, d11, d0, d1, d2, d3, d4, d5, d6, d7 + pop {r11-r12,pc} +endfunc + +function gather16_neon + push {r11-r12,lr} + gather_interleaved d8, d9, d0, d1, d2, d3, 0 + gather_interleaved d8, d9, d0, d1, d2, d3, 1 + pop {r11-r12,pc} 
+endfunc + +const overlap_coeffs_0, align=4 + .short 27, 17, 0, 0 + .short 17, 27, 32, 32 +endconst + +const overlap_coeffs_1, align=4 + .short 23, 0, 0, 0 + .short 22, 32, 32, 32 +endconst + +.macro calc_offset offx, offy, src, sx, sy + and \offy, \src, #0xF // randval & 0xF + lsr \offx, \src, #4 // randval >> 4 +.if \sy == 0 + add \offy, \offy, \offy // 2 * (randval & 0xF) +.endif +.if \sx == 0 + add \offx, \offx, \offx // 2 * (randval >> 4) +.endif +.endm + +.macro add_offset dst, offx, offy, src, stride + mla \dst, \stride, \offy, \src // grain_lut += grain_stride * offy + add \dst, \dst, \offx, lsl #1 // grain_lut += offx +.endm + +// void dav1d_fgy_32x32_16bpc_neon(pixel *const dst, const pixel *const src, +// const ptrdiff_t stride, +// const uint8_t scaling[SCALING_SIZE], +// const int scaling_shift, +// const entry grain_lut[][GRAIN_WIDTH], +// const int offsets[][2], +// const int h, const ptrdiff_t clip, +// const ptrdiff_t type, +// const int bitdepth_max); +function fgy_32x32_16bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] // scaling_shift, grain_lut + ldrd r6, r7, [sp, #108] // offsets, h + ldr r8, [sp, #116] // clip + mov r9, #GRAIN_WIDTH*2 // grain_lut stride + ldr r10, [sp, #124] // bitdepth_max + + eor r4, r4, #15 // 15 - scaling_shift + vdup.16 q6, r10 // bitdepth_max + clz r10, r10 + vdup.16 q13, r4 // 15 - scaling_shift + rsb r10, r10, #24 // bitdepth_min_8 + cmp r8, #0 + vdup.16 q12, r10 // bitdepth_min_8 + + movrel_local r12, overlap_coeffs_0 + + beq 1f + // clip + vmov.i16 q14, #16 + vmov.i16 q15, #235 + vshl.s16 q14, q14, q12 + vshl.s16 q15, q15, q12 + b 2f +1: + // no clip + vmov.i16 q14, #0 + vmov q15, q6 +2: + vshr.u16 q6, q6, #1 // grain_max + + vld1.16 {d24, d25}, [r12, :128] // overlap_coeffs + + add r5, r5, #18 // grain_lut += 9 + add r5, r5, r9, lsl #3 // grain_lut += 8 * grain_stride + add r5, r5, r9 // grain_lut += grain_stride + + ldr r10, [r6, #8] // offsets[1][0] + calc_offset r10, r4, r10, 0, 0 + add_offset r4, r10, r4, r5, r9 + ldr r10, [r6, #4] // offsets[0][1] + calc_offset r10, r11, r10, 0, 0 + add_offset r11, r10, r11, r5, r9 + ldr r10, [r6, #12] // offsets[1][1] + calc_offset r10, r8, r10, 0, 0 + add_offset r8, r10, r8, r5, r9 + ldr r6, [r6] // offsets[0][0] + calc_offset r6, lr, r6, 0, 0 + add_offset r5, r6, lr, r5, r9 + + add r4, r4, #32*2 // grain_lut += BLOCK_SIZE * bx + add r6, r11, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by + + ldr r10, [sp, #120] // type + adr r11, L(fgy_loop_tbl) + + tst r10, #1 + ldr r10, [r11, r10, lsl #2] + + add r8, r8, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by + add r8, r8, #32*2 // grain_lut += BLOCK_SIZE * bx + + add r11, r11, r10 + + beq 1f + // y overlap + vdup.16 d14, d24[0] + vdup.16 d15, d24[1] + mov r10, r7 // backup actual h + mov r7, #2 +1: + sub r2, r2, #32 // src_stride -= 32 + sub r9, r9, #32 // grain_stride -= 32 + bx r11 +endfunc + +function fgy_loop_neon +L(fgy_loop_tbl): + .word L(loop_00) - L(fgy_loop_tbl) + CONFIG_THUMB + .word L(loop_01) - L(fgy_loop_tbl) + CONFIG_THUMB + .word L(loop_10) - L(fgy_loop_tbl) + CONFIG_THUMB + .word L(loop_11) - L(fgy_loop_tbl) + CONFIG_THUMB + +.macro fgy ox, oy +L(loop_\ox\oy): +1: +.if \ox + vld1.16 {d0}, [r4], r9 // grain_lut old +.endif +.if \oy + vld1.16 {q2, q3}, [r6]! // grain_lut top +.endif +.if \ox && \oy + vld1.16 {d2}, [r8], r9 // grain_lut top old +.endif +.if \oy + vld1.16 {q4, q5}, [r6], r9 // grain_lut top +.endif +.if !\ox && !\oy + vld1.16 {q0, q1}, [r1, :128]! 
// src +.endif + vld1.16 {q8, q9}, [r5]! // grain_lut +.if !\ox && !\oy + vld1.16 {q2, q3}, [r1, :128], r2 // src +.endif +.if !\oy + vmvn.i16 q5, #0xf000 // 0x0fff +.endif + vld1.16 {q10, q11}, [r5], r9 // grain_lut + +.if \ox + add r4, r4, #32 + vmull.s16 q0, d0, d24 + vmlal.s16 q0, d16, d25 +.endif + +.if \oy +.if \ox + add r8, r8, #32 + vmull.s16 q1, d2, d24 + vmlal.s16 q1, d4, d25 + vqrshrn.s32 d16, q0, #5 + vmvn d0, d12 // grain_min + vqrshrn.s32 d4, q1, #5 + vmin.s16 d16, d16, d12 + vmin.s16 d4, d4, d12 + vmax.s16 d16, d16, d0 + vmax.s16 d4, d4, d0 +.endif + + vmull.s16 q0, d4, d14 + vmull.s16 q1, d5, d14 + vmull.s16 q2, d6, d14 + vmull.s16 q3, d7, d14 + vmlal.s16 q0, d16, d15 + vmlal.s16 q1, d17, d15 + vmlal.s16 q2, d18, d15 + vmlal.s16 q3, d19, d15 + vmull.s16 q8, d20, d15 + vmull.s16 q9, d21, d15 + vmull.s16 q10, d22, d15 + vmull.s16 q11, d23, d15 + vmlal.s16 q8, d8, d14 + vmlal.s16 q9, d9, d14 + vmlal.s16 q10, d10, d14 + vmlal.s16 q11, d11, d14 + vmvn q4, q6 // grain_min + vqrshrn.s32 d0, q0, #5 + vqrshrn.s32 d1, q1, #5 + vqrshrn.s32 d2, q2, #5 + vqrshrn.s32 d3, q3, #5 + vqrshrn.s32 d4, q8, #5 + vqrshrn.s32 d5, q9, #5 + vqrshrn.s32 d6, q10, #5 + vqrshrn.s32 d7, q11, #5 + vmin.s16 q8, q0, q6 + vmin.s16 q9, q1, q6 + vld1.16 {q0, q1}, [r1, :128]! // src + vmin.s16 q10, q2, q6 + vmin.s16 q11, q3, q6 + vmax.s16 q8, q8, q4 + vmax.s16 q9, q9, q4 + vld1.16 {q2, q3}, [r1, :128], r2 // src + vmvn.i16 q5, #0xf000 // 0x0fff + vmax.s16 q10, q10, q4 + vmax.s16 q11, q11, q4 +.elseif \ox + vmvn d4, d12 // grain_min + vqrshrn.s32 d16, q0, #5 + vld1.16 {q0, q1}, [r1, :128]! // src + vmin.s16 d16, d16, d12 + vmax.s16 d16, d16, d4 + vld1.16 {q2, q3}, [r1, :128], r2 // src +.endif + + // Make sure that uninitialized pixels out of range past the right + // edge are in range; their actual values shouldn't matter. + vand q0, q0, q5 + vand q1, q1, q5 + vand q2, q2, q5 + vand q3, q3, q5 + + bl gather32_neon + +.if \ox || \oy + vpush {q6-q7} +.endif + + vmovl.u8 q6, d8 // scaling + vmovl.u8 q7, d9 + vmovl.u8 q4, d10 + vmovl.u8 q5, d11 + + vshl.u16 q6, q6, q13 // scaling << (15 - scaling_shift) + vshl.u16 q7, q7, q13 + vshl.u16 q4, q4, q13 + vshl.u16 q5, q5, q13 + + vqrdmulh.s16 q8, q8, q6 // round2((scaling << (15 - scaling_shift) * grain, 15) + vqrdmulh.s16 q9, q9, q7 + vqrdmulh.s16 q10, q10, q4 + vqrdmulh.s16 q11, q11, q5 + +.if \ox || \oy + vpop {q6-q7} +.endif + + vqadd.s16 q0, q0, q8 // *src + noise + vqadd.s16 q1, q1, q9 + vqadd.s16 q2, q2, q10 + vqadd.s16 q3, q3, q11 + + vmax.s16 q0, q0, q14 + vmax.s16 q1, q1, q14 + vmax.s16 q2, q2, q14 + vmax.s16 q3, q3, q14 + vmin.s16 q0, q0, q15 + vmin.s16 q1, q1, q15 + vmin.s16 q2, q2, q15 + vmin.s16 q3, q3, q15 + + vst1.16 {q0, q1}, [r0, :128]! 
// dst + subs r7, r7, #1 +.if \oy + vdup.16 d14, d25[0] + vdup.16 d15, d25[1] +.endif + vst1.16 {q2, q3}, [r0, :128], r2 // dst + bgt 1b + +.if \oy + cmp r10, #2 + sub r7, r10, #2 // restore actual remaining h + bgt L(loop_\ox()0) +.endif + vpop {q4-q7} + pop {r4-r11,pc} +.endm + + fgy 0, 0 + fgy 0, 1 + fgy 1, 0 + fgy 1, 1 +endfunc + +// void dav1d_fguv_32x32_420_16bpc_neon(pixel *const dst, +// const pixel *const src, +// const ptrdiff_t stride, +// const uint8_t scaling[SCALING_SIZE], +// const Dav1dFilmGrainData *const data, +// const entry grain_lut[][GRAIN_WIDTH], +// const pixel *const luma_row, +// const ptrdiff_t luma_stride, +// const int offsets[][2], +// const ptrdiff_t h, const ptrdiff_t uv, +// const ptrdiff_t is_id, +// const ptrdiff_t type, +// const int bitdepth_max); +.macro fguv layout, sx, sy +function fguv_32x32_\layout()_16bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] // data, grain_lut + ldrd r10, r11, [sp, #124] // uv, is_id + ldr r6, [sp, #136] // bitdepth_max + + clz r7, r6 + rsb r7, r7, #24 // bitdepth_min_8 + + // !csfl + add r10, r4, r10, lsl #2 // + 4*uv + add r12, r10, #FGD_UV_LUMA_MULT + add lr, r10, #FGD_UV_MULT + ldrh r10, [r10, #FGD_UV_OFFSET] // uv_offset + vld1.16 {d30[]}, [r12] // uv_luma_mult + lsl r10, r10, r7 // uv_offset << bitdepth_min_8 + vld1.16 {d30[1]}, [lr] // uv_mult + + ldr lr, [r4, #FGD_SCALING_SHIFT] + ldr r12, [r4, #FGD_CLIP_TO_RESTRICTED_RANGE] + eor lr, lr, #15 // 15 - scaling_shift + + vmov.16 d30[2], r10 // uv_offset << bitdepth_min_8 + + cmp r12, #0 + vdup.16 q13, lr // 15 - scaling_shift + + beq 1f + // clip + cmp r11, #0 + mov r8, #16 + mov r9, #240 + lsl r8, r8, r7 + lsl r9, r9, r7 + beq 2f + // is_id + mov r9, #235 + lsl r9, r9, r7 + b 2f +1: + // no clip + mov r8, #0 + mov r9, r6 // bitdepth_max +2: + vmov.16 d30[3], r6 // bitdepth_max + vdup.16 d31, r8 // clip_min + + mov r10, #GRAIN_WIDTH*2 // grain_lut stride + +.if \sy + mov r6, #23 + mov r7, #22 +.else + mov r6, #27 + mov r7, #17 +.endif + vmov.16 d31[1], r9 // clip_max + + ldrd r8, r9, [sp, #116] // offsets, h + + add r5, r5, #(2*(3 + (2 >> \sx)*3)) // grain_lut += 9 or 6 +.if \sy + add r5, r5, r10, lsl #2 // grain_lut += 4 * grain_stride + add r5, r5, r10, lsl #1 // grain_lut += 2 * grain_stride +.else + add r5, r5, r10, lsl #3 // grain_lut += 8 * grain_stride + add r5, r5, r10 // grain_lut += grain_stride +.endif + vmov.16 d31[2], r6 // overlap y [0] + + ldr r12, [r8, #8] // offsets[1][0] + calc_offset r12, r4, r12, \sx, \sy + add_offset r4, r12, r4, r5, r10 + + ldr r12, [r8, #4] // offsets[0][1] + calc_offset r12, lr, r12, \sx, \sy + add_offset lr, r12, lr, r5, r10 + + ldr r12, [r8, #12] // offsets[1][1] + calc_offset r12, r11, r12, \sx, \sy + add_offset r11, r12, r11, r5, r10 + + ldr r8, [r8] // offsets[0][0] + calc_offset r8, r12, r8, \sx, \sy + add_offset r5, r8, r12, r5, r10 + + vmov.16 d31[3], r7 // overlap y [1] + + add r4, r4, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx + add r8, lr, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by + add r11, r11, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by + add r11, r11, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx + + movrel_local r12, overlap_coeffs_\sx + ldr lr, [sp, #132] // type + ldrd r6, r7, [sp, #108] // luma_row, luma_stride + + vld1.16 {d24, d25}, [r12, :128] // overlap_coeffs + + movrel_local r12, L(fguv_loop_sx\sx()_tbl) +#if CONFIG_THUMB + // This uses movrel_local instead of adr above, because the target + // can be out of range for 
adr. But movrel_local leaves the thumb bit + // set on COFF (but probably wouldn't if building for thumb on ELF), + // thus try to clear the bit for robustness. + bic r12, r12, #1 +#endif + + tst lr, #1 + ldr lr, [r12, lr, lsl #2] + + add r12, r12, lr + + beq 1f + // y overlap + sub lr, r9, #(2 >> \sy) // backup remaining h + mov r9, #(2 >> \sy) + +1: +.if \sy + add r7, r7, r7 // luma_stride *= 2 +.endif + sub r7, r7, #32 // luma_stride -= 32 + + bx r12 +endfunc +.endm + +fguv 420, 1, 1 +fguv 422, 1, 0 +fguv 444, 0, 0 + +function fguv_loop_sx0_neon +L(fguv_loop_sx0_tbl): + .word L(fguv_loop_sx0_csfl0_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl0_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl0_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl0_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl1_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl1_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl1_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl1_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + +.macro fguv_loop_sx0 csfl, ox, oy +L(fguv_loop_sx0_csfl\csfl()_\ox\oy): + sub r2, r2, #32 // src_stride -= 32 + sub r10, r10, #32 // grain_stride -= 32 +.if \oy + mov r12, lr +.endif +L(fguv_loop_sx0_csfl\csfl()_\ox\oy()_loopstart): +1: +.if \ox + vld1.16 {d0}, [r4], r10 // grain_lut old +.endif +.if \oy + vld1.16 {q2, q3}, [r8]! // grain_lut top +.endif +.if \ox && \oy + vld1.16 {d2}, [r11], r10 // grain_lut top old +.endif +.if !\ox && !\oy + vld1.16 {q0, q1}, [r6, :128]! // luma +.endif + vld1.16 {q8, q9}, [r5]! // grain_lut +.if \oy + vld1.16 {q4, q5}, [r8], r10 // grain_lut top +.endif +.if !\ox && !\oy + vld1.16 {q2, q3}, [r6, :128], r7 // luma +.endif +.if \oy + vdup.16 d28, d31[2] // overlap y coeff + vdup.16 d29, d31[3] // overlap y coeff +.endif + vld1.16 {q10, q11}, [r5], r10 // grain_lut + +.if \ox + vdup.16 q7, d30[3] // bitdepth_max + add r4, r4, #32 + vmull.s16 q0, d0, d24 + vshr.u16 q7, q7, #1 // grain_max + vmlal.s16 q0, d16, d25 + vmvn q6, q7 // grain_min +.endif + +.if \oy +.if \ox + add r11, r11, #32 + vmull.s16 q1, d2, d24 + vmlal.s16 q1, d4, d25 + vqrshrn.s32 d16, q0, #5 + vqrshrn.s32 d4, q1, #5 + vmin.s16 d4, d4, d14 + vmin.s16 d16, d16, d14 + vmax.s16 d4, d4, d12 + vmax.s16 d16, d16, d12 +.endif + + vmull.s16 q0, d4, d28 + vmull.s16 q1, d5, d28 + vmull.s16 q2, d6, d28 + vmull.s16 q3, d7, d28 +.if !\ox + vdup.16 q7, d30[3] // bitdepth_max +.endif + vmlal.s16 q0, d16, d29 + vmlal.s16 q1, d17, d29 + vmlal.s16 q2, d18, d29 + vmlal.s16 q3, d19, d29 +.if !\ox + vshr.u16 q7, q7, #1 // grain_max +.endif + vmull.s16 q8, d20, d29 + vmull.s16 q9, d21, d29 + vmull.s16 q10, d22, d29 + vmull.s16 q11, d23, d29 +.if !\ox + vmvn q6, q7 // grain_min +.endif + vmlal.s16 q8, d8, d28 + vmlal.s16 q9, d9, d28 + vmlal.s16 q10, d10, d28 + vmlal.s16 q11, d11, d28 + vqrshrn.s32 d0, q0, #5 + vqrshrn.s32 d1, q1, #5 + vqrshrn.s32 d2, q2, #5 + vqrshrn.s32 d3, q3, #5 + vqrshrn.s32 d4, q8, #5 + vqrshrn.s32 d5, q9, #5 + vqrshrn.s32 d6, q10, #5 + vqrshrn.s32 d7, q11, #5 + vmin.s16 q8, q0, q7 + vmin.s16 q9, q1, q7 + vld1.16 {q0, q1}, [r6, :128]! // luma + vmin.s16 q10, q2, q7 + vmin.s16 q11, q3, q7 + vmax.s16 q8, q8, q6 + vmax.s16 q9, q9, q6 + vld1.16 {q2, q3}, [r6, :128], r7 // luma + vmax.s16 q10, q10, q6 + vmax.s16 q11, q11, q6 +.elseif \ox + vqrshrn.s32 d16, q0, #5 + vld1.16 {q0, q1}, [r6, :128]! 
// luma + vmin.s16 d16, d16, d14 + vld1.16 {q2, q3}, [r6, :128], r7 // luma + vmax.s16 d16, d16, d12 +.endif + +.if !\csfl + vdup.16 d28, d30[0] // uv_luma_mult + vld1.16 {q4, q5}, [r1, :128]! // src + vdup.16 d29, d30[1] // uv_mult + vmull.s16 q6, d0, d28 + vmull.s16 q7, d1, d28 + vmull.s16 q0, d2, d28 + vmull.s16 q1, d3, d28 + vmlal.s16 q6, d8, d29 + vmlal.s16 q7, d9, d29 + vmlal.s16 q0, d10, d29 + vmlal.s16 q1, d11, d29 + vld1.16 {q4, q5}, [r1, :128] // src + sub r1, r1, #32 + vshrn.s32 d12, q6, #6 + vshrn.s32 d13, q7, #6 + vshrn.s32 d14, q0, #6 + vshrn.s32 d15, q1, #6 + vmull.s16 q0, d4, d28 + vmull.s16 q1, d5, d28 + vmull.s16 q2, d6, d28 + vmull.s16 q3, d7, d28 + vmlal.s16 q0, d8, d29 + vmlal.s16 q1, d9, d29 + vmlal.s16 q2, d10, d29 + vmlal.s16 q3, d11, d29 + vdup.16 q14, d30[2] // uv_offset + vshrn.s32 d0, q0, #6 + vshrn.s32 d1, q1, #6 + vshrn.s32 d2, q2, #6 + vshrn.s32 d3, q3, #6 + vdup.16 q4, d30[3] // bitdepth_max + vmov.i16 q5, #0 + vadd.i16 q6, q6, q14 + vadd.i16 q7, q7, q14 + vadd.i16 q2, q0, q14 + vadd.i16 q3, q1, q14 + vmin.s16 q0, q6, q4 + vmin.s16 q1, q7, q4 + vmin.s16 q2, q2, q4 + vmin.s16 q3, q3, q4 + vmax.s16 q0, q0, q5 + vmax.s16 q1, q1, q5 + vmax.s16 q2, q2, q5 + vmax.s16 q3, q3, q5 +.else + vdup.16 q14, d30[3] // bitdepth_max + // Make sure that uninitialized pixels out of range past the right + // edge are in range; their actual values shouldn't matter. + vand q0, q0, q14 + vand q1, q1, q14 + vand q2, q2, q14 + vand q3, q3, q14 +.endif + + bl gather32_neon + + vld1.16 {q0, q1}, [r1, :128]! // src + + vmovl.u8 q6, d8 // scaling + vmovl.u8 q7, d9 + vmovl.u8 q4, d10 + vmovl.u8 q5, d11 + + vld1.16 {q2, q3}, [r1, :128], r2 // src + + vshl.u16 q6, q6, q13 // scaling << (15 - scaling_shift) + vshl.u16 q7, q7, q13 + vshl.u16 q4, q4, q13 + vshl.u16 q5, q5, q13 + + vqrdmulh.s16 q8, q8, q6 // round2((scaling << (15 - scaling_shift) * grain, 15) + vqrdmulh.s16 q9, q9, q7 + vqrdmulh.s16 q10, q10, q4 + vqrdmulh.s16 q11, q11, q5 + + + vdup.16 q4, d31[0] // clip_min + vdup.16 q5, d31[1] // clip_max + + vqadd.s16 q0, q0, q8 // *src + noise + vqadd.s16 q1, q1, q9 + vqadd.s16 q2, q2, q10 + vqadd.s16 q3, q3, q11 + +.if \oy + vmov.32 lr, d25[0] // 2 first 16 bit coeffs from overlap x +.endif + + vmax.s16 q0, q0, q4 + vmax.s16 q1, q1, q4 + vmax.s16 q2, q2, q4 + vmax.s16 q3, q3, q4 + vmin.s16 q0, q0, q5 + vmin.s16 q1, q1, q5 + vmin.s16 q2, q2, q5 + vmin.s16 q3, q3, q5 + + vst1.16 {q0, q1}, [r0, :128]! 
// dst + + subs r9, r9, #1 +.if \oy + vmov.32 d31[1], lr // new coeffs for overlap y +.endif + + vst1.16 {q2, q3}, [r0, :128], r2 // dst + bgt 1b + +.if \oy + cmp r12, #0 + mov r9, r12 // restore actual remaining h + bgt L(fguv_loop_sx0_csfl\csfl()_\ox()0_loopstart) +.endif + b 9f +.endm + fguv_loop_sx0 0, 0, 0 + fguv_loop_sx0 0, 0, 1 + fguv_loop_sx0 0, 1, 0 + fguv_loop_sx0 0, 1, 1 + fguv_loop_sx0 1, 0, 0 + fguv_loop_sx0 1, 0, 1 + fguv_loop_sx0 1, 1, 0 + fguv_loop_sx0 1, 1, 1 + +9: + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function fguv_loop_sx1_neon +L(fguv_loop_sx1_tbl): + .word L(fguv_loop_sx1_csfl0_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl0_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl0_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl0_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl1_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl1_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl1_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl1_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + +.macro fguv_loop_sx1 csfl, ox, oy +L(fguv_loop_sx1_csfl\csfl()_\ox\oy): +.if \oy + mov r12, lr +.endif +1: +.if \ox + vld1.16 {d0}, [r4], r10 // grain_lut old +.endif +.if \ox && \oy + vld1.16 {d2}, [r11], r10 // grain_lut top old +.endif +.if \oy + vld1.16 {q2, q3}, [r8], r10 // grain_lut top +.endif +.if !\ox && !\oy + vld1.16 {q0, q1}, [r6, :128]! // luma +.endif + vld1.16 {q8, q9}, [r5], r10 // grain_lut +.if \oy + vdup.16 d28, d31[2] // overlap y coeff + vdup.16 d29, d31[3] // overlap y coeff +.endif +.if !\ox && !\oy + vld1.16 {q2, q3}, [r6, :128], r7 // luma +.endif + +.if \ox + vdup.16 q7, d30[3] // bitdepth_max + vmull.s16 q0, d0, d24 + vshr.u16 q7, q7, #1 // grain_max + vmlal.s16 q0, d16, d25 + vmvn q6, q7 // grain_min +.endif + +.if \oy +.if \ox + vmull.s16 q1, d2, d24 + vmlal.s16 q1, d4, d25 + vqrshrn.s32 d16, q0, #5 + vqrshrn.s32 d4, q1, #5 + vmin.s16 d4, d4, d14 + vmin.s16 d16, d16, d14 + vmax.s16 d4, d4, d12 + vmax.s16 d16, d16, d12 +.endif + + vmull.s16 q0, d4, d28 + vmull.s16 q1, d5, d28 + vmull.s16 q2, d6, d28 + vmull.s16 q3, d7, d28 +.if !\ox + vdup.16 q7, d30[3] // bitdepth_max +.endif + vmlal.s16 q0, d16, d29 + vmlal.s16 q1, d17, d29 + vmlal.s16 q2, d18, d29 + vmlal.s16 q3, d19, d29 +.if !\ox + vshr.u16 q7, q7, #1 // grain_max +.endif + vqrshrn.s32 d16, q0, #5 + vqrshrn.s32 d17, q1, #5 + vqrshrn.s32 d18, q2, #5 + vqrshrn.s32 d19, q3, #5 +.if !\ox + vmvn q6, q7 // grain_min +.endif + vld1.16 {q0, q1}, [r6, :128]! // luma + vmin.s16 q8, q8, q7 + vmin.s16 q9, q9, q7 + vmax.s16 q8, q8, q6 + vmax.s16 q9, q9, q6 + vld1.16 {q2, q3}, [r6, :128], r7 // luma +.elseif \ox + vqrshrn.s32 d16, q0, #5 + vld1.16 {q0, q1}, [r6, :128]! 
// luma + vmin.s16 d16, d16, d14 + vld1.16 {q2, q3}, [r6, :128], r7 // luma + vmax.s16 d16, d16, d12 +.endif + + vpadd.i16 d0, d0, d1 + vpadd.i16 d1, d2, d3 + vpadd.i16 d2, d4, d5 + vpadd.i16 d3, d6, d7 + vrshr.u16 q0, q0, #1 + vrshr.u16 q1, q1, #1 +.if !\csfl + vdup.16 d28, d30[0] // uv_luma_mult + vld1.16 {q2, q3}, [r1, :128], r2 // src + vdup.16 d29, d30[1] // uv_mult + vmull.s16 q6, d0, d28 + vmull.s16 q7, d1, d28 + vmull.s16 q0, d2, d28 + vmull.s16 q1, d3, d28 + vmlal.s16 q6, d4, d29 + vmlal.s16 q7, d5, d29 + vmlal.s16 q0, d6, d29 + vmlal.s16 q1, d7, d29 + vshrn.s32 d12, q6, #6 + vshrn.s32 d13, q7, #6 + vshrn.s32 d14, q0, #6 + vshrn.s32 d15, q1, #6 + vdup.16 q14, d30[2] // uv_offset + vdup.16 q4, d30[3] // bitdepth_max + vmov.i16 q5, #0 + vadd.i16 q6, q6, q14 + vadd.i16 q7, q7, q14 + vmin.s16 q0, q6, q4 + vmin.s16 q1, q7, q4 + vmax.s16 q0, q0, q5 + vmax.s16 q1, q1, q5 +.else + vdup.16 q14, d30[3] // bitdepth_max + vld1.16 {q2, q3}, [r1, :128], r2 // src + + // Make sure that uninitialized pixels out of range past the right + // edge are in range; their actual values shouldn't matter. + vand q0, q0, q14 + vand q1, q1, q14 +.endif + + bl gather16_neon + + vmovl.u8 q6, d8 // scaling + vmovl.u8 q7, d9 + + vshl.u16 q6, q6, q13 // scaling << (15 - scaling_shift) + vshl.u16 q7, q7, q13 + + vqrdmulh.s16 q8, q8, q6 // round2((scaling << (15 - scaling_shift) * grain, 15) + vqrdmulh.s16 q9, q9, q7 + + + vdup.16 q4, d31[0] // clip_min + vdup.16 q5, d31[1] // clip_max + + vqadd.s16 q0, q2, q8 // *src + noise + vqadd.s16 q1, q3, q9 + +.if \oy + // Swap the two last coefficients of d31, place them first in d28 + vrev64.16 d28, d31 +.endif + + vmax.s16 q0, q0, q4 + vmax.s16 q1, q1, q4 + vmin.s16 q0, q0, q5 + vmin.s16 q1, q1, q5 + + subs r9, r9, #1 +.if \oy + // Take the first two 16 bit coefficients of d28 and place them at the + // end of d31 + vtrn.32 d31, d28 +.endif + + vst1.16 {q0, q1}, [r0, :128], r2 // dst + bgt 1b + +.if \oy + cmp r12, #0 + mov r9, r12 // restore actual remaining h + bgt L(fguv_loop_sx1_csfl\csfl()_\ox()0) +.endif + + b 9f +.endm + fguv_loop_sx1 0, 0, 0 + fguv_loop_sx1 0, 0, 1 + fguv_loop_sx1 0, 1, 0 + fguv_loop_sx1 0, 1, 1 + fguv_loop_sx1 1, 0, 0 + fguv_loop_sx1 1, 0, 1 + fguv_loop_sx1 1, 1, 0 + fguv_loop_sx1 1, 1, 1 + +9: + vpop {q4-q7} + pop {r4-r11,pc} +endfunc diff --git a/third_party/dav1d/src/arm/32/itx.S b/third_party/dav1d/src/arm/32/itx.S index a1aea4139baf5..ceea025e45ce9 100644 --- a/third_party/dav1d/src/arm/32/itx.S +++ b/third_party/dav1d/src/arm/32/itx.S @@ -134,9 +134,9 @@ endconst vmlsl.s16 \d1, \s3, \c1 .endm
-.macro vrshrn_8h d0, d1, s0, s1, shift
- vrshrn.i32 \d0, \s0, \shift
- vrshrn.i32 \d1, \s1, \shift
+.macro vqrshrn_8h d0, d1, s0, s1, shift
+ vqrshrn.s32 \d0, \s0, \shift
+ vqrshrn.s32 \d1, \s1, \shift
 .endm
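For readers skimming these itx.S hunks: the change is purely a swap from the plain rounding narrow (vrshrn.i32 and the vrshrn_8h helper) to the saturating variant (vqrshrn.s32 / vqrshrn_8h), so 32-bit intermediates that do not fit in 16 bits now clamp to the int16_t range instead of wrapping. A minimal C sketch of the two instruction semantics, purely illustrative and not dav1d code:

    #include <stdint.h>

    /* vrshrn.i32: round, shift right, narrow; keeps only the low 16 bits,
     * so an out-of-range intermediate wraps. */
    static inline int16_t rshrn_i32(int32_t v, int shift) {
        int64_t r = ((int64_t)v + (1 << (shift - 1))) >> shift;
        return (int16_t)(uint16_t)r;   /* low half only */
    }

    /* vqrshrn.s32: same rounding shift, but the result saturates to
     * [INT16_MIN, INT16_MAX] instead of wrapping. */
    static inline int16_t qrshrn_s32(int32_t v, int shift) {
        int64_t r = ((int64_t)v + (1 << (shift - 1))) >> shift;
        if (r > INT16_MAX) r = INT16_MAX;
        if (r < INT16_MIN) r = INT16_MIN;
        return (int16_t)r;
    }

With the #12 shifts used below, qrshrn_s32(sum, 12) mirrors what the vqrshrn.s32 ..., #12 instructions produce.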
 .macro scale_input c, r0, r1, r2 r3, r4, r5, r6, r7
@@ -418,11 +418,11 @@ endfunc
 vmull_vmlal q3, \r1, \r3, d0[3], d0[2]
 vmull_vmlsl q2, \r1, \r3, d0[2], d0[3]
 vmull_vmlal q1, \r0, \r2, d0[0], d0[0]
- vrshrn.i32 d6, q3, #12
- vrshrn.i32 d7, q2, #12
+ vqrshrn.s32 d6, q3, #12
+ vqrshrn.s32 d7, q2, #12
 vmull_vmlsl q2, \r0, \r2, d0[0], d0[0]
- vrshrn.i32 d2, q1, #12
- vrshrn.i32 d3, q2, #12
+ vqrshrn.s32 d2, q1, #12
+ vqrshrn.s32 d3, q2, #12
 vqadd.s16 \r0, d2, d6
 vqsub.s16 \r3, d2, d6
 vqadd.s16 \r1, d3, d7
@@ -433,11 +433,11 @@ endfunc
 vmull_vmlal_8h q6, q7, \r2, \r3, \r6, \r7, d0[3], d0[2]
 vmull_vmlsl_8h q4, q5, \r2, \r3, \r6, \r7, d0[2], d0[3]
 vmull_vmlal_8h q2, q3, \r0, \r1, \r4, \r5, d0[0], d0[0]
- vrshrn_8h d12, d13, q6, q7, #12
- vrshrn_8h d14, d15, q4, q5, #12
+ vqrshrn_8h d12, d13, q6, q7, #12
+ vqrshrn_8h d14, d15, q4, q5, #12
 vmull_vmlsl_8h q4, q5, \r0, \r1, \r4, \r5, d0[0], d0[0]
- vrshrn_8h d4, d5, q2, q3, #12
- vrshrn_8h d6, d7, q4, q5, #12
+ vqrshrn_8h d4, d5, q2, q3, #12
+ vqrshrn_8h d6, d7, q4, q5, #12
 vqadd.s16 \q0, q2, q6
 vqsub.s16 \q3, q2, q6
 vqadd.s16 \q1, q3, q7
@@ -478,10 +478,10 @@ endfunc
 vadd.s32 q3, q3, q10
 vsub.s32 q11, q11, q10
- vrshrn.i32 \o0, q2, #12
- vrshrn.i32 \o2, q1, #12
- vrshrn.i32 \o1, q3, #12
- vrshrn.i32 \o3, q11, #12
+ vqrshrn.s32 \o0, q2, #12
+ vqrshrn.s32 \o2, q1, #12
+ vqrshrn.s32 \o1, q3, #12
+ vqrshrn.s32 \o3, q11, #12
 .endm
 function inv_adst_4h_x4_neon, export=1
@@ -533,21 +533,21 @@ endfunc
 vsub.s32 q4, q4, q2 // out3
 vsub.s32 q5, q5, q3
- vrshrn.i32 d20, q10, #12
- vrshrn.i32 d21, q11, #12
+ vqrshrn.s32 d20, q10, #12
+ vqrshrn.s32 d21, q11, #12
- vrshrn.i32 \o0, q8, #12
- vrshrn.i32 \o1, q9, #12
+ vqrshrn.s32 \o0, q8, #12
+ vqrshrn.s32 \o1, q9, #12
 .ifc \o4, d18
 vmov q9, q10
 .endif
- vrshrn.i32 \o2, q6, #12
- vrshrn.i32 \o3, q7, #12
+ vqrshrn.s32 \o2, q6, #12
+ vqrshrn.s32 \o3, q7, #12
- vrshrn.i32 \o6, q4, #12
- vrshrn.i32 \o7, q5, #12
+ vqrshrn.s32 \o6, q4, #12
+ vqrshrn.s32 \o7, q5, #12
 .endm
function inv_adst_8h_x4_neon, export=1 @@ -702,11 +702,11 @@ def_fn_4x4 identity, flipadst vmull_vmlsl_8h q2, q3, \r2, \r3, \r14, \r15, d1[0], d1[1] // -> t4a vmull_vmlal_8h q4, q5, \r2, \r3, \r14, \r15, d1[1], d1[0] // -> t7a vmull_vmlsl_8h q6, q7, \r10, \r11, \r6, \r7, d1[2], d1[3] // -> t5a - vrshrn_8h \r2, \r3, q2, q3, #12 // t4a - vrshrn_8h \r14, \r15, q4, q5, #12 // t7a + vqrshrn_8h \r2, \r3, q2, q3, #12 // t4a + vqrshrn_8h \r14, \r15, q4, q5, #12 // t7a vmull_vmlal_8h q2, q3, \r10, \r11, \r6, \r7, d1[3], d1[2] // -> t6a - vrshrn_8h \r6, \r7, q6, q7, #12 // t5a - vrshrn_8h \r10, \r11, q2, q3, #12 // t6a + vqrshrn_8h \r6, \r7, q6, q7, #12 // t5a + vqrshrn_8h \r10, \r11, q2, q3, #12 // t6a
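Seen in scalar terms, each vmull_vmlal/vmull_vmlsl pair above forms one rotation step of the inverse transform: two 16-bit inputs are multiplied by 12-bit fixed-point cosine constants held in d0/d1 and accumulated in 32 bits, and the #12 narrow then removes that scaling again. A self-contained sketch with illustrative names:

    #include <stdint.h>

    /* a*c0 + b*c1 (vmull.s16 + vmlal.s16), then a saturating rounding
     * narrow by 12 (vqrshrn.s32 #12); vmull_vmlsl is the same with a
     * subtraction instead of the addition. */
    static int16_t rotate_step(int16_t a, int16_t b, int16_t c0, int16_t c1)
    {
        int32_t acc = (int32_t)a * c0 + (int32_t)b * c1;
        int32_t v = (acc + 2048) >> 12;
        return (int16_t)(v > INT16_MAX ? INT16_MAX : v < INT16_MIN ? INT16_MIN : v);
    }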
vqadd.s16 q2, \q1, \q3 // t4 vqsub.s16 \q1, \q1, \q3 // t5a @@ -715,8 +715,8 @@ def_fn_4x4 identity, flipadst
vmull_vmlsl_8h q4, q5, \r6, \r7, \r2, \r3, d0[0], d0[0] // -> t5 vmull_vmlal_8h q6, q7, \r6, \r7, \r2, \r3, d0[0], d0[0] // -> t6 - vrshrn_8h d8, d9, q4, q5, #12 // t5 - vrshrn_8h d10, d11, q6, q7, #12 // t6 + vqrshrn_8h d8, d9, q4, q5, #12 // t5 + vqrshrn_8h d10, d11, q6, q7, #12 // t6
vqsub.s16 \q7, \q0, q3 // out7 vqadd.s16 \q0, \q0, q3 // out0 @@ -735,11 +735,11 @@ def_fn_4x4 identity, flipadst vmull_vmlsl q1, \r1, \r7, d1[0], d1[1] // -> t4a vmull_vmlal q2, \r1, \r7, d1[1], d1[0] // -> t7a vmull_vmlsl q3, \r5, \r3, d1[2], d1[3] // -> t5a - vrshrn.i32 \r1, q1, #12 // t4a + vqrshrn.s32 \r1, q1, #12 // t4a vmull_vmlal q1, \r5, \r3, d1[3], d1[2] // -> t6a - vrshrn.i32 \r7, q2, #12 // t7a - vrshrn.i32 \r3, q3, #12 // t5a - vrshrn.i32 \r5, q1, #12 // taa + vqrshrn.s32 \r7, q2, #12 // t7a + vqrshrn.s32 \r3, q3, #12 // t5a + vqrshrn.s32 \r5, q1, #12 // taa
vqadd.s16 d2, \r1, \r3 // t4 vqsub.s16 \r1, \r1, \r3 // t5a @@ -748,8 +748,8 @@ def_fn_4x4 identity, flipadst
vmull_vmlsl q2, \r3, \r1, d0[0], d0[0] // -> t5 vmull_vmlal q3, \r3, \r1, d0[0], d0[0] // -> t6 - vrshrn.i32 d4, q2, #12 // t5 - vrshrn.i32 d5, q3, #12 // t6 + vqrshrn.s32 d4, q2, #12 // t5 + vqrshrn.s32 d5, q3, #12 // t6
vqsub.s16 \r7, \r0, d3 // out7 vqadd.s16 \r0, \r0, d3 // out0 @@ -783,19 +783,19 @@ endfunc vmull_vmlal_8h q2, q3, d30, d31, d16, d17, d0[0], d0[1] vmull_vmlsl_8h q4, q5, d30, d31, d16, d17, d0[1], d0[0] vmull_vmlal_8h q6, q7, d26, d27, d20, d21, d0[2], d0[3] - vrshrn_8h d16, d17, q2, q3, #12 // t0a - vrshrn_8h d30, d31, q4, q5, #12 // t1a + vqrshrn_8h d16, d17, q2, q3, #12 // t0a + vqrshrn_8h d30, d31, q4, q5, #12 // t1a vmull_vmlsl_8h q2, q3, d26, d27, d20, d21, d0[3], d0[2] vmull_vmlal_8h q4, q5, d22, d23, d24, d25, d1[0], d1[1] - vrshrn_8h d20, d21, q6, q7, #12 // t2a - vrshrn_8h d26, d27, q2, q3, #12 // t3a + vqrshrn_8h d20, d21, q6, q7, #12 // t2a + vqrshrn_8h d26, d27, q2, q3, #12 // t3a vmull_vmlsl_8h q6, q7, d22, d23, d24, d25, d1[1], d1[0] vmull_vmlal_8h q2, q3, d18, d19, d28, d29, d1[2], d1[3] - vrshrn_8h d24, d25, q4, q5, #12 // t4a - vrshrn_8h d22, d23, q6, q7, #12 // t5a + vqrshrn_8h d24, d25, q4, q5, #12 // t4a + vqrshrn_8h d22, d23, q6, q7, #12 // t5a vmull_vmlsl_8h q4, q5, d18, d19, d28, d29, d1[3], d1[2] - vrshrn_8h d28, d29, q2, q3, #12 // t6a - vrshrn_8h d18, d19, q4, q5, #12 // t7a + vqrshrn_8h d28, d29, q2, q3, #12 // t6a + vqrshrn_8h d18, d19, q4, q5, #12 // t7a
vqadd.s16 q2, q8, q12 // t0 vqsub.s16 q3, q8, q12 // t4 @@ -810,13 +810,13 @@ endfunc vmull_vmlsl_8h q12, q13, d6, d7, d10, d11, d2[2], d2[3] vmull_vmlsl_8h q14, q15, d22, d23, d14, d15, d2[3], d2[2]
- vrshrn_8h d6, d7, q8, q9, #12 // t4a - vrshrn_8h d10, d11, q12, q13, #12 // t5a + vqrshrn_8h d6, d7, q8, q9, #12 // t4a + vqrshrn_8h d10, d11, q12, q13, #12 // t5a
vmull_vmlal_8h q8, q9, d22, d23, d14, d15, d2[2], d2[3]
- vrshrn_8h d14, d15, q14, q15, #12 // t6a - vrshrn_8h d22, d23, q8, q9, #12 // t7a + vqrshrn_8h d14, d15, q14, q15, #12 // t6a + vqrshrn_8h d22, d23, q8, q9, #12 // t7a
vqadd.s16 \q0, q2, q6 // out0 vqsub.s16 q2, q2, q6 // t2 @@ -833,11 +833,11 @@ endfunc vmull_vmlal_8h q10, q11, d4, d5, d8, d9, d2[0], d2[0] // -> out3 (q11 or q12) vmull_vmlsl_8h q6, q7, d4, d5, d8, d9, d2[0], d2[0] // -> out4 (q12 or q11) vmull_vmlsl_8h q12, q13, d6, d7, d10, d11, d2[0], d2[0] // -> out5 (q13 or q10) - vrshrn_8h d4, d5, q10, q11, #12 // out3 + vqrshrn_8h d4, d5, q10, q11, #12 // out3 vmull_vmlal_8h q10, q11, d6, d7, d10, d11, d2[0], d2[0] // -> out2 (q10 or q13) - vrshrn_8h d6, d7, q12, q13, #12 // out5 - vrshrn_8h \r4, \r5, q10, q11, #12 // out2 (q10 or q13) - vrshrn_8h \r8, \r9, q6, q7, #12 // out4 (q12 or q11) + vqrshrn_8h d6, d7, q12, q13, #12 // out5 + vqrshrn_8h \r4, \r5, q10, q11, #12 // out2 (q10 or q13) + vqrshrn_8h \r8, \r9, q6, q7, #12 // out4 (q12 or q11)
vqneg.s16 \q3, q2 // out3 vqneg.s16 \q5, q3 // out5 @@ -850,19 +850,19 @@ endfunc vmull_vmlal q2, d23, d16, d0[0], d0[1] vmull_vmlsl q3, d23, d16, d0[1], d0[0] vmull_vmlal q4, d21, d18, d0[2], d0[3] - vrshrn.i32 d16, q2, #12 // t0a - vrshrn.i32 d23, q3, #12 // t1a + vqrshrn.s32 d16, q2, #12 // t0a + vqrshrn.s32 d23, q3, #12 // t1a vmull_vmlsl q5, d21, d18, d0[3], d0[2] vmull_vmlal q6, d19, d20, d1[0], d1[1] - vrshrn.i32 d18, q4, #12 // t2a - vrshrn.i32 d21, q5, #12 // t3a + vqrshrn.s32 d18, q4, #12 // t2a + vqrshrn.s32 d21, q5, #12 // t3a vmull_vmlsl q7, d19, d20, d1[1], d1[0] vmull_vmlal q2, d17, d22, d1[2], d1[3] - vrshrn.i32 d20, q6, #12 // t4a - vrshrn.i32 d19, q7, #12 // t5a + vqrshrn.s32 d20, q6, #12 // t4a + vqrshrn.s32 d19, q7, #12 // t5a vmull_vmlsl q3, d17, d22, d1[3], d1[2] - vrshrn.i32 d22, q2, #12 // t6a - vrshrn.i32 d17, q3, #12 // t7a + vqrshrn.s32 d22, q2, #12 // t6a + vqrshrn.s32 d17, q3, #12 // t7a
vqadd.s16 d4, d16, d20 // t0 vqsub.s16 d5, d16, d20 // t4 @@ -877,13 +877,13 @@ endfunc vmull_vmlsl q10, d5, d7, d2[2], d2[3] vmull_vmlsl q11, d19, d9, d2[3], d2[2]
- vrshrn.i32 d5, q8, #12 // t4a - vrshrn.i32 d7, q10, #12 // t5a + vqrshrn.s32 d5, q8, #12 // t4a + vqrshrn.s32 d7, q10, #12 // t5a
vmull_vmlal q8, d19, d9, d2[2], d2[3]
- vrshrn.i32 d9, q11, #12 // t6a - vrshrn.i32 d19, q8, #12 // t7a + vqrshrn.s32 d9, q11, #12 // t6a + vqrshrn.s32 d19, q8, #12 // t7a
vqadd.s16 \r0, d4, d8 // out0 vqsub.s16 d4, d4, d8 // t2 @@ -900,11 +900,11 @@ endfunc vmull_vmlal q9, d4, d6, d2[0], d2[0] // -> out3 (d19 or d20) vmull_vmlsl q4, d4, d6, d2[0], d2[0] // -> out4 (d20 or d19) vmull_vmlsl q10, d5, d7, d2[0], d2[0] // -> out5 (d21 or d18) - vrshrn.i32 d4, q9, #12 // out3 + vqrshrn.s32 d4, q9, #12 // out3 vmull_vmlal q9, d5, d7, d2[0], d2[0] // -> out2 (d18 or d21) - vrshrn.i32 d5, q10, #12 // out5 - vrshrn.i32 \r2, q9, #12 // out2 (d18 or d21) - vrshrn.i32 \r4, q4, #12 // out4 (d20 or d19) + vqrshrn.s32 d5, q10, #12 // out5 + vqrshrn.s32 \r2, q9, #12 // out2 (d18 or d21) + vqrshrn.s32 \r4, q4, #12 // out4 (d20 or d19)
vqneg.s16 \r3, d4 // out3 vqneg.s16 \r5, d5 // out5 @@ -1122,19 +1122,19 @@ function inv_dct_4h_x16_neon, export=1 vmull_vmlsl q2, d17, d31, d2[0], d2[1] // -> t8a vmull_vmlal q3, d17, d31, d2[1], d2[0] // -> t15a vmull_vmlsl q4, d25, d23, d2[2], d2[3] // -> t9a - vrshrn.i32 d17, q2, #12 // t8a - vrshrn.i32 d31, q3, #12 // t15a + vqrshrn.s32 d17, q2, #12 // t8a + vqrshrn.s32 d31, q3, #12 // t15a vmull_vmlal q2, d25, d23, d2[3], d2[2] // -> t14a vmull_vmlsl q3, d21, d27, d3[0], d3[1] // -> t10a - vrshrn.i32 d23, q4, #12 // t9a - vrshrn.i32 d25, q2, #12 // t14a + vqrshrn.s32 d23, q4, #12 // t9a + vqrshrn.s32 d25, q2, #12 // t14a vmull_vmlal q4, d21, d27, d3[1], d3[0] // -> t13a vmull_vmlsl q2, d29, d19, d3[2], d3[3] // -> t11a - vrshrn.i32 d21, q3, #12 // t10a - vrshrn.i32 d27, q4, #12 // t13a + vqrshrn.s32 d21, q3, #12 // t10a + vqrshrn.s32 d27, q4, #12 // t13a vmull_vmlal q3, d29, d19, d3[3], d3[2] // -> t12a - vrshrn.i32 d19, q2, #12 // t11a - vrshrn.i32 d29, q3, #12 // t12a + vqrshrn.s32 d19, q2, #12 // t11a + vqrshrn.s32 d29, q3, #12 // t12a
idct_4h_x8 d16, d18, d20, d22, d24, d26, d28, d30
@@ -1149,14 +1149,14 @@ function inv_dct_4h_x16_neon, export=1
vmull_vmlsl q3, d5, d4, d0[2], d0[3] // -> t9a vmull_vmlal q4, d5, d4, d0[3], d0[2] // -> t14a - vrshrn.i32 d21, q3, #12 // t9a - vrshrn.i32 d27, q4, #12 // t14a + vqrshrn.s32 d21, q3, #12 // t9a + vqrshrn.s32 d27, q4, #12 // t14a
vmull_vmlsl q3, d29, d23, d0[2], d0[3] // -> t13a vmull_vmlal q4, d29, d23, d0[3], d0[2] // -> t10a - vrshrn.i32 d29, q3, #12 // t13a + vqrshrn.s32 d29, q3, #12 // t13a vneg.s32 q4, q4 - vrshrn.i32 d23, q4, #12 // t10a + vqrshrn.s32 d23, q4, #12 // t10a
vqsub.s16 d4, d17, d19 // t11a vqadd.s16 d17, d17, d19 // t8a @@ -1171,11 +1171,11 @@ function inv_dct_4h_x16_neon, export=1 vmull_vmlal q4, d5, d4, d0[0], d0[0] // -> t12 vmull_vmlsl q2, d25, d21, d0[0], d0[0] // -> t10a
- vrshrn.i32 d6, q3, #12 // t11 - vrshrn.i32 d7, q4, #12 // t12 + vqrshrn.s32 d6, q3, #12 // t11 + vqrshrn.s32 d7, q4, #12 // t12 vmull_vmlal q4, d25, d21, d0[0], d0[0] // -> t13a - vrshrn.i32 d4, q2, #12 // t10a - vrshrn.i32 d5, q4, #12 // t13a + vqrshrn.s32 d4, q2, #12 // t10a + vqrshrn.s32 d5, q4, #12 // t13a
vqadd.s16 d8, d16, d31 // out0 vqsub.s16 d31, d16, d31 // out15 @@ -1208,35 +1208,35 @@ endfunc vmull_vmlal q2, d31, d16, d0[0], d0[1] // -> t0 vmull_vmlsl q3, d31, d16, d0[1], d0[0] // -> t1 vmull_vmlal q4, d29, d18, d0[2], d0[3] // -> t2 - vrshrn.i32 d16, q2, #12 // t0 - vrshrn.i32 d31, q3, #12 // t1 + vqrshrn.s32 d16, q2, #12 // t0 + vqrshrn.s32 d31, q3, #12 // t1 vmull_vmlsl q2, d29, d18, d0[3], d0[2] // -> t3 vmull_vmlal q3, d27, d20, d1[0], d1[1] // -> t4 - vrshrn.i32 d18, q4, #12 // t2 - vrshrn.i32 d29, q2, #12 // t3 + vqrshrn.s32 d18, q4, #12 // t2 + vqrshrn.s32 d29, q2, #12 // t3 vmull_vmlsl q4, d27, d20, d1[1], d1[0] // -> t5 vmull_vmlal q2, d25, d22, d1[2], d1[3] // -> t6 - vrshrn.i32 d20, q3, #12 // t4 - vrshrn.i32 d27, q4, #12 // t5 + vqrshrn.s32 d20, q3, #12 // t4 + vqrshrn.s32 d27, q4, #12 // t5 vmull_vmlsl q3, d25, d22, d1[3], d1[2] // -> t7 vmull_vmlal q4, d23, d24, d2[0], d2[1] // -> t8 - vrshrn.i32 d22, q2, #12 // t6 - vrshrn.i32 d25, q3, #12 // t7 + vqrshrn.s32 d22, q2, #12 // t6 + vqrshrn.s32 d25, q3, #12 // t7 vmull_vmlsl q2, d23, d24, d2[1], d2[0] // -> t9 vmull_vmlal q3, d21, d26, d2[2], d2[3] // -> t10 - vrshrn.i32 d23, q4, #12 // t8 - vrshrn.i32 d24, q2, #12 // t9 + vqrshrn.s32 d23, q4, #12 // t8 + vqrshrn.s32 d24, q2, #12 // t9 vmull_vmlsl q4, d21, d26, d2[3], d2[2] // -> t11 vmull_vmlal q2, d19, d28, d3[0], d3[1] // -> t12 - vrshrn.i32 d21, q3, #12 // t10 - vrshrn.i32 d26, q4, #12 // t11 + vqrshrn.s32 d21, q3, #12 // t10 + vqrshrn.s32 d26, q4, #12 // t11 vmull_vmlsl q3, d19, d28, d3[1], d3[0] // -> t13 vmull_vmlal q4, d17, d30, d3[2], d3[3] // -> t14 - vrshrn.i32 d19, q2, #12 // t12 - vrshrn.i32 d28, q3, #12 // t13 + vqrshrn.s32 d19, q2, #12 // t12 + vqrshrn.s32 d28, q3, #12 // t13 vmull_vmlsl q2, d17, d30, d3[3], d3[2] // -> t15 - vrshrn.i32 d17, q4, #12 // t14 - vrshrn.i32 d30, q2, #12 // t15 + vqrshrn.s32 d17, q4, #12 // t14 + vqrshrn.s32 d30, q2, #12 // t15
vld1.16 {q0}, [r12, :128]
@@ -1260,19 +1260,19 @@ endfunc vmull_vmlal q2, d2, d3, d1[1], d1[0] // -> t8 vmull_vmlsl q3, d2, d3, d1[0], d1[1] // -> t9 vmull_vmlal q4, d18, d29, d1[3], d1[2] // -> t10 - vrshrn.i32 d17, q2, #12 // t8 - vrshrn.i32 d30, q3, #12 // t9 + vqrshrn.s32 d17, q2, #12 // t8 + vqrshrn.s32 d30, q3, #12 // t9 vmull_vmlsl q2, d18, d29, d1[2], d1[3] // -> t11 vmull_vmlsl q3, d27, d20, d1[1], d1[0] // -> t12 - vrshrn.i32 d18, q4, #12 // t10 - vrshrn.i32 d29, q2, #12 // t11 + vqrshrn.s32 d18, q4, #12 // t10 + vqrshrn.s32 d29, q2, #12 // t11 vmull_vmlal q4, d27, d20, d1[0], d1[1] // -> t13 vmull_vmlsl q2, d25, d22, d1[3], d1[2] // -> t14 - vrshrn.i32 d27, q3, #12 // t12 - vrshrn.i32 d20, q4, #12 // t13 + vqrshrn.s32 d27, q3, #12 // t12 + vqrshrn.s32 d20, q4, #12 // t13 vmull_vmlal q3, d25, d22, d1[2], d1[3] // -> t15 - vrshrn.i32 d25, q2, #12 // t14 - vrshrn.i32 d22, q3, #12 // t15 + vqrshrn.s32 d25, q2, #12 // t14 + vqrshrn.s32 d22, q3, #12 // t15
vqsub.s16 d2, d16, d21 // t4 vqadd.s16 d16, d16, d21 // t0 @@ -1294,19 +1294,19 @@ endfunc vmull_vmlal q2, d2, d3, d0[3], d0[2] // -> t4a vmull_vmlsl q3, d2, d3, d0[2], d0[3] // -> t5a vmull_vmlsl q4, d24, d23, d0[3], d0[2] // -> t6a - vrshrn.i32 d22, q2, #12 // t4a - vrshrn.i32 d25, q3, #12 // t5a + vqrshrn.s32 d22, q2, #12 // t4a + vqrshrn.s32 d25, q3, #12 // t5a vmull_vmlal q2, d24, d23, d0[2], d0[3] // -> t7a vmull_vmlal q3, d17, d30, d0[3], d0[2] // -> t12 - vrshrn.i32 d24, q4, #12 // t6a - vrshrn.i32 d23, q2, #12 // t7a + vqrshrn.s32 d24, q4, #12 // t6a + vqrshrn.s32 d23, q2, #12 // t7a vmull_vmlsl q4, d17, d30, d0[2], d0[3] // -> t13 vmull_vmlsl q2, d29, d18, d0[3], d0[2] // -> t14 - vrshrn.i32 d17, q3, #12 // t12 + vqrshrn.s32 d17, q3, #12 // t12 vmull_vmlal q3, d29, d18, d0[2], d0[3] // -> t15 - vrshrn.i32 d29, q4, #12 // t13 - vrshrn.i32 d30, q2, #12 // t14 - vrshrn.i32 d18, q3, #12 // t15 + vqrshrn.s32 d29, q4, #12 // t13 + vqrshrn.s32 d30, q2, #12 // t14 + vqrshrn.s32 d18, q3, #12 // t15
vqsub.s16 d2, d16, d21 // t2a .ifc \o0, d16 @@ -1343,21 +1343,21 @@ endfunc vmull_vmlal q2, d2, d21, d0[0], d0[0] // -> out7 (d23 or d24) vmull_vmlal q3, d26, d3, d0[0], d0[0] // -> out5 (d21 or d26)
- vrshrn.i32 d24, q12, #12 // out8 - vrshrn.i32 d4, q2, #12 // out7 - vrshrn.i32 d5, q3, #12 // out5 + vqrshrn.s32 d24, q12, #12 // out8 + vqrshrn.s32 d4, q2, #12 // out7 + vqrshrn.s32 d5, q3, #12 // out5 vmull_vmlsl q4, d26, d3, d0[0], d0[0] // -> out10 (d26 or d21) vmull_vmlal q1, d22, d23, d0[0], d0[0] // -> out4 (d20 or d27) - vrshrn.i32 d26, q4, #12 // out10 + vqrshrn.s32 d26, q4, #12 // out10
vmull_vmlsl q4, d22, d23, d0[0], d0[0] // -> out11 (d27 or d20) vmull_vmlal q11, d27, d20, d0[0], d0[0] // -> out6 (d22 or d25) vmull_vmlsl q3, d27, d20, d0[0], d0[0] // -> out9 (d25 or d22)
- vrshrn.i32 \o4, q1, #12 // out4 - vrshrn.i32 d7, q3, #12 // out9 - vrshrn.i32 d6, q4, #12 // out11 - vrshrn.i32 \o6, q11, #12 // out6 + vqrshrn.s32 \o4, q1, #12 // out4 + vqrshrn.s32 d7, q3, #12 // out9 + vqrshrn.s32 d6, q4, #12 // out11 + vqrshrn.s32 \o6, q11, #12 // out6
.ifc \o8, d23 vmov \o8, d24 @@ -1927,35 +1927,35 @@ function inv_dct32_odd_4h_x16_neon, export=1 vmull_vmlsl q2, d16, d31, d0[0], d0[1] // -> t16a vmull_vmlal q3, d16, d31, d0[1], d0[0] // -> t31a vmull_vmlsl q4, d24, d23, d0[2], d0[3] // -> t17a - vrshrn.i32 d16, q2, #12 // t16a - vrshrn.i32 d31, q3, #12 // t31a + vqrshrn.s32 d16, q2, #12 // t16a + vqrshrn.s32 d31, q3, #12 // t31a vmull_vmlal q2, d24, d23, d0[3], d0[2] // -> t30a vmull_vmlsl q3, d20, d27, d1[0], d1[1] // -> t18a - vrshrn.i32 d24, q4, #12 // t17a - vrshrn.i32 d23, q2, #12 // t30a + vqrshrn.s32 d24, q4, #12 // t17a + vqrshrn.s32 d23, q2, #12 // t30a vmull_vmlal q4, d20, d27, d1[1], d1[0] // -> t29a vmull_vmlsl q2, d28, d19, d1[2], d1[3] // -> t19a - vrshrn.i32 d20, q3, #12 // t18a - vrshrn.i32 d27, q4, #12 // t29a + vqrshrn.s32 d20, q3, #12 // t18a + vqrshrn.s32 d27, q4, #12 // t29a vmull_vmlal q3, d28, d19, d1[3], d1[2] // -> t28a vmull_vmlsl q4, d18, d29, d2[0], d2[1] // -> t20a - vrshrn.i32 d28, q2, #12 // t19a - vrshrn.i32 d19, q3, #12 // t28a + vqrshrn.s32 d28, q2, #12 // t19a + vqrshrn.s32 d19, q3, #12 // t28a vmull_vmlal q2, d18, d29, d2[1], d2[0] // -> t27a vmull_vmlsl q3, d26, d21, d2[2], d2[3] // -> t21a - vrshrn.i32 d18, q4, #12 // t20a - vrshrn.i32 d29, q2, #12 // t27a + vqrshrn.s32 d18, q4, #12 // t20a + vqrshrn.s32 d29, q2, #12 // t27a vmull_vmlal q4, d26, d21, d2[3], d2[2] // -> t26a vmull_vmlsl q2, d22, d25, d3[0], d3[1] // -> t22a - vrshrn.i32 d26, q3, #12 // t21a - vrshrn.i32 d21, q4, #12 // t26a + vqrshrn.s32 d26, q3, #12 // t21a + vqrshrn.s32 d21, q4, #12 // t26a vmull_vmlal q3, d22, d25, d3[1], d3[0] // -> t25a vmull_vmlsl q4, d30, d17, d3[2], d3[3] // -> t23a - vrshrn.i32 d22, q2, #12 // t22a - vrshrn.i32 d25, q3, #12 // t25a + vqrshrn.s32 d22, q2, #12 // t22a + vqrshrn.s32 d25, q3, #12 // t25a vmull_vmlal q2, d30, d17, d3[3], d3[2] // -> t24a - vrshrn.i32 d30, q4, #12 // t23a - vrshrn.i32 d17, q2, #12 // t24a + vqrshrn.s32 d30, q4, #12 // t23a + vqrshrn.s32 d17, q2, #12 // t24a
vld1.16 {q0}, [r12, :128]
@@ -1979,21 +1979,21 @@ function inv_dct32_odd_4h_x16_neon, export=1 vmull_vmlsl q2, d3, d2, d1[0], d1[1] // -> t17a vmull_vmlal q3, d3, d2, d1[1], d1[0] // -> t30a vmull_vmlal q4, d19, d24, d1[1], d1[0] // -> t18a - vrshrn.i32 d21, q2, #12 // t17a - vrshrn.i32 d27, q3, #12 // t30a + vqrshrn.s32 d21, q2, #12 // t17a + vqrshrn.s32 d27, q3, #12 // t30a vneg.s32 q4, q4 // -> t18a vmull_vmlsl q1, d19, d24, d1[0], d1[1] // -> t29a vmull_vmlsl q2, d22, d18, d1[2], d1[3] // -> t21a - vrshrn.i32 d19, q4, #12 // t18a - vrshrn.i32 d24, q1, #12 // t29a + vqrshrn.s32 d19, q4, #12 // t18a + vqrshrn.s32 d24, q1, #12 // t29a vmull_vmlal q3, d22, d18, d1[3], d1[2] // -> t26a vmull_vmlal q4, d17, d20, d1[3], d1[2] // -> t22a - vrshrn.i32 d22, q2, #12 // t21a - vrshrn.i32 d18, q3, #12 // t26a + vqrshrn.s32 d22, q2, #12 // t21a + vqrshrn.s32 d18, q3, #12 // t26a vneg.s32 q4, q4 // -> t22a vmull_vmlsl q1, d17, d20, d1[2], d1[3] // -> t25a - vrshrn.i32 d17, q4, #12 // t22a - vrshrn.i32 d20, q1, #12 // t25a + vqrshrn.s32 d17, q4, #12 // t22a + vqrshrn.s32 d20, q1, #12 // t25a
vqsub.s16 d2, d27, d24 // t29 vqadd.s16 d27, d27, d24 // t30 @@ -2015,21 +2015,21 @@ function inv_dct32_odd_4h_x16_neon, export=1 vmull_vmlsl q2, d2, d3, d0[2], d0[3] // -> t18a vmull_vmlal q3, d2, d3, d0[3], d0[2] // -> t29a vmull_vmlsl q4, d29, d24, d0[2], d0[3] // -> t19 - vrshrn.i32 d18, q2, #12 // t18a - vrshrn.i32 d25, q3, #12 // t29a + vqrshrn.s32 d18, q2, #12 // t18a + vqrshrn.s32 d25, q3, #12 // t29a vmull_vmlal q1, d29, d24, d0[3], d0[2] // -> t28 vmull_vmlal q2, d26, d19, d0[3], d0[2] // -> t20 - vrshrn.i32 d29, q4, #12 // t19 - vrshrn.i32 d24, q1, #12 // t28 + vqrshrn.s32 d29, q4, #12 // t19 + vqrshrn.s32 d24, q1, #12 // t28 vneg.s32 q2, q2 // -> t20 vmull_vmlsl q3, d26, d19, d0[2], d0[3] // -> t27 vmull_vmlal q4, d20, d28, d0[3], d0[2] // -> t21a - vrshrn.i32 d26, q2, #12 // t20 - vrshrn.i32 d19, q3, #12 // t27 + vqrshrn.s32 d26, q2, #12 // t20 + vqrshrn.s32 d19, q3, #12 // t27 vneg.s32 q4, q4 // -> t21a vmull_vmlsl q1, d20, d28, d0[2], d0[3] // -> t26a - vrshrn.i32 d20, q4, #12 // t21a - vrshrn.i32 d28, q1, #12 // t26a + vqrshrn.s32 d20, q4, #12 // t21a + vqrshrn.s32 d28, q1, #12 // t26a
vqsub.s16 d2, d16, d30 // t23 vqadd.s16 d16, d16, d30 // t16 = out16 @@ -2051,24 +2051,24 @@ function inv_dct32_odd_4h_x16_neon, export=1
vmull_vmlsl q2, d24, d26, d0[0], d0[0] // -> t20 vmull_vmlal q3, d24, d26, d0[0], d0[0] // -> t27 - vrshrn.i32 d20, q2, #12 // t20 - vrshrn.i32 d22, q3, #12 // t27 + vqrshrn.s32 d20, q2, #12 // t20 + vqrshrn.s32 d22, q3, #12 // t27
vmull_vmlal q2, d25, d27, d0[0], d0[0] // -> t26a vmull_vmlsl q3, d25, d27, d0[0], d0[0] // -> t21a vmov d27, d22 // t27 - vrshrn.i32 d26, q2, #12 // t26a + vqrshrn.s32 d26, q2, #12 // t26a
vmull_vmlsl q12, d21, d23, d0[0], d0[0] // -> t22 vmull_vmlal q2, d21, d23, d0[0], d0[0] // -> t25 - vrshrn.i32 d21, q3, #12 // t21a - vrshrn.i32 d22, q12, #12 // t22 - vrshrn.i32 d25, q2, #12 // t25 + vqrshrn.s32 d21, q3, #12 // t21a + vqrshrn.s32 d22, q12, #12 // t22 + vqrshrn.s32 d25, q2, #12 // t25
vmull_vmlsl q2, d3, d2, d0[0], d0[0] // -> t23a vmull_vmlal q3, d3, d2, d0[0], d0[0] // -> t24a - vrshrn.i32 d23, q2, #12 // t23a - vrshrn.i32 d24, q3, #12 // t24a + vqrshrn.s32 d23, q2, #12 // t23a + vqrshrn.s32 d24, q3, #12 // t24a
bx lr endfunc @@ -2679,11 +2679,11 @@ function inv_dct64_step1_neon vmull_vmlsl q3, d29, d26, d2[1], d2[0] // -> t61a vneg.s32 q2, q2 // t34a vmull_vmlsl q4, d30, d25, d2[1], d2[0] // -> t33a - vrshrn.i32 d26, q2, #12 // t34a + vqrshrn.s32 d26, q2, #12 // t34a vmull_vmlal q2, d30, d25, d2[0], d2[1] // -> t62a - vrshrn.i32 d29, q3, #12 // t61a - vrshrn.i32 d25, q4, #12 // t33a - vrshrn.i32 d30, q2, #12 // t62a + vqrshrn.s32 d29, q3, #12 // t61a + vqrshrn.s32 d25, q4, #12 // t33a + vqrshrn.s32 d30, q2, #12 // t62a
vqadd.s16 d16, d24, d27 // t32a vqsub.s16 d19, d24, d27 // t35a @@ -2697,11 +2697,11 @@ function inv_dct64_step1_neon vmull_vmlal q2, d21, d18, d2[2], d2[3] // -> t61a vmull_vmlsl q3, d21, d18, d2[3], d2[2] // -> t34a vmull_vmlal q4, d20, d19, d2[2], d2[3] // -> t60 - vrshrn.i32 d21, q2, #12 // t61a - vrshrn.i32 d18, q3, #12 // t34a + vqrshrn.s32 d21, q2, #12 // t61a + vqrshrn.s32 d18, q3, #12 // t34a vmull_vmlsl q2, d20, d19, d2[3], d2[2] // -> t35 - vrshrn.i32 d20, q4, #12 // t60 - vrshrn.i32 d19, q2, #12 // t35 + vqrshrn.s32 d20, q4, #12 // t60 + vqrshrn.s32 d19, q2, #12 // t35
vst1.16 {d16, d17, d18, d19}, [r6, :128]! vst1.16 {d20, d21, d22, d23}, [r6, :128]! @@ -2738,12 +2738,12 @@ function inv_dct64_step2_neon vmull_vmlal q2, d27, d25, d0[3], d0[2] // -> t56a vmull_vmlsl q3, d27, d25, d0[2], d0[3] // -> t39a vmull_vmlal q4, d31, d28, d0[3], d0[2] // -> t40a - vrshrn.i32 d25, q2, #12 // t56a - vrshrn.i32 d27, q3, #12 // t39a + vqrshrn.s32 d25, q2, #12 // t56a + vqrshrn.s32 d27, q3, #12 // t39a vneg.s32 q4, q4 // t40a vmull_vmlsl q2, d31, d28, d0[2], d0[3] // -> t55a - vrshrn.i32 d31, q4, #12 // t40a - vrshrn.i32 d28, q2, #12 // t55a + vqrshrn.s32 d31, q4, #12 // t40a + vqrshrn.s32 d28, q2, #12 // t55a
vqadd.s16 d16, d24, d29 // t32a vqsub.s16 d19, d24, d29 // t47a @@ -2757,11 +2757,11 @@ function inv_dct64_step2_neon vmull_vmlsl q2, d21, d18, d0[0], d0[0] // -> t40a vmull_vmlal q3, d21, d18, d0[0], d0[0] // -> t55a vmull_vmlsl q4, d20, d19, d0[0], d0[0] // -> t47 - vrshrn.i32 d18, q2, #12 // t40a - vrshrn.i32 d21, q3, #12 // t55a + vqrshrn.s32 d18, q2, #12 // t40a + vqrshrn.s32 d21, q3, #12 // t55a vmull_vmlal q2, d20, d19, d0[0], d0[0] // -> t48 - vrshrn.i32 d19, q4, #12 // t47 - vrshrn.i32 d20, q2, #12 // t48 + vqrshrn.s32 d19, q4, #12 // t47 + vqrshrn.s32 d20, q2, #12 // t48
vstr d16, [r6, #2*4*0] // t32a vstr d17, [r9, #2*4*0] // t39 diff --git a/third_party/dav1d/src/arm/64/filmgrain.S b/third_party/dav1d/src/arm/64/filmgrain.S new file mode 100644 index 0000000000000..6cdd7ec5fa399 --- /dev/null +++ b/third_party/dav1d/src/arm/64/filmgrain.S @@ -0,0 +1,2010 @@ +/* + * Copyright © 2021, VideoLAN and dav1d authors + * Copyright © 2021, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" +#include "src/arm/asm-offsets.h" + +#define GRAIN_WIDTH 82 +#define GRAIN_HEIGHT 73 + +#define SUB_GRAIN_WIDTH 44 +#define SUB_GRAIN_HEIGHT 38 + +.macro increment_seed steps, shift=1 + lsr w11, w2, #3 + lsr w12, w2, #12 + lsr w13, w2, #1 + eor w11, w2, w11 // (r >> 0) ^ (r >> 3) + eor w12, w12, w13 // (r >> 12) ^ (r >> 1) + eor w11, w11, w12 // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1) +.if \shift + lsr w2, w2, #\steps +.endif + and w11, w11, #((1 << \steps) - 1) // bit +.if \shift + orr w2, w2, w11, lsl #(16 - \steps) // *state +.else + orr w2, w2, w11, lsl #16 // *state +.endif +.endm + +.macro read_rand dest, bits, age + ubfx \dest, x2, #16 - \bits - \age, #\bits +.endm + +.macro read_shift_rand dest, bits + ubfx \dest, x2, #17 - \bits, #\bits + lsr w2, w2, #1 +.endm + +// special calling convention: +// w2 holds seed +// x3 holds dav1d_gaussian_sequence +// clobbers x11-x15 +// returns in v0.8h +function get_gaussian_neon + increment_seed 4 + read_rand x14, 11, 3 + read_rand x15, 11, 2 + add x14, x3, x14, lsl #1 + add x15, x3, x15, lsl #1 + ld1 {v0.h}[0], [x14] + read_rand x14, 11, 1 + ld1 {v0.h}[1], [x15] + add x14, x3, x14, lsl #1 + read_rand x15, 11, 0 + increment_seed 4 + add x15, x3, x15, lsl #1 + ld1 {v0.h}[2], [x14] + read_rand x14, 11, 3 + ld1 {v0.h}[3], [x15] + add x14, x3, x14, lsl #1 + read_rand x15, 11, 2 + ld1 {v0.h}[4], [x14] + add x15, x3, x15, lsl #1 + read_rand x14, 11, 1 + ld1 {v0.h}[5], [x15] + read_rand x15, 11, 0 + add x14, x3, x14, lsl #1 + add x15, x3, x15, lsl #1 + ld1 {v0.h}[6], [x14] + ld1 {v0.h}[7], [x15] + ret +endfunc + +.macro get_grain_row r0, r1, r2, r3, r4, r5 + bl get_gaussian_neon + srshl \r5().8h, v0.8h, v31.8h + xtn \r0().8b, \r5().8h + bl get_gaussian_neon + srshl \r5().8h, v0.8h, v31.8h + xtn2 
\r0().16b, \r5().8h + bl get_gaussian_neon + srshl \r5().8h, v0.8h, v31.8h + xtn \r1().8b, \r5().8h + bl get_gaussian_neon + srshl \r5().8h, v0.8h, v31.8h + xtn2 \r1().16b, \r5().8h + bl get_gaussian_neon + srshl \r5().8h, v0.8h, v31.8h + xtn \r2().8b, \r5().8h + bl get_gaussian_neon + srshl \r5().8h, v0.8h, v31.8h + xtn2 \r2().16b, \r5().8h + bl get_gaussian_neon + srshl \r5().8h, v0.8h, v31.8h + xtn \r3().8b, \r5().8h + bl get_gaussian_neon + srshl \r5().8h, v0.8h, v31.8h + xtn2 \r3().16b, \r5().8h + bl get_gaussian_neon + srshl \r5().8h, v0.8h, v31.8h + xtn \r4().8b, \r5().8h + bl get_gaussian_neon + srshl \r5().8h, v0.8h, v31.8h + xtn2 \r4().16b, \r5().8h + increment_seed 2 + read_rand x14, 11, 1 + read_rand x15, 11, 0 + add x14, x3, x14, lsl #1 + add x15, x3, x15, lsl #1 + ld1 {\r5().h}[0], [x14] + ld1 {\r5().h}[1], [x15] + srshl v0.4h, \r5().4h, v31.4h + xtn \r5().8b, v0.8h +.endm + +.macro store_grain_row r0, r1, r2, r3, r4, r5 + st1 {\r0().16b,\r1().16b}, [x0], #32 + st1 {\r2().16b,\r3().16b}, [x0], #32 + st1 {\r4().16b}, [x0], #16 + st1 {\r5().h}[0], [x0], #2 +.endm + +.macro get_grain_row_44 r0, r1, r2 + bl get_gaussian_neon + srshl \r2().8h, v0.8h, v31.8h + xtn \r0().8b, \r2().8h + bl get_gaussian_neon + srshl \r2().8h, v0.8h, v31.8h + xtn2 \r0().16b, \r2().8h + bl get_gaussian_neon + srshl \r2().8h, v0.8h, v31.8h + xtn \r1().8b, \r2().8h + bl get_gaussian_neon + srshl \r2().8h, v0.8h, v31.8h + xtn2 \r1().16b, \r2().8h + bl get_gaussian_neon + srshl \r2().8h, v0.8h, v31.8h + xtn \r2().8b, \r2().8h + + increment_seed 4 + read_rand x14, 11, 3 + read_rand x15, 11, 2 + add x14, x3, x14, lsl #1 + add x15, x3, x15, lsl #1 + ld1 {v0.h}[0], [x14] + read_rand x14, 11, 1 + ld1 {v0.h}[1], [x15] + read_rand x15, 11, 0 + add x14, x3, x14, lsl #1 + add x15, x3, x15, lsl #1 + ld1 {v0.h}[2], [x14] + ld1 {v0.h}[3], [x15] + srshl v0.4h, v0.4h, v31.4h + xtn2 \r2().16b, v0.8h +.endm + +.macro store_grain_row_44 r0, r1, r2 + st1 {\r0().16b,\r1().16b}, [x0], #32 + st1 {\r2().16b}, [x0] + add x0, x0, #GRAIN_WIDTH-32 +.endm + +function get_grain_2_neon + increment_seed 2 + read_rand x14, 11, 1 + read_rand x15, 11, 0 + add x14, x3, x14, lsl #1 + add x15, x3, x15, lsl #1 + ld1 {v0.h}[0], [x14] + ld1 {v0.h}[1], [x15] + srshl v0.4h, v0.4h, v31.4h + xtn v0.8b, v0.8h + ret +endfunc + +.macro get_grain_2 dst + bl get_grain_2_neon +.ifnc \dst, v0 + mov \dst().8b, v0.8b +.endif +.endm + +// w15 holds the number of entries to produce +// w14, w16 and w17 hold the previous output entries +// v0 holds the vector of produced entries +// v1 holds the input vector of sums from above +.macro output_lag n +function output_lag\n()_neon +1: + read_shift_rand x13, 11 + mov w11, v1.s[0] + ldrsh w12, [x3, x13, lsl #1] + ext v0.16b, v0.16b, v0.16b, #1 +.if \n == 1 + madd w11, w14, w4, w11 // sum (above) + *coeff * prev output +.elseif \n == 2 + madd w11, w16, w4, w11 // sum (above) + *coeff * prev output 1 + madd w11, w14, w17, w11 // += *coeff * prev output 2 + mov w16, w14 +.else + madd w11, w17, w4, w11 // sum (above) + *coeff * prev output 1 + madd w11, w16, w20, w11 // sum (above) + *coeff * prev output 2 + madd w11, w14, w21, w11 // += *coeff * prev output 3 + mov w17, w16 + mov w16, w14 +.endif + add w14, w11, w8 // 1 << (ar_coeff_shift - 1) + add w12, w12, w10 // 1 << (4 + grain_scale_shift - 1) + asr w14, w14, w7 // >> ar_coeff_shift + asr w12, w12, w9 // >> (4 + grain_scale_shift) + add w14, w14, w12 + cmp w14, w5 + csel w14, w14, w5, le + cmp w14, w6 + csel w14, w14, w6, ge + subs w15, w15, #1 + ext v1.16b, 
v1.16b, v1.16b, #4 + ins v0.b[15], w14 + b.gt 1b + ret +endfunc +.endm + +output_lag 1 +output_lag 2 +output_lag 3 + + +function sum_lag1_above_neon + smull v2.8h, v3.8b, v28.8b + smull2 v3.8h, v3.16b, v28.16b + smull v4.8h, v0.8b, v27.8b + smull2 v5.8h, v0.16b, v27.16b + smull v6.8h, v1.8b, v29.8b + smull2 v7.8h, v1.16b, v29.16b + saddl v0.4s, v2.4h, v4.4h + saddl2 v1.4s, v2.8h, v4.8h + saddl v2.4s, v3.4h, v5.4h + saddl2 v3.4s, v3.8h, v5.8h + saddw v4.4s, v0.4s, v6.4h + saddw2 v5.4s, v1.4s, v6.8h + saddw v6.4s, v2.4s, v7.4h + saddw2 v7.4s, v3.4s, v7.8h + ret +endfunc + +.macro sum_lag_n_body lag, type, uv_layout, edge, elems, store, uv_coeff + bl sum_\lag()_above_neon +.ifc \type, uv_420 + add x12, x19, #GRAIN_WIDTH + ld1 {v22.16b, v23.16b}, [x19], #32 + ld1 {v24.16b, v25.16b}, [x12] + saddlp v22.8h, v22.16b + saddlp v23.8h, v23.16b + saddlp v24.8h, v24.16b + saddlp v25.8h, v25.16b + add v22.8h, v22.8h, v24.8h + add v23.8h, v23.8h, v25.8h + rshrn v0.8b, v22.8h, #2 + rshrn2 v0.16b, v23.8h, #2 +.endif +.ifc \type, uv_422 + ld1 {v22.16b, v23.16b}, [x19], #32 + saddlp v22.8h, v22.16b + saddlp v23.8h, v23.16b + rshrn v0.8b, v22.8h, #1 + rshrn2 v0.16b, v23.8h, #1 +.endif +.ifc \type, uv_444 + ld1 {v0.16b}, [x19], #16 +.endif +.if \uv_layout +.ifnb \uv_coeff + dup v1.16b, \uv_coeff + smull v2.8h, v0.8b, v1.8b + smull2 v3.8h, v0.16b, v1.16b +.else + smull v2.8h, v0.8b, v30.8b + smull2 v3.8h, v0.16b, v30.16b +.endif + saddw v4.4s, v4.4s, v2.4h + saddw2 v5.4s, v5.4s, v2.8h + saddw v6.4s, v6.4s, v3.4h + saddw2 v7.4s, v7.4s, v3.8h +.endif +.if \uv_layout && \elems == 16 + b sum_\lag()_y_\edge()_start +.elseif \uv_layout == 444 && \elems == 15 + b sum_\lag()_y_\edge()_start +.elseif \uv_layout == 422 && \elems == 9 + b sum_\lag()_uv_420_\edge()_start +.else +sum_\lag()_\type()_\edge()_start: +.ifc \edge, left + increment_seed 4 + read_rand x12, 11, 3 + read_rand x13, 11, 2 + read_rand x14, 11, 1 + add x12, x3, x12, lsl #1 + add x13, x3, x13, lsl #1 + add x14, x3, x14, lsl #1 + ld1 {v0.h}[5], [x12] + ld1 {v0.h}[6], [x13] + ld1 {v0.h}[7], [x14] + lsl x2, x2, #1 // shift back the state as if we'd done increment_seed with shift=0 + srshl v0.8h, v0.8h, v31.8h + xtn2 v0.16b, v0.8h + ext v4.16b, v4.16b, v4.16b, #12 +.ifc \lag, lag3 + smov w17, v0.b[13] +.endif +.ifnc \lag, lag1 + smov w16, v0.b[14] +.endif + smov w14, v0.b[15] + + mov v1.16b, v4.16b + mov w15, #1 + bl output_\lag()_neon +.else + increment_seed 4, shift=0 + mov v1.16b, v4.16b + mov w15, #4 + bl output_\lag()_neon +.endif + + increment_seed 4, shift=0 + mov v1.16b, v5.16b + mov w15, #4 + bl output_\lag()_neon + + increment_seed 4, shift=0 + mov v1.16b, v6.16b +.if \elems == 9 + mov w15, #1 + bl output_\lag()_neon + lsr w2, w2, #3 + + read_rand x12, 11, 2 + read_rand x13, 11, 1 + read_rand x14, 11, 0 + add x12, x3, x12, lsl #1 + add x13, x3, x13, lsl #1 + add x14, x3, x14, lsl #1 + ld1 {v1.h}[0], [x12] + ld1 {v1.h}[1], [x13] + ld1 {v1.h}[2], [x14] + srshl v1.4h, v1.4h, v31.4h + xtn v1.8b, v1.8h + ext v0.16b, v0.16b, v1.16b, #7 +.else + mov w15, #4 + bl output_\lag()_neon + + increment_seed 4, shift=0 + mov v1.16b, v7.16b + +.ifc \edge, right + mov w15, #3 + bl output_\lag()_neon + read_shift_rand x15, 11 + add x15, x3, x15, lsl #1 + ld1 {v1.h}[0], [x15] + srshl v1.4h, v1.4h, v31.4h + ext v0.16b, v0.16b, v1.16b, #1 +.else + mov w15, #4 + bl output_\lag()_neon +.endif +.endif +.if \store + st1 {v0.16b}, [x0], #16 +.endif + ldr x30, [sp], #16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.endif +.endm + +.macro sum_lag1_func type, uv_layout, edge, 
elems=16 +function sum_\type()_lag1_\edge()_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! + sum_lag_n_body lag1, \type, \uv_layout, \edge, \elems, store=0 +endfunc +.endm + +sum_lag1_func y, 0, left +sum_lag1_func y, 0, mid +sum_lag1_func y, 0, right, 15 +sum_lag1_func uv_444, 444, left +sum_lag1_func uv_444, 444, mid +sum_lag1_func uv_444, 444, right, 15 +sum_lag1_func uv_422, 422, left +sum_lag1_func uv_422, 422, mid +sum_lag1_func uv_422, 422, right, 9 +sum_lag1_func uv_420, 420, left +sum_lag1_func uv_420, 420, mid +sum_lag1_func uv_420, 420, right, 9 + +.macro sum_lag1 type, dst, left, mid, right, edge=mid + mov v3.16b, \mid().16b + ext v0.16b, \left().16b, \mid().16b, #15 + ext v1.16b, \mid().16b, \right().16b, #1 + bl sum_\type()_lag1_\edge()_neon + mov \dst().16b, v0.16b +.endm + +.macro sum_y_lag1 dst, left, mid, right, edge=mid + sum_lag1 y, \dst, \left, \mid, \right, \edge +.endm + +.macro sum_uv_444_lag1 dst, left, mid, right, edge=mid + sum_lag1 uv_444, \dst, \left, \mid, \right, \edge +.endm + +.macro sum_uv_422_lag1 dst, left, mid, right, edge=mid + sum_lag1 uv_422, \dst, \left, \mid, \right, \edge +.endm + +.macro sum_uv_420_lag1 dst, left, mid, right, edge=mid + sum_lag1 uv_420, \dst, \left, \mid, \right, \edge +.endm + + +function sum_lag2_above_neon + sub x12, x0, #2*GRAIN_WIDTH - 16 + sub x13, x0, #1*GRAIN_WIDTH - 16 + ld1 {v18.16b}, [x12] // load top right + ld1 {v21.16b}, [x13] + + ext v22.16b, v16.16b, v17.16b, #14 // top left, top mid + dup v26.16b, v30.b[0] + ext v23.16b, v16.16b, v17.16b, #15 + dup v27.16b, v30.b[1] + ext v0.16b, v17.16b, v18.16b, #1 // top mid, top right + dup v28.16b, v30.b[3] + ext v1.16b, v17.16b, v18.16b, #2 + dup v29.16b, v30.b[4] + + smull v2.8h, v22.8b, v26.8b + smull2 v3.8h, v22.16b, v26.16b + smull v4.8h, v23.8b, v27.8b + smull2 v5.8h, v23.16b, v27.16b + smull v6.8h, v0.8b, v28.8b + smull2 v7.8h, v0.16b, v28.16b + smull v0.8h, v1.8b, v29.8b + smull2 v1.8h, v1.16b, v29.16b + saddl v22.4s, v2.4h, v4.4h + saddl2 v23.4s, v2.8h, v4.8h + saddl v26.4s, v3.4h, v5.4h + saddl2 v27.4s, v3.8h, v5.8h + saddl v2.4s, v0.4h, v6.4h + saddl2 v3.4s, v0.8h, v6.8h + saddl v6.4s, v1.4h, v7.4h + saddl2 v7.4s, v1.8h, v7.8h + add v4.4s, v22.4s, v2.4s + add v5.4s, v23.4s, v3.4s + add v6.4s, v26.4s, v6.4s + add v7.4s, v27.4s, v7.4s + + ext v22.16b, v19.16b, v20.16b, #14 // top left, top mid + dup v26.16b, v30.b[5] + ext v23.16b, v19.16b, v20.16b, #15 + dup v27.16b, v30.b[6] + ext v0.16b, v20.16b, v21.16b, #1 // top mid, top right + dup v28.16b, v30.b[8] + ext v1.16b, v20.16b, v21.16b, #2 + dup v29.16b, v30.b[9] + + smull v2.8h, v22.8b, v26.8b + smull2 v3.8h, v22.16b, v26.16b + smull v22.8h, v23.8b, v27.8b + smull2 v23.8h, v23.16b, v27.16b + smull v26.8h, v0.8b, v28.8b + smull2 v27.8h, v0.16b, v28.16b + smull v28.8h, v1.8b, v29.8b + smull2 v29.8h, v1.16b, v29.16b + saddl v0.4s, v2.4h, v22.4h + saddl2 v1.4s, v2.8h, v22.8h + saddl v2.4s, v3.4h, v23.4h + saddl2 v3.4s, v3.8h, v23.8h + saddl v22.4s, v26.4h, v28.4h + saddl2 v23.4s, v26.8h, v28.8h + saddl v26.4s, v27.4h, v29.4h + saddl2 v27.4s, v27.8h, v29.8h + add v0.4s, v0.4s, v22.4s + add v1.4s, v1.4s, v23.4s + add v2.4s, v2.4s, v26.4s + add v3.4s, v3.4s, v27.4s + dup v26.16b, v30.b[2] + dup v27.16b, v30.b[7] + smull v22.8h, v17.8b, v26.8b + smull2 v23.8h, v17.16b, v26.16b + smull v24.8h, v20.8b, v27.8b + smull2 v25.8h, v20.16b, v27.16b + add v4.4s, v4.4s, v0.4s + add v5.4s, v5.4s, v1.4s + add v6.4s, v6.4s, v2.4s + add v7.4s, v7.4s, v3.4s + + mov v16.16b, v17.16b + mov v17.16b, v18.16b + + saddl v0.4s, 
v22.4h, v24.4h + saddl2 v1.4s, v22.8h, v24.8h + saddl v2.4s, v23.4h, v25.4h + saddl2 v3.4s, v23.8h, v25.8h + mov v19.16b, v20.16b + mov v20.16b, v21.16b + add v4.4s, v4.4s, v0.4s + add v5.4s, v5.4s, v1.4s + add v6.4s, v6.4s, v2.4s + add v7.4s, v7.4s, v3.4s + ret +endfunc + +.macro sum_lag2_func type, uv_layout, edge, elems=16 +function sum_\type()_lag2_\edge()_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! +.ifc \edge, left + sub x12, x0, #2*GRAIN_WIDTH + sub x13, x0, #1*GRAIN_WIDTH + ld1 {v17.16b}, [x12] // load the previous block right above + ld1 {v20.16b}, [x13] +.endif + sum_lag_n_body lag2, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=v30.b[12] +endfunc +.endm + +sum_lag2_func y, 0, left +sum_lag2_func y, 0, mid +sum_lag2_func y, 0, right, 15 +sum_lag2_func uv_444, 444, left +sum_lag2_func uv_444, 444, mid +sum_lag2_func uv_444, 444, right, 15 +sum_lag2_func uv_422, 422, left +sum_lag2_func uv_422, 422, mid +sum_lag2_func uv_422, 422, right, 9 +sum_lag2_func uv_420, 420, left +sum_lag2_func uv_420, 420, mid +sum_lag2_func uv_420, 420, right, 9 + + +function sum_lag3_above_neon + sub x11, x0, #3*GRAIN_WIDTH - 16 + sub x12, x0, #2*GRAIN_WIDTH - 16 + sub x13, x0, #1*GRAIN_WIDTH - 16 + ld1 {v15.16b}, [x11] // load top right + ld1 {v18.16b}, [x12] + ld1 {v21.16b}, [x13] + + ext v8.16b, v13.16b, v14.16b, #13 // top left, top mid + dup v22.16b, v29.b[0] + ext v9.16b, v13.16b, v14.16b, #14 + dup v23.16b, v29.b[1] + ext v10.16b, v13.16b, v14.16b, #15 + dup v24.16b, v29.b[2] + dup v25.16b, v29.b[3] + ext v11.16b, v14.16b, v15.16b, #1 // top mid, top right + dup v26.16b, v29.b[4] + ext v12.16b, v14.16b, v15.16b, #2 + dup v27.16b, v29.b[5] + ext v13.16b, v14.16b, v15.16b, #3 + dup v28.16b, v29.b[6] + + smull v0.8h, v8.8b, v22.8b + smull2 v1.8h, v8.16b, v22.16b + smull v2.8h, v9.8b, v23.8b + smull2 v3.8h, v9.16b, v23.16b + smull v8.8h, v10.8b, v24.8b + smull2 v9.8h, v10.16b, v24.16b + smull v10.8h, v11.8b, v26.8b + smull2 v11.8h, v11.16b, v26.16b + saddl v22.4s, v0.4h, v2.4h + saddl2 v23.4s, v0.8h, v2.8h + saddl v24.4s, v1.4h, v3.4h + saddl2 v26.4s, v1.8h, v3.8h + saddl v0.4s, v8.4h, v10.4h + saddl2 v1.4s, v8.8h, v10.8h + saddl v2.4s, v9.4h, v11.4h + saddl2 v3.4s, v9.8h, v11.8h + smull v8.8h, v12.8b, v27.8b + smull2 v9.8h, v12.16b, v27.16b + smull v10.8h, v13.8b, v28.8b + smull2 v11.8h, v13.16b, v28.16b + smull v12.8h, v14.8b, v25.8b + smull2 v13.8h, v14.16b, v25.16b + add v4.4s, v22.4s, v0.4s + add v5.4s, v23.4s, v1.4s + add v6.4s, v24.4s, v2.4s + add v7.4s, v26.4s, v3.4s + saddl v0.4s, v8.4h, v10.4h + saddl2 v1.4s, v8.8h, v10.8h + saddl v2.4s, v9.4h, v11.4h + saddl2 v3.4s, v9.8h, v11.8h + add v4.4s, v4.4s, v0.4s + add v5.4s, v5.4s, v1.4s + add v6.4s, v6.4s, v2.4s + add v7.4s, v7.4s, v3.4s + saddw v4.4s, v4.4s, v12.4h + saddw2 v5.4s, v5.4s, v12.8h + saddw v6.4s, v6.4s, v13.4h + saddw2 v7.4s, v7.4s, v13.8h + + ext v8.16b, v16.16b, v17.16b, #13 // top left, top mid + dup v22.16b, v29.b[7] + ext v9.16b, v16.16b, v17.16b, #14 + dup v23.16b, v29.b[8] + ext v10.16b, v16.16b, v17.16b, #15 + dup v24.16b, v29.b[9] + dup v25.16b, v29.b[10] + ext v11.16b, v17.16b, v18.16b, #1 // top mid, top right + dup v26.16b, v29.b[11] + ext v12.16b, v17.16b, v18.16b, #2 + dup v27.16b, v29.b[12] + ext v13.16b, v17.16b, v18.16b, #3 + dup v28.16b, v29.b[13] + + smull v0.8h, v8.8b, v22.8b + smull2 v1.8h, v8.16b, v22.16b + smull v2.8h, v9.8b, v23.8b + smull2 v3.8h, v9.16b, v23.16b + smull v8.8h, v10.8b, v24.8b + smull2 v9.8h, v10.16b, v24.16b + smull v10.8h, v11.8b, v26.8b + smull2 v11.8h, v11.16b, v26.16b 
+ saddl v22.4s, v0.4h, v2.4h + saddl2 v23.4s, v0.8h, v2.8h + saddl v24.4s, v1.4h, v3.4h + saddl2 v26.4s, v1.8h, v3.8h + saddl v0.4s, v8.4h, v10.4h + saddl2 v1.4s, v8.8h, v10.8h + saddl v2.4s, v9.4h, v11.4h + saddl2 v3.4s, v9.8h, v11.8h + smull v8.8h, v12.8b, v27.8b + smull2 v9.8h, v12.16b, v27.16b + smull v10.8h, v13.8b, v28.8b + smull2 v11.8h, v13.16b, v28.16b + smull v12.8h, v17.8b, v25.8b + smull2 v13.8h, v17.16b, v25.16b + add v22.4s, v22.4s, v0.4s + add v23.4s, v23.4s, v1.4s + add v24.4s, v24.4s, v2.4s + add v26.4s, v26.4s, v3.4s + saddl v0.4s, v8.4h, v10.4h + saddl2 v1.4s, v8.8h, v10.8h + saddl v2.4s, v9.4h, v11.4h + saddl2 v3.4s, v9.8h, v11.8h + add v4.4s, v4.4s, v22.4s + add v5.4s, v5.4s, v23.4s + add v6.4s, v6.4s, v24.4s + add v7.4s, v7.4s, v26.4s + add v4.4s, v4.4s, v0.4s + add v5.4s, v5.4s, v1.4s + add v6.4s, v6.4s, v2.4s + add v7.4s, v7.4s, v3.4s + saddw v4.4s, v4.4s, v12.4h + saddw2 v5.4s, v5.4s, v12.8h + saddw v6.4s, v6.4s, v13.4h + saddw2 v7.4s, v7.4s, v13.8h + + ext v8.16b, v19.16b, v20.16b, #13 // top left, top mid + dup v22.16b, v29.b[14] + ext v9.16b, v19.16b, v20.16b, #14 + dup v23.16b, v29.b[15] + ext v10.16b, v19.16b, v20.16b, #15 + dup v24.16b, v30.b[0] + dup v25.16b, v30.b[1] + ext v11.16b, v20.16b, v21.16b, #1 // top mid, top right + dup v26.16b, v30.b[2] + ext v12.16b, v20.16b, v21.16b, #2 + dup v27.16b, v30.b[3] + ext v13.16b, v20.16b, v21.16b, #3 + dup v28.16b, v30.b[4] + + smull v0.8h, v8.8b, v22.8b + smull2 v1.8h, v8.16b, v22.16b + smull v2.8h, v9.8b, v23.8b + smull2 v3.8h, v9.16b, v23.16b + smull v8.8h, v10.8b, v24.8b + smull2 v9.8h, v10.16b, v24.16b + smull v10.8h, v11.8b, v26.8b + smull2 v11.8h, v11.16b, v26.16b + saddl v22.4s, v0.4h, v2.4h + saddl2 v23.4s, v0.8h, v2.8h + saddl v24.4s, v1.4h, v3.4h + saddl2 v26.4s, v1.8h, v3.8h + saddl v0.4s, v8.4h, v10.4h + saddl2 v1.4s, v8.8h, v10.8h + saddl v2.4s, v9.4h, v11.4h + saddl2 v3.4s, v9.8h, v11.8h + smull v8.8h, v12.8b, v27.8b + smull2 v9.8h, v12.16b, v27.16b + smull v10.8h, v13.8b, v28.8b + smull2 v11.8h, v13.16b, v28.16b + smull v12.8h, v20.8b, v25.8b + smull2 v19.8h, v20.16b, v25.16b + add v22.4s, v22.4s, v0.4s + add v23.4s, v23.4s, v1.4s + add v24.4s, v24.4s, v2.4s + add v26.4s, v26.4s, v3.4s + saddl v0.4s, v8.4h, v10.4h + saddl2 v1.4s, v8.8h, v10.8h + saddl v2.4s, v9.4h, v11.4h + saddl2 v3.4s, v9.8h, v11.8h + add v4.4s, v4.4s, v22.4s + add v5.4s, v5.4s, v23.4s + add v6.4s, v6.4s, v24.4s + add v7.4s, v7.4s, v26.4s + mov v13.16b, v14.16b + mov v14.16b, v15.16b + add v4.4s, v4.4s, v0.4s + add v5.4s, v5.4s, v1.4s + add v6.4s, v6.4s, v2.4s + add v7.4s, v7.4s, v3.4s + mov v16.16b, v17.16b + mov v17.16b, v18.16b + saddw v4.4s, v4.4s, v12.4h + saddw2 v5.4s, v5.4s, v12.8h + saddw v6.4s, v6.4s, v19.4h + saddw2 v7.4s, v7.4s, v19.8h + + mov v19.16b, v20.16b + mov v20.16b, v21.16b + ret +endfunc + +.macro sum_lag3_func type, uv_layout, edge, elems=16 +function sum_\type()_lag3_\edge()_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! 
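The sum_lag*_above and output_lag* helpers above implement the auto-regressive half of grain generation: each new grain value combines the dot product of the AR coefficients with already-generated neighbours (the rows above plus the previous outputs in the same row) and a fresh value from the Gaussian sequence. A scalar sketch of what one output position computes for 8 bpc, with illustrative names; `sum` stands for the neighbour dot product that the saddl/smull chains and the madd instructions accumulate:

    #include <stdint.h>

    static int8_t ar_grain_sample(int32_t sum, int16_t gauss,
                                  int ar_coeff_shift, int grain_scale_shift)
    {
        /* round2(sum, ar_coeff_shift): the add of w8 and asr by w7 above */
        int ar = (sum + (1 << (ar_coeff_shift - 1))) >> ar_coeff_shift;
        /* round2(gauss, 4 + grain_scale_shift): the add of w10 and asr by w9 */
        int g = (gauss + (1 << (4 + grain_scale_shift - 1))) >> (4 + grain_scale_shift);
        g += ar;
        /* clamp to the 8 bpc grain range, as the csel pair against w5/w6 does */
        return (int8_t)(g > 127 ? 127 : g < -128 ? -128 : g);
    }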
+.ifc \edge, left + sub x11, x0, #3*GRAIN_WIDTH + sub x12, x0, #2*GRAIN_WIDTH + sub x13, x0, #1*GRAIN_WIDTH + ld1 {v14.16b}, [x11] // load the previous block right above + ld1 {v17.16b}, [x12] + ld1 {v20.16b}, [x13] +.endif + sum_lag_n_body lag3, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=v30.b[8] +endfunc +.endm + +sum_lag3_func y, 0, left +sum_lag3_func y, 0, mid +sum_lag3_func y, 0, right, 15 +sum_lag3_func uv_444, 444, left +sum_lag3_func uv_444, 444, mid +sum_lag3_func uv_444, 444, right, 15 +sum_lag3_func uv_422, 422, left +sum_lag3_func uv_422, 422, mid +sum_lag3_func uv_422, 422, right, 9 +sum_lag3_func uv_420, 420, left +sum_lag3_func uv_420, 420, mid +sum_lag3_func uv_420, 420, right, 9 + +function generate_grain_rows_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! +1: + get_grain_row v16, v17, v18, v19, v20, v21 + subs w1, w1, #1 + store_grain_row v16, v17, v18, v19, v20, v21 + b.gt 1b + ldr x30, [sp], #16 + AARCH64_VALIDATE_LINK_REGISTER + ret +endfunc + +function generate_grain_rows_44_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! +1: + get_grain_row_44 v16, v17, v18 + subs w1, w1, #1 + store_grain_row_44 v16, v17, v18 + b.gt 1b + ldr x30, [sp], #16 + AARCH64_VALIDATE_LINK_REGISTER + ret +endfunc + +function get_grain_row_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! + get_grain_row v16, v17, v18, v19, v20, v21 + ldr x30, [sp], #16 + AARCH64_VALIDATE_LINK_REGISTER + ret +endfunc + +function get_grain_row_44_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! + get_grain_row_44 v16, v17, v18 + ldr x30, [sp], #16 + AARCH64_VALIDATE_LINK_REGISTER + ret +endfunc + +function add_uv_444_coeff_lag0_neon +add_coeff_lag0_start: + smull v2.8h, v0.8b, v27.8b + smull2 v3.8h, v0.16b, v27.16b + srshl v2.8h, v2.8h, v28.8h + srshl v3.8h, v3.8h, v28.8h + saddw v2.8h, v2.8h, v1.8b + saddw2 v3.8h, v3.8h, v1.16b + sqxtn v2.8b, v2.8h + sqxtn2 v2.16b, v3.8h + ret +endfunc + +function add_uv_420_coeff_lag0_neon + ld1 {v4.16b, v5.16b}, [x19], #32 + ld1 {v6.16b, v7.16b}, [x12], #32 + saddlp v4.8h, v4.16b + saddlp v5.8h, v5.16b + saddlp v6.8h, v6.16b + saddlp v7.8h, v7.16b + add v4.8h, v4.8h, v6.8h + add v5.8h, v5.8h, v7.8h + rshrn v4.8b, v4.8h, #2 + rshrn2 v4.16b, v5.8h, #2 + and v0.16b, v4.16b, v0.16b + b add_coeff_lag0_start +endfunc + +function add_uv_422_coeff_lag0_neon + ld1 {v4.16b, v5.16b}, [x19], #32 + saddlp v4.8h, v4.16b + saddlp v5.8h, v5.16b + rshrn v4.8b, v4.8h, #1 + rshrn2 v4.16b, v5.8h, #1 + and v0.16b, v4.16b, v0.16b + b add_coeff_lag0_start +endfunc + +.macro gen_grain_82 type +function generate_grain_\type()_8bpc_neon, export=1 + AARCH64_SIGN_LINK_REGISTER + stp x30, x19, [sp, #-96]! 
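The add_uv_*_coeff_lag0_neon helpers above handle the lag-0 chroma case: the co-located luma grain is first reduced to chroma resolution (the saddlp/rshrn pairs for 4:2:0 and 4:2:2), then scaled by ar_coeffs_uv[0] and folded into the freshly generated chroma grain with saturation. Roughly, per sample and with illustrative names (8 bpc):

    #include <stdint.h>

    static int8_t uv_lag0_sample(int luma_avg, int8_t uv_grain,
                                 int8_t uv_coeff, int ar_coeff_shift)
    {
        /* smull by the coefficient, then srshl by -ar_coeff_shift,
         * i.e. a rounding right shift */
        int v = (luma_avg * uv_coeff + (1 << (ar_coeff_shift - 1))) >> ar_coeff_shift;
        v += uv_grain;                                        /* saddw */
        return (int8_t)(v > 127 ? 127 : v < -128 ? -128 : v); /* sqxtn */
    }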
+ +.ifc \type, uv_444 + mov w13, w3 + mov w14, #28 + add x19, x1, #3*GRAIN_WIDTH + mov x1, x2 + mul w13, w13, w14 +.endif + movrel x3, X(gaussian_sequence) + ldr w2, [x1, #FGD_SEED] + ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT] +.ifc \type, y + add x4, x1, #FGD_AR_COEFFS_Y +.else + add x4, x1, #FGD_AR_COEFFS_UV +.endif + adr x16, L(gen_grain_\type()_tbl) + ldr w17, [x1, #FGD_AR_COEFF_LAG] + add w9, w9, #4 + ldrh w17, [x16, w17, uxtw #1] + dup v31.8h, w9 // 4 + data->grain_scale_shift + sub x16, x16, w17, uxtw + neg v31.8h, v31.8h + +.ifc \type, uv_444 + cmp w13, #0 + mov w11, #0x49d8 + mov w14, #0xb524 + add x4, x4, w13, uxtw // Add offset to ar_coeffs_uv[1] + csel w11, w11, w14, ne +.endif + + ldr w7, [x1, #FGD_AR_COEFF_SHIFT] + mov w8, #1 + mov w10, #1 + lsl w8, w8, w7 // 1 << ar_coeff_shift + lsl w10, w10, w9 // 1 << (4 + data->grain_scale_shift) + lsr w8, w8, #1 // 1 << (ar_coeff_shift - 1) + lsr w10, w10, #1 // 1 << (4 + data->grain_scale_shift - 1) + mov w5, #127 + mov w6, #-128 + +.ifc \type, uv_444 + eor w2, w2, w11 +.endif + + br x16 + +L(generate_grain_\type()_lag0): + AARCH64_VALID_JUMP_TARGET +.ifc \type, y + mov w1, #GRAIN_HEIGHT + bl generate_grain_rows_neon +.else + dup v28.8h, w7 + ld1r {v27.16b}, [x4] // ar_coeffs_uv[0] + movi v0.16b, #0 + movi v1.16b, #255 + ext v29.16b, v0.16b, v1.16b, #13 + ext v30.16b, v1.16b, v0.16b, #1 + neg v28.8h, v28.8h + + mov w1, #3 + bl generate_grain_rows_neon + mov w1, #GRAIN_HEIGHT-3 +1: + ld1 {v22.16b, v23.16b, v24.16b, v25.16b}, [x19], #64 + bl get_grain_row_neon + and v0.16b, v22.16b, v29.16b + mov v1.16b, v16.16b + bl add_uv_444_coeff_lag0_neon + mov v0.16b, v23.16b + mov v1.16b, v17.16b + mov v16.16b, v2.16b + bl add_uv_444_coeff_lag0_neon + ld1 {v26.16b}, [x19], #16 + mov v0.16b, v24.16b + mov v1.16b, v18.16b + mov v17.16b, v2.16b + bl add_uv_444_coeff_lag0_neon + add x19, x19, #2 + mov v0.16b, v25.16b + mov v1.16b, v19.16b + mov v18.16b, v2.16b + bl add_uv_444_coeff_lag0_neon + and v0.16b, v26.16b, v30.16b + mov v1.16b, v20.16b + mov v19.16b, v2.16b + bl add_uv_444_coeff_lag0_neon + mov v20.16b, v2.16b + subs w1, w1, #1 + store_grain_row v16, v17, v18, v19, v20, v21 + b.gt 1b +.endif + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(generate_grain_\type()_lag1): + AARCH64_VALID_JUMP_TARGET + ld1r {v27.16b}, [x4], #1 // ar_coeffs_y[0] + ld1r {v28.16b}, [x4], #1 // ar_coeffs_y[1] + ld1r {v29.16b}, [x4] // ar_coeffs_y[2] +.ifc \type, y + ldrsb w4, [x4, #1] // ar_coeffs_y[3] +.else + add x4, x4, #2 +.endif + + mov w1, #3 +.ifc \type, uv_444 + ld1r {v30.16b}, [x4] // ar_coeffs_uv[4] + ldursb w4, [x4, #-1] // ar_coeffs_uv[3] +.endif + bl generate_grain_rows_neon + + mov w1, #GRAIN_HEIGHT - 3 +1: + sum_\type()_lag1 v22, v16, v16, v17, left + sum_\type()_lag1 v23, v16, v17, v18 + sum_\type()_lag1 v24, v17, v18, v19 + sum_\type()_lag1 v25, v18, v19, v20 + sum_\type()_lag1 v20, v19, v20, v21, right + get_grain_2 v21 + subs w1, w1, #1 +.ifc \type, uv_444 + add x19, x19, #2 +.endif + store_grain_row v22, v23, v24, v25, v20, v21 + mov v16.16b, v22.16b + mov v17.16b, v23.16b + mov v18.16b, v24.16b + mov v19.16b, v25.16b + b.gt 1b + + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(generate_grain_\type()_lag2): + AARCH64_VALID_JUMP_TARGET + ld1 {v30.16b}, [x4] // ar_coeffs_y[0-11], ar_coeffs_uv[0-12] + + smov w4, v30.b[10] + smov w17, v30.b[11] + + mov w1, #3 + bl generate_grain_rows_neon + + mov w1, #GRAIN_HEIGHT - 3 +1: + bl sum_\type()_lag2_left_neon + bl sum_\type()_lag2_mid_neon + bl sum_\type()_lag2_mid_neon + 
bl sum_\type()_lag2_mid_neon + bl sum_\type()_lag2_right_neon + get_grain_2 v16 + subs w1, w1, #1 +.ifc \type, uv_444 + add x19, x19, #2 +.endif + st1 {v16.h}[0], [x0], #2 + b.gt 1b + + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(generate_grain_\type()_lag3): + AARCH64_VALID_JUMP_TARGET + ld1 {v29.16b, v30.16b}, [x4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24] + stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] + stp x20, x21, [sp, #80] + + smov w4, v30.b[5] + smov w20, v30.b[6] + smov w21, v30.b[7] + + mov w1, #3 + bl generate_grain_rows_neon + + mov w1, #GRAIN_HEIGHT - 3 +1: + bl sum_\type()_lag3_left_neon + bl sum_\type()_lag3_mid_neon + bl sum_\type()_lag3_mid_neon + bl sum_\type()_lag3_mid_neon + bl sum_\type()_lag3_right_neon + get_grain_2 v16 + subs w1, w1, #1 +.ifc \type, uv_444 + add x19, x19, #2 +.endif + st1 {v16.h}[0], [x0], #2 + b.gt 1b + + ldp x20, x21, [sp, #80] + ldp d14, d15, [sp, #64] + ldp d12, d13, [sp, #48] + ldp d10, d11, [sp, #32] + ldp d8, d9, [sp, #16] + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(gen_grain_\type()_tbl): + .hword L(gen_grain_\type()_tbl) - L(generate_grain_\type()_lag0) + .hword L(gen_grain_\type()_tbl) - L(generate_grain_\type()_lag1) + .hword L(gen_grain_\type()_tbl) - L(generate_grain_\type()_lag2) + .hword L(gen_grain_\type()_tbl) - L(generate_grain_\type()_lag3) +endfunc +.endm + +gen_grain_82 y +gen_grain_82 uv_444 + +.macro set_height dst, type +.ifc \type, uv_420 + mov \dst, #SUB_GRAIN_HEIGHT-3 +.else + mov \dst, #GRAIN_HEIGHT-3 +.endif +.endm + +.macro increment_y_ptr reg, type +.ifc \type, uv_420 + add \reg, \reg, #2*GRAIN_WIDTH-(3*32) +.else + sub \reg, \reg, #3*32-GRAIN_WIDTH +.endif +.endm + +.macro gen_grain_44 type +function generate_grain_\type()_8bpc_neon, export=1 + AARCH64_SIGN_LINK_REGISTER + stp x30, x19, [sp, #-96]! 
+ + mov w13, w3 + mov w14, #28 + add x19, x1, #3*GRAIN_WIDTH-3 + mov x1, x2 + mul w13, w13, w14 + + movrel x3, X(gaussian_sequence) + ldr w2, [x1, #FGD_SEED] + ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT] + add x4, x1, #FGD_AR_COEFFS_UV + adr x16, L(gen_grain_\type()_tbl) + ldr w17, [x1, #FGD_AR_COEFF_LAG] + add w9, w9, #4 + ldrh w17, [x16, w17, uxtw #1] + dup v31.8h, w9 // 4 + data->grain_scale_shift + sub x16, x16, w17, uxtw + neg v31.8h, v31.8h + + cmp w13, #0 + mov w11, #0x49d8 + mov w14, #0xb524 + add x4, x4, w13, uxtw // Add offset to ar_coeffs_uv[1] + csel w11, w11, w14, ne + + ldr w7, [x1, #FGD_AR_COEFF_SHIFT] + mov w8, #1 + mov w10, #1 + lsl w8, w8, w7 // 1 << ar_coeff_shift + lsl w10, w10, w9 // 1 << (4 + data->grain_scale_shift) + lsr w8, w8, #1 // 1 << (ar_coeff_shift - 1) + lsr w10, w10, #1 // 1 << (4 + data->grain_scale_shift - 1) + mov w5, #127 + mov w6, #-128 + + eor w2, w2, w11 + + br x16 + +L(generate_grain_\type()_lag0): + AARCH64_VALID_JUMP_TARGET + dup v28.8h, w7 + ld1r {v27.16b}, [x4] // ar_coeffs_uv[0] + movi v0.16b, #0 + movi v1.16b, #255 + ext v29.16b, v0.16b, v1.16b, #13 + ext v30.16b, v1.16b, v0.16b, #7 + neg v28.8h, v28.8h + + mov w1, #3 + bl generate_grain_rows_44_neon + set_height w1, \type +1: + bl get_grain_row_44_neon +.ifc \type, uv_420 + add x12, x19, #GRAIN_WIDTH +.endif + mov v0.16b, v29.16b + mov v1.16b, v16.16b + bl add_\type()_coeff_lag0_neon + movi v0.16b, #255 + mov v1.16b, v17.16b + mov v16.16b, v2.16b + bl add_\type()_coeff_lag0_neon + mov v0.16b, v30.16b + mov v1.16b, v18.16b + mov v17.16b, v2.16b + bl add_\type()_coeff_lag0_neon + mov v18.16b, v2.16b + subs w1, w1, #1 + increment_y_ptr x19, \type + store_grain_row_44 v16, v17, v18 + b.gt 1b + + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(generate_grain_\type()_lag1): + AARCH64_VALID_JUMP_TARGET + ld1r {v27.16b}, [x4], #1 // ar_coeffs_uv[0] + ld1r {v28.16b}, [x4], #1 // ar_coeffs_uv[1] + ld1r {v29.16b}, [x4] // ar_coeffs_uv[2] + add x4, x4, #2 + + mov w1, #3 + ld1r {v30.16b}, [x4] // ar_coeffs_u4[4] + ldursb w4, [x4, #-1] // ar_coeffs_uv[3] + bl generate_grain_rows_44_neon + + set_height w1, \type +1: + sum_\type()_lag1 v20, v16, v16, v17, left + sum_\type()_lag1 v21, v16, v17, v18 + sum_\type()_lag1 v18, v17, v18, v18, right + subs w1, w1, #1 + increment_y_ptr x19, \type + store_grain_row_44 v20, v21, v18 + mov v16.16b, v20.16b + mov v17.16b, v21.16b + b.gt 1b + + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(generate_grain_\type()_lag2): + AARCH64_VALID_JUMP_TARGET + ld1 {v30.16b}, [x4] // ar_coeffs_uv[0-12] + + smov w4, v30.b[10] + smov w17, v30.b[11] + + mov w1, #3 + bl generate_grain_rows_44_neon + + set_height w1, \type +1: + bl sum_\type()_lag2_left_neon + bl sum_\type()_lag2_mid_neon + bl sum_\type()_lag2_right_neon + subs w1, w1, #1 + increment_y_ptr x19, \type + add x0, x0, #GRAIN_WIDTH-48 + b.gt 1b + + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(generate_grain_\type()_lag3): + AARCH64_VALID_JUMP_TARGET + ldr q29, [x4] // ar_coeffs_uv[0-15] + ldr q30, [x4, #16] // ar_coeffs_uv[16-24] + stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] + stp x20, x21, [sp, #80] + + smov w4, v30.b[5] + smov w20, v30.b[6] + smov w21, v30.b[7] + + mov w1, #3 + bl generate_grain_rows_44_neon + + set_height w1, \type +1: + bl sum_\type()_lag3_left_neon + bl sum_\type()_lag3_mid_neon + bl sum_\type()_lag3_right_neon + subs w1, w1, #1 + increment_y_ptr x19, \type + add x0, x0, #GRAIN_WIDTH-48 + b.gt 
1b + + ldp x20, x21, [sp, #80] + ldp d14, d15, [sp, #64] + ldp d12, d13, [sp, #48] + ldp d10, d11, [sp, #32] + ldp d8, d9, [sp, #16] + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(gen_grain_\type()_tbl): + .hword L(gen_grain_\type()_tbl) - L(generate_grain_\type()_lag0) + .hword L(gen_grain_\type()_tbl) - L(generate_grain_\type()_lag1) + .hword L(gen_grain_\type()_tbl) - L(generate_grain_\type()_lag2) + .hword L(gen_grain_\type()_tbl) - L(generate_grain_\type()_lag3) +endfunc +.endm + +gen_grain_44 uv_420 +gen_grain_44 uv_422 + +.macro gather_interleaved dst1, dst2, src1, src2, off + umov w14, \src1[0+\off] + umov w15, \src2[8+\off] + umov w16, \src1[2+\off] + add x14, x14, x3 + umov w17, \src2[10+\off] + add x15, x15, x3 + ld1 {\dst1}[0+\off], [x14] + umov w14, \src1[4+\off] + add x16, x16, x3 + ld1 {\dst2}[8+\off], [x15] + umov w15, \src2[12+\off] + add x17, x17, x3 + ld1 {\dst1}[2+\off], [x16] + umov w16, \src1[6+\off] + add x14, x14, x3 + ld1 {\dst2}[10+\off], [x17] + umov w17, \src2[14+\off] + add x15, x15, x3 + ld1 {\dst1}[4+\off], [x14] + add x16, x16, x3 + ld1 {\dst2}[12+\off], [x15] + add x17, x17, x3 + ld1 {\dst1}[6+\off], [x16] + ld1 {\dst2}[14+\off], [x17] +.endm + +.macro gather dst1, dst2, src1, src2 + gather_interleaved \dst1, \dst2, \src1, \src2, 0 + gather_interleaved \dst2, \dst1, \src2, \src1, 0 + gather_interleaved \dst1, \dst2, \src1, \src2, 1 + gather_interleaved \dst2, \dst1, \src2, \src1, 1 +.endm + +function gather32_neon + gather v4.b, v5.b, v0.b, v1.b + ret +endfunc + +function gather16_neon + gather_interleaved v4.b, v5.b, v0.b, v0.b, 0 + gather_interleaved v4.b, v5.b, v0.b, v0.b, 1 + ins v4.d[1], v5.d[1] + ret +endfunc + +const overlap_coeffs_0, align=4 + .byte 27, 17, 0, 0, 0, 0, 0, 0 + .byte 17, 27, 32, 32, 32, 32, 32, 32 +endconst + +const overlap_coeffs_1, align=4 + .byte 23, 0, 0, 0, 0, 0, 0, 0 + .byte 22, 32, 32, 32, 32, 32, 32, 32 +endconst + +.macro calc_offset offx, offy, src, sx, sy + and \offy, \src, #0xF // randval & 0xF + lsr \offx, \src, #4 // randval >> 4 +.if \sy == 0 + add \offy, \offy, \offy // 2 * (randval & 0xF) +.endif +.if \sx == 0 + add \offx, \offx, \offx // 2 * (randval >> 4) +.endif +.endm + +.macro add_offset dst, offx, offy, src, stride + madd \dst, \stride, \offy, \src // grain_lut += grain_stride * offy + add \dst, \dst, \offx, uxtw // grain_lut += offx +.endm + +// void dav1d_fgy_32x32_8bpc_neon(pixel *const dst, const pixel *const src, +// const ptrdiff_t stride, +// const uint8_t scaling[SCALING_SIZE], +// const int scaling_shift, +// const entry grain_lut[][GRAIN_WIDTH], +// const int offsets[][2], +// const int h, const ptrdiff_t clip, +// const ptrdiff_t type); +function fgy_32x32_8bpc_neon, export=1 + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! 
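fgy_32x32_8bpc_neon applies grain to one 32x32 luma block. The calc_offset/add_offset macros above turn each per-block random value into a starting position inside the 82-pixel-wide grain template: the low nibble selects the row, the high bits the column, both doubled on non-subsampled axes, on top of the fixed 9-row/9-column border that the adds to x5 below skip. A scalar sketch with illustrative names:

    #include <stdint.h>

    #define GRAIN_WIDTH 82

    /* Where one 32x32 luma block starts reading in the grain template
     * (sx == sy == 0, so both offsets are doubled). */
    static const int8_t *grain_origin(const int8_t grain_lut[][GRAIN_WIDTH],
                                      unsigned randval)
    {
        const int offx = 9 + 2 * (randval >> 4);   /* calc_offset: randval >> 4  */
        const int offy = 9 + 2 * (randval & 0xF);  /* calc_offset: randval & 0xF */
        return &grain_lut[offy][offx];
    }

The 27/17 and 23/22 overlap_coeffs defined above are then used further down, in fgy_loop_neon's #5 rounding narrows, to crossfade this block's grain with the neighbouring block's grain over the two overlapping rows or columns.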
+ ldr w11, [x6, #8] // offsets[1][0] + ldr w13, [x6, #4] // offsets[0][1] + ldr w15, [x6, #12] // offsets[1][1] + ldr w6, [x6] // offsets[0][0] + ldr w8, [sp, #16] // clip + mov x9, #GRAIN_WIDTH // grain_lut stride + + neg w4, w4 + dup v29.8h, w4 // -scaling_shift + + movrel x16, overlap_coeffs_0 + + cbz w8, 1f + // clip + movi v30.16b, #16 + movi v31.16b, #235 + b 2f +1: + // no clip + movi v30.16b, #0 + movi v31.16b, #255 +2: + + ld1 {v27.8b, v28.8b}, [x16] // overlap_coeffs + + add x5, x5, #9 // grain_lut += 9 + add x5, x5, x9, lsl #3 // grain_lut += 8 * grain_stride + add x5, x5, x9 // grain_lut += grain_stride + + calc_offset w11, w12, w11, 0, 0 + calc_offset w13, w14, w13, 0, 0 + calc_offset w15, w16, w15, 0, 0 + calc_offset w6, w10, w6, 0, 0 + + add_offset x12, w11, x12, x5, x9 + add_offset x14, w13, x14, x5, x9 + add_offset x16, w15, x16, x5, x9 + add_offset x5, w6, x10, x5, x9 + + ldr w11, [sp, #24] // type + adr x13, L(fgy_loop_tbl) + + add x4, x12, #32 // grain_lut += BLOCK_SIZE * bx + add x6, x14, x9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by + + tst w11, #1 + ldrh w11, [x13, w11, uxtw #1] + + add x8, x16, x9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by + add x8, x8, #32 // grain_lut += BLOCK_SIZE * bx + + sub x11, x13, w11, uxtw + + b.eq 1f + // y overlap + dup v6.16b, v27.b[0] + dup v7.16b, v27.b[1] + mov w10, w7 // backup actual h + mov w7, #2 +1: + br x11 +endfunc + +function fgy_loop_neon +.macro fgy ox, oy +L(loop_\ox\oy): + AARCH64_VALID_JUMP_TARGET +1: + ld1 {v0.16b, v1.16b}, [x1], x2 // src +.if \ox + ld1 {v20.8b}, [x4], x9 // grain_lut old +.endif +.if \oy + ld1 {v22.16b, v23.16b}, [x6], x9 // grain_lut top +.endif +.if \ox && \oy + ld1 {v21.8b}, [x8], x9 // grain_lut top old +.endif + ld1 {v18.16b, v19.16b}, [x5], x9 // grain_lut + + bl gather32_neon + +.if \ox + smull v20.8h, v20.8b, v27.8b + smlal v20.8h, v18.8b, v28.8b +.endif + +.if \oy +.if \ox + smull v21.8h, v21.8b, v27.8b + smlal v21.8h, v22.8b, v28.8b + sqrshrn v20.8b, v20.8h, #5 + sqrshrn v21.8b, v21.8h, #5 +.endif + +.if \ox + smull v16.8h, v20.8b, v7.8b +.else + smull v16.8h, v18.8b, v7.8b +.endif + smull2 v17.8h, v18.16b, v7.16b + smull v18.8h, v19.8b, v7.8b + smull2 v19.8h, v19.16b, v7.16b +.if \ox + smlal v16.8h, v21.8b, v6.8b +.else + smlal v16.8h, v22.8b, v6.8b +.endif + smlal2 v17.8h, v22.16b, v6.16b + smlal v18.8h, v23.8b, v6.8b + smlal2 v19.8h, v23.16b, v6.16b + sqrshrn v22.8b, v16.8h, #5 + sqrshrn2 v22.16b, v17.8h, #5 + sqrshrn v23.8b, v18.8h, #5 + sqrshrn2 v23.16b, v19.8h, #5 +.endif + + // sxtl of grain +.if \oy + sxtl v16.8h, v22.8b + sxtl2 v17.8h, v22.16b + sxtl v18.8h, v23.8b + sxtl2 v19.8h, v23.16b +.elseif \ox + sqrshrn v20.8b, v20.8h, #5 + sxtl2 v17.8h, v18.16b + sxtl v18.8h, v19.8b + sxtl2 v19.8h, v19.16b + sxtl v16.8h, v20.8b +.else + sxtl v16.8h, v18.8b + sxtl2 v17.8h, v18.16b + sxtl v18.8h, v19.8b + sxtl2 v19.8h, v19.16b +.endif + + uxtl v2.8h, v4.8b // scaling + uxtl2 v3.8h, v4.16b + uxtl v4.8h, v5.8b + uxtl2 v5.8h, v5.16b + + mul v16.8h, v16.8h, v2.8h // scaling * grain + mul v17.8h, v17.8h, v3.8h + mul v18.8h, v18.8h, v4.8h + mul v19.8h, v19.8h, v5.8h + + srshl v16.8h, v16.8h, v29.8h // round2(scaling * grain, scaling_shift) + srshl v17.8h, v17.8h, v29.8h + srshl v18.8h, v18.8h, v29.8h + srshl v19.8h, v19.8h, v29.8h + + uaddw v16.8h, v16.8h, v0.8b // *src + noise + uaddw2 v17.8h, v17.8h, v0.16b + uaddw v18.8h, v18.8h, v1.8b + uaddw2 v19.8h, v19.8h, v1.16b + + sqxtun v0.8b, v16.8h + sqxtun2 v0.16b, v17.8h + sqxtun v1.8b, v18.8h + sqxtun2 v1.16b, v19.8h + + umax 
v0.16b, v0.16b, v30.16b + umax v1.16b, v1.16b, v30.16b + umin v0.16b, v0.16b, v31.16b + umin v1.16b, v1.16b, v31.16b + + subs w7, w7, #1 +.if \oy + dup v6.16b, v28.b[0] + dup v7.16b, v28.b[1] +.endif + st1 {v0.16b, v1.16b}, [x0], x2 // dst + b.gt 1b + +.if \oy + cmp w10, #2 + sub w7, w10, #2 // restore actual remaining h + b.gt L(loop_\ox()0) +.endif + ldr x30, [sp], #16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.endm + + fgy 0, 0 + fgy 0, 1 + fgy 1, 0 + fgy 1, 1 + +L(fgy_loop_tbl): + .hword L(fgy_loop_tbl) - L(loop_00) + .hword L(fgy_loop_tbl) - L(loop_01) + .hword L(fgy_loop_tbl) - L(loop_10) + .hword L(fgy_loop_tbl) - L(loop_11) +endfunc + +// void dav1d_fguv_32x32_420_8bpc_neon(pixel *const dst, +// const pixel *const src, +// const ptrdiff_t stride, +// const uint8_t scaling[SCALING_SIZE], +// const Dav1dFilmGrainData *const data, +// const entry grain_lut[][GRAIN_WIDTH], +// const pixel *const luma_row, +// const ptrdiff_t luma_stride, +// const int offsets[][2], +// const ptrdiff_t h, const ptrdiff_t uv, +// const ptrdiff_t is_id, +// const ptrdiff_t type); +.macro fguv layout, sx, sy +function fguv_32x32_\layout()_8bpc_neon, export=1 + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-32]! + str d8, [sp, #16] + ldp x8, x9, [sp, #32] // offsets, h + ldp x10, x11, [sp, #48] // uv, is_id + + ldr w13, [x4, #FGD_SCALING_SHIFT] + ldr w12, [x4, #FGD_CLIP_TO_RESTRICTED_RANGE] + neg w13, w13 // -scaling_shift + + // !csfl + add x10, x4, x10, lsl #2 // + 4*uv + add x14, x10, #FGD_UV_LUMA_MULT + add x15, x10, #FGD_UV_MULT + add x10, x10, #FGD_UV_OFFSET + ld1 {v8.h}[0], [x14] // uv_luma_mult + ld1r {v24.8h}, [x10] // uv_offset + ld1 {v8.h}[1], [x15] // uv_mult + + dup v29.8h, w13 // -scaling_shift + + cbz w12, 1f + // clip + movi v30.16b, #16 + movi v31.16b, #240 + cbz w11, 2f + // is_id + movi v31.16b, #235 + b 2f +1: + // no clip + movi v30.16b, #0 + movi v31.16b, #255 +2: + + ldr w12, [x8, #8] // offsets[1][0] + ldr w14, [x8, #4] // offsets[0][1] + ldr w16, [x8, #12] // offsets[1][1] + ldr w8, [x8] // offsets[0][0] + + mov x10, #GRAIN_WIDTH // grain_lut stride + + add x5, x5, #(3 + (2 >> \sx)*3) // grain_lut += 9 or 6 +.if \sy + add x5, x5, x10, lsl #2 // grain_lut += 4 * grain_stride + add x5, x5, x10, lsl #1 // grain_lut += 2 * grain_stride +.else + add x5, x5, x10, lsl #3 // grain_lut += 8 * grain_stride + add x5, x5, x10 // grain_lut += grain_stride +.endif + + calc_offset w12, w13, w12, \sx, \sy + calc_offset w14, w15, w14, \sx, \sy + calc_offset w16, w17, w16, \sx, \sy + calc_offset w8, w11, w8, \sx, \sy + + add_offset x13, w12, x13, x5, x10 + add_offset x15, w14, x15, x5, x10 + add_offset x17, w16, x17, x5, x10 + add_offset x5, w8, x11, x5, x10 + + add x4, x13, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx + add x8, x15, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by + add x11, x17, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by + add x11, x11, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx + + ldr w13, [sp, #64] // type + + movrel x16, overlap_coeffs_\sx + adr x14, L(fguv_loop_sx\sx()_tbl) + + ld1 {v27.8b, v28.8b}, [x16] // overlap_coeffs + tst w13, #1 + ldrh w13, [x14, w13, uxtw #1] + + b.eq 1f + // y overlap + sub w12, w9, #(2 >> \sy) // backup remaining h + mov w9, #(2 >> \sy) + +1: + sub x13, x14, w13, uxtw + +.if \sy + movi v25.16b, #23 + movi v26.16b, #22 +.else + movi v25.16b, #27 + movi v26.16b, #17 +.endif + +.if \sy + add x7, x7, x7 // luma_stride *= 2 +.endif + + br x13 +endfunc +.endm + +fguv 420, 1, 1 +fguv 422, 1, 0 +fguv 444, 0, 0 + 
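The fguv loop bodies that follow first derive the index into the scaling LUT. When chroma-scaling-from-luma (csfl) is not in use, that index is a weighted mix of the collocated luma (horizontally averaged for subsampled layouts) and the chroma sample, using uv_luma_mult, uv_mult and uv_offset as named in the assembly. A rough scalar sketch; names are illustrative and the NEON code additionally saturates the intermediate 16-bit sums:

    #include <stdint.h>

    // Scaling-LUT index for chroma when csfl is not set:
    // (luma * uv_luma_mult + chroma * uv_mult) >> 6, plus uv_offset,
    // clamped to the 8-bit pixel range.
    static inline uint8_t chroma_scaling_index(int luma_avg, int chroma,
                                               int uv_luma_mult, int uv_mult,
                                               int uv_offset)
    {
        const int combined = luma_avg * uv_luma_mult + chroma * uv_mult;
        const int val = (combined >> 6) + uv_offset;
        return (uint8_t)(val < 0 ? 0 : val > 255 ? 255 : val);
    }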
+function fguv_loop_sx0_neon +.macro fguv_loop_sx0 csfl, ox, oy +L(fguv_loop_sx0_csfl\csfl()_\ox\oy): + AARCH64_VALID_JUMP_TARGET +1: + ld1 {v0.16b, v1.16b}, [x6], x7 // luma + ld1 {v6.16b, v7.16b}, [x1], x2 // src +.if \ox + ld1 {v20.8b}, [x4], x10 // grain_lut old +.endif +.if \oy + ld1 {v22.16b, v23.16b}, [x8], x10 // grain_lut top +.endif +.if \ox && \oy + ld1 {v21.8b}, [x11], x10 // grain_lut top old +.endif + ld1 {v18.16b, v19.16b}, [x5], x10 // grain_lut + +.if !\csfl + uxtl v2.8h, v0.8b + uxtl2 v3.8h, v0.16b + uxtl v4.8h, v1.8b + uxtl2 v5.8h, v1.16b + uxtl v0.8h, v6.8b + uxtl2 v1.8h, v6.16b + uxtl v16.8h, v7.8b + uxtl2 v17.8h, v7.16b + mul v2.8h, v2.8h, v8.h[0] + mul v3.8h, v3.8h, v8.h[0] + mul v4.8h, v4.8h, v8.h[0] + mul v5.8h, v5.8h, v8.h[0] + mul v0.8h, v0.8h, v8.h[1] + mul v1.8h, v1.8h, v8.h[1] + mul v16.8h, v16.8h, v8.h[1] + mul v17.8h, v17.8h, v8.h[1] + sqadd v2.8h, v2.8h, v0.8h + sqadd v3.8h, v3.8h, v1.8h + sqadd v4.8h, v4.8h, v16.8h + sqadd v5.8h, v5.8h, v17.8h + sshr v2.8h, v2.8h, #6 + sshr v3.8h, v3.8h, #6 + sshr v4.8h, v4.8h, #6 + sshr v5.8h, v5.8h, #6 + add v2.8h, v2.8h, v24.8h + add v3.8h, v3.8h, v24.8h + add v4.8h, v4.8h, v24.8h + add v5.8h, v5.8h, v24.8h + sqxtun v0.8b, v2.8h + sqxtun2 v0.16b, v3.8h + sqxtun v1.8b, v4.8h + sqxtun2 v1.16b, v5.8h +.endif + + bl gather32_neon + +.if \ox + smull v20.8h, v20.8b, v27.8b + smlal v20.8h, v18.8b, v28.8b +.endif + +.if \oy +.if \ox + smull v21.8h, v21.8b, v27.8b + smlal v21.8h, v22.8b, v28.8b + sqrshrn v20.8b, v20.8h, #5 + sqrshrn v21.8b, v21.8h, #5 +.endif + +.if \ox + smull v16.8h, v20.8b, v26.8b +.else + smull v16.8h, v18.8b, v26.8b +.endif + smull2 v17.8h, v18.16b, v26.16b + smull v18.8h, v19.8b, v26.8b + smull2 v19.8h, v19.16b, v26.16b +.if \ox + smlal v16.8h, v21.8b, v25.8b +.else + smlal v16.8h, v22.8b, v25.8b +.endif + smlal2 v17.8h, v22.16b, v25.16b + smlal v18.8h, v23.8b, v25.8b + smlal2 v19.8h, v23.16b, v25.16b + sqrshrn v22.8b, v16.8h, #5 + sqrshrn2 v22.16b, v17.8h, #5 + sqrshrn v23.8b, v18.8h, #5 + sqrshrn2 v23.16b, v19.8h, #5 +.endif + + // sxtl of grain +.if \oy + sxtl v16.8h, v22.8b + sxtl2 v17.8h, v22.16b + sxtl v18.8h, v23.8b + sxtl2 v19.8h, v23.16b +.elseif \ox + sqrshrn v20.8b, v20.8h, #5 + sxtl2 v17.8h, v18.16b + sxtl v18.8h, v19.8b + sxtl2 v19.8h, v19.16b + sxtl v16.8h, v20.8b +.else + sxtl v16.8h, v18.8b + sxtl2 v17.8h, v18.16b + sxtl v18.8h, v19.8b + sxtl2 v19.8h, v19.16b +.endif + + uxtl v2.8h, v4.8b // scaling + uxtl2 v3.8h, v4.16b + uxtl v4.8h, v5.8b + uxtl2 v5.8h, v5.16b + + mul v16.8h, v16.8h, v2.8h // scaling * grain + mul v17.8h, v17.8h, v3.8h + mul v18.8h, v18.8h, v4.8h + mul v19.8h, v19.8h, v5.8h + + srshl v16.8h, v16.8h, v29.8h // round2(scaling * grain, scaling_shift) + srshl v17.8h, v17.8h, v29.8h + srshl v18.8h, v18.8h, v29.8h + srshl v19.8h, v19.8h, v29.8h + + uaddw v16.8h, v16.8h, v6.8b // *src + noise + uaddw2 v17.8h, v17.8h, v6.16b + uaddw v18.8h, v18.8h, v7.8b + uaddw2 v19.8h, v19.8h, v7.16b + + sqxtun v0.8b, v16.8h + sqxtun2 v0.16b, v17.8h + sqxtun v1.8b, v18.8h + sqxtun2 v1.16b, v19.8h + + umax v0.16b, v0.16b, v30.16b + umax v1.16b, v1.16b, v30.16b + umin v0.16b, v0.16b, v31.16b + umin v1.16b, v1.16b, v31.16b + + subs w9, w9, #1 +.if \oy + dup v25.16b, v28.b[0] + dup v26.16b, v28.b[1] +.endif + st1 {v0.16b, v1.16b}, [x0], x2 // dst + b.gt 1b + +.if \oy + cmp w12, #0 + mov w9, w12 // restore actual remaining h + b.gt L(fguv_loop_sx0_csfl\csfl()_\ox()0) +.endif + b 9f +.endm + fguv_loop_sx0 0, 0, 0 + fguv_loop_sx0 0, 0, 1 + fguv_loop_sx0 0, 1, 0 + fguv_loop_sx0 0, 1, 1 + fguv_loop_sx0 
1, 0, 0 + fguv_loop_sx0 1, 0, 1 + fguv_loop_sx0 1, 1, 0 + fguv_loop_sx0 1, 1, 1 + +9: + ldr d8, [sp, #16] + ldr x30, [sp], #32 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(fguv_loop_sx0_tbl): + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_00) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_01) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_10) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_11) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_00) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_01) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_10) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_11) +endfunc + +function fguv_loop_sx1_neon +.macro fguv_loop_sx1 csfl, ox, oy +L(fguv_loop_sx1_csfl\csfl()_\ox\oy): + AARCH64_VALID_JUMP_TARGET +1: + ld1 {v0.16b, v1.16b}, [x6], x7 // luma + ld1 {v6.16b}, [x1], x2 // src +.if \ox + ld1 {v20.8b}, [x4], x10 // grain_lut old +.endif +.if \oy + ld1 {v22.16b}, [x8], x10 // grain_lut top +.endif +.if \ox && \oy + ld1 {v21.8b}, [x11], x10 // grain_lut top old +.endif + ld1 {v18.16b}, [x5], x10 // grain_lut + + uaddlp v2.8h, v0.16b + uaddlp v3.8h, v1.16b +.if \csfl + rshrn v0.8b, v2.8h, #1 + rshrn2 v0.16b, v3.8h, #1 +.else + urshr v2.8h, v2.8h, #1 + urshr v3.8h, v3.8h, #1 + uxtl v0.8h, v6.8b + uxtl2 v1.8h, v6.16b + mul v2.8h, v2.8h, v8.h[0] + mul v3.8h, v3.8h, v8.h[0] + mul v0.8h, v0.8h, v8.h[1] + mul v1.8h, v1.8h, v8.h[1] + sqadd v2.8h, v2.8h, v0.8h + sqadd v3.8h, v3.8h, v1.8h + sshr v2.8h, v2.8h, #6 + sshr v3.8h, v3.8h, #6 + add v2.8h, v2.8h, v24.8h + add v3.8h, v3.8h, v24.8h + sqxtun v0.8b, v2.8h + sqxtun2 v0.16b, v3.8h +.endif + + bl gather16_neon + +.if \ox + smull v20.8h, v20.8b, v27.8b + smlal v20.8h, v18.8b, v28.8b +.endif + +.if \oy +.if \ox + smull v21.8h, v21.8b, v27.8b + smlal v21.8h, v22.8b, v28.8b + sqrshrn v20.8b, v20.8h, #5 + sqrshrn v21.8b, v21.8h, #5 +.endif + +.if \ox + smull v16.8h, v20.8b, v26.8b +.else + smull v16.8h, v18.8b, v26.8b +.endif + smull2 v17.8h, v18.16b, v26.16b +.if \ox + smlal v16.8h, v21.8b, v25.8b +.else + smlal v16.8h, v22.8b, v25.8b +.endif + smlal2 v17.8h, v22.16b, v25.16b + sqrshrn v22.8b, v16.8h, #5 + sqrshrn2 v22.16b, v17.8h, #5 +.endif + + // sxtl of grain +.if \oy + sxtl v16.8h, v22.8b + sxtl2 v17.8h, v22.16b +.elseif \ox + sqrshrn v20.8b, v20.8h, #5 + sxtl2 v17.8h, v18.16b + sxtl v16.8h, v20.8b +.else + sxtl v16.8h, v18.8b + sxtl2 v17.8h, v18.16b +.endif + + uxtl v2.8h, v4.8b // scaling + uxtl2 v3.8h, v4.16b + + mul v16.8h, v16.8h, v2.8h // scaling * grain + mul v17.8h, v17.8h, v3.8h + + srshl v16.8h, v16.8h, v29.8h // round2(scaling * grain, scaling_shift) + srshl v17.8h, v17.8h, v29.8h + + uaddw v16.8h, v16.8h, v6.8b // *src + noise + uaddw2 v17.8h, v17.8h, v6.16b + + sqxtun v0.8b, v16.8h + sqxtun2 v0.16b, v17.8h + + umax v0.16b, v0.16b, v30.16b + umin v0.16b, v0.16b, v31.16b + +.if \oy + mov v16.16b, v25.16b +.endif + subs w9, w9, #1 +.if \oy + mov v25.16b, v26.16b + mov v26.16b, v16.16b +.endif + st1 {v0.16b}, [x0], x2 // dst + b.gt 1b + +.if \oy + cmp w12, #0 + mov w9, w12 // restore actual remaining h + b.gt L(fguv_loop_sx1_csfl\csfl()_\ox()0) +.endif + + b 9f +.endm + fguv_loop_sx1 0, 0, 0 + fguv_loop_sx1 0, 0, 1 + fguv_loop_sx1 0, 1, 0 + fguv_loop_sx1 0, 1, 1 + fguv_loop_sx1 1, 0, 0 + fguv_loop_sx1 1, 0, 1 + fguv_loop_sx1 1, 1, 0 + fguv_loop_sx1 1, 1, 1 + +9: + ldr d8, [sp, #16] + ldr x30, [sp], #32 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(fguv_loop_sx1_tbl): + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_00) + .hword L(fguv_loop_sx1_tbl) - 
L(fguv_loop_sx1_csfl0_01) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_10) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_11) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_00) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_01) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_10) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_11) +endfunc diff --git a/third_party/dav1d/src/arm/64/filmgrain16.S b/third_party/dav1d/src/arm/64/filmgrain16.S new file mode 100644 index 0000000000000..7c4ff6dda9435 --- /dev/null +++ b/third_party/dav1d/src/arm/64/filmgrain16.S @@ -0,0 +1,1997 @@ +/* + * Copyright © 2021, VideoLAN and dav1d authors + * Copyright © 2021, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/arm/asm.S" +#include "util.S" +#include "src/arm/asm-offsets.h" + +#define GRAIN_WIDTH 82 +#define GRAIN_HEIGHT 73 + +#define SUB_GRAIN_WIDTH 44 +#define SUB_GRAIN_HEIGHT 38 + +.macro increment_seed steps, shift=1 + lsr w11, w2, #3 + lsr w12, w2, #12 + lsr w13, w2, #1 + eor w11, w2, w11 // (r >> 0) ^ (r >> 3) + eor w12, w12, w13 // (r >> 12) ^ (r >> 1) + eor w11, w11, w12 // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1) +.if \shift + lsr w2, w2, #\steps +.endif + and w11, w11, #((1 << \steps) - 1) // bit +.if \shift + orr w2, w2, w11, lsl #(16 - \steps) // *state +.else + orr w2, w2, w11, lsl #16 // *state +.endif +.endm + +.macro read_rand dest, bits, age + ubfx \dest, x2, #16 - \bits - \age, #\bits +.endm + +.macro read_shift_rand dest, bits + ubfx \dest, x2, #17 - \bits, #\bits + lsr w2, w2, #1 +.endm + +// special calling convention: +// w2 holds seed +// x3 holds dav1d_gaussian_sequence +// clobbers x11-x15 +// returns in v0.8h +function get_gaussian_neon + increment_seed 4 + read_rand x14, 11, 3 + read_rand x15, 11, 2 + add x14, x3, x14, lsl #1 + add x15, x3, x15, lsl #1 + ld1 {v0.h}[0], [x14] + read_rand x14, 11, 1 + ld1 {v0.h}[1], [x15] + add x14, x3, x14, lsl #1 + read_rand x15, 11, 0 + increment_seed 4 + add x15, x3, x15, lsl #1 + ld1 {v0.h}[2], [x14] + read_rand x14, 11, 3 + ld1 {v0.h}[3], [x15] + add x14, x3, x14, lsl #1 + read_rand x15, 11, 2 + ld1 {v0.h}[4], [x14] + add x15, x3, x15, lsl #1 + read_rand x14, 11, 1 + ld1 {v0.h}[5], [x15] + read_rand x15, 11, 0 + add x14, x3, x14, lsl #1 + add x15, x3, x15, lsl #1 + ld1 {v0.h}[6], [x14] + ld1 {v0.h}[7], [x15] + ret +endfunc + +.macro store_grain_row r0, r1, r2, r3, r4, r5 + st1 {\r0().16b,\r1().16b}, [x0], #32 + st1 {\r2().16b,\r3().16b}, [x0], #32 + st1 {\r4().16b}, [x0], #16 + st1 {\r5().h}[0], [x0], #2 +.endm + +function get_grain_2_neon + increment_seed 2 + read_rand x14, 11, 1 + read_rand x15, 11, 0 + add x14, x3, x14, lsl #1 + add x15, x3, x15, lsl #1 + ld1 {v0.h}[0], [x14] + ld1 {v0.h}[1], [x15] + srshl v0.4h, v0.4h, v31.4h + ret +endfunc + +.macro get_grain_2 dst + bl get_grain_2_neon +.ifnc \dst, v0 + mov \dst().8b, v0.8b +.endif +.endm + +function get_grain_4_neon + increment_seed 4 + read_rand x14, 11, 3 + read_rand x15, 11, 2 + add x14, x3, x14, lsl #1 + add x15, x3, x15, lsl #1 + ld1 {v0.h}[0], [x14] + read_rand x14, 11, 1 + ld1 {v0.h}[1], [x15] + add x14, x3, x14, lsl #1 + read_rand x15, 11, 0 + add x15, x3, x15, lsl #1 + ld1 {v0.h}[2], [x14] + ld1 {v0.h}[3], [x15] + srshl v0.4h, v0.4h, v31.4h + ret +endfunc + +.macro get_grain_4 dst + bl get_grain_4_neon +.ifnc \dst, v0 + mov \dst().8b, v0.8b +.endif +.endm + +// w15 holds the number of entries to produce +// w14, w16 and w17 hold the previous output entries +// v0 holds the vector of produced entries +// v1 holds the input vector of sums from above +.macro output_lag n +function output_lag\n()_neon +1: + read_shift_rand x13, 11 + mov w11, v1.s[0] + ldrsh w12, [x3, x13, lsl #1] + ext v0.16b, v0.16b, v0.16b, #2 +.if \n == 1 + madd w11, w14, w4, w11 // sum (above) + *coeff * prev output +.elseif \n == 2 + madd w11, w16, w4, w11 // sum (above) + *coeff * prev output 1 + madd w11, w14, w17, w11 // += *coeff * prev output 2 + mov w16, w14 +.else + madd w11, w17, w4, w11 // sum (above) + *coeff * prev output 1 + madd w11, w16, w20, w11 // sum (above) + *coeff * prev output 2 + madd w11, w14, w21, w11 // += *coeff * prev output 3 + mov w17, w16 + mov w16, w14 +.endif + add w14, w11, w8 // 1 << (ar_coeff_shift - 1) + add w12, w12, w10 // 1 << (4 - 
bitdepth_min_8 + grain_scale_shift - 1) + asr w14, w14, w7 // >> ar_coeff_shift + asr w12, w12, w9 // >> (4 - bitdepth_min_8 + grain_scale_shift) + add w14, w14, w12 + cmp w14, w5 + csel w14, w14, w5, le + cmp w14, w6 + csel w14, w14, w6, ge + subs w15, w15, #1 + ext v1.16b, v1.16b, v1.16b, #4 + ins v0.h[7], w14 + b.gt 1b + ret +endfunc +.endm + +output_lag 1 +output_lag 2 +output_lag 3 + + +function sum_lag1_above_neon + sub x12, x0, #1*GRAIN_WIDTH*2 - 16 + ld1 {v18.8h}, [x12] // load top right + + ext v0.16b, v16.16b, v17.16b, #14 // top left, top mid + ext v1.16b, v17.16b, v18.16b, #2 // top mid, top right + + smull v4.4s, v17.4h, v28.4h + smlal v4.4s, v0.4h, v27.4h + smlal v4.4s, v1.4h, v29.4h + smull2 v5.4s, v17.8h, v28.8h + smlal2 v5.4s, v0.8h, v27.8h + smlal2 v5.4s, v1.8h, v29.8h + + mov v16.16b, v17.16b + mov v17.16b, v18.16b + + ret +endfunc + +.macro sum_lag_n_body lag, type, uv_layout, edge, elems, uv_coeff + bl sum_\lag()_above_neon +.ifc \type, uv_420 + add x12, x19, #GRAIN_WIDTH*2 + ld1 {v22.8h, v23.8h}, [x19], #32 + ld1 {v24.8h, v25.8h}, [x12] + addp v22.8h, v22.8h, v23.8h + addp v23.8h, v24.8h, v25.8h + add v22.8h, v22.8h, v23.8h + srshr v0.8h, v22.8h, #2 +.endif +.ifc \type, uv_422 + ld1 {v22.8h, v23.8h}, [x19], #32 + addp v22.8h, v22.8h, v23.8h + srshr v0.8h, v22.8h, #1 +.endif +.ifc \type, uv_444 + ld1 {v0.8h}, [x19], #16 +.endif +.if \uv_layout +.ifnb \uv_coeff + dup v1.8b, \uv_coeff + sxtl v1.8h, v1.8b + smlal v4.4s, v0.4h, v1.4h + smlal2 v5.4s, v0.8h, v1.8h +.else + smlal v4.4s, v0.4h, v30.4h + smlal2 v5.4s, v0.8h, v30.8h +.endif +.endif +.if \uv_layout && \elems == 8 + b sum_\lag()_y_\edge()_start +.elseif \uv_layout == 444 && \elems == 7 + b sum_\lag()_y_\edge()_start +.elseif \uv_layout == 422 && \elems == 1 + b sum_\lag()_uv_420_\edge()_start +.else +sum_\lag()_\type()_\edge()_start: +.if \elems > 4 +.ifc \edge, left + increment_seed 4 + read_rand x12, 11, 3 + read_rand x13, 11, 2 + read_rand x14, 11, 1 + add x12, x3, x12, lsl #1 + add x13, x3, x13, lsl #1 + add x14, x3, x14, lsl #1 + ld1 {v0.h}[5], [x12] + ld1 {v0.h}[6], [x13] + ld1 {v0.h}[7], [x14] + lsl x2, x2, #1 // shift back the state as if we'd done increment_seed with shift=0 + srshl v0.8h, v0.8h, v31.8h + ext v4.16b, v4.16b, v4.16b, #12 +.ifc \lag, lag3 + smov w17, v0.h[5] +.endif +.ifnc \lag, lag1 + smov w16, v0.h[6] +.endif + smov w14, v0.h[7] + + mov v1.16b, v4.16b + mov w15, #1 + bl output_\lag()_neon +.else + increment_seed 4, shift=0 + mov v1.16b, v4.16b + mov w15, #4 + bl output_\lag()_neon +.endif + + increment_seed 4, shift=0 + mov v1.16b, v5.16b +.ifc \edge, right + mov w15, #3 + bl output_\lag()_neon + read_shift_rand x15, 11 + add x15, x3, x15, lsl #1 + ld1 {v1.h}[0], [x15] + srshl v1.4h, v1.4h, v31.4h + ext v0.16b, v0.16b, v1.16b, #2 +.else + mov w15, #4 + bl output_\lag()_neon +.endif +.else + // elems == 1 + increment_seed 4, shift=0 + mov v1.16b, v4.16b + mov w15, #1 + bl output_\lag()_neon + lsr w2, w2, #3 + + read_rand x12, 11, 2 + read_rand x13, 11, 1 + read_rand x14, 11, 0 + add x12, x3, x12, lsl #1 + add x13, x3, x13, lsl #1 + add x14, x3, x14, lsl #1 + ld1 {v1.h}[0], [x12] + ld1 {v1.h}[1], [x13] + ld1 {v1.h}[2], [x14] + srshl v1.4h, v1.4h, v31.4h + ext v0.16b, v0.16b, v1.16b, #14 +.endif + st1 {v0.8h}, [x0], #16 + ldr x30, [sp], #16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.endif +.endm + +.macro sum_lag1_func type, uv_layout, edge, elems=8 +function sum_\type()_lag1_\edge()_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! 
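The output_lag{1,2,3}_neon helpers above emit one auto-regressive grain sample per iteration. In scalar form each sample is approximately the sketch below; the helper name and argument split are illustrative, and the coefficient-weighted neighbour sum is what the sum_lag*_above_neon routines accumulate:

    // One AR grain sample, scalar form of what output_lag{1,2,3}_neon compute
    // per lane. `sum` is the coefficient-weighted sum of already generated
    // neighbour samples plus the previous outputs; `gauss` is the value read
    // from dav1d_gaussian_sequence with 11 random bits.
    static inline int ar_grain_sample(int sum, int ar_coeff_shift,
                                      int gauss,
                                      int grain_shift,  // 4 - bitdepth_min_8 + grain_scale_shift
                                      int grain_min, int grain_max)
    {
        const int ar  = (sum   + ((1 << ar_coeff_shift) >> 1)) >> ar_coeff_shift;
        const int g   = (gauss + ((1 << grain_shift)    >> 1)) >> grain_shift;
        const int out = ar + g;
        return out < grain_min ? grain_min : out > grain_max ? grain_max : out;
    }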
+.ifc \edge, left + sub x12, x0, #1*GRAIN_WIDTH*2 + ld1 {v17.8h}, [x12] // load the previous block right above +.endif + sum_lag_n_body lag1, \type, \uv_layout, \edge, \elems +endfunc +.endm + +sum_lag1_func y, 0, left +sum_lag1_func y, 0, mid +sum_lag1_func y, 0, right, 7 +sum_lag1_func uv_444, 444, left +sum_lag1_func uv_444, 444, mid +sum_lag1_func uv_444, 444, right, 7 +sum_lag1_func uv_422, 422, left +sum_lag1_func uv_422, 422, mid +sum_lag1_func uv_422, 422, right, 1 +sum_lag1_func uv_420, 420, left +sum_lag1_func uv_420, 420, mid +sum_lag1_func uv_420, 420, right, 1 + + +function sum_lag2_above_neon + sub x12, x0, #2*GRAIN_WIDTH*2 - 16 + sub x13, x0, #1*GRAIN_WIDTH*2 - 16 + ld1 {v18.8h}, [x12] // load top right + ld1 {v21.8h}, [x13] + + dup v26.8b, v30.b[0] + ext v22.16b, v16.16b, v17.16b, #12 // top left, top mid + dup v27.8b, v30.b[1] + ext v23.16b, v16.16b, v17.16b, #14 + sxtl v26.8h, v26.8b + dup v28.8b, v30.b[3] + ext v0.16b, v17.16b, v18.16b, #2 // top mid, top right + sxtl v27.8h, v27.8b + dup v29.8b, v30.b[4] + ext v1.16b, v17.16b, v18.16b, #4 + sxtl v28.8h, v28.8b + sxtl v29.8h, v29.8b + + smull v4.4s, v22.4h, v26.4h + smlal v4.4s, v23.4h, v27.4h + smlal v4.4s, v0.4h, v28.4h + smlal v4.4s, v1.4h, v29.4h + smull2 v5.4s, v22.8h, v26.8h + smlal2 v5.4s, v23.8h, v27.8h + smlal2 v5.4s, v0.8h, v28.8h + smlal2 v5.4s, v1.8h, v29.8h + + dup v26.16b, v30.b[5] + ext v22.16b, v19.16b, v20.16b, #12 // top left, top mid + dup v27.16b, v30.b[6] + ext v23.16b, v19.16b, v20.16b, #14 + sxtl v26.8h, v26.8b + dup v28.16b, v30.b[8] + ext v0.16b, v20.16b, v21.16b, #2 // top mid, top right + sxtl v27.8h, v27.8b + dup v29.16b, v30.b[9] + ext v1.16b, v20.16b, v21.16b, #4 + sxtl v28.8h, v28.8b + sxtl v29.8h, v29.8b + + smlal v4.4s, v22.4h, v26.4h + smlal v4.4s, v23.4h, v27.4h + smlal v4.4s, v0.4h, v28.4h + smlal v4.4s, v1.4h, v29.4h + smlal2 v5.4s, v22.8h, v26.8h + smlal2 v5.4s, v23.8h, v27.8h + smlal2 v5.4s, v0.8h, v28.8h + smlal2 v5.4s, v1.8h, v29.8h + + dup v26.16b, v30.b[2] + dup v27.16b, v30.b[7] + sxtl v26.8h, v26.8b + sxtl v27.8h, v27.8b + + smlal v4.4s, v17.4h, v26.4h + smlal v4.4s, v20.4h, v27.4h + smlal2 v5.4s, v17.8h, v26.8h + smlal2 v5.4s, v20.8h, v27.8h + mov v16.16b, v17.16b + mov v17.16b, v18.16b + + mov v19.16b, v20.16b + mov v20.16b, v21.16b + ret +endfunc + +.macro sum_lag2_func type, uv_layout, edge, elems=8 +function sum_\type()_lag2_\edge()_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! 
+.ifc \edge, left + sub x12, x0, #2*GRAIN_WIDTH*2 + sub x13, x0, #1*GRAIN_WIDTH*2 + ld1 {v17.8h}, [x12] // load the previous block right above + ld1 {v20.8h}, [x13] +.endif + sum_lag_n_body lag2, \type, \uv_layout, \edge, \elems, v30.b[12] +endfunc +.endm + +sum_lag2_func y, 0, left +sum_lag2_func y, 0, mid +sum_lag2_func y, 0, right, 7 +sum_lag2_func uv_444, 444, left +sum_lag2_func uv_444, 444, mid +sum_lag2_func uv_444, 444, right, 7 +sum_lag2_func uv_422, 422, left +sum_lag2_func uv_422, 422, mid +sum_lag2_func uv_422, 422, right, 1 +sum_lag2_func uv_420, 420, left +sum_lag2_func uv_420, 420, mid +sum_lag2_func uv_420, 420, right, 1 + + +function sum_lag3_above_neon + sub x11, x0, #3*GRAIN_WIDTH*2 - 16 + sub x12, x0, #2*GRAIN_WIDTH*2 - 16 + sub x13, x0, #1*GRAIN_WIDTH*2 - 16 + ld1 {v15.8h}, [x11] // load top right + ld1 {v18.8h}, [x12] + ld1 {v21.8h}, [x13] + + dup v22.8b, v29.b[0] + ext v8.16b, v13.16b, v14.16b, #10 // top left, top mid + dup v23.8b, v29.b[1] + ext v9.16b, v13.16b, v14.16b, #12 + sxtl v22.8h, v22.8b + dup v24.8b, v29.b[2] + sxtl v23.8h, v23.8b + dup v25.8b, v29.b[3] + ext v10.16b, v13.16b, v14.16b, #14 + sxtl v24.8h, v24.8b + dup v26.8b, v29.b[4] + ext v11.16b, v14.16b, v15.16b, #2 // top mid, top right + sxtl v25.8h, v25.8b + dup v27.8b, v29.b[5] + ext v12.16b, v14.16b, v15.16b, #4 + sxtl v26.8h, v26.8b + dup v28.8b, v29.b[6] + ext v13.16b, v14.16b, v15.16b, #6 + sxtl v27.8h, v27.8b + sxtl v28.8h, v28.8b + + smull v4.4s, v8.4h, v22.4h + smlal v4.4s, v9.4h, v23.4h + smlal v4.4s, v10.4h, v24.4h + smlal v4.4s, v11.4h, v26.4h + smlal v4.4s, v12.4h, v27.4h + smlal v4.4s, v13.4h, v28.4h + smlal v4.4s, v14.4h, v25.4h + smull2 v5.4s, v8.8h, v22.8h + smlal2 v5.4s, v9.8h, v23.8h + smlal2 v5.4s, v10.8h, v24.8h + smlal2 v5.4s, v11.8h, v26.8h + smlal2 v5.4s, v12.8h, v27.8h + smlal2 v5.4s, v13.8h, v28.8h + smlal2 v5.4s, v14.8h, v25.8h + + dup v22.8b, v29.b[7] + ext v8.16b, v16.16b, v17.16b, #10 // top left, top mid + dup v23.8b, v29.b[8] + ext v9.16b, v16.16b, v17.16b, #12 + sxtl v22.8h, v22.8b + dup v24.8b, v29.b[9] + sxtl v23.8h, v23.8b + dup v25.8b, v29.b[10] + ext v10.16b, v16.16b, v17.16b, #14 + sxtl v24.8h, v24.8b + dup v26.8b, v29.b[11] + ext v11.16b, v17.16b, v18.16b, #2 // top mid, top right + sxtl v25.8h, v25.8b + dup v27.8b, v29.b[12] + ext v12.16b, v17.16b, v18.16b, #4 + sxtl v26.8h, v26.8b + dup v28.8b, v29.b[13] + ext v13.16b, v17.16b, v18.16b, #6 + sxtl v27.8h, v27.8b + sxtl v28.8h, v28.8b + + smlal v4.4s, v8.4h, v22.4h + smlal v4.4s, v9.4h, v23.4h + smlal v4.4s, v10.4h, v24.4h + smlal v4.4s, v11.4h, v26.4h + smlal v4.4s, v12.4h, v27.4h + smlal v4.4s, v13.4h, v28.4h + smlal v4.4s, v17.4h, v25.4h + smlal2 v5.4s, v8.8h, v22.8h + smlal2 v5.4s, v9.8h, v23.8h + smlal2 v5.4s, v10.8h, v24.8h + smlal2 v5.4s, v11.8h, v26.8h + smlal2 v5.4s, v12.8h, v27.8h + smlal2 v5.4s, v13.8h, v28.8h + smlal2 v5.4s, v17.8h, v25.8h + + dup v22.8b, v29.b[14] + ext v8.16b, v19.16b, v20.16b, #10 // top left, top mid + dup v23.8b, v29.b[15] + ext v9.16b, v19.16b, v20.16b, #12 + sxtl v22.8h, v22.8b + dup v24.8b, v30.b[0] + sxtl v23.8h, v23.8b + dup v25.8b, v30.b[1] + ext v10.16b, v19.16b, v20.16b, #14 + sxtl v24.8h, v24.8b + dup v26.8b, v30.b[2] + ext v11.16b, v20.16b, v21.16b, #2 // top mid, top right + sxtl v25.8h, v25.8b + dup v27.8b, v30.b[3] + ext v12.16b, v20.16b, v21.16b, #4 + sxtl v26.8h, v26.8b + dup v28.8b, v30.b[4] + ext v13.16b, v20.16b, v21.16b, #6 + sxtl v27.8h, v27.8b + sxtl v28.8h, v28.8b + + smlal v4.4s, v8.4h, v22.4h + smlal v4.4s, v9.4h, v23.4h + smlal v4.4s, v10.4h, v24.4h + 
smlal v4.4s, v11.4h, v26.4h + smlal v4.4s, v12.4h, v27.4h + smlal v4.4s, v13.4h, v28.4h + smlal v4.4s, v20.4h, v25.4h + mov v16.16b, v17.16b + mov v17.16b, v18.16b + smlal2 v5.4s, v8.8h, v22.8h + smlal2 v5.4s, v9.8h, v23.8h + smlal2 v5.4s, v10.8h, v24.8h + smlal2 v5.4s, v11.8h, v26.8h + smlal2 v5.4s, v12.8h, v27.8h + smlal2 v5.4s, v13.8h, v28.8h + smlal2 v5.4s, v20.8h, v25.8h + + mov v13.16b, v14.16b + mov v14.16b, v15.16b + + mov v19.16b, v20.16b + mov v20.16b, v21.16b + ret +endfunc + +.macro sum_lag3_func type, uv_layout, edge, elems=8 +function sum_\type()_lag3_\edge()_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! +.ifc \edge, left + sub x11, x0, #3*GRAIN_WIDTH*2 + sub x12, x0, #2*GRAIN_WIDTH*2 + sub x13, x0, #1*GRAIN_WIDTH*2 + ld1 {v14.8h}, [x11] // load the previous block right above + ld1 {v17.8h}, [x12] + ld1 {v20.8h}, [x13] +.endif + sum_lag_n_body lag3, \type, \uv_layout, \edge, \elems, v30.b[8] +endfunc +.endm + +sum_lag3_func y, 0, left +sum_lag3_func y, 0, mid +sum_lag3_func y, 0, right, 7 +sum_lag3_func uv_444, 444, left +sum_lag3_func uv_444, 444, mid +sum_lag3_func uv_444, 444, right, 7 +sum_lag3_func uv_422, 422, left +sum_lag3_func uv_422, 422, mid +sum_lag3_func uv_422, 422, right, 1 +sum_lag3_func uv_420, 420, left +sum_lag3_func uv_420, 420, mid +sum_lag3_func uv_420, 420, right, 1 + +function generate_grain_rows_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! +1: + mov w16, #80 +2: + bl get_gaussian_neon + srshl v0.8h, v0.8h, v31.8h + subs w16, w16, #8 + st1 {v0.8h}, [x0], #16 + b.gt 2b + get_grain_2 v0 + subs w1, w1, #1 + st1 {v0.s}[0], [x0], #4 + b.gt 1b + ldr x30, [sp], #16 + AARCH64_VALIDATE_LINK_REGISTER + ret +endfunc + +function generate_grain_rows_44_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! +1: + mov w16, #40 +2: + bl get_gaussian_neon + srshl v0.8h, v0.8h, v31.8h + subs w16, w16, #8 + st1 {v0.8h}, [x0], #16 + b.gt 2b + get_grain_4 v0 + subs w1, w1, #1 + st1 {v0.4h}, [x0] + add x0, x0, #GRAIN_WIDTH*2-80 + b.gt 1b + ldr x30, [sp], #16 + AARCH64_VALIDATE_LINK_REGISTER + ret +endfunc + +function gen_grain_uv_444_lag0_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! + ld1 {v4.8h}, [x19], #16 +gen_grain_uv_lag0_8_start: + bl get_gaussian_neon + srshl v0.8h, v0.8h, v31.8h +gen_grain_uv_lag0_8_add: + and v4.16b, v4.16b, v1.16b + smull v2.4s, v4.4h, v27.4h + smull2 v3.4s, v4.8h, v27.8h + srshl v2.4s, v2.4s, v28.4s + srshl v3.4s, v3.4s, v28.4s + sqxtn v2.4h, v2.4s + sqxtn2 v2.8h, v3.4s + sqadd v2.8h, v2.8h, v0.8h + smin v2.8h, v2.8h, v25.8h + smax v2.8h, v2.8h, v26.8h + st1 {v2.8h}, [x0], #16 + ldr x30, [sp], #16 + AARCH64_VALIDATE_LINK_REGISTER + ret +endfunc + +function gen_grain_uv_420_lag0_8_neon + AARCH64_SIGN_LINK_REGISTER + add x12, x19, #GRAIN_WIDTH*2 + str x30, [sp, #-16]! + ld1 {v16.8h, v17.8h}, [x19], #32 + ld1 {v18.8h, v19.8h}, [x12] + addp v16.8h, v16.8h, v17.8h + addp v17.8h, v18.8h, v19.8h + add v16.8h, v16.8h, v17.8h + srshr v4.8h, v16.8h, #2 + b gen_grain_uv_lag0_8_start +endfunc + +function gen_grain_uv_422_lag0_8_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! + ld1 {v16.8h, v17.8h}, [x19], #32 + addp v16.8h, v16.8h, v17.8h + srshr v4.8h, v16.8h, #1 + b gen_grain_uv_lag0_8_start +endfunc + +function gen_grain_uv_420_lag0_4_neon + add x12, x19, #GRAIN_WIDTH*2 + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! 
+ ld1 {v16.4h, v17.4h}, [x19] + ld1 {v18.4h, v19.4h}, [x12] + add x19, x19, #32 + addp v16.4h, v16.4h, v17.4h + addp v17.4h, v18.4h, v19.4h + add v16.4h, v16.4h, v17.4h + srshr v4.4h, v16.4h, #2 + get_grain_4 v0 + b gen_grain_uv_lag0_8_add +endfunc + +function gen_grain_uv_422_lag0_4_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! + ld1 {v16.4h, v17.4h}, [x19] + add x19, x19, #32 + addp v16.4h, v16.4h, v17.4h + srshr v4.4h, v16.4h, #1 + get_grain_4 v0 + b gen_grain_uv_lag0_8_add +endfunc + +.macro gen_grain_82 type +function generate_grain_\type()_16bpc_neon, export=1 + AARCH64_SIGN_LINK_REGISTER + stp x30, x19, [sp, #-96]! + +.ifc \type, uv_444 + mov w13, w3 + mov w14, #28 + add x19, x1, #3*GRAIN_WIDTH*2 + mov x1, x2 + mul w13, w13, w14 + clz w15, w4 +.else + clz w15, w2 +.endif + movrel x3, X(gaussian_sequence) + sub w15, w15, #24 // -bitdepth_min_8 + ldr w2, [x1, #FGD_SEED] + ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT] +.ifc \type, y + add x4, x1, #FGD_AR_COEFFS_Y +.else + add x4, x1, #FGD_AR_COEFFS_UV +.endif + add w9, w9, w15 // grain_scale_shift - bitdepth_min_8 + adr x16, L(gen_grain_\type()_tbl) + ldr w17, [x1, #FGD_AR_COEFF_LAG] + add w9, w9, #4 + ldrh w17, [x16, w17, uxtw #1] + dup v31.8h, w9 // 4 - bitdepth_min_8 + data->grain_scale_shift + sub x16, x16, w17, uxtw + neg v31.8h, v31.8h + +.ifc \type, uv_444 + cmp w13, #0 + mov w11, #0x49d8 + mov w14, #0xb524 + add x4, x4, w13, uxtw // Add offset to ar_coeffs_uv[1] + csel w11, w11, w14, ne +.endif + + ldr w7, [x1, #FGD_AR_COEFF_SHIFT] + neg w15, w15 // bitdepth_min_8 + mov w8, #1 + mov w10, #1 + lsl w8, w8, w7 // 1 << ar_coeff_shift + lsl w10, w10, w9 // 1 << (4 + data->grain_scale_shift) + lsr w8, w8, #1 // 1 << (ar_coeff_shift - 1) + lsr w10, w10, #1 // 1 << (4 + data->grain_scale_shift - 1) + mov w5, #128 + lsl w5, w5, w15 // 128 << bitdepth_min_8 + neg w6, w5 // -(128 << bitpdeth_min_8) + sub w5, w5, #1 // (128 << bitdepth_min_8) - 1 + +.ifc \type, uv_444 + eor w2, w2, w11 +.endif + + br x16 + +L(generate_grain_\type()_lag0): + AARCH64_VALID_JUMP_TARGET +.ifc \type, y + mov w1, #GRAIN_HEIGHT + bl generate_grain_rows_neon +.else + dup v28.4s, w7 + ld1r {v27.8b}, [x4] // ar_coeffs_uv[0] + movi v0.16b, #0 + movi v1.16b, #255 + dup v25.8h, w5 + dup v26.8h, w6 + ext v29.16b, v0.16b, v1.16b, #10 + ext v30.16b, v1.16b, v0.16b, #2 + neg v28.4s, v28.4s + sxtl v27.8h, v27.8b + + mov w1, #3 + bl generate_grain_rows_neon + mov w1, #GRAIN_HEIGHT-3 +1: + mov v1.16b, v29.16b + bl gen_grain_uv_444_lag0_neon // 8 + movi v1.16b, #255 + bl gen_grain_uv_444_lag0_neon // 16 + bl gen_grain_uv_444_lag0_neon // 24 + bl gen_grain_uv_444_lag0_neon // 32 + bl gen_grain_uv_444_lag0_neon // 40 + bl gen_grain_uv_444_lag0_neon // 48 + bl gen_grain_uv_444_lag0_neon // 56 + bl gen_grain_uv_444_lag0_neon // 64 + bl gen_grain_uv_444_lag0_neon // 72 + mov v1.16b, v30.16b + bl gen_grain_uv_444_lag0_neon // 80 + get_grain_2 v16 + subs w1, w1, #1 + add x19, x19, #4 + st1 {v16.s}[0], [x0], #4 + b.gt 1b +.endif + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(generate_grain_\type()_lag1): + AARCH64_VALID_JUMP_TARGET + ld1r {v27.8b}, [x4], #1 // ar_coeffs_y[0] + ld1r {v28.8b}, [x4], #1 // ar_coeffs_y[1] + ld1r {v29.8b}, [x4] // ar_coeffs_y[2] +.ifc \type, y + ldrsb w4, [x4, #1] // ar_coeffs_y[3] +.else + add x4, x4, #2 +.endif + + mov w1, #3 +.ifc \type, uv_444 + ld1r {v30.8b}, [x4] // ar_coeffs_uv[4] + ldursb w4, [x4, #-1] // ar_coeffs_uv[3] +.endif + bl generate_grain_rows_neon + sxtl v27.8h, v27.8b + sxtl v28.8h, v28.8b + sxtl v29.8h, v29.8b 
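The gen_grain_uv_4xx_lag0_* helpers above, and the uv_420/uv_422 branches of sum_lag_n_body, feed the chroma AR filter with the collocated luma grain averaged according to the chroma subsampling (addp plus srshr #2 for 4:2:0, srshr #1 for 4:2:2, a direct load for 4:4:4). A scalar sketch, with illustrative indexing:

    #include <stdint.h>

    #define GRAIN_WIDTH 82

    // Collocated luma grain value feeding the chroma grain generator:
    // rounded average of a 2x2 (4:2:0), 2x1 (4:2:2) or 1x1 (4:4:4) block.
    static inline int collocated_luma_grain(const int16_t buf_y[][GRAIN_WIDTH],
                                            int y, int x, int subx, int suby)
    {
        int luma = 0;
        for (int i = 0; i <= suby; i++)
            for (int j = 0; j <= subx; j++)
                luma += buf_y[(y << suby) + i][(x << subx) + j];
        return (luma + ((1 << (subx + suby)) >> 1)) >> (subx + suby); // round2
    }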
+.ifc \type, uv_444 + sxtl v30.8h, v30.8b +.endif + + mov w1, #GRAIN_HEIGHT - 3 +1: + bl sum_\type()_lag1_left_neon // 8 + bl sum_\type()_lag1_mid_neon // 16 + bl sum_\type()_lag1_mid_neon // 24 + bl sum_\type()_lag1_mid_neon // 32 + bl sum_\type()_lag1_mid_neon // 40 + bl sum_\type()_lag1_mid_neon // 48 + bl sum_\type()_lag1_mid_neon // 56 + bl sum_\type()_lag1_mid_neon // 64 + bl sum_\type()_lag1_mid_neon // 72 + bl sum_\type()_lag1_right_neon // 80 + get_grain_2 v16 + subs w1, w1, #1 +.ifc \type, uv_444 + add x19, x19, #4 +.endif + st1 {v16.s}[0], [x0], #4 + b.gt 1b + + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(generate_grain_\type()_lag2): + AARCH64_VALID_JUMP_TARGET + ld1 {v30.16b}, [x4] // ar_coeffs_y[0-11], ar_coeffs_uv[0-12] + + smov w4, v30.b[10] + smov w17, v30.b[11] + + mov w1, #3 + bl generate_grain_rows_neon + + mov w1, #GRAIN_HEIGHT - 3 +1: + bl sum_\type()_lag2_left_neon // 8 + bl sum_\type()_lag2_mid_neon // 16 + bl sum_\type()_lag2_mid_neon // 24 + bl sum_\type()_lag2_mid_neon // 32 + bl sum_\type()_lag2_mid_neon // 40 + bl sum_\type()_lag2_mid_neon // 48 + bl sum_\type()_lag2_mid_neon // 56 + bl sum_\type()_lag2_mid_neon // 64 + bl sum_\type()_lag2_mid_neon // 72 + bl sum_\type()_lag2_right_neon // 80 + get_grain_2 v16 + subs w1, w1, #1 +.ifc \type, uv_444 + add x19, x19, #4 +.endif + st1 {v16.s}[0], [x0], #4 + b.gt 1b + + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(generate_grain_\type()_lag3): + AARCH64_VALID_JUMP_TARGET + ld1 {v29.16b, v30.16b}, [x4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24] + stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] + stp x20, x21, [sp, #80] + + smov w4, v30.b[5] + smov w20, v30.b[6] + smov w21, v30.b[7] + + mov w1, #3 + bl generate_grain_rows_neon + + mov w1, #GRAIN_HEIGHT - 3 +1: + bl sum_\type()_lag3_left_neon // 8 + bl sum_\type()_lag3_mid_neon // 16 + bl sum_\type()_lag3_mid_neon // 24 + bl sum_\type()_lag3_mid_neon // 32 + bl sum_\type()_lag3_mid_neon // 40 + bl sum_\type()_lag3_mid_neon // 48 + bl sum_\type()_lag3_mid_neon // 56 + bl sum_\type()_lag3_mid_neon // 64 + bl sum_\type()_lag3_mid_neon // 72 + bl sum_\type()_lag3_right_neon // 80 + get_grain_2 v16 + subs w1, w1, #1 +.ifc \type, uv_444 + add x19, x19, #4 +.endif + st1 {v16.s}[0], [x0], #4 + b.gt 1b + + ldp x20, x21, [sp, #80] + ldp d14, d15, [sp, #64] + ldp d12, d13, [sp, #48] + ldp d10, d11, [sp, #32] + ldp d8, d9, [sp, #16] + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(gen_grain_\type()_tbl): + .hword L(gen_grain_\type()_tbl) - L(generate_grain_\type()_lag0) + .hword L(gen_grain_\type()_tbl) - L(generate_grain_\type()_lag1) + .hword L(gen_grain_\type()_tbl) - L(generate_grain_\type()_lag2) + .hword L(gen_grain_\type()_tbl) - L(generate_grain_\type()_lag3) +endfunc +.endm + +gen_grain_82 y +gen_grain_82 uv_444 + +.macro set_height dst, type +.ifc \type, uv_420 + mov \dst, #SUB_GRAIN_HEIGHT-3 +.else + mov \dst, #GRAIN_HEIGHT-3 +.endif +.endm + +.macro increment_y_ptr reg, type +.ifc \type, uv_420 + add \reg, \reg, #2*GRAIN_WIDTH*2-(6*32) +.else + sub \reg, \reg, #6*32-GRAIN_WIDTH*2 +.endif +.endm + +.macro gen_grain_44 type +function generate_grain_\type()_16bpc_neon, export=1 + AARCH64_SIGN_LINK_REGISTER + stp x30, x19, [sp, #-96]! 
+ + mov w13, w3 + mov w14, #28 + add x19, x1, #(3*GRAIN_WIDTH-3)*2 + mov x1, x2 + mul w13, w13, w14 + clz w15, w4 + + movrel x3, X(gaussian_sequence) + sub w15, w15, #24 // -bitdepth_min_8 + ldr w2, [x1, #FGD_SEED] + ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT] + add x4, x1, #FGD_AR_COEFFS_UV + add w9, w9, w15 // grain_scale_shift - bitdepth_min_8 + adr x16, L(gen_grain_\type()_tbl) + ldr w17, [x1, #FGD_AR_COEFF_LAG] + add w9, w9, #4 + ldrh w17, [x16, w17, uxtw #1] + dup v31.8h, w9 // 4 - bitdepth_min_8 + data->grain_scale_shift + sub x16, x16, w17, uxtw + neg v31.8h, v31.8h + + cmp w13, #0 + mov w11, #0x49d8 + mov w14, #0xb524 + add x4, x4, w13, uxtw // Add offset to ar_coeffs_uv[1] + csel w11, w11, w14, ne + + ldr w7, [x1, #FGD_AR_COEFF_SHIFT] + neg w15, w15 // bitdepth_min_8 + mov w8, #1 + mov w10, #1 + lsl w8, w8, w7 // 1 << ar_coeff_shift + lsl w10, w10, w9 // 1 << (4 + data->grain_scale_shift) + lsr w8, w8, #1 // 1 << (ar_coeff_shift - 1) + lsr w10, w10, #1 // 1 << (4 + data->grain_scale_shift - 1) + mov w5, #128 + lsl w5, w5, w15 // 128 << bitdepth_min_8 + neg w6, w5 // -(128 << bitpdeth_min_8) + sub w5, w5, #1 // (128 << bitdepth_min_8) - 1 + + eor w2, w2, w11 + + br x16 + +L(generate_grain_\type()_lag0): + AARCH64_VALID_JUMP_TARGET + dup v28.4s, w7 + ld1r {v27.8b}, [x4] // ar_coeffs_uv[0] + movi v0.16b, #0 + movi v1.16b, #255 + dup v25.8h, w5 + dup v26.8h, w6 + ext v29.16b, v0.16b, v1.16b, #10 + ext v30.16b, v1.16b, v0.16b, #14 + neg v28.4s, v28.4s + sxtl v27.8h, v27.8b + + mov w1, #3 + bl generate_grain_rows_44_neon + set_height w1, \type +1: + mov v1.16b, v29.16b + bl gen_grain_\type()_lag0_8_neon // 8 + movi v1.16b, #255 + bl gen_grain_\type()_lag0_8_neon // 16 + bl gen_grain_\type()_lag0_8_neon // 24 + bl gen_grain_\type()_lag0_8_neon // 32 + bl gen_grain_\type()_lag0_8_neon // 40 + mov v1.16b, v30.16b + bl gen_grain_\type()_lag0_4_neon // 44 + subs w1, w1, #1 + increment_y_ptr x19, \type + add x0, x0, #GRAIN_WIDTH*2-6*16 + b.gt 1b + + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(generate_grain_\type()_lag1): + AARCH64_VALID_JUMP_TARGET + ld1r {v27.8b}, [x4], #1 // ar_coeffs_uv[0] + ld1r {v28.8b}, [x4], #1 // ar_coeffs_uv[1] + ld1r {v29.8b}, [x4] // ar_coeffs_uv[2] + add x4, x4, #2 + + mov w1, #3 + ld1r {v30.8b}, [x4] // ar_coeffs_u4[4] + ldursb w4, [x4, #-1] // ar_coeffs_uv[3] + bl generate_grain_rows_44_neon + + sxtl v27.8h, v27.8b + sxtl v28.8h, v28.8b + sxtl v29.8h, v29.8b + sxtl v30.8h, v30.8b + set_height w1, \type +1: + bl sum_\type()_lag1_left_neon // 8 + bl sum_\type()_lag1_mid_neon // 16 + bl sum_\type()_lag1_mid_neon // 24 + bl sum_\type()_lag1_mid_neon // 32 + bl sum_\type()_lag1_mid_neon // 40 + bl sum_\type()_lag1_right_neon // 44 + subs w1, w1, #1 + increment_y_ptr x19, \type + add x0, x0, #GRAIN_WIDTH*2-6*16 + b.gt 1b + + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(generate_grain_\type()_lag2): + AARCH64_VALID_JUMP_TARGET + ld1 {v30.16b}, [x4] // ar_coeffs_uv[0-12] + + smov w4, v30.b[10] + smov w17, v30.b[11] + + mov w1, #3 + bl generate_grain_rows_44_neon + + set_height w1, \type +1: + bl sum_\type()_lag2_left_neon // 8 + bl sum_\type()_lag2_mid_neon // 16 + bl sum_\type()_lag2_mid_neon // 24 + bl sum_\type()_lag2_mid_neon // 32 + bl sum_\type()_lag2_mid_neon // 40 + bl sum_\type()_lag2_right_neon // 44 + subs w1, w1, #1 + increment_y_ptr x19, \type + add x0, x0, #GRAIN_WIDTH*2-6*16 + b.gt 1b + + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(generate_grain_\type()_lag3): + AARCH64_VALID_JUMP_TARGET + 
ldr q29, [x4] // ar_coeffs_uv[0-15] + ldr q30, [x4, #16] // ar_coeffs_uv[16-24] + stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] + stp x20, x21, [sp, #80] + + smov w4, v30.b[5] + smov w20, v30.b[6] + smov w21, v30.b[7] + + mov w1, #3 + bl generate_grain_rows_44_neon + + set_height w1, \type +1: + bl sum_\type()_lag3_left_neon // 8 + bl sum_\type()_lag3_mid_neon // 16 + bl sum_\type()_lag3_mid_neon // 24 + bl sum_\type()_lag3_mid_neon // 32 + bl sum_\type()_lag3_mid_neon // 40 + bl sum_\type()_lag3_right_neon // 44 + subs w1, w1, #1 + increment_y_ptr x19, \type + add x0, x0, #GRAIN_WIDTH*2-6*16 + b.gt 1b + + ldp x20, x21, [sp, #80] + ldp d14, d15, [sp, #64] + ldp d12, d13, [sp, #48] + ldp d10, d11, [sp, #32] + ldp d8, d9, [sp, #16] + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(gen_grain_\type()_tbl): + .hword L(gen_grain_\type()_tbl) - L(generate_grain_\type()_lag0) + .hword L(gen_grain_\type()_tbl) - L(generate_grain_\type()_lag1) + .hword L(gen_grain_\type()_tbl) - L(generate_grain_\type()_lag2) + .hword L(gen_grain_\type()_tbl) - L(generate_grain_\type()_lag3) +endfunc +.endm + +gen_grain_44 uv_420 +gen_grain_44 uv_422 + +.macro gather_interleaved dst1, dst2, src1, src2, off + umov w14, \src1[0] + umov w15, \src2[1] + umov w16, \src1[2] + add x14, x14, x3 + umov w17, \src2[3] + add x15, x15, x3 + ld1 {\dst1}[0+\off], [x14] + umov w14, \src1[4] + add x16, x16, x3 + ld1 {\dst2}[1+\off], [x15] + umov w15, \src2[5] + add x17, x17, x3 + ld1 {\dst1}[2+\off], [x16] + umov w16, \src1[6] + add x14, x14, x3 + ld1 {\dst2}[3+\off], [x17] + umov w17, \src2[7] + add x15, x15, x3 + ld1 {\dst1}[4+\off], [x14] + add x16, x16, x3 + ld1 {\dst2}[5+\off], [x15] + add x17, x17, x3 + ld1 {\dst1}[6+\off], [x16] + ld1 {\dst2}[7+\off], [x17] +.endm + +.macro gather dst1, dst2, src1, src2, src3, src4 + gather_interleaved \dst1, \dst2, \src1, \src3, 0 + gather_interleaved \dst2, \dst1, \src3, \src1, 0 + gather_interleaved \dst1, \dst2, \src2, \src4, 8 + gather_interleaved \dst2, \dst1, \src4, \src2, 8 +.endm + +function gather32_neon + gather v6.b, v7.b, v0.h, v1.h, v2.h, v3.h + ret +endfunc + +function gather16_neon + gather_interleaved v6.b, v7.b, v0.h, v1.h, 0 + gather_interleaved v7.b, v6.b, v1.h, v0.h, 0 + ins v6.d[1], v7.d[0] + ret +endfunc + +const overlap_coeffs_0, align=4 + .short 27, 17, 0, 0 + .short 17, 27, 32, 32 +endconst + +const overlap_coeffs_1, align=4 + .short 23, 0, 0, 0 + .short 22, 32, 32, 32 +endconst + +.macro calc_offset offx, offy, src, sx, sy + and \offy, \src, #0xF // randval & 0xF + lsr \offx, \src, #4 // randval >> 4 +.if \sy == 0 + add \offy, \offy, \offy // 2 * (randval & 0xF) +.endif +.if \sx == 0 + add \offx, \offx, \offx // 2 * (randval >> 4) +.endif +.endm + +.macro add_offset dst, offx, offy, src, stride + madd \dst, \stride, \offy, \src // grain_lut += grain_stride * offy + add \dst, \dst, \offx, uxtw #1 // grain_lut += offx +.endm + +// void dav1d_fgy_32x32_16bpc_neon(pixel *const dst, const pixel *const src, +// const ptrdiff_t stride, +// const uint8_t scaling[SCALING_SIZE], +// const int scaling_shift, +// const entry grain_lut[][GRAIN_WIDTH], +// const int offsets[][2], +// const int h, const ptrdiff_t clip, +// const ptrdiff_t type, +// const int bitdepth_max); +function fgy_32x32_16bpc_neon, export=1 + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-80]! 
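The 16 bpc apply loops that follow keep the scaling step inside 16-bit lanes: the scaling value is pre-shifted by 15 - scaling_shift (the `eor w4, w4, #15` works because scaling_shift fits in four bits) and multiplied with SQRDMULH, which, as the code's own comment notes, computes round2((scaling << (15 - scaling_shift)) * grain, 15). A scalar model of that identity, with illustrative names:

    #include <stdint.h>

    // Scalar model of SQRDMULH: (2*a*b + (1 << 15)) >> 16, saturated
    // (saturation is only reachable for INT16_MIN * INT16_MIN).
    static inline int16_t sqrdmulh16(int16_t a, int16_t b)
    {
        const int64_t p = 2 * (int64_t)a * b + (1 << 15);
        const int64_t r = p >> 16;
        return (int16_t)(r > INT16_MAX ? INT16_MAX : r);
    }

    // With b = scaling << (15 - scaling_shift) this equals
    // round2(scaling * grain, scaling_shift) for the value ranges involved,
    // so no 32-bit multiply per lane is needed.
    static inline int scaled_noise_16bpc(int grain, int scaling, int scaling_shift)
    {
        return sqrdmulh16((int16_t)grain, (int16_t)(scaling << (15 - scaling_shift)));
    }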
+ stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + str d14, [sp, #64] + eor w4, w4, #15 // 15 - scaling_shift + ldr w11, [x6, #8] // offsets[1][0] + ldr w13, [x6, #4] // offsets[0][1] + ldr w15, [x6, #12] // offsets[1][1] + ldr w10, [sp, #96] // bitdepth_max + ldr w6, [x6] // offsets[0][0] + dup v26.8h, w10 // bitdepth_max + clz w10, w10 + ldr w8, [sp, #80] // clip + sub w10, w10, #24 // -bitdepth_min_8 + mov x9, #GRAIN_WIDTH*2 // grain_lut stride + neg w10, w10 // bitdepth_min_8 + + dup v29.8h, w4 // 15 - scaling_shift + dup v27.8h, w10 // bitdepth_min_8 + + movrel x16, overlap_coeffs_0 + + cbz w8, 1f + // clip + movi v30.8h, #16 + movi v31.8h, #235 + sshl v30.8h, v30.8h, v27.8h + sshl v31.8h, v31.8h, v27.8h + b 2f +1: + // no clip + movi v30.8h, #0 + mov v31.16b, v26.16b // bitdepth_max +2: + + ushr v26.8h, v26.8h, #1 // grain_max + not v25.16b, v26.16b // grain_min + + ld1 {v27.4h, v28.4h}, [x16] // overlap_coeffs + + add x5, x5, #18 // grain_lut += 9 + add x5, x5, x9, lsl #3 // grain_lut += 8 * grain_stride + add x5, x5, x9 // grain_lut += grain_stride + + calc_offset w11, w12, w11, 0, 0 + calc_offset w13, w14, w13, 0, 0 + calc_offset w15, w16, w15, 0, 0 + calc_offset w6, w10, w6, 0, 0 + + add_offset x12, w11, x12, x5, x9 + add_offset x14, w13, x14, x5, x9 + add_offset x16, w15, x16, x5, x9 + add_offset x5, w6, x10, x5, x9 + + ldr w11, [sp, #88] // type + adr x13, L(fgy_loop_tbl) + + add x4, x12, #32*2 // grain_lut += BLOCK_SIZE * bx + add x6, x14, x9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by + + tst w11, #1 + ldrh w11, [x13, w11, uxtw #1] + + add x8, x16, x9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by + add x8, x8, #32*2 // grain_lut += BLOCK_SIZE * bx + + sub x11, x13, w11, uxtw + + b.eq 1f + // y overlap + dup v8.8h, v27.h[0] + dup v9.8h, v27.h[1] + mov w10, w7 // backup actual h + mov w7, #2 +1: + br x11 +endfunc + +function fgy_loop_neon +.macro fgy ox, oy +L(loop_\ox\oy): + AARCH64_VALID_JUMP_TARGET +1: + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 // src +.if \ox + ld1 {v20.4h}, [x4], x9 // grain_lut old +.endif +.if \oy + ld1 {v21.8h, v22.8h, v23.8h, v24.8h}, [x6], x9 // grain_lut top +.endif +.if \ox && \oy + ld1 {v14.4h}, [x8], x9 // grain_lut top old +.endif + mvni v4.8h, #0xf0, lsl #8 // 0x0fff + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x9 // grain_lut + + // Make sure that uninitialized pixels out of range past the right + // edge are in range; their actual values shouldn't matter. 
+ and v0.16b, v0.16b, v4.16b + and v1.16b, v1.16b, v4.16b + and v2.16b, v2.16b, v4.16b + and v3.16b, v3.16b, v4.16b + bl gather32_neon + +.if \ox + smull v20.4s, v20.4h, v27.4h + smlal v20.4s, v16.4h, v28.4h +.endif + +.if \oy +.if \ox + smull v14.4s, v14.4h, v27.4h + smlal v14.4s, v21.4h, v28.4h + sqrshrn v20.4h, v20.4s, #5 + sqrshrn v14.4h, v14.4s, #5 + smin v20.4h, v20.4h, v26.4h + smin v14.4h, v14.4h, v26.4h + smax v20.4h, v20.4h, v25.4h + smax v14.4h, v14.4h, v25.4h +.endif + +.if \ox + smull v10.4s, v20.4h, v9.4h +.else + smull v10.4s, v16.4h, v9.4h +.endif + smull2 v11.4s, v16.8h, v9.8h + smull v12.4s, v17.4h, v9.4h + smull2 v13.4s, v17.8h, v9.8h + smull v16.4s, v18.4h, v9.4h + smull2 v17.4s, v18.8h, v9.8h + smull v18.4s, v19.4h, v9.4h + smull2 v19.4s, v19.8h, v9.8h +.if \ox + smlal v10.4s, v14.4h, v8.4h +.else + smlal v10.4s, v21.4h, v8.4h +.endif + smlal2 v11.4s, v21.8h, v8.8h + smlal v12.4s, v22.4h, v8.4h + smlal2 v13.4s, v22.8h, v8.8h + smlal v16.4s, v23.4h, v8.4h + smlal2 v17.4s, v23.8h, v8.8h + smlal v18.4s, v24.4h, v8.4h + smlal2 v19.4s, v24.8h, v8.8h + sqrshrn v10.4h, v10.4s, #5 + sqrshrn2 v10.8h, v11.4s, #5 + sqrshrn v11.4h, v12.4s, #5 + sqrshrn2 v11.8h, v13.4s, #5 + sqrshrn v12.4h, v16.4s, #5 + sqrshrn2 v12.8h, v17.4s, #5 + sqrshrn v13.4h, v18.4s, #5 + sqrshrn2 v13.8h, v19.4s, #5 + smin v16.8h, v10.8h, v26.8h + smin v17.8h, v11.8h, v26.8h + smin v18.8h, v12.8h, v26.8h + smin v19.8h, v13.8h, v26.8h + smax v16.8h, v16.8h, v25.8h + smax v17.8h, v17.8h, v25.8h + smax v18.8h, v18.8h, v25.8h + smax v19.8h, v19.8h, v25.8h +.endif + + uxtl v4.8h, v6.8b // scaling +.if \ox && !\oy + sqrshrn v20.4h, v20.4s, #5 +.endif + uxtl2 v5.8h, v6.16b +.if \ox && !\oy + smin v20.4h, v20.4h, v26.4h +.endif + uxtl v6.8h, v7.8b +.if \ox && !\oy + smax v20.4h, v20.4h, v25.4h +.endif + uxtl2 v7.8h, v7.16b +.if \ox && !\oy + ins v16.d[0], v20.d[0] +.endif + ushl v4.8h, v4.8h, v29.8h // scaling << (15 - scaling_shift) + ushl v5.8h, v5.8h, v29.8h + ushl v6.8h, v6.8h, v29.8h + ushl v7.8h, v7.8h, v29.8h + + sqrdmulh v20.8h, v16.8h, v4.8h // round2((scaling << (15 - scaling_shift) * grain, 15) + sqrdmulh v21.8h, v17.8h, v5.8h + sqrdmulh v22.8h, v18.8h, v6.8h + sqrdmulh v23.8h, v19.8h, v7.8h + + usqadd v0.8h, v20.8h // *src + noise + usqadd v1.8h, v21.8h + usqadd v2.8h, v22.8h + usqadd v3.8h, v23.8h + + umax v0.8h, v0.8h, v30.8h + umax v1.8h, v1.8h, v30.8h + umax v2.8h, v2.8h, v30.8h + umax v3.8h, v3.8h, v30.8h + umin v0.8h, v0.8h, v31.8h + umin v1.8h, v1.8h, v31.8h + umin v2.8h, v2.8h, v31.8h + umin v3.8h, v3.8h, v31.8h + + subs w7, w7, #1 +.if \oy + dup v8.8h, v28.h[0] + dup v9.8h, v28.h[1] +.endif + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x2 // dst + b.gt 1b + +.if \oy + cmp w10, #2 + sub w7, w10, #2 // restore actual remaining h + b.gt L(loop_\ox()0) +.endif + ldr d14, [sp, #64] + ldp d12, d13, [sp, #48] + ldp d10, d11, [sp, #32] + ldp d8, d9, [sp, #16] + ldr x30, [sp], #80 + AARCH64_VALIDATE_LINK_REGISTER + ret +.endm + + fgy 0, 0 + fgy 0, 1 + fgy 1, 0 + fgy 1, 1 + +L(fgy_loop_tbl): + .hword L(fgy_loop_tbl) - L(loop_00) + .hword L(fgy_loop_tbl) - L(loop_01) + .hword L(fgy_loop_tbl) - L(loop_10) + .hword L(fgy_loop_tbl) - L(loop_11) +endfunc + +// void dav1d_fguv_32x32_420_16bpc_neon(pixel *const dst, +// const pixel *const src, +// const ptrdiff_t stride, +// const uint8_t scaling[SCALING_SIZE], +// const Dav1dFilmGrainData *const data, +// const entry grain_lut[][GRAIN_WIDTH], +// const pixel *const luma_row, +// const ptrdiff_t luma_stride, +// const int offsets[][2], +// const ptrdiff_t h, const 
ptrdiff_t uv, +// const ptrdiff_t is_id, +// const ptrdiff_t type, +// const int bitdepth_max); +.macro fguv layout, sx, sy +function fguv_32x32_\layout()_16bpc_neon, export=1 + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-80]! + stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] + + ldp x8, x9, [sp, #80] // offsets, h + ldp x10, x11, [sp, #96] // uv, is_id + ldr w16, [sp, #120] // bitdepth_max + + ldr w13, [x4, #FGD_SCALING_SHIFT] + ldr w12, [x4, #FGD_CLIP_TO_RESTRICTED_RANGE] + dup v23.8h, w16 // bitdepth_max + clz w16, w16 + eor w13, w13, #15 // 15 - scaling_shift + sub w16, w16, #24 // -bitdepth_min_8 + + // !csfl + add x10, x4, x10, lsl #2 // + 4*uv + add x14, x10, #FGD_UV_LUMA_MULT + add x15, x10, #FGD_UV_MULT + add x10, x10, #FGD_UV_OFFSET + neg w16, w16 // bitdepth_min_8 + ld1r {v8.8h}, [x14] // uv_luma_mult + ld1r {v24.8h}, [x10] // uv_offset + ld1r {v9.8h}, [x15] // uv_mult + + dup v29.8h, w13 // 15 - scaling_shift + dup v27.8h, w16 // bitdepth_min_8 + + cbz w12, 1f + // clip + movi v30.8h, #16 + movi v31.8h, #240 + sshl v30.8h, v30.8h, v27.8h + sshl v31.8h, v31.8h, v27.8h + cbz w11, 2f + // is_id + movi v31.8h, #235 + sshl v31.8h, v31.8h, v27.8h + b 2f +1: + // no clip + movi v30.8h, #0 + mov v31.16b, v23.16b // bitdepth_max +2: + + ushr v15.8h, v23.8h, #1 // grain_max + sshl v24.8h, v24.8h, v27.8h // uv_offset << bitdepth_min_8 + not v14.16b, v15.16b // grain_min + + ldr w12, [x8, #8] // offsets[1][0] + ldr w14, [x8, #4] // offsets[0][1] + ldr w16, [x8, #12] // offsets[1][1] + ldr w8, [x8] // offsets[0][0] + + mov x10, #GRAIN_WIDTH*2 // grain_lut stride + + add x5, x5, #(2*(3 + (2 >> \sx)*3)) // grain_lut += 9 or 6 +.if \sy + add x5, x5, x10, lsl #2 // grain_lut += 4 * grain_stride + add x5, x5, x10, lsl #1 // grain_lut += 2 * grain_stride +.else + add x5, x5, x10, lsl #3 // grain_lut += 8 * grain_stride + add x5, x5, x10 // grain_lut += grain_stride +.endif + + calc_offset w12, w13, w12, \sx, \sy + calc_offset w14, w15, w14, \sx, \sy + calc_offset w16, w17, w16, \sx, \sy + calc_offset w8, w11, w8, \sx, \sy + + add_offset x13, w12, x13, x5, x10 + add_offset x15, w14, x15, x5, x10 + add_offset x17, w16, x17, x5, x10 + add_offset x5, w8, x11, x5, x10 + + add x4, x13, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx + add x8, x15, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by + add x11, x17, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by + add x11, x11, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx + + ldr w13, [sp, #112] // type + + movrel x16, overlap_coeffs_\sx + adr x14, L(fguv_loop_sx\sx()_tbl) + + ld1 {v27.4h, v28.4h}, [x16] // overlap_coeffs + tst w13, #1 + ldrh w13, [x14, w13, uxtw #1] + + b.eq 1f + // y overlap + sub w12, w9, #(2 >> \sy) // backup remaining h + mov w9, #(2 >> \sy) + +1: + sub x13, x14, w13, uxtw + +.if \sy + movi v25.8h, #23 + movi v26.8h, #22 +.else + movi v25.8h, #27 + movi v26.8h, #17 +.endif + +.if \sy + add x7, x7, x7 // luma_stride *= 2 +.endif + + br x13 +endfunc +.endm + +fguv 420, 1, 1 +fguv 422, 1, 0 +fguv 444, 0, 0 + +function fguv_loop_sx0_neon +.macro fguv_loop_sx0 csfl, ox, oy +L(fguv_loop_sx0_csfl\csfl()_\ox\oy): + AARCH64_VALID_JUMP_TARGET +1: +.if \ox + ld1 {v4.4h}, [x4], x10 // grain_lut old +.endif +.if \oy + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x8], x10 // grain_lut top +.endif +.if \ox && \oy + ld1 {v5.4h}, [x11], x10 // grain_lut top old +.endif + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x10 // grain_lut + +.if \ox + smull v4.4s, v4.4h, v27.4h + 
smlal v4.4s, v16.4h, v28.4h +.endif + +.if \oy +.if \ox + smull v5.4s, v5.4h, v27.4h + smlal v5.4s, v0.4h, v28.4h + sqrshrn v4.4h, v4.4s, #5 + sqrshrn v5.4h, v5.4s, #5 + smin v4.4h, v4.4h, v15.4h + smin v5.4h, v5.4h, v15.4h + smax v4.4h, v4.4h, v14.4h + smax v5.4h, v5.4h, v14.4h + ins v16.d[0], v4.d[0] + ins v0.d[0], v5.d[0] +.endif + + smull v6.4s, v16.4h, v26.4h + smull2 v7.4s, v16.8h, v26.8h + smull v10.4s, v17.4h, v26.4h + smull2 v11.4s, v17.8h, v26.8h + smull v16.4s, v18.4h, v26.4h + smull2 v17.4s, v18.8h, v26.8h + smull v18.4s, v19.4h, v26.4h + smull2 v19.4s, v19.8h, v26.8h + smlal v6.4s, v0.4h, v25.4h + smlal2 v7.4s, v0.8h, v25.8h + smlal v10.4s, v1.4h, v25.4h + smlal2 v11.4s, v1.8h, v25.8h + smlal v16.4s, v2.4h, v25.4h + smlal2 v17.4s, v2.8h, v25.8h + smlal v18.4s, v3.4h, v25.4h + smlal2 v19.4s, v3.8h, v25.8h + sqrshrn v6.4h, v6.4s, #5 + sqrshrn2 v6.8h, v7.4s, #5 + sqrshrn v7.4h, v10.4s, #5 + sqrshrn2 v7.8h, v11.4s, #5 + sqrshrn v10.4h, v16.4s, #5 + sqrshrn2 v10.8h, v17.4s, #5 + sqrshrn v11.4h, v18.4s, #5 + sqrshrn2 v11.8h, v19.4s, #5 +.endif + +.if \ox && !\oy + sqrshrn v4.4h, v4.4s, #5 + smin v4.4h, v4.4h, v15.4h +.endif + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7 // luma +.if \oy + smin v16.8h, v6.8h, v15.8h + smin v17.8h, v7.8h, v15.8h + smin v18.8h, v10.8h, v15.8h + smin v19.8h, v11.8h, v15.8h + smax v16.8h, v16.8h, v14.8h + smax v17.8h, v17.8h, v14.8h + smax v18.8h, v18.8h, v14.8h + smax v19.8h, v19.8h, v14.8h +.endif + +.if \ox && !\oy + smax v4.4h, v4.4h, v14.4h +.endif + ld1 {v10.8h, v11.8h, v12.8h, v13.8h}, [x1], x2 // src +.if \ox && !\oy + ins v16.d[0], v4.d[0] +.endif + +.if !\csfl + smull v4.4s, v0.4h, v8.4h + smull2 v5.4s, v0.8h, v8.8h + smull v6.4s, v1.4h, v8.4h + smull2 v7.4s, v1.8h, v8.8h + smull v0.4s, v2.4h, v8.4h + smull2 v1.4s, v2.8h, v8.8h + smull v2.4s, v3.4h, v8.4h + smull2 v3.4s, v3.8h, v8.8h + smlal v4.4s, v10.4h, v9.4h + smlal2 v5.4s, v10.8h, v9.8h + smlal v6.4s, v11.4h, v9.4h + smlal2 v7.4s, v11.8h, v9.8h + smlal v0.4s, v12.4h, v9.4h + smlal2 v1.4s, v12.8h, v9.8h + smlal v2.4s, v13.4h, v9.4h + smlal2 v3.4s, v13.8h, v9.8h + shrn v4.4h, v4.4s, #6 + shrn2 v4.8h, v5.4s, #6 + shrn v5.4h, v6.4s, #6 + shrn2 v5.8h, v7.4s, #6 + shrn v6.4h, v0.4s, #6 + shrn2 v6.8h, v1.4s, #6 + shrn v7.4h, v2.4s, #6 + shrn2 v7.8h, v3.4s, #6 + add v0.8h, v4.8h, v24.8h + add v1.8h, v5.8h, v24.8h + add v2.8h, v6.8h, v24.8h + add v3.8h, v7.8h, v24.8h + movi v20.8h, #0 + smin v0.8h, v0.8h, v23.8h + smin v1.8h, v1.8h, v23.8h + smin v2.8h, v2.8h, v23.8h + smin v3.8h, v3.8h, v23.8h + smax v0.8h, v0.8h, v20.8h + smax v1.8h, v1.8h, v20.8h + smax v2.8h, v2.8h, v20.8h + smax v3.8h, v3.8h, v20.8h +.else + // Make sure that uninitialized pixels out of range past the right + // edge are in range; their actual values shouldn't matter. 
+ and v0.16b, v0.16b, v23.16b + and v1.16b, v1.16b, v23.16b + and v2.16b, v2.16b, v23.16b + and v3.16b, v3.16b, v23.16b +.endif + + bl gather32_neon + + uxtl v4.8h, v6.8b // scaling + uxtl2 v5.8h, v6.16b + uxtl v6.8h, v7.8b + uxtl2 v7.8h, v7.16b + + ushl v4.8h, v4.8h, v29.8h // scaling << (15 - scaling_shift) + ushl v5.8h, v5.8h, v29.8h + ushl v6.8h, v6.8h, v29.8h + ushl v7.8h, v7.8h, v29.8h + + sqrdmulh v16.8h, v16.8h, v4.8h // round2((scaling << (15 - scaling_shift) * grain, 15) + sqrdmulh v17.8h, v17.8h, v5.8h + sqrdmulh v18.8h, v18.8h, v6.8h + sqrdmulh v19.8h, v19.8h, v7.8h + + usqadd v10.8h, v16.8h // *src + noise + usqadd v11.8h, v17.8h + usqadd v12.8h, v18.8h + usqadd v13.8h, v19.8h + + umax v0.8h, v10.8h, v30.8h + umax v1.8h, v11.8h, v30.8h + umax v2.8h, v12.8h, v30.8h + umax v3.8h, v13.8h, v30.8h + umin v0.8h, v0.8h, v31.8h + umin v1.8h, v1.8h, v31.8h + umin v2.8h, v2.8h, v31.8h + umin v3.8h, v3.8h, v31.8h + + subs w9, w9, #1 +.if \oy + dup v25.8h, v28.h[0] + dup v26.8h, v28.h[1] +.endif + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x2 // dst + b.gt 1b + +.if \oy + cmp w12, #0 + mov w9, w12 // restore actual remaining h + b.gt L(fguv_loop_sx0_csfl\csfl()_\ox()0) +.endif + b 9f +.endm + fguv_loop_sx0 0, 0, 0 + fguv_loop_sx0 0, 0, 1 + fguv_loop_sx0 0, 1, 0 + fguv_loop_sx0 0, 1, 1 + fguv_loop_sx0 1, 0, 0 + fguv_loop_sx0 1, 0, 1 + fguv_loop_sx0 1, 1, 0 + fguv_loop_sx0 1, 1, 1 + +9: + ldp d14, d15, [sp, #64] + ldp d12, d13, [sp, #48] + ldp d10, d11, [sp, #32] + ldp d8, d9, [sp, #16] + ldr x30, [sp], #80 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(fguv_loop_sx0_tbl): + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_00) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_01) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_10) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_11) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_00) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_01) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_10) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_11) +endfunc + +function fguv_loop_sx1_neon +.macro fguv_loop_sx1 csfl, ox, oy +L(fguv_loop_sx1_csfl\csfl()_\ox\oy): + AARCH64_VALID_JUMP_TARGET +1: +.if \ox + ld1 {v18.4h}, [x4], x10 // grain_lut old +.endif +.if \oy + ld1 {v20.8h, v21.8h}, [x8], x10 // grain_lut top +.endif +.if \ox && \oy + ld1 {v19.4h}, [x11], x10 // grain_lut top old +.endif + ld1 {v16.8h, v17.8h}, [x5], x10 // grain_lut + +.if \ox + smull v18.4s, v18.4h, v27.4h + smlal v18.4s, v16.4h, v28.4h +.endif + +.if \oy +.if \ox + smull v19.4s, v19.4h, v27.4h + smlal v19.4s, v20.4h, v28.4h + sqrshrn v18.4h, v18.4s, #5 + sqrshrn v19.4h, v19.4s, #5 + smin v18.4h, v18.4h, v15.4h + smin v19.4h, v19.4h, v15.4h + smax v18.4h, v18.4h, v14.4h + smax v19.4h, v19.4h, v14.4h + ins v16.d[0], v18.d[0] + ins v20.d[0], v19.d[0] +.endif + + smull v0.4s, v16.4h, v26.4h + smull2 v1.4s, v16.8h, v26.8h + smull v2.4s, v17.4h, v26.4h + smull2 v3.4s, v17.8h, v26.8h + smlal v0.4s, v20.4h, v25.4h + smlal2 v1.4s, v20.8h, v25.8h + smlal v2.4s, v21.4h, v25.4h + smlal2 v3.4s, v21.8h, v25.8h + sqrshrn v16.4h, v0.4s, #5 + sqrshrn2 v16.8h, v1.4s, #5 + sqrshrn v17.4h, v2.4s, #5 + sqrshrn2 v17.8h, v3.4s, #5 +.endif + +.if \ox && !\oy + sqrshrn v18.4h, v18.4s, #5 + smin v18.4h, v18.4h, v15.4h +.endif + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7 // luma +.if \oy + smin v16.8h, v16.8h, v15.8h + smin v17.8h, v17.8h, v15.8h + smax v16.8h, v16.8h, v14.8h + smax v17.8h, v17.8h, v14.8h +.endif + +.if \ox && !\oy + smax v18.4h, v18.4h, v14.4h +.endif + ld1 
{v10.8h, v11.8h}, [x1], x2 // src +.if \ox && !\oy + ins v16.d[0], v18.d[0] +.endif + addp v0.8h, v0.8h, v1.8h + addp v1.8h, v2.8h, v3.8h + urshr v0.8h, v0.8h, #1 + urshr v1.8h, v1.8h, #1 +.if !\csfl + smull v2.4s, v0.4h, v8.4h + smull2 v3.4s, v0.8h, v8.8h + smull v0.4s, v1.4h, v8.4h + smull2 v1.4s, v1.8h, v8.8h + smlal v2.4s, v10.4h, v9.4h + smlal2 v3.4s, v10.8h, v9.8h + smlal v0.4s, v11.4h, v9.4h + smlal2 v1.4s, v11.8h, v9.8h + shrn v2.4h, v2.4s, #6 + shrn2 v2.8h, v3.4s, #6 + shrn v3.4h, v0.4s, #6 + shrn2 v3.8h, v1.4s, #6 + add v0.8h, v2.8h, v24.8h + add v1.8h, v3.8h, v24.8h + movi v2.8h, #0 + smin v0.8h, v0.8h, v23.8h + smin v1.8h, v1.8h, v23.8h + smax v0.8h, v0.8h, v2.8h + smax v1.8h, v1.8h, v2.8h +.else + // Make sure that uninitialized pixels out of range past the right + // edge are in range; their actual values shouldn't matter. + and v0.16b, v0.16b, v23.16b + and v1.16b, v1.16b, v23.16b +.endif + + bl gather16_neon + + uxtl v4.8h, v6.8b // scaling + uxtl2 v5.8h, v6.16b + + ushl v4.8h, v4.8h, v29.8h // scaling << (15 - scaling_shift) + ushl v5.8h, v5.8h, v29.8h + + sqrdmulh v16.8h, v16.8h, v4.8h // round2((scaling << (15 - scaling_shift) * grain, 15) + sqrdmulh v17.8h, v17.8h, v5.8h + + usqadd v10.8h, v16.8h // *src + noise + usqadd v11.8h, v17.8h + + umax v0.8h, v10.8h, v30.8h + umax v1.8h, v11.8h, v30.8h + umin v0.8h, v0.8h, v31.8h + umin v1.8h, v1.8h, v31.8h + +.if \oy + mov v16.16b, v25.16b +.endif + subs w9, w9, #1 +.if \oy + mov v25.16b, v26.16b + mov v26.16b, v16.16b +.endif + st1 {v0.8h, v1.8h}, [x0], x2 // dst + b.gt 1b + +.if \oy + cmp w12, #0 + mov w9, w12 // restore actual remaining h + b.gt L(fguv_loop_sx1_csfl\csfl()_\ox()0) +.endif + + b 9f +.endm + fguv_loop_sx1 0, 0, 0 + fguv_loop_sx1 0, 0, 1 + fguv_loop_sx1 0, 1, 0 + fguv_loop_sx1 0, 1, 1 + fguv_loop_sx1 1, 0, 0 + fguv_loop_sx1 1, 0, 1 + fguv_loop_sx1 1, 1, 0 + fguv_loop_sx1 1, 1, 1 + +9: + ldp d14, d15, [sp, #64] + ldp d12, d13, [sp, #48] + ldp d10, d11, [sp, #32] + ldp d8, d9, [sp, #16] + ldr x30, [sp], #80 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(fguv_loop_sx1_tbl): + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_00) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_01) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_10) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_11) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_00) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_01) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_10) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_11) +endfunc diff --git a/third_party/dav1d/src/arm/64/itx.S b/third_party/dav1d/src/arm/64/itx.S index ec932af0ce66d..c9650e9d544b7 100644 --- a/third_party/dav1d/src/arm/64/itx.S +++ b/third_party/dav1d/src/arm/64/itx.S @@ -133,10 +133,10 @@ endconst .endif .endm
-.macro rshrn_sz d0, s0, s1, shift, sz - rshrn \d0().4h, \s0().4s, \shift +.macro sqrshrn_sz d0, s0, s1, shift, sz + sqrshrn \d0().4h, \s0().4s, \shift .ifc \sz, .8h - rshrn2 \d0().8h, \s1().4s, \shift + sqrshrn2 \d0().8h, \s1().4s, \shift .endif .endm
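The itx.S hunks below all make the same substitution, so the difference is worth spelling out once: rshrn performs a rounding right shift and then a truncating narrow from 32 to 16 bits per lane, so an out-of-range intermediate wraps around, whereas sqrshrn performs the same rounding shift but saturates the result to the int16 range. A minimal scalar model of one lane, written as hypothetical C helpers purely for illustration (the real instructions operate on 4 or 8 lanes at once):

    #include <stdint.h>

    /* One-lane models of the two NEON narrowing shifts used in itx.S. */
    static int16_t model_rshrn(int32_t v, int shift) {
        int64_t r = ((int64_t)v + (1 << (shift - 1))) >> shift; /* round to nearest */
        return (int16_t)r;                   /* truncating narrow: wraps on overflow */
    }

    static int16_t model_sqrshrn(int32_t v, int shift) {
        int64_t r = ((int64_t)v + (1 << (shift - 1))) >> shift; /* round to nearest */
        if (r > INT16_MAX) return INT16_MAX; /* saturating narrow: clamp instead */
        if (r < INT16_MIN) return INT16_MIN;
        return (int16_t)r;
    }

With the shift fixed at #12 as in the transforms below, intermediates that used to wrap now clamp, which is what the sq prefix on the replacement macro name reflects.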
@@ -438,11 +438,11 @@ endfunc smull_smlal v6, v7, \r1, \r3, v0.h[3], v0.h[2], \sz smull_smlsl v4, v5, \r1, \r3, v0.h[2], v0.h[3], \sz smull_smlal v2, v3, \r0, \r2, v0.h[0], v0.h[0], \sz - rshrn_sz v6, v6, v7, #12, \sz - rshrn_sz v7, v4, v5, #12, \sz + sqrshrn_sz v6, v6, v7, #12, \sz + sqrshrn_sz v7, v4, v5, #12, \sz smull_smlsl v4, v5, \r0, \r2, v0.h[0], v0.h[0], \sz - rshrn_sz v2, v2, v3, #12, \sz - rshrn_sz v3, v4, v5, #12, \sz + sqrshrn_sz v2, v2, v3, #12, \sz + sqrshrn_sz v3, v4, v5, #12, \sz sqadd \r0\sz, v2\sz, v6\sz sqsub \r3\sz, v2\sz, v6\sz sqadd \r1\sz, v3\sz, v7\sz @@ -714,11 +714,11 @@ def_fn_4x4 identity, flipadst smull_smlsl v2, v3, \r1, \r7, v0.h[4], v0.h[5], \sz // -> t4a smull_smlal v4, v5, \r1, \r7, v0.h[5], v0.h[4], \sz // -> t7a smull_smlsl v6, v7, \r5, \r3, v0.h[6], v0.h[7], \sz // -> t5a - rshrn_sz \r1, v2, v3, #12, \sz // t4a - rshrn_sz \r7, v4, v5, #12, \sz // t7a + sqrshrn_sz \r1, v2, v3, #12, \sz // t4a + sqrshrn_sz \r7, v4, v5, #12, \sz // t7a smull_smlal v2, v3, \r5, \r3, v0.h[7], v0.h[6], \sz // -> t6a - rshrn_sz \r3, v6, v7, #12, \sz // t5a - rshrn_sz \r5, v2, v3, #12, \sz // t6a + sqrshrn_sz \r3, v6, v7, #12, \sz // t5a + sqrshrn_sz \r5, v2, v3, #12, \sz // t6a
sqadd v2\sz, \r1\sz, \r3\sz // t4 sqsub \r1\sz, \r1\sz, \r3\sz // t5a @@ -727,8 +727,8 @@ def_fn_4x4 identity, flipadst
smull_smlsl v4, v5, \r3, \r1, v0.h[0], v0.h[0], \sz // -> t5 smull_smlal v6, v7, \r3, \r1, v0.h[0], v0.h[0], \sz // -> t6 - rshrn_sz v4, v4, v5, #12, \sz // t5 - rshrn_sz v5, v6, v7, #12, \sz // t6 + sqrshrn_sz v4, v4, v5, #12, \sz // t5 + sqrshrn_sz v5, v6, v7, #12, \sz // t6
sqsub \r7\sz, \r0\sz, v3\sz // out7 sqadd \r0\sz, \r0\sz, v3\sz // out0 @@ -762,19 +762,19 @@ endfunc smull_smlal v2, v3, v23, v16, v0.h[0], v0.h[1], \sz smull_smlsl v4, v5, v23, v16, v0.h[1], v0.h[0], \sz smull_smlal v6, v7, v21, v18, v0.h[2], v0.h[3], \sz - rshrn_sz v16, v2, v3, #12, \sz // t0a - rshrn_sz v23, v4, v5, #12, \sz // t1a + sqrshrn_sz v16, v2, v3, #12, \sz // t0a + sqrshrn_sz v23, v4, v5, #12, \sz // t1a smull_smlsl v2, v3, v21, v18, v0.h[3], v0.h[2], \sz smull_smlal v4, v5, v19, v20, v0.h[4], v0.h[5], \sz - rshrn_sz v18, v6, v7, #12, \sz // t2a - rshrn_sz v21, v2, v3, #12, \sz // t3a + sqrshrn_sz v18, v6, v7, #12, \sz // t2a + sqrshrn_sz v21, v2, v3, #12, \sz // t3a smull_smlsl v6, v7, v19, v20, v0.h[5], v0.h[4], \sz smull_smlal v2, v3, v17, v22, v0.h[6], v0.h[7], \sz - rshrn_sz v20, v4, v5, #12, \sz // t4a - rshrn_sz v19, v6, v7, #12, \sz // t5a + sqrshrn_sz v20, v4, v5, #12, \sz // t4a + sqrshrn_sz v19, v6, v7, #12, \sz // t5a smull_smlsl v4, v5, v17, v22, v0.h[7], v0.h[6], \sz - rshrn_sz v22, v2, v3, #12, \sz // t6a - rshrn_sz v17, v4, v5, #12, \sz // t7a + sqrshrn_sz v22, v2, v3, #12, \sz // t6a + sqrshrn_sz v17, v4, v5, #12, \sz // t7a
sqadd v2\sz, v16\sz, v20\sz // t0 sqsub v3\sz, v16\sz, v20\sz // t4 @@ -789,13 +789,13 @@ endfunc smull_smlsl v20, v21, v3, v5, v1.h[2], v1.h[3], \sz smull_smlsl v22, v23, v19, v7, v1.h[3], v1.h[2], \sz
- rshrn_sz v3, v16, v17, #12, \sz // t4a - rshrn_sz v5, v20, v21, #12, \sz // t5a + sqrshrn_sz v3, v16, v17, #12, \sz // t4a + sqrshrn_sz v5, v20, v21, #12, \sz // t5a
smull_smlal v16, v17, v19, v7, v1.h[2], v1.h[3], \sz
- rshrn_sz v7, v22, v23, #12, \sz // t6a - rshrn_sz v19, v16, v17, #12, \sz // t7a + sqrshrn_sz v7, v22, v23, #12, \sz // t6a + sqrshrn_sz v19, v16, v17, #12, \sz // t7a
sqadd \o0()\sz, v2\sz, v6\sz // out0 sqsub v2\sz, v2\sz, v6\sz // t2 @@ -812,11 +812,11 @@ endfunc smull_smlal v18, v19, v2, v4, v1.h[0], v1.h[0], \sz // -> out3 (v19 or v20) smull_smlsl v6, v7, v2, v4, v1.h[0], v1.h[0], \sz // -> out4 (v20 or v19) smull_smlsl v20, v21, v3, v5, v1.h[0], v1.h[0], \sz // -> out5 (v21 or v18) - rshrn_sz v2, v18, v19, #12, \sz // out3 + sqrshrn_sz v2, v18, v19, #12, \sz // out3 smull_smlal v18, v19, v3, v5, v1.h[0], v1.h[0], \sz // -> out2 (v18 or v21) - rshrn_sz v3, v20, v21, #12, \sz // out5 - rshrn_sz \o2, v18, v19, #12, \sz // out2 (v18 or v21) - rshrn_sz \o4, v6, v7, #12, \sz // out4 (v20 or v19) + sqrshrn_sz v3, v20, v21, #12, \sz // out5 + sqrshrn_sz \o2, v18, v19, #12, \sz // out2 (v18 or v21) + sqrshrn_sz \o4, v6, v7, #12, \sz // out4 (v20 or v19)
sqneg \o3()\sz, v2\sz // out3 sqneg \o5()\sz, v3\sz // out5 @@ -1033,19 +1033,19 @@ def_fns_48 8, 4 smull_smlsl v2, v3, v17, v31, v1.h[0], v1.h[1], \sz // -> t8a smull_smlal v4, v5, v17, v31, v1.h[1], v1.h[0], \sz // -> t15a smull_smlsl v6, v7, v25, v23, v1.h[2], v1.h[3], \sz // -> t9a - rshrn_sz v17, v2, v3, #12, \sz // t8a - rshrn_sz v31, v4, v5, #12, \sz // t15a + sqrshrn_sz v17, v2, v3, #12, \sz // t8a + sqrshrn_sz v31, v4, v5, #12, \sz // t15a smull_smlal v2, v3, v25, v23, v1.h[3], v1.h[2], \sz // -> t14a smull_smlsl v4, v5, v21, v27, v1.h[4], v1.h[5], \sz // -> t10a - rshrn_sz v23, v6, v7, #12, \sz // t9a - rshrn_sz v25, v2, v3, #12, \sz // t14a + sqrshrn_sz v23, v6, v7, #12, \sz // t9a + sqrshrn_sz v25, v2, v3, #12, \sz // t14a smull_smlal v6, v7, v21, v27, v1.h[5], v1.h[4], \sz // -> t13a smull_smlsl v2, v3, v29, v19, v1.h[6], v1.h[7], \sz // -> t11a - rshrn_sz v21, v4, v5, #12, \sz // t10a - rshrn_sz v27, v6, v7, #12, \sz // t13a + sqrshrn_sz v21, v4, v5, #12, \sz // t10a + sqrshrn_sz v27, v6, v7, #12, \sz // t13a smull_smlal v4, v5, v29, v19, v1.h[7], v1.h[6], \sz // -> t12a - rshrn_sz v19, v2, v3, #12, \sz // t11a - rshrn_sz v29, v4, v5, #12, \sz // t12a + sqrshrn_sz v19, v2, v3, #12, \sz // t11a + sqrshrn_sz v29, v4, v5, #12, \sz // t12a
sqsub v2\sz, v17\sz, v23\sz // t9 sqadd v17\sz, v17\sz, v23\sz // t8 @@ -1058,17 +1058,17 @@ def_fns_48 8, 4
smull_smlsl v4, v5, v3, v2, v0.h[2], v0.h[3], \sz // -> t9a smull_smlal v6, v7, v3, v2, v0.h[3], v0.h[2], \sz // -> t14a - rshrn_sz v21, v4, v5, #12, \sz // t9a - rshrn_sz v27, v6, v7, #12, \sz // t14a + sqrshrn_sz v21, v4, v5, #12, \sz // t9a + sqrshrn_sz v27, v6, v7, #12, \sz // t14a
smull_smlsl v4, v5, v29, v23, v0.h[2], v0.h[3], \sz // -> t13a smull_smlal v6, v7, v29, v23, v0.h[3], v0.h[2], \sz // -> t10a - rshrn_sz v29, v4, v5, #12, \sz // t13a + sqrshrn_sz v29, v4, v5, #12, \sz // t13a neg v6.4s, v6.4s .ifc \sz, .8h neg v7.4s, v7.4s .endif - rshrn_sz v23, v6, v7, #12, \sz // t10a + sqrshrn_sz v23, v6, v7, #12, \sz // t10a
sqsub v2\sz, v17\sz, v19\sz // t11a sqadd v17\sz, v17\sz, v19\sz // t8a @@ -1083,11 +1083,11 @@ def_fns_48 8, 4 smull_smlal v6, v7, v3, v2, v0.h[0], v0.h[0], \sz // -> t12 smull_smlsl v2, v3, v25, v21, v0.h[0], v0.h[0], \sz // -> t10a
- rshrn_sz v4, v4, v5, #12, \sz // t11 - rshrn_sz v5, v6, v7, #12, \sz // t12 + sqrshrn_sz v4, v4, v5, #12, \sz // t11 + sqrshrn_sz v5, v6, v7, #12, \sz // t12 smull_smlal v6, v7, v25, v21, v0.h[0], v0.h[0], \sz // -> t13a - rshrn_sz v2, v2, v3, #12, \sz // t10a - rshrn_sz v3, v6, v7, #12, \sz // t13a + sqrshrn_sz v2, v2, v3, #12, \sz // t10a + sqrshrn_sz v3, v6, v7, #12, \sz // t13a
sqadd v6\sz, v16\sz, v31\sz // out0 sqsub v31\sz, v16\sz, v31\sz // out15 @@ -1132,35 +1132,35 @@ endfunc smull_smlal v2, v3, v31, v16, v0.h[0], v0.h[1], \sz // -> t0 smull_smlsl v4, v5, v31, v16, v0.h[1], v0.h[0], \sz // -> t1 smull_smlal v6, v7, v29, v18, v0.h[2], v0.h[3], \sz // -> t2 - rshrn_sz v16, v2, v3, #12, \sz // t0 - rshrn_sz v31, v4, v5, #12, \sz // t1 + sqrshrn_sz v16, v2, v3, #12, \sz // t0 + sqrshrn_sz v31, v4, v5, #12, \sz // t1 smull_smlsl v2, v3, v29, v18, v0.h[3], v0.h[2], \sz // -> t3 smull_smlal v4, v5, v27, v20, v0.h[4], v0.h[5], \sz // -> t4 - rshrn_sz v18, v6, v7, #12, \sz // t2 - rshrn_sz v29, v2, v3, #12, \sz // t3 + sqrshrn_sz v18, v6, v7, #12, \sz // t2 + sqrshrn_sz v29, v2, v3, #12, \sz // t3 smull_smlsl v6, v7, v27, v20, v0.h[5], v0.h[4], \sz // -> t5 smull_smlal v2, v3, v25, v22, v0.h[6], v0.h[7], \sz // -> t6 - rshrn_sz v20, v4, v5, #12, \sz // t4 - rshrn_sz v27, v6, v7, #12, \sz // t5 + sqrshrn_sz v20, v4, v5, #12, \sz // t4 + sqrshrn_sz v27, v6, v7, #12, \sz // t5 smull_smlsl v4, v5, v25, v22, v0.h[7], v0.h[6], \sz // -> t7 smull_smlal v6, v7, v23, v24, v1.h[0], v1.h[1], \sz // -> t8 - rshrn_sz v22, v2, v3, #12, \sz // t6 - rshrn_sz v25, v4, v5, #12, \sz // t7 + sqrshrn_sz v22, v2, v3, #12, \sz // t6 + sqrshrn_sz v25, v4, v5, #12, \sz // t7 smull_smlsl v2, v3, v23, v24, v1.h[1], v1.h[0], \sz // -> t9 smull_smlal v4, v5, v21, v26, v1.h[2], v1.h[3], \sz // -> t10 - rshrn_sz v23, v6, v7, #12, \sz // t8 - rshrn_sz v24, v2, v3, #12, \sz // t9 + sqrshrn_sz v23, v6, v7, #12, \sz // t8 + sqrshrn_sz v24, v2, v3, #12, \sz // t9 smull_smlsl v6, v7, v21, v26, v1.h[3], v1.h[2], \sz // -> t11 smull_smlal v2, v3, v19, v28, v1.h[4], v1.h[5], \sz // -> t12 - rshrn_sz v21, v4, v5, #12, \sz // t10 - rshrn_sz v26, v6, v7, #12, \sz // t11 + sqrshrn_sz v21, v4, v5, #12, \sz // t10 + sqrshrn_sz v26, v6, v7, #12, \sz // t11 smull_smlsl v4, v5, v19, v28, v1.h[5], v1.h[4], \sz // -> t13 smull_smlal v6, v7, v17, v30, v1.h[6], v1.h[7], \sz // -> t14 - rshrn_sz v19, v2, v3, #12, \sz // t12 - rshrn_sz v28, v4, v5, #12, \sz // t13 + sqrshrn_sz v19, v2, v3, #12, \sz // t12 + sqrshrn_sz v28, v4, v5, #12, \sz // t13 smull_smlsl v2, v3, v17, v30, v1.h[7], v1.h[6], \sz // -> t15 - rshrn_sz v17, v6, v7, #12, \sz // t14 - rshrn_sz v30, v2, v3, #12, \sz // t15 + sqrshrn_sz v17, v6, v7, #12, \sz // t14 + sqrshrn_sz v30, v2, v3, #12, \sz // t15
ld1 {v0.8h}, [x16]
@@ -1184,19 +1184,19 @@ endfunc smull_smlal v4, v5, v2, v3, v0.h[5], v0.h[4], \sz // -> t8 smull_smlsl v6, v7, v2, v3, v0.h[4], v0.h[5], \sz // -> t9 smull_smlal v2, v3, v18, v29, v0.h[7], v0.h[6], \sz // -> t10 - rshrn_sz v17, v4, v5, #12, \sz // t8 - rshrn_sz v30, v6, v7, #12, \sz // t9 + sqrshrn_sz v17, v4, v5, #12, \sz // t8 + sqrshrn_sz v30, v6, v7, #12, \sz // t9 smull_smlsl v4, v5, v18, v29, v0.h[6], v0.h[7], \sz // -> t11 smull_smlsl v6, v7, v27, v20, v0.h[5], v0.h[4], \sz // -> t12 - rshrn_sz v18, v2, v3, #12, \sz // t10 - rshrn_sz v29, v4, v5, #12, \sz // t11 + sqrshrn_sz v18, v2, v3, #12, \sz // t10 + sqrshrn_sz v29, v4, v5, #12, \sz // t11 smull_smlal v2, v3, v27, v20, v0.h[4], v0.h[5], \sz // -> t13 smull_smlsl v4, v5, v25, v22, v0.h[7], v0.h[6], \sz // -> t14 - rshrn_sz v27, v6, v7, #12, \sz // t12 - rshrn_sz v20, v2, v3, #12, \sz // t13 + sqrshrn_sz v27, v6, v7, #12, \sz // t12 + sqrshrn_sz v20, v2, v3, #12, \sz // t13 smull_smlal v6, v7, v25, v22, v0.h[6], v0.h[7], \sz // -> t15 - rshrn_sz v25, v4, v5, #12, \sz // t14 - rshrn_sz v22, v6, v7, #12, \sz // t15 + sqrshrn_sz v25, v4, v5, #12, \sz // t14 + sqrshrn_sz v22, v6, v7, #12, \sz // t15
sqsub v2\sz, v16\sz, v21\sz // t4 sqadd v16\sz, v16\sz, v21\sz // t0 @@ -1218,19 +1218,19 @@ endfunc smull_smlal v4, v5, v2, v3, v0.h[3], v0.h[2], \sz // -> t4a smull_smlsl v6, v7, v2, v3, v0.h[2], v0.h[3], \sz // -> t5a smull_smlsl v2, v3, v24, v23, v0.h[3], v0.h[2], \sz // -> t6a - rshrn_sz v22, v4, v5, #12, \sz // t4a - rshrn_sz v25, v6, v7, #12, \sz // t5a + sqrshrn_sz v22, v4, v5, #12, \sz // t4a + sqrshrn_sz v25, v6, v7, #12, \sz // t5a smull_smlal v4, v5, v24, v23, v0.h[2], v0.h[3], \sz // -> t7a smull_smlal v6, v7, v17, v30, v0.h[3], v0.h[2], \sz // -> t12 - rshrn_sz v24, v2, v3, #12, \sz // t6a - rshrn_sz v23, v4, v5, #12, \sz // t7a + sqrshrn_sz v24, v2, v3, #12, \sz // t6a + sqrshrn_sz v23, v4, v5, #12, \sz // t7a smull_smlsl v2, v3, v17, v30, v0.h[2], v0.h[3], \sz // -> t13 smull_smlsl v4, v5, v29, v18, v0.h[3], v0.h[2], \sz // -> t14 - rshrn_sz v17, v6, v7, #12, \sz // t12 + sqrshrn_sz v17, v6, v7, #12, \sz // t12 smull_smlal v6, v7, v29, v18, v0.h[2], v0.h[3], \sz // -> t15 - rshrn_sz v29, v2, v3, #12, \sz // t13 - rshrn_sz v30, v4, v5, #12, \sz // t14 - rshrn_sz v18, v6, v7, #12, \sz // t15 + sqrshrn_sz v29, v2, v3, #12, \sz // t13 + sqrshrn_sz v30, v4, v5, #12, \sz // t14 + sqrshrn_sz v18, v6, v7, #12, \sz // t15
sqsub v2\sz, v16\sz, v21\sz // t2a .ifc \o0, v16 @@ -1267,21 +1267,21 @@ endfunc smull_smlal v4, v5, v2, v21, v0.h[0], v0.h[0], \sz // -> out7 (v23 or v24) smull_smlal v6, v7, v26, v3, v0.h[0], v0.h[0], \sz // -> out5 (v21 or v26)
- rshrn_sz v24, v24, v25, #12, \sz // out8 - rshrn_sz v4, v4, v5, #12, \sz // out7 - rshrn_sz v5, v6, v7, #12, \sz // out5 + sqrshrn_sz v24, v24, v25, #12, \sz // out8 + sqrshrn_sz v4, v4, v5, #12, \sz // out7 + sqrshrn_sz v5, v6, v7, #12, \sz // out5 smull_smlsl v6, v7, v26, v3, v0.h[0], v0.h[0], \sz // -> out10 (v26 or v21) smull_smlal v2, v3, v22, v23, v0.h[0], v0.h[0], \sz // -> out4 (v20 or v27) - rshrn_sz v26, v6, v7, #12, \sz // out10 + sqrshrn_sz v26, v6, v7, #12, \sz // out10
smull_smlsl v6, v7, v22, v23, v0.h[0], v0.h[0], \sz // -> out11 (v27 or v20) smull_smlal v22, v23, v27, v20, v0.h[0], v0.h[0], \sz // -> out6 (v22 or v25) smull_smlsl v21, v25, v27, v20, v0.h[0], v0.h[0], \sz // -> out9 (v25 or v22)
- rshrn_sz \o4, v2, v3, #12, \sz // out4 - rshrn_sz v6, v6, v7, #12, \sz // out11 - rshrn_sz v7, v21, v25, #12, \sz // out9 - rshrn_sz \o6, v22, v23, #12, \sz // out6 + sqrshrn_sz \o4, v2, v3, #12, \sz // out4 + sqrshrn_sz v6, v6, v7, #12, \sz // out11 + sqrshrn_sz v7, v21, v25, #12, \sz // out9 + sqrshrn_sz \o6, v22, v23, #12, \sz // out6
.ifc \o8, v23 mov \o8\szb, v24\szb @@ -1860,35 +1860,35 @@ function inv_dct32_odd_8h_x16_neon, export=1 smull_smlsl v2, v3, v16, v31, v0.h[0], v0.h[1], .8h // -> t16a smull_smlal v4, v5, v16, v31, v0.h[1], v0.h[0], .8h // -> t31a smull_smlsl v6, v7, v24, v23, v0.h[2], v0.h[3], .8h // -> t17a - rshrn_sz v16, v2, v3, #12, .8h // t16a - rshrn_sz v31, v4, v5, #12, .8h // t31a + sqrshrn_sz v16, v2, v3, #12, .8h // t16a + sqrshrn_sz v31, v4, v5, #12, .8h // t31a smull_smlal v2, v3, v24, v23, v0.h[3], v0.h[2], .8h // -> t30a smull_smlsl v4, v5, v20, v27, v0.h[4], v0.h[5], .8h // -> t18a - rshrn_sz v24, v6, v7, #12, .8h // t17a - rshrn_sz v23, v2, v3, #12, .8h // t30a + sqrshrn_sz v24, v6, v7, #12, .8h // t17a + sqrshrn_sz v23, v2, v3, #12, .8h // t30a smull_smlal v6, v7, v20, v27, v0.h[5], v0.h[4], .8h // -> t29a smull_smlsl v2, v3, v28, v19, v0.h[6], v0.h[7], .8h // -> t19a - rshrn_sz v20, v4, v5, #12, .8h // t18a - rshrn_sz v27, v6, v7, #12, .8h // t29a + sqrshrn_sz v20, v4, v5, #12, .8h // t18a + sqrshrn_sz v27, v6, v7, #12, .8h // t29a smull_smlal v4, v5, v28, v19, v0.h[7], v0.h[6], .8h // -> t28a smull_smlsl v6, v7, v18, v29, v1.h[0], v1.h[1], .8h // -> t20a - rshrn_sz v28, v2, v3, #12, .8h // t19a - rshrn_sz v19, v4, v5, #12, .8h // t28a + sqrshrn_sz v28, v2, v3, #12, .8h // t19a + sqrshrn_sz v19, v4, v5, #12, .8h // t28a smull_smlal v2, v3, v18, v29, v1.h[1], v1.h[0], .8h // -> t27a smull_smlsl v4, v5, v26, v21, v1.h[2], v1.h[3], .8h // -> t21a - rshrn_sz v18, v6, v7, #12, .8h // t20a - rshrn_sz v29, v2, v3, #12, .8h // t27a + sqrshrn_sz v18, v6, v7, #12, .8h // t20a + sqrshrn_sz v29, v2, v3, #12, .8h // t27a smull_smlal v6, v7, v26, v21, v1.h[3], v1.h[2], .8h // -> t26a smull_smlsl v2, v3, v22, v25, v1.h[4], v1.h[5], .8h // -> t22a - rshrn_sz v26, v4, v5, #12, .8h // t21a - rshrn_sz v21, v6, v7, #12, .8h // t26a + sqrshrn_sz v26, v4, v5, #12, .8h // t21a + sqrshrn_sz v21, v6, v7, #12, .8h // t26a smull_smlal v4, v5, v22, v25, v1.h[5], v1.h[4], .8h // -> t25a smull_smlsl v6, v7, v30, v17, v1.h[6], v1.h[7], .8h // -> t23a - rshrn_sz v22, v2, v3, #12, .8h // t22a - rshrn_sz v25, v4, v5, #12, .8h // t25a + sqrshrn_sz v22, v2, v3, #12, .8h // t22a + sqrshrn_sz v25, v4, v5, #12, .8h // t25a smull_smlal v2, v3, v30, v17, v1.h[7], v1.h[6], .8h // -> t24a - rshrn_sz v30, v6, v7, #12, .8h // t23a - rshrn_sz v17, v2, v3, #12, .8h // t24a + sqrshrn_sz v30, v6, v7, #12, .8h // t23a + sqrshrn_sz v17, v2, v3, #12, .8h // t24a
ld1 {v0.8h}, [x16]
@@ -1912,23 +1912,23 @@ function inv_dct32_odd_8h_x16_neon, export=1 smull_smlsl v4, v5, v3, v2, v0.h[4], v0.h[5], .8h // -> t17a smull_smlal v6, v7, v3, v2, v0.h[5], v0.h[4], .8h // -> t30a smull_smlal v2, v3, v19, v24, v0.h[5], v0.h[4], .8h // -> t18a - rshrn_sz v21, v4, v5, #12, .8h // t17a - rshrn_sz v27, v6, v7, #12, .8h // t30a + sqrshrn_sz v21, v4, v5, #12, .8h // t17a + sqrshrn_sz v27, v6, v7, #12, .8h // t30a neg v2.4s, v2.4s // -> t18a neg v3.4s, v3.4s // -> t18a smull_smlsl v4, v5, v19, v24, v0.h[4], v0.h[5], .8h // -> t29a smull_smlsl v6, v7, v22, v18, v0.h[6], v0.h[7], .8h // -> t21a - rshrn_sz v19, v2, v3, #12, .8h // t18a - rshrn_sz v24, v4, v5, #12, .8h // t29a + sqrshrn_sz v19, v2, v3, #12, .8h // t18a + sqrshrn_sz v24, v4, v5, #12, .8h // t29a smull_smlal v2, v3, v22, v18, v0.h[7], v0.h[6], .8h // -> t26a smull_smlal v4, v5, v17, v20, v0.h[7], v0.h[6], .8h // -> t22a - rshrn_sz v22, v6, v7, #12, .8h // t21a - rshrn_sz v18, v2, v3, #12, .8h // t26a + sqrshrn_sz v22, v6, v7, #12, .8h // t21a + sqrshrn_sz v18, v2, v3, #12, .8h // t26a neg v4.4s, v4.4s // -> t22a neg v5.4s, v5.4s // -> t22a smull_smlsl v6, v7, v17, v20, v0.h[6], v0.h[7], .8h // -> t25a - rshrn_sz v17, v4, v5, #12, .8h // t22a - rshrn_sz v20, v6, v7, #12, .8h // t25a + sqrshrn_sz v17, v4, v5, #12, .8h // t22a + sqrshrn_sz v20, v6, v7, #12, .8h // t25a
sqsub v2.8h, v27.8h, v24.8h // t29 sqadd v27.8h, v27.8h, v24.8h // t30 @@ -1950,23 +1950,23 @@ function inv_dct32_odd_8h_x16_neon, export=1 smull_smlsl v4, v5, v2, v3, v0.h[2], v0.h[3], .8h // -> t18a smull_smlal v6, v7, v2, v3, v0.h[3], v0.h[2], .8h // -> t29a smull_smlsl v2, v3, v29, v24, v0.h[2], v0.h[3], .8h // -> t19 - rshrn_sz v18, v4, v5, #12, .8h // t18a - rshrn_sz v25, v6, v7, #12, .8h // t29a + sqrshrn_sz v18, v4, v5, #12, .8h // t18a + sqrshrn_sz v25, v6, v7, #12, .8h // t29a smull_smlal v4, v5, v29, v24, v0.h[3], v0.h[2], .8h // -> t28 smull_smlal v6, v7, v26, v19, v0.h[3], v0.h[2], .8h // -> t20 - rshrn_sz v29, v2, v3, #12, .8h // t19 - rshrn_sz v24, v4, v5, #12, .8h // t28 + sqrshrn_sz v29, v2, v3, #12, .8h // t19 + sqrshrn_sz v24, v4, v5, #12, .8h // t28 neg v6.4s, v6.4s // -> t20 neg v7.4s, v7.4s // -> t20 smull_smlsl v2, v3, v26, v19, v0.h[2], v0.h[3], .8h // -> t27 smull_smlal v4, v5, v20, v28, v0.h[3], v0.h[2], .8h // -> t21a - rshrn_sz v26, v6, v7, #12, .8h // t20 - rshrn_sz v19, v2, v3, #12, .8h // t27 + sqrshrn_sz v26, v6, v7, #12, .8h // t20 + sqrshrn_sz v19, v2, v3, #12, .8h // t27 neg v4.4s, v4.4s // -> t21a neg v5.4s, v5.4s // -> t21a smull_smlsl v6, v7, v20, v28, v0.h[2], v0.h[3], .8h // -> t26a - rshrn_sz v20, v4, v5, #12, .8h // t21a - rshrn_sz v28, v6, v7, #12, .8h // t26a + sqrshrn_sz v20, v4, v5, #12, .8h // t21a + sqrshrn_sz v28, v6, v7, #12, .8h // t26a
sqsub v2.8h, v16.8h, v30.8h // t23 sqadd v16.8h, v16.8h, v30.8h // t16 = out16 @@ -1988,24 +1988,24 @@ function inv_dct32_odd_8h_x16_neon, export=1
smull_smlsl v4, v5, v24, v26, v0.h[0], v0.h[0], .8h // -> t20 smull_smlal v6, v7, v24, v26, v0.h[0], v0.h[0], .8h // -> t27 - rshrn_sz v20, v4, v5, #12, .8h // t20 - rshrn_sz v22, v6, v7, #12, .8h // t27 + sqrshrn_sz v20, v4, v5, #12, .8h // t20 + sqrshrn_sz v22, v6, v7, #12, .8h // t27
smull_smlal v4, v5, v25, v27, v0.h[0], v0.h[0], .8h // -> t26a smull_smlsl v6, v7, v25, v27, v0.h[0], v0.h[0], .8h // -> t21a mov v27.16b, v22.16b // t27 - rshrn_sz v26, v4, v5, #12, .8h // t26a + sqrshrn_sz v26, v4, v5, #12, .8h // t26a
smull_smlsl v24, v25, v21, v23, v0.h[0], v0.h[0], .8h // -> t22 smull_smlal v4, v5, v21, v23, v0.h[0], v0.h[0], .8h // -> t25 - rshrn_sz v21, v6, v7, #12, .8h // t21a - rshrn_sz v22, v24, v25, #12, .8h // t22 - rshrn_sz v25, v4, v5, #12, .8h // t25 + sqrshrn_sz v21, v6, v7, #12, .8h // t21a + sqrshrn_sz v22, v24, v25, #12, .8h // t22 + sqrshrn_sz v25, v4, v5, #12, .8h // t25
smull_smlsl v4, v5, v3, v2, v0.h[0], v0.h[0], .8h // -> t23a smull_smlal v6, v7, v3, v2, v0.h[0], v0.h[0], .8h // -> t24a - rshrn_sz v23, v4, v5, #12, .8h // t23a - rshrn_sz v24, v6, v7, #12, .8h // t24a + sqrshrn_sz v23, v4, v5, #12, .8h // t23a + sqrshrn_sz v24, v6, v7, #12, .8h // t24a
ret endfunc @@ -2594,11 +2594,11 @@ function inv_dct64_step1_neon neg v2.4s, v2.4s // t34a neg v3.4s, v3.4s // t34a smull_smlsl v6, v7, v30, v25, v1.h[1], v1.h[0], .8h // -> t33a - rshrn_sz v26, v2, v3, #12, .8h // t34a + sqrshrn_sz v26, v2, v3, #12, .8h // t34a smull_smlal v2, v3, v30, v25, v1.h[0], v1.h[1], .8h // -> t62a - rshrn_sz v29, v4, v5, #12, .8h // t61a - rshrn_sz v25, v6, v7, #12, .8h // t33a - rshrn_sz v30, v2, v3, #12, .8h // t62a + sqrshrn_sz v29, v4, v5, #12, .8h // t61a + sqrshrn_sz v25, v6, v7, #12, .8h // t33a + sqrshrn_sz v30, v2, v3, #12, .8h // t62a
sqadd v16.8h, v24.8h, v27.8h // t32a sqsub v19.8h, v24.8h, v27.8h // t35a @@ -2612,11 +2612,11 @@ function inv_dct64_step1_neon smull_smlal v2, v3, v21, v18, v1.h[2], v1.h[3], .8h // -> t61a smull_smlsl v4, v5, v21, v18, v1.h[3], v1.h[2], .8h // -> t34a smull_smlal v6, v7, v20, v19, v1.h[2], v1.h[3], .8h // -> t60 - rshrn_sz v21, v2, v3, #12, .8h // t61a - rshrn_sz v18, v4, v5, #12, .8h // t34a + sqrshrn_sz v21, v2, v3, #12, .8h // t61a + sqrshrn_sz v18, v4, v5, #12, .8h // t34a smull_smlsl v2, v3, v20, v19, v1.h[3], v1.h[2], .8h // -> t35 - rshrn_sz v20, v6, v7, #12, .8h // t60 - rshrn_sz v19, v2, v3, #12, .8h // t35 + sqrshrn_sz v20, v6, v7, #12, .8h // t60 + sqrshrn_sz v19, v2, v3, #12, .8h // t35
st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64 st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x6], #64 @@ -2653,13 +2653,13 @@ function inv_dct64_step2_neon smull_smlal v2, v3, v27, v25, v0.h[3], v0.h[2], .8h // -> t56a smull_smlsl v4, v5, v27, v25, v0.h[2], v0.h[3], .8h // -> t39a smull_smlal v6, v7, v31, v28, v0.h[3], v0.h[2], .8h // -> t40a - rshrn_sz v25, v2, v3, #12, .8h // t56a - rshrn_sz v27, v4, v5, #12, .8h // t39a + sqrshrn_sz v25, v2, v3, #12, .8h // t56a + sqrshrn_sz v27, v4, v5, #12, .8h // t39a neg v6.4s, v6.4s // t40a neg v7.4s, v7.4s // t40a smull_smlsl v2, v3, v31, v28, v0.h[2], v0.h[3], .8h // -> t55a - rshrn_sz v31, v6, v7, #12, .8h // t40a - rshrn_sz v28, v2, v3, #12, .8h // t55a + sqrshrn_sz v31, v6, v7, #12, .8h // t40a + sqrshrn_sz v28, v2, v3, #12, .8h // t55a
sqadd v16.8h, v24.8h, v29.8h // t32a sqsub v19.8h, v24.8h, v29.8h // t47a @@ -2673,11 +2673,11 @@ function inv_dct64_step2_neon smull_smlsl v2, v3, v21, v18, v0.h[0], v0.h[0], .8h // -> t40a smull_smlal v4, v5, v21, v18, v0.h[0], v0.h[0], .8h // -> t55a smull_smlsl v6, v7, v20, v19, v0.h[0], v0.h[0], .8h // -> t47 - rshrn_sz v18, v2, v3, #12, .8h // t40a - rshrn_sz v21, v4, v5, #12, .8h // t55a + sqrshrn_sz v18, v2, v3, #12, .8h // t40a + sqrshrn_sz v21, v4, v5, #12, .8h // t55a smull_smlal v2, v3, v20, v19, v0.h[0], v0.h[0], .8h // -> t48 - rshrn_sz v19, v6, v7, #12, .8h // t47 - rshrn_sz v20, v2, v3, #12, .8h // t48 + sqrshrn_sz v19, v6, v7, #12, .8h // t47 + sqrshrn_sz v20, v2, v3, #12, .8h // t48
str q16, [x6, #2*8*0] // t32a str q17, [x9, #2*8*0] // t39 diff --git a/third_party/dav1d/src/arm/64/looprestoration.S b/third_party/dav1d/src/arm/64/looprestoration.S index 778448a0f3f02..a598b72b03951 100644 --- a/third_party/dav1d/src/arm/64/looprestoration.S +++ b/third_party/dav1d/src/arm/64/looprestoration.S @@ -50,6 +50,7 @@ endconst // const int16_t filter[2][8], // const enum LrEdgeFlags edges); function wiener_filter7_8bpc_neon, export=1 + AARCH64_SIGN_LINK_REGISTER stp x29, x30, [sp, #-16]! mov x29, sp ld1 {v0.8h, v1.8h}, [x6] @@ -121,6 +122,7 @@ L(v1_7):
mov sp, x29 ldp x29, x30, [sp], #16 + AARCH64_VALIDATE_LINK_REGISTER ret
L(no_top_7): @@ -538,6 +540,7 @@ endfunc // const int16_t filter[2][8], // const enum LrEdgeFlags edges); function wiener_filter5_8bpc_neon, export=1 + AARCH64_SIGN_LINK_REGISTER stp x29, x30, [sp, #-16]! mov x29, sp ld1 {v0.8h, v1.8h}, [x6] @@ -598,6 +601,7 @@ L(end_5):
mov sp, x29 ldp x29, x30, [sp], #16 + AARCH64_VALIDATE_LINK_REGISTER ret
L(no_top_5): diff --git a/third_party/dav1d/src/arm/64/looprestoration16.S b/third_party/dav1d/src/arm/64/looprestoration16.S index fcb4f84e7ef88..8954e604cf559 100644 --- a/third_party/dav1d/src/arm/64/looprestoration16.S +++ b/third_party/dav1d/src/arm/64/looprestoration16.S @@ -52,6 +52,7 @@ endconst // const int bitdepth_max); function wiener_filter7_16bpc_neon, export=1 ldr w8, [sp] + AARCH64_SIGN_LINK_REGISTER stp x29, x30, [sp, #-32]! stp d8, d9, [sp, #16] mov x29, sp @@ -137,6 +138,7 @@ L(v1_7): mov sp, x29 ldp d8, d9, [sp, #16] ldp x29, x30, [sp], #32 + AARCH64_VALIDATE_LINK_REGISTER ret
L(no_top_7): @@ -595,6 +597,7 @@ endfunc // const int bitdepth_max); function wiener_filter5_16bpc_neon, export=1 ldr w8, [sp] + AARCH64_SIGN_LINK_REGISTER stp x29, x30, [sp, #-32]! stp d8, d9, [sp, #16] mov x29, sp @@ -669,6 +672,7 @@ L(end_5): mov sp, x29 ldp d8, d9, [sp, #16] ldp x29, x30, [sp], #32 + AARCH64_VALIDATE_LINK_REGISTER ret
L(no_top_5): diff --git a/third_party/dav1d/src/arm/asm.S b/third_party/dav1d/src/arm/asm.S index 017c89c9117d7..d1083c6b561b5 100644 --- a/third_party/dav1d/src/arm/asm.S +++ b/third_party/dav1d/src/arm/asm.S @@ -34,10 +34,78 @@ #define x18 do_not_use_x18 #define w18 do_not_use_w18
-/* Support macros for the Armv8.5-A Branch Target Identification feature which - * requires emitting a .note.gnu.property section with the appropriate - * architecture-dependent feature bits set. - * Read more: "ELF for the Arm® 64-bit Architecture" +/* Support macros for + * - Armv8.3-A Pointer Authentication and + * - Armv8.5-A Branch Target Identification + * features which require emitting a .note.gnu.property section with the + * appropriate architecture-dependent feature bits set. + * + * |AARCH64_SIGN_LINK_REGISTER| and |AARCH64_VALIDATE_LINK_REGISTER| expand to + * PACIxSP and AUTIxSP, respectively. |AARCH64_SIGN_LINK_REGISTER| should be + * used immediately before saving the LR register (x30) to the stack. + * |AARCH64_VALIDATE_LINK_REGISTER| should be used immediately after restoring + * it. Note |AARCH64_SIGN_LINK_REGISTER|'s modifications to LR must be undone + * with |AARCH64_VALIDATE_LINK_REGISTER| before RET. The SP register must also + * have the same value at the two points. For example: + * + * .global f + * f: + * AARCH64_SIGN_LINK_REGISTER + * stp x29, x30, [sp, #-96]! + * mov x29, sp + * ... + * ldp x29, x30, [sp], #96 + * AARCH64_VALIDATE_LINK_REGISTER + * ret + * + * |AARCH64_VALID_CALL_TARGET| expands to BTI 'c'. Either it, or + * |AARCH64_SIGN_LINK_REGISTER|, must be used at every point that may be an + * indirect call target. In particular, all symbols exported from a file must + * begin with one of these macros. For example, a leaf function that does not + * save LR can instead use |AARCH64_VALID_CALL_TARGET|: + * + * .globl return_zero + * return_zero: + * AARCH64_VALID_CALL_TARGET + * mov x0, #0 + * ret + * + * A non-leaf function which does not immediately save LR may need both macros + * because |AARCH64_SIGN_LINK_REGISTER| appears late. For example, the function + * may jump to an alternate implementation before setting up the stack: + * + * .globl with_early_jump + * with_early_jump: + * AARCH64_VALID_CALL_TARGET + * cmp x0, #128 + * b.lt .Lwith_early_jump_128 + * AARCH64_SIGN_LINK_REGISTER + * stp x29, x30, [sp, #-96]! + * mov x29, sp + * ... + * ldp x29, x30, [sp], #96 + * AARCH64_VALIDATE_LINK_REGISTER + * ret + * + * .Lwith_early_jump_128: + * ... + * ret + * + * These annotations are only required with indirect calls. Private symbols that + * are only the target of direct calls do not require annotations. Also note + * that |AARCH64_VALID_CALL_TARGET| is only valid for indirect calls (BLR), not + * indirect jumps (BR). Indirect jumps in assembly are supported through + * |AARCH64_VALID_JUMP_TARGET|. Landing Pads which shall serve for jumps and + * calls can be created using |AARCH64_VALID_JUMP_CALL_TARGET|. + * + * Although not necessary, it is safe to use these macros in 32-bit ARM + * assembly. This may be used to simplify dual 32-bit and 64-bit files. + * + * References: + * - "ELF for the Arm® 64-bit Architecture" + * https://github.com/ARM-software/abi-aa/blob/master/aaelf64/aaelf64.rst + * - "Providing protection for complex software" + * https://developer.arm.com/architectures/learn-the-architecture/providing-pro... */ #if defined(__ARM_FEATURE_BTI_DEFAULT) && (__ARM_FEATURE_BTI_DEFAULT == 1) #define GNU_PROPERTY_AARCH64_BTI (1 << 0) // Has Branch Target Identification @@ -51,7 +119,32 @@ #define AARCH64_VALID_JUMP_TARGET #endif
-#if (GNU_PROPERTY_AARCH64_BTI != 0) +#if defined(__ARM_FEATURE_PAC_DEFAULT) + +#if ((__ARM_FEATURE_PAC_DEFAULT & (1 << 0)) != 0) // authentication using key A +#define AARCH64_SIGN_LINK_REGISTER paciasp +#define AARCH64_VALIDATE_LINK_REGISTER autiasp +#elif ((__ARM_FEATURE_PAC_DEFAULT & (1 << 1)) != 0) // authentication using key B +#define AARCH64_SIGN_LINK_REGISTER pacibsp +#define AARCH64_VALIDATE_LINK_REGISTER autibsp +#else +#error Pointer authentication defines no valid key! +#endif +#if ((__ARM_FEATURE_PAC_DEFAULT & (1 << 2)) != 0) // authentication of leaf functions +#error Authentication of leaf functions is enabled but not supported in dav1d! +#endif +#define GNU_PROPERTY_AARCH64_PAC (1 << 1) + +#else /* __ARM_FEATURE_PAC_DEFAULT */ + +#define GNU_PROPERTY_AARCH64_PAC 0 +#define AARCH64_SIGN_LINK_REGISTER +#define AARCH64_VALIDATE_LINK_REGISTER + +#endif /* !__ARM_FEATURE_PAC_DEFAULT */ + + +#if (GNU_PROPERTY_AARCH64_BTI != 0 || GNU_PROPERTY_AARCH64_PAC != 0) && defined(__ELF__) .pushsection .note.gnu.property, "a" .balign 8 .long 4 @@ -60,11 +153,11 @@ .asciz "GNU" .long 0xc0000000 /* GNU_PROPERTY_AARCH64_FEATURE_1_AND */ .long 4 - .long GNU_PROPERTY_AARCH64_BTI + .long (GNU_PROPERTY_AARCH64_BTI | GNU_PROPERTY_AARCH64_PAC) .long 0 .popsection -#endif -#endif +#endif /* (GNU_PROPERTY_AARCH64_BTI != 0 || GNU_PROPERTY_AARCH64_PAC != 0) && defined(__ELF__) */ +#endif /* ARCH_AARCH64 */
#if ARCH_ARM .syntax unified @@ -74,7 +167,7 @@ .eabi_attribute 10, 0 // suppress Tag_FP_arch .eabi_attribute 12, 0 // suppress Tag_Advanced_SIMD_arch .section .note.GNU-stack,"",%progbits // Mark stack as non-executable -#endif +#endif /* __ELF__ */
#ifdef _WIN32 #define CONFIG_THUMB 1 @@ -89,8 +182,8 @@ #else #define A #define T @ -#endif -#endif +#endif /* CONFIG_THUMB */ +#endif /* ARCH_ARM */
#if !defined(PIC) #if defined(__PIC__) diff --git a/third_party/dav1d/src/arm/film_grain_init_tmpl.c b/third_party/dav1d/src/arm/filmgrain_init_tmpl.c similarity index 99% rename from third_party/dav1d/src/arm/film_grain_init_tmpl.c rename to third_party/dav1d/src/arm/filmgrain_init_tmpl.c index 3a416020532f5..2156047d02ad6 100644 --- a/third_party/dav1d/src/arm/film_grain_init_tmpl.c +++ b/third_party/dav1d/src/arm/filmgrain_init_tmpl.c @@ -28,7 +28,7 @@ */
#include "src/cpu.h" -#include "src/film_grain.h" +#include "src/filmgrain.h" #include "asm-offsets.h"
CHECK_OFFSET(Dav1dFilmGrainData, seed, FGD_SEED); diff --git a/third_party/dav1d/src/data.c b/third_party/dav1d/src/data.c index 4c7bf82715c48..fa6165ec72179 100644 --- a/third_party/dav1d/src/data.c +++ b/third_party/dav1d/src/data.c @@ -116,11 +116,17 @@ void dav1d_data_props_copy(Dav1dDataProps *const dst, void dav1d_data_props_set_defaults(Dav1dDataProps *const props) { assert(props != NULL);
+ memset(props, 0, sizeof(*props)); props->timestamp = INT64_MIN; - props->duration = 0; props->offset = -1; - props->user_data.data = NULL; - props->user_data.ref = NULL; +} + +void dav1d_data_props_unref_internal(Dav1dDataProps *const props) { + validate_input(props != NULL); + + struct Dav1dRef *user_data_ref = props->user_data.ref; + dav1d_data_props_set_defaults(props); + dav1d_ref_dec(&user_data_ref); }
void dav1d_data_unref_internal(Dav1dData *const buf) { @@ -132,5 +138,6 @@ void dav1d_data_unref_internal(Dav1dData *const buf) { dav1d_ref_dec(&buf->ref); } memset(buf, 0, sizeof(*buf)); + dav1d_data_props_set_defaults(&buf->m); dav1d_ref_dec(&user_data_ref); } diff --git a/third_party/dav1d/src/data.h b/third_party/dav1d/src/data.h index 5b07021c532ef..b34c1db702ca3 100644 --- a/third_party/dav1d/src/data.h +++ b/third_party/dav1d/src/data.h @@ -51,5 +51,6 @@ int dav1d_data_wrap_user_data_internal(Dav1dData *buf, void *cookie), void *cookie); void dav1d_data_unref_internal(Dav1dData *buf); +void dav1d_data_props_unref_internal(Dav1dDataProps *props);
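One note on ordering in the new dav1d_data_props_unref_internal() above: dav1d_data_props_set_defaults() now memset()s the whole struct, which clears props->user_data.ref, so the reference has to be stashed in a local before the reset and released only afterwards, otherwise the last pointer to it would be wiped and the reference would leak. The same three calls from the hunk, restated in isolation as a sketch with the reasoning in comments (assumes the dav1d internal headers for Dav1dDataProps, Dav1dRef and dav1d_ref_dec):

    static void props_reset_sketch(Dav1dDataProps *const props) {
        /* Save the only pointer to the user_data reference first ... */
        struct Dav1dRef *user_data_ref = props->user_data.ref;
        /* ... because set_defaults() now zeroes the struct (then re-sets
         * timestamp to INT64_MIN and offset to -1) ... */
        dav1d_data_props_set_defaults(props);
        /* ... and only then drop the reference. */
        dav1d_ref_dec(&user_data_ref);
    }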
#endif /* DAV1D_SRC_DATA_H */ diff --git a/third_party/dav1d/src/decode.c b/third_party/dav1d/src/decode.c index cec91e9fb3009..bd13014947e3b 100644 --- a/third_party/dav1d/src/decode.c +++ b/third_party/dav1d/src/decode.c @@ -42,7 +42,7 @@ #include "src/decode.h" #include "src/dequant_tables.h" #include "src/env.h" -#include "src/film_grain.h" +#include "src/filmgrain.h" #include "src/log.h" #include "src/qm.h" #include "src/recon.h" @@ -1242,6 +1242,7 @@ static int decode_b(Dav1dTaskContext *const t, if (DEBUG_BLOCK_INFO) printf("Post-uvmode[%d]: r=%d\n", b->uv_mode, ts->msac.rng);
+ b->uv_angle = 0; if (b->uv_mode == CFL_PRED) { #define SIGN(a) (!!(a) + ((a) > 0)) const int sign = dav1d_msac_decode_symbol_adapt8(&ts->msac, @@ -1274,8 +1275,6 @@ static int decode_b(Dav1dTaskContext *const t, uint16_t *const acdf = ts->cdf.m.angle_delta[b->uv_mode - VERT_PRED]; const int angle = dav1d_msac_decode_symbol_adapt8(&ts->msac, acdf, 6); b->uv_angle = angle - 3; - } else { - b->uv_angle = 0; } }
@@ -3231,8 +3230,6 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) { if (ret < 0) goto error; }
- retval = DAV1D_ERR(EINVAL); - // setup dequant tables init_quant_tables(f->seq_hdr, f->frame_hdr, f->frame_hdr->quant.yac, f->dq); if (f->frame_hdr->quant.qm) @@ -3356,7 +3353,7 @@ error:
int dav1d_decode_frame_main(Dav1dFrameContext *const f) { const Dav1dContext *const c = f->c; - int retval = DAV1D_ERR(ENOMEM); + int retval = DAV1D_ERR(EINVAL);
assert(f->c->n_tc == 1);
@@ -3500,7 +3497,13 @@ int dav1d_submit_frame(Dav1dContext *const c) { if (c->task_thread.cur && c->task_thread.cur < c->n_fc) c->task_thread.cur--; } - if (out_delayed->p.data[0]) { + const int error = f->task_thread.retval; + if (error) { + f->task_thread.retval = 0; + c->cached_error = error; + dav1d_data_props_copy(&c->cached_error_props, &out_delayed->p.m); + dav1d_thread_picture_unref(out_delayed); + } else if (out_delayed->p.data[0]) { const unsigned progress = atomic_load_explicit(&out_delayed->progress[1], memory_order_relaxed); if ((out_delayed->visible || c->output_invisible_frames) && @@ -3842,7 +3845,7 @@ int dav1d_submit_frame(Dav1dContext *const c) { dav1d_ref_dec(&c->refs[i].refmvs); } } - return res; + goto error; } } else { dav1d_task_frame_init(f); @@ -3869,6 +3872,7 @@ error: dav1d_ref_dec(&f->mvs_ref); dav1d_ref_dec(&f->seq_hdr_ref); dav1d_ref_dec(&f->frame_hdr_ref); + dav1d_data_props_copy(&c->cached_error_props, &c->in.m);
for (int i = 0; i < f->n_tile_data; i++) dav1d_data_unref_internal(&f->tile[i].data); diff --git a/third_party/dav1d/src/ext/x86/x86inc.asm b/third_party/dav1d/src/ext/x86/x86inc.asm index c9f2d6398cde1..68b1f74f4bb02 100644 --- a/third_party/dav1d/src/ext/x86/x86inc.asm +++ b/third_party/dav1d/src/ext/x86/x86inc.asm @@ -1,7 +1,7 @@ ;***************************************************************************** ;* x86inc.asm: x86 abstraction layer ;***************************************************************************** -;* Copyright (C) 2005-2021 x264 project +;* Copyright (C) 2005-2022 x264 project ;* ;* Authors: Loren Merritt lorenm@u.washington.edu ;* Henrik Gramner henrik@gramner.com @@ -238,6 +238,16 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 %endif %endmacro
+; Repeats an instruction/operation for multiple arguments. +; Example usage: "REPX {psrlw x, 8}, m0, m1, m2, m3" +%macro REPX 2-* ; operation, args + %xdefine %%f(x) %1 + %rep %0 - 1 + %rotate 1 + %%f(%1) + %endrep +%endmacro + %macro PUSH 1 push %1 %ifidn rstk, rsp @@ -1342,7 +1352,20 @@ INIT_XMM %1 %6, __src2 %endif %elif %0 >= 9 - __instr %6, %7, %8, %9 + %if avx_enabled && __sizeofreg >= 16 && %4 == 1 + %ifnnum regnumof%7 + %if %3 + vmovaps %6, %7 + %else + vmovdqa %6, %7 + %endif + __instr %6, %6, %8, %9 + %else + __instr %6, %7, %8, %9 + %endif + %else + __instr %6, %7, %8, %9 + %endif %elif %0 == 8 %if avx_enabled && __sizeofreg >= 16 && %4 == 0 %xdefine __src1 %7 @@ -1379,7 +1402,7 @@ INIT_XMM %else vmovdqa %6, %7 %endif - __instr %6, %8 + __instr %6, %6, %8 %else __instr %6, __src1, __src2 %endif @@ -1448,8 +1471,8 @@ AVX_INSTR andpd, sse2, 1, 0, 1 AVX_INSTR andps, sse, 1, 0, 1 AVX_INSTR blendpd, sse4, 1, 1, 0 AVX_INSTR blendps, sse4, 1, 1, 0 -AVX_INSTR blendvpd, sse4 ; can't be emulated -AVX_INSTR blendvps, sse4 ; can't be emulated +AVX_INSTR blendvpd, sse4, 1, 1, 0 ; last operand must be xmm0 with legacy encoding +AVX_INSTR blendvps, sse4, 1, 1, 0 ; last operand must be xmm0 with legacy encoding AVX_INSTR cmpeqpd, sse2, 1, 0, 1 AVX_INSTR cmpeqps, sse, 1, 0, 1 AVX_INSTR cmpeqsd, sse2, 1, 0, 0 @@ -1582,7 +1605,7 @@ AVX_INSTR pand, mmx, 0, 0, 1 AVX_INSTR pandn, mmx, 0, 0, 0 AVX_INSTR pavgb, mmx2, 0, 0, 1 AVX_INSTR pavgw, mmx2, 0, 0, 1 -AVX_INSTR pblendvb, sse4 ; can't be emulated +AVX_INSTR pblendvb, sse4, 0, 1, 0 ; last operand must be xmm0 with legacy encoding AVX_INSTR pblendw, sse4, 0, 1, 0 AVX_INSTR pclmulhqhqdq, fnord, 0, 0, 0 AVX_INSTR pclmulhqlqdq, fnord, 0, 0, 0 diff --git a/third_party/dav1d/src/fg_apply.h b/third_party/dav1d/src/fg_apply.h index 779549bb2d020..be6685d8018bc 100644 --- a/third_party/dav1d/src/fg_apply.h +++ b/third_party/dav1d/src/fg_apply.h @@ -32,10 +32,27 @@
#include "common/bitdepth.h"
-#include "src/film_grain.h" +#include "src/filmgrain.h"
-bitfn_decls(void dav1d_apply_grain, const Dav1dFilmGrainDSPContext *const dsp, - Dav1dPicture *const out, - const Dav1dPicture *const in); +#ifdef BITDEPTH +# define array_decl(type, name, sz) type name sz +#else +# define array_decl(type, name, sz) void *name +#endif + +bitfn_decls(void dav1d_apply_grain, + const Dav1dFilmGrainDSPContext *const dsp, + Dav1dPicture *const out, const Dav1dPicture *const in); +bitfn_decls(void dav1d_prep_grain, + const Dav1dFilmGrainDSPContext *const dsp, + Dav1dPicture *const out, const Dav1dPicture *const in, + array_decl(uint8_t, scaling, [3][SCALING_SIZE]), + array_decl(entry, grain_lut, [3][GRAIN_HEIGHT+1][GRAIN_WIDTH])); +bitfn_decls(void dav1d_apply_grain_row, + const Dav1dFilmGrainDSPContext *const dsp, + Dav1dPicture *const out, const Dav1dPicture *const in, + array_decl(const uint8_t, scaling, [3][SCALING_SIZE]), + array_decl(const entry, grain_lut, [3][GRAIN_HEIGHT+1][GRAIN_WIDTH]), + const int row);
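The header change above splits grain application into a one-time preparation step (grain LUTs plus scaling tables) and a per-row application step, presumably so grain can be applied per 32-line row rather than once for the whole frame. A minimal sketch of the intended call pattern, mirroring the dav1d_apply_grain() wrapper added in fg_apply_tmpl.c further down; the bitfn()/bitdepth template suffixes and the stack-alignment macro used by the real wrapper are omitted here for brevity, and the buffer shapes follow the declarations above:

    entry   grain_lut[3][GRAIN_HEIGHT + 1][GRAIN_WIDTH];
    uint8_t scaling[3][SCALING_SIZE];
    const int rows = (out->p.h + 31) >> 5;              /* one row == 32 luma lines */

    dav1d_prep_grain(dsp, out, in, scaling, grain_lut); /* once per frame */
    for (int row = 0; row < rows; row++)
        dav1d_apply_grain_row(dsp, out, in, scaling, grain_lut, row);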
#endif /* DAV1D_SRC_FG_APPLY_H */ diff --git a/third_party/dav1d/src/fg_apply_tmpl.c b/third_party/dav1d/src/fg_apply_tmpl.c index c254a3dffa4f8..ee14db9a4cea6 100644 --- a/third_party/dav1d/src/fg_apply_tmpl.c +++ b/third_party/dav1d/src/fg_apply_tmpl.c @@ -30,13 +30,13 @@
#include <stdint.h>
+#include "dav1d/common.h" #include "dav1d/picture.h"
-#include "common.h" #include "common/intops.h" #include "common/bitdepth.h"
-#include "fg_apply.h" +#include "src/fg_apply.h"
static void generate_scaling(const int bitdepth, const uint8_t points[][2], const int num, @@ -44,14 +44,15 @@ static void generate_scaling(const int bitdepth, { #if BITDEPTH == 8 const int shift_x = 0; + const int scaling_size = SCALING_SIZE; #else + assert(bitdepth > 8); const int shift_x = bitdepth - 8; -#endif const int scaling_size = 1 << bitdepth; +#endif
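For reference, the reworked preamble above gives: 8-bit builds keep the fixed 256-entry table (SCALING_SIZE), while higher bitdepths size the table from the actual bitdepth, and the interpolation loop below only writes every (1 << shift_x)-th entry directly, leaving the in-between entries to the BITDEPTH != 8 pass at the end of the function. A small hypothetical helper, not part of the patch, just to make the numbers concrete:

    #include <assert.h>

    /* Scaling-table geometry per bitdepth, as set up by the hunk above. */
    static void scaling_geometry(const int bitdepth, int *shift_x, int *size) {
        if (bitdepth == 8) {
            *shift_x = 0;
            *size = 256;                /* == SCALING_SIZE in the 8 bpc build */
        } else {
            assert(bitdepth > 8);
            *shift_x = bitdepth - 8;    /* 2 for 10-bit, 4 for 12-bit */
            *size = 1 << bitdepth;      /* 1024 for 10-bit, 4096 for 12-bit */
        }
    }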
// Fill up the preceding entries with the initial value - for (int i = 0; i < points[0][0] << shift_x; i++) - scaling[i] = points[0][1]; + memset(scaling, points[0][1], points[0][0] << shift_x);
// Linearly interpolate the values in the middle for (int i = 0; i < num - 1; i++) { @@ -61,16 +62,17 @@ static void generate_scaling(const int bitdepth, const int ey = points[i+1][1]; const int dx = ex - bx; const int dy = ey - by; + assert(dx > 0); const int delta = dy * ((0x10000 + (dx >> 1)) / dx); - for (int x = 0; x < dx; x++) { - const int v = by + ((x * delta + 0x8000) >> 16); - scaling[(bx + x) << shift_x] = v; + for (int x = 0, d = 0x8000; x < dx; x++) { + scaling[(bx + x) << shift_x] = by + (d >> 16); + d += delta; } }
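The interpolation rewrite above is a strength reduction: instead of recomputing by + ((x * delta + 0x8000) >> 16) from scratch every iteration, the new loop carries an accumulator d = 0x8000 + x * delta and adds delta once per step, which yields bit-identical values. A tiny standalone check of the equivalence (the by/delta/dx numbers are made up for the example):

    #include <assert.h>

    int main(void) {
        const int by = 40, delta = 3000, dx = 17;
        for (int x = 0, d = 0x8000; x < dx; x++, d += delta) {
            const int old_way = by + ((x * delta + 0x8000) >> 16);
            const int new_way = by + (d >> 16);   /* d == 0x8000 + x * delta */
            assert(old_way == new_way);
        }
        return 0;
    }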
// Fill up the remaining entries with the final value - for (int i = points[num - 1][0] << shift_x; i < scaling_size; i++) - scaling[i] = points[num - 1][1]; + const int n = points[num - 1][0] << shift_x; + memset(&scaling[n], points[num - 1][1], scaling_size - n);
#if BITDEPTH != 8 const int pad = 1 << shift_x, rnd = pad >> 1; @@ -80,8 +82,9 @@ static void generate_scaling(const int bitdepth, const int dx = ex - bx; for (int x = 0; x < dx; x += pad) { const int range = scaling[bx + x + pad] - scaling[bx + x]; - for (int n = 1; n < pad; n++) { - scaling[bx + x + n] = scaling[bx + x] + ((range * n + rnd) >> shift_x); + for (int n = 1, r = rnd; n < pad; n++) { + r += range; + scaling[bx + x + n] = scaling[bx + x] + (r >> shift_x); } } } @@ -89,14 +92,13 @@ static void generate_scaling(const int bitdepth, }
#ifndef UNIT_TEST -void bitfn(dav1d_apply_grain)(const Dav1dFilmGrainDSPContext *const dsp, - Dav1dPicture *const out, - const Dav1dPicture *const in) +void bitfn(dav1d_prep_grain)(const Dav1dFilmGrainDSPContext *const dsp, + Dav1dPicture *const out, + const Dav1dPicture *const in, + uint8_t scaling[3][SCALING_SIZE], + entry grain_lut[3][GRAIN_HEIGHT+1][GRAIN_WIDTH]) { const Dav1dFilmGrainData *const data = &out->frame_hdr->film_grain.data; - - ALIGN_STK_16(entry, grain_lut, 3,[GRAIN_HEIGHT + 1][GRAIN_WIDTH]); - uint8_t scaling[3][SCALING_SIZE]; #if BITDEPTH != 8 const int bitdepth_max = (1 << out->p.bpc) - 1; #endif @@ -150,60 +152,86 @@ void bitfn(dav1d_apply_grain)(const Dav1dFilmGrainDSPContext *const dsp, memcpy(out->data[2], in->data[2], sz); } } +}
+void bitfn(dav1d_apply_grain_row)(const Dav1dFilmGrainDSPContext *const dsp, + Dav1dPicture *const out, + const Dav1dPicture *const in, + const uint8_t scaling[3][SCALING_SIZE], + const entry grain_lut[3][GRAIN_HEIGHT+1][GRAIN_WIDTH], + const int row) +{ // Synthesize grain for the affected planes - const int rows = (out->p.h + 31) >> 5; + const Dav1dFilmGrainData *const data = &out->frame_hdr->film_grain.data; const int ss_y = in->p.layout == DAV1D_PIXEL_LAYOUT_I420; const int ss_x = in->p.layout != DAV1D_PIXEL_LAYOUT_I444; const int cpw = (out->p.w + ss_x) >> ss_x; const int is_id = out->seq_hdr->mtrx == DAV1D_MC_IDENTITY; - for (int row = 0; row < rows; row++) { - pixel *const luma_src = - ((pixel *) in->data[0]) + row * BLOCK_SIZE * PXSTRIDE(in->stride[0]); - - if (data->num_y_points) { - const int bh = imin(out->p.h - row * BLOCK_SIZE, BLOCK_SIZE); - dsp->fgy_32x32xn(((pixel *) out->data[0]) + row * BLOCK_SIZE * PXSTRIDE(out->stride[0]), - luma_src, out->stride[0], data, - out->p.w, scaling[0], grain_lut[0], bh, row HIGHBD_TAIL_SUFFIX); - } + pixel *const luma_src = + ((pixel *) in->data[0]) + row * BLOCK_SIZE * PXSTRIDE(in->stride[0]); +#if BITDEPTH != 8 + const int bitdepth_max = (1 << out->p.bpc) - 1; +#endif
- if (!data->num_uv_points[0] && !data->num_uv_points[1] && - !data->chroma_scaling_from_luma) - { - continue; - } + if (data->num_y_points) { + const int bh = imin(out->p.h - row * BLOCK_SIZE, BLOCK_SIZE); + dsp->fgy_32x32xn(((pixel *) out->data[0]) + row * BLOCK_SIZE * PXSTRIDE(out->stride[0]), + luma_src, out->stride[0], data, + out->p.w, scaling[0], grain_lut[0], bh, row HIGHBD_TAIL_SUFFIX); + }
- const int bh = (imin(out->p.h - row * BLOCK_SIZE, BLOCK_SIZE) + ss_y) >> ss_y; + if (!data->num_uv_points[0] && !data->num_uv_points[1] && + !data->chroma_scaling_from_luma) + { + return; + }
- // extend padding pixels - if (out->p.w & ss_x) { - pixel *ptr = luma_src; - for (int y = 0; y < bh; y++) { - ptr[out->p.w] = ptr[out->p.w - 1]; - ptr += PXSTRIDE(in->stride[0]) << ss_y; - } + const int bh = (imin(out->p.h - row * BLOCK_SIZE, BLOCK_SIZE) + ss_y) >> ss_y; + + // extend padding pixels + if (out->p.w & ss_x) { + pixel *ptr = luma_src; + for (int y = 0; y < bh; y++) { + ptr[out->p.w] = ptr[out->p.w - 1]; + ptr += PXSTRIDE(in->stride[0]) << ss_y; } + }
- const ptrdiff_t uv_off = row * BLOCK_SIZE * PXSTRIDE(out->stride[1]) >> ss_y; - if (data->chroma_scaling_from_luma) { - for (int pl = 0; pl < 2; pl++) + const ptrdiff_t uv_off = row * BLOCK_SIZE * PXSTRIDE(out->stride[1]) >> ss_y; + if (data->chroma_scaling_from_luma) { + for (int pl = 0; pl < 2; pl++) + dsp->fguv_32x32xn[in->p.layout - 1](((pixel *) out->data[1 + pl]) + uv_off, + ((const pixel *) in->data[1 + pl]) + uv_off, + in->stride[1], data, cpw, + scaling[0], grain_lut[1 + pl], + bh, row, luma_src, in->stride[0], + pl, is_id HIGHBD_TAIL_SUFFIX); + } else { + for (int pl = 0; pl < 2; pl++) + if (data->num_uv_points[pl]) dsp->fguv_32x32xn[in->p.layout - 1](((pixel *) out->data[1 + pl]) + uv_off, ((const pixel *) in->data[1 + pl]) + uv_off, in->stride[1], data, cpw, - scaling[0], grain_lut[1 + pl], + scaling[1 + pl], grain_lut[1 + pl], bh, row, luma_src, in->stride[0], pl, is_id HIGHBD_TAIL_SUFFIX); - } else { - for (int pl = 0; pl < 2; pl++) - if (data->num_uv_points[pl]) - dsp->fguv_32x32xn[in->p.layout - 1](((pixel *) out->data[1 + pl]) + uv_off, - ((const pixel *) in->data[1 + pl]) + uv_off, - in->stride[1], data, cpw, - scaling[1 + pl], grain_lut[1 + pl], - bh, row, luma_src, in->stride[0], - pl, is_id HIGHBD_TAIL_SUFFIX); - } } } + +void bitfn(dav1d_apply_grain)(const Dav1dFilmGrainDSPContext *const dsp, + Dav1dPicture *const out, + const Dav1dPicture *const in) +{ + ALIGN_STK_16(entry, grain_lut, 3,[GRAIN_HEIGHT + 1][GRAIN_WIDTH]); +#if ARCH_X86_64 && BITDEPTH == 8 + ALIGN_STK_64(uint8_t, scaling, 3,[SCALING_SIZE]); +#else + uint8_t scaling[3][SCALING_SIZE]; +#endif + const int rows = (out->p.h + 31) >> 5; + + bitfn(dav1d_prep_grain)(dsp, out, in, scaling, grain_lut); + for (int row = 0; row < rows; row++) + bitfn(dav1d_apply_grain_row)(dsp, out, in, scaling, grain_lut, row); +} #endif diff --git a/third_party/dav1d/src/filmgrain.h b/third_party/dav1d/src/filmgrain.h new file mode 100644 index 0000000000000..d953542a82a73 --- /dev/null +++ b/third_party/dav1d/src/filmgrain.h @@ -0,0 +1,86 @@ +/* + * Copyright © 2018-2021, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef DAV1D_SRC_FILM_GRAIN_H +#define DAV1D_SRC_FILM_GRAIN_H + +#include "common/bitdepth.h" + +#include "src/levels.h" + +#define GRAIN_WIDTH 82 +#define GRAIN_HEIGHT 73 +#define BLOCK_SIZE 32 +#if !defined(BITDEPTH) || BITDEPTH == 8 +#define SCALING_SIZE 256 +typedef int8_t entry; +#else +#define SCALING_SIZE 4096 +typedef int16_t entry; +#endif + +#define decl_generate_grain_y_fn(name) \ +void (name)(entry buf[][GRAIN_WIDTH], \ + const Dav1dFilmGrainData *const data HIGHBD_DECL_SUFFIX) +typedef decl_generate_grain_y_fn(*generate_grain_y_fn); + +#define decl_generate_grain_uv_fn(name) \ +void (name)(entry buf[][GRAIN_WIDTH], \ + const entry buf_y[][GRAIN_WIDTH], \ + const Dav1dFilmGrainData *const data, const intptr_t uv HIGHBD_DECL_SUFFIX) +typedef decl_generate_grain_uv_fn(*generate_grain_uv_fn); + +#define decl_fgy_32x32xn_fn(name) \ +void (name)(pixel *dst_row, const pixel *src_row, ptrdiff_t stride, \ + const Dav1dFilmGrainData *data, \ + size_t pw, const uint8_t scaling[SCALING_SIZE], \ + const entry grain_lut[][GRAIN_WIDTH], \ + int bh, int row_num HIGHBD_DECL_SUFFIX) +typedef decl_fgy_32x32xn_fn(*fgy_32x32xn_fn); + +#define decl_fguv_32x32xn_fn(name) \ +void (name)(pixel *dst_row, const pixel *src_row, ptrdiff_t stride, \ + const Dav1dFilmGrainData *data, int pw, \ + const uint8_t scaling[SCALING_SIZE], \ + const entry grain_lut[][GRAIN_WIDTH], int bh, int row_num, \ + const pixel *luma_row, ptrdiff_t luma_stride, \ + int uv_pl, int is_id HIGHBD_DECL_SUFFIX) +typedef decl_fguv_32x32xn_fn(*fguv_32x32xn_fn); + +typedef struct Dav1dFilmGrainDSPContext { + generate_grain_y_fn generate_grain_y; + generate_grain_uv_fn generate_grain_uv[3]; + + fgy_32x32xn_fn fgy_32x32xn; + fguv_32x32xn_fn fguv_32x32xn[3]; +} Dav1dFilmGrainDSPContext; + +bitfn_decls(void dav1d_film_grain_dsp_init, Dav1dFilmGrainDSPContext *c); +bitfn_decls(void dav1d_film_grain_dsp_init_arm, Dav1dFilmGrainDSPContext *c); +bitfn_decls(void dav1d_film_grain_dsp_init_x86, Dav1dFilmGrainDSPContext *c); + +#endif /* DAV1D_SRC_FILM_GRAIN_H */ diff --git a/third_party/dav1d/src/filmgrain_tmpl.c b/third_party/dav1d/src/filmgrain_tmpl.c new file mode 100644 index 0000000000000..883c5cbb7b9a8 --- /dev/null +++ b/third_party/dav1d/src/filmgrain_tmpl.c @@ -0,0 +1,433 @@ +/* + * Copyright © 2018, Niklas Haas + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "common/attributes.h" +#include "common/intops.h" + +#include "src/filmgrain.h" +#include "src/tables.h" + +#define SUB_GRAIN_WIDTH 44 +#define SUB_GRAIN_HEIGHT 38 + +static inline int get_random_number(const int bits, unsigned *const state) { + const int r = *state; + unsigned bit = ((r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & 1; + *state = (r >> 1) | (bit << 15); + + return (*state >> (16 - bits)) & ((1 << bits) - 1); +} + +static inline int round2(const int x, const uint64_t shift) { + return (x + ((1 << shift) >> 1)) >> shift; +} + +static void generate_grain_y_c(entry buf[][GRAIN_WIDTH], + const Dav1dFilmGrainData *const data + HIGHBD_DECL_SUFFIX) +{ + const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; + unsigned seed = data->seed; + const int shift = 4 - bitdepth_min_8 + data->grain_scale_shift; + const int grain_ctr = 128 << bitdepth_min_8; + const int grain_min = -grain_ctr, grain_max = grain_ctr - 1; + + for (int y = 0; y < GRAIN_HEIGHT; y++) { + for (int x = 0; x < GRAIN_WIDTH; x++) { + const int value = get_random_number(11, &seed); + buf[y][x] = round2(dav1d_gaussian_sequence[ value ], shift); + } + } + + const int ar_pad = 3; + const int ar_lag = data->ar_coeff_lag; + + for (int y = ar_pad; y < GRAIN_HEIGHT; y++) { + for (int x = ar_pad; x < GRAIN_WIDTH - ar_pad; x++) { + const int8_t *coeff = data->ar_coeffs_y; + int sum = 0; + for (int dy = -ar_lag; dy <= 0; dy++) { + for (int dx = -ar_lag; dx <= ar_lag; dx++) { + if (!dx && !dy) + break; + sum += *(coeff++) * buf[y + dy][x + dx]; + } + } + + const int grain = buf[y][x] + round2(sum, data->ar_coeff_shift); + buf[y][x] = iclip(grain, grain_min, grain_max); + } + } +} + +static NOINLINE void +generate_grain_uv_c(entry buf[][GRAIN_WIDTH], + const entry buf_y[][GRAIN_WIDTH], + const Dav1dFilmGrainData *const data, const intptr_t uv, + const int subx, const int suby HIGHBD_DECL_SUFFIX) +{ + const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; + unsigned seed = data->seed ^ (uv ? 0x49d8 : 0xb524); + const int shift = 4 - bitdepth_min_8 + data->grain_scale_shift; + const int grain_ctr = 128 << bitdepth_min_8; + const int grain_min = -grain_ctr, grain_max = grain_ctr - 1; + + const int chromaW = subx ? SUB_GRAIN_WIDTH : GRAIN_WIDTH; + const int chromaH = suby ? 
SUB_GRAIN_HEIGHT : GRAIN_HEIGHT; + + for (int y = 0; y < chromaH; y++) { + for (int x = 0; x < chromaW; x++) { + const int value = get_random_number(11, &seed); + buf[y][x] = round2(dav1d_gaussian_sequence[ value ], shift); + } + } + + const int ar_pad = 3; + const int ar_lag = data->ar_coeff_lag; + + for (int y = ar_pad; y < chromaH; y++) { + for (int x = ar_pad; x < chromaW - ar_pad; x++) { + const int8_t *coeff = data->ar_coeffs_uv[uv]; + int sum = 0; + for (int dy = -ar_lag; dy <= 0; dy++) { + for (int dx = -ar_lag; dx <= ar_lag; dx++) { + // For the final (current) pixel, we need to add in the + // contribution from the luma grain texture + if (!dx && !dy) { + if (!data->num_y_points) + break; + int luma = 0; + const int lumaX = ((x - ar_pad) << subx) + ar_pad; + const int lumaY = ((y - ar_pad) << suby) + ar_pad; + for (int i = 0; i <= suby; i++) { + for (int j = 0; j <= subx; j++) { + luma += buf_y[lumaY + i][lumaX + j]; + } + } + luma = round2(luma, subx + suby); + sum += luma * (*coeff); + break; + } + + sum += *(coeff++) * buf[y + dy][x + dx]; + } + } + + const int grain = buf[y][x] + round2(sum, data->ar_coeff_shift); + buf[y][x] = iclip(grain, grain_min, grain_max); + } + } +} + +#define gnuv_ss_fn(nm, ss_x, ss_y) \ +static decl_generate_grain_uv_fn(generate_grain_uv_##nm##_c) { \ + generate_grain_uv_c(buf, buf_y, data, uv, ss_x, ss_y HIGHBD_TAIL_SUFFIX); \ +} + +gnuv_ss_fn(420, 1, 1); +gnuv_ss_fn(422, 1, 0); +gnuv_ss_fn(444, 0, 0); + +// samples from the correct block of a grain LUT, while taking into account the +// offsets provided by the offsets cache +static inline entry sample_lut(const entry grain_lut[][GRAIN_WIDTH], + const int offsets[2][2], const int subx, const int suby, + const int bx, const int by, const int x, const int y) +{ + const int randval = offsets[bx][by]; + const int offx = 3 + (2 >> subx) * (3 + (randval >> 4)); + const int offy = 3 + (2 >> suby) * (3 + (randval & 0xF)); + return grain_lut[offy + y + (BLOCK_SIZE >> suby) * by] + [offx + x + (BLOCK_SIZE >> subx) * bx]; +} + +static void fgy_32x32xn_c(pixel *const dst_row, const pixel *const src_row, + const ptrdiff_t stride, + const Dav1dFilmGrainData *const data, const size_t pw, + const uint8_t scaling[SCALING_SIZE], + const entry grain_lut[][GRAIN_WIDTH], + const int bh, const int row_num HIGHBD_DECL_SUFFIX) +{ + const int rows = 1 + (data->overlap_flag && row_num > 0); + const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; + const int grain_ctr = 128 << bitdepth_min_8; + const int grain_min = -grain_ctr, grain_max = grain_ctr - 1; + + int min_value, max_value; + if (data->clip_to_restricted_range) { + min_value = 16 << bitdepth_min_8; + max_value = 235 << bitdepth_min_8; + } else { + min_value = 0; + max_value = BITDEPTH_MAX; + } + + // seed[0] contains the current row, seed[1] contains the previous + unsigned seed[2]; + for (int i = 0; i < rows; i++) { + seed[i] = data->seed; + seed[i] ^= (((row_num - i) * 37 + 178) & 0xFF) << 8; + seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF); + } + + assert(stride % (BLOCK_SIZE * sizeof(pixel)) == 0); + + int offsets[2 /* col offset */][2 /* row offset */]; + + // process this row in BLOCK_SIZE^2 blocks + for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE) { + const int bw = imin(BLOCK_SIZE, (int) pw - bx); + + if (data->overlap_flag && bx) { + // shift previous offsets left + for (int i = 0; i < rows; i++) + offsets[1][i] = offsets[0][i]; + } + + // update current offsets + for (int i = 0; i < rows; i++) + offsets[0][i] = get_random_number(8, &seed[i]); + 
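For reference, the block seeding above is driven by the 16-bit LFSR in get_random_number(): each 32-pixel row of blocks re-seeds the generator from data->seed XORed with two bytes derived from row_num, then draws one 8-bit offset per block, whose high nibble becomes the x offset and low nibble the y offset in sample_lut(). A minimal standalone sketch of that generator (the frame seed value is made up):

#include <stdio.h>

/* same LFSR as get_random_number() above */
static int get_random_number(const int bits, unsigned *const state) {
    const unsigned r = *state;
    const unsigned bit = ((r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & 1;
    *state = (r >> 1) | (bit << 15);
    return (*state >> (16 - bits)) & ((1 << bits) - 1);
}

int main(void) {
    const unsigned frame_seed = 0x1234; /* hypothetical data->seed */
    for (int row_num = 0; row_num < 2; row_num++) {
        unsigned seed = frame_seed;
        seed ^= (((row_num * 37 + 178) & 0xFF) << 8);
        seed ^= ((row_num * 173 + 105) & 0xFF);
        for (int block = 0; block < 3; block++) {
            const int off = get_random_number(8, &seed);
            printf("row %d block %d: offset 0x%02x (offx nibble %d, offy nibble %d)\n",
                   row_num, block, (unsigned)off, off >> 4, off & 0xF);
        }
    }
    return 0;
}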
+ // x/y block offsets to compensate for overlapped regions + const int ystart = data->overlap_flag && row_num ? imin(2, bh) : 0; + const int xstart = data->overlap_flag && bx ? imin(2, bw) : 0; + + static const int w[2][2] = { { 27, 17 }, { 17, 27 } }; + +#define add_noise_y(x, y, grain) \ + const pixel *const src = src_row + (y) * PXSTRIDE(stride) + (x) + bx; \ + pixel *const dst = dst_row + (y) * PXSTRIDE(stride) + (x) + bx; \ + const int noise = round2(scaling[ *src ] * (grain), data->scaling_shift); \ + *dst = iclip(*src + noise, min_value, max_value); + + for (int y = ystart; y < bh; y++) { + // Non-overlapped image region (straightforward) + for (int x = xstart; x < bw; x++) { + int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y); + add_noise_y(x, y, grain); + } + + // Special case for overlapped column + for (int x = 0; x < xstart; x++) { + int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y); + int old = sample_lut(grain_lut, offsets, 0, 0, 1, 0, x, y); + grain = round2(old * w[x][0] + grain * w[x][1], 5); + grain = iclip(grain, grain_min, grain_max); + add_noise_y(x, y, grain); + } + } + + for (int y = 0; y < ystart; y++) { + // Special case for overlapped row (sans corner) + for (int x = xstart; x < bw; x++) { + int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y); + int old = sample_lut(grain_lut, offsets, 0, 0, 0, 1, x, y); + grain = round2(old * w[y][0] + grain * w[y][1], 5); + grain = iclip(grain, grain_min, grain_max); + add_noise_y(x, y, grain); + } + + // Special case for doubly-overlapped corner + for (int x = 0; x < xstart; x++) { + // Blend the top pixel with the top left block + int top = sample_lut(grain_lut, offsets, 0, 0, 0, 1, x, y); + int old = sample_lut(grain_lut, offsets, 0, 0, 1, 1, x, y); + top = round2(old * w[x][0] + top * w[x][1], 5); + top = iclip(top, grain_min, grain_max); + + // Blend the current pixel with the left block + int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y); + old = sample_lut(grain_lut, offsets, 0, 0, 1, 0, x, y); + grain = round2(old * w[x][0] + grain * w[x][1], 5); + grain = iclip(grain, grain_min, grain_max); + + // Mix the row rows together and apply grain + grain = round2(top * w[y][0] + grain * w[y][1], 5); + grain = iclip(grain, grain_min, grain_max); + add_noise_y(x, y, grain); + } + } + } +} + +static NOINLINE void +fguv_32x32xn_c(pixel *const dst_row, const pixel *const src_row, + const ptrdiff_t stride, const Dav1dFilmGrainData *const data, + const int pw, const uint8_t scaling[SCALING_SIZE], + const entry grain_lut[][GRAIN_WIDTH], const int bh, + const int row_num, const pixel *const luma_row, + const ptrdiff_t luma_stride, const int uv, const int is_id, + const int sx, const int sy HIGHBD_DECL_SUFFIX) +{ + const int rows = 1 + (data->overlap_flag && row_num > 0); + const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; + const int grain_ctr = 128 << bitdepth_min_8; + const int grain_min = -grain_ctr, grain_max = grain_ctr - 1; + + int min_value, max_value; + if (data->clip_to_restricted_range) { + min_value = 16 << bitdepth_min_8; + max_value = (is_id ? 
235 : 240) << bitdepth_min_8; + } else { + min_value = 0; + max_value = BITDEPTH_MAX; + } + + // seed[0] contains the current row, seed[1] contains the previous + unsigned seed[2]; + for (int i = 0; i < rows; i++) { + seed[i] = data->seed; + seed[i] ^= (((row_num - i) * 37 + 178) & 0xFF) << 8; + seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF); + } + + assert(stride % (BLOCK_SIZE * sizeof(pixel)) == 0); + + int offsets[2 /* col offset */][2 /* row offset */]; + + // process this row in BLOCK_SIZE^2 blocks (subsampled) + for (int bx = 0; bx < pw; bx += BLOCK_SIZE >> sx) { + const int bw = imin(BLOCK_SIZE >> sx, pw - bx); + if (data->overlap_flag && bx) { + // shift previous offsets left + for (int i = 0; i < rows; i++) + offsets[1][i] = offsets[0][i]; + } + + // update current offsets + for (int i = 0; i < rows; i++) + offsets[0][i] = get_random_number(8, &seed[i]); + + // x/y block offsets to compensate for overlapped regions + const int ystart = data->overlap_flag && row_num ? imin(2 >> sy, bh) : 0; + const int xstart = data->overlap_flag && bx ? imin(2 >> sx, bw) : 0; + + static const int w[2 /* sub */][2 /* off */][2] = { + { { 27, 17 }, { 17, 27 } }, + { { 23, 22 } }, + }; + +#define add_noise_uv(x, y, grain) \ + const int lx = (bx + x) << sx; \ + const int ly = y << sy; \ + const pixel *const luma = luma_row + ly * PXSTRIDE(luma_stride) + lx; \ + pixel avg = luma[0]; \ + if (sx) \ + avg = (avg + luma[1] + 1) >> 1; \ + const pixel *const src = src_row + (y) * PXSTRIDE(stride) + (bx + (x)); \ + pixel *const dst = dst_row + (y) * PXSTRIDE(stride) + (bx + (x)); \ + int val = avg; \ + if (!data->chroma_scaling_from_luma) { \ + const int combined = avg * data->uv_luma_mult[uv] + \ + *src * data->uv_mult[uv]; \ + val = iclip_pixel( (combined >> 6) + \ + (data->uv_offset[uv] * (1 << bitdepth_min_8)) ); \ + } \ + const int noise = round2(scaling[ val ] * (grain), data->scaling_shift); \ + *dst = iclip(*src + noise, min_value, max_value); + + for (int y = ystart; y < bh; y++) { + // Non-overlapped image region (straightforward) + for (int x = xstart; x < bw; x++) { + int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y); + add_noise_uv(x, y, grain); + } + + // Special case for overlapped column + for (int x = 0; x < xstart; x++) { + int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y); + int old = sample_lut(grain_lut, offsets, sx, sy, 1, 0, x, y); + grain = round2(old * w[sx][x][0] + grain * w[sx][x][1], 5); + grain = iclip(grain, grain_min, grain_max); + add_noise_uv(x, y, grain); + } + } + + for (int y = 0; y < ystart; y++) { + // Special case for overlapped row (sans corner) + for (int x = xstart; x < bw; x++) { + int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y); + int old = sample_lut(grain_lut, offsets, sx, sy, 0, 1, x, y); + grain = round2(old * w[sy][y][0] + grain * w[sy][y][1], 5); + grain = iclip(grain, grain_min, grain_max); + add_noise_uv(x, y, grain); + } + + // Special case for doubly-overlapped corner + for (int x = 0; x < xstart; x++) { + // Blend the top pixel with the top left block + int top = sample_lut(grain_lut, offsets, sx, sy, 0, 1, x, y); + int old = sample_lut(grain_lut, offsets, sx, sy, 1, 1, x, y); + top = round2(old * w[sx][x][0] + top * w[sx][x][1], 5); + top = iclip(top, grain_min, grain_max); + + // Blend the current pixel with the left block + int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y); + old = sample_lut(grain_lut, offsets, sx, sy, 1, 0, x, y); + grain = round2(old * w[sx][x][0] + grain * w[sx][x][1], 5); + 
grain = iclip(grain, grain_min, grain_max); + + // Mix the row rows together and apply to image + grain = round2(top * w[sy][y][0] + grain * w[sy][y][1], 5); + grain = iclip(grain, grain_min, grain_max); + add_noise_uv(x, y, grain); + } + } + } +} + +#define fguv_ss_fn(nm, ss_x, ss_y) \ +static decl_fguv_32x32xn_fn(fguv_32x32xn_##nm##_c) { \ + fguv_32x32xn_c(dst_row, src_row, stride, data, pw, scaling, grain_lut, bh, \ + row_num, luma_row, luma_stride, uv_pl, is_id, ss_x, ss_y \ + HIGHBD_TAIL_SUFFIX); \ +} + +fguv_ss_fn(420, 1, 1); +fguv_ss_fn(422, 1, 0); +fguv_ss_fn(444, 0, 0); + +COLD void bitfn(dav1d_film_grain_dsp_init)(Dav1dFilmGrainDSPContext *const c) { + c->generate_grain_y = generate_grain_y_c; + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = generate_grain_uv_420_c; + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = generate_grain_uv_422_c; + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = generate_grain_uv_444_c; + + c->fgy_32x32xn = fgy_32x32xn_c; + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = fguv_32x32xn_420_c; + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = fguv_32x32xn_422_c; + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = fguv_32x32xn_444_c; + +#if HAVE_ASM +#if ARCH_AARCH64 || ARCH_ARM + bitfn(dav1d_film_grain_dsp_init_arm)(c); +#elif ARCH_X86 + bitfn(dav1d_film_grain_dsp_init_x86)(c); +#endif +#endif +} diff --git a/third_party/dav1d/src/internal.h b/third_party/dav1d/src/internal.h index 8edb80f21ce3e..eceda98eca4da 100644 --- a/third_party/dav1d/src/internal.h +++ b/third_party/dav1d/src/internal.h @@ -43,7 +43,7 @@ typedef struct Dav1dTask Dav1dTask; #include "src/cdf.h" #include "src/data.h" #include "src/env.h" -#include "src/film_grain.h" +#include "src/filmgrain.h" #include "src/intra_edge.h" #include "src/ipred.h" #include "src/itx.h" @@ -73,6 +73,22 @@ struct Dav1dTileGroup { int start, end; };
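Stepping back from the template just added: per pixel, the add_noise_y()/add_noise_uv() macros in filmgrain_tmpl.c reduce to noise = round2(scaling[src] * grain, data->scaling_shift), followed by clipping into the legal range. A single-pixel sketch with made-up values:

#include <stdint.h>
#include <stdio.h>

static int round2(const int x, const int shift) {
    return (x + ((1 << shift) >> 1)) >> shift;
}

static int iclip(const int v, const int mn, const int mx) {
    return v < mn ? mn : v > mx ? mx : v;
}

int main(void) {
    uint8_t scaling[256] = { 0 };
    scaling[100] = 64;                 /* made-up LUT entry: grain strength at pixel value 100 */
    const int src = 100, grain = -35;  /* made-up source pixel and grain sample */
    const int scaling_shift = 8;       /* data->scaling_shift */
    const int noise = round2(scaling[src] * grain, scaling_shift);
    const int dst = iclip(src + noise, 16, 235); /* clip_to_restricted_range bounds, 8 bpc */
    printf("noise = %d, dst = %d\n", noise, dst); /* noise = -9, dst = 91 */
    return 0;
}

The 27/17 (block overlap) and 23/22 (subsampled chroma overlap) weight pairs are combined with the same round2(..., 5); presumably they were chosen so the blend roughly preserves grain variance across block seams, since 27^2 + 17^2 and 23^2 + 22^2 are both close to 32^2 = 1024.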
+enum TaskType { + DAV1D_TASK_TYPE_INIT, + DAV1D_TASK_TYPE_INIT_CDF, + DAV1D_TASK_TYPE_TILE_ENTROPY, + DAV1D_TASK_TYPE_ENTROPY_PROGRESS, + DAV1D_TASK_TYPE_TILE_RECONSTRUCTION, + DAV1D_TASK_TYPE_DEBLOCK_COLS, + DAV1D_TASK_TYPE_DEBLOCK_ROWS, + DAV1D_TASK_TYPE_CDEF, + DAV1D_TASK_TYPE_SUPER_RESOLUTION, + DAV1D_TASK_TYPE_LOOP_RESTORATION, + DAV1D_TASK_TYPE_RECONSTRUCTION_PROGRESS, + DAV1D_TASK_TYPE_FG_PREP, + DAV1D_TASK_TYPE_FG_APPLY, +}; + struct Dav1dContext { Dav1dFrameContext *fc; unsigned n_fc; @@ -123,6 +139,24 @@ struct Dav1dContext { // See src/thread_task.c:reset_task_cur(). atomic_uint reset_task_cur; atomic_int cond_signaled; + struct { + int exec; + pthread_cond_t cond; + const Dav1dPicture *in; + Dav1dPicture *out; + enum TaskType type; + atomic_int progress[2]; /* [0]=started, [1]=completed */ + union { + struct { + ALIGN(int8_t grain_lut_8bpc[3][GRAIN_HEIGHT + 1][GRAIN_WIDTH], 16); + ALIGN(uint8_t scaling_8bpc[3][256], 64); + }; + struct { + ALIGN(int16_t grain_lut_16bpc[3][GRAIN_HEIGHT + 1][GRAIN_WIDTH], 16); + ALIGN(uint8_t scaling_16bpc[3][4096], 64); + }; + }; + } delayed_fg; int inited; } task_thread;
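A rough sense of what the anonymous union in delayed_fg buys (the ALIGN() attributes are dropped here and the structs named for a standalone sketch; only one bitdepth's scratch buffers are live for any given frame):

#include <stdint.h>
#include <stdio.h>

#define GRAIN_WIDTH  82
#define GRAIN_HEIGHT 73

union delayed_fg_scratch {
    struct {
        int8_t  grain_lut_8bpc[3][GRAIN_HEIGHT + 1][GRAIN_WIDTH];
        uint8_t scaling_8bpc[3][256];
    } d8;
    struct {
        int16_t grain_lut_16bpc[3][GRAIN_HEIGHT + 1][GRAIN_WIDTH];
        uint8_t scaling_16bpc[3][4096];
    } d16;
};

int main(void) {
    union delayed_fg_scratch u;
    /* prints roughly 18972, 48696, 48696: the union costs the larger
     * (16 bpc) footprint instead of the sum of both */
    printf("%zu %zu %zu\n", sizeof(u.d8), sizeof(u.d16), sizeof(u));
    return 0;
}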
@@ -155,6 +189,7 @@ struct Dav1dContext { int operating_point; unsigned operating_point_idc; int all_layers; + int max_spatial_id; unsigned frame_size_limit; int strict_std_compliance; int output_invisible_frames; @@ -162,26 +197,14 @@ struct Dav1dContext { int drain; enum PictureFlags frame_flags; enum Dav1dEventFlags event_flags; + Dav1dDataProps cached_error_props; + int cached_error;
Dav1dLogger logger;
Dav1dMemPool *picture_pool; };
-enum TaskType { - DAV1D_TASK_TYPE_INIT, - DAV1D_TASK_TYPE_INIT_CDF, - DAV1D_TASK_TYPE_TILE_ENTROPY, - DAV1D_TASK_TYPE_ENTROPY_PROGRESS, - DAV1D_TASK_TYPE_TILE_RECONSTRUCTION, - DAV1D_TASK_TYPE_DEBLOCK_COLS, - DAV1D_TASK_TYPE_DEBLOCK_ROWS, - DAV1D_TASK_TYPE_CDEF, - DAV1D_TASK_TYPE_SUPER_RESOLUTION, - DAV1D_TASK_TYPE_LOOP_RESTORATION, - DAV1D_TASK_TYPE_RECONSTRUCTION_PROGRESS, -}; - struct Dav1dTask { unsigned frame_idx; // frame thread id enum TaskType type; // task work diff --git a/third_party/dav1d/src/lib.c b/third_party/dav1d/src/lib.c index 6b50a536786d2..b21a735964f23 100644 --- a/third_party/dav1d/src/lib.c +++ b/third_party/dav1d/src/lib.c @@ -120,7 +120,7 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
pthread_attr_setstacksize(&thread_attr, stack_size);
- Dav1dContext *const c = *c_out = dav1d_alloc_aligned(sizeof(*c), 32); + Dav1dContext *const c = *c_out = dav1d_alloc_aligned(sizeof(*c), 64); if (!c) goto error; memset(c, 0, sizeof(*c));
@@ -134,6 +134,8 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) { c->output_invisible_frames = s->output_invisible_frames; c->inloop_filters = s->inloop_filters;
+ dav1d_data_props_set_defaults(&c->cached_error_props); + if (dav1d_mem_pool_init(&c->seq_hdr_pool) || dav1d_mem_pool_init(&c->frame_hdr_pool) || dav1d_mem_pool_init(&c->segmap_pool) || @@ -197,6 +199,11 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) { pthread_mutex_destroy(&c->task_thread.lock); goto error; } + if (pthread_cond_init(&c->task_thread.delayed_fg.cond, NULL)) { + pthread_cond_destroy(&c->task_thread.cond); + pthread_mutex_destroy(&c->task_thread.lock); + goto error; + } c->task_thread.cur = c->n_fc; atomic_init(&c->task_thread.reset_task_cur, UINT_MAX); atomic_init(&c->task_thread.cond_signaled, 0); @@ -317,7 +324,8 @@ static int output_image(Dav1dContext *const c, Dav1dPicture *const out) { int res = 0;
- Dav1dThreadPicture *const in = c->all_layers ? &c->out : &c->cache; + Dav1dThreadPicture *const in = (c->all_layers || !c->max_spatial_id) + ? &c->out : &c->cache; if (!c->apply_grain || !has_grain(&in->p)) { dav1d_picture_move_ref(out, &in->p); dav1d_thread_picture_unref(in); @@ -327,18 +335,17 @@ static int output_image(Dav1dContext *const c, Dav1dPicture *const out) res = dav1d_apply_grain(c, out, &in->p); dav1d_thread_picture_unref(in); end: - if (!c->all_layers && c->out.p.data[0]) { + if (!c->all_layers && c->max_spatial_id && c->out.p.data[0]) { dav1d_thread_picture_move_ref(in, &c->out); } return res; }
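output_image() and output_picture_ready() now consult c->max_spatial_id, which the obu.c hunk further down computes once per sequence header instead of re-deriving it for every picture: the spatial-layer bits of operating_point_idc sit above bit 8, so the highest selected spatial layer is the log2 of idc >> 8. A small sketch (the idc value is made up; ulog2() here is a stand-in for dav1d's helper):

#include <stdio.h>

static int ulog2(const unsigned v) {   /* stand-in for dav1d's ulog2() */
    return 31 - __builtin_clz(v);
}

int main(void) {
    const unsigned operating_point_idc = 0x301; /* hypothetical: spatial layers 0-1, temporal layer 0 */
    const unsigned spatial_mask = operating_point_idc >> 8;
    const int max_spatial_id = spatial_mask ? ulog2(spatial_mask) : 0;
    printf("max_spatial_id = %d\n", max_spatial_id); /* 1 */
    return 0;
}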
static int output_picture_ready(Dav1dContext *const c, const int drain) { - if (!c->all_layers) { + if (c->cached_error) return 1; + if (!c->all_layers && c->max_spatial_id) { if (c->out.p.data[0] && c->cache.p.data[0]) { - const unsigned spatial_mask = c->operating_point_idc >> 8; - const int max_spatial_id = spatial_mask ? ulog2(spatial_mask) : 0; - if (max_spatial_id == c->cache.p.frame_hdr->spatial_id || + if (c->max_spatial_id == c->cache.p.frame_hdr->spatial_id || c->out.flags & PICTURE_FLAG_NEW_TEMPORAL_UNIT) return 1; dav1d_thread_picture_unref(&c->cache); @@ -377,6 +384,13 @@ static int drain_picture(Dav1dContext *const c, Dav1dPicture *const out) { if (++c->frame_thread.next == c->n_fc) c->frame_thread.next = 0; pthread_mutex_unlock(&c->task_thread.lock); + const int error = f->task_thread.retval; + if (error) { + f->task_thread.retval = 0; + dav1d_data_props_copy(&c->cached_error_props, &out_delayed->p.m); + dav1d_thread_picture_unref(out_delayed); + return error; + } if (out_delayed->p.data[0]) { const unsigned progress = atomic_load_explicit(&out_delayed->progress[1], @@ -457,6 +471,12 @@ int dav1d_get_picture(Dav1dContext *const c, Dav1dPicture *const out) if (res < 0) return res;
+ if (c->cached_error) { + const int res = c->cached_error; + c->cached_error = 0; + return res; + } + if (output_picture_ready(c, c->n_fc == 1)) return output_image(c, out);
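With the cached_error plumbing above, dav1d_get_picture() can return an error that originally occurred on a frame thread, and the new dav1d_get_decode_error_data_props() / dav1d_data_props_unref() entry points added further down in lib.c let the caller find out which input packet triggered it. A hedged usage sketch (error handling trimmed; it assumes the application filled in Dav1dData.m when feeding packets via dav1d_send_data()):

#include <inttypes.h>
#include <stdio.h>
#include <dav1d/dav1d.h>

static void report_decode_error(Dav1dContext *const c, const int err) {
    Dav1dDataProps props = { 0 };
    if (!dav1d_get_decode_error_data_props(c, &props)) {
        fprintf(stderr, "decode error %d in packet at offset %" PRId64
                " (pts %" PRId64 ")\n", err, props.offset, props.timestamp);
        dav1d_data_props_unref(&props);
    }
}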
@@ -479,33 +499,43 @@ int dav1d_apply_grain(Dav1dContext *const c, Dav1dPicture *const out, }
int res = dav1d_picture_alloc_copy(c, out, in->p.w, in); - if (res < 0) { - dav1d_picture_unref_internal(out); - return res; - } + if (res < 0) goto error;
- switch (out->p.bpc) { + if (c->n_tc > 1) { + dav1d_task_delayed_fg(c, out, in); + } else { + switch (out->p.bpc) { #if CONFIG_8BPC - case 8: - dav1d_apply_grain_8bpc(&c->dsp[0].fg, out, in); - break; + case 8: + dav1d_apply_grain_8bpc(&c->dsp[0].fg, out, in); + break; #endif #if CONFIG_16BPC - case 10: - case 12: - dav1d_apply_grain_16bpc(&c->dsp[(out->p.bpc >> 1) - 4].fg, out, in); - break; + case 10: + case 12: + dav1d_apply_grain_16bpc(&c->dsp[(out->p.bpc >> 1) - 4].fg, out, in); + break; #endif - default: - assert(0); + default: abort(); + } }
return 0; + +error: + dav1d_picture_unref_internal(out); + return res; }
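For reference, the (out->p.bpc >> 1) - 4 expression above (also used by the delayed film-grain task) maps the bit depth onto the per-bitdepth DSP slot:

#include <stdio.h>

int main(void) {
    for (int bpc = 8; bpc <= 12; bpc += 2)
        printf("bpc %2d -> c->dsp[%d]\n", bpc, (bpc >> 1) - 4);
    /* 8 -> 0, 10 -> 1, 12 -> 2 */
    return 0;
}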
void dav1d_flush(Dav1dContext *const c) { dav1d_data_unref_internal(&c->in); + if (c->out.p.data[0]) + dav1d_thread_picture_unref(&c->out); + if (c->cache.p.data[0]) + dav1d_thread_picture_unref(&c->cache); + c->drain = 0; + c->cached_error = 0;
for (int i = 0; i < 8; i++) { if (c->refs[i].p.p.data[0]) @@ -525,6 +555,8 @@ void dav1d_flush(Dav1dContext *const c) { dav1d_ref_dec(&c->content_light_ref); dav1d_ref_dec(&c->itut_t35_ref);
+ dav1d_data_props_unref_internal(&c->cached_error_props); + if (c->n_fc == 1 && c->n_tc == 1) return; atomic_store(c->flush, 1);
@@ -556,6 +588,7 @@ void dav1d_flush(Dav1dContext *const c) { Dav1dFrameContext *const f = &c->fc[next]; dav1d_decode_frame_exit(f, -1); f->n_tile_data = 0; + f->task_thread.retval = 0; Dav1dThreadPicture *out_delayed = &c->frame_thread.out_delayed[next]; if (out_delayed->p.data[0]) { dav1d_thread_picture_unref(out_delayed); @@ -592,6 +625,7 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) { pthread_cond_destroy(&pf->task_thread.td.cond); pthread_mutex_destroy(&pf->task_thread.td.lock); } + pthread_cond_destroy(&ttd->delayed_fg.cond); pthread_cond_destroy(&ttd->cond); pthread_mutex_destroy(&ttd->lock); } @@ -631,7 +665,6 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) { dav1d_free_aligned(f->lf.lr_line_buf); } dav1d_free_aligned(c->fc); - dav1d_data_unref_internal(&c->in); if (c->n_fc > 1 && c->frame_thread.out_delayed) { for (unsigned n = 0; n < c->n_fc; n++) if (c->frame_thread.out_delayed[n].p.data[0]) @@ -674,6 +707,17 @@ int dav1d_get_event_flags(Dav1dContext *const c, enum Dav1dEventFlags *const fla return 0; }
+int dav1d_get_decode_error_data_props(Dav1dContext *const c, Dav1dDataProps *const out) { + validate_input_or_ret(c != NULL, DAV1D_ERR(EINVAL)); + validate_input_or_ret(out != NULL, DAV1D_ERR(EINVAL)); + + dav1d_data_props_unref_internal(out); + *out = c->cached_error_props; + dav1d_data_props_set_defaults(&c->cached_error_props); + + return 0; +} + void dav1d_picture_unref(Dav1dPicture *const p) { dav1d_picture_unref_internal(p); } @@ -706,3 +750,7 @@ int dav1d_data_wrap_user_data(Dav1dData *const buf, void dav1d_data_unref(Dav1dData *const buf) { dav1d_data_unref_internal(buf); } + +void dav1d_data_props_unref(Dav1dDataProps *const props) { + dav1d_data_props_unref_internal(props); +} diff --git a/third_party/dav1d/src/meson.build b/third_party/dav1d/src/meson.build index 9095f0ba97a94..b06aee6d70d84 100644 --- a/third_party/dav1d/src/meson.build +++ b/third_party/dav1d/src/meson.build @@ -58,7 +58,7 @@ libdav1d_tmpl_sources = files( 'cdef_apply_tmpl.c', 'cdef_tmpl.c', 'fg_apply_tmpl.c', - 'film_grain_tmpl.c', + 'filmgrain_tmpl.c', 'ipred_prepare_tmpl.c', 'ipred_tmpl.c', 'itx_tmpl.c', @@ -96,7 +96,7 @@ if is_asm_enabled ) libdav1d_tmpl_sources += files( 'arm/cdef_init_tmpl.c', - 'arm/film_grain_init_tmpl.c', + 'arm/filmgrain_init_tmpl.c', 'arm/ipred_init_tmpl.c', 'arm/itx_init_tmpl.c', 'arm/loopfilter_init_tmpl.c', @@ -116,7 +116,7 @@ if is_asm_enabled if dav1d_bitdepths.contains('8') libdav1d_sources_asm += files( 'arm/64/cdef.S', - 'arm/64/film_grain.S', + 'arm/64/filmgrain.S', 'arm/64/ipred.S', 'arm/64/loopfilter.S', 'arm/64/looprestoration.S', @@ -127,7 +127,7 @@ if is_asm_enabled if dav1d_bitdepths.contains('16') libdav1d_sources_asm += files( 'arm/64/cdef16.S', - 'arm/64/film_grain16.S', + 'arm/64/filmgrain16.S', 'arm/64/ipred16.S', 'arm/64/itx16.S', 'arm/64/loopfilter16.S', @@ -147,7 +147,7 @@ if is_asm_enabled if dav1d_bitdepths.contains('8') libdav1d_sources_asm += files( 'arm/32/cdef.S', - 'arm/32/film_grain.S', + 'arm/32/filmgrain.S', 'arm/32/ipred.S', 'arm/32/loopfilter.S', 'arm/32/looprestoration.S', @@ -158,7 +158,7 @@ if is_asm_enabled if dav1d_bitdepths.contains('16') libdav1d_sources_asm += files( 'arm/32/cdef16.S', - 'arm/32/film_grain16.S', + 'arm/32/filmgrain16.S', 'arm/32/ipred16.S', 'arm/32/itx16.S', 'arm/32/loopfilter16.S', @@ -183,7 +183,7 @@ if is_asm_enabled
libdav1d_tmpl_sources += files( 'x86/cdef_init_tmpl.c', - 'x86/film_grain_init_tmpl.c', + 'x86/filmgrain_init_tmpl.c', 'x86/ipred_init_tmpl.c', 'x86/itx_init_tmpl.c', 'x86/loopfilter_init_tmpl.c', @@ -206,16 +206,17 @@ if is_asm_enabled if dav1d_bitdepths.contains('8') libdav1d_sources_asm += files( 'x86/cdef_avx512.asm', + 'x86/filmgrain_avx512.asm', 'x86/ipred_avx512.asm', 'x86/itx_avx512.asm', 'x86/loopfilter_avx512.asm', 'x86/looprestoration_avx512.asm', 'x86/mc_avx512.asm', - 'x86/mc_avx2.asm', - 'x86/film_grain_avx2.asm', + 'x86/filmgrain_avx2.asm', 'x86/ipred_avx2.asm', 'x86/loopfilter_avx2.asm', - 'x86/film_grain_sse.asm', + 'x86/mc_avx2.asm', + 'x86/filmgrain_sse.asm', 'x86/ipred_sse.asm', 'x86/loopfilter_sse.asm', 'x86/looprestoration_sse.asm', @@ -225,17 +226,18 @@ if is_asm_enabled
if dav1d_bitdepths.contains('16') libdav1d_sources_asm += files( + 'x86/ipred16_avx512.asm', 'x86/looprestoration16_avx512.asm', 'x86/mc16_avx512.asm', 'x86/cdef16_avx2.asm', - 'x86/film_grain16_avx2.asm', + 'x86/filmgrain16_avx2.asm', 'x86/ipred16_avx2.asm', 'x86/itx16_avx2.asm', 'x86/loopfilter16_avx2.asm', 'x86/looprestoration16_avx2.asm', 'x86/mc16_avx2.asm', 'x86/cdef16_sse.asm', - 'x86/film_grain16_sse.asm', + 'x86/filmgrain16_sse.asm', 'x86/ipred16_sse.asm', 'x86/itx16_sse.asm', 'x86/loopfilter16_sse.asm', diff --git a/third_party/dav1d/src/obu.c b/third_party/dav1d/src/obu.c index dee6e13de913d..7df6850a8c392 100644 --- a/third_party/dav1d/src/obu.c +++ b/third_party/dav1d/src/obu.c @@ -135,15 +135,18 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb, op->initial_display_delay = dav1d_get_bits(gb, 4) + 1; } } - const int op_idx = - c->operating_point < hdr->num_operating_points ? c->operating_point : 0; - c->operating_point_idc = hdr->operating_points[op_idx].idc; #if DEBUG_SEQ_HDR printf("SEQHDR: post-operating-points: off=%u\n", dav1d_get_bits_pos(gb) - init_bit_pos); #endif }
+ const int op_idx = + c->operating_point < hdr->num_operating_points ? c->operating_point : 0; + c->operating_point_idc = hdr->operating_points[op_idx].idc; + const unsigned spatial_mask = c->operating_point_idc >> 8; + c->max_spatial_id = spatial_mask ? ulog2(spatial_mask) : 0; + hdr->width_n_bits = dav1d_get_bits(gb, 4) + 1; hdr->height_n_bits = dav1d_get_bits(gb, 4) + 1; hdr->max_width = dav1d_get_bits(gb, hdr->width_n_bits) + 1; @@ -383,7 +386,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { if (seqhdr->frame_id_numbers_present) { hdr->frame_id = dav1d_get_bits(gb, seqhdr->frame_id_n_bits); Dav1dFrameHeader *const ref_frame_hdr = c->refs[hdr->existing_frame_idx].p.p.frame_hdr; - if (!ref_frame_hdr || ref_frame_hdr->frame_id != hdr->frame_id) return DAV1D_ERR(EINVAL); + if (!ref_frame_hdr || ref_frame_hdr->frame_id != hdr->frame_id) goto error; } return 0; } @@ -767,7 +770,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { // segmentation data from the reference frame. assert(hdr->primary_ref_frame != DAV1D_PRIMARY_REF_NONE); const int pri_ref = hdr->refidx[hdr->primary_ref_frame]; - if (!c->refs[pri_ref].p.p.frame_hdr) return DAV1D_ERR(EINVAL); + if (!c->refs[pri_ref].p.p.frame_hdr) goto error; hdr->segmentation.seg_data = c->refs[pri_ref].p.p.frame_hdr->segmentation.seg_data; } @@ -829,7 +832,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { hdr->loopfilter.mode_ref_deltas = default_mode_ref_deltas; } else { const int ref = hdr->refidx[hdr->primary_ref_frame]; - if (!c->refs[ref].p.p.frame_hdr) return DAV1D_ERR(EINVAL); + if (!c->refs[ref].p.p.frame_hdr) goto error; hdr->loopfilter.mode_ref_deltas = c->refs[ref].p.p.frame_hdr->loopfilter.mode_ref_deltas; } @@ -932,7 +935,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { int off_after = -1; int off_before_idx, off_after_idx; for (int i = 0; i < 7; i++) { - if (!c->refs[hdr->refidx[i]].p.p.data[0]) return DAV1D_ERR(EINVAL); + if (!c->refs[hdr->refidx[i]].p.p.data[0]) goto error; const unsigned refpoc = c->refs[hdr->refidx[i]].p.p.frame_hdr->frame_offset;
const int diff = get_poc_diff(seqhdr->order_hint_n_bits, refpoc, poc); @@ -960,7 +963,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { unsigned off_before2 = 0xFFFFFFFFU; int off_before2_idx; for (int i = 0; i < 7; i++) { - if (!c->refs[hdr->refidx[i]].p.p.data[0]) return DAV1D_ERR(EINVAL); + if (!c->refs[hdr->refidx[i]].p.p.data[0]) goto error; const unsigned refpoc = c->refs[hdr->refidx[i]].p.p.frame_hdr->frame_offset; if (get_poc_diff(seqhdr->order_hint_n_bits, refpoc, off_before) < 0) { @@ -1015,7 +1018,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { ref_gmv = &dav1d_default_wm_params; } else { const int pri_ref = hdr->refidx[hdr->primary_ref_frame]; - if (!c->refs[pri_ref].p.p.frame_hdr) return DAV1D_ERR(EINVAL); + if (!c->refs[pri_ref].p.p.frame_hdr) goto error; ref_gmv = &c->refs[pri_ref].p.p.frame_hdr->gmv[i]; } int32_t *const mat = hdr->gmv[i].matrix; @@ -1245,11 +1248,11 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa memset(seq_hdr, 0, sizeof(*seq_hdr)); if ((res = parse_seq_hdr(c, &gb, seq_hdr)) < 0) { dav1d_ref_dec(&ref); - return res; + goto error; } if (check_for_overrun(c, &gb, init_bit_pos, len)) { dav1d_ref_dec(&ref); - return DAV1D_ERR(EINVAL); + goto error; } // If we have read a sequence header which is different from // the old one, this is a new video sequence and can't use any @@ -1307,7 +1310,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa c->frame_hdr->spatial_id = spatial_id; if ((res = parse_frame_hdr(c, &gb)) < 0) { c->frame_hdr = NULL; - return res; + goto error; } for (int n = 0; n < c->n_tile_data; n++) dav1d_data_unref_internal(&c->tile[n].data); @@ -1319,7 +1322,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa dav1d_get_bits(&gb, 1); if (check_for_overrun(c, &gb, init_bit_pos, len)) { c->frame_hdr = NULL; - return DAV1D_ERR(EINVAL); + goto error; } }
@@ -1360,7 +1363,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa // Align to the next byte boundary and check for overrun. dav1d_bytealign_get_bits(&gb); if (check_for_overrun(c, &gb, init_bit_pos, len)) - return DAV1D_ERR(EINVAL); + goto error; // The current bit position is a multiple of 8 (because we // just aligned it) and less than 8*pkt_bytelen because // otherwise the overrun check would have fired. @@ -1547,7 +1550,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
if (c->seq_hdr && c->frame_hdr) { if (c->frame_hdr->show_existing_frame) { - if (!c->refs[c->frame_hdr->existing_frame_idx].p.p.data[0]) return DAV1D_ERR(EINVAL); + if (!c->refs[c->frame_hdr->existing_frame_idx].p.p.data[0]) goto error; if (c->n_fc == 1) { dav1d_thread_picture_ref(&c->out, &c->refs[c->frame_hdr->existing_frame_idx].p); @@ -1574,7 +1577,13 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa if (c->task_thread.cur && c->task_thread.cur < c->n_fc) c->task_thread.cur--; } - if (out_delayed->p.data[0]) { + const int error = f->task_thread.retval; + if (error) { + c->cached_error = error; + f->task_thread.retval = 0; + dav1d_data_props_copy(&c->cached_error_props, &out_delayed->p.m); + dav1d_thread_picture_unref(out_delayed); + } else if (out_delayed->p.data[0]) { const unsigned progress = atomic_load_explicit(&out_delayed->progress[1], memory_order_relaxed); if ((out_delayed->visible || c->output_invisible_frames) && @@ -1613,7 +1622,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa c->frame_hdr = NULL; } else if (c->n_tiles == c->frame_hdr->tiling.cols * c->frame_hdr->tiling.rows) { if (!c->n_tile_data) - return DAV1D_ERR(EINVAL); + goto error; if ((res = dav1d_submit_frame(c)) < 0) return res; assert(!c->n_tile_data); @@ -1625,6 +1634,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa return len + init_byte_pos;
error: + dav1d_data_props_copy(&c->cached_error_props, &in->m); dav1d_log(c, "Error parsing OBU data\n"); return DAV1D_ERR(EINVAL); } diff --git a/third_party/dav1d/src/picture.c b/third_party/dav1d/src/picture.c index 461c9d0522b37..bebc4dd9c17b1 100644 --- a/third_party/dav1d/src/picture.c +++ b/third_party/dav1d/src/picture.c @@ -283,6 +283,7 @@ void dav1d_picture_unref_internal(Dav1dPicture *const p) { dav1d_ref_dec(&p->itut_t35_ref); } memset(p, 0, sizeof(*p)); + dav1d_data_props_set_defaults(&p->m); }
void dav1d_thread_picture_unref(Dav1dThreadPicture *const p) { diff --git a/third_party/dav1d/src/tables.c b/third_party/dav1d/src/tables.c index 92b8c4b5a8dcc..9752f15c40df2 100644 --- a/third_party/dav1d/src/tables.c +++ b/third_party/dav1d/src/tables.c @@ -756,7 +756,7 @@ const uint16_t dav1d_dr_intra_derivative[44] = { [1*idx+32] = f4, [1*idx+40] = f5, \ [1*idx+48] = f6 #endif -const int8_t ALIGN(dav1d_filter_intra_taps[5][64], 16) = { +const int8_t ALIGN(dav1d_filter_intra_taps[5][64], 64) = { { F( 0, -6, 10, 0, 0, 0, 12, 0 ), F( 1, -5, 2, 10, 0, 0, 9, 0 ), diff --git a/third_party/dav1d/src/thread_task.c b/third_party/dav1d/src/thread_task.c index bcda3182894a7..53aa41e5c8ad7 100644 --- a/third_party/dav1d/src/thread_task.c +++ b/third_party/dav1d/src/thread_task.c @@ -30,6 +30,7 @@ #include "common/frame.h"
#include "src/thread_task.h" +#include "src/fg_apply.h"
// This function resets the cur pointer to the first frame theoretically // executable after a task completed (ie. each time we update some progress or @@ -281,6 +282,22 @@ void dav1d_task_frame_init(Dav1dFrameContext *const f) { insert_task(f, t, 1); }
+void dav1d_task_delayed_fg(Dav1dContext *const c, Dav1dPicture *const out, + const Dav1dPicture *const in) +{ + struct TaskThreadData *const ttd = &c->task_thread; + ttd->delayed_fg.in = in; + ttd->delayed_fg.out = out; + ttd->delayed_fg.type = DAV1D_TASK_TYPE_FG_PREP; + atomic_init(&ttd->delayed_fg.progress[0], 0); + atomic_init(&ttd->delayed_fg.progress[1], 0); + pthread_mutex_lock(&ttd->lock); + ttd->delayed_fg.exec = 1; + pthread_cond_signal(&ttd->cond); + pthread_cond_wait(&ttd->delayed_fg.cond, &ttd->lock); + pthread_mutex_unlock(&ttd->lock); +} + static inline int ensure_progress(struct TaskThreadData *const ttd, Dav1dFrameContext *const f, Dav1dTask *const t, const enum TaskType type, @@ -352,18 +369,104 @@ static inline int check_tile(Dav1dTask *const t, Dav1dFrameContext *const f, return 0; }
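dav1d_task_delayed_fg() above publishes the film-grain job under the pool lock, wakes a worker, and then blocks on the dedicated delayed_fg.cond until the pool reports completion. A minimal sketch of that hand-off pattern with one worker (all names below are made up; the sketch uses a predicate loop where dav1d relies on a single signal):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t pool_cond = PTHREAD_COND_INITIALIZER; /* wakes workers */
static pthread_cond_t done_cond = PTHREAD_COND_INITIALIZER; /* wakes the caller */
static int job_pending, job_done;

static void *worker(void *arg) {
    (void)arg;
    pthread_mutex_lock(&lock);
    while (!job_pending)
        pthread_cond_wait(&pool_cond, &lock);
    job_pending = 0;
    pthread_mutex_unlock(&lock);

    puts("worker: applying film grain");    /* stands in for FG_PREP/FG_APPLY */

    pthread_mutex_lock(&lock);
    job_done = 1;
    pthread_cond_signal(&done_cond);
    pthread_mutex_unlock(&lock);
    return NULL;
}

int main(void) {
    pthread_t t;
    pthread_create(&t, NULL, worker, NULL);

    pthread_mutex_lock(&lock);
    job_pending = 1;                         /* ttd->delayed_fg.exec = 1 */
    pthread_cond_signal(&pool_cond);
    while (!job_done)                        /* wait as on delayed_fg.cond */
        pthread_cond_wait(&done_cond, &lock);
    pthread_mutex_unlock(&lock);

    pthread_join(t, NULL);
    return 0;
}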
-static inline void abort_frame(Dav1dFrameContext *const f) { - atomic_store(&f->task_thread.error, 1); +static inline void abort_frame(Dav1dFrameContext *const f, const int error) { + atomic_store(&f->task_thread.error, error == DAV1D_ERR(EINVAL) ? 1 : -1); f->task_thread.task_counter = 0; f->task_thread.done[0] = 1; f->task_thread.done[1] = 1; atomic_store(&f->sr_cur.progress[0], FRAME_ERROR); atomic_store(&f->sr_cur.progress[1], FRAME_ERROR); - dav1d_decode_frame_exit(f, -1); + dav1d_decode_frame_exit(f, error); f->n_tile_data = 0; pthread_cond_signal(&f->task_thread.cond); }
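abort_frame() now records why a frame failed: the per-frame atomic error flag stores 1 for a bitstream error and -1 for anything else, and the worker widens it back to an errno-style return when the frame exits (see the dav1d_decode_frame_exit() call sites below). A compressed sketch of that round-trip (DAV1D_ERR() is defined here to mirror dav1d's negative-errno convention):

#include <errno.h>
#include <stdio.h>

#define DAV1D_ERR(e) (-(e)) /* mirrors dav1d's negative-errno convention */

static int flag_from_error(const int error) {  /* as in abort_frame() */
    return error == DAV1D_ERR(EINVAL) ? 1 : -1;
}

static int retval_from_flag(const int flag) {  /* as at frame exit */
    return flag == 1 ? DAV1D_ERR(EINVAL) : flag ? DAV1D_ERR(ENOMEM) : 0;
}

int main(void) {
    printf("%d %d\n",
           retval_from_flag(flag_from_error(DAV1D_ERR(EINVAL))),  /* -EINVAL */
           retval_from_flag(flag_from_error(DAV1D_ERR(ENOMEM)))); /* -ENOMEM */
    return 0;
}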
+static inline void delayed_fg_task(const Dav1dContext *const c, + struct TaskThreadData *const ttd) +{ + const Dav1dPicture *const in = ttd->delayed_fg.in; + Dav1dPicture *const out = ttd->delayed_fg.out; +#if CONFIG_16BPC + int off; + if (out->p.bpc != 8) + off = (out->p.bpc >> 1) - 4; +#endif + switch (ttd->delayed_fg.type) { + case DAV1D_TASK_TYPE_FG_PREP: + ttd->delayed_fg.exec = 0; + if (atomic_load(&ttd->cond_signaled)) + pthread_cond_signal(&ttd->cond); + pthread_mutex_unlock(&ttd->lock); + switch (out->p.bpc) { +#if CONFIG_8BPC + case 8: + dav1d_prep_grain_8bpc(&c->dsp[0].fg, out, in, + ttd->delayed_fg.scaling_8bpc, + ttd->delayed_fg.grain_lut_8bpc); + break; +#endif +#if CONFIG_16BPC + case 10: + case 12: + dav1d_prep_grain_16bpc(&c->dsp[off].fg, out, in, + ttd->delayed_fg.scaling_16bpc, + ttd->delayed_fg.grain_lut_16bpc); + break; +#endif + default: abort(); + } + ttd->delayed_fg.type = DAV1D_TASK_TYPE_FG_APPLY; + pthread_mutex_lock(&ttd->lock); + ttd->delayed_fg.exec = 1; + // fall-through + case DAV1D_TASK_TYPE_FG_APPLY:; + int row = atomic_fetch_add(&ttd->delayed_fg.progress[0], 1); + pthread_mutex_unlock(&ttd->lock); + int progmax = (out->p.h + 31) >> 5; + fg_apply_loop: + if (row + 1 < progmax) + pthread_cond_signal(&ttd->cond); + else if (row + 1 >= progmax) { + pthread_mutex_lock(&ttd->lock); + ttd->delayed_fg.exec = 0; + if (row >= progmax) goto end_add; + pthread_mutex_unlock(&ttd->lock); + } + switch (out->p.bpc) { +#if CONFIG_8BPC + case 8: + dav1d_apply_grain_row_8bpc(&c->dsp[0].fg, out, in, + ttd->delayed_fg.scaling_8bpc, + ttd->delayed_fg.grain_lut_8bpc, row); + break; +#endif +#if CONFIG_16BPC + case 10: + case 12: + dav1d_apply_grain_row_16bpc(&c->dsp[off].fg, out, in, + ttd->delayed_fg.scaling_16bpc, + ttd->delayed_fg.grain_lut_16bpc, row); + break; +#endif + default: abort(); + } + row = atomic_fetch_add(&ttd->delayed_fg.progress[0], 1); + int done = atomic_fetch_add(&ttd->delayed_fg.progress[1], 1) + 1; + if (row < progmax) goto fg_apply_loop; + pthread_mutex_lock(&ttd->lock); + ttd->delayed_fg.exec = 0; + end_add: + done = atomic_fetch_add(&ttd->delayed_fg.progress[1], 1) + 1; + progmax = atomic_load(&ttd->delayed_fg.progress[0]); + // signal for completion only once the last runner reaches this + if (done < progmax) + break; + pthread_cond_signal(&ttd->delayed_fg.cond); + break; + default: abort(); + } +} + void *dav1d_worker_task(void *data) { Dav1dTaskContext *const tc = data; const Dav1dContext *const c = tc->c; @@ -373,11 +476,15 @@ void *dav1d_worker_task(void *data) {
pthread_mutex_lock(&ttd->lock); for (;;) { - Dav1dFrameContext *f; - Dav1dTask *t, *prev_t = NULL; if (tc->task_thread.die) break; if (atomic_load(c->flush)) goto park; - if (c->n_fc > 1) { // run init tasks first + if (ttd->delayed_fg.exec) { // run delayed film grain first + delayed_fg_task(c, ttd); + continue; + } + Dav1dFrameContext *f; + Dav1dTask *t, *prev_t = NULL; + if (c->n_fc > 1) { // run init tasks second for (unsigned i = 0; i < c->n_fc; i++) { const unsigned first = atomic_load(&ttd->first); f = &c->fc[(first + i) % c->n_fc]; @@ -395,7 +502,7 @@ void *dav1d_worker_task(void *data) { } } } - while (ttd->cur < c->n_fc) { + while (ttd->cur < c->n_fc) { // run decoding tasks last const unsigned first = atomic_load(&ttd->first); f = &c->fc[(first + ttd->cur) % c->n_fc]; prev_t = f->task_thread.task_cur_prev; @@ -497,7 +604,7 @@ void *dav1d_worker_task(void *data) { int p1 = f->in_cdf.progress ? atomic_load(f->in_cdf.progress) : 1; if (res || p1 == TILE_ERROR) { pthread_mutex_lock(&ttd->lock); - abort_frame(f); + abort_frame(f, res ? res : DAV1D_ERR(EINVAL)); } else if (!res) { t->type = DAV1D_TASK_TYPE_INIT_CDF; if (p1) goto found_unlocked; @@ -509,7 +616,7 @@ void *dav1d_worker_task(void *data) { } case DAV1D_TASK_TYPE_INIT_CDF: { assert(c->n_fc > 1); - int res = -1; + int res = DAV1D_ERR(EINVAL); if (!atomic_load(&f->task_thread.error)) res = dav1d_decode_frame_init_cdf(f); pthread_mutex_lock(&ttd->lock); @@ -523,19 +630,19 @@ void *dav1d_worker_task(void *data) { if (res) { // memory allocation failed f->task_thread.done[2 - p] = 1; - atomic_store(&f->task_thread.error, 1); + atomic_store(&f->task_thread.error, -1); f->task_thread.task_counter -= f->sbh + f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows; atomic_store(&f->sr_cur.progress[p - 1], FRAME_ERROR); if (p == 2 && f->task_thread.done[1]) { assert(!f->task_thread.task_counter); - dav1d_decode_frame_exit(f, -1); + dav1d_decode_frame_exit(f, DAV1D_ERR(ENOMEM)); f->n_tile_data = 0; pthread_cond_signal(&f->task_thread.cond); } } } - } else abort_frame(f); + } else abort_frame(f, res); reset_task_cur(c, ttd, t->frame_idx); f->task_thread.init_done = 1; continue; @@ -588,7 +695,8 @@ void *dav1d_worker_task(void *data) { if (!--f->task_thread.task_counter && f->task_thread.done[0] && (!uses_2pass || f->task_thread.done[1])) { - dav1d_decode_frame_exit(f, error ? -1 : 0); + dav1d_decode_frame_exit(f, error == 1 ? DAV1D_ERR(EINVAL) : + error ? DAV1D_ERR(ENOMEM) : 0); f->n_tile_data = 0; pthread_cond_signal(&f->task_thread.cond); } @@ -703,7 +811,8 @@ void *dav1d_worker_task(void *data) { if (!--f->task_thread.task_counter && f->task_thread.done[0] && (!uses_2pass || f->task_thread.done[1])) { - dav1d_decode_frame_exit(f, error ? -1 : 0); + dav1d_decode_frame_exit(f, error == 1 ? DAV1D_ERR(EINVAL) : + error ? DAV1D_ERR(ENOMEM) : 0); f->n_tile_data = 0; pthread_cond_signal(&f->task_thread.cond); } diff --git a/third_party/dav1d/src/thread_task.h b/third_party/dav1d/src/thread_task.h index 0ff2228bed7c5..257da1a470c70 100644 --- a/third_party/dav1d/src/thread_task.h +++ b/third_party/dav1d/src/thread_task.h @@ -39,6 +39,8 @@ int dav1d_task_create_tile_sbrow(Dav1dFrameContext *f, int pass, int cond_signal); void dav1d_task_frame_init(Dav1dFrameContext *f);
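The FG_APPLY branch of delayed_fg_task() above lets every idle worker claim 32-pixel-high stripes by atomically incrementing delayed_fg.progress[0] ("started") and counting completions in progress[1] ("completed"), so the last finisher knows to signal the waiting caller. A minimal sketch of that claiming scheme (counts and names are made up):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define N_ROWS 8   /* (out->p.h + 31) >> 5 in the real code */

static atomic_int started, completed;

static void *worker(void *arg) {
    (void)arg;
    for (;;) {
        const int row = atomic_fetch_add(&started, 1);
        if (row >= N_ROWS) break;              /* nothing left to claim */
        printf("applying grain to stripe %d\n", row);
        atomic_fetch_add(&completed, 1);
    }
    return NULL;
}

int main(void) {
    pthread_t t[3];
    for (int i = 0; i < 3; i++) pthread_create(&t[i], NULL, worker, NULL);
    for (int i = 0; i < 3; i++) pthread_join(t[i], NULL);
    printf("done: %d/%d stripes\n", atomic_load(&completed), N_ROWS);
    return 0;
}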
+void dav1d_task_delayed_fg(Dav1dContext *c, Dav1dPicture *out, const Dav1dPicture *in); + void *dav1d_worker_task(void *data);
int dav1d_decode_frame_init(Dav1dFrameContext *f); diff --git a/third_party/dav1d/src/x86/cdef16_avx2.asm b/third_party/dav1d/src/x86/cdef16_avx2.asm index 9e2c7b361e0b1..4c8d3bca4377b 100644 --- a/third_party/dav1d/src/x86/cdef16_avx2.asm +++ b/third_party/dav1d/src/x86/cdef16_avx2.asm @@ -59,14 +59,6 @@ cextern cdef_dir_8bpc_avx2.main
SECTION .text
-%macro REPX 2-* - %xdefine %%f(x) %1 -%rep %0 - 1 - %rotate 1 - %%f(%1) -%endrep -%endmacro - %macro CDEF_FILTER 2 ; w, h DEFINE_ARGS dst, stride, _, dir, pridmp, pri, sec, tmp movifnidn prid, r5m diff --git a/third_party/dav1d/src/x86/cdef16_sse.asm b/third_party/dav1d/src/x86/cdef16_sse.asm index 03736b422c0b9..1bd67ace64d09 100644 --- a/third_party/dav1d/src/x86/cdef16_sse.asm +++ b/third_party/dav1d/src/x86/cdef16_sse.asm @@ -64,14 +64,6 @@ cextern shufw_6543210x
SECTION .text
-%macro REPX 2-* - %xdefine %%f(x) %1 -%rep %0 - 1 - %rotate 1 - %%f(%1) -%endrep -%endmacro - %if ARCH_X86_32 DECLARE_REG_TMP 5, 3 %elif WIN64 diff --git a/third_party/dav1d/src/x86/film_grain16_avx2.asm b/third_party/dav1d/src/x86/filmgrain16_avx2.asm similarity index 98% rename from third_party/dav1d/src/x86/film_grain16_avx2.asm rename to third_party/dav1d/src/x86/filmgrain16_avx2.asm index 1ece90b68433b..5c2e3868a349a 100644 --- a/third_party/dav1d/src/x86/film_grain16_avx2.asm +++ b/third_party/dav1d/src/x86/filmgrain16_avx2.asm @@ -25,6 +25,7 @@
%include "config.asm" %include "ext/x86/x86inc.asm" +%include "x86/filmgrain_common.asm"
%if ARCH_X86_64
@@ -81,38 +82,8 @@ JMP_TABLE generate_grain_uv_420_16bpc_avx2, 0, 1, 2, 3 JMP_TABLE generate_grain_uv_422_16bpc_avx2, 0, 1, 2, 3 JMP_TABLE generate_grain_uv_444_16bpc_avx2, 0, 1, 2, 3
-struc FGData - .seed: resd 1 - .num_y_points: resd 1 - .y_points: resb 14 * 2 - .chroma_scaling_from_luma: resd 1 - .num_uv_points: resd 2 - .uv_points: resb 2 * 10 * 2 - .scaling_shift: resd 1 - .ar_coeff_lag: resd 1 - .ar_coeffs_y: resb 24 - .ar_coeffs_uv: resb 2 * 28 ; includes padding - .ar_coeff_shift: resq 1 - .grain_scale_shift: resd 1 - .uv_mult: resd 2 - .uv_luma_mult: resd 2 - .uv_offset: resd 2 - .overlap_flag: resd 1 - .clip_to_restricted_range: resd 1 -endstruc - -cextern gaussian_sequence - SECTION .text
-%macro REPX 2-* - %xdefine %%f(x) %1 -%rep %0 - 1 - %rotate 1 - %%f(%1) -%endrep -%endmacro - %define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
INIT_YMM avx2 diff --git a/third_party/dav1d/src/x86/film_grain16_sse.asm b/third_party/dav1d/src/x86/filmgrain16_sse.asm similarity index 99% rename from third_party/dav1d/src/x86/film_grain16_sse.asm rename to third_party/dav1d/src/x86/filmgrain16_sse.asm index 3f86e7d9a5907..6b0daaac0ba35 100644 --- a/third_party/dav1d/src/x86/film_grain16_sse.asm +++ b/third_party/dav1d/src/x86/filmgrain16_sse.asm @@ -25,6 +25,7 @@
%include "config.asm" %include "ext/x86/x86inc.asm" +%include "x86/filmgrain_common.asm"
SECTION_RODATA 16 pd_16: times 4 dd 16 @@ -66,38 +67,8 @@ JMP_TABLE generate_grain_uv_420_16bpc_ssse3, 0, 1, 2, 3 JMP_TABLE generate_grain_uv_422_16bpc_ssse3, 0, 1, 2, 3 JMP_TABLE generate_grain_uv_444_16bpc_ssse3, 0, 1, 2, 3
-struc FGData - .seed: resd 1 - .num_y_points: resd 1 - .y_points: resb 14 * 2 - .chroma_scaling_from_luma: resd 1 - .num_uv_points: resd 2 - .uv_points: resb 2 * 10 * 2 - .scaling_shift: resd 1 - .ar_coeff_lag: resd 1 - .ar_coeffs_y: resb 24 - .ar_coeffs_uv: resb 2 * 28 ; includes padding - .ar_coeff_shift: resq 1 - .grain_scale_shift: resd 1 - .uv_mult: resd 2 - .uv_luma_mult: resd 2 - .uv_offset: resd 2 - .overlap_flag: resd 1 - .clip_to_restricted_range: resd 1 -endstruc - -cextern gaussian_sequence - SECTION .text
-%macro REPX 2-* - %xdefine %%f(x) %1 -%rep %0 - 1 - %rotate 1 - %%f(%1) -%endrep -%endmacro - %if ARCH_X86_32 %undef base %define PIC_ptr(a) base+a diff --git a/third_party/dav1d/src/x86/film_grain_avx2.asm b/third_party/dav1d/src/x86/filmgrain_avx2.asm similarity index 98% rename from third_party/dav1d/src/x86/film_grain_avx2.asm rename to third_party/dav1d/src/x86/filmgrain_avx2.asm index dda43a9baa579..7da8105dfacd0 100644 --- a/third_party/dav1d/src/x86/film_grain_avx2.asm +++ b/third_party/dav1d/src/x86/filmgrain_avx2.asm @@ -25,6 +25,7 @@
%include "config.asm" %include "ext/x86/x86inc.asm" +%include "x86/filmgrain_common.asm"
%if ARCH_X86_64
@@ -74,38 +75,8 @@ JMP_TABLE generate_grain_uv_420, avx2, 0, 1, 2, 3 JMP_TABLE generate_grain_uv_422, avx2, 0, 1, 2, 3 JMP_TABLE generate_grain_uv_444, avx2, 0, 1, 2, 3
-struc FGData - .seed: resd 1 - .num_y_points: resd 1 - .y_points: resb 14 * 2 - .chroma_scaling_from_luma: resd 1 - .num_uv_points: resd 2 - .uv_points: resb 2 * 10 * 2 - .scaling_shift: resd 1 - .ar_coeff_lag: resd 1 - .ar_coeffs_y: resb 24 - .ar_coeffs_uv: resb 2 * 28 ; includes padding - .ar_coeff_shift: resq 1 - .grain_scale_shift: resd 1 - .uv_mult: resd 2 - .uv_luma_mult: resd 2 - .uv_offset: resd 2 - .overlap_flag: resd 1 - .clip_to_restricted_range: resd 1 -endstruc - -cextern gaussian_sequence - SECTION .text
-%macro REPX 2-* - %xdefine %%f(x) %1 -%rep %0 - 1 - %rotate 1 - %%f(%1) -%endrep -%endmacro - INIT_YMM avx2 cglobal generate_grain_y_8bpc, 2, 9, 8, buf, fg_data %define base r4-generate_grain_y_8bpc_avx2_table @@ -1097,9 +1068,6 @@ cglobal fgy_32x32xn_8bpc, 6, 13, 15, dst, src, stride, fg_data, w, scaling, \ jne .loop_x_hv_overlap jmp .loop_x_h_overlap
-.end: - RET - .vertical_overlap: DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ unused, sby, see, overlap @@ -1214,7 +1182,7 @@ cglobal fgy_32x32xn_8bpc, 6, 13, 15, dst, src, stride, fg_data, w, scaling, \ jmp .loop_y .end_y_v_overlap: add wq, 32 - jge .end_hv + jge .end lea srcq, [src_bakq+wq]
; since fg_dataq.overlap is guaranteed to be set, we never jump @@ -1334,7 +1302,7 @@ cglobal fgy_32x32xn_8bpc, 6, 13, 15, dst, src, stride, fg_data, w, scaling, \ add wq, 32 lea srcq, [src_bakq+wq] jl .loop_x_hv_overlap -.end_hv: +.end: RET
%macro FGUV_FN 3 ; name, ss_hor, ss_ver @@ -1691,9 +1659,6 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, jne %%loop_x_hv_overlap jmp %%loop_x_h_overlap
-%%end: - RET - %%vertical_overlap: DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \ sby, see, overlap, unused1, unused2, lstride @@ -1887,7 +1852,7 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
%%end_y_v_overlap: add wq, 32>>%2 - jge %%end_hv + jge %%end mov srcq, r11mp mov dstq, r12mp lea lumaq, [r14+wq*(1+%2)] @@ -2116,15 +2081,14 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
%%end_y_hv_overlap: add wq, 32>>%2 - jge %%end_hv + jge %%end mov srcq, r11mp mov dstq, r12mp lea lumaq, [r14+wq*(1+%2)] add srcq, wq add dstq, wq jmp %%loop_x_hv_overlap - -%%end_hv: +%%end: RET %endmacro
diff --git a/third_party/dav1d/src/x86/filmgrain_avx512.asm b/third_party/dav1d/src/x86/filmgrain_avx512.asm new file mode 100644 index 0000000000000..6d277464a5976 --- /dev/null +++ b/third_party/dav1d/src/x86/filmgrain_avx512.asm @@ -0,0 +1,1079 @@ +; Copyright © 2022, VideoLAN and dav1d authors +; Copyright © 2022, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" +%include "x86/filmgrain_common.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 64 + +pb_even: db 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62 + db 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94 + db 96, 98,100,102,104,106,108,110,112,114,116,118,120,122,124,126 +pb_odd: db 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 + db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63 + db 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95 + db 97, 99,101,103,105,107,109,111,113,115,117,119,121,123,125,127 +interleave_hl: db 8, 0, 9, 1, 10, 2, 11, 3, 12, 4, 13, 5, 14, 6, 15, 7 +pb_27_17_17_27: db 27, 17, 17, 27, 0, 32, 0, 32 +pb_23_22_0_32: db 23, 22, 0, 32, 0, 32, 0, 32 +pb_27_17: times 2 db 27, 17 +pb_23_22: times 2 db 23, 22 +pw_8: times 2 dw 8 +pw_1024: times 2 dw 1024 +pb_17_27: times 2 db 17, 27 +fg_max: times 4 db 255 + times 4 db 240 + times 4 db 235 +fg_min: times 4 db 0 + times 4 db 16 +noise_rnd: times 2 dw 128 + times 2 dw 64 + times 2 dw 32 + times 2 dw 16 + +SECTION .text + +INIT_ZMM avx512icl +cglobal fgy_32x32xn_8bpc, 6, 13, 22, dst, src, stride, fg_data, w, scaling, \ + grain_lut, h, sby, see, overlap +%define base r11-fg_min + lea r11, [fg_min] + mov r6d, [fg_dataq+FGData.scaling_shift] + mov r7d, [fg_dataq+FGData.clip_to_restricted_range] + mov sbyd, sbym + mov overlapd, [fg_dataq+FGData.overlap_flag] + mov r12, 0x0000000f0000000f ; h_overlap mask + mova m0, [scalingq+64*0] + mova m1, [scalingq+64*1] + mova m2, [scalingq+64*2] + mova m3, [scalingq+64*3] + kmovq k1, r12 + vbroadcasti32x4 m4, [base+interleave_hl] + vpbroadcastd ym16, [base+pb_27_17] + vpbroadcastd m12, [base+pb_17_27] + vpbroadcastd m6, [base+noise_rnd+r6*4-32] + test sbyd, sbyd + setnz r6b + vpbroadcastd m7, 
[base+fg_min+r7*4] + vpbroadcastd m8, [base+fg_max+r7*8] + pxor m5, m5 + vpbroadcastd m9, [base+pw_1024] + vpbroadcastq m10, [base+pb_27_17_17_27] + vmovdqa64 m12{k1}, m16 + test r6b, overlapb + jnz .v_overlap + + imul seed, sbyd, (173 << 24) | 37 + add seed, (105 << 24) | 178 + rorx seed, seed, 24 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] + + DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \ + h, sby, see, overlap + + lea src_bakq, [srcq+wq] + neg wq + sub dstq, srcq +.loop_x: + rorx r6, seeq, 1 + or seed, 0xeff4 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + rorx offyd, seed, 8 + rorx offxq, seeq, 12 + and offyd, 0xf + imul offyd, 164 + lea offxd, [offyq+offxq*2+829] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \ + h, sby, see, overlap + + mov grain_lutq, grain_lutmp + mov hd, hm +.loop_y: + mova ym18, [srcq+strideq*0] + vinserti32x8 m18, [srcq+strideq*1], 1 + movu ym21, [grain_lutq+offxyq-82] + vinserti32x8 m21, [grain_lutq+offxyq+ 0], 1 + mova m19, m0 + vpmovb2m k2, m18 + punpcklbw m16, m18, m5 + punpckhbw m17, m18, m5 + vpermt2b m19, m18, m1 ; scaling[ 0..127] + vpermi2b m18, m2, m3 ; scaling[128..255] + punpcklbw m20, m5, m21 ; grain + punpckhbw m21, m5 + vmovdqu8 m19{k2}, m18 ; scaling[src] + pshufb m19, m4 + pmaddubsw m18, m19, m20 + pmaddubsw m19, m21 + add grain_lutq, 82*2 + pmulhrsw m18, m6 ; noise + pmulhrsw m19, m6 + paddw m16, m18 + paddw m17, m19 + packuswb m16, m17 + pmaxub m16, m7 + pminub m16, m8 + mova [dstq+srcq], ym16 + add srcq, strideq + vextracti32x8 [dstq+srcq], m16, 1 + add srcq, strideq + sub hb, 2 + jg .loop_y + add wq, 32 + jge .end + lea srcq, [src_bakq+wq] + test overlapd, overlapd + jz .loop_x + test sbyd, sbyd + jnz .hv_overlap + +.loop_x_h_overlap: + rorx r6, seeq, 1 + or seed, 0xeff4 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \ + h, sby, see, left_offxy + + rorx offyd, seed, 8 + mov left_offxyd, offxd ; previous column's offy*stride + rorx offxq, seeq, 12 + and offyd, 0xf + imul offyd, 164 + lea offxd, [offyq+offxq*2+829] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \ + h, sby, see, left_offxy + + mov grain_lutq, grain_lutmp + mov hd, hm +.loop_y_h_overlap: + movu ym20, [grain_lutq+offxyq-82] + vinserti32x8 m20, [grain_lutq+offxyq+ 0], 1 + movd xm21, [grain_lutq+left_offxyq-50] + vinserti32x4 m21, [grain_lutq+left_offxyq+32], 2 + mova ym18, [srcq+strideq*0] + vinserti32x8 m18, [srcq+strideq*1], 1 + mova m19, m0 + punpcklbw m21, m20 + vpmovb2m k2, m18 + punpcklbw m16, m18, m5 + punpckhbw m17, m18, m5 + pmaddubsw m21, m10, m21 + vpermt2b m19, m18, m1 + vpermi2b m18, m2, m3 + pmulhrsw m21, m9 + vmovdqu8 m19{k2}, m18 ; scaling[src] + punpckhbw m18, m20, m5 + pshufb m19, m4 + packsswb m20{k1}, m21, m21 + punpcklbw m20, m5, m20 ; grain + pmaddubsw m18, m19, m18 + pmaddubsw m19, m20 + add grain_lutq, 82*2 + pmulhrsw m18, m6 ; noise + pmulhrsw m19, m6 + paddw m17, m18 + paddw m16, m19 + packuswb m16, m17 + pmaxub m16, m7 + pminub m16, m8 + mova [dstq+srcq], ym16 + add srcq, strideq + vextracti32x8 [dstq+srcq], m16, 1 + add srcq, strideq + sub hb, 2 + jg .loop_y_h_overlap + add wq, 32 + jge .end + lea srcq, [src_bakq+wq] + test sbyd, sbyd + jnz .hv_overlap + jmp .loop_x_h_overlap + +.v_overlap: + DEFINE_ARGS dst, src, stride, fg_data, w, offy, offx, \ + h, sby, see, overlap + + movzx r6d, sbyb + imul seed, [fg_dataq+FGData.seed], 0x00010001 + imul r7d, 
r6d, 173 * 0x00010001 + imul r6d, 37 * 0x01000100 + add r7d, (105 << 16) | 188 + add r6d, (178 << 24) | (141 << 8) + and r7d, 0x00ff00ff + and r6d, 0xff00ff00 + xor seed, r7d + xor seed, r6d ; (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \ + h, sby, see, overlap + + lea src_bakq, [srcq+wq] + neg wq + sub dstq, srcq + + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offxd, [offyq+offxq*2+0x10001*829+32*82] + + DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \ + h, sby, see, overlap, top_offxy + + mov grain_lutq, grain_lutmp + mov hd, hm + movzx top_offxyd, offxyw + shr offxyd, 16 + movu ym16, [grain_lutq+offxyq-82] + vinserti32x8 m16, [grain_lutq+offxyq+ 0], 1 + movu ym21, [grain_lutq+top_offxyq-82] + vinserti32x8 m21, [grain_lutq+top_offxyq+ 0], 1 + mova ym18, [srcq+strideq*0] + vinserti32x8 m18, [srcq+strideq*1], 1 + mova m19, m0 + punpcklbw m20, m21, m16 + punpckhbw m21, m16 + vpmovb2m k2, m18 + pmaddubsw m20, m12, m20 + pmaddubsw m21, m12, m21 + punpcklbw m16, m18, m5 + punpckhbw m17, m18, m5 + vpermt2b m19, m18, m1 + vpermi2b m18, m2, m3 + pmulhrsw m20, m9 + pmulhrsw m21, m9 + vmovdqu8 m19{k2}, m18 ; scaling[src] + pshufb m19, m4 + packsswb m20, m21 + punpcklbw m18, m5, m20 ; grain + punpckhbw m20, m5 + pmaddubsw m18, m19, m18 + pmaddubsw m19, m20 + add grain_lutq, 82*2 + pmulhrsw m18, m6 ; noise + pmulhrsw m19, m6 + paddw m16, m18 + paddw m17, m19 + packuswb m16, m17 + pmaxub m16, m7 + pminub m16, m8 + mova [dstq+srcq], ym16 + add srcq, strideq + vextracti32x8 [dstq+srcq], m16, 1 + add srcq, strideq + sub hb, 2 + jg .loop_y + add wq, 32 + jge .end + lea srcq, [src_bakq+wq] + + ; since fg_dataq.overlap is guaranteed to be set, we never jump back + ; to .v_overlap, and instead always fall-through to h+v overlap +.hv_overlap: + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \ + h, sby, see, left_offxy, top_offxy, topleft_offxy + + mov topleft_offxyd, top_offxyd + rorx offyd, seed, 8 + mov left_offxyd, offxd + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offxd, [offyq+offxq*2+0x10001*829+32*82] + + DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \ + h, sby, see, left_offxy, top_offxy, topleft_offxy + + mov grain_lutq, grain_lutmp + mov hd, hm + movzx top_offxyd, offxyw + shr offxyd, 16 + movu ym19, [grain_lutq+offxyq-82] + vinserti32x8 m19, [grain_lutq+offxyq+ 0], 1 + movd xm16, [grain_lutq+left_offxyq-50] + vinserti32x4 m16, [grain_lutq+left_offxyq+32], 2 + movu ym21, [grain_lutq+top_offxyq-82] + vinserti32x8 m21, [grain_lutq+top_offxyq+ 0], 1 + movd xm17, [grain_lutq+topleft_offxyq-50] + vinserti32x4 m17, [grain_lutq+topleft_offxyq+32], 2 + mova ym18, 
[srcq+strideq*0] + vinserti32x8 m18, [srcq+strideq*1], 1 + ; do h interpolation first (so top | top/left -> top, left | cur -> cur) + punpcklbw m16, m19 + punpcklbw m17, m21 + pmaddubsw m16, m10, m16 + pmaddubsw m17, m10, m17 + punpckhbw m20, m21, m19 + vpmovb2m k2, m18 + pmulhrsw m16, m9 + pmulhrsw m17, m9 + packsswb m19{k1}, m16, m16 + packsswb m21{k1}, m17, m17 + ; followed by v interpolation (top | cur -> cur) + punpcklbw m21, m19 + mova m19, m0 + pmaddubsw m20, m12, m20 + pmaddubsw m21, m12, m21 + punpcklbw m16, m18, m5 + punpckhbw m17, m18, m5 + vpermt2b m19, m18, m1 ; scaling[ 0..127] + vpermi2b m18, m2, m3 ; scaling[128..255] + pmulhrsw m20, m9 + pmulhrsw m21, m9 + vmovdqu8 m19{k2}, m18 ; scaling[src] + pshufb m19, m4 + packsswb m21, m20 + punpcklbw m20, m5, m21 + punpckhbw m21, m5 + pmaddubsw m18, m19, m20 + pmaddubsw m19, m21 + add grain_lutq, 82*2 + pmulhrsw m18, m6 ; noise + pmulhrsw m19, m6 + paddw m16, m18 + paddw m17, m19 + packuswb m16, m17 + pmaxub m16, m7 + pminub m16, m8 + mova [dstq+srcq], ym16 + add srcq, strideq + vextracti32x8 [dstq+srcq], m16, 1 + add srcq, strideq + sub hb, 2 + jg .loop_y_h_overlap + add wq, 32 + lea srcq, [src_bakq+wq] + jl .hv_overlap +.end: + RET + +%macro FGUV_FN 3 ; name, ss_hor, ss_ver +cglobal fguv_32x32xn_i%1_8bpc, 6, 14+%2, 22, dst, src, stride, fg_data, w, \ + scaling, grain_lut, h, sby, luma, \ + overlap, uv_pl, is_id, _, stride3 + lea r11, [fg_min] + mov r6d, [fg_dataq+FGData.scaling_shift] + mov r7d, [fg_dataq+FGData.clip_to_restricted_range] + mov r9d, is_idm + mov sbyd, sbym + mov overlapd, [fg_dataq+FGData.overlap_flag] +%if %2 + mov r12, 0x000f000f000f000f ; h_overlap mask + vpbroadcastq m10, [base+pb_23_22_0_32] + lea stride3q, [strideq*3] +%else + mov r12, 0x0000000f0000000f + vpbroadcastq m10, [base+pb_27_17_17_27] +%endif + mova m0, [scalingq+64*0] + mova m1, [scalingq+64*1] + mova m2, [scalingq+64*2] + mova m3, [scalingq+64*3] + kmovq k1, r12 + vbroadcasti32x4 m4, [base+interleave_hl] + vpbroadcastd m6, [base+noise_rnd+r6*4-32] + vpbroadcastd m7, [base+fg_min+r7*4] + shlx r7d, r7d, r9d + vpbroadcastd m8, [base+fg_max+r7*4] + test sbyd, sbyd + setnz r7b + vpbroadcastd m9, [base+pw_1024] + mova m11, [base+pb_even] + mova m12, [base+pb_odd] + pxor m5, m5 + mov r5, r10mp ; lstride + cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 + jne .csfl + +%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver + DEFINE_ARGS dst, src, stride, fg_data, w, lstride, grain_lut, \ + h, sby, see, overlap, uv_pl, _, _, stride3 +%if %1 + mov r6d, uv_plm + vpbroadcastd m16, [base+pw_8] + vbroadcasti32x4 m14, [fg_dataq+FGData.uv_mult+r6*4] + vpbroadcastw m15, [fg_dataq+FGData.uv_offset+r6*4] + pshufb m14, m16 ; uv_luma_mult, uv_mult +%endif + test r7b, overlapb + jnz %%v_overlap + + imul seed, sbyd, (173 << 24) | 37 + add seed, (105 << 24) | 178 + rorx seed, seed, 24 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] + + DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \ + offx, offy, see, overlap, _, _, _, stride3 + + mov lumaq, r9mp + lea r11, [srcq+wq] + lea r12, [dstq+wq] + lea r13, [lumaq+wq*(1+%2)] + mov r11mp, r11 + mov r12mp, r12 + neg wq + +%%loop_x: + rorx r6, seeq, 1 + or seed, 0xeff4 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + rorx offyd, seed, 8 + rorx offxq, seeq, 12 + and offyd, 0xf + imul offyd, 164>>%3 + lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \ + h, offxy, see, overlap, _, _, _, 
stride3 + + mov grain_lutq, grain_lutmp + mov hd, hm +%%loop_y: + mova ym18, [lumaq+lstrideq*(0<<%3)] + vinserti32x8 m18, [lumaq+lstrideq*(1<<%3)], 1 + lea lumaq, [lumaq+lstrideq*(2<<%3)] +%if %2 + mova ym20, [lumaq+lstrideq*(0<<%3)] + vinserti32x8 m20, [lumaq+lstrideq*(1<<%3)], 1 + lea lumaq, [lumaq+lstrideq*(2<<%3)] + mova xm17, [srcq+strideq*0] + movu xm21, [grain_lutq+offxyq+82*0] + vinserti128 ym17, [srcq+strideq*1], 1 + vinserti128 ym21, [grain_lutq+offxyq+82*1], 1 + mova m19, m11 + vpermi2b m19, m18, m20 + vpermt2b m18, m12, m20 + vinserti32x4 m17, [srcq+strideq*2], 2 + vinserti32x4 m21, [grain_lutq+offxyq+82*2], 2 + pavgb m18, m19 + vinserti32x4 m17, [srcq+stride3q ], 3 + vinserti32x4 m21, [grain_lutq+offxyq+82*3], 3 +%else + mova ym17, [srcq+strideq*0] + vinserti32x8 m17, [srcq+strideq*1], 1 + movu ym21, [grain_lutq+offxyq+82*0] + vinserti32x8 m21, [grain_lutq+offxyq+82*1], 1 +%endif + lea srcq, [srcq+strideq*(2<<%2)] +%if %1 + punpckhbw m19, m18, m17 + punpcklbw m18, m17 ; { luma, chroma } + pmaddubsw m19, m14 + pmaddubsw m18, m14 + psraw m19, 6 + psraw m18, 6 + paddw m19, m15 + paddw m18, m15 + packuswb m18, m19 +%endif + mova m19, m0 + vpmovb2m k2, m18 + vpermt2b m19, m18, m1 ; scaling[ 0..127] + vpermi2b m18, m2, m3 ; scaling[128..255] + punpcklbw m20, m5, m21 ; grain + punpckhbw m21, m5 + vmovdqu8 m19{k2}, m18 ; scaling[src] + pshufb m19, m4 + pmaddubsw m18, m19, m20 + pmaddubsw m19, m21 + add grain_lutq, 82*2<<%2 + pmulhrsw m18, m6 ; noise + pmulhrsw m19, m6 + punpcklbw m16, m17, m5 ; chroma + punpckhbw m17, m5 + paddw m16, m18 + paddw m17, m19 + packuswb m16, m17 + pmaxub m16, m7 + pminub m16, m8 +%if %2 + mova [dstq+strideq*0], xm16 + vextracti128 [dstq+strideq*1], ym16, 1 + vextracti32x4 [dstq+strideq*2], m16, 2 + vextracti32x4 [dstq+stride3q ], m16, 3 +%else + mova [dstq+strideq*0], ym16 + vextracti32x8 [dstq+strideq*1], m16, 1 +%endif + lea dstq, [dstq+strideq*(2<<%2)] + sub hb, 2<<%2 + jg %%loop_y + add wq, 32>>%2 + jge %%end + mov srcq, r11mp + mov dstq, r12mp + lea lumaq, [r13+wq*(1<<%2)] + add srcq, wq + add dstq, wq + test overlapd, overlapd + jz %%loop_x + cmp dword r8m, 0 ; sby + jne %%hv_overlap + + ; horizontal overlap (without vertical overlap) +%%loop_x_h_overlap: + rorx r6, seeq, 1 + or seed, 0xeff4 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \ + offx, offy, see, left_offxy, _, _, _, stride3 + + lea left_offxyd, [offyq+(32>>%2)] ; previous column's offy*stride+offx + rorx offyd, seed, 8 + rorx offxq, seeq, 12 + and offyd, 0xf + imul offyd, 164>>%3 + lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \ + h, offxy, see, left_offxy, _, _, _, stride3 + + mov grain_lutq, grain_lutmp + mov hd, hm +%%loop_y_h_overlap: + ; src +%if %2 + mova ym18, [lumaq+lstrideq*(0<<%3)] + vinserti32x8 m18, [lumaq+lstrideq*(1<<%3)], 1 + lea lumaq, [lumaq+lstrideq*(2<<%3)] + mova ym20, [lumaq+lstrideq*(0<<%3)] + vinserti32x8 m20, [lumaq+lstrideq*(1<<%3)], 1 + lea lumaq, [lumaq+lstrideq*(2<<%3)] + mova xm17, [srcq+strideq*0] + vinserti128 ym17, [srcq+strideq*1], 1 + mova m19, m11 + vpermi2b m19, m18, m20 + vpermt2b m18, m12, m20 + vinserti32x4 m17, [srcq+strideq*2], 2 + pavgb m18, m19 + vinserti32x4 m17, [srcq+stride3q ], 3 +%else + mova ym18, [lumaq+lstrideq*0] + vinserti32x8 m18, [lumaq+lstrideq*1], 1 + mova ym17, [srcq+strideq*0] + vinserti32x8 m17, [srcq+strideq*1], 1 + lea lumaq, [lumaq+lstrideq*2] 
+%endif + lea srcq, [srcq+strideq*(2<<%2)] +%if %1 + punpckhbw m19, m18, m17 + punpcklbw m18, m17 ; { luma, chroma } + pmaddubsw m19, m14 + pmaddubsw m18, m14 + psraw m19, 6 + psraw m18, 6 + paddw m19, m15 + paddw m18, m15 + packuswb m18, m19 +%endif + mova m19, m0 + vpmovb2m k2, m18 + vpermt2b m19, m18, m1 ; scaling[ 0..127] + vpermi2b m18, m2, m3 ; scaling[128..255] + vmovdqu8 m19{k2}, m18 ; scaling[src] +%if %2 + movu xm20, [grain_lutq+offxyq +82*0] + movd xm18, [grain_lutq+left_offxyq+82*0] + vinserti32x4 ym20, [grain_lutq+offxyq +82*1], 1 + vinserti32x4 ym18, [grain_lutq+left_offxyq+82*1], 1 + vinserti32x4 m20, [grain_lutq+offxyq +82*2], 2 + vinserti32x4 m18, [grain_lutq+left_offxyq+82*2], 2 + vinserti32x4 m20, [grain_lutq+offxyq +82*3], 3 + vinserti32x4 m18, [grain_lutq+left_offxyq+82*3], 3 +%else + movu ym20, [grain_lutq+offxyq + 0] + movd xm18, [grain_lutq+left_offxyq+ 0] + vinserti32x8 m20, [grain_lutq+offxyq +82], 1 + vinserti32x4 m18, [grain_lutq+left_offxyq+82], 2 +%endif + punpcklbw m18, m20 + pmaddubsw m18, m10, m18 + punpckhbw m21, m20, m5 + pshufb m19, m4 + pmulhrsw m18, m9 + vpacksswb m20{k1}, m18, m18 + punpcklbw m20, m5, m20 + pmaddubsw m18, m19, m20 + pmaddubsw m19, m21 + add grain_lutq, 82*2<<%2 + pmulhrsw m18, m6 ; noise + pmulhrsw m19, m6 + punpcklbw m16, m17, m5 ; chroma + punpckhbw m17, m5 + paddw m16, m18 + paddw m17, m19 + packuswb m16, m17 + pmaxub m16, m7 + pminub m16, m8 +%if %2 + mova [dstq+strideq*0], xm16 + vextracti128 [dstq+strideq*1], ym16, 1 + vextracti32x4 [dstq+strideq*2], m16, 2 + vextracti32x4 [dstq+stride3q ], m16, 3 +%else + mova [dstq+strideq*0], ym16 + vextracti32x8 [dstq+strideq*1], m16, 1 +%endif + lea dstq, [dstq+strideq*(2<<%2)] + sub hb, 2<<%2 + jg %%loop_y_h_overlap + add wq, 32>>%2 + jge %%end + mov srcq, r11mp + mov dstq, r12mp + lea lumaq, [r13+wq*(1<<%2)] + add srcq, wq + add dstq, wq + cmp dword r8m, 0 ; sby + jne %%hv_overlap + jmp %%loop_x_h_overlap + +%%v_overlap: + DEFINE_ARGS dst, src, stride, fg_data, w, lstride, grain_lut, \ + _, sby, see, overlap, _, _, _, stride3 + + movzx sbyd, sbyb + imul seed, [fg_dataq+FGData.seed], 0x00010001 + imul r7d, sbyd, 173 * 0x00010001 + imul sbyd, 37 * 0x01000100 + add r7d, (105 << 16) | 188 + add sbyd, (178 << 24) | (141 << 8) + and r7d, 0x00ff00ff + and sbyd, 0xff00ff00 + xor seed, r7d + xor seed, sbyd ; (cur_seed << 16) | top_seed + +%if %3 + vpbroadcastd m13, [base+pb_23_22] + kxnorw k3, k3, k3 ; v_overlap mask +%elif %2 + vbroadcasti32x8 m13, [base+pb_27_17] + kxnord k3, k3, k3 + pshufd m13, m13, q0000 ; 8x27_17, 8x17_27 +%else + vpbroadcastd ym16, [base+pb_27_17] + vpbroadcastd m13, [base+pb_17_27] + vmovdqa64 m13{k1}, m16 +%endif + + DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \ + offx, offy, see, overlap, top_offxy, _, _, stride3 + + mov lumaq, r9mp + lea r11, [srcq+wq] + lea r12, [dstq+wq] + lea r13, [lumaq+wq*(1<<%2)] + mov r11mp, r11 + mov r12mp, r12 + neg wq + + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0x000f000f + and offxd, 0x000f000f + imul offyd, 164>>%3 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] + + DEFINE_ARGS dst, src, stride, 
luma, w, lstride, grain_lut, \ + h, offxy, see, overlap, top_offxy, _, _, stride3 + + mov grain_lutq, grain_lutmp + mov hd, hm + movzx top_offxyd, offxyw + shr offxyd, 16 + +%if %2 + mova ym18, [lumaq+lstrideq*(0<<%3)] + vinserti32x8 m18, [lumaq+lstrideq*(1<<%3)], 1 + lea lumaq, [lumaq+lstrideq*(2<<%3)] + mova ym20, [lumaq+lstrideq*(0<<%3)] + vinserti32x8 m20, [lumaq+lstrideq*(1<<%3)], 1 + lea lumaq, [lumaq+lstrideq*(2<<%3)] + mova xm17, [srcq+strideq*0] + vinserti128 ym17, [srcq+strideq*1], 1 + mova m19, m11 + vpermi2b m19, m18, m20 + vpermt2b m18, m12, m20 + vinserti32x4 m17, [srcq+strideq*2], 2 + pavgb m18, m19 + vinserti32x4 m17, [srcq+stride3q ], 3 +%else + mova ym18, [lumaq+lstrideq*0] + vinserti32x8 m18, [lumaq+lstrideq*1], 1 + mova ym17, [srcq+strideq*0] + vinserti32x8 m17, [srcq+strideq*1], 1 + lea lumaq, [lumaq+lstrideq*2] +%endif + lea srcq, [srcq+strideq*(2<<%2)] +%if %1 + punpckhbw m19, m18, m17 + punpcklbw m18, m17 ; { luma, chroma } + pmaddubsw m19, m14 + pmaddubsw m18, m14 + psraw m19, 6 + psraw m18, 6 + paddw m19, m15 + paddw m18, m15 + packuswb m18, m19 +%endif + mova m19, m0 + vpmovb2m k2, m18 + vpermt2b m19, m18, m1 ; scaling[ 0..127] + vpermi2b m18, m2, m3 ; scaling[128..255] +%if %3 + movu xm21, [grain_lutq+offxyq+82*0] + movu xm16, [grain_lutq+top_offxyq+82*0] + punpcklbw xm20, xm16, xm21 + punpckhbw xm16, xm21 + pmaddubsw xm20, xm13, xm20 + pmaddubsw xm16, xm13, xm16 + ; only interpolate first line, insert remaining line unmodified + vbroadcasti128 ym21, [grain_lutq+offxyq+82*1] + vinserti32x4 m21, [grain_lutq+offxyq+82*2], 2 + vinserti32x4 m21, [grain_lutq+offxyq+82*3], 3 + pmulhrsw xm20, xm9 + pmulhrsw xm16, xm9 + vpacksswb m21{k3}, m20, m16 +%elif %2 + movu xm21, [grain_lutq+offxyq+82*0] + vinserti128 ym21, [grain_lutq+offxyq+82*1], 1 + movu xm16, [grain_lutq+top_offxyq+82*0] + vinserti32x4 ym16, [grain_lutq+top_offxyq+82*1], 1 + punpcklbw ym20, ym16, ym21 + punpckhbw ym16, ym21 + pmaddubsw ym20, ym13, ym20 + pmaddubsw ym16, ym13, ym16 + vbroadcasti32x4 m21, [grain_lutq+offxyq+82*2] + vinserti32x4 m21, [grain_lutq+offxyq+82*3], 3 + pmulhrsw ym20, ym9 + pmulhrsw ym16, ym9 + packsswb m21{k3}, m20, m16 +%else + movu ym16, [grain_lutq+offxyq+82*0] + vinserti32x8 m16, [grain_lutq+offxyq+82*1], 1 + movu ym20, [grain_lutq+top_offxyq+82*0] + vinserti32x8 m20, [grain_lutq+top_offxyq+82*1], 1 + punpcklbw m21, m20, m16 + punpckhbw m20, m16 + pmaddubsw m21, m13, m21 + pmaddubsw m20, m13, m20 + pmulhrsw m21, m9 + pmulhrsw m20, m9 + packsswb m21, m20 +%endif + vmovdqu8 m19{k2}, m18 ; scaling[src] + pshufb m19, m4 + punpcklbw m20, m5, m21 + punpckhbw m21, m5 + pmaddubsw m18, m19, m20 + pmaddubsw m19, m21 + add grain_lutq, 82*2<<%2 + pmulhrsw m18, m6 ; noise + pmulhrsw m19, m6 + punpcklbw m16, m17, m5 ; chroma + punpckhbw m17, m5 + paddw m16, m18 + paddw m17, m19 + packuswb m16, m17 + pmaxub m16, m7 + pminub m16, m8 +%if %2 + mova [dstq+strideq*0], xm16 + vextracti128 [dstq+strideq*1], ym16, 1 + vextracti32x4 [dstq+strideq*2], m16, 2 + vextracti32x4 [dstq+stride3q ], m16, 3 +%else + mova [dstq+strideq*0], ym16 + vextracti32x8 [dstq+strideq*1], m16, 1 +%endif + lea dstq, [dstq+strideq*(2<<%2)] + sub hb, 2<<%2 + jg %%loop_y + add wq, 32>>%2 + jge %%end + mov srcq, r11mp + mov dstq, r12mp + lea lumaq, [r13+wq*(1<<%2)] + add srcq, wq + add dstq, wq + +%%hv_overlap: + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; 
parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \ + offx, offy, see, left_offxy, top_offxy, topleft_offxy, _, stride3 + + lea topleft_offxyd, [top_offxyq+(32>>%2)] + lea left_offxyd, [offyq+(32>>%2)] + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0x000f000f + and offxd, 0x000f000f + imul offyd, 164>>%3 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] + + DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \ + h, offxy, see, left_offxy, top_offxy, topleft_offxy, _, stride3 + + mov grain_lutq, grain_lutmp + mov hd, hm + movzx top_offxyd, offxyw + shr offxyd, 16 + +%if %2 + movu xm21, [grain_lutq+offxyq+82*0] + movd xm16, [grain_lutq+left_offxyq+82*0] + vinserti128 ym21, [grain_lutq+offxyq+82*1], 1 + vinserti128 ym16, [grain_lutq+left_offxyq+82*1], 1 + vinserti32x4 m21, [grain_lutq+offxyq+82*2], 2 + vinserti32x4 m16, [grain_lutq+left_offxyq+82*2], 2 + vinserti32x4 m21, [grain_lutq+offxyq+82*3], 3 + vinserti32x4 m16, [grain_lutq+left_offxyq+82*3], 3 + movd xm18, [grain_lutq+topleft_offxyq+82*0] + movu xm20, [grain_lutq+top_offxyq] + ; do h interpolation first (so top | top/left -> top, left | cur -> cur) + punpcklbw m16, m21 +%if %3 + punpcklbw xm18, xm20 +%else + vinserti128 ym18, [grain_lutq+topleft_offxyq+82*1], 1 + vinserti128 ym20, [grain_lutq+top_offxyq+82*1], 1 + punpcklbw ym18, ym20 +%endif + punpcklqdq m16, m18 + pmaddubsw m16, m10, m16 + pmulhrsw m16, m9 + packsswb m16, m16 +%if %3 + vpalignr xm20{k1}, xm16, xm16, 4 +%else + vpalignr ym20{k1}, ym16, ym16, 4 +%endif + vmovdqu8 m21{k1}, m16 +%else + movu ym21, [grain_lutq+offxyq+82*0] + vinserti32x8 m21, [grain_lutq+offxyq+82*1], 1 + movd xm16, [grain_lutq+left_offxyq+82*0] + vinserti32x4 m16, [grain_lutq+left_offxyq+82*1], 2 + movu ym20, [grain_lutq+top_offxyq+82*0] + vinserti32x8 m20, [grain_lutq+top_offxyq+82*1], 1 + movd xm18, [grain_lutq+topleft_offxyq+82*0] + vinserti32x4 m18, [grain_lutq+topleft_offxyq+82*1], 2 + punpcklbw m16, m21 + punpcklbw m18, m20 + punpcklqdq m16, m18 + pmaddubsw m16, m10, m16 + pmulhrsw m16, m9 + packsswb m16, m16 + vpalignr m20{k1}, m16, m16, 4 + vmovdqu8 m21{k1}, m16 +%endif +%if %2 + mova ym18, [lumaq+lstrideq*(0<<%3)] + vinserti32x8 m18, [lumaq+lstrideq*(1<<%3)], 1 + lea lumaq, [lumaq+lstrideq*(2<<%3)] + mova ym16, [lumaq+lstrideq*(0<<%3)] + vinserti32x8 m16, [lumaq+lstrideq*(1<<%3)], 1 + lea lumaq, [lumaq+lstrideq*(2<<%3)] + mova xm17, [srcq+strideq*0] + vinserti128 ym17, [srcq+strideq*1], 1 + mova m19, m11 + vpermi2b m19, m18, m16 + vpermt2b m18, m12, m16 + vinserti32x4 m17, [srcq+strideq*2], 2 + pavgb m18, m19 + vinserti32x4 m17, [srcq+stride3q ], 3 +%else + mova ym18, [lumaq+lstrideq*0] + vinserti32x8 m18, [lumaq+lstrideq*1], 1 + mova ym17, [srcq+strideq*0] + vinserti32x8 m17, [srcq+strideq*1], 1 + lea lumaq, [lumaq+lstrideq*2] +%endif + lea srcq, [srcq+strideq*(2<<%2)] +%if %1 + punpckhbw m19, m18, m17 + punpcklbw m18, m17 ; { luma, chroma } + pmaddubsw m19, m14 + pmaddubsw m18, m14 + psraw m19, 6 + psraw m18, 6 + paddw m19, m15 + paddw m18, m15 + packuswb m18, m19 +%endif + mova m19, m0 + vpmovb2m k2, m18 + vpermt2b m19, m18, m1 ; scaling[ 0..127] + vpermi2b m18, m2, m3 ; scaling[128..255] + ; followed by v interpolation (top | cur -> cur) +%if %3 + punpcklbw xm16, xm20, xm21 + punpckhbw xm20, xm21 + pmaddubsw xm16, xm13, xm16 + pmaddubsw xm20, xm13, xm20 + 
pmulhrsw xm16, xm9 + pmulhrsw xm20, xm9 + vpacksswb m21{k3}, m16, m20 +%elif %2 + punpcklbw ym16, ym20, ym21 + punpckhbw ym20, ym21 + pmaddubsw ym16, ym13, ym16 + pmaddubsw ym20, ym13, ym20 + pmulhrsw ym16, ym9 + pmulhrsw ym20, ym9 + vpacksswb m21{k3}, m16, m20 +%else + punpcklbw m16, m20, m21 + punpckhbw m20, m21 + pmaddubsw m16, m13, m16 + pmaddubsw m20, m13, m20 + pmulhrsw m16, m9 + pmulhrsw m20, m9 + packsswb m21, m16, m20 +%endif + vmovdqu8 m19{k2}, m18 ; scaling[src] + pshufb m19, m4 + punpcklbw m20, m5, m21 + punpckhbw m21, m5 + pmaddubsw m18, m19, m20 + pmaddubsw m19, m21 + add grain_lutq, 82*2<<%2 + pmulhrsw m18, m6 ; grain + pmulhrsw m19, m6 + punpcklbw m16, m17, m5 ; chroma + punpckhbw m17, m5 + paddw m16, m18 + paddw m17, m19 + packuswb m16, m17 + pmaxub m16, m7 + pminub m16, m8 +%if %2 + mova [dstq+strideq*0], xm16 + vextracti128 [dstq+strideq*1], ym16, 1 + vextracti32x4 [dstq+strideq*2], m16, 2 + vextracti32x4 [dstq+stride3q ], m16, 3 +%else + mova [dstq+strideq*0], ym16 + vextracti32x8 [dstq+strideq*1], m16, 1 +%endif + lea dstq, [dstq+strideq*(2<<%2)] + sub hb, 2<<%2 + jg %%loop_y_h_overlap + add wq, 32>>%2 + jge %%end + mov srcq, r11mp + mov dstq, r12mp + lea lumaq, [r13+wq*(1<<%2)] + add srcq, wq + add dstq, wq + jmp %%hv_overlap +%%end: + RET +%endmacro + + %%FGUV_32x32xN_LOOP 1, %2, %3 +.csfl: + %%FGUV_32x32xN_LOOP 0, %2, %3 +%endmacro + +FGUV_FN 420, 1, 1 +FGUV_FN 422, 1, 0 +FGUV_FN 444, 0, 0 + +%endif ; ARCH_X86_64 diff --git a/third_party/dav1d/src/x86/filmgrain_common.asm b/third_party/dav1d/src/x86/filmgrain_common.asm new file mode 100644 index 0000000000000..74f7044e666cb --- /dev/null +++ b/third_party/dav1d/src/x86/filmgrain_common.asm @@ -0,0 +1,46 @@ +; Copyright © 2019-2022, VideoLAN and dav1d authors +; Copyright © 2019-2022, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +struc FGData + .seed: resd 1 + .num_y_points: resd 1 + .y_points: resb 14 * 2 + .chroma_scaling_from_luma: resd 1 + .num_uv_points: resd 2 + .uv_points: resb 2 * 10 * 2 + .scaling_shift: resd 1 + .ar_coeff_lag: resd 1 + .ar_coeffs_y: resb 24 + .ar_coeffs_uv: resb 2 * 28 ; includes padding + .ar_coeff_shift: resq 1 + .grain_scale_shift: resd 1 + .uv_mult: resd 2 + .uv_luma_mult: resd 2 + .uv_offset: resd 2 + .overlap_flag: resd 1 + .clip_to_restricted_range: resd 1 +endstruc + +cextern gaussian_sequence diff --git a/third_party/dav1d/src/x86/film_grain_init_tmpl.c b/third_party/dav1d/src/x86/filmgrain_init_tmpl.c similarity index 67% rename from third_party/dav1d/src/x86/film_grain_init_tmpl.c rename to third_party/dav1d/src/x86/filmgrain_init_tmpl.c index 606ea3cb56cff..0b783d10d3c45 100644 --- a/third_party/dav1d/src/x86/film_grain_init_tmpl.c +++ b/third_party/dav1d/src/x86/filmgrain_init_tmpl.c @@ -1,6 +1,6 @@ /* - * Copyright © 2018-2021, VideoLAN and dav1d authors - * Copyright © 2018, Two Orioles, LLC + * Copyright © 2018-2022, VideoLAN and dav1d authors + * Copyright © 2018-2022, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -26,25 +26,21 @@ */
#include "src/cpu.h" -#include "src/film_grain.h" +#include "src/filmgrain.h"
-decl_generate_grain_y_fn(BF(dav1d_generate_grain_y, ssse3)); -decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_420, ssse3)); -decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_422, ssse3)); -decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_444, ssse3)); -decl_fgy_32x32xn_fn(BF(dav1d_fgy_32x32xn, ssse3)); -decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i420, ssse3)); -decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i422, ssse3)); -decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i444, ssse3)); +#define decl_fg_fns(ext) \ +decl_generate_grain_y_fn(BF(dav1d_generate_grain_y, ext)); \ +decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_420, ext)); \ +decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_422, ext)); \ +decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_444, ext)); \ +decl_fgy_32x32xn_fn(BF(dav1d_fgy_32x32xn, ext)); \ +decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i420, ext)); \ +decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i422, ext)); \ +decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i444, ext))
-decl_generate_grain_y_fn(BF(dav1d_generate_grain_y, avx2)); -decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_420, avx2)); -decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_422, avx2)); -decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_444, avx2)); -decl_fgy_32x32xn_fn(BF(dav1d_fgy_32x32xn, avx2)); -decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i420, avx2)); -decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i422, avx2)); -decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i444, avx2)); +decl_fg_fns(ssse3); +decl_fg_fns(avx2); +decl_fg_fns(avx512icl);
COLD void bitfn(dav1d_film_grain_dsp_init_x86)(Dav1dFilmGrainDSPContext *const c) { const unsigned flags = dav1d_get_cpu_flags(); @@ -68,11 +64,20 @@ COLD void bitfn(dav1d_film_grain_dsp_init_x86)(Dav1dFilmGrainDSPContext *const c c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, avx2); c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, avx2);
- if (flags & DAV1D_X86_CPU_FLAG_SLOW_GATHER) return; + if (!(flags & DAV1D_X86_CPU_FLAG_SLOW_GATHER)) { + c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, avx2); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, avx2); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, avx2); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, avx2); + }
- c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, avx2); - c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, avx2); - c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, avx2); - c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, avx2); +#if BITDEPTH == 8 + if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return; + + c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, avx512icl); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, avx512icl); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, avx512icl); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, avx512icl); +#endif #endif } diff --git a/third_party/dav1d/src/x86/film_grain_sse.asm b/third_party/dav1d/src/x86/filmgrain_sse.asm similarity index 99% rename from third_party/dav1d/src/x86/film_grain_sse.asm rename to third_party/dav1d/src/x86/filmgrain_sse.asm index 20334591a93db..0172f987607a7 100644 --- a/third_party/dav1d/src/x86/film_grain_sse.asm +++ b/third_party/dav1d/src/x86/filmgrain_sse.asm @@ -25,6 +25,7 @@
%include "config.asm" %include "ext/x86/x86inc.asm" +%include "x86/filmgrain_common.asm"
SECTION_RODATA
@@ -66,38 +67,8 @@ JMP_TABLE generate_grain_uv_420, ssse3, 0, 1, 2, 3 JMP_TABLE generate_grain_uv_422, ssse3, 0, 1, 2, 3 JMP_TABLE generate_grain_uv_444, ssse3, 0, 1, 2, 3
-struc FGData - .seed: resd 1 - .num_y_points: resd 1 - .y_points: resb 14 * 2 - .chroma_scaling_from_luma: resd 1 - .num_uv_points: resd 2 - .uv_points: resb 2 * 10 * 2 - .scaling_shift: resd 1 - .ar_coeff_lag: resd 1 - .ar_coeffs_y: resb 24 - .ar_coeffs_uv: resb 2 * 28 ; includes padding - .ar_coeff_shift: resq 1 - .grain_scale_shift: resd 1 - .uv_mult: resd 2 - .uv_luma_mult: resd 2 - .uv_offset: resd 2 - .overlap_flag: resd 1 - .clip_to_restricted_range: resd 1 -endstruc - -cextern gaussian_sequence - SECTION .text
-%macro REPX 2-* - %xdefine %%f(x) %1 -%rep %0 - 1 - %rotate 1 - %%f(%1) -%endrep -%endmacro - %if ARCH_X86_32 %define PIC_ptr(a) base+a %else diff --git a/third_party/dav1d/src/x86/ipred16_avx2.asm b/third_party/dav1d/src/x86/ipred16_avx2.asm index e6d4faddee8ae..72300c2a4cd07 100644 --- a/third_party/dav1d/src/x86/ipred16_avx2.asm +++ b/third_party/dav1d/src/x86/ipred16_avx2.asm @@ -26,7 +26,7 @@ %include "config.asm" %include "ext/x86/x86inc.asm"
-SECTION_RODATA 32 +SECTION_RODATA 64
%macro SMOOTH_WEIGHTS 1-* const smooth_weights_1d_16bpc ; sm_weights[] << 7 @@ -134,14 +134,6 @@ cextern filter_intra_taps
SECTION .text
-%macro REPX 2-* - %xdefine %%f(x) %1 -%rep %0 - 1 - %rotate 1 - %%f(%1) -%endrep -%endmacro - INIT_YMM avx2 cglobal ipred_dc_top_16bpc, 3, 7, 6, dst, stride, tl, w, h movifnidn hd, hm diff --git a/third_party/dav1d/src/x86/ipred16_avx512.asm b/third_party/dav1d/src/x86/ipred16_avx512.asm new file mode 100644 index 0000000000000..4a1b060bd5f67 --- /dev/null +++ b/third_party/dav1d/src/x86/ipred16_avx512.asm @@ -0,0 +1,833 @@ +; Copyright © 2022, VideoLAN and dav1d authors +; Copyright © 2022, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 64 + +ipred_shuf: db 14, 15, 14, 15, 0, 1, 2, 3, 6, 7, 6, 7, 0, 1, 2, 3 + db 10, 11, 10, 11, 8, 9, 10, 11, 2, 3, 2, 3, 8, 9, 10, 11 + db 12, 13, 12, 13, 4, 5, 6, 7, 4, 5, 4, 5, 4, 5, 6, 7 + db 8, 9, 8, 9, 12, 13, 14, 15, 0, 1, 0, 1, 12, 13, 14, 15 +smooth_perm: db 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30 + db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62 + db 65, 66, 69, 70, 73, 74, 77, 78, 81, 82, 85, 86, 89, 90, 93, 94 + db 97, 98,101,102,105,106,109,110,113,114,117,118,121,122,125,126 +pal_pred_perm: db 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39 + db 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 + db 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55 + db 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63 +filter_permA: times 4 db 6, 7, 8, 9, 14, 15, 4, 5 + times 4 db 10, 11, 12, 13, 2, 3, -1, -1 +filter_permB: times 4 db 22, 23, 24, 25, 30, 31, 6, 7 + times 4 db 26, 27, 28, 29, 14, 15, -1, -1 +filter_permC: dd 8 ; dq 8, 10, 1, 11, 0, 9 +pw_1: times 2 dw 1 + dd 10 +filter_rnd: dd 32 + dd 1 + dd 8 + dd 11 +filter_shift: times 2 dw 6 + dd 0 + times 2 dw 4 + dd 9 + +%macro JMP_TABLE 3-* + %xdefine %1_%2_table (%%table - 2*4) + %xdefine %%base mangle(private_prefix %+ _%1_%2) + %%table: + %rep %0 - 2 + dd %%base %+ .%3 - (%%table - 2*4) + %rotate 1 + %endrep +%endmacro + +JMP_TABLE ipred_paeth_16bpc, avx512icl, w4, w8, w16, w32, w64 +JMP_TABLE ipred_smooth_16bpc, avx512icl, w4, w8, w16, w32, w64 +JMP_TABLE ipred_smooth_h_16bpc, avx512icl, w4, w8, w16, w32, w64 +JMP_TABLE ipred_smooth_v_16bpc, avx512icl, w4, w8, w16, w32, w64 +JMP_TABLE pal_pred_16bpc, avx512icl, w4, w8, w16, w32, w64 + +cextern smooth_weights_1d_16bpc +cextern smooth_weights_2d_16bpc +cextern filter_intra_taps + +SECTION .text + +%macro PAETH 3 ; top, signed_ldiff, ldiff + paddw m0, m%2, m2 + psubw m1, m0, m3 ; tldiff + psubw m0, m%1 ; tdiff + pabsw m1, m1 + pabsw m0, m0 + pcmpgtw k1, m0, m1 + pminsw m0, m1 + pcmpgtw k2, m%3, m0 + vpblendmw m0{k1}, m%1, m3 + vpblendmw m0{k2}, m2, m0 +%endmacro + +INIT_ZMM avx512icl +cglobal ipred_paeth_16bpc, 3, 7, 10, dst, stride, tl, w, h +%define base r6-ipred_paeth_16bpc_avx512icl_table + lea r6, [ipred_paeth_16bpc_avx512icl_table] + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, [r6+wq*4] + vpbroadcastw m3, [tlq] ; topleft + add wq, r6 + jmp wq +.w4: + vpbroadcastq m4, [tlq+2] ; top + movsldup m7, [base+ipred_shuf] + lea r6, [strideq*3] + psubw m5, m4, m3 + pabsw m6, m5 +.w4_loop: + sub tlq, 16 + vbroadcasti32x4 m2, [tlq] + pshufb m2, m7 ; left + PAETH 4, 5, 6 + vextracti32x4 xmm1, m0, 2 + vextracti32x4 xmm2, ym0, 1 + vextracti32x4 xmm3, m0, 3 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xmm1 + movq [dstq+strideq*2], xmm2 + movq [dstq+r6 ], xmm3 + sub hd, 8 + jl .w4_end + lea dstq, [dstq+strideq*4] + movhps [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xmm1 + movhps [dstq+strideq*2], xmm2 + movhps [dstq+r6 ], xmm3 + lea dstq, [dstq+strideq*4] + jg .w4_loop +.w4_end: + RET +.w8: + vbroadcasti32x4 m4, [tlq+2] + movsldup m7, [base+ipred_shuf] + lea r6, [strideq*3] + psubw m5, m4, m3 + pabsw m6, m5 +.w8_loop: + sub tlq, 8 + vpbroadcastq m2, [tlq] + pshufb m2, m7 + PAETH 4, 5, 6 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], m0, 2 + vextracti32x4 [dstq+strideq*2], ym0, 1 + vextracti32x4 [dstq+r6 ], m0, 3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8_loop + RET +.w16: + vbroadcasti32x8 
m4, [tlq+2] + movsldup m7, [base+ipred_shuf] + psubw m5, m4, m3 + pabsw m6, m5 +.w16_loop: + sub tlq, 4 + vpbroadcastd m2, [tlq] + pshufb m2, m7 + PAETH 4, 5, 6 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w16_loop + RET +.w32: + movu m4, [tlq+2] + psubw m5, m4, m3 + pabsw m6, m5 +.w32_loop: + sub tlq, 2 + vpbroadcastw m2, [tlq] + PAETH 4, 5, 6 + mova [dstq], m0 + add dstq, strideq + dec hd + jg .w32_loop + RET +.w64: + movu m4, [tlq+ 2] + movu m7, [tlq+66] + psubw m5, m4, m3 + psubw m8, m7, m3 + pabsw m6, m5 + pabsw m9, m8 +.w64_loop: + sub tlq, 2 + vpbroadcastw m2, [tlq] + PAETH 4, 5, 6 + mova [dstq+64*0], m0 + PAETH 7, 8, 9 + mova [dstq+64*1], m0 + add dstq, strideq + dec hd + jg .w64_loop + RET + +cglobal ipred_smooth_v_16bpc, 3, 7, 7, dst, stride, tl, w, h, weights, stride3 +%define base r6-$$ + lea r6, [$$] + tzcnt wd, wm + mov hd, hm + movsxd wq, [base+ipred_smooth_v_16bpc_avx512icl_table+wq*4] + lea weightsq, [base+smooth_weights_1d_16bpc+hq*4] + neg hq + vpbroadcastw m6, [tlq+hq*2] ; bottom + lea wq, [base+ipred_smooth_v_16bpc_avx512icl_table+wq] + lea stride3q, [strideq*3] + jmp wq +.w4: + vpbroadcastq m5, [tlq+2] ; top + movsldup m4, [ipred_shuf] + psubw m5, m6 ; top - bottom +.w4_loop: + vbroadcasti32x4 m3, [weightsq+hq*2] + pshufb m3, m4 + pmulhrsw m3, m5 + paddw m3, m6 + vextracti32x4 xmm0, m3, 3 + vextracti32x4 xmm1, ym3, 1 + vextracti32x4 xmm2, m3, 2 + movhps [dstq+strideq*0], xmm0 + movhps [dstq+strideq*1], xmm1 + movhps [dstq+strideq*2], xmm2 + movhps [dstq+stride3q ], xm3 + add hq, 8 + jg .end + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xmm0 + movq [dstq+strideq*1], xmm1 + movq [dstq+strideq*2], xmm2 + movq [dstq+stride3q ], xm3 + lea dstq, [dstq+strideq*4] + jl .w4_loop +.end: + RET +.w8: + vbroadcasti32x4 m5, [tlq+2] ; top + movsldup m4, [ipred_shuf] + psubw m5, m6 ; top - bottom +.w8_loop: + vpbroadcastq m0, [weightsq+hq*2] + pshufb m0, m4 + pmulhrsw m0, m5 + paddw m0, m6 + vextracti32x4 [dstq+strideq*0], m0, 3 + vextracti32x4 [dstq+strideq*1], ym0, 1 + vextracti32x4 [dstq+strideq*2], m0, 2 + mova [dstq+stride3q ], xm0 + lea dstq, [dstq+strideq*4] + add hq, 4 + jl .w8_loop + RET +.w16: + vbroadcasti32x8 m5, [tlq+2] ; top + movsldup m4, [ipred_shuf] + psubw m5, m6 ; top - bottom +.w16_loop: + vpbroadcastd m0, [weightsq+hq*2+0] + vpbroadcastd m1, [weightsq+hq*2+4] + pshufb m0, m4 + pshufb m1, m4 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + paddw m0, m6 + paddw m1, m6 + vextracti32x8 [dstq+strideq*0], m0, 1 + mova [dstq+strideq*1], ym0 + vextracti32x8 [dstq+strideq*2], m1, 1 + mova [dstq+stride3q ], ym1 + lea dstq, [dstq+strideq*4] + add hq, 4 + jl .w16_loop + RET +.w32: + movu m5, [tlq+2] + psubw m5, m6 +.w32_loop: + vpbroadcastw m0, [weightsq+hq*2+0] + vpbroadcastw m1, [weightsq+hq*2+2] + vpbroadcastw m2, [weightsq+hq*2+4] + vpbroadcastw m3, [weightsq+hq*2+6] + REPX {pmulhrsw x, m5}, m0, m1, m2, m3 + REPX {paddw x, m6}, m0, m1, m2, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + add hq, 4 + jl .w32_loop + RET +.w64: + movu m4, [tlq+ 2] + movu m5, [tlq+66] + psubw m4, m6 + psubw m5, m6 +.w64_loop: + vpbroadcastw m1, [weightsq+hq*2+0] + vpbroadcastw m3, [weightsq+hq*2+2] + pmulhrsw m0, m4, m1 + pmulhrsw m1, m5 + pmulhrsw m2, m4, m3 + pmulhrsw m3, m5 + REPX {paddw x, m6}, m0, m1, m2, m3 + mova [dstq+strideq*0+64*0], m0 + mova [dstq+strideq*0+64*1], m1 + mova [dstq+strideq*1+64*0], m2 + mova 
[dstq+strideq*1+64*1], m3 + lea dstq, [dstq+strideq*2] + add hq, 2 + jl .w64_loop + RET + +cglobal ipred_smooth_h_16bpc, 3, 7, 7, dst, stride, tl, w, h, stride3 + lea r6, [$$] + mov wd, wm + movifnidn hd, hm + vpbroadcastw m6, [tlq+wq*2] ; right + tzcnt wd, wd + add hd, hd + movsxd wq, [base+ipred_smooth_h_16bpc_avx512icl_table+wq*4] + sub tlq, hq + lea stride3q, [strideq*3] + lea wq, [base+ipred_smooth_h_16bpc_avx512icl_table+wq] + jmp wq +.w4: + movsldup m4, [base+ipred_shuf] + vpbroadcastq m5, [base+smooth_weights_1d_16bpc+4*2] +.w4_loop: + vbroadcasti32x4 m0, [tlq+hq-16] ; left + pshufb m0, m4 + psubw m0, m6 ; left - right + pmulhrsw m0, m5 + paddw m0, m6 + vextracti32x4 xmm1, m0, 2 + vextracti32x4 xmm2, ym0, 1 + vextracti32x4 xmm3, m0, 3 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xmm1 + movq [dstq+strideq*2], xmm2 + movq [dstq+stride3q ], xmm3 + sub hd, 8*2 + jl .end + lea dstq, [dstq+strideq*4] + movhps [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xmm1 + movhps [dstq+strideq*2], xmm2 + movhps [dstq+stride3q ], xmm3 + lea dstq, [dstq+strideq*4] + jg .w4_loop +.end: + RET +.w8: + movsldup m4, [base+ipred_shuf] + vbroadcasti32x4 m5, [base+smooth_weights_1d_16bpc+8*2] +.w8_loop: + vpbroadcastq m0, [tlq+hq-8] ; left + pshufb m0, m4 + psubw m0, m6 ; left - right + pmulhrsw m0, m5 + paddw m0, m6 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], m0, 2 + vextracti32x4 [dstq+strideq*2], ym0, 1 + vextracti32x4 [dstq+stride3q ], m0, 3 + lea dstq, [dstq+strideq*4] + sub hd, 4*2 + jg .w8_loop + RET +.w16: + movsldup m4, [base+ipred_shuf] + vbroadcasti32x8 m5, [base+smooth_weights_1d_16bpc+16*2] +.w16_loop: + vpbroadcastd m0, [tlq+hq-4] + vpbroadcastd m1, [tlq+hq-8] + pshufb m0, m4 + pshufb m1, m4 + psubw m0, m6 + psubw m1, m6 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + paddw m0, m6 + paddw m1, m6 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], ym1 + vextracti32x8 [dstq+stride3q ], m1, 1 + lea dstq, [dstq+strideq*4] + sub hq, 4*2 + jg .w16_loop + RET +.w32: + movu m5, [base+smooth_weights_1d_16bpc+32*2] +.w32_loop: + vpbroadcastq m3, [tlq+hq-8] + punpcklwd m3, m3 + psubw m3, m6 + pshufd m0, m3, q3333 + pshufd m1, m3, q2222 + pshufd m2, m3, q1111 + pshufd m3, m3, q0000 + REPX {pmulhrsw x, m5}, m0, m1, m2, m3 + REPX {paddw x, m6}, m0, m1, m2, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + sub hq, 4*2 + jg .w32_loop + RET +.w64: + movu m4, [base+smooth_weights_1d_16bpc+64*2] + movu m5, [base+smooth_weights_1d_16bpc+64*3] +.w64_loop: + vpbroadcastw m1, [tlq+hq-2] + vpbroadcastw m3, [tlq+hq-4] + psubw m1, m6 + psubw m3, m6 + pmulhrsw m0, m4, m1 + pmulhrsw m1, m5 + pmulhrsw m2, m4, m3 + pmulhrsw m3, m5 + REPX {paddw x, m6}, m0, m1, m2, m3 + mova [dstq+strideq*0+64*0], m0 + mova [dstq+strideq*0+64*1], m1 + mova [dstq+strideq*1+64*0], m2 + mova [dstq+strideq*1+64*1], m3 + lea dstq, [dstq+strideq*2] + sub hq, 2*2 + jg .w64_loop + RET + +cglobal ipred_smooth_16bpc, 3, 7, 16, dst, stride, tl, w, h, v_weights, stride3 + lea r6, [$$] + mov wd, wm + movifnidn hd, hm + vpbroadcastw m13, [tlq+wq*2] ; right + tzcnt wd, wd + add hd, hd + movsxd wq, [base+ipred_smooth_16bpc_avx512icl_table+wq*4] + mov r5d, 0x55555555 + sub tlq, hq + mova m14, [base+smooth_perm] + kmovd k1, r5d + vpbroadcastw m0, [tlq] ; bottom + mov r5, 0x3333333333333333 + pxor m15, m15 + lea wq, [base+ipred_smooth_16bpc_avx512icl_table+wq] + kmovq k2, r5 + lea v_weightsq, 
[base+smooth_weights_2d_16bpc+hq*2] + jmp wq +.w4: + vpbroadcastq m5, [tlq+hq+2] + movshdup m3, [base+ipred_shuf] + movsldup m4, [base+ipred_shuf] + vbroadcasti32x4 m6, [base+smooth_weights_2d_16bpc+4*4] + lea stride3q, [strideq*3] + punpcklwd m5, m0 ; top, bottom +.w4_loop: + vbroadcasti32x4 m0, [v_weightsq] + vpbroadcastq m2, [tlq+hq-8] + mova m1, m13 + pshufb m0, m3 + pmaddwd m0, m5 + pshufb m1{k2}, m2, m4 ; left, right + vpdpwssd m0, m1, m6 + vpermb m0, m14, m0 + pavgw ym0, ym15 + vextracti32x4 xmm1, ym0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xmm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xmm1 + lea dstq, [dstq+strideq*4] + add v_weightsq, 4*4 + sub hd, 4*2 + jg .w4_loop + RET +.w8: + vbroadcasti32x4 ym5, [tlq+hq+2] + movshdup m6, [base+ipred_shuf] + movsldup m7, [base+ipred_shuf] + pmovzxwd m5, ym5 + vbroadcasti32x8 m8, [base+smooth_weights_2d_16bpc+8*4] + lea stride3q, [strideq*3] + vpblendmw m5{k1}, m0, m5 ; top, bottom +.w8_loop: + vpbroadcastq m0, [v_weightsq+0] + vpbroadcastq m1, [v_weightsq+8] + vpbroadcastd m3, [tlq+hq-4] + vpbroadcastd m4, [tlq+hq-8] + pshufb m0, m6 + pmaddwd m0, m5 + pshufb m1, m6 + pmaddwd m1, m5 + mova m2, m13 + pshufb m2{k2}, m3, m7 ; left, right + mova m3, m13 + pshufb m3{k2}, m4, m7 + vpdpwssd m0, m2, m8 + vpdpwssd m1, m3, m8 + add v_weightsq, 4*4 + vpermt2b m0, m14, m1 + pavgw m0, m15 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + vextracti32x4 [dstq+strideq*2], m0, 2 + vextracti32x4 [dstq+stride3q ], m0, 3 + lea dstq, [dstq+strideq*4] + sub hd, 4*2 + jg .w8_loop + RET +.w16: + pmovzxwd m5, [tlq+hq+2] + mova m6, [base+smooth_weights_2d_16bpc+16*4] + vpblendmw m5{k1}, m0, m5 ; top, bottom +.w16_loop: + vpbroadcastd m0, [v_weightsq+0] + vpbroadcastd m1, [v_weightsq+4] + pmaddwd m0, m5 + pmaddwd m1, m5 + mova m2, m13 + vpbroadcastw m2{k1}, [tlq+hq-2] ; left, right + mova m3, m13 + vpbroadcastw m3{k1}, [tlq+hq-4] + vpdpwssd m0, m2, m6 + vpdpwssd m1, m3, m6 + add v_weightsq, 2*4 + vpermt2b m0, m14, m1 + pavgw m0, m15 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hq, 2*2 + jg .w16_loop + RET +.w32: + pmovzxwd m5, [tlq+hq+ 2] + pmovzxwd m6, [tlq+hq+34] + mova m7, [base+smooth_weights_2d_16bpc+32*4] + mova m8, [base+smooth_weights_2d_16bpc+32*6] + vpblendmw m5{k1}, m0, m5 ; top, bottom + vpblendmw m6{k1}, m0, m6 +.w32_loop: + vpbroadcastd m2, [v_weightsq+0] + vpbroadcastd m3, [v_weightsq+4] + pmaddwd m0, m5, m2 + pmaddwd m2, m6 + pmaddwd m1, m5, m3 + pmaddwd m3, m6 + mova m4, m13 + vpbroadcastw m4{k1}, [tlq+hq-2] ; left, right + vpdpwssd m0, m4, m7 + vpdpwssd m2, m4, m8 + mova m4, m13 + vpbroadcastw m4{k1}, [tlq+hq-4] + vpdpwssd m1, m4, m7 + vpdpwssd m3, m4, m8 + add v_weightsq, 2*4 + vpermt2b m0, m14, m2 + vpermt2b m1, m14, m3 + pavgw m0, m15 + pavgw m1, m15 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + sub hq, 2*2 + jg .w32_loop + RET +.w64: + pmovzxwd m5, [tlq+hq+ 2] + pmovzxwd m6, [tlq+hq+34] + pmovzxwd m7, [tlq+hq+66] + pmovzxwd m8, [tlq+hq+98] + mova m9, [base+smooth_weights_2d_16bpc+64*4] + vpblendmw m5{k1}, m0, m5 ; top, bottom + mova m10, [base+smooth_weights_2d_16bpc+64*5] + vpblendmw m6{k1}, m0, m6 + mova m11, [base+smooth_weights_2d_16bpc+64*6] + vpblendmw m7{k1}, m0, m7 + mova m12, [base+smooth_weights_2d_16bpc+64*7] + vpblendmw m8{k1}, m0, m8 +.w64_loop: + vpbroadcastd m3, [v_weightsq] + mova m4, m13 + vpbroadcastw m4{k1}, [tlq+hq-2] ; left, right + pmaddwd m0, m5, m3 + pmaddwd m2, m6, m3 + 
pmaddwd m1, m7, m3 + pmaddwd m3, m8 + vpdpwssd m0, m4, m9 + vpdpwssd m2, m4, m10 + vpdpwssd m1, m4, m11 + vpdpwssd m3, m4, m12 + add v_weightsq, 1*4 + vpermt2b m0, m14, m2 + vpermt2b m1, m14, m3 + pavgw m0, m15 + pavgw m1, m15 + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + add dstq, strideq + sub hd, 1*2 + jg .w64_loop + RET + +cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx, w, h, stride3 + lea r6, [pal_pred_16bpc_avx512icl_table] + tzcnt wd, wm + mova m2, [pal_pred_perm] + movsxd wq, [r6+wq*4] + mova xm3, [palq] + movifnidn hd, hm + add wq, r6 + lea stride3q, [strideq*3] + jmp wq +.w4: + pmovzxbw ym0, [idxq] + add idxq, 16 + vpermw ym0, ym0, ym3 + vextracti32x4 xmm1, ym0, 1 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xmm1 + movhps [dstq+stride3q ], xmm1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4 + RET +.w8: + pmovzxbw m0, [idxq] + add idxq, 32 + vpermw m0, m0, m3 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + vextracti32x4 [dstq+strideq*2], m0, 2 + vextracti32x4 [dstq+stride3q ], m0, 3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8 + RET +.w16: + vpermb m1, m2, [idxq] + add idxq, 64 + vpermw m0, m1, m3 + psrlw m1, 8 + vpermw m1, m1, m3 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], ym1 + vextracti32x8 [dstq+stride3q ], m1, 1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w16 + RET +.w32: + vpermb m1, m2, [idxq] + add idxq, 64 + vpermw m0, m1, m3 + psrlw m1, 8 + vpermw m1, m1, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32 + RET +.w64: + vpermb m1, m2, [idxq] + add idxq, 64 + vpermw m0, m1, m3 + psrlw m1, 8 + vpermw m1, m1, m3 + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + add dstq, strideq + dec hd + jg .w64 + RET + +; The ipred_filter SIMD processes 4x2 blocks in the following order which +; increases parallelism compared to doing things row by row. 
+; w4 w8 w16 w32 +; 1 1 2 1 2 5 6 1 2 5 6 9 a d e +; 2 2 3 2 3 6 7 2 3 6 7 a b e f +; 3 3 4 3 4 7 8 3 4 7 8 b c f g +; 4 4 5 4 5 8 9 4 5 8 9 c d g h + +cglobal ipred_filter_16bpc, 4, 7, 14, dst, stride, tl, w, h, filter, top +%define base r6-$$ + lea r6, [$$] +%ifidn filterd, filterm + movzx filterd, filterb +%else + movzx filterd, byte filterm +%endif + shl filterd, 6 + movifnidn hd, hm + movu xm0, [tlq-6] + pmovsxbw m7, [base+filter_intra_taps+filterq+32*0] + pmovsxbw m8, [base+filter_intra_taps+filterq+32*1] + mov r5d, r8m ; bitdepth_max + movsldup m9, [base+filter_permA] + movshdup m10, [base+filter_permA] + shr r5d, 11 ; is_12bpc + jnz .12bpc + psllw m7, 2 ; upshift multipliers so that packusdw + psllw m8, 2 ; will perform clipping for free +.12bpc: + vpbroadcastd m5, [base+filter_rnd+r5*8] + vpbroadcastd m6, [base+filter_shift+r5*8] + sub wd, 8 + jl .w4 +.w8: + call .main4 + movsldup m11, [filter_permB] + lea r5d, [hq*2+2] + movshdup m12, [filter_permB] + lea topq, [tlq+2] + mova m13, [filter_permC] + sub hd, 4 + vinserti32x4 ym0, [topq], 1 ; a0 b0 t0 t1 + sub tlq, r5 +%if WIN64 + push r7 + push r8 +%endif + mov r7, dstq + mov r8d, hd +.w8_loop: + movlps xm4, xm0, [tlq+hq*2] + call .main8 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jge .w8_loop + test wd, wd + jz .end + mov r2d, 0x0d + kmovb k1, r2d + lea r2, [strideq*3] +.w16: + movd xmm0, [r7+strideq*1+12] + vpblendd xmm0, [topq+8], 0x0e ; t1 t2 + pinsrw xm4, xmm0, [r7+strideq*0+14], 2 + call .main8 + add r7, 16 + vinserti32x4 ym0, [topq+16], 1 ; a2 b2 t2 t3 + mov hd, r8d + mov dstq, r7 + add topq, 16 +.w16_loop: + movd xmm1, [dstq+strideq*2-4] + punpcklwd xm4, xmm1, xmm0 + movd xmm0, [dstq+r2-4] + shufps xm4{k1}, xmm0, xm0, q3210 + call .main8 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jge .w16_loop + sub wd, 8 + jg .w16 +.end: + vpermb m2, m11, m0 + mova ym1, ym5 + vpdpwssd m1, m2, m7 + vpermb m2, m12, m0 + vpdpwssd m1, m2, m8 +%if WIN64 + pop r8 + pop r7 +%endif + vextracti32x8 ym2, m1, 1 + paddd ym1, ym2 + packusdw ym1, ym1 + vpsrlvw ym1, ym6 + vpermt2q m0, m13, m1 + vextracti32x4 [dstq+strideq*0], m0, 2 + vextracti32x4 [dstq+strideq*1], ym0, 1 + RET +.w4_loop: + movlps xm0, [tlq-10] + lea dstq, [dstq+strideq*2] + sub tlq, 4 +.w4: + call .main4 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + sub hd, 2 + jg .w4_loop + RET +ALIGN function_align +.main4: + vpermb m2, m9, m0 + mova ym1, ym5 + vpdpwssd m1, m2, m7 + vpermb m0, m10, m0 + vpdpwssd m1, m0, m8 + vextracti32x8 ym0, m1, 1 + paddd ym0, ym1 + vextracti32x4 xm1, ym0, 1 + packusdw xm0, xm1 ; clip + vpsrlvw xm0, xm6 + ret +ALIGN function_align +.main8: + vpermb m3, m11, m0 + mova ym2, ym5 + vpdpwssd m2, m3, m7 + vpermb m3, m9, m4 + mova ym1, ym5 + vpdpwssd m1, m3, m7 + vpermb m3, m12, m0 + vpdpwssd m2, m3, m8 + vpermb m3, m10, m4 + vpdpwssd m1, m3, m8 + vextracti32x8 ym4, m2, 1 + vextracti32x8 ym3, m1, 1 + paddd ym2, ym4 + paddd ym1, ym3 + packusdw ym1, ym2 ; clip + vpsrlvw ym1, ym6 + vpermt2q m0, m13, m1 ; c0 d0 b0 b1 a0 a1 + vextracti32x4 [dstq+strideq*0], m0, 2 + vextracti32x4 [dstq+strideq*1], ym0, 1 + ret + +%endif diff --git a/third_party/dav1d/src/x86/ipred16_sse.asm b/third_party/dav1d/src/x86/ipred16_sse.asm index eaa56b67bcf05..07ea9567e1ab9 100644 --- a/third_party/dav1d/src/x86/ipred16_sse.asm +++ b/third_party/dav1d/src/x86/ipred16_sse.asm @@ -70,14 +70,6 @@ cextern filter_intra_taps
SECTION .text
-%macro REPX 2-*
-    %xdefine %%f(x) %1
-%rep %0 - 1
-    %rotate 1
-    %%f(%1)
-%endrep
-%endmacro
-
 INIT_XMM ssse3
 cglobal ipred_dc_top_16bpc, 3, 7, 6, dst, stride, tl, w, h
     LEA r5, ipred_dc_left_16bpc_ssse3_table
diff --git a/third_party/dav1d/src/x86/ipred_init_tmpl.c b/third_party/dav1d/src/x86/ipred_init_tmpl.c
index 3f1a3493c2551..0ba0a41088001 100644
--- a/third_party/dav1d/src/x86/ipred_init_tmpl.c
+++ b/third_party/dav1d/src/x86/ipred_init_tmpl.c
@@ -134,6 +134,7 @@ COLD void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c
     init_angular_ipred_fn(LEFT_DC_PRED, ipred_dc_left, avx512icl);
     init_angular_ipred_fn(HOR_PRED, ipred_h, avx512icl);
     init_angular_ipred_fn(VERT_PRED, ipred_v, avx512icl);
+#endif
     init_angular_ipred_fn(PAETH_PRED, ipred_paeth, avx512icl);
     init_angular_ipred_fn(SMOOTH_PRED, ipred_smooth, avx512icl);
     init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, avx512icl);
@@ -142,5 +143,4 @@ COLD void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c

     c->pal_pred = BF(dav1d_pal_pred, avx512icl);
 #endif
-#endif
 }
diff --git a/third_party/dav1d/src/x86/itx16_avx2.asm b/third_party/dav1d/src/x86/itx16_avx2.asm
index 071ecbc33a268..c580944c7bbf5 100644
--- a/third_party/dav1d/src/x86/itx16_avx2.asm
+++ b/third_party/dav1d/src/x86/itx16_avx2.asm
@@ -145,14 +145,6 @@ cextern iadst_16x16_internal_8bpc_avx2.main_pass2_end
SECTION .text
-%macro REPX 2-* - %xdefine %%f(x) %1 -%rep %0 - 1 - %rotate 1 - %%f(%1) -%endrep -%endmacro - %define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
%macro WRAP_XMM 1+ diff --git a/third_party/dav1d/src/x86/itx16_sse.asm b/third_party/dav1d/src/x86/itx16_sse.asm index fa8724691f3ae..4fb30ef4e7a6f 100644 --- a/third_party/dav1d/src/x86/itx16_sse.asm +++ b/third_party/dav1d/src/x86/itx16_sse.asm @@ -174,14 +174,6 @@ tbl_Nx64_offset: db 2* 0, 2*32, 2*16, 2*46
SECTION .text
-%macro REPX 2-* - %xdefine %%f(x) %1 -%rep %0 - 1 - %rotate 1 - %%f(%1) -%endrep -%endmacro - %define m_suffix(x, sfx) mangle(private_prefix %+ _ %+ x %+ sfx) %define m(x) m_suffix(x, SUFFIX)
diff --git a/third_party/dav1d/src/x86/itx_avx2.asm b/third_party/dav1d/src/x86/itx_avx2.asm index 9cd66443482e8..092c842786dfe 100644 --- a/third_party/dav1d/src/x86/itx_avx2.asm +++ b/third_party/dav1d/src/x86/itx_avx2.asm @@ -132,15 +132,6 @@ SECTION .text ; 1-byte offsets as long as data is within +-128 bytes of the base pointer. %define o_base deint_shuf + 128 %define o(x) (r6 - (o_base) + (x)) - -%macro REPX 2-* - %xdefine %%f(x) %1 -%rep %0 - 1 - %rotate 1 - %%f(%1) -%endrep -%endmacro - %define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
; flags: 1 = swap, 2 = interleave, 4: coef_regs diff --git a/third_party/dav1d/src/x86/itx_avx512.asm b/third_party/dav1d/src/x86/itx_avx512.asm index 32b68b1548367..7d01bccb4f52c 100644 --- a/third_party/dav1d/src/x86/itx_avx512.asm +++ b/third_party/dav1d/src/x86/itx_avx512.asm @@ -242,15 +242,6 @@ SECTION .text
%define o_base int8_permA+64*18 %define o(x) (r5 - (o_base) + (x)) - -%macro REPX 2-* - %xdefine %%f(x) %1 -%rep %0 - 1 - %rotate 1 - %%f(%1) -%endrep -%endmacro - %define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
; flags: 1 = swap, 2 = interleave (l), 4 = interleave (t), 8 = no_pack, diff --git a/third_party/dav1d/src/x86/itx_init_tmpl.c b/third_party/dav1d/src/x86/itx_init_tmpl.c index 251d77e4fa1ee..467d38293209d 100644 --- a/third_party/dav1d/src/x86/itx_init_tmpl.c +++ b/third_party/dav1d/src/x86/itx_init_tmpl.c @@ -278,7 +278,27 @@ COLD void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c,
if (bpc > 10) return;
-#if BITDEPTH == 16
+#if BITDEPTH == 8
+    assign_itx16_fn( , 4, 4, avx2);
+    assign_itx16_fn(R, 4, 8, avx2);
+    assign_itx16_fn(R, 4, 16, avx2);
+    assign_itx16_fn(R, 8, 4, avx2);
+    assign_itx16_fn( , 8, 8, avx2);
+    assign_itx16_fn(R, 8, 16, avx2);
+    assign_itx2_fn (R, 8, 32, avx2);
+    assign_itx16_fn(R, 16, 4, avx2);
+    assign_itx16_fn(R, 16, 8, avx2);
+    assign_itx12_fn( , 16, 16, avx2);
+    assign_itx2_fn (R, 16, 32, avx2);
+    assign_itx1_fn (R, 16, 64, avx2);
+    assign_itx2_fn (R, 32, 8, avx2);
+    assign_itx2_fn (R, 32, 16, avx2);
+    assign_itx2_fn ( , 32, 32, avx2);
+    assign_itx1_fn (R, 32, 64, avx2);
+    assign_itx1_fn (R, 64, 16, avx2);
+    assign_itx1_fn (R, 64, 32, avx2);
+    assign_itx1_fn ( , 64, 64, avx2);
+#elif BITDEPTH == 16
     assign_itx16_bpc_fn( , 4, 4, 10, avx2);
     assign_itx16_bpc_fn(R, 4, 8, 10, avx2);
     assign_itx16_bpc_fn(R, 4, 16, 10, avx2);
diff --git a/third_party/dav1d/src/x86/itx_sse.asm b/third_party/dav1d/src/x86/itx_sse.asm
index 7cbd9c3f3be59..ec7e3a52f4676 100644
--- a/third_party/dav1d/src/x86/itx_sse.asm
+++ b/third_party/dav1d/src/x86/itx_sse.asm
@@ -142,14 +142,6 @@ pw_m301x8: times 8 dw -301*8
SECTION .text
-%macro REPX 2-* - %xdefine %%f(x) %1 -%rep %0 - 1 - %rotate 1 - %%f(%1) -%endrep -%endmacro - %define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
%if ARCH_X86_64 @@ -2388,7 +2380,7 @@ INV_TXFM_8X16_FN identity, identity cglobal iidentity_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS coeffq+16*1, 32, 1 mov r3, tx2q - lea tx2q, [o(m(iidentity_8x16_internal_8bpc).pass1_end)] + lea tx2q, [o(.pass1_end)] mova [rsp+gprsize+16*1], m6 jmp m(idct_8x8_internal_8bpc).pass1_end3
@@ -2400,7 +2392,7 @@ cglobal iidentity_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 jmp m(idct_8x8_internal_8bpc).pass1_end3
.pass2: - lea tx2q, [o(m(iidentity_8x16_internal_8bpc).end1)] + lea tx2q, [o(.end1)]
.end: mova [rsp+gprsize+16*0], m7 @@ -2456,7 +2448,7 @@ cglobal idct_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS coeffq+16*1, 32, 1 call .main mov r3, tx2q - lea tx2q, [o(m(idct_16x8_internal_8bpc).pass1_end)] + lea tx2q, [o(.pass1_end)] jmp m(idct_8x8_internal_8bpc).pass1_end
.pass1_end: @@ -2467,7 +2459,7 @@ cglobal idct_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 jmp m(idct_8x8_internal_8bpc).pass1_end
.pass2: - lea tx2q, [o(m(idct_16x8_internal_8bpc).end)] + lea tx2q, [o(.end)] lea r3, [dstq+8] jmp m(idct_8x8_internal_8bpc).pass2_main
@@ -2595,7 +2587,7 @@ cglobal iadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 call .main call .main_pass1_end mov r3, tx2q - lea tx2q, [o(m(iadst_16x8_internal_8bpc).pass1_end)] + lea tx2q, [o(.pass1_end)] jmp m(iadst_8x8_internal_8bpc).pass1_end
.pass1_end: @@ -2606,7 +2598,7 @@ cglobal iadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 jmp m(iadst_8x8_internal_8bpc).pass1_end
.pass2: - lea tx2q, [o(m(iadst_16x8_internal_8bpc).end)] + lea tx2q, [o(.end)] lea r3, [dstq+8] jmp m(iadst_8x8_internal_8bpc).pass2_main
@@ -2880,7 +2872,7 @@ cglobal iflipadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 mov r3, tx2q - lea tx2q, [o(m(iflipadst_16x8_internal_8bpc).pass1_end)] + lea tx2q, [o(.pass1_end)] jmp m(iflipadst_8x8_internal_8bpc).pass1_end
.pass1_end: @@ -2891,7 +2883,7 @@ cglobal iflipadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 jmp m(iflipadst_8x8_internal_8bpc).pass1_end
.pass2: - lea tx2q, [o(m(iflipadst_16x8_internal_8bpc).end)] + lea tx2q, [o(.end)] lea r3, [dstq+8] jmp m(iflipadst_8x8_internal_8bpc).pass2_main
@@ -2914,7 +2906,7 @@ cglobal iidentity_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m6, [coeffq-16*3] mova m7, [coeffq-16*1] mov r3, tx2q - lea tx2q, [o(m(iidentity_16x8_internal_8bpc).pass1_end)] + lea tx2q, [o(.pass1_end)]
.pass1: mova m0, [o(pw_2896x8)] @@ -2972,7 +2964,7 @@ cglobal iidentity_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 jmp .pass1
.pass2: - lea tx2q, [o(m(iidentity_16x8_internal_8bpc).end)] + lea tx2q, [o(.end)] lea r3, [dstq+8] jmp m(iidentity_8x8_internal_8bpc).end
@@ -3010,7 +3002,7 @@ cglobal idct_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS coeffq+16*3, 64 call m(idct_16x8_internal_8bpc).main mov r3, tx2q - lea tx2q, [o(m(idct_16x16_internal_8bpc).pass1_end)] + lea tx2q, [o(.pass1_end)] mova m7, [o(pw_8192)] jmp m(idct_8x8_internal_8bpc).pass1_end1
@@ -3018,7 +3010,7 @@ cglobal idct_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 SAVE_8ROWS coeffq+16*17, 32 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_16x16_internal_8bpc).pass1_end1)] + lea tx2q, [o(.pass1_end1)] mova m7, [o(pw_8192)] jmp m(idct_8x8_internal_8bpc).pass1_end1
@@ -3029,7 +3021,7 @@ cglobal idct_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+16*2, 64 call m(idct_16x8_internal_8bpc).main - lea tx2q, [o(m(idct_16x16_internal_8bpc).pass1_end2)] + lea tx2q, [o(.pass1_end2)] mova m7, [o(pw_8192)] jmp m(idct_8x8_internal_8bpc).pass1_end1
@@ -3042,13 +3034,13 @@ cglobal idct_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass2: - lea tx2q, [o(m(idct_16x16_internal_8bpc).end)] + lea tx2q, [o(.end)] jmp m(idct_8x16_internal_8bpc).pass2_pre
.end: LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_16x16_internal_8bpc).end1)] + lea tx2q, [o(.end1)] mov dstq, r3 lea r3, [dstq+8] jmp m(idct_8x8_internal_8bpc).end @@ -3136,7 +3128,7 @@ cglobal iadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 call m(iadst_16x8_internal_8bpc).main_pass1_end
mov r3, tx2q - lea tx2q, [o(m(iadst_16x16_internal_8bpc).pass1_end)] + lea tx2q, [o(.pass1_end)] mova m7, [o(pw_8192)] jmp m(iadst_8x8_internal_8bpc).pass1_end1
@@ -3144,7 +3136,7 @@ cglobal iadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 SAVE_8ROWS coeffq+16*17, 32 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(iadst_16x16_internal_8bpc).pass1_end1)] + lea tx2q, [o(.pass1_end1)] mova m7, [o(pw_8192)] jmp m(iadst_8x8_internal_8bpc).pass1_end1
@@ -3154,7 +3146,7 @@ cglobal iadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 call m(iadst_16x8_internal_8bpc).main call m(iadst_16x8_internal_8bpc).main_pass1_end
- lea tx2q, [o(m(iadst_16x16_internal_8bpc).pass1_end2)] + lea tx2q, [o(.pass1_end2)] mova m7, [o(pw_8192)] jmp m(iadst_8x8_internal_8bpc).pass1_end1
@@ -3167,13 +3159,13 @@ cglobal iadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 jmp m(iadst_8x8_internal_8bpc).pass1_end1
.pass2: - lea tx2q, [o(m(iadst_16x16_internal_8bpc).end)] + lea tx2q, [o(.end)] jmp m(iadst_8x16_internal_8bpc).pass2_pre
.end: LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(iadst_16x16_internal_8bpc).end1)] + lea tx2q, [o(.end1)] mov dstq, r3 lea r3, [dstq+8] jmp m(iadst_8x8_internal_8bpc).end @@ -3211,7 +3203,7 @@ cglobal iflipadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 call m(iadst_16x8_internal_8bpc).main_pass1_end
mov r3, tx2q - lea tx2q, [o(m(iflipadst_16x16_internal_8bpc).pass1_end)] + lea tx2q, [o(.pass1_end)] mova m7, [o(pw_m8192)] jmp m(iflipadst_8x8_internal_8bpc).pass1_end1
@@ -3219,7 +3211,7 @@ cglobal iflipadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 SAVE_8ROWS coeffq+16*1, 32 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(iflipadst_16x16_internal_8bpc).pass1_end1)] + lea tx2q, [o(.pass1_end1)] mova m7, [o(pw_m8192)] jmp m(iflipadst_8x8_internal_8bpc).pass1_end1
@@ -3233,7 +3225,7 @@ cglobal iflipadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 SAVE_8ROWS coeffq+16*0, 32 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(iflipadst_16x16_internal_8bpc).pass1_end2)] + lea tx2q, [o(.pass1_end2)] mova m7, [o(pw_m8192)] jmp m(iflipadst_8x8_internal_8bpc).pass1_end1
@@ -3246,14 +3238,14 @@ cglobal iflipadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 jmp m(iflipadst_8x8_internal_8bpc).pass1_end1
.pass2: - lea tx2q, [o(m(iflipadst_16x16_internal_8bpc).end)] + lea tx2q, [o(.end)] lea r3, [dstq+8] jmp m(iflipadst_8x16_internal_8bpc).pass2_pre
.end: LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(iflipadst_16x16_internal_8bpc).end1)] + lea tx2q, [o(.end1)] lea dstq, [dstq+strideq*2] jmp m(iflipadst_8x8_internal_8bpc).end
@@ -3276,7 +3268,7 @@ cglobal iflipadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova [rsp+gprsize+16*5], m6 mova [rsp+gprsize+16*6], m7
- lea tx2q, [o(m(iflipadst_16x16_internal_8bpc).end2)] + lea tx2q, [o(.end2)] mov dstq, r3 jmp m(iflipadst_8x16_internal_8bpc).pass2_main
@@ -3300,7 +3292,7 @@ INV_TXFM_16X16_FN identity, identity cglobal iidentity_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 add coeffq, 16*17 mov r3, tx2q - lea tx2q, [o(m(iidentity_16x16_internal_8bpc).pass1_end)] + lea tx2q, [o(.pass1_end)]
.pass1: mova m6, [o(pw_1697x16)] @@ -3321,13 +3313,13 @@ cglobal iidentity_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 .pass1_end: SAVE_8ROWS coeffq, 32 sub coeffq, 16 - lea tx2q, [o(m(iidentity_16x16_internal_8bpc).pass1_end1)] + lea tx2q, [o(.pass1_end1)] jmp .pass1
.pass1_end1: SAVE_8ROWS coeffq, 32 sub coeffq, 15*16 - lea tx2q, [o(m(iidentity_16x16_internal_8bpc).pass1_end2)] + lea tx2q, [o(.pass1_end2)] jmp .pass1
.pass1_end2: @@ -3338,7 +3330,7 @@ cglobal iidentity_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
.pass2: lea r3, [dstq+8] - lea tx2q, [o(m(iidentity_16x16_internal_8bpc).end1)] + lea tx2q, [o(.end1)]
.end: mova [rsp+gprsize+16*0], m7 @@ -3361,7 +3353,7 @@ cglobal iidentity_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
.end1: LOAD_8ROWS coeffq+16*1, 32 - lea tx2q, [o(m(iidentity_16x16_internal_8bpc).end2)] + lea tx2q, [o(.end2)] lea dstq, [dstq+strideq*2] jmp .end
@@ -3371,7 +3363,7 @@ cglobal iidentity_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
add coeffq, 32*8 LOAD_8ROWS coeffq, 32 - lea tx2q, [o(m(iidentity_16x16_internal_8bpc).end3)] + lea tx2q, [o(.end3)] mov dstq, r3 jmp .end
@@ -3403,7 +3395,7 @@ cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, pshuflw m0, m0, q0000 punpcklwd m0, m0 mov r3d, 8 - lea tx2q, [o(m(inv_txfm_add_dct_dct_8x32_8bpc).end)] + lea tx2q, [o(.end)] jmp m(inv_txfm_add_dct_dct_8x8_8bpc).loop
.end: @@ -3412,14 +3404,13 @@ cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob,
cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 - %undef cmp cmp eobd, 106 jle .fast
LOAD_8ROWS coeffq+16*3, 64 call m(idct_8x8_internal_8bpc).main mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_8x32_internal_8bpc).pass1)] + lea tx2q, [o(.pass1)] jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1: @@ -3434,7 +3425,7 @@ cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS coeffq+16*2, 64 call m(idct_8x8_internal_8bpc).main mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_8x32_internal_8bpc).pass1_1)] + lea tx2q, [o(.pass1_1)] jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_1: @@ -3451,7 +3442,7 @@ cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS coeffq+16*1, 64 call m(idct_8x8_internal_8bpc).main mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_8x32_internal_8bpc).pass1_end)] + lea tx2q, [o(.pass1_end)] jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_end: @@ -3466,7 +3457,7 @@ cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS coeffq+16*0, 64 call m(idct_8x8_internal_8bpc).main mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_8x32_internal_8bpc).pass1_end1)] + lea tx2q, [o(.pass1_end1)] jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_end1: @@ -3514,11 +3505,11 @@ cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 call .main
.pass2: - lea r3, [o(m(idct_8x32_internal_8bpc).end6)] + lea r3, [o(.end6)]
.end: mova [rsp+gprsize+16*0 ], m7 - lea tx2q, [o(m(idct_8x32_internal_8bpc).end2)] + lea tx2q, [o(.end2)]
.end1: pxor m7, m7 @@ -3530,21 +3521,21 @@ cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 jmp tx2q
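The jmp tx2q tail above is the other half of every lea tx2q, [o(...)] in this file: the caller parks a continuation address in tx2q, jumps into a shared helper, and the helper hands control back with an indirect jump instead of call/ret. A small self-contained sketch of the pattern, with illustrative names and an arbitrary choice of rdx rather than the real tx2q register assignment:

BITS 64
SECTION .text
stage_a:
        lea     rdx, [rel .resume]   ; plays the role of lea tx2q, [o(.label)]
        jmp     shared_tail
.resume:
        ret

shared_tail:
        ; shared transform work common to several block sizes would go here
        jmp     rdx                  ; plays the role of jmp tx2q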
.end2: - lea tx2q, [o(m(idct_8x32_internal_8bpc).end3)] + lea tx2q, [o(.end3)] jmp m(idct_8x8_internal_8bpc).end
.end3: LOAD_8ROWS rsp+gprsize+16*11, 16 mova [rsp+gprsize+16*0 ], m7 lea dstq, [dstq+strideq*2] - lea tx2q, [o(m(idct_8x32_internal_8bpc).end4)] + lea tx2q, [o(.end4)] jmp m(idct_8x8_internal_8bpc).end
.end4: LOAD_8ROWS rsp+gprsize+16*19, 16 mova [rsp+gprsize+16*0 ], m7 lea dstq, [dstq+strideq*2] - lea tx2q, [o(m(idct_8x32_internal_8bpc).end5)] + lea tx2q, [o(.end5)] jmp m(idct_8x8_internal_8bpc).end
.end5: @@ -3883,7 +3874,7 @@ cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, movd m2, [o(pw_8192)] mov [coeffq], eobd mov r3d, 8 - lea tx2q, [o(m(inv_txfm_add_dct_dct_32x8_8bpc).end)] + lea tx2q, [o(.end)]
.body: pmulhrsw m0, m2 @@ -3919,7 +3910,6 @@ cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob,
cglobal idct_32x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 - %undef cmp LOAD_8ROWS coeffq+16*0, 64 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 @@ -3958,55 +3948,55 @@ cglobal idct_32x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
.pass2: mova [rsp+gprsize+16*0 ], m7 - lea tx2q, [o(m(idct_32x8_internal_8bpc).end)] + lea tx2q, [o(.end)] jmp m(idct_8x32_internal_8bpc).end1
.end: mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_32x8_internal_8bpc).end1)] + lea tx2q, [o(.end1)] jmp m(idct_8x8_internal_8bpc).pass1_end1
.end1: lea r3, [dstq+8] - lea tx2q, [o(m(idct_32x8_internal_8bpc).end2)] + lea tx2q, [o(.end2)] jmp m(idct_8x8_internal_8bpc).pass2_main
.end2: LOAD_8ROWS rsp+gprsize+16*11, 16 mova [rsp+gprsize+16*0 ], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_32x8_internal_8bpc).end3)] + lea tx2q, [o(.end3)] jmp m(idct_8x8_internal_8bpc).pass1_end1
.end3: mov dstq, r3 add r3, 8 - lea tx2q, [o(m(idct_32x8_internal_8bpc).end4)] + lea tx2q, [o(.end4)] jmp m(idct_8x8_internal_8bpc).pass2_main
.end4: LOAD_8ROWS rsp+gprsize+16*19, 16 mova [rsp+gprsize+16*0 ], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_32x8_internal_8bpc).end5)] + lea tx2q, [o(.end5)] jmp m(idct_8x8_internal_8bpc).pass1_end1
.end5: mov dstq, r3 add r3, 8 - lea tx2q, [o(m(idct_32x8_internal_8bpc).end6)] + lea tx2q, [o(.end6)] jmp m(idct_8x8_internal_8bpc).pass2_main
.end6: LOAD_8ROWS rsp+gprsize+16*27, 16 mova [rsp+gprsize+16*0 ], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_32x8_internal_8bpc).end7)] + lea tx2q, [o(.end7)] jmp m(idct_8x8_internal_8bpc).pass1_end1
.end7: mov dstq, r3 - lea tx2q, [o(m(idct_32x8_internal_8bpc).end8)] + lea tx2q, [o(.end8)] jmp m(idct_8x8_internal_8bpc).pass2_main
.end8: @@ -4085,6 +4075,7 @@ cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob test eobd, eobd jz .dconly call m(idct_16x32_internal_8bpc) +.end: RET
.dconly: @@ -4094,28 +4085,24 @@ cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob mov [coeffq], eobd pmulhrsw m0, m1 mov r2d, 16 - lea tx2q, [o(m(inv_txfm_add_dct_dct_16x32_8bpc).end)] + lea tx2q, [o(.end)] jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
-.end: - RET
cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 - %undef cmp - LOAD_8ROWS coeffq+16*1, 128, 1 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+16*5, 128, 1 call m(idct_16x8_internal_8bpc).main - lea tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end)] + lea tx2q, [o(.pass1_end)] jmp m(idct_8x8_internal_8bpc).pass1_end
.pass1_end: SAVE_8ROWS coeffq+16*33, 64 ;in8~in15 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end1)] + lea tx2q, [o(.pass1_end1)] jmp m(idct_8x8_internal_8bpc).pass1_end
.pass1_end1: @@ -4132,14 +4119,14 @@ cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+16*4, 128, 1 call m(idct_16x8_internal_8bpc).main - lea tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end2)] + lea tx2q, [o(.pass1_end2)] jmp m(idct_8x8_internal_8bpc).pass1_end
.pass1_end2: SAVE_8ROWS coeffq+16*32, 64 ;in0~in7 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end3)] + lea tx2q, [o(.pass1_end3)] jmp m(idct_8x8_internal_8bpc).pass1_end
.pass1_end3: @@ -4182,14 +4169,14 @@ cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+16*6, 128, 1 call m(idct_16x8_internal_8bpc).main - lea tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end4)] + lea tx2q, [o(.pass1_end4)] jmp m(idct_8x8_internal_8bpc).pass1_end
.pass1_end4: SAVE_8ROWS coeffq+16*34, 64 ;in16~in23 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end5)] + lea tx2q, [o(.pass1_end5)] jmp m(idct_8x8_internal_8bpc).pass1_end
.pass1_end5: @@ -4207,14 +4194,14 @@ cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+16*7, 128, 1 call m(idct_16x8_internal_8bpc).main - lea tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end6)] + lea tx2q, [o(.pass1_end6)] jmp m(idct_8x8_internal_8bpc).pass1_end
.pass1_end6: SAVE_8ROWS coeffq+16*35, 64 ;in24~in31 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end7)] + lea tx2q, [o(.pass1_end7)] jmp m(idct_8x8_internal_8bpc).pass1_end
.pass1_end7: @@ -4246,7 +4233,7 @@ cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mov [rsp+gprsize*1+16*35], eobd lea r3, [dstq+8] mov [rsp+gprsize*2+16*35], r3 - lea r3, [o(m(idct_16x32_internal_8bpc).end)] + lea r3, [o(.end)] jmp m(idct_8x32_internal_8bpc).end
.end: @@ -4296,7 +4283,7 @@ cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 SAVE_8ROWS rsp+gprsize+16*11, 16
call m(idct_8x32_internal_8bpc).main_fast - jmp .end1 + jmp m(idct_8x32_internal_8bpc).pass2
.full1: mova m4, [coeffq+16*2 ] ;in16 @@ -4337,12 +4324,9 @@ cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova [rsp+gprsize+16*34], m7 ;in31
call m(idct_8x32_internal_8bpc).main - -.end1: jmp m(idct_8x32_internal_8bpc).pass2
- cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 %if ARCH_X86_32 LEA r5, $$ @@ -4390,10 +4374,8 @@ cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob
cglobal idct_32x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 - %undef cmp - add coeffq, 16 - lea r3, [o(m(idct_32x16_internal_8bpc).pass1_end1)] + lea r3, [o(.pass1_end1)] .pass1: LOAD_8ROWS coeffq+16*0, 128, 1 call m(idct_8x8_internal_8bpc).main @@ -4434,28 +4416,28 @@ cglobal idct_32x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 SAVE_8ROWS coeffq+16*0, 32 LOAD_8ROWS rsp+gprsize+16*11, 16 mova [rsp+gprsize+16*0 ], m7 - lea tx2q, [o(m(idct_32x16_internal_8bpc).pass1_end2)] + lea tx2q, [o(.pass1_end2)] jmp m(idct_8x8_internal_8bpc).pass1_end
.pass1_end2: SAVE_8ROWS coeffq+16*16, 32 LOAD_8ROWS rsp+gprsize+16*19, 16 mova [rsp+gprsize+16*0 ], m7 - lea tx2q, [o(m(idct_32x16_internal_8bpc).pass1_end3)] + lea tx2q, [o(.pass1_end3)] jmp m(idct_8x8_internal_8bpc).pass1_end
.pass1_end3: SAVE_8ROWS coeffq+16*32, 32 LOAD_8ROWS rsp+gprsize+16*27, 16 mova [rsp+gprsize+16*0 ], m7 - lea tx2q, [o(m(idct_32x16_internal_8bpc).pass1_end4)] + lea tx2q, [o(.pass1_end4)] jmp m(idct_8x8_internal_8bpc).pass1_end
.pass1_end4: SAVE_8ROWS coeffq+16*48, 32
sub coeffq, 16 - lea r3, [o(m(idct_32x16_internal_8bpc).end)] + lea r3, [o(.end)] jmp .pass1
.end: @@ -4463,8 +4445,6 @@ cglobal idct_32x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
cglobal inv_txfm_add_identity_identity_16x32_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 - %undef cmp - mov r4d, eobd cmp eobd, 43 ;if (eob > 43) sbb r3d, r3d ; iteration_count++ @@ -4528,8 +4508,6 @@ cglobal inv_txfm_add_identity_identity_16x32_8bpc, 4, 6, 8, 16*4, dst, stride, c
cglobal inv_txfm_add_identity_identity_32x16_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 - %undef cmp - mov r4d, 12 ;0100b mov r5d, 136 ;1000 1000b cmp eobd, 44 ;if (eob > 43) @@ -4608,8 +4586,6 @@ cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob
cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 - %undef cmp - mov r4d, 2 sub eobd, 136 mov [rsp+gprsize*1+16*35], eobd @@ -4684,7 +4660,7 @@ cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 .pass1_end: mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_32x32_internal_8bpc).pass1_end1)] + lea tx2q, [o(.pass1_end1)] jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_end1: @@ -4692,7 +4668,7 @@ cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS rsp+gprsize+16*11, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_32x32_internal_8bpc).pass1_end2)] + lea tx2q, [o(.pass1_end2)] jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_end2: @@ -4700,7 +4676,7 @@ cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS rsp+gprsize+16*19, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_32x32_internal_8bpc).pass1_end3)] + lea tx2q, [o(.pass1_end3)] jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_end3: @@ -4708,7 +4684,7 @@ cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS rsp+gprsize+16*27, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_32x32_internal_8bpc).pass1_end4)] + lea tx2q, [o(.pass1_end4)] jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_end4: @@ -4722,7 +4698,7 @@ cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 .pass2: mov coeffq, [rsp+gprsize*2+16*35] mov r3d, 4 - lea tx2q, [o(m(idct_32x32_internal_8bpc).pass2_end)] + lea tx2q, [o(.pass2_end)]
.pass2_loop: mov [rsp+gprsize*3+16*35], r3d @@ -4818,11 +4794,11 @@ cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 jmp tx2q
.pass2_end: - lea r3, [o(m(idct_32x32_internal_8bpc).pass2_end1)] + lea r3, [o(.pass2_end1)] jmp m(idct_8x32_internal_8bpc).end
.pass2_end1: - lea tx2q, [o(m(idct_32x32_internal_8bpc).pass2_end)] + lea tx2q, [o(.pass2_end)] add coeffq, 16*32 mov dstq, [rsp+gprsize*2+16*35] mov r3d, [rsp+gprsize*3+16*35] @@ -4833,8 +4809,6 @@ cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
cglobal inv_txfm_add_identity_identity_32x32_8bpc, 4, 6, 8, 16*5, dst, stride, coeff, eob, tx2 - %undef cmp - mov r4d, 2 cmp eobd, 136 mov r3d, 4 @@ -4895,8 +4869,8 @@ cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob %endif test eobd, eobd jz .dconly - call m(idct_16x64_internal_8bpc) +.end: RET
.dconly: @@ -4905,16 +4879,11 @@ cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob movd m2, [o(pw_8192)] mov [coeffq], eobd mov r2d, 32 - lea tx2q, [o(m(inv_txfm_add_dct_dct_16x64_8bpc).end)] + lea tx2q, [o(.end)] jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
-.end: - RET -
cglobal idct_16x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 - %undef cmp - mov r4d, 2 sub eobd, 151 mov [rsp+gprsize*1+16*67], eobd @@ -4934,7 +4903,7 @@ cglobal idct_16x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS coeffq+64*1, 64*2 call m(idct_16x8_internal_8bpc).main mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_16x64_internal_8bpc).pass1_end)] + lea tx2q, [o(.pass1_end)] jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_end: @@ -4942,7 +4911,7 @@ cglobal idct_16x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_16x64_internal_8bpc).pass1_end1)] + lea tx2q, [o(.pass1_end1)] jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_end1: @@ -4956,7 +4925,7 @@ cglobal idct_16x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mov r3d, 2 lea r4, [dstq+8] mov [rsp+gprsize*2+16*67], r4 - lea r4, [o(m(idct_16x64_internal_8bpc).end1)] + lea r4, [o(.end1)]
.pass2_loop: mov [rsp+gprsize*3+16*67], r3d @@ -5082,23 +5051,47 @@ cglobal idct_16x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 .end1: LOAD_8ROWS rsp+gprsize+16*35, 16 lea dstq, [dstq+strideq*2] - add rsp, 16*32 - lea r3, [o(m(idct_16x64_internal_8bpc).end2)] - jmp m(idct_8x32_internal_8bpc).end - -.end2: - add coeffq, 16*32 - sub rsp, 16*32 - + lea r3, [rsp+16*32+gprsize] + call .write mov dstq, [rsp+gprsize*2+16*67] mov r3d, [rsp+gprsize*3+16*67] lea r4, [dstq+8] mov [rsp+gprsize*2+16*67], r4 - lea r4, [o(m(idct_16x64_internal_8bpc).end1)] + lea r4, [o(.end1)]
dec r3d jg .pass2_loop ret +.write: + mova [r3+16*0], m7 + mov r4, -16*32 + pxor m7, m7 + sub coeffq, r4 +.zero_loop: + mova [coeffq+r4+16*0], m7 + mova [coeffq+r4+16*1], m7 + add r4, 16*2 + jl .zero_loop + call .write_main2 + LOAD_8ROWS r3+16*11, 16 + call .write_main + LOAD_8ROWS r3+16*19, 16 + call .write_main + LOAD_8ROWS r3+16*27, 16 +.write_main: + mova [r3+16*0], m7 +.write_main2: + mova m7, [o(pw_2048)] + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + pmulhrsw m7, [r3+16*0] + mova [r3+16*2], m5 + mova [r3+16*1], m6 + mova [r3+16*0], m7 + WRITE_8X4 0, 1, 2, 3, 5, 6, 7 + lea dstq, [dstq+strideq*2] + WRITE_8X4 4, [r3+16*2], [r3+16*1], [r3+16*0], 5, 6, 7 + lea dstq, [dstq+strideq*2] + ret
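The new .zero_loop above clears the coefficient buffer by walking a negative offset up to zero, so the sign of the counter doubles as the loop condition. A stand-alone sketch of that idiom, with an illustrative 1024-byte buffer and register choices, assuming a 16-byte-aligned destination for movaps:

BITS 64
SECTION .text
clear_1k:                            ; rdi = 16-byte-aligned, 1024-byte buffer
        pxor    xmm0, xmm0
        mov     rax, -1024
        sub     rdi, rax             ; rdi now points one past the end
.zero_loop:
        movaps  [rdi+rax], xmm0
        movaps  [rdi+rax+16], xmm0
        add     rax, 32
        jl      .zero_loop
        ret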
ALIGN function_align @@ -5765,7 +5758,7 @@ cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 6, 8, 16*132, dst, stride, coeff, eo movd m2, [o(pw_8192)] mov [coeffq], eobd mov r3d, 16 - lea tx2q, [o(m(inv_txfm_add_dct_dct_64x16_8bpc).end)] + lea tx2q, [o(.end)]
.body: pmulhrsw m0, m2 @@ -5895,7 +5888,7 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end)] + lea tx2q, [o(.pass1_end)] jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_end: @@ -5903,7 +5896,7 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS rsp+gprsize+16*11, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end1)] + lea tx2q, [o(.pass1_end1)] jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_end1: @@ -5911,7 +5904,7 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS rsp+gprsize+16*19, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end2)] + lea tx2q, [o(.pass1_end2)] jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_end2: @@ -5919,7 +5912,7 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS rsp+gprsize+16*27, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end3)] + lea tx2q, [o(.pass1_end3)] jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_end3: @@ -5927,7 +5920,7 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS rsp+gprsize+16*35, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end4)] + lea tx2q, [o(.pass1_end4)] jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_end4: @@ -5935,7 +5928,7 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS rsp+gprsize+16*43, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end5)] + lea tx2q, [o(.pass1_end5)] jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_end5: @@ -5943,7 +5936,7 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS rsp+gprsize+16*51, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end6)] + lea tx2q, [o(.pass1_end6)] jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_end6: @@ -5951,7 +5944,7 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS rsp+gprsize+16*59, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end7)] + lea tx2q, [o(.pass1_end7)] jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_end7: @@ -5979,14 +5972,14 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 call m(idct_16x8_internal_8bpc).main
mov r3, dstq - lea tx2q, [o(m(idct_64x16_internal_8bpc).end)] + lea tx2q, [o(.end)] lea dstq, [dstq+strideq*8] jmp m(idct_8x8_internal_8bpc).end
.end: LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_64x16_internal_8bpc).end1)] + lea tx2q, [o(.end1)] mov dstq, r3 jmp m(idct_8x8_internal_8bpc).end
@@ -6016,14 +6009,14 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 call m(idct_16x8_internal_8bpc).main
mov r3, dstq - lea tx2q, [o(m(idct_64x16_internal_8bpc).end2)] + lea tx2q, [o(.end2)] lea dstq, [dstq+strideq*8] jmp m(idct_8x8_internal_8bpc).end
.end2: LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_64x16_internal_8bpc).end3)] + lea tx2q, [o(.end3)] mov dstq, r3 jmp m(idct_8x8_internal_8bpc).end
@@ -6045,8 +6038,8 @@ cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob %endif test eobd, eobd jz .dconly - call m(idct_32x64_internal_8bpc) +.end: RET
.dconly: @@ -6056,16 +6049,11 @@ cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob mov [coeffq], eobd pmulhrsw m0, m1 mov r3d, 64 - lea tx2q, [o(m(inv_txfm_add_dct_dct_32x64_8bpc).end)] + lea tx2q, [o(.end)] jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body
-.end: - RET -
cglobal idct_32x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 - %undef cmp - mov r4d, 2 sub eobd, 136 mov [rsp+gprsize*1+16*67], eobd @@ -6133,28 +6121,28 @@ cglobal idct_32x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
.pass1_end: mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_32x64_internal_8bpc).pass1_end1)] + lea tx2q, [o(.pass1_end1)] jmp m(idct_8x8_internal_8bpc).pass1_end
.pass1_end1: SAVE_8ROWS coeffq+64*0, 64 LOAD_8ROWS rsp+gprsize+16*11, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_32x64_internal_8bpc).pass1_end2)] + lea tx2q, [o(.pass1_end2)] jmp m(idct_8x8_internal_8bpc).pass1_end
.pass1_end2: SAVE_8ROWS coeffq+64*8, 64 LOAD_8ROWS rsp+gprsize+16*19, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_32x64_internal_8bpc).pass1_end3)] + lea tx2q, [o(.pass1_end3)] jmp m(idct_8x8_internal_8bpc).pass1_end
.pass1_end3: SAVE_8ROWS coeffq+64*16, 64 LOAD_8ROWS rsp+gprsize+16*27, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_32x64_internal_8bpc).pass1_end4)] + lea tx2q, [o(.pass1_end4)] jmp m(idct_8x8_internal_8bpc).pass1_end
.pass1_end4: @@ -6179,8 +6167,8 @@ cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 6, 8, 16*197, dst, stride, coeff, eo %endif test eobd, eobd jz .dconly - call m(idct_64x32_internal_8bpc) +.end: RET
.dconly: @@ -6190,15 +6178,11 @@ cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 6, 8, 16*197, dst, stride, coeff, eo pmulhrsw m0, m1 mov [coeffq], eobd mov r3d, 32 - lea tx2q, [o(m(inv_txfm_add_dct_dct_64x32_8bpc).end)] + lea tx2q, [o(.end)] jmp m(inv_txfm_add_dct_dct_64x16_8bpc).body
-.end: - RET
cglobal idct_64x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 - %undef cmp - mov r4d, 2 sub eobd, 136 mov [rsp+gprsize*1+16*67], eobd @@ -6266,56 +6250,56 @@ cglobal idct_64x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end)] + lea tx2q, [o(.pass1_end)] jmp m(idct_8x8_internal_8bpc).pass1_end
.pass1_end: SAVE_8ROWS coeffq+64*0, 64 LOAD_8ROWS rsp+gprsize+16*11, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end1)] + lea tx2q, [o(.pass1_end1)] jmp m(idct_8x8_internal_8bpc).pass1_end
.pass1_end1: SAVE_8ROWS coeffq+64*8, 64 LOAD_8ROWS rsp+gprsize+16*19, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end2)] + lea tx2q, [o(.pass1_end2)] jmp m(idct_8x8_internal_8bpc).pass1_end
.pass1_end2: SAVE_8ROWS coeffq+64*16, 64 LOAD_8ROWS rsp+gprsize+16*27, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end3)] + lea tx2q, [o(.pass1_end3)] jmp m(idct_8x8_internal_8bpc).pass1_end
.pass1_end3: SAVE_8ROWS coeffq+64*24, 64 LOAD_8ROWS rsp+gprsize+16*35, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end4)] + lea tx2q, [o(.pass1_end4)] jmp m(idct_8x8_internal_8bpc).pass1_end
.pass1_end4: SAVE_8ROWS dstq+64*0, 64 LOAD_8ROWS rsp+gprsize+16*43, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end5)] + lea tx2q, [o(.pass1_end5)] jmp m(idct_8x8_internal_8bpc).pass1_end
.pass1_end5: SAVE_8ROWS dstq+64*8, 64 LOAD_8ROWS rsp+gprsize+16*51, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end6)] + lea tx2q, [o(.pass1_end6)] jmp m(idct_8x8_internal_8bpc).pass1_end
.pass1_end6: SAVE_8ROWS dstq+64*16, 64 LOAD_8ROWS rsp+gprsize+16*59, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end7)] + lea tx2q, [o(.pass1_end7)] jmp m(idct_8x8_internal_8bpc).pass1_end
.pass1_end7: @@ -6332,17 +6316,17 @@ cglobal idct_64x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mov eobd, [rsp+gprsize*1+16*67] lea dstq, [dstq+32] mov [rsp+gprsize*1+16*35], eobd - lea tx2q, [o(m(idct_64x32_internal_8bpc).pass2_end)] + lea tx2q, [o(.pass2_end)] mov r3d, 4 jmp m(idct_32x32_internal_8bpc).pass2_loop
.pass2_end: mova [rsp+gprsize+16*0], m7 - lea r3, [o(m(idct_64x32_internal_8bpc).pass2_end1)] + lea r3, [o(.pass2_end1)] jmp m(idct_8x32_internal_8bpc).end2
.pass2_end1: - lea tx2q, [o(m(idct_64x32_internal_8bpc).pass2_end)] + lea tx2q, [o(.pass2_end)] add coeffq, 16*32 mov dstq, [rsp+gprsize*2+16*35] mov r3d, [rsp+gprsize*3+16*35] @@ -6377,8 +6361,6 @@ cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 6, 8, 16*197, dst, stride, coeff, eo jmp m(inv_txfm_add_dct_dct_64x16_8bpc).body
cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 - %undef cmp - mov r5d, 4 mov r4d, 2 sub eobd, 136 @@ -6448,7 +6430,7 @@ cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end)] + lea tx2q, [o(.pass1_end)] jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_end: @@ -6456,7 +6438,7 @@ cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS rsp+gprsize+16*11, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end1)] + lea tx2q, [o(.pass1_end1)] jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_end1: @@ -6464,7 +6446,7 @@ cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS rsp+gprsize+16*19, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end2)] + lea tx2q, [o(.pass1_end2)] jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_end2: @@ -6472,7 +6454,7 @@ cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS rsp+gprsize+16*27, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end3)] + lea tx2q, [o(.pass1_end3)] jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_end3: @@ -6480,7 +6462,7 @@ cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS rsp+gprsize+16*35, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end4)] + lea tx2q, [o(.pass1_end4)] jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_end4: @@ -6488,7 +6470,7 @@ cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS rsp+gprsize+16*43, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end5)] + lea tx2q, [o(.pass1_end5)] jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_end5: @@ -6496,7 +6478,7 @@ cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS rsp+gprsize+16*51, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end6)] + lea tx2q, [o(.pass1_end6)] jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_end6: @@ -6504,7 +6486,7 @@ cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS rsp+gprsize+16*59, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end7)] + lea tx2q, [o(.pass1_end7)] jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_end7: @@ -6522,26 +6504,20 @@ cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mov r3d, 4 lea r4, [dstq+8] mov [rsp+gprsize*2+16*67], r4 - lea r4, [o(m(idct_64x64_internal_8bpc).pass2_end)] + lea r4, [o(.pass2_end)] jmp m(idct_16x64_internal_8bpc).pass2_loop
.pass2_end: LOAD_8ROWS rsp+gprsize+16*35, 16 lea dstq, [dstq+strideq*2] - add rsp, 16*32 + lea r3, [rsp+16*32+gprsize] mova [rsp+gprsize+16*0], m7 - lea r3, [o(m(idct_64x64_internal_8bpc).pass2_end1)] - jmp m(idct_8x32_internal_8bpc).end2 - -.pass2_end1: - add coeffq, 16*32 - sub rsp, 16*32 - + call m(idct_16x64_internal_8bpc).write mov dstq, [rsp+gprsize*2+16*67] mov r3d, [rsp+gprsize*3+16*67] lea r4, [dstq+8] mov [rsp+gprsize*2+16*67], r4 - lea r4, [o(m(idct_64x64_internal_8bpc).pass2_end)] + lea r4, [o(.pass2_end)]
dec r3d jg m(idct_16x64_internal_8bpc).pass2_loop diff --git a/third_party/dav1d/src/x86/loopfilter16_avx2.asm b/third_party/dav1d/src/x86/loopfilter16_avx2.asm index 0c8618655c46d..361ccc3b883ce 100644 --- a/third_party/dav1d/src/x86/loopfilter16_avx2.asm +++ b/third_party/dav1d/src/x86/loopfilter16_avx2.asm @@ -49,14 +49,6 @@ pb_mask: dd 1, 1, 2, 2, 4, 4, 8, 8
SECTION .text
-%macro REPX 2-* - %xdefine %%f(x) %1 -%rep %0 - 1 - %rotate 1 - %%f(%1) -%endrep -%endmacro - ; in: out: ; mm%1 a b c d a e i m ; mm%2 e f g h b f j n diff --git a/third_party/dav1d/src/x86/loopfilter16_sse.asm b/third_party/dav1d/src/x86/loopfilter16_sse.asm index 3ec3fd81fe3f0..c486b57a2113a 100644 --- a/third_party/dav1d/src/x86/loopfilter16_sse.asm +++ b/third_party/dav1d/src/x86/loopfilter16_sse.asm @@ -106,14 +106,6 @@ ASSERT ARCH_X86_32 %endif %endmacro
-%macro REPX 2-* - %xdefine %%f(x) %1 -%rep %0 - 1 - %rotate 1 - %%f(%1) -%endrep -%endmacro - %macro SPLATD 2 movd %1, %2 pshufd %1, %1, q0000 diff --git a/third_party/dav1d/src/x86/looprestoration16_avx2.asm b/third_party/dav1d/src/x86/looprestoration16_avx2.asm index 98f51d8f1e5a7..ef25c28474416 100644 --- a/third_party/dav1d/src/x86/looprestoration16_avx2.asm +++ b/third_party/dav1d/src/x86/looprestoration16_avx2.asm @@ -66,14 +66,6 @@ cextern sgr_x_by_x_avx2
SECTION .text
-%macro REPX 2-* - %xdefine %%f(x) %1 -%rep %0 - 1 - %rotate 1 - %%f(%1) -%endrep -%endmacro - DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; wiener ring buffer pointers
INIT_YMM avx2 diff --git a/third_party/dav1d/src/x86/mc16_avx2.asm b/third_party/dav1d/src/x86/mc16_avx2.asm index 3dacfe66a6cf7..8b2ec4fa91fa7 100644 --- a/third_party/dav1d/src/x86/mc16_avx2.asm +++ b/third_party/dav1d/src/x86/mc16_avx2.asm @@ -202,14 +202,6 @@ cextern resize_filter
SECTION .text
-%macro REPX 2-* - %xdefine %%f(x) %1 -%rep %0 - 1 - %rotate 1 - %%f({%1}) -%endrep -%endmacro - INIT_XMM avx2 cglobal put_bilin_16bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy mov mxyd, r6m ; mx diff --git a/third_party/dav1d/src/x86/mc16_avx512.asm b/third_party/dav1d/src/x86/mc16_avx512.asm index c2ea090b0c5ba..e83b18ad969cb 100644 --- a/third_party/dav1d/src/x86/mc16_avx512.asm +++ b/third_party/dav1d/src/x86/mc16_avx512.asm @@ -254,14 +254,6 @@ cextern resize_filter
SECTION .text
-%macro REPX 2-* - %xdefine %%f(x) %1 -%rep %0 - 1 - %rotate 1 - %%f(%1) -%endrep -%endmacro - %if WIN64 DECLARE_REG_TMP 4 %else diff --git a/third_party/dav1d/src/x86/mc16_sse.asm b/third_party/dav1d/src/x86/mc16_sse.asm index 6435bd083cb18..fde8e372a3fed 100644 --- a/third_party/dav1d/src/x86/mc16_sse.asm +++ b/third_party/dav1d/src/x86/mc16_sse.asm @@ -166,14 +166,6 @@ cextern resize_filter
SECTION .text
-%macro REPX 2-* - %xdefine %%f(x) %1 -%rep %0 - 1 - %rotate 1 - %%f(%1) -%endrep -%endmacro - %if UNIX64 DECLARE_REG_TMP 7 %else @@ -4799,9 +4791,7 @@ INIT_XMM ssse3 psrad m6, hsh_mem packssdw m11, m6 ; 7 8 %if ARCH_X86_64 - ; fixme a bug in x86inc.asm forces us to explicitly load m9 - mova m9, [stk+0x40] - shufps m9, m11, q1032 ; 6 7 + shufps m9, [stk+0x40], m11, q1032 ; 6 7 mova m0, [stk+0x00] mova [stk+0x40], m11 %else diff --git a/third_party/dav1d/src/x86/mc_avx512.asm b/third_party/dav1d/src/x86/mc_avx512.asm index fb55449f335a8..eb3ca1c427da5 100644 --- a/third_party/dav1d/src/x86/mc_avx512.asm +++ b/third_party/dav1d/src/x86/mc_avx512.asm @@ -303,14 +303,6 @@ BIDIR_JMP_TABLE blend_h, avx512icl, 2, 4, 8, 16, 32, 64, 128
SECTION .text
-%macro REPX 2-* - %xdefine %%f(x) %1 -%rep %0 - 1 - %rotate 1 - %%f(%1) -%endrep -%endmacro - %macro WRAP_YMM 1+ INIT_YMM cpuname %1 diff --git a/third_party/dav1d/tests/checkasm/filmgrain.c b/third_party/dav1d/tests/checkasm/filmgrain.c index a44a3ac422cb3..ff7ffc36c68a6 100644 --- a/third_party/dav1d/tests/checkasm/filmgrain.c +++ b/third_party/dav1d/tests/checkasm/filmgrain.c @@ -30,7 +30,7 @@ #include <string.h>
#include "src/levels.h" -#include "src/film_grain.h" +#include "src/filmgrain.h" #define UNIT_TEST 1 #include "src/fg_apply_tmpl.c"
@@ -155,6 +155,7 @@ static void check_fgy_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
if (check_func(dsp->fgy_32x32xn, "fgy_32x32xn_%dbpc", BITDEPTH)) { ALIGN_STK_16(Dav1dFilmGrainData, fg_data, 16,); + ALIGN_STK_64(uint8_t, scaling, SCALING_SIZE,); fg_data[0].seed = rnd() & 0xFFFF;
#if BITDEPTH == 16 @@ -163,7 +164,6 @@ static void check_fgy_sbrow(const Dav1dFilmGrainDSPContext *const dsp) { const int bitdepth_max = 0xff; #endif
- uint8_t scaling[SCALING_SIZE]; entry grain_lut[GRAIN_HEIGHT + 1][GRAIN_WIDTH]; fg_data[0].grain_scale_shift = rnd() & 3; fg_data[0].ar_coeff_shift = (rnd() & 3) + 6; @@ -267,6 +267,7 @@ static void check_fguv_sbrow(const Dav1dFilmGrainDSPContext *const dsp) { BITDEPTH, ss_name[layout_idx], csfl)) { ALIGN_STK_16(Dav1dFilmGrainData, fg_data, 1,); + ALIGN_STK_64(uint8_t, scaling, SCALING_SIZE,);
fg_data[0].seed = rnd() & 0xFFFF;
@@ -278,7 +279,6 @@ static void check_fguv_sbrow(const Dav1dFilmGrainDSPContext *const dsp) { const int uv_pl = rnd() & 1; const int is_identity = rnd() & 1;
- uint8_t scaling[SCALING_SIZE]; entry grain_lut[2][GRAIN_HEIGHT + 1][GRAIN_WIDTH]; fg_data[0].grain_scale_shift = rnd() & 3; fg_data[0].ar_coeff_shift = (rnd() & 3) + 6; @@ -368,7 +368,7 @@ static void check_fguv_sbrow(const Dav1dFilmGrainDSPContext *const dsp) { checkasm_check_pixel_padded_align(c_dst, stride, a_dst, stride, w, h, "dst", - 32 >> ss_x, 2); + 32 >> ss_x, 4); } }
@@ -380,7 +380,7 @@ static void check_fguv_sbrow(const Dav1dFilmGrainDSPContext *const dsp) { luma_src[y * PXSTRIDE(lstride) + x] &= bitdepth_max; } } - bench_new(a_dst, src, stride, fg_data, 32, scaling, grain_lut[1], 16, + bench_new(a_dst, src, stride, fg_data, 64 >> ss_x, scaling, grain_lut[1], 32 >> ss_y, 1, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX); } }