Diffstat (limited to 'thirdparty'): 162 files changed, 8284 insertions(+), 3619 deletions(-)
diff --git a/thirdparty/README.md b/thirdparty/README.md index 8fabe8a893..5ecbffac13 100644 --- a/thirdparty/README.md +++ b/thirdparty/README.md @@ -318,7 +318,7 @@ Files extracted from upstream source: ## glad - Upstream: https://github.com/Dav1dde/glad -- Version: 2.0.4 (d08b1aa01f8fe57498f04d47b5fa8c48725be877, 2023) +- Version: 2.0.6 (658f48e72aee3c6582e80b05ac0f8787a64fe6bb, 2024) - License: CC0 1.0 and Apache 2.0 Files extracted from upstream source: @@ -334,11 +334,11 @@ Files generated from [upstream web instance](https://gen.glad.sh/): - `glx.c` - `glad/glx.h` -See the permalinks in `glad/gl.h` and `glad/glx.h` to regenrate the files with -a new version of the web instance. +See the permalinks in `glad/egl.h`, `glad/gl.h`, and `glad/glx.h` to regenrate +the files with a new version of the web instance. -Some changes have been made in order to allow loading OpenGL and OpenGLES APIs at the same time. -See the patches in the `patches` directory. +Some changes have been made in order to allow loading OpenGL and OpenGLES APIs +at the same time. See the patches in the `patches` directory. ## glslang @@ -377,16 +377,15 @@ Files extracted from upstream source: ## harfbuzz - Upstream: https://github.com/harfbuzz/harfbuzz -- Version: 8.3.0 (894a1f72ee93a1fd8dc1d9218cb3fd8f048be29a, 2023) +- Version: 8.4.0 (63973005bc07aba599b47fdd4cf788647b601ccd, 2024) - License: MIT Files extracted from upstream source: - `AUTHORS`, `COPYING`, `THANKS` - From the `src` folder, recursively: - - All the `.c`, `.cc`, `.h`, `.hh` files - - Except `main.cc`, `harfbuzz*.cc`, `failing-alloc.c`, `test*.cc`, `hb-wasm*.*`, - and the `wasm` folder + - All the `.cc`, `.h`, `.hh` files + - Except `main.cc`, `harfbuzz*.cc`, `failing-alloc.c`, `test*.cc`, `hb-wasm*.*` ## icu4c @@ -432,18 +431,18 @@ Files extracted from upstream source: ## libktx - Upstream: https://github.com/KhronosGroup/KTX-Software -- Version: 4.3.1 (c0214158d551cfc779624b0f84130bcbbefef59a, 2024) +- Version: 4.3.2 (91ace88675ac59a97e55d0378a6602a9ae6b98bd, 2024) - License: Apache-2.0 Files extracted from upstream source: - `LICENSE.md` -- `include/*` +- `include/` - `lib/dfdutils/LICENSE.adoc` as `LICENSE.dfdutils.adoc` (in root) - `lib/dfdutils/LICENSES/Apache-2.0.txt` as `Apache-2.0.txt` (in root) -- `lib/dfdutils/{KHR/*,dfd.h,colourspaces.c,createdfd.c,interpretdfd.c,printdfd.c,queries.c,dfd2vk.inl,vk2dfd.*}` -- `lib/{basis_sgd.h,formatsize.h,gl_format.h,ktxint.h,uthash.h,vk_format.h,vkformat_enum.h,checkheader.c,swap.c,hashlist.c,vkformat_check.c,basis_transcode.cpp,miniz_wrapper.cpp,filestream.*,memstream.*,texture*}` -- `other_include/KHR/*` +- `lib/dfdutils/{KHR/,dfd.h,colourspaces.c,createdfd.c,interpretdfd.c,printdfd.c,queries.c,dfd2vk.inl,vk2dfd.*}` +- `lib/{basis_sgd.h,formatsize.h,gl_format.h,ktxint.h,uthash.h,vk_format.h,vkformat_enum.h,checkheader.c,swap.c,hashlist.c,vkformat_check.c,vkformat_typesize.c,basis_transcode.cpp,miniz_wrapper.cpp,filestream.*,memstream.*,texture*}` +- `other_include/KHR/` - `utils/unused.h` Some Godot-specific changes are applied via patches included in the `patches` folder. @@ -521,7 +520,7 @@ in the MSVC debugger. ## mbedtls - Upstream: https://github.com/Mbed-TLS/mbedtls -- Version: 2.28.7 (555f84735aecdbd76a566cf087ec8425dfb0c8ab, 2024) +- Version: 2.28.8 (5a764e5555c64337ed17444410269ff21cb617b1, 2024) - License: Apache 2.0 File extracted from upstream release tarball: @@ -592,7 +591,7 @@ to solve some MSVC warnings. See the patches in the `patches` directory. 
## miniupnpc - Upstream: https://github.com/miniupnp/miniupnp -- Version: 2.2.6 (faad29d7300f1bfa9dc7795031993c04c5191f59, 2024) +- Version: 2.2.7 (d4d5ec7d48c093b37b2ea5d7171ede21ce9d7ff2, 2024) - License: BSD-3-Clause Files extracted from upstream source: @@ -1036,7 +1035,7 @@ Files extracted from upstream source: ## zstd - Upstream: https://github.com/facebook/zstd -- Version: 1.5.5 (63779c798237346c2b245c546c40b72a5a5913fe, 2023) +- Version: 1.5.6 (794ea1b0afca0f020f4e57b6732332231fb23c70, 2024) - License: BSD-3-Clause Files extracted from upstream source: diff --git a/thirdparty/glad/EGL/eglplatform.h b/thirdparty/glad/EGL/eglplatform.h index 99362a23de..6786afd90b 100644 --- a/thirdparty/glad/EGL/eglplatform.h +++ b/thirdparty/glad/EGL/eglplatform.h @@ -64,6 +64,12 @@ typedef HDC EGLNativeDisplayType; typedef HBITMAP EGLNativePixmapType; typedef HWND EGLNativeWindowType; +#elif defined(__QNX__) + +typedef khronos_uintptr_t EGLNativeDisplayType; +typedef struct _screen_pixmap* EGLNativePixmapType; /* screen_pixmap_t */ +typedef struct _screen_window* EGLNativeWindowType; /* screen_window_t */ + #elif defined(__EMSCRIPTEN__) typedef int EGLNativeDisplayType; diff --git a/thirdparty/glad/gl.c b/thirdparty/glad/gl.c index ee0cc188fc..38ecb514bd 100644 --- a/thirdparty/glad/gl.c +++ b/thirdparty/glad/gl.c @@ -453,6 +453,7 @@ PFNGLMULTITEXCOORDP3UIPROC glad_glMultiTexCoordP3ui = NULL; PFNGLMULTITEXCOORDP3UIVPROC glad_glMultiTexCoordP3uiv = NULL; PFNGLMULTITEXCOORDP4UIPROC glad_glMultiTexCoordP4ui = NULL; PFNGLMULTITEXCOORDP4UIVPROC glad_glMultiTexCoordP4uiv = NULL; +PFNGLNAMEDFRAMEBUFFERTEXTUREMULTIVIEWOVRPROC glad_glNamedFramebufferTextureMultiviewOVR = NULL; PFNGLNEWLISTPROC glad_glNewList = NULL; PFNGLNORMAL3BPROC glad_glNormal3b = NULL; PFNGLNORMAL3BVPROC glad_glNormal3bv = NULL; @@ -2108,40 +2109,29 @@ static void glad_gl_load_GL_EXT_framebuffer_object( GLADuserptrloadfunc load, vo static void glad_gl_load_GL_OVR_multiview( GLADuserptrloadfunc load, void* userptr) { if(!GLAD_GL_OVR_multiview) return; glad_glFramebufferTextureMultiviewOVR = (PFNGLFRAMEBUFFERTEXTUREMULTIVIEWOVRPROC) load(userptr, "glFramebufferTextureMultiviewOVR"); + glad_glNamedFramebufferTextureMultiviewOVR = (PFNGLNAMEDFRAMEBUFFERTEXTUREMULTIVIEWOVRPROC) load(userptr, "glNamedFramebufferTextureMultiviewOVR"); } -#if defined(GL_ES_VERSION_3_0) || defined(GL_VERSION_3_0) -#define GLAD_GL_IS_SOME_NEW_VERSION 1 -#else -#define GLAD_GL_IS_SOME_NEW_VERSION 0 -#endif - -static int glad_gl_get_extensions( int version, const char **out_exts, unsigned int *out_num_exts_i, char ***out_exts_i) { -#if GLAD_GL_IS_SOME_NEW_VERSION - if(GLAD_VERSION_MAJOR(version) < 3) { -#else - GLAD_UNUSED(version); - GLAD_UNUSED(out_num_exts_i); - GLAD_UNUSED(out_exts_i); -#endif - if (glad_glGetString == NULL) { - return 0; +static void glad_gl_free_extensions(char **exts_i) { + if (exts_i != NULL) { + unsigned int index; + for(index = 0; exts_i[index]; index++) { + free((void *) (exts_i[index])); } - *out_exts = (const char *)glad_glGetString(GL_EXTENSIONS); -#if GLAD_GL_IS_SOME_NEW_VERSION - } else { + free((void *)exts_i); + exts_i = NULL; + } +} +static int glad_gl_get_extensions( const char **out_exts, char ***out_exts_i) { +#if defined(GL_ES_VERSION_3_0) || defined(GL_VERSION_3_0) + if (glad_glGetStringi != NULL && glad_glGetIntegerv != NULL) { unsigned int index = 0; unsigned int num_exts_i = 0; char **exts_i = NULL; - if (glad_glGetStringi == NULL || glad_glGetIntegerv == NULL) { - return 0; - } glad_glGetIntegerv(GL_NUM_EXTENSIONS, 
(int*) &num_exts_i); - if (num_exts_i > 0) { - exts_i = (char **) malloc(num_exts_i * (sizeof *exts_i)); - } + exts_i = (char **) malloc((num_exts_i + 1) * (sizeof *exts_i)); if (exts_i == NULL) { return 0; } @@ -2150,31 +2140,40 @@ static int glad_gl_get_extensions( int version, const char **out_exts, unsigned size_t len = strlen(gl_str_tmp) + 1; char *local_str = (char*) malloc(len * sizeof(char)); - if(local_str != NULL) { - memcpy(local_str, gl_str_tmp, len * sizeof(char)); + if(local_str == NULL) { + exts_i[index] = NULL; + glad_gl_free_extensions(exts_i); + return 0; } + memcpy(local_str, gl_str_tmp, len * sizeof(char)); exts_i[index] = local_str; } + exts_i[index] = NULL; - *out_num_exts_i = num_exts_i; *out_exts_i = exts_i; + + return 1; } +#else + GLAD_UNUSED(out_exts_i); #endif + if (glad_glGetString == NULL) { + return 0; + } + *out_exts = (const char *)glad_glGetString(GL_EXTENSIONS); return 1; } -static void glad_gl_free_extensions(char **exts_i, unsigned int num_exts_i) { - if (exts_i != NULL) { +static int glad_gl_has_extension(const char *exts, char **exts_i, const char *ext) { + if(exts_i) { unsigned int index; - for(index = 0; index < num_exts_i; index++) { - free((void *) (exts_i[index])); + for(index = 0; exts_i[index]; index++) { + const char *e = exts_i[index]; + if(strcmp(e, ext) == 0) { + return 1; + } } - free((void *)exts_i); - exts_i = NULL; - } -} -static int glad_gl_has_extension(int version, const char *exts, unsigned int num_exts_i, char **exts_i, const char *ext) { - if(GLAD_VERSION_MAJOR(version) < 3 || !GLAD_GL_IS_SOME_NEW_VERSION) { + } else { const char *extensions; const char *loc; const char *terminator; @@ -2194,14 +2193,6 @@ static int glad_gl_has_extension(int version, const char *exts, unsigned int num } extensions = terminator; } - } else { - unsigned int index; - for(index = 0; index < num_exts_i; index++) { - const char *e = exts_i[index]; - if(strcmp(e, ext) == 0) { - return 1; - } - } } return 0; } @@ -2210,22 +2201,21 @@ static GLADapiproc glad_gl_get_proc_from_userptr(void *userptr, const char* name return (GLAD_GNUC_EXTENSION (GLADapiproc (*)(const char *name)) userptr)(name); } -static int glad_gl_find_extensions_gl( int version) { +static int glad_gl_find_extensions_gl(void) { const char *exts = NULL; - unsigned int num_exts_i = 0; char **exts_i = NULL; - if (!glad_gl_get_extensions(version, &exts, &num_exts_i, &exts_i)) return 0; + if (!glad_gl_get_extensions(&exts, &exts_i)) return 0; - GLAD_GL_ARB_debug_output = glad_gl_has_extension(version, exts, num_exts_i, exts_i, "GL_ARB_debug_output"); - GLAD_GL_ARB_framebuffer_object = glad_gl_has_extension(version, exts, num_exts_i, exts_i, "GL_ARB_framebuffer_object"); - GLAD_GL_ARB_get_program_binary = glad_gl_has_extension(version, exts, num_exts_i, exts_i, "GL_ARB_get_program_binary"); - GLAD_GL_EXT_framebuffer_blit = glad_gl_has_extension(version, exts, num_exts_i, exts_i, "GL_EXT_framebuffer_blit"); - GLAD_GL_EXT_framebuffer_multisample = glad_gl_has_extension(version, exts, num_exts_i, exts_i, "GL_EXT_framebuffer_multisample"); - GLAD_GL_EXT_framebuffer_object = glad_gl_has_extension(version, exts, num_exts_i, exts_i, "GL_EXT_framebuffer_object"); - GLAD_GL_OVR_multiview = glad_gl_has_extension(version, exts, num_exts_i, exts_i, "GL_OVR_multiview"); - GLAD_GL_OVR_multiview2 = glad_gl_has_extension(version, exts, num_exts_i, exts_i, "GL_OVR_multiview2"); + GLAD_GL_ARB_debug_output = glad_gl_has_extension(exts, exts_i, "GL_ARB_debug_output"); + GLAD_GL_ARB_framebuffer_object = 
glad_gl_has_extension(exts, exts_i, "GL_ARB_framebuffer_object"); + GLAD_GL_ARB_get_program_binary = glad_gl_has_extension(exts, exts_i, "GL_ARB_get_program_binary"); + GLAD_GL_EXT_framebuffer_blit = glad_gl_has_extension(exts, exts_i, "GL_EXT_framebuffer_blit"); + GLAD_GL_EXT_framebuffer_multisample = glad_gl_has_extension(exts, exts_i, "GL_EXT_framebuffer_multisample"); + GLAD_GL_EXT_framebuffer_object = glad_gl_has_extension(exts, exts_i, "GL_EXT_framebuffer_object"); + GLAD_GL_OVR_multiview = glad_gl_has_extension(exts, exts_i, "GL_OVR_multiview"); + GLAD_GL_OVR_multiview2 = glad_gl_has_extension(exts, exts_i, "GL_OVR_multiview2"); - glad_gl_free_extensions(exts_i, num_exts_i); + glad_gl_free_extensions(exts_i); return 1; } @@ -2275,7 +2265,6 @@ int gladLoadGLUserPtr( GLADuserptrloadfunc load, void *userptr) { glad_glGetString = (PFNGLGETSTRINGPROC) load(userptr, "glGetString"); if(glad_glGetString == NULL) return 0; - if(glad_glGetString(GL_VERSION) == NULL) return 0; version = glad_gl_find_core_gl(); glad_gl_load_GL_VERSION_1_0(load, userptr); @@ -2291,7 +2280,7 @@ int gladLoadGLUserPtr( GLADuserptrloadfunc load, void *userptr) { glad_gl_load_GL_VERSION_3_2(load, userptr); glad_gl_load_GL_VERSION_3_3(load, userptr); - if (!glad_gl_find_extensions_gl(version)) return 0; + if (!glad_gl_find_extensions_gl()) return 0; glad_gl_load_GL_ARB_debug_output(load, userptr); glad_gl_load_GL_ARB_framebuffer_object(load, userptr); glad_gl_load_GL_ARB_get_program_binary(load, userptr); @@ -2310,16 +2299,15 @@ int gladLoadGL( GLADloadfunc load) { return gladLoadGLUserPtr( glad_gl_get_proc_from_userptr, GLAD_GNUC_EXTENSION (void*) load); } -static int glad_gl_find_extensions_gles2( int version) { +static int glad_gl_find_extensions_gles2(void) { const char *exts = NULL; - unsigned int num_exts_i = 0; char **exts_i = NULL; - if (!glad_gl_get_extensions(version, &exts, &num_exts_i, &exts_i)) return 0; + if (!glad_gl_get_extensions(&exts, &exts_i)) return 0; - GLAD_GL_OVR_multiview = glad_gl_has_extension(version, exts, num_exts_i, exts_i, "GL_OVR_multiview"); - GLAD_GL_OVR_multiview2 = glad_gl_has_extension(version, exts, num_exts_i, exts_i, "GL_OVR_multiview2"); + GLAD_GL_OVR_multiview = glad_gl_has_extension(exts, exts_i, "GL_OVR_multiview"); + GLAD_GL_OVR_multiview2 = glad_gl_has_extension(exts, exts_i, "GL_OVR_multiview2"); - glad_gl_free_extensions(exts_i, num_exts_i); + glad_gl_free_extensions(exts_i); return 1; } @@ -2361,7 +2349,6 @@ int gladLoadGLES2UserPtr( GLADuserptrloadfunc load, void *userptr) { glad_glGetString = (PFNGLGETSTRINGPROC) load(userptr, "glGetString"); if(glad_glGetString == NULL) return 0; - if(glad_glGetString(GL_VERSION) == NULL) return 0; version = glad_gl_find_core_gles2(); glad_gl_load_GL_ES_VERSION_2_0(load, userptr); @@ -2369,7 +2356,7 @@ int gladLoadGLES2UserPtr( GLADuserptrloadfunc load, void *userptr) { glad_gl_load_GL_ES_VERSION_3_1(load, userptr); glad_gl_load_GL_ES_VERSION_3_2(load, userptr); - if (!glad_gl_find_extensions_gles2(version)) return 0; + if (!glad_gl_find_extensions_gles2()) return 0; glad_gl_load_GL_OVR_multiview(load, userptr); @@ -2627,10 +2614,9 @@ static GLADapiproc glad_dlsym_handle(void* handle, const char *name) { typedef __eglMustCastToProperFunctionPointerType (GLAD_API_PTR *PFNEGLGETPROCADDRESSPROC)(const char *name); #endif extern __eglMustCastToProperFunctionPointerType emscripten_GetProcAddress(const char *name); -#elif EGL_STATIC - typedef void (*__eglMustCastToProperFunctionPointerType)(void); +#elif defined(GLAD_GLES2_USE_SYSTEM_EGL) 
+ #include <EGL/egl.h> typedef __eglMustCastToProperFunctionPointerType (GLAD_API_PTR *PFNEGLGETPROCADDRESSPROC)(const char *name); - extern __eglMustCastToProperFunctionPointerType GLAD_API_PTR eglGetProcAddress(const char *name); #else #include <glad/egl.h> #endif @@ -2658,7 +2644,7 @@ static GLADapiproc glad_gles2_get_proc(void *vuserptr, const char* name) { return result; } -static void* _glad_GL_loader_handle = NULL; +static void* _glad_GLES2_loader_handle = NULL; static void* glad_gles2_dlopen_handle(void) { #if GLAD_PLATFORM_EMSCRIPTEN @@ -2674,11 +2660,11 @@ static void* glad_gles2_dlopen_handle(void) { GLAD_UNUSED(glad_get_dlopen_handle); return NULL; #else - if (_glad_GL_loader_handle == NULL) { - _glad_GL_loader_handle = glad_get_dlopen_handle(NAMES, sizeof(NAMES) / sizeof(NAMES[0])); + if (_glad_GLES2_loader_handle == NULL) { + _glad_GLES2_loader_handle = glad_get_dlopen_handle(NAMES, sizeof(NAMES) / sizeof(NAMES[0])); } - return _glad_GL_loader_handle; + return _glad_GLES2_loader_handle; #endif } @@ -2708,11 +2694,12 @@ int gladLoaderLoadGLES2(void) { userptr.get_proc_address_ptr = emscripten_GetProcAddress; version = gladLoadGLES2UserPtr(glad_gles2_get_proc, &userptr); #else +#ifndef GLAD_GLES2_USE_SYSTEM_EGL if (eglGetProcAddress == NULL) { return 0; } - - did_load = _glad_GL_loader_handle == NULL; +#endif + did_load = _glad_GLES2_loader_handle == NULL; handle = glad_gles2_dlopen_handle(); if (handle != NULL) { userptr = glad_gles2_build_userptr(handle); @@ -2731,9 +2718,9 @@ int gladLoaderLoadGLES2(void) { void gladLoaderUnloadGLES2(void) { - if (_glad_GL_loader_handle != NULL) { - glad_close_dlopen_handle(_glad_GL_loader_handle); - _glad_GL_loader_handle = NULL; + if (_glad_GLES2_loader_handle != NULL) { + glad_close_dlopen_handle(_glad_GLES2_loader_handle); + _glad_GLES2_loader_handle = NULL; } } diff --git a/thirdparty/glad/glad/egl.h b/thirdparty/glad/glad/egl.h index 1bf35c1404..053c5853a7 100644 --- a/thirdparty/glad/glad/egl.h +++ b/thirdparty/glad/glad/egl.h @@ -1,5 +1,5 @@ /** - * Loader generated by glad 2.0.3 on Fri Feb 3 07:06:48 2023 + * Loader generated by glad 2.0.6 on Fri Apr 5 08:17:09 2024 * * SPDX-License-Identifier: (WTFPL OR CC0-1.0) AND Apache-2.0 * @@ -141,7 +141,7 @@ extern "C" { #define GLAD_VERSION_MAJOR(version) (version / 10000) #define GLAD_VERSION_MINOR(version) (version % 10000) -#define GLAD_GENERATOR_VERSION "2.0.3" +#define GLAD_GENERATOR_VERSION "2.0.6" typedef void (*GLADapiproc)(void); diff --git a/thirdparty/glad/glad/gl.h b/thirdparty/glad/glad/gl.h index 307ea4dbb8..1301d10b65 100644 --- a/thirdparty/glad/glad/gl.h +++ b/thirdparty/glad/glad/gl.h @@ -1,5 +1,5 @@ /** - * Loader generated by glad 2.0.4 on Mon May 22 13:18:29 2023 + * Loader generated by glad 2.0.6 on Fri Apr 5 08:14:44 2024 * * SPDX-License-Identifier: (WTFPL OR CC0-1.0) AND Apache-2.0 * @@ -178,7 +178,7 @@ extern "C" { #define GLAD_VERSION_MAJOR(version) (version / 10000) #define GLAD_VERSION_MINOR(version) (version % 10000) -#define GLAD_GENERATOR_VERSION "2.0.4" +#define GLAD_GENERATOR_VERSION "2.0.6" typedef void (*GLADapiproc)(void); @@ -2394,6 +2394,7 @@ typedef void (GLAD_API_PTR *PFNGLMULTITEXCOORDP3UIPROC)(GLenum texture, GLenum t typedef void (GLAD_API_PTR *PFNGLMULTITEXCOORDP3UIVPROC)(GLenum texture, GLenum type, const GLuint * coords); typedef void (GLAD_API_PTR *PFNGLMULTITEXCOORDP4UIPROC)(GLenum texture, GLenum type, GLuint coords); typedef void (GLAD_API_PTR *PFNGLMULTITEXCOORDP4UIVPROC)(GLenum texture, GLenum type, const GLuint * coords); +typedef void 
(GLAD_API_PTR *PFNGLNAMEDFRAMEBUFFERTEXTUREMULTIVIEWOVRPROC)(GLuint framebuffer, GLenum attachment, GLuint texture, GLint level, GLint baseViewIndex, GLsizei numViews); typedef void (GLAD_API_PTR *PFNGLNEWLISTPROC)(GLuint list, GLenum mode); typedef void (GLAD_API_PTR *PFNGLNORMAL3BPROC)(GLbyte nx, GLbyte ny, GLbyte nz); typedef void (GLAD_API_PTR *PFNGLNORMAL3BVPROC)(const GLbyte * v); @@ -3654,6 +3655,8 @@ GLAD_API_CALL PFNGLMULTITEXCOORDP4UIPROC glad_glMultiTexCoordP4ui; #define glMultiTexCoordP4ui glad_glMultiTexCoordP4ui GLAD_API_CALL PFNGLMULTITEXCOORDP4UIVPROC glad_glMultiTexCoordP4uiv; #define glMultiTexCoordP4uiv glad_glMultiTexCoordP4uiv +GLAD_API_CALL PFNGLNAMEDFRAMEBUFFERTEXTUREMULTIVIEWOVRPROC glad_glNamedFramebufferTextureMultiviewOVR; +#define glNamedFramebufferTextureMultiviewOVR glad_glNamedFramebufferTextureMultiviewOVR GLAD_API_CALL PFNGLNEWLISTPROC glad_glNewList; #define glNewList glad_glNewList GLAD_API_CALL PFNGLNORMAL3BPROC glad_glNormal3b; diff --git a/thirdparty/glad/glad/glx.h b/thirdparty/glad/glad/glx.h index cf7663a3af..a2fa0dadee 100644 --- a/thirdparty/glad/glad/glx.h +++ b/thirdparty/glad/glad/glx.h @@ -1,5 +1,5 @@ /** - * Loader generated by glad 2.0.4 on Mon May 22 13:18:29 2023 + * Loader generated by glad 2.0.6 on Fri Apr 5 08:14:31 2024 * * SPDX-License-Identifier: (WTFPL OR CC0-1.0) AND Apache-2.0 * @@ -152,7 +152,7 @@ extern "C" { #define GLAD_VERSION_MAJOR(version) (version / 10000) #define GLAD_VERSION_MINOR(version) (version % 10000) -#define GLAD_GENERATOR_VERSION "2.0.4" +#define GLAD_GENERATOR_VERSION "2.0.6" typedef void (*GLADapiproc)(void); diff --git a/thirdparty/glad/patches/patch_enable_both_gl_and_gles.diff b/thirdparty/glad/patches/patch_enable_both_gl_and_gles.diff index a98efe51d8..88c5510166 100644 --- a/thirdparty/glad/patches/patch_enable_both_gl_and_gles.diff +++ b/thirdparty/glad/patches/patch_enable_both_gl_and_gles.diff @@ -1,8 +1,8 @@ diff --git a/thirdparty/glad/gl.c b/thirdparty/glad/gl.c -index a0b59dbbfb..9f10f6544a 100644 +index 3f0884a3dc..38ecb514bd 100644 --- a/thirdparty/glad/gl.c +++ b/thirdparty/glad/gl.c -@@ -2475,7 +2475,7 @@ static GLADapiproc glad_gl_get_proc(void *vuserptr, const char *name) { +@@ -2462,7 +2462,7 @@ static GLADapiproc glad_gl_get_proc(void *vuserptr, const char *name) { return result; } @@ -11,7 +11,7 @@ index a0b59dbbfb..9f10f6544a 100644 static void* glad_gl_dlopen_handle(void) { #if GLAD_PLATFORM_APPLE -@@ -2497,11 +2497,11 @@ static void* glad_gl_dlopen_handle(void) { +@@ -2484,11 +2484,11 @@ static void* glad_gl_dlopen_handle(void) { }; #endif @@ -26,7 +26,7 @@ index a0b59dbbfb..9f10f6544a 100644 } static struct _glad_gl_userptr glad_gl_build_userptr(void *handle) { -@@ -2527,7 +2527,7 @@ int gladLoaderLoadGL(void) { +@@ -2514,7 +2514,7 @@ int gladLoaderLoadGL(void) { int did_load = 0; struct _glad_gl_userptr userptr; @@ -35,7 +35,7 @@ index a0b59dbbfb..9f10f6544a 100644 handle = glad_gl_dlopen_handle(); if (handle) { userptr = glad_gl_build_userptr(handle); -@@ -2545,9 +2545,9 @@ int gladLoaderLoadGL(void) { +@@ -2532,9 +2532,9 @@ int gladLoaderLoadGL(void) { void gladLoaderUnloadGL(void) { @@ -49,7 +49,7 @@ index a0b59dbbfb..9f10f6544a 100644 } diff --git a/thirdparty/glad/glad/gl.h b/thirdparty/glad/glad/gl.h -index 905c16aeed..f3cb7d8cb5 100644 +index 77c6f33cab..1301d10b65 100644 --- a/thirdparty/glad/glad/gl.h +++ b/thirdparty/glad/glad/gl.h @@ -67,6 +67,7 @@ diff --git a/thirdparty/harfbuzz/src/OT/Color/COLR/COLR.hh b/thirdparty/harfbuzz/src/OT/Color/COLR/COLR.hh index 
b632a1d9eb..623775a771 100644 --- a/thirdparty/harfbuzz/src/OT/Color/COLR/COLR.hh +++ b/thirdparty/harfbuzz/src/OT/Color/COLR/COLR.hh @@ -68,7 +68,7 @@ public: hb_font_t *font; unsigned int palette_index; hb_color_t foreground; - VarStoreInstancer &instancer; + ItemVarStoreInstancer &instancer; hb_map_t current_glyphs; hb_map_t current_layers; int depth_left = HB_MAX_NESTING_LEVEL; @@ -80,7 +80,7 @@ public: hb_font_t *font_, unsigned int palette_, hb_color_t foreground_, - VarStoreInstancer &instancer_) : + ItemVarStoreInstancer &instancer_) : base (base_), funcs (funcs_), data (data_), @@ -245,7 +245,7 @@ struct Variable { value.closurev1 (c); } bool subset (hb_subset_context_t *c, - const VarStoreInstancer &instancer) const + const ItemVarStoreInstancer &instancer) const { TRACE_SUBSET (this); if (!value.subset (c, instancer, varIdxBase)) return_trace (false); @@ -270,7 +270,7 @@ struct Variable void get_color_stop (hb_paint_context_t *c, hb_color_stop_t *stop, - const VarStoreInstancer &instancer) const + const ItemVarStoreInstancer &instancer) const { value.get_color_stop (c, stop, varIdxBase, instancer); } @@ -305,7 +305,7 @@ struct NoVariable { value.closurev1 (c); } bool subset (hb_subset_context_t *c, - const VarStoreInstancer &instancer) const + const ItemVarStoreInstancer &instancer) const { TRACE_SUBSET (this); return_trace (value.subset (c, instancer, varIdxBase)); @@ -325,7 +325,7 @@ struct NoVariable void get_color_stop (hb_paint_context_t *c, hb_color_stop_t *stop, - const VarStoreInstancer &instancer) const + const ItemVarStoreInstancer &instancer) const { value.get_color_stop (c, stop, VarIdx::NO_VARIATION, instancer); } @@ -348,7 +348,7 @@ struct ColorStop { c->add_palette_index (paletteIndex); } bool subset (hb_subset_context_t *c, - const VarStoreInstancer &instancer, + const ItemVarStoreInstancer &instancer, uint32_t varIdxBase) const { TRACE_SUBSET (this); @@ -374,7 +374,7 @@ struct ColorStop void get_color_stop (hb_paint_context_t *c, hb_color_stop_t *out, uint32_t varIdx, - const VarStoreInstancer &instancer) const + const ItemVarStoreInstancer &instancer) const { out->offset = stopOffset.to_float(instancer (varIdx, 0)); out->color = c->get_color (paletteIndex, @@ -410,7 +410,7 @@ struct ColorLine } bool subset (hb_subset_context_t *c, - const VarStoreInstancer &instancer) const + const ItemVarStoreInstancer &instancer) const { TRACE_SUBSET (this); auto *out = c->serializer->start_embed (this); @@ -439,7 +439,7 @@ struct ColorLine unsigned int start, unsigned int *count, hb_color_stop_t *color_stops, - const VarStoreInstancer &instancer) const + const ItemVarStoreInstancer &instancer) const { unsigned int len = stops.len; @@ -543,7 +543,7 @@ struct Affine2x3 } bool subset (hb_subset_context_t *c, - const VarStoreInstancer &instancer, + const ItemVarStoreInstancer &instancer, uint32_t varIdxBase) const { TRACE_SUBSET (this); @@ -588,7 +588,7 @@ struct PaintColrLayers void closurev1 (hb_colrv1_closure_context_t* c) const; bool subset (hb_subset_context_t *c, - const VarStoreInstancer &instancer HB_UNUSED) const + const ItemVarStoreInstancer &instancer HB_UNUSED) const { TRACE_SUBSET (this); auto *out = c->serializer->embed (this); @@ -620,7 +620,7 @@ struct PaintSolid { c->add_palette_index (paletteIndex); } bool subset (hb_subset_context_t *c, - const VarStoreInstancer &instancer, + const ItemVarStoreInstancer &instancer, uint32_t varIdxBase) const { TRACE_SUBSET (this); @@ -669,7 +669,7 @@ struct PaintLinearGradient { (this+colorLine).closurev1 (c); } bool subset 
(hb_subset_context_t *c, - const VarStoreInstancer &instancer, + const ItemVarStoreInstancer &instancer, uint32_t varIdxBase) const { TRACE_SUBSET (this); @@ -736,7 +736,7 @@ struct PaintRadialGradient { (this+colorLine).closurev1 (c); } bool subset (hb_subset_context_t *c, - const VarStoreInstancer &instancer, + const ItemVarStoreInstancer &instancer, uint32_t varIdxBase) const { TRACE_SUBSET (this); @@ -803,7 +803,7 @@ struct PaintSweepGradient { (this+colorLine).closurev1 (c); } bool subset (hb_subset_context_t *c, - const VarStoreInstancer &instancer, + const ItemVarStoreInstancer &instancer, uint32_t varIdxBase) const { TRACE_SUBSET (this); @@ -863,7 +863,7 @@ struct PaintGlyph void closurev1 (hb_colrv1_closure_context_t* c) const; bool subset (hb_subset_context_t *c, - const VarStoreInstancer &instancer) const + const ItemVarStoreInstancer &instancer) const { TRACE_SUBSET (this); auto *out = c->serializer->embed (this); @@ -906,7 +906,7 @@ struct PaintColrGlyph void closurev1 (hb_colrv1_closure_context_t* c) const; bool subset (hb_subset_context_t *c, - const VarStoreInstancer &instancer HB_UNUSED) const + const ItemVarStoreInstancer &instancer HB_UNUSED) const { TRACE_SUBSET (this); auto *out = c->serializer->embed (this); @@ -936,7 +936,7 @@ struct PaintTransform HB_INTERNAL void closurev1 (hb_colrv1_closure_context_t* c) const; bool subset (hb_subset_context_t *c, - const VarStoreInstancer &instancer) const + const ItemVarStoreInstancer &instancer) const { TRACE_SUBSET (this); auto *out = c->serializer->embed (this); @@ -975,7 +975,7 @@ struct PaintTranslate HB_INTERNAL void closurev1 (hb_colrv1_closure_context_t* c) const; bool subset (hb_subset_context_t *c, - const VarStoreInstancer &instancer, + const ItemVarStoreInstancer &instancer, uint32_t varIdxBase) const { TRACE_SUBSET (this); @@ -1024,7 +1024,7 @@ struct PaintScale HB_INTERNAL void closurev1 (hb_colrv1_closure_context_t* c) const; bool subset (hb_subset_context_t *c, - const VarStoreInstancer &instancer, + const ItemVarStoreInstancer &instancer, uint32_t varIdxBase) const { TRACE_SUBSET (this); @@ -1073,7 +1073,7 @@ struct PaintScaleAroundCenter HB_INTERNAL void closurev1 (hb_colrv1_closure_context_t* c) const; bool subset (hb_subset_context_t *c, - const VarStoreInstancer &instancer, + const ItemVarStoreInstancer &instancer, uint32_t varIdxBase) const { TRACE_SUBSET (this); @@ -1132,7 +1132,7 @@ struct PaintScaleUniform HB_INTERNAL void closurev1 (hb_colrv1_closure_context_t* c) const; bool subset (hb_subset_context_t *c, - const VarStoreInstancer &instancer, + const ItemVarStoreInstancer &instancer, uint32_t varIdxBase) const { TRACE_SUBSET (this); @@ -1176,7 +1176,7 @@ struct PaintScaleUniformAroundCenter HB_INTERNAL void closurev1 (hb_colrv1_closure_context_t* c) const; bool subset (hb_subset_context_t *c, - const VarStoreInstancer &instancer, + const ItemVarStoreInstancer &instancer, uint32_t varIdxBase) const { TRACE_SUBSET (this); @@ -1232,7 +1232,7 @@ struct PaintRotate HB_INTERNAL void closurev1 (hb_colrv1_closure_context_t* c) const; bool subset (hb_subset_context_t *c, - const VarStoreInstancer &instancer, + const ItemVarStoreInstancer &instancer, uint32_t varIdxBase) const { TRACE_SUBSET (this); @@ -1276,7 +1276,7 @@ struct PaintRotateAroundCenter HB_INTERNAL void closurev1 (hb_colrv1_closure_context_t* c) const; bool subset (hb_subset_context_t *c, - const VarStoreInstancer &instancer, + const ItemVarStoreInstancer &instancer, uint32_t varIdxBase) const { TRACE_SUBSET (this); @@ -1332,7 +1332,7 @@ struct 
PaintSkew HB_INTERNAL void closurev1 (hb_colrv1_closure_context_t* c) const; bool subset (hb_subset_context_t *c, - const VarStoreInstancer &instancer, + const ItemVarStoreInstancer &instancer, uint32_t varIdxBase) const { TRACE_SUBSET (this); @@ -1381,7 +1381,7 @@ struct PaintSkewAroundCenter HB_INTERNAL void closurev1 (hb_colrv1_closure_context_t* c) const; bool subset (hb_subset_context_t *c, - const VarStoreInstancer &instancer, + const ItemVarStoreInstancer &instancer, uint32_t varIdxBase) const { TRACE_SUBSET (this); @@ -1440,7 +1440,7 @@ struct PaintComposite void closurev1 (hb_colrv1_closure_context_t* c) const; bool subset (hb_subset_context_t *c, - const VarStoreInstancer &instancer) const + const ItemVarStoreInstancer &instancer) const { TRACE_SUBSET (this); auto *out = c->serializer->embed (this); @@ -1491,7 +1491,7 @@ struct ClipBoxFormat1 return_trace (c->check_struct (this)); } - void get_clip_box (ClipBoxData &clip_box, const VarStoreInstancer &instancer HB_UNUSED) const + void get_clip_box (ClipBoxData &clip_box, const ItemVarStoreInstancer &instancer HB_UNUSED) const { clip_box.xMin = xMin; clip_box.yMin = yMin; @@ -1500,7 +1500,7 @@ struct ClipBoxFormat1 } bool subset (hb_subset_context_t *c, - const VarStoreInstancer &instancer, + const ItemVarStoreInstancer &instancer, uint32_t varIdxBase) const { TRACE_SUBSET (this); @@ -1533,7 +1533,7 @@ struct ClipBoxFormat1 struct ClipBoxFormat2 : Variable<ClipBoxFormat1> { - void get_clip_box (ClipBoxData &clip_box, const VarStoreInstancer &instancer) const + void get_clip_box (ClipBoxData &clip_box, const ItemVarStoreInstancer &instancer) const { value.get_clip_box(clip_box, instancer); if (instancer) @@ -1549,7 +1549,7 @@ struct ClipBoxFormat2 : Variable<ClipBoxFormat1> struct ClipBox { bool subset (hb_subset_context_t *c, - const VarStoreInstancer &instancer) const + const ItemVarStoreInstancer &instancer) const { TRACE_SUBSET (this); switch (u.format) { @@ -1572,7 +1572,7 @@ struct ClipBox } bool get_extents (hb_glyph_extents_t *extents, - const VarStoreInstancer &instancer) const + const ItemVarStoreInstancer &instancer) const { ClipBoxData clip_box; switch (u.format) { @@ -1608,7 +1608,7 @@ struct ClipRecord bool subset (hb_subset_context_t *c, const void *base, - const VarStoreInstancer &instancer) const + const ItemVarStoreInstancer &instancer) const { TRACE_SUBSET (this); auto *out = c->serializer->embed (*this); @@ -1625,7 +1625,7 @@ struct ClipRecord bool get_extents (hb_glyph_extents_t *extents, const void *base, - const VarStoreInstancer &instancer) const + const ItemVarStoreInstancer &instancer) const { return (base+clipBox).get_extents (extents, instancer); } @@ -1642,7 +1642,7 @@ DECLARE_NULL_NAMESPACE_BYTES (OT, ClipRecord); struct ClipList { unsigned serialize_clip_records (hb_subset_context_t *c, - const VarStoreInstancer &instancer, + const ItemVarStoreInstancer &instancer, const hb_set_t& gids, const hb_map_t& gid_offset_map) const { @@ -1695,7 +1695,7 @@ struct ClipList } bool subset (hb_subset_context_t *c, - const VarStoreInstancer &instancer) const + const ItemVarStoreInstancer &instancer) const { TRACE_SUBSET (this); auto *out = c->serializer->start_embed (*this); @@ -1735,7 +1735,7 @@ struct ClipList bool get_extents (hb_codepoint_t gid, hb_glyph_extents_t *extents, - const VarStoreInstancer &instancer) const + const ItemVarStoreInstancer &instancer) const { auto *rec = clips.as_array ().bsearch (gid); if (rec) @@ -1855,7 +1855,7 @@ struct BaseGlyphPaintRecord bool serialize (hb_serialize_context_t *s, 
const hb_map_t* glyph_map, const void* src_base, hb_subset_context_t *c, - const VarStoreInstancer &instancer) const + const ItemVarStoreInstancer &instancer) const { TRACE_SERIALIZE (this); auto *out = s->embed (this); @@ -1884,7 +1884,7 @@ struct BaseGlyphPaintRecord struct BaseGlyphList : SortedArray32Of<BaseGlyphPaintRecord> { bool subset (hb_subset_context_t *c, - const VarStoreInstancer &instancer) const + const ItemVarStoreInstancer &instancer) const { TRACE_SUBSET (this); auto *out = c->serializer->start_embed (this); @@ -1916,7 +1916,7 @@ struct LayerList : Array32OfOffset32To<Paint> { return this+(*this)[i]; } bool subset (hb_subset_context_t *c, - const VarStoreInstancer &instancer) const + const ItemVarStoreInstancer &instancer) const { TRACE_SUBSET (this); auto *out = c->serializer->start_embed (this); @@ -2206,7 +2206,7 @@ struct COLR auto snap = c->serializer->snapshot (); if (!c->serializer->allocate_size<void> (5 * HBUINT32::static_size)) return_trace (false); - VarStoreInstancer instancer (varStore ? &(this+varStore) : nullptr, + ItemVarStoreInstancer instancer (varStore ? &(this+varStore) : nullptr, varIdxMap ? &(this+varIdxMap) : nullptr, c->plan->normalized_coords.as_array ()); @@ -2250,7 +2250,7 @@ struct COLR if (version != 1) return false; - VarStoreInstancer instancer (&(this+varStore), + ItemVarStoreInstancer instancer (&(this+varStore), &(this+varIdxMap), hb_array (font->coords, font->num_coords)); @@ -2301,7 +2301,7 @@ struct COLR bool get_clip (hb_codepoint_t glyph, hb_glyph_extents_t *extents, - const VarStoreInstancer instancer) const + const ItemVarStoreInstancer instancer) const { return (this+clipList).get_extents (glyph, extents, @@ -2312,7 +2312,7 @@ struct COLR bool paint_glyph (hb_font_t *font, hb_codepoint_t glyph, hb_paint_funcs_t *funcs, void *data, unsigned int palette_index, hb_color_t foreground, bool clip = true) const { - VarStoreInstancer instancer (&(this+varStore), + ItemVarStoreInstancer instancer (&(this+varStore), &(this+varIdxMap), hb_array (font->coords, font->num_coords)); hb_paint_context_t c (this, funcs, data, font, palette_index, foreground, instancer); @@ -2327,7 +2327,7 @@ struct COLR { // COLRv1 glyph - VarStoreInstancer instancer (&(this+varStore), + ItemVarStoreInstancer instancer (&(this+varStore), &(this+varIdxMap), hb_array (font->coords, font->num_coords)); @@ -2413,7 +2413,7 @@ struct COLR Offset32To<LayerList> layerList; Offset32To<ClipList> clipList; // Offset to ClipList table (may be NULL) Offset32To<DeltaSetIndexMap> varIdxMap; // Offset to DeltaSetIndexMap table (may be NULL) - Offset32To<VariationStore> varStore; + Offset32To<ItemVariationStore> varStore; public: DEFINE_SIZE_MIN (14); }; diff --git a/thirdparty/harfbuzz/src/OT/Layout/GDEF/GDEF.hh b/thirdparty/harfbuzz/src/OT/Layout/GDEF/GDEF.hh index 14a9b5e5cd..317b96c714 100644 --- a/thirdparty/harfbuzz/src/OT/Layout/GDEF/GDEF.hh +++ b/thirdparty/harfbuzz/src/OT/Layout/GDEF/GDEF.hh @@ -189,7 +189,7 @@ struct CaretValueFormat3 friend struct CaretValue; hb_position_t get_caret_value (hb_font_t *font, hb_direction_t direction, - const VariationStore &var_store) const + const ItemVariationStore &var_store) const { return HB_DIRECTION_IS_HORIZONTAL (direction) ? 
font->em_scale_x (coordinate) + (this+deviceTable).get_x_delta (font, var_store) : @@ -251,7 +251,7 @@ struct CaretValue hb_position_t get_caret_value (hb_font_t *font, hb_direction_t direction, hb_codepoint_t glyph_id, - const VariationStore &var_store) const + const ItemVariationStore &var_store) const { switch (u.format) { case 1: return u.format1.get_caret_value (font, direction); @@ -316,7 +316,7 @@ struct LigGlyph unsigned get_lig_carets (hb_font_t *font, hb_direction_t direction, hb_codepoint_t glyph_id, - const VariationStore &var_store, + const ItemVariationStore &var_store, unsigned start_offset, unsigned *caret_count /* IN/OUT */, hb_position_t *caret_array /* OUT */) const @@ -372,7 +372,7 @@ struct LigCaretList unsigned int get_lig_carets (hb_font_t *font, hb_direction_t direction, hb_codepoint_t glyph_id, - const VariationStore &var_store, + const ItemVariationStore &var_store, unsigned int start_offset, unsigned int *caret_count /* IN/OUT */, hb_position_t *caret_array /* OUT */) const @@ -609,7 +609,7 @@ struct GDEFVersion1_2 * definitions--from beginning of GDEF * header (may be NULL). Introduced * in version 0x00010002. */ - Offset32To<VariationStore> + Offset32To<ItemVariationStore> varStore; /* Offset to the table of Item Variation * Store--from beginning of GDEF * header (may be NULL). Introduced @@ -663,21 +663,16 @@ struct GDEFVersion1_2 auto *out = c->serializer->start_embed (*this); if (unlikely (!c->serializer->extend_min (out))) return_trace (false); - out->version.major = version.major; - out->version.minor = version.minor; - bool subset_glyphclassdef = out->glyphClassDef.serialize_subset (c, glyphClassDef, this, nullptr, false, true); - bool subset_attachlist = out->attachList.serialize_subset (c, attachList, this); - bool subset_markattachclassdef = out->markAttachClassDef.serialize_subset (c, markAttachClassDef, this, nullptr, false, true); - - bool subset_markglyphsetsdef = false; + // Push var store first (if it's needed) so that it's last in the + // serialization order. Some font consumers assume that varstore runs to + // the end of the GDEF table. 
+ // See: https://github.com/harfbuzz/harfbuzz/issues/4636 auto snapshot_version0 = c->serializer->snapshot (); - if (version.to_int () >= 0x00010002u) - { - if (unlikely (!c->serializer->embed (markGlyphSetsDef))) return_trace (false); - subset_markglyphsetsdef = out->markGlyphSetsDef.serialize_subset (c, markGlyphSetsDef, this); - } + if (unlikely (version.to_int () >= 0x00010002u && !c->serializer->embed (markGlyphSetsDef))) + return_trace (false); bool subset_varstore = false; + unsigned varstore_index = (unsigned) -1; auto snapshot_version2 = c->serializer->snapshot (); if (version.to_int () >= 0x00010003u) { @@ -690,35 +685,58 @@ struct GDEFVersion1_2 { item_variations_t item_vars; if (item_vars.instantiate (this+varStore, c->plan, true, true, - c->plan->gdef_varstore_inner_maps.as_array ())) + c->plan->gdef_varstore_inner_maps.as_array ())) { subset_varstore = out->varStore.serialize_serialize (c->serializer, item_vars.has_long_word (), c->plan->axis_tags, item_vars.get_region_list (), item_vars.get_vardata_encodings ()); + varstore_index = c->serializer->last_added_child_index(); + } remap_varidx_after_instantiation (item_vars.get_varidx_map (), c->plan->layout_variation_idx_delta_map); } } else + { subset_varstore = out->varStore.serialize_subset (c, varStore, this, c->plan->gdef_varstore_inner_maps.as_array ()); + varstore_index = c->serializer->last_added_child_index(); + } + } + + out->version.major = version.major; + out->version.minor = version.minor; + + if (!subset_varstore && version.to_int () >= 0x00010002u) { + c->serializer->revert (snapshot_version2); } + bool subset_markglyphsetsdef = false; + if (version.to_int () >= 0x00010002u) + { + subset_markglyphsetsdef = out->markGlyphSetsDef.serialize_subset (c, markGlyphSetsDef, this); + } if (subset_varstore) { out->version.minor = 3; c->plan->has_gdef_varstore = true; } else if (subset_markglyphsetsdef) { - out->version.minor = 2; - c->serializer->revert (snapshot_version2); + out->version.minor = 2; } else { out->version.minor = 0; c->serializer->revert (snapshot_version0); } + bool subset_glyphclassdef = out->glyphClassDef.serialize_subset (c, glyphClassDef, this, nullptr, false, true); + bool subset_attachlist = out->attachList.serialize_subset (c, attachList, this); + bool subset_markattachclassdef = out->markAttachClassDef.serialize_subset (c, markAttachClassDef, this, nullptr, false, true); bool subset_ligcaretlist = out->ligCaretList.serialize_subset (c, ligCaretList, this); + if (subset_varstore && varstore_index != (unsigned) -1) { + c->serializer->repack_last(varstore_index); + } + return_trace (subset_glyphclassdef || subset_attachlist || subset_ligcaretlist || subset_markattachclassdef || (out->version.to_int () >= 0x00010002u && subset_markglyphsetsdef) || @@ -884,14 +902,14 @@ struct GDEF default: return false; } } - const VariationStore &get_var_store () const + const ItemVariationStore &get_var_store () const { switch (u.version.major) { - case 1: return u.version.to_int () >= 0x00010003u ? this+u.version1.varStore : Null(VariationStore); + case 1: return u.version.to_int () >= 0x00010003u ? 
this+u.version1.varStore : Null(ItemVariationStore); #ifndef HB_NO_BEYOND_64K case 2: return this+u.version2.varStore; #endif - default: return Null(VariationStore); + default: return Null(ItemVariationStore); } } @@ -1011,9 +1029,9 @@ struct GDEF hb_hashmap_t<unsigned, hb_pair_t<unsigned, int>> *layout_variation_idx_delta_map /* OUT */) const { if (!has_var_store ()) return; - const VariationStore &var_store = get_var_store (); + const ItemVariationStore &var_store = get_var_store (); float *store_cache = var_store.create_cache (); - + unsigned new_major = 0, new_minor = 0; unsigned last_major = (layout_variation_indices->get_min ()) >> 16; for (unsigned idx : layout_variation_indices->iter ()) diff --git a/thirdparty/harfbuzz/src/OT/Layout/GPOS/PairPosFormat2.hh b/thirdparty/harfbuzz/src/OT/Layout/GPOS/PairPosFormat2.hh index dd02da887d..9c805b39a1 100644 --- a/thirdparty/harfbuzz/src/OT/Layout/GPOS/PairPosFormat2.hh +++ b/thirdparty/harfbuzz/src/OT/Layout/GPOS/PairPosFormat2.hh @@ -324,17 +324,8 @@ struct PairPosFormat2_4 : ValueBase } } - const hb_set_t &glyphset = *c->plan->glyphset_gsub (); - const hb_map_t &glyph_map = *c->plan->glyph_map; - - auto it = - + hb_iter (this+coverage) - | hb_filter (glyphset) - | hb_map_retains_sorting (glyph_map) - ; - - out->coverage.serialize_serialize (c->serializer, it); - return_trace (out->class1Count && out->class2Count && bool (it)); + bool ret = out->coverage.serialize_subset(c, coverage, this); + return_trace (out->class1Count && out->class2Count && ret); } diff --git a/thirdparty/harfbuzz/src/OT/Layout/GPOS/ValueFormat.hh b/thirdparty/harfbuzz/src/OT/Layout/GPOS/ValueFormat.hh index 17f57db1f5..9442cc1cc5 100644 --- a/thirdparty/harfbuzz/src/OT/Layout/GPOS/ValueFormat.hh +++ b/thirdparty/harfbuzz/src/OT/Layout/GPOS/ValueFormat.hh @@ -116,7 +116,7 @@ struct ValueFormat : HBUINT16 if (!use_x_device && !use_y_device) return ret; - const VariationStore &store = c->var_store; + const ItemVariationStore &store = c->var_store; auto *cache = c->var_store_cache; /* pixel -> fractional pixel */ diff --git a/thirdparty/harfbuzz/src/OT/glyf/CompositeGlyph.hh b/thirdparty/harfbuzz/src/OT/glyf/CompositeGlyph.hh index 60858a5a58..5c0ecd5133 100644 --- a/thirdparty/harfbuzz/src/OT/glyf/CompositeGlyph.hh +++ b/thirdparty/harfbuzz/src/OT/glyf/CompositeGlyph.hh @@ -240,7 +240,8 @@ struct CompositeGlyphRecord } if (is_anchored ()) tx = ty = 0; - trans.init ((float) tx, (float) ty); + /* set is_end_point flag to true, used by IUP delta optimization */ + trans.init ((float) tx, (float) ty, true); { const F2DOT14 *points = (const F2DOT14 *) p; diff --git a/thirdparty/harfbuzz/src/OT/glyf/Glyph.hh b/thirdparty/harfbuzz/src/OT/glyf/Glyph.hh index 5ea611948f..69a0b625c7 100644 --- a/thirdparty/harfbuzz/src/OT/glyf/Glyph.hh +++ b/thirdparty/harfbuzz/src/OT/glyf/Glyph.hh @@ -103,6 +103,9 @@ struct Glyph } } + bool is_composite () const + { return type == COMPOSITE; } + bool get_all_points_without_var (const hb_face_t *face, contour_point_vector_t &points /* OUT */) const { diff --git a/thirdparty/harfbuzz/src/OT/glyf/glyf-helpers.hh b/thirdparty/harfbuzz/src/OT/glyf/glyf-helpers.hh index d0a5a132f0..f157bf0020 100644 --- a/thirdparty/harfbuzz/src/OT/glyf/glyf-helpers.hh +++ b/thirdparty/harfbuzz/src/OT/glyf/glyf-helpers.hh @@ -38,7 +38,7 @@ _write_loca (IteratorIn&& it, unsigned padded_size = *it++; offset += padded_size; - DEBUG_MSG (SUBSET, nullptr, "loca entry gid %u offset %u padded-size %u", gid, offset, padded_size); + DEBUG_MSG (SUBSET, nullptr, "loca entry gid 
%" PRIu32 " offset %u padded-size %u", gid, offset, padded_size); value = offset >> right_shift; *dest++ = value; diff --git a/thirdparty/harfbuzz/src/graph/classdef-graph.hh b/thirdparty/harfbuzz/src/graph/classdef-graph.hh index 9cf845a82d..da6378820b 100644 --- a/thirdparty/harfbuzz/src/graph/classdef-graph.hh +++ b/thirdparty/harfbuzz/src/graph/classdef-graph.hh @@ -134,20 +134,23 @@ struct ClassDef : public OT::ClassDef struct class_def_size_estimator_t { + // TODO(garretrieger): update to support beyond64k coverage/classdef tables. + constexpr static unsigned class_def_format1_base_size = 6; + constexpr static unsigned class_def_format2_base_size = 4; + constexpr static unsigned coverage_base_size = 4; + constexpr static unsigned bytes_per_range = 6; + constexpr static unsigned bytes_per_glyph = 2; + template<typename It> class_def_size_estimator_t (It glyph_and_class) - : gids_consecutive (true), num_ranges_per_class (), glyphs_per_class () + : num_ranges_per_class (), glyphs_per_class () { - unsigned last_gid = (unsigned) -1; + reset(); for (auto p : + glyph_and_class) { unsigned gid = p.first; unsigned klass = p.second; - if (last_gid != (unsigned) -1 && gid != last_gid + 1) - gids_consecutive = false; - last_gid = gid; - hb_set_t* glyphs; if (glyphs_per_class.has (klass, &glyphs) && glyphs) { glyphs->add (gid); @@ -177,28 +180,54 @@ struct class_def_size_estimator_t } } - // Incremental increase in the Coverage and ClassDef table size - // (worst case) if all glyphs associated with 'klass' were added. - unsigned incremental_coverage_size (unsigned klass) const + void reset() { + class_def_1_size = class_def_format1_base_size; + class_def_2_size = class_def_format2_base_size; + included_glyphs.clear(); + included_classes.clear(); + } + + // Compute the size of coverage for all glyphs added via 'add_class_def_size'. + unsigned coverage_size () const { - // Coverage takes 2 bytes per glyph worst case, - return 2 * glyphs_per_class.get (klass).get_population (); + unsigned format1_size = coverage_base_size + bytes_per_glyph * included_glyphs.get_population(); + unsigned format2_size = coverage_base_size + bytes_per_range * num_glyph_ranges(); + return hb_min(format1_size, format2_size); } - // Incremental increase in the Coverage and ClassDef table size - // (worst case) if all glyphs associated with 'klass' were added. - unsigned incremental_class_def_size (unsigned klass) const + // Compute the new size of the ClassDef table if all glyphs associated with 'klass' were added. + unsigned add_class_def_size (unsigned klass) { - // ClassDef takes 6 bytes per range - unsigned class_def_2_size = 6 * num_ranges_per_class.get (klass); - if (gids_consecutive) - { - // ClassDef1 takes 2 bytes per glyph, but only can be used - // when gids are consecutive. 
- return hb_min (2 * glyphs_per_class.get (klass).get_population (), class_def_2_size); + if (!included_classes.has(klass)) { + hb_set_t* glyphs = nullptr; + if (glyphs_per_class.has(klass, &glyphs)) { + included_glyphs.union_(*glyphs); + } + + class_def_1_size = class_def_format1_base_size; + if (!included_glyphs.is_empty()) { + unsigned min_glyph = included_glyphs.get_min(); + unsigned max_glyph = included_glyphs.get_max(); + class_def_1_size += bytes_per_glyph * (max_glyph - min_glyph + 1); + } + + class_def_2_size += bytes_per_range * num_ranges_per_class.get (klass); + + included_classes.add(klass); } - return class_def_2_size; + return hb_min (class_def_1_size, class_def_2_size); + } + + unsigned num_glyph_ranges() const { + hb_codepoint_t start = HB_SET_VALUE_INVALID; + hb_codepoint_t end = HB_SET_VALUE_INVALID; + + unsigned count = 0; + while (included_glyphs.next_range (&start, &end)) { + count++; + } + return count; } bool in_error () @@ -214,9 +243,12 @@ struct class_def_size_estimator_t } private: - bool gids_consecutive; hb_hashmap_t<unsigned, unsigned> num_ranges_per_class; hb_hashmap_t<unsigned, hb_set_t> glyphs_per_class; + hb_set_t included_classes; + hb_set_t included_glyphs; + unsigned class_def_1_size; + unsigned class_def_2_size; }; diff --git a/thirdparty/harfbuzz/src/graph/graph.hh b/thirdparty/harfbuzz/src/graph/graph.hh index 26ad00bdd9..2a9d8346c0 100644 --- a/thirdparty/harfbuzz/src/graph/graph.hh +++ b/thirdparty/harfbuzz/src/graph/graph.hh @@ -195,6 +195,15 @@ struct graph_t return incoming_edges_; } + unsigned incoming_edges_from_parent (unsigned parent_index) const { + if (single_parent != (unsigned) -1) { + return single_parent == parent_index ? 1 : 0; + } + + unsigned* count; + return parents.has(parent_index, &count) ? *count : 0; + } + void reset_parents () { incoming_edges_ = 0; @@ -334,6 +343,16 @@ struct graph_t return true; } + bool give_max_priority () + { + bool result = false; + while (!has_max_priority()) { + result = true; + priority++; + } + return result; + } + bool has_max_priority () const { return priority >= 3; } @@ -1023,6 +1042,11 @@ struct graph_t * Creates a copy of child and re-assigns the link from * parent to the clone. The copy is a shallow copy, objects * linked from child are not duplicated. + * + * Returns the index of the newly created duplicate. + * + * If the child_idx only has incoming edges from parent_idx, this + * will do nothing and return the original child_idx. */ unsigned duplicate_if_shared (unsigned parent_idx, unsigned child_idx) { @@ -1036,18 +1060,20 @@ struct graph_t * Creates a copy of child and re-assigns the link from * parent to the clone. The copy is a shallow copy, objects * linked from child are not duplicated. + * + * Returns the index of the newly created duplicate. + * + * If the child_idx only has incoming edges from parent_idx, + * duplication isn't possible and this will return -1. */ unsigned duplicate (unsigned parent_idx, unsigned child_idx) { update_parents (); - unsigned links_to_child = 0; - for (const auto& l : vertices_[parent_idx].obj.all_links ()) - { - if (l.objidx == child_idx) links_to_child++; - } + const auto& child = vertices_[child_idx]; + unsigned links_to_child = child.incoming_edges_from_parent(parent_idx); - if (vertices_[child_idx].incoming_edges () <= links_to_child) + if (child.incoming_edges () <= links_to_child) { // Can't duplicate this node, doing so would orphan the original one as all remaining links // to child are from parent. 
@@ -1060,7 +1086,7 @@ struct graph_t parent_idx, child_idx); unsigned clone_idx = duplicate (child_idx); - if (clone_idx == (unsigned) -1) return false; + if (clone_idx == (unsigned) -1) return -1; // duplicate shifts the root node idx, so if parent_idx was root update it. if (parent_idx == clone_idx) parent_idx++; @@ -1076,6 +1102,62 @@ struct graph_t return clone_idx; } + /* + * Creates a copy of child and re-assigns the links from + * parents to the clone. The copy is a shallow copy, objects + * linked from child are not duplicated. + * + * Returns the index of the newly created duplicate. + * + * If the child_idx only has incoming edges from parents, + * duplication isn't possible or duplication fails and this will + * return -1. + */ + unsigned duplicate (const hb_set_t* parents, unsigned child_idx) + { + if (parents->is_empty()) { + return -1; + } + + update_parents (); + + const auto& child = vertices_[child_idx]; + unsigned links_to_child = 0; + unsigned last_parent = parents->get_max(); + unsigned first_parent = parents->get_min(); + for (unsigned parent_idx : *parents) { + links_to_child += child.incoming_edges_from_parent(parent_idx); + } + + if (child.incoming_edges () <= links_to_child) + { + // Can't duplicate this node, doing so would orphan the original one as all remaining links + // to child are from parent. + DEBUG_MSG (SUBSET_REPACK, nullptr, " Not duplicating %u, ..., %u => %u", first_parent, last_parent, child_idx); + return -1; + } + + DEBUG_MSG (SUBSET_REPACK, nullptr, " Duplicating %u, ..., %u => %u", first_parent, last_parent, child_idx); + + unsigned clone_idx = duplicate (child_idx); + if (clone_idx == (unsigned) -1) return false; + + for (unsigned parent_idx : *parents) { + // duplicate shifts the root node idx, so if parent_idx was root update it. + if (parent_idx == clone_idx) parent_idx++; + auto& parent = vertices_[parent_idx]; + for (auto& l : parent.obj.all_links_writer ()) + { + if (l.objidx != child_idx) + continue; + + reassign_link (l, parent_idx, clone_idx); + } + } + + return clone_idx; + } + /* * Adds a new node to the graph, not connected to anything. diff --git a/thirdparty/harfbuzz/src/graph/pairpos-graph.hh b/thirdparty/harfbuzz/src/graph/pairpos-graph.hh index f7f74b18c9..fd46861de4 100644 --- a/thirdparty/harfbuzz/src/graph/pairpos-graph.hh +++ b/thirdparty/harfbuzz/src/graph/pairpos-graph.hh @@ -247,8 +247,8 @@ struct PairPosFormat2 : public OT::Layout::GPOS_impl::PairPosFormat2_4<SmallType for (unsigned i = 0; i < class1_count; i++) { unsigned accumulated_delta = class1_record_size; - coverage_size += estimator.incremental_coverage_size (i); - class_def_1_size += estimator.incremental_class_def_size (i); + class_def_1_size = estimator.add_class_def_size (i); + coverage_size = estimator.coverage_size (); max_coverage_size = hb_max (max_coverage_size, coverage_size); max_class_def_1_size = hb_max (max_class_def_1_size, class_def_1_size); @@ -280,8 +280,10 @@ struct PairPosFormat2 : public OT::Layout::GPOS_impl::PairPosFormat2_4<SmallType split_points.push (i); // split does not include i, so add the size for i when we reset the size counters. accumulated = base_size + accumulated_delta; - coverage_size = 4 + estimator.incremental_coverage_size (i); - class_def_1_size = 4 + estimator.incremental_class_def_size (i); + + estimator.reset(); + class_def_1_size = estimator.add_class_def_size(i); + coverage_size = estimator.coverage_size(); visited.clear (); // node sharing isn't allowed between splits. 
} } diff --git a/thirdparty/harfbuzz/src/hb-aat-layout-morx-table.hh b/thirdparty/harfbuzz/src/hb-aat-layout-morx-table.hh index 06c9334b37..8436551324 100644 --- a/thirdparty/harfbuzz/src/hb-aat-layout-morx-table.hh +++ b/thirdparty/harfbuzz/src/hb-aat-layout-morx-table.hh @@ -552,6 +552,7 @@ struct LigatureSubtable { DEBUG_MSG (APPLY, nullptr, "Skipping ligature component"); if (unlikely (!buffer->move_to (match_positions[--match_length % ARRAY_LENGTH (match_positions)]))) return; + buffer->cur().unicode_props() |= UPROPS_MASK_IGNORABLE; if (unlikely (!buffer->replace_glyph (DELETED_GLYPH))) return; } diff --git a/thirdparty/harfbuzz/src/hb-algs.hh b/thirdparty/harfbuzz/src/hb-algs.hh index ea97057165..efa6074a42 100644 --- a/thirdparty/harfbuzz/src/hb-algs.hh +++ b/thirdparty/harfbuzz/src/hb-algs.hh @@ -671,7 +671,7 @@ struct hb_pair_t return 0; } - friend void swap (hb_pair_t& a, hb_pair_t& b) + friend void swap (hb_pair_t& a, hb_pair_t& b) noexcept { hb_swap (a.first, b.first); hb_swap (a.second, b.second); @@ -1053,6 +1053,18 @@ _hb_cmp_method (const void *pkey, const void *pval, Ts... ds) return val.cmp (key, ds...); } +template <typename K, typename V> +static int +_hb_cmp_operator (const void *pkey, const void *pval) +{ + const K& key = * (const K*) pkey; + const V& val = * (const V*) pval; + + if (key < val) return -1; + if (key > val) return 1; + return 0; +} + template <typename V, typename K, typename ...Ts> static inline bool hb_bsearch_impl (unsigned *pos, /* Out */ diff --git a/thirdparty/harfbuzz/src/hb-bit-set-invertible.hh b/thirdparty/harfbuzz/src/hb-bit-set-invertible.hh index 2626251807..d5d1326d9f 100644 --- a/thirdparty/harfbuzz/src/hb-bit-set-invertible.hh +++ b/thirdparty/harfbuzz/src/hb-bit-set-invertible.hh @@ -39,10 +39,10 @@ struct hb_bit_set_invertible_t hb_bit_set_invertible_t () = default; hb_bit_set_invertible_t (const hb_bit_set_invertible_t& o) = default; - hb_bit_set_invertible_t (hb_bit_set_invertible_t&& other) : hb_bit_set_invertible_t () { hb_swap (*this, other); } + hb_bit_set_invertible_t (hb_bit_set_invertible_t&& other) noexcept : hb_bit_set_invertible_t () { hb_swap (*this, other); } hb_bit_set_invertible_t& operator= (const hb_bit_set_invertible_t& o) = default; - hb_bit_set_invertible_t& operator= (hb_bit_set_invertible_t&& other) { hb_swap (*this, other); return *this; } - friend void swap (hb_bit_set_invertible_t &a, hb_bit_set_invertible_t &b) + hb_bit_set_invertible_t& operator= (hb_bit_set_invertible_t&& other) noexcept { hb_swap (*this, other); return *this; } + friend void swap (hb_bit_set_invertible_t &a, hb_bit_set_invertible_t &b) noexcept { if (likely (!a.s.successful || !b.s.successful)) return; diff --git a/thirdparty/harfbuzz/src/hb-bit-set.hh b/thirdparty/harfbuzz/src/hb-bit-set.hh index 1dbcce5cbd..5f4c6f0afe 100644 --- a/thirdparty/harfbuzz/src/hb-bit-set.hh +++ b/thirdparty/harfbuzz/src/hb-bit-set.hh @@ -38,10 +38,10 @@ struct hb_bit_set_t ~hb_bit_set_t () = default; hb_bit_set_t (const hb_bit_set_t& other) : hb_bit_set_t () { set (other, true); } - hb_bit_set_t ( hb_bit_set_t&& other) : hb_bit_set_t () { hb_swap (*this, other); } + hb_bit_set_t ( hb_bit_set_t&& other) noexcept : hb_bit_set_t () { hb_swap (*this, other); } hb_bit_set_t& operator= (const hb_bit_set_t& other) { set (other); return *this; } - hb_bit_set_t& operator= (hb_bit_set_t&& other) { hb_swap (*this, other); return *this; } - friend void swap (hb_bit_set_t &a, hb_bit_set_t &b) + hb_bit_set_t& operator= (hb_bit_set_t&& other) noexcept { hb_swap (*this, 
other); return *this; } + friend void swap (hb_bit_set_t &a, hb_bit_set_t &b) noexcept { if (likely (!a.successful || !b.successful)) return; diff --git a/thirdparty/harfbuzz/src/hb-blob.cc b/thirdparty/harfbuzz/src/hb-blob.cc index 265effba03..873d9b257a 100644 --- a/thirdparty/harfbuzz/src/hb-blob.cc +++ b/thirdparty/harfbuzz/src/hb-blob.cc @@ -598,6 +598,11 @@ _open_resource_fork (const char *file_name, hb_mapped_file_t *file) * Creates a new blob containing the data from the * specified binary font file. * + * The filename is passed directly to the system on all platforms, + * except on Windows, where the filename is interpreted as UTF-8. + * Only if the filename is not valid UTF-8, it will be interpreted + * according to the system codepage. + * * Returns: An #hb_blob_t pointer with the content of the file, * or hb_blob_get_empty() if failed. * @@ -617,6 +622,11 @@ hb_blob_create_from_file (const char *file_name) * Creates a new blob containing the data from the * specified binary font file. * + * The filename is passed directly to the system on all platforms, + * except on Windows, where the filename is interpreted as UTF-8. + * Only if the filename is not valid UTF-8, it will be interpreted + * according to the system codepage. + * * Returns: An #hb_blob_t pointer with the content of the file, * or `NULL` if failed. * @@ -672,10 +682,19 @@ fail_without_close: if (unlikely (!file)) return nullptr; HANDLE fd; + int conversion; unsigned int size = strlen (file_name) + 1; wchar_t * wchar_file_name = (wchar_t *) hb_malloc (sizeof (wchar_t) * size); if (unlikely (!wchar_file_name)) goto fail_without_close; - mbstowcs (wchar_file_name, file_name, size); + + /* Assume file name is given in UTF-8 encoding */ + conversion = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, file_name, -1, wchar_file_name, size); + if (conversion <= 0) + { + /* Conversion failed due to invalid UTF-8 characters, + Repeat conversion based on system code page */ + mbstowcs(wchar_file_name, file_name, size); + } #if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) && WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) { CREATEFILE2_EXTENDED_PARAMETERS ceparams = { 0 }; diff --git a/thirdparty/harfbuzz/src/hb-buffer-verify.cc b/thirdparty/harfbuzz/src/hb-buffer-verify.cc index 15a53919de..671d6eda8c 100644 --- a/thirdparty/harfbuzz/src/hb-buffer-verify.cc +++ b/thirdparty/harfbuzz/src/hb-buffer-verify.cc @@ -149,7 +149,7 @@ buffer_verify_unsafe_to_break (hb_buffer_t *buffer, } assert (text_start < text_end); - if (0) + if (false) printf("start %u end %u text start %u end %u\n", start, end, text_start, text_end); hb_buffer_clear_contents (fragment); @@ -288,7 +288,7 @@ buffer_verify_unsafe_to_concat (hb_buffer_t *buffer, } assert (text_start < text_end); - if (0) + if (false) printf("start %u end %u text start %u end %u\n", start, end, text_start, text_end); #if 0 diff --git a/thirdparty/harfbuzz/src/hb-buffer.cc b/thirdparty/harfbuzz/src/hb-buffer.cc index 934c6c2129..d621a7cc55 100644 --- a/thirdparty/harfbuzz/src/hb-buffer.cc +++ b/thirdparty/harfbuzz/src/hb-buffer.cc @@ -309,6 +309,7 @@ hb_buffer_t::clear () deallocate_var_all (); serial = 0; + random_state = 1; scratch_flags = HB_BUFFER_SCRATCH_FLAG_DEFAULT; } @@ -1359,6 +1360,49 @@ hb_buffer_get_not_found_glyph (const hb_buffer_t *buffer) return buffer->not_found; } +/** + * hb_buffer_set_random_state: + * @buffer: An #hb_buffer_t + * @state: the new random state + * + * Sets the random state of the buffer. 
The state changes + * every time a glyph uses randomness (eg. the `rand` + * OpenType feature). This function together with + * hb_buffer_get_random_state() allow for transferring + * the current random state to a subsequent buffer, to + * get better randomness distribution. + * + * Defaults to 1 and when buffer contents are cleared. + * A value of 0 disables randomness during shaping. + * + * Since: 8.4.0 + **/ +void +hb_buffer_set_random_state (hb_buffer_t *buffer, + unsigned state) +{ + if (unlikely (hb_object_is_immutable (buffer))) + return; + + buffer->random_state = state; +} + +/** + * hb_buffer_get_random_state: + * @buffer: An #hb_buffer_t + * + * See hb_buffer_set_random_state(). + * + * Return value: + * The @buffer random state + * + * Since: 8.4.0 + **/ +unsigned +hb_buffer_get_random_state (const hb_buffer_t *buffer) +{ + return buffer->random_state; +} /** * hb_buffer_clear_contents: diff --git a/thirdparty/harfbuzz/src/hb-buffer.h b/thirdparty/harfbuzz/src/hb-buffer.h index 3573127ff0..f75fe96b21 100644 --- a/thirdparty/harfbuzz/src/hb-buffer.h +++ b/thirdparty/harfbuzz/src/hb-buffer.h @@ -487,6 +487,12 @@ hb_buffer_set_not_found_glyph (hb_buffer_t *buffer, HB_EXTERN hb_codepoint_t hb_buffer_get_not_found_glyph (const hb_buffer_t *buffer); +HB_EXTERN void +hb_buffer_set_random_state (hb_buffer_t *buffer, + unsigned state); + +HB_EXTERN unsigned +hb_buffer_get_random_state (const hb_buffer_t *buffer); /* * Content API. diff --git a/thirdparty/harfbuzz/src/hb-buffer.hh b/thirdparty/harfbuzz/src/hb-buffer.hh index f04ad58f11..0a198722d6 100644 --- a/thirdparty/harfbuzz/src/hb-buffer.hh +++ b/thirdparty/harfbuzz/src/hb-buffer.hh @@ -116,6 +116,7 @@ struct hb_buffer_t uint8_t allocated_var_bits; uint8_t serial; + uint32_t random_state; hb_buffer_scratch_flags_t scratch_flags; /* Have space-fallback, etc. */ unsigned int max_len; /* Maximum allowed len. */ int max_ops; /* Maximum allowed operations. 
*/ diff --git a/thirdparty/harfbuzz/src/hb-cff-interp-dict-common.hh b/thirdparty/harfbuzz/src/hb-cff-interp-dict-common.hh index 53226b227e..a08b10b5ff 100644 --- a/thirdparty/harfbuzz/src/hb-cff-interp-dict-common.hh +++ b/thirdparty/harfbuzz/src/hb-cff-interp-dict-common.hh @@ -54,8 +54,8 @@ struct top_dict_values_t : dict_values_t<OPSTR> } void fini () { dict_values_t<OPSTR>::fini (); } - unsigned int charStringsOffset; - unsigned int FDArrayOffset; + int charStringsOffset; + int FDArrayOffset; }; struct dict_opset_t : opset_t<number_t> @@ -157,11 +157,11 @@ struct top_dict_opset_t : dict_opset_t { switch (op) { case OpCode_CharStrings: - dictval.charStringsOffset = env.argStack.pop_uint (); + dictval.charStringsOffset = env.argStack.pop_int (); env.clear_args (); break; case OpCode_FDArray: - dictval.FDArrayOffset = env.argStack.pop_uint (); + dictval.FDArrayOffset = env.argStack.pop_int (); env.clear_args (); break; case OpCode_FontMatrix: diff --git a/thirdparty/harfbuzz/src/hb-cff2-interp-cs.hh b/thirdparty/harfbuzz/src/hb-cff2-interp-cs.hh index 915b10cf39..55b1d3bf8d 100644 --- a/thirdparty/harfbuzz/src/hb-cff2-interp-cs.hh +++ b/thirdparty/harfbuzz/src/hb-cff2-interp-cs.hh @@ -168,7 +168,7 @@ struct cff2_cs_interp_env_t : cs_interp_env_t<ELEM, CFF2Subrs> protected: const int *coords; unsigned int num_coords; - const CFF2VariationStore *varStore; + const CFF2ItemVariationStore *varStore; unsigned int region_count; unsigned int ivs; hb_vector_t<float> scalars; diff --git a/thirdparty/harfbuzz/src/hb-common.cc b/thirdparty/harfbuzz/src/hb-common.cc index 0c13c7d171..4b8bae4422 100644 --- a/thirdparty/harfbuzz/src/hb-common.cc +++ b/thirdparty/harfbuzz/src/hb-common.cc @@ -996,7 +996,7 @@ hb_feature_to_string (hb_feature_t *feature, if (feature->value > 1) { s[len++] = '='; - len += hb_max (0, snprintf (s + len, ARRAY_LENGTH (s) - len, "%u", feature->value)); + len += hb_max (0, snprintf (s + len, ARRAY_LENGTH (s) - len, "%" PRIu32, feature->value)); } assert (len < ARRAY_LENGTH (s)); len = hb_min (len, size - 1); diff --git a/thirdparty/harfbuzz/src/hb-common.h b/thirdparty/harfbuzz/src/hb-common.h index a9fe666b39..533de91562 100644 --- a/thirdparty/harfbuzz/src/hb-common.h +++ b/thirdparty/harfbuzz/src/hb-common.h @@ -47,14 +47,10 @@ # endif /* !__cplusplus */ #endif -#if defined (_SVR4) || defined (SVR4) || defined (__OpenBSD__) || \ - defined (_sgi) || defined (__sun) || defined (sun) || \ - defined (__digital__) || defined (__HP_cc) -# include <inttypes.h> -#elif defined (_AIX) +#if defined (_AIX) # include <sys/inttypes.h> #elif defined (_MSC_VER) && _MSC_VER < 1600 -/* VS 2010 (_MSC_VER 1600) has stdint.h */ +/* VS 2010 (_MSC_VER 1600) has stdint.h */ typedef __int8 int8_t; typedef unsigned __int8 uint8_t; typedef __int16 int16_t; @@ -63,10 +59,11 @@ typedef __int32 int32_t; typedef unsigned __int32 uint32_t; typedef __int64 int64_t; typedef unsigned __int64 uint64_t; -#elif defined (__KERNEL__) -# include <linux/types.h> -#else +#elif defined (_MSC_VER) && _MSC_VER < 1800 +/* VS 2013 (_MSC_VER 1800) has inttypes.h */ # include <stdint.h> +#else +# include <inttypes.h> #endif #if defined(__GNUC__) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)) diff --git a/thirdparty/harfbuzz/src/hb-cplusplus.hh b/thirdparty/harfbuzz/src/hb-cplusplus.hh index 531ef1b7c8..a640e192de 100644 --- a/thirdparty/harfbuzz/src/hb-cplusplus.hh +++ b/thirdparty/harfbuzz/src/hb-cplusplus.hh @@ -56,15 +56,15 @@ struct shared_ptr explicit shared_ptr (T *p = nullptr) : p (p) {} shared_ptr 
(const shared_ptr &o) : p (v::reference (o.p)) {} - shared_ptr (shared_ptr &&o) : p (o.p) { o.p = nullptr; } + shared_ptr (shared_ptr &&o) noexcept : p (o.p) { o.p = nullptr; } shared_ptr& operator = (const shared_ptr &o) { if (p != o.p) { destroy (); p = o.p; reference (); } return *this; } - shared_ptr& operator = (shared_ptr &&o) { v::destroy (p); p = o.p; o.p = nullptr; return *this; } + shared_ptr& operator = (shared_ptr &&o) noexcept { v::destroy (p); p = o.p; o.p = nullptr; return *this; } ~shared_ptr () { v::destroy (p); p = nullptr; } T* get() const { return p; } - void swap (shared_ptr &o) { std::swap (p, o.p); } - friend void swap (shared_ptr &a, shared_ptr &b) { std::swap (a.p, b.p); } + void swap (shared_ptr &o) noexcept { std::swap (p, o.p); } + friend void swap (shared_ptr &a, shared_ptr &b) noexcept { std::swap (a.p, b.p); } operator T * () const { return p; } T& operator * () const { return *get (); } @@ -98,16 +98,16 @@ struct unique_ptr explicit unique_ptr (T *p = nullptr) : p (p) {} unique_ptr (const unique_ptr &o) = delete; - unique_ptr (unique_ptr &&o) : p (o.p) { o.p = nullptr; } + unique_ptr (unique_ptr &&o) noexcept : p (o.p) { o.p = nullptr; } unique_ptr& operator = (const unique_ptr &o) = delete; - unique_ptr& operator = (unique_ptr &&o) { v::destroy (p); p = o.p; o.p = nullptr; return *this; } + unique_ptr& operator = (unique_ptr &&o) noexcept { v::destroy (p); p = o.p; o.p = nullptr; return *this; } ~unique_ptr () { v::destroy (p); p = nullptr; } T* get() const { return p; } T* release () { T* v = p; p = nullptr; return v; } - void swap (unique_ptr &o) { std::swap (p, o.p); } - friend void swap (unique_ptr &a, unique_ptr &b) { std::swap (a.p, b.p); } + void swap (unique_ptr &o) noexcept { std::swap (p, o.p); } + friend void swap (unique_ptr &a, unique_ptr &b) noexcept { std::swap (a.p, b.p); } operator T * () const { return p; } T& operator * () const { return *get (); } diff --git a/thirdparty/harfbuzz/src/hb-directwrite.cc b/thirdparty/harfbuzz/src/hb-directwrite.cc index 42764a244b..6c90265d0b 100644 --- a/thirdparty/harfbuzz/src/hb-directwrite.cc +++ b/thirdparty/harfbuzz/src/hb-directwrite.cc @@ -173,7 +173,7 @@ _hb_directwrite_shaper_face_data_create (hb_face_t *face) t_DWriteCreateFactory p_DWriteCreateFactory; -#if defined(__GNUC__) +#if defined(__GNUC__) || defined(__clang__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wcast-function-type" #endif @@ -181,7 +181,7 @@ _hb_directwrite_shaper_face_data_create (hb_face_t *face) p_DWriteCreateFactory = (t_DWriteCreateFactory) GetProcAddress (data->dwrite_dll, "DWriteCreateFactory"); -#if defined(__GNUC__) +#if defined(__GNUC__) || defined(__clang__) #pragma GCC diagnostic pop #endif diff --git a/thirdparty/harfbuzz/src/hb-features.h b/thirdparty/harfbuzz/src/hb-features.h new file mode 100644 index 0000000000..9199864195 --- /dev/null +++ b/thirdparty/harfbuzz/src/hb-features.h @@ -0,0 +1,119 @@ +/* + * Copyright © 2022 Red Hat, Inc. + * + * This is part of HarfBuzz, a text shaping library. + * + * Permission is hereby granted, without written agreement and without + * license or royalty fees, to use, copy, modify, and distribute this + * software and its documentation for any purpose, provided that the + * above copyright notice and the following two paragraphs appear in + * all copies of this software. 
+ * + * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR + * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES + * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN + * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, + * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS + * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO + * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. + */ + +#ifndef HB_FEATURES_H +#define HB_FEATURES_H + +HB_BEGIN_DECLS + +/** + * SECTION: hb-features + * @title: hb-features + * @short_description: Feature detection + * @include: hb-features.h + * + * Macros for detecting optional HarfBuzz features at build time. + **/ + +/** + * HB_HAS_CAIRO: + * + * Defined if Harfbuzz has been built with cairo support. + */ +# + +/** + * HB_HAS_CORETEXT: + * + * Defined if Harfbuzz has been built with CoreText support. + */ +#undef HB_HAS_CORETEXT + +/** + * HB_HAS_DIRECTWRITE: + * + * Defined if Harfbuzz has been built with DirectWrite support. + */ +#undef HB_HAS_DIRECTWRITE + +/** + * HB_HAS_FREETYPE: + * + * Defined if Harfbuzz has been built with Freetype support. + */ +#define HB_HAS_FREETYPE 1 + +/** + * HB_HAS_GDI: + * + * Defined if Harfbuzz has been built with GDI support. + */ +#undef HB_HAS_GDI + +/** + * HB_HAS_GLIB: + * + * Defined if Harfbuzz has been built with GLib support. + */ +#define HB_HAS_GLIB 1 + +/** + * HB_HAS_GOBJECT: + * + * Defined if Harfbuzz has been built with GObject support. + */ +#undef HB_HAS_GOBJECT + +/** + * HB_HAS_GRAPHITE: + * + * Defined if Harfbuzz has been built with Graphite support. + */ +#undef HB_HAS_GRAPHITE + +/** + * HB_HAS_ICU: + * + * Defined if Harfbuzz has been built with ICU support. + */ +#undef HB_HAS_ICU + +/** + * HB_HAS_UNISCRIBE: + * + * Defined if Harfbuzz has been built with Uniscribe support. + */ +#undef HB_HAS_UNISCRIBE + +/** + * HB_HAS_WASM: + * + * Defined if Harfbuzz has been built with WebAssembly support. + */ +#undef HB_HAS_WASM + + +HB_END_DECLS + +#endif /* HB_FEATURES_H */ diff --git a/thirdparty/harfbuzz/src/hb-font.hh b/thirdparty/harfbuzz/src/hb-font.hh index f503575c34..4c8190b0dd 100644 --- a/thirdparty/harfbuzz/src/hb-font.hh +++ b/thirdparty/harfbuzz/src/hb-font.hh @@ -651,7 +651,7 @@ struct hb_font_t { if (get_glyph_name (glyph, s, size)) return; - if (size && snprintf (s, size, "gid%u", glyph) < 0) + if (size && snprintf (s, size, "gid%" PRIu32, glyph) < 0) *s = '\0'; } diff --git a/thirdparty/harfbuzz/src/hb-ft.cc b/thirdparty/harfbuzz/src/hb-ft.cc index 955a9081e0..3de4a6d5d4 100644 --- a/thirdparty/harfbuzz/src/hb-ft.cc +++ b/thirdparty/harfbuzz/src/hb-ft.cc @@ -224,7 +224,7 @@ _hb_ft_hb_font_check_changed (hb_font_t *font, * * Sets the FT_Load_Glyph load flags for the specified #hb_font_t. * - * For more information, see + * For more information, see * <https://freetype.org/freetype2/docs/reference/ft2-glyph_retrieval.html#ft_load_xxx> * * This function works with #hb_font_t objects created by @@ -252,7 +252,7 @@ hb_ft_font_set_load_flags (hb_font_t *font, int load_flags) * * Fetches the FT_Load_Glyph load flags of the specified #hb_font_t. 
* - * For more information, see + * For more information, see * <https://freetype.org/freetype2/docs/reference/ft2-glyph_retrieval.html#ft_load_xxx> * * This function works with #hb_font_t objects created by @@ -1118,10 +1118,10 @@ _hb_ft_reference_table (hb_face_t *face HB_UNUSED, hb_tag_t tag, void *user_data * This variant of the function does not provide any life-cycle management. * * Most client programs should use hb_ft_face_create_referenced() - * (or, perhaps, hb_ft_face_create_cached()) instead. + * (or, perhaps, hb_ft_face_create_cached()) instead. * * If you know you have valid reasons not to use hb_ft_face_create_referenced(), - * then it is the client program's responsibility to destroy @ft_face + * then it is the client program's responsibility to destroy @ft_face * after the #hb_face_t face object has been destroyed. * * Return value: (transfer full): the new #hb_face_t face object @@ -1215,7 +1215,7 @@ hb_ft_face_finalize (void *arg) hb_face_t * hb_ft_face_create_cached (FT_Face ft_face) { - if (unlikely (!ft_face->generic.data || ft_face->generic.finalizer != (FT_Generic_Finalizer) hb_ft_face_finalize)) + if (unlikely (!ft_face->generic.data || ft_face->generic.finalizer != hb_ft_face_finalize)) { if (ft_face->generic.finalizer) ft_face->generic.finalizer (ft_face); @@ -1241,13 +1241,13 @@ hb_ft_face_create_cached (FT_Face ft_face) * This variant of the function does not provide any life-cycle management. * * Most client programs should use hb_ft_font_create_referenced() - * instead. + * instead. * * If you know you have valid reasons not to use hb_ft_font_create_referenced(), - * then it is the client program's responsibility to destroy @ft_face + * then it is the client program's responsibility to destroy @ft_face * after the #hb_font_t font object has been destroyed. * - * HarfBuzz will use the @destroy callback on the #hb_font_t font object + * HarfBuzz will use the @destroy callback on the #hb_font_t font object * if it is supplied when you use this function. 
However, even if @destroy * is provided, it is the client program's responsibility to destroy @ft_face, * and it is the client program's responsibility to ensure that @ft_face is diff --git a/thirdparty/harfbuzz/src/hb-icu.cc b/thirdparty/harfbuzz/src/hb-icu.cc index e46401f7a6..3707ec30f8 100644 --- a/thirdparty/harfbuzz/src/hb-icu.cc +++ b/thirdparty/harfbuzz/src/hb-icu.cc @@ -93,15 +93,16 @@ hb_icu_script_to_script (UScriptCode script) UScriptCode hb_icu_script_from_script (hb_script_t script) { + UScriptCode out = USCRIPT_INVALID_CODE; + if (unlikely (script == HB_SCRIPT_INVALID)) - return USCRIPT_INVALID_CODE; + return out; - unsigned int numScriptCode = 1 + u_getIntPropertyMaxValue (UCHAR_SCRIPT); - for (unsigned int i = 0; i < numScriptCode; i++) - if (unlikely (hb_icu_script_to_script ((UScriptCode) i) == script)) - return (UScriptCode) i; + UErrorCode icu_err = U_ZERO_ERROR; + const unsigned char buf[5] = {HB_UNTAG (script), 0}; + uscript_getCode ((const char *) buf, &out, 1, &icu_err); - return USCRIPT_UNKNOWN; + return out; } diff --git a/thirdparty/harfbuzz/src/hb-limits.hh b/thirdparty/harfbuzz/src/hb-limits.hh index 25c1e71e13..7efc893eae 100644 --- a/thirdparty/harfbuzz/src/hb-limits.hh +++ b/thirdparty/harfbuzz/src/hb-limits.hh @@ -106,7 +106,7 @@ #endif #ifndef HB_COLRV1_MAX_EDGE_COUNT -#define HB_COLRV1_MAX_EDGE_COUNT 65536 +#define HB_COLRV1_MAX_EDGE_COUNT 2048 #endif diff --git a/thirdparty/harfbuzz/src/hb-map.hh b/thirdparty/harfbuzz/src/hb-map.hh index 45a02b830c..6521b1a41d 100644 --- a/thirdparty/harfbuzz/src/hb-map.hh +++ b/thirdparty/harfbuzz/src/hb-map.hh @@ -70,9 +70,9 @@ struct hb_hashmap_t alloc (o.population); hb_copy (o, *this); } - hb_hashmap_t (hb_hashmap_t&& o) : hb_hashmap_t () { hb_swap (*this, o); } + hb_hashmap_t (hb_hashmap_t&& o) noexcept : hb_hashmap_t () { hb_swap (*this, o); } hb_hashmap_t& operator= (const hb_hashmap_t& o) { reset (); alloc (o.population); hb_copy (o, *this); return *this; } - hb_hashmap_t& operator= (hb_hashmap_t&& o) { hb_swap (*this, o); return *this; } + hb_hashmap_t& operator= (hb_hashmap_t&& o) noexcept { hb_swap (*this, o); return *this; } hb_hashmap_t (std::initializer_list<hb_pair_t<K, V>> lst) : hb_hashmap_t () { @@ -137,26 +137,23 @@ struct hb_hashmap_t }; hb_object_header_t header; - unsigned int successful : 1; /* Allocations successful */ - unsigned int population : 31; /* Not including tombstones. */ + bool successful; /* Allocations successful */ + unsigned short max_chain_length; + unsigned int population; /* Not including tombstones. */ unsigned int occupancy; /* Including tombstones. 
*/ unsigned int mask; unsigned int prime; - unsigned int max_chain_length; item_t *items; - friend void swap (hb_hashmap_t& a, hb_hashmap_t& b) + friend void swap (hb_hashmap_t& a, hb_hashmap_t& b) noexcept { if (unlikely (!a.successful || !b.successful)) return; - unsigned tmp = a.population; - a.population = b.population; - b.population = tmp; - //hb_swap (a.population, b.population); + hb_swap (a.max_chain_length, b.max_chain_length); + hb_swap (a.population, b.population); hb_swap (a.occupancy, b.occupancy); hb_swap (a.mask, b.mask); hb_swap (a.prime, b.prime); - hb_swap (a.max_chain_length, b.max_chain_length); hb_swap (a.items, b.items); } void init () @@ -164,10 +161,10 @@ struct hb_hashmap_t hb_object_init (this); successful = true; + max_chain_length = 0; population = occupancy = 0; mask = 0; prime = 0; - max_chain_length = 0; items = nullptr; } void fini () @@ -558,7 +555,7 @@ struct hb_map_t : hb_hashmap_t<hb_codepoint_t, ~hb_map_t () = default; hb_map_t () : hashmap () {} hb_map_t (const hb_map_t &o) : hashmap ((hashmap &) o) {} - hb_map_t (hb_map_t &&o) : hashmap (std::move ((hashmap &) o)) {} + hb_map_t (hb_map_t &&o) noexcept : hashmap (std::move ((hashmap &) o)) {} hb_map_t& operator= (const hb_map_t&) = default; hb_map_t& operator= (hb_map_t&&) = default; hb_map_t (std::initializer_list<hb_codepoint_pair_t> lst) : hashmap (lst) {} diff --git a/thirdparty/harfbuzz/src/hb-object.hh b/thirdparty/harfbuzz/src/hb-object.hh index e2c2c3394c..5cffe1666b 100644 --- a/thirdparty/harfbuzz/src/hb-object.hh +++ b/thirdparty/harfbuzz/src/hb-object.hh @@ -325,7 +325,7 @@ retry: hb_user_data_array_t *user_data = obj->header.user_data.get_acquire (); if (unlikely (!user_data)) { - user_data = (hb_user_data_array_t *) hb_calloc (sizeof (hb_user_data_array_t), 1); + user_data = (hb_user_data_array_t *) hb_calloc (1, sizeof (hb_user_data_array_t)); if (unlikely (!user_data)) return false; user_data->init (); diff --git a/thirdparty/harfbuzz/src/hb-open-type.hh b/thirdparty/harfbuzz/src/hb-open-type.hh index 6967bca3d4..9c11f14344 100644 --- a/thirdparty/harfbuzz/src/hb-open-type.hh +++ b/thirdparty/harfbuzz/src/hb-open-type.hh @@ -985,6 +985,13 @@ struct SortedArrayOf : ArrayOf<Type, LenType> return_trace (ret); } + SortedArrayOf* copy (hb_serialize_context_t *c) const + { + TRACE_SERIALIZE (this); + SortedArrayOf* out = reinterpret_cast<SortedArrayOf *> (ArrayOf<Type, LenType>::copy (c)); + return_trace (out); + } + template <typename T> Type &bsearch (const T &x, Type ¬_found = Crap (Type)) { return *as_array ().bsearch (x, ¬_found); } diff --git a/thirdparty/harfbuzz/src/hb-ot-cff-common.hh b/thirdparty/harfbuzz/src/hb-ot-cff-common.hh index 4fdba197ac..c7c3264c08 100644 --- a/thirdparty/harfbuzz/src/hb-ot-cff-common.hh +++ b/thirdparty/harfbuzz/src/hb-ot-cff-common.hh @@ -41,10 +41,21 @@ using namespace OT; using objidx_t = hb_serialize_context_t::objidx_t; using whence_t = hb_serialize_context_t::whence_t; -/* utility macro */ -template<typename Type> -static inline const Type& StructAtOffsetOrNull (const void *P, unsigned int offset) -{ return offset ? StructAtOffset<Type> (P, offset) : Null (Type); } +/* CFF offsets can technically be negative */ +template<typename Type, typename ...Ts> +static inline const Type& StructAtOffsetOrNull (const void *P, int offset, hb_sanitize_context_t &sc, Ts&&... 
ds) +{ + if (!offset) return Null (Type); + + const char *p = (const char *) P + offset; + if (!sc.check_point (p)) return Null (Type); + + const Type &obj = *reinterpret_cast<const Type *> (p); + if (!obj.sanitize (&sc, std::forward<Ts> (ds)...)) return Null (Type); + + return obj; +} + struct code_pair_t { diff --git a/thirdparty/harfbuzz/src/hb-ot-cff1-table.hh b/thirdparty/harfbuzz/src/hb-ot-cff1-table.hh index c869e90554..1bbd463841 100644 --- a/thirdparty/harfbuzz/src/hb-ot-cff1-table.hh +++ b/thirdparty/harfbuzz/src/hb-ot-cff1-table.hh @@ -763,9 +763,9 @@ struct cff1_top_dict_values_t : top_dict_values_t<cff1_top_dict_val_t> unsigned int ros_supplement; unsigned int cidCount; - unsigned int EncodingOffset; - unsigned int CharsetOffset; - unsigned int FDSelectOffset; + int EncodingOffset; + int CharsetOffset; + int FDSelectOffset; table_info_t privateDictInfo; }; @@ -821,24 +821,24 @@ struct cff1_top_dict_opset_t : top_dict_opset_t<cff1_top_dict_val_t> break; case OpCode_Encoding: - dictval.EncodingOffset = env.argStack.pop_uint (); + dictval.EncodingOffset = env.argStack.pop_int (); env.clear_args (); if (unlikely (dictval.EncodingOffset == 0)) return; break; case OpCode_charset: - dictval.CharsetOffset = env.argStack.pop_uint (); + dictval.CharsetOffset = env.argStack.pop_int (); env.clear_args (); if (unlikely (dictval.CharsetOffset == 0)) return; break; case OpCode_FDSelect: - dictval.FDSelectOffset = env.argStack.pop_uint (); + dictval.FDSelectOffset = env.argStack.pop_int (); env.clear_args (); break; case OpCode_Private: - dictval.privateDictInfo.offset = env.argStack.pop_uint (); + dictval.privateDictInfo.offset = env.argStack.pop_int (); dictval.privateDictInfo.size = env.argStack.pop_uint (); env.clear_args (); break; @@ -913,7 +913,7 @@ struct cff1_private_dict_values_base_t : dict_values_t<VAL> } void fini () { dict_values_t<VAL>::fini (); } - unsigned int subrsOffset; + int subrsOffset; const CFF1Subrs *localSubrs; }; @@ -948,7 +948,7 @@ struct cff1_private_dict_opset_t : dict_opset_t env.clear_args (); break; case OpCode_Subrs: - dictval.subrsOffset = env.argStack.pop_uint (); + dictval.subrsOffset = env.argStack.pop_int (); env.clear_args (); break; @@ -990,7 +990,7 @@ struct cff1_private_dict_opset_subset_t : dict_opset_t break; case OpCode_Subrs: - dictval.subrsOffset = env.argStack.pop_uint (); + dictval.subrsOffset = env.argStack.pop_int (); env.clear_args (); break; @@ -1090,8 +1090,8 @@ struct cff1 goto fail; hb_barrier (); - topDictIndex = &StructAtOffset<CFF1TopDictIndex> (nameIndex, nameIndex->get_size ()); - if ((topDictIndex == &Null (CFF1TopDictIndex)) || !topDictIndex->sanitize (&sc) || (topDictIndex->count == 0)) + topDictIndex = &StructAtOffsetOrNull<CFF1TopDictIndex> (nameIndex, nameIndex->get_size (), sc); + if (topDictIndex == &Null (CFF1TopDictIndex) || (topDictIndex->count == 0)) goto fail; hb_barrier (); @@ -1108,20 +1108,18 @@ struct cff1 charset = &Null (Charset); else { - charset = &StructAtOffsetOrNull<Charset> (cff, topDict.CharsetOffset); - if (unlikely ((charset == &Null (Charset)) || !charset->sanitize (&sc, &num_charset_entries))) goto fail; - hb_barrier (); + charset = &StructAtOffsetOrNull<Charset> (cff, topDict.CharsetOffset, sc, &num_charset_entries); + if (unlikely (charset == &Null (Charset))) goto fail; } fdCount = 1; if (is_CID ()) { - fdArray = &StructAtOffsetOrNull<CFF1FDArray> (cff, topDict.FDArrayOffset); - fdSelect = &StructAtOffsetOrNull<CFF1FDSelect> (cff, topDict.FDSelectOffset); - if (unlikely ((fdArray == &Null 
(CFF1FDArray)) || !fdArray->sanitize (&sc) || - (fdSelect == &Null (CFF1FDSelect)) || !fdSelect->sanitize (&sc, fdArray->count))) + fdArray = &StructAtOffsetOrNull<CFF1FDArray> (cff, topDict.FDArrayOffset, sc); + fdSelect = &StructAtOffsetOrNull<CFF1FDSelect> (cff, topDict.FDSelectOffset, sc, fdArray->count); + if (unlikely (fdArray == &Null (CFF1FDArray) || + fdSelect == &Null (CFF1FDSelect))) goto fail; - hb_barrier (); fdCount = fdArray->count; } @@ -1140,27 +1138,19 @@ struct cff1 { if (!is_predef_encoding ()) { - encoding = &StructAtOffsetOrNull<Encoding> (cff, topDict.EncodingOffset); - if (unlikely ((encoding == &Null (Encoding)) || !encoding->sanitize (&sc))) goto fail; - hb_barrier (); + encoding = &StructAtOffsetOrNull<Encoding> (cff, topDict.EncodingOffset, sc); + if (unlikely (encoding == &Null (Encoding))) goto fail; } } - stringIndex = &StructAtOffset<CFF1StringIndex> (topDictIndex, topDictIndex->get_size ()); - if ((stringIndex == &Null (CFF1StringIndex)) || !stringIndex->sanitize (&sc)) + stringIndex = &StructAtOffsetOrNull<CFF1StringIndex> (topDictIndex, topDictIndex->get_size (), sc); + if (stringIndex == &Null (CFF1StringIndex)) goto fail; - hb_barrier (); - globalSubrs = &StructAtOffset<CFF1Subrs> (stringIndex, stringIndex->get_size ()); - if ((globalSubrs != &Null (CFF1Subrs)) && !globalSubrs->sanitize (&sc)) + globalSubrs = &StructAtOffsetOrNull<CFF1Subrs> (stringIndex, stringIndex->get_size (), sc); + charStrings = &StructAtOffsetOrNull<CFF1CharStrings> (cff, topDict.charStringsOffset, sc); + if (charStrings == &Null (CFF1CharStrings)) goto fail; - hb_barrier (); - - charStrings = &StructAtOffsetOrNull<CFF1CharStrings> (cff, topDict.charStringsOffset); - - if ((charStrings == &Null (CFF1CharStrings)) || unlikely (!charStrings->sanitize (&sc))) - goto fail; - hb_barrier (); num_glyphs = charStrings->count; if (num_glyphs != sc.get_num_glyphs ()) @@ -1188,19 +1178,13 @@ struct cff1 font->init (); if (unlikely (!font_interp.interpret (*font))) goto fail; PRIVDICTVAL *priv = &privateDicts[i]; - const hb_ubytes_t privDictStr = StructAtOffset<UnsizedByteStr> (cff, font->privateDictInfo.offset).as_ubytes (font->privateDictInfo.size); - if (unlikely (!privDictStr.sanitize (&sc))) goto fail; - hb_barrier (); + const hb_ubytes_t privDictStr = StructAtOffsetOrNull<UnsizedByteStr> (cff, font->privateDictInfo.offset, sc, font->privateDictInfo.size).as_ubytes (font->privateDictInfo.size); num_interp_env_t env2 (privDictStr); dict_interpreter_t<PRIVOPSET, PRIVDICTVAL> priv_interp (env2); priv->init (); if (unlikely (!priv_interp.interpret (*priv))) goto fail; - priv->localSubrs = &StructAtOffsetOrNull<CFF1Subrs> (&privDictStr, priv->subrsOffset); - if (priv->localSubrs != &Null (CFF1Subrs) && - unlikely (!priv->localSubrs->sanitize (&sc))) - goto fail; - hb_barrier (); + priv->localSubrs = &StructAtOffsetOrNull<CFF1Subrs> (&privDictStr, priv->subrsOffset, sc); } } else /* non-CID */ @@ -1208,18 +1192,13 @@ struct cff1 cff1_top_dict_values_t *font = &topDict; PRIVDICTVAL *priv = &privateDicts[0]; - const hb_ubytes_t privDictStr = StructAtOffset<UnsizedByteStr> (cff, font->privateDictInfo.offset).as_ubytes (font->privateDictInfo.size); - if (unlikely (!privDictStr.sanitize (&sc))) goto fail; - hb_barrier (); + const hb_ubytes_t privDictStr = StructAtOffsetOrNull<UnsizedByteStr> (cff, font->privateDictInfo.offset, sc, font->privateDictInfo.size).as_ubytes (font->privateDictInfo.size); num_interp_env_t env (privDictStr); dict_interpreter_t<PRIVOPSET, PRIVDICTVAL> priv_interp (env); 
priv->init (); if (unlikely (!priv_interp.interpret (*priv))) goto fail; - priv->localSubrs = &StructAtOffsetOrNull<CFF1Subrs> (&privDictStr, priv->subrsOffset); - if (priv->localSubrs != &Null (CFF1Subrs) && - unlikely (!priv->localSubrs->sanitize (&sc))) - goto fail; + priv->localSubrs = &StructAtOffsetOrNull<CFF1Subrs> (&privDictStr, priv->subrsOffset, sc); hb_barrier (); } @@ -1437,7 +1416,7 @@ struct cff1 hb_sorted_vector_t<gname_t> *names = glyph_names.get_acquire (); if (unlikely (!names)) { - names = (hb_sorted_vector_t<gname_t> *) hb_calloc (sizeof (hb_sorted_vector_t<gname_t>), 1); + names = (hb_sorted_vector_t<gname_t> *) hb_calloc (1, sizeof (hb_sorted_vector_t<gname_t>)); if (likely (names)) { names->init (); diff --git a/thirdparty/harfbuzz/src/hb-ot-cff2-table.hh b/thirdparty/harfbuzz/src/hb-ot-cff2-table.hh index 652748b737..4b3bdc9315 100644 --- a/thirdparty/harfbuzz/src/hb-ot-cff2-table.hh +++ b/thirdparty/harfbuzz/src/hb-ot-cff2-table.hh @@ -111,7 +111,7 @@ struct CFF2FDSelect DEFINE_SIZE_MIN (2); }; -struct CFF2VariationStore +struct CFF2ItemVariationStore { bool sanitize (hb_sanitize_context_t *c) const { @@ -122,11 +122,11 @@ struct CFF2VariationStore varStore.sanitize (c)); } - bool serialize (hb_serialize_context_t *c, const CFF2VariationStore *varStore) + bool serialize (hb_serialize_context_t *c, const CFF2ItemVariationStore *varStore) { TRACE_SERIALIZE (this); unsigned int size_ = varStore->get_size (); - CFF2VariationStore *dest = c->allocate_size<CFF2VariationStore> (size_); + CFF2ItemVariationStore *dest = c->allocate_size<CFF2ItemVariationStore> (size_); if (unlikely (!dest)) return_trace (false); hb_memcpy (dest, varStore, size_); return_trace (true); @@ -135,9 +135,9 @@ struct CFF2VariationStore unsigned int get_size () const { return HBUINT16::static_size + size; } HBUINT16 size; - VariationStore varStore; + ItemVariationStore varStore; - DEFINE_SIZE_MIN (2 + VariationStore::min_size); + DEFINE_SIZE_MIN (2 + ItemVariationStore::min_size); }; struct cff2_top_dict_values_t : top_dict_values_t<> @@ -150,8 +150,8 @@ struct cff2_top_dict_values_t : top_dict_values_t<> } void fini () { top_dict_values_t<>::fini (); } - unsigned int vstoreOffset; - unsigned int FDSelectOffset; + int vstoreOffset; + int FDSelectOffset; }; struct cff2_top_dict_opset_t : top_dict_opset_t<> @@ -169,11 +169,11 @@ struct cff2_top_dict_opset_t : top_dict_opset_t<> break; case OpCode_vstore: - dictval.vstoreOffset = env.argStack.pop_uint (); + dictval.vstoreOffset = env.argStack.pop_int (); env.clear_args (); break; case OpCode_FDSelect: - dictval.FDSelectOffset = env.argStack.pop_uint (); + dictval.FDSelectOffset = env.argStack.pop_int (); env.clear_args (); break; @@ -241,7 +241,7 @@ struct cff2_private_dict_values_base_t : dict_values_t<VAL> } void fini () { dict_values_t<VAL>::fini (); } - unsigned int subrsOffset; + int subrsOffset; const CFF2Subrs *localSubrs; unsigned int ivs; }; @@ -295,7 +295,7 @@ struct cff2_private_dict_opset_t : dict_opset_t env.clear_args (); break; case OpCode_Subrs: - dictval.subrsOffset = env.argStack.pop_uint (); + dictval.subrsOffset = env.argStack.pop_int (); env.clear_args (); break; case OpCode_vsindexdict: @@ -344,7 +344,7 @@ struct cff2_private_dict_opset_subset_t : dict_opset_t return; case OpCode_Subrs: - dictval.subrsOffset = env.argStack.pop_uint (); + dictval.subrsOffset = env.argStack.pop_int (); env.clear_args (); break; @@ -426,18 +426,15 @@ struct cff2 if (unlikely (!top_interp.interpret (topDict))) goto fail; } - globalSubrs = 
&StructAtOffset<CFF2Subrs> (cff2, cff2->topDict + cff2->topDictSize); - varStore = &StructAtOffsetOrNull<CFF2VariationStore> (cff2, topDict.vstoreOffset); - charStrings = &StructAtOffsetOrNull<CFF2CharStrings> (cff2, topDict.charStringsOffset); - fdArray = &StructAtOffsetOrNull<CFF2FDArray> (cff2, topDict.FDArrayOffset); - fdSelect = &StructAtOffsetOrNull<CFF2FDSelect> (cff2, topDict.FDSelectOffset); - - if (((varStore != &Null (CFF2VariationStore)) && unlikely (!varStore->sanitize (&sc))) || - (charStrings == &Null (CFF2CharStrings)) || unlikely (!charStrings->sanitize (&sc)) || - (globalSubrs == &Null (CFF2Subrs)) || unlikely (!globalSubrs->sanitize (&sc)) || - (fdArray == &Null (CFF2FDArray)) || unlikely (!fdArray->sanitize (&sc)) || - !hb_barrier () || - (((fdSelect != &Null (CFF2FDSelect)) && unlikely (!fdSelect->sanitize (&sc, fdArray->count))))) + globalSubrs = &StructAtOffsetOrNull<CFF2Subrs> (cff2, cff2->topDict + cff2->topDictSize, sc); + varStore = &StructAtOffsetOrNull<CFF2ItemVariationStore> (cff2, topDict.vstoreOffset, sc); + charStrings = &StructAtOffsetOrNull<CFF2CharStrings> (cff2, topDict.charStringsOffset, sc); + fdArray = &StructAtOffsetOrNull<CFF2FDArray> (cff2, topDict.FDArrayOffset, sc); + fdSelect = &StructAtOffsetOrNull<CFF2FDSelect> (cff2, topDict.FDSelectOffset, sc, fdArray->count); + + if (charStrings == &Null (CFF2CharStrings) || + globalSubrs == &Null (CFF2Subrs) || + fdArray == &Null (CFF2FDArray)) goto fail; num_glyphs = charStrings->count; @@ -462,19 +459,13 @@ struct cff2 font->init (); if (unlikely (!font_interp.interpret (*font))) goto fail; - const hb_ubytes_t privDictStr = StructAtOffsetOrNull<UnsizedByteStr> (cff2, font->privateDictInfo.offset).as_ubytes (font->privateDictInfo.size); - if (unlikely (!privDictStr.sanitize (&sc))) goto fail; - hb_barrier (); + const hb_ubytes_t privDictStr = StructAtOffsetOrNull<UnsizedByteStr> (cff2, font->privateDictInfo.offset, sc, font->privateDictInfo.size).as_ubytes (font->privateDictInfo.size); cff2_priv_dict_interp_env_t env2 (privDictStr); dict_interpreter_t<PRIVOPSET, PRIVDICTVAL, cff2_priv_dict_interp_env_t> priv_interp (env2); privateDicts[i].init (); if (unlikely (!priv_interp.interpret (privateDicts[i]))) goto fail; - privateDicts[i].localSubrs = &StructAtOffsetOrNull<CFF2Subrs> (&privDictStr[0], privateDicts[i].subrsOffset); - if (privateDicts[i].localSubrs != &Null (CFF2Subrs) && - unlikely (!privateDicts[i].localSubrs->sanitize (&sc))) - goto fail; - hb_barrier (); + privateDicts[i].localSubrs = &StructAtOffsetOrNull<CFF2Subrs> (&privDictStr[0], privateDicts[i].subrsOffset, sc); } return; @@ -509,7 +500,7 @@ struct cff2 hb_blob_t *blob = nullptr; cff2_top_dict_values_t topDict; const CFF2Subrs *globalSubrs = nullptr; - const CFF2VariationStore *varStore = nullptr; + const CFF2ItemVariationStore *varStore = nullptr; const CFF2CharStrings *charStrings = nullptr; const CFF2FDArray *fdArray = nullptr; const CFF2FDSelect *fdSelect = nullptr; diff --git a/thirdparty/harfbuzz/src/hb-ot-cmap-table.hh b/thirdparty/harfbuzz/src/hb-ot-cmap-table.hh index e2e2581855..64d2b13880 100644 --- a/thirdparty/harfbuzz/src/hb-ot-cmap-table.hh +++ b/thirdparty/harfbuzz/src/hb-ot-cmap-table.hh @@ -41,6 +41,30 @@ namespace OT { +static inline uint8_t unicode_to_macroman (hb_codepoint_t u) +{ + uint16_t mapping[] = { + 0x00C4, 0x00C5, 0x00C7, 0x00C9, 0x00D1, 0x00D6, 0x00DC, 0x00E1, + 0x00E0, 0x00E2, 0x00E4, 0x00E3, 0x00E5, 0x00E7, 0x00E9, 0x00E8, + 0x00EA, 0x00EB, 0x00ED, 0x00EC, 0x00EE, 0x00EF, 0x00F1, 0x00F3, + 0x00F2, 
0x00F4, 0x00F6, 0x00F5, 0x00FA, 0x00F9, 0x00FB, 0x00FC, + 0x2020, 0x00B0, 0x00A2, 0x00A3, 0x00A7, 0x2022, 0x00B6, 0x00DF, + 0x00AE, 0x00A9, 0x2122, 0x00B4, 0x00A8, 0x2260, 0x00C6, 0x00D8, + 0x221E, 0x00B1, 0x2264, 0x2265, 0x00A5, 0x00B5, 0x2202, 0x2211, + 0x220F, 0x03C0, 0x222B, 0x00AA, 0x00BA, 0x03A9, 0x00E6, 0x00F8, + 0x00BF, 0x00A1, 0x00AC, 0x221A, 0x0192, 0x2248, 0x2206, 0x00AB, + 0x00BB, 0x2026, 0x00A0, 0x00C0, 0x00C3, 0x00D5, 0x0152, 0x0153, + 0x2013, 0x2014, 0x201C, 0x201D, 0x2018, 0x2019, 0x00F7, 0x25CA, + 0x00FF, 0x0178, 0x2044, 0x20AC, 0x2039, 0x203A, 0xFB01, 0xFB02, + 0x2021, 0x00B7, 0x201A, 0x201E, 0x2030, 0x00C2, 0x00CA, 0x00C1, + 0x00CB, 0x00C8, 0x00CD, 0x00CE, 0x00CF, 0x00CC, 0x00D3, 0x00D4, + 0xF8FF, 0x00D2, 0x00DA, 0x00DB, 0x00D9, 0x0131, 0x02C6, 0x02DC, + 0x00AF, 0x02D8, 0x02D9, 0x02DA, 0x00B8, 0x02DD, 0x02DB, 0x02C7 + }; + uint16_t *c = hb_bsearch (u, mapping, ARRAY_LENGTH (mapping), sizeof (mapping[0]), + _hb_cmp_operator<uint16_t, uint16_t>); + return c ? (c - mapping) + 0x7F : 0; +} struct CmapSubtableFormat0 { @@ -1465,8 +1489,11 @@ struct EncodingRecord int ret; ret = platformID.cmp (other.platformID); if (ret) return ret; - ret = encodingID.cmp (other.encodingID); - if (ret) return ret; + if (other.encodingID != 0xFFFF) + { + ret = encodingID.cmp (other.encodingID); + if (ret) return ret; + } return 0; } @@ -1814,9 +1841,13 @@ struct cmap c->plan)); } - const CmapSubtable *find_best_subtable (bool *symbol = nullptr) const + const CmapSubtable *find_best_subtable (bool *symbol = nullptr, + bool *mac = nullptr, + bool *macroman = nullptr) const { if (symbol) *symbol = false; + if (mac) *mac = false; + if (macroman) *macroman = false; const CmapSubtable *subtable; @@ -1841,6 +1872,20 @@ struct cmap if ((subtable = this->find_subtable (0, 1))) return subtable; if ((subtable = this->find_subtable (0, 0))) return subtable; + /* MacRoman subtable. */ + if ((subtable = this->find_subtable (1, 0))) + { + if (mac) *mac = true; + if (macroman) *macroman = true; + return subtable; + } + /* Any other Mac subtable; we just map ASCII for these. */ + if ((subtable = this->find_subtable (1, 0xFFFF))) + { + if (mac) *mac = true; + return subtable; + } + /* Meh. */ return &Null (CmapSubtable); } @@ -1852,8 +1897,8 @@ struct cmap accelerator_t (hb_face_t *face) { this->table = hb_sanitize_context_t ().reference_table<cmap> (face); - bool symbol; - this->subtable = table->find_best_subtable (&symbol); + bool symbol, mac, macroman; + this->subtable = table->find_best_subtable (&symbol, &mac, ¯oman); this->subtable_uvs = &Null (CmapSubtableFormat14); { const CmapSubtable *st = table->find_subtable (0, 5); @@ -1862,6 +1907,7 @@ struct cmap } this->get_glyph_data = subtable; +#ifndef HB_NO_CMAP_LEGACY_SUBTABLES if (unlikely (symbol)) { switch ((unsigned) face->table.OS2->get_font_page ()) { @@ -1881,7 +1927,16 @@ struct cmap break; } } + else if (unlikely (macroman)) + { + this->get_glyph_funcZ = get_glyph_from_macroman<CmapSubtable>; + } + else if (unlikely (mac)) + { + this->get_glyph_funcZ = get_glyph_from_ascii<CmapSubtable>; + } else +#endif { switch (subtable->u.format) { /* Accelerate format 4 and format 12. 
*/ @@ -1924,7 +1979,7 @@ struct cmap hb_codepoint_t *glyph, cache_t *cache = nullptr) const { - if (unlikely (!this->get_glyph_funcZ)) return 0; + if (unlikely (!this->get_glyph_funcZ)) return false; return _cached_get (unicode, glyph, cache); } @@ -2006,6 +2061,28 @@ struct cmap return false; } + template <typename Type> + HB_INTERNAL static bool get_glyph_from_ascii (const void *obj, + hb_codepoint_t codepoint, + hb_codepoint_t *glyph) + { + const Type *typed_obj = (const Type *) obj; + return codepoint < 0x80 && typed_obj->get_glyph (codepoint, glyph); + } + + template <typename Type> + HB_INTERNAL static bool get_glyph_from_macroman (const void *obj, + hb_codepoint_t codepoint, + hb_codepoint_t *glyph) + { + if (get_glyph_from_ascii<Type> (obj, codepoint, glyph)) + return true; + + const Type *typed_obj = (const Type *) obj; + unsigned c = unicode_to_macroman (codepoint); + return c && typed_obj->get_glyph (c, glyph); + } + private: hb_nonnull_ptr_t<const CmapSubtable> subtable; hb_nonnull_ptr_t<const CmapSubtableFormat14> subtable_uvs; @@ -2035,28 +2112,6 @@ struct cmap return &(this+result.subtable); } - const EncodingRecord *find_encodingrec (unsigned int platform_id, - unsigned int encoding_id) const - { - EncodingRecord key; - key.platformID = platform_id; - key.encodingID = encoding_id; - - return encodingRecord.as_array ().bsearch (key); - } - - bool find_subtable (unsigned format) const - { - auto it = - + hb_iter (encodingRecord) - | hb_map (&EncodingRecord::subtable) - | hb_map (hb_add (this)) - | hb_filter ([&] (const CmapSubtable& _) { return _.u.format == format; }) - ; - - return it.len (); - } - public: bool sanitize (hb_sanitize_context_t *c) const diff --git a/thirdparty/harfbuzz/src/hb-ot-font.cc b/thirdparty/harfbuzz/src/hb-ot-font.cc index b3677c6a4c..1da869d697 100644 --- a/thirdparty/harfbuzz/src/hb-ot-font.cc +++ b/thirdparty/harfbuzz/src/hb-ot-font.cc @@ -208,12 +208,12 @@ hb_ot_get_glyph_h_advances (hb_font_t* font, void* font_data, #if !defined(HB_NO_VAR) && !defined(HB_NO_OT_FONT_ADVANCE_CACHE) const OT::HVAR &HVAR = *hmtx.var_table; - const OT::VariationStore &varStore = &HVAR + HVAR.varStore; - OT::VariationStore::cache_t *varStore_cache = font->num_coords * count >= 128 ? varStore.create_cache () : nullptr; + const OT::ItemVariationStore &varStore = &HVAR + HVAR.varStore; + OT::ItemVariationStore::cache_t *varStore_cache = font->num_coords * count >= 128 ? varStore.create_cache () : nullptr; bool use_cache = font->num_coords; #else - OT::VariationStore::cache_t *varStore_cache = nullptr; + OT::ItemVariationStore::cache_t *varStore_cache = nullptr; bool use_cache = false; #endif @@ -277,7 +277,7 @@ hb_ot_get_glyph_h_advances (hb_font_t* font, void* font_data, } #if !defined(HB_NO_VAR) && !defined(HB_NO_OT_FONT_ADVANCE_CACHE) - OT::VariationStore::destroy_cache (varStore_cache); + OT::ItemVariationStore::destroy_cache (varStore_cache); #endif if (font->x_strength && !font->embolden_in_place) @@ -313,10 +313,10 @@ hb_ot_get_glyph_v_advances (hb_font_t* font, void* font_data, { #if !defined(HB_NO_VAR) && !defined(HB_NO_OT_FONT_ADVANCE_CACHE) const OT::VVAR &VVAR = *vmtx.var_table; - const OT::VariationStore &varStore = &VVAR + VVAR.varStore; - OT::VariationStore::cache_t *varStore_cache = font->num_coords ? varStore.create_cache () : nullptr; + const OT::ItemVariationStore &varStore = &VVAR + VVAR.varStore; + OT::ItemVariationStore::cache_t *varStore_cache = font->num_coords ? 
varStore.create_cache () : nullptr; #else - OT::VariationStore::cache_t *varStore_cache = nullptr; + OT::ItemVariationStore::cache_t *varStore_cache = nullptr; #endif for (unsigned int i = 0; i < count; i++) @@ -327,7 +327,7 @@ hb_ot_get_glyph_v_advances (hb_font_t* font, void* font_data, } #if !defined(HB_NO_VAR) && !defined(HB_NO_OT_FONT_ADVANCE_CACHE) - OT::VariationStore::destroy_cache (varStore_cache); + OT::ItemVariationStore::destroy_cache (varStore_cache); #endif } else diff --git a/thirdparty/harfbuzz/src/hb-ot-hmtx-table.hh b/thirdparty/harfbuzz/src/hb-ot-hmtx-table.hh index 89640b43f1..48bd536121 100644 --- a/thirdparty/harfbuzz/src/hb-ot-hmtx-table.hh +++ b/thirdparty/harfbuzz/src/hb-ot-hmtx-table.hh @@ -145,6 +145,29 @@ struct hmtxvmtx table->minTrailingBearing = min_rsb; table->maxExtent = max_extent; } + + if (T::is_horizontal) + { + const auto &OS2 = *c->plan->source->table.OS2; + if (OS2.has_data () && + table->ascender == OS2.sTypoAscender && + table->descender == OS2.sTypoDescender && + table->lineGap == OS2.sTypoLineGap) + { + table->ascender = static_cast<int> (roundf (OS2.sTypoAscender + + MVAR.get_var (HB_OT_METRICS_TAG_HORIZONTAL_ASCENDER, + c->plan->normalized_coords.arrayZ, + c->plan->normalized_coords.length))); + table->descender = static_cast<int> (roundf (OS2.sTypoDescender + + MVAR.get_var (HB_OT_METRICS_TAG_HORIZONTAL_DESCENDER, + c->plan->normalized_coords.arrayZ, + c->plan->normalized_coords.length))); + table->lineGap = static_cast<int> (roundf (OS2.sTypoLineGap + + MVAR.get_var (HB_OT_METRICS_TAG_HORIZONTAL_LINE_GAP, + c->plan->normalized_coords.arrayZ, + c->plan->normalized_coords.length))); + } + } } #endif @@ -374,7 +397,7 @@ struct hmtxvmtx unsigned get_advance_with_var_unscaled (hb_codepoint_t glyph, hb_font_t *font, - VariationStore::cache_t *store_cache = nullptr) const + ItemVariationStore::cache_t *store_cache = nullptr) const { unsigned int advance = get_advance_without_var_unscaled (glyph); diff --git a/thirdparty/harfbuzz/src/hb-ot-layout-base-table.hh b/thirdparty/harfbuzz/src/hb-ot-layout-base-table.hh index a23b6377d1..0278399069 100644 --- a/thirdparty/harfbuzz/src/hb-ot-layout-base-table.hh +++ b/thirdparty/harfbuzz/src/hb-ot-layout-base-table.hh @@ -46,6 +46,12 @@ struct BaseCoordFormat1 return HB_DIRECTION_IS_HORIZONTAL (direction) ? font->em_scale_y (coordinate) : font->em_scale_x (coordinate); } + bool subset (hb_subset_context_t *c) const + { + TRACE_SUBSET (this); + return_trace ((bool) c->serializer->embed (*this)); + } + bool sanitize (hb_sanitize_context_t *c) const { TRACE_SANITIZE (this); @@ -67,6 +73,17 @@ struct BaseCoordFormat2 return HB_DIRECTION_IS_HORIZONTAL (direction) ? 
font->em_scale_y (coordinate) : font->em_scale_x (coordinate); } + bool subset (hb_subset_context_t *c) const + { + TRACE_SUBSET (this); + auto *out = c->serializer->embed (*this); + if (unlikely (!out)) return_trace (false); + + return_trace (c->serializer->check_assign (out->referenceGlyph, + c->plan->glyph_map->get (referenceGlyph), + HB_SERIALIZE_ERROR_INT_OVERFLOW)); + } + bool sanitize (hb_sanitize_context_t *c) const { TRACE_SANITIZE (this); @@ -86,7 +103,7 @@ struct BaseCoordFormat2 struct BaseCoordFormat3 { hb_position_t get_coord (hb_font_t *font, - const VariationStore &var_store, + const ItemVariationStore &var_store, hb_direction_t direction) const { const Device &device = this+deviceTable; @@ -96,6 +113,23 @@ struct BaseCoordFormat3 : font->em_scale_x (coordinate) + device.get_x_delta (font, var_store); } + void collect_variation_indices (hb_set_t& varidx_set /* OUT */) const + { + unsigned varidx = (this+deviceTable).get_variation_index (); + varidx_set.add (varidx); + } + + bool subset (hb_subset_context_t *c) const + { + TRACE_SUBSET (this); + auto *out = c->serializer->embed (*this); + if (unlikely (!out)) return_trace (false); + + return_trace (out->deviceTable.serialize_copy (c->serializer, deviceTable, + this, 0, + hb_serialize_context_t::Head, + &c->plan->base_variation_idx_map)); + } bool sanitize (hb_sanitize_context_t *c) const { @@ -120,7 +154,7 @@ struct BaseCoord bool has_data () const { return u.format; } hb_position_t get_coord (hb_font_t *font, - const VariationStore &var_store, + const ItemVariationStore &var_store, hb_direction_t direction) const { switch (u.format) { @@ -131,6 +165,27 @@ struct BaseCoord } } + void collect_variation_indices (hb_set_t& varidx_set /* OUT */) const + { + switch (u.format) { + case 3: u.format3.collect_variation_indices (varidx_set); + default:return; + } + } + + template <typename context_t, typename ...Ts> + typename context_t::return_t dispatch (context_t *c, Ts&&... 
ds) const + { + if (unlikely (!c->may_dispatch (this, &u.format))) return c->no_dispatch_return_value (); + TRACE_DISPATCH (this, u.format); + switch (u.format) { + case 1: return_trace (c->dispatch (u.format1, std::forward<Ts> (ds)...)); + case 2: return_trace (c->dispatch (u.format2, std::forward<Ts> (ds)...)); + case 3: return_trace (c->dispatch (u.format3, std::forward<Ts> (ds)...)); + default:return_trace (c->default_return_value ()); + } + } + bool sanitize (hb_sanitize_context_t *c) const { TRACE_SANITIZE (this); @@ -161,12 +216,37 @@ struct FeatMinMaxRecord bool has_data () const { return tag; } + hb_tag_t get_feature_tag () const { return tag; } + void get_min_max (const BaseCoord **min, const BaseCoord **max) const { if (likely (min)) *min = &(this+minCoord); if (likely (max)) *max = &(this+maxCoord); } + void collect_variation_indices (const hb_subset_plan_t* plan, + const void *base, + hb_set_t& varidx_set /* OUT */) const + { + if (!plan->layout_features.has (tag)) + return; + + (base+minCoord).collect_variation_indices (varidx_set); + (base+maxCoord).collect_variation_indices (varidx_set); + } + + bool subset (hb_subset_context_t *c, + const void *base) const + { + TRACE_SUBSET (this); + auto *out = c->serializer->embed (*this); + if (unlikely (!out)) return_trace (false); + if (!(out->minCoord.serialize_subset (c, minCoord, base))) + return_trace (false); + + return_trace (out->maxCoord.serialize_subset (c, maxCoord, base)); + } + bool sanitize (hb_sanitize_context_t *c, const void *base) const { TRACE_SANITIZE (this); @@ -206,6 +286,39 @@ struct MinMax } } + void collect_variation_indices (const hb_subset_plan_t* plan, + hb_set_t& varidx_set /* OUT */) const + { + (this+minCoord).collect_variation_indices (varidx_set); + (this+maxCoord).collect_variation_indices (varidx_set); + for (const FeatMinMaxRecord& record : featMinMaxRecords) + record.collect_variation_indices (plan, this, varidx_set); + } + + bool subset (hb_subset_context_t *c) const + { + TRACE_SUBSET (this); + auto *out = c->serializer->start_embed (*this); + if (unlikely (!out || !c->serializer->extend_min (out))) return_trace (false); + + if (!(out->minCoord.serialize_subset (c, minCoord, this)) || + !(out->maxCoord.serialize_subset (c, maxCoord, this))) + return_trace (false); + + unsigned len = 0; + for (const FeatMinMaxRecord& _ : featMinMaxRecords) + { + hb_tag_t feature_tag = _.get_feature_tag (); + if (!c->plan->layout_features.has (feature_tag)) + continue; + + if (!_.subset (c, this)) return false; + len++; + } + return_trace (c->serializer->check_assign (out->featMinMaxRecords.len, len, + HB_SERIALIZE_ERROR_INT_OVERFLOW)); + } + bool sanitize (hb_sanitize_context_t *c) const { TRACE_SANITIZE (this); @@ -240,6 +353,26 @@ struct BaseValues return this+baseCoords[baseline_tag_index]; } + void collect_variation_indices (hb_set_t& varidx_set /* OUT */) const + { + for (const auto& _ : baseCoords) + (this+_).collect_variation_indices (varidx_set); + } + + bool subset (hb_subset_context_t *c) const + { + TRACE_SUBSET (this); + auto *out = c->serializer->start_embed (*this); + if (unlikely (!out || !c->serializer->extend_min (out))) return_trace (false); + out->defaultIndex = defaultIndex; + + for (const auto& _ : baseCoords) + if (!subset_offset_array (c, out->baseCoords, this) (_)) + return_trace (false); + + return_trace (bool (out->baseCoords)); + } + bool sanitize (hb_sanitize_context_t *c) const { TRACE_SANITIZE (this); @@ -270,6 +403,20 @@ struct BaseLangSysRecord const MinMax &get_min_max () const { 
return this+minMax; } + void collect_variation_indices (const hb_subset_plan_t* plan, + hb_set_t& varidx_set /* OUT */) const + { (this+minMax).collect_variation_indices (plan, varidx_set); } + + bool subset (hb_subset_context_t *c, + const void *base) const + { + TRACE_SUBSET (this); + auto *out = c->serializer->embed (*this); + if (unlikely (!out)) return_trace (false); + + return_trace (out->minMax.serialize_subset (c, minMax, base)); + } + bool sanitize (hb_sanitize_context_t *c, const void *base) const { TRACE_SANITIZE (this); @@ -300,6 +447,35 @@ struct BaseScript bool has_values () const { return baseValues; } bool has_min_max () const { return defaultMinMax; /* TODO What if only per-language is present? */ } + void collect_variation_indices (const hb_subset_plan_t* plan, + hb_set_t& varidx_set /* OUT */) const + { + (this+baseValues).collect_variation_indices (varidx_set); + (this+defaultMinMax).collect_variation_indices (plan, varidx_set); + + for (const BaseLangSysRecord& _ : baseLangSysRecords) + _.collect_variation_indices (plan, varidx_set); + } + + bool subset (hb_subset_context_t *c) const + { + TRACE_SUBSET (this); + auto *out = c->serializer->start_embed (*this); + if (unlikely (!out || !c->serializer->extend_min (out))) return_trace (false); + + if (baseValues && !out->baseValues.serialize_subset (c, baseValues, this)) + return_trace (false); + + if (defaultMinMax && !out->defaultMinMax.serialize_subset (c, defaultMinMax, this)) + return_trace (false); + + for (const auto& _ : baseLangSysRecords) + if (!_.subset (c, this)) return_trace (false); + + return_trace (c->serializer->check_assign (out->baseLangSysRecords.len, baseLangSysRecords.len, + HB_SERIALIZE_ERROR_INT_OVERFLOW)); + } + bool sanitize (hb_sanitize_context_t *c) const { TRACE_SANITIZE (this); @@ -332,9 +508,31 @@ struct BaseScriptRecord bool has_data () const { return baseScriptTag; } + hb_tag_t get_script_tag () const { return baseScriptTag; } + const BaseScript &get_base_script (const BaseScriptList *list) const { return list+baseScript; } + void collect_variation_indices (const hb_subset_plan_t* plan, + const void* list, + hb_set_t& varidx_set /* OUT */) const + { + if (!plan->layout_scripts.has (baseScriptTag)) + return; + + (list+baseScript).collect_variation_indices (plan, varidx_set); + } + + bool subset (hb_subset_context_t *c, + const void *base) const + { + TRACE_SUBSET (this); + auto *out = c->serializer->embed (*this); + if (unlikely (!out)) return_trace (false); + + return_trace (out->baseScript.serialize_subset (c, baseScript, base)); + } + bool sanitize (hb_sanitize_context_t *c, const void *base) const { TRACE_SANITIZE (this); @@ -361,6 +559,33 @@ struct BaseScriptList return record->has_data () ? 
record->get_base_script (this) : Null (BaseScript); } + void collect_variation_indices (const hb_subset_plan_t* plan, + hb_set_t& varidx_set /* OUT */) const + { + for (const BaseScriptRecord& _ : baseScriptRecords) + _.collect_variation_indices (plan, this, varidx_set); + } + + bool subset (hb_subset_context_t *c) const + { + TRACE_SUBSET (this); + auto *out = c->serializer->start_embed (*this); + if (unlikely (!out || !c->serializer->extend_min (out))) return_trace (false); + + unsigned len = 0; + for (const BaseScriptRecord& _ : baseScriptRecords) + { + hb_tag_t script_tag = _.get_script_tag (); + if (!c->plan->layout_scripts.has (script_tag)) + continue; + + if (!_.subset (c, this)) return false; + len++; + } + return_trace (c->serializer->check_assign (out->baseScriptRecords.len, len, + HB_SERIALIZE_ERROR_INT_OVERFLOW)); + } + bool sanitize (hb_sanitize_context_t *c) const { TRACE_SANITIZE (this); @@ -422,6 +647,20 @@ struct Axis return true; } + void collect_variation_indices (const hb_subset_plan_t* plan, + hb_set_t& varidx_set /* OUT */) const + { (this+baseScriptList).collect_variation_indices (plan, varidx_set); } + + bool subset (hb_subset_context_t *c) const + { + TRACE_SUBSET (this); + auto *out = c->serializer->embed (*this); + if (unlikely (!out)) return_trace (false); + + out->baseTagList.serialize_copy (c->serializer, baseTagList, this); + return_trace (out->baseScriptList.serialize_subset (c, baseScriptList, this)); + } + bool sanitize (hb_sanitize_context_t *c) const { TRACE_SANITIZE (this); @@ -453,8 +692,41 @@ struct BASE const Axis &get_axis (hb_direction_t direction) const { return HB_DIRECTION_IS_VERTICAL (direction) ? this+vAxis : this+hAxis; } - const VariationStore &get_var_store () const - { return version.to_int () < 0x00010001u ? Null (VariationStore) : this+varStore; } + bool has_var_store () const + { return version.to_int () >= 0x00010001u && varStore != 0; } + + const ItemVariationStore &get_var_store () const + { return version.to_int () < 0x00010001u ? 
Null (ItemVariationStore) : this+varStore; } + + void collect_variation_indices (const hb_subset_plan_t* plan, + hb_set_t& varidx_set /* OUT */) const + { + (this+hAxis).collect_variation_indices (plan, varidx_set); + (this+vAxis).collect_variation_indices (plan, varidx_set); + } + + bool subset (hb_subset_context_t *c) const + { + TRACE_SUBSET (this); + auto *out = c->serializer->start_embed (*this); + if (unlikely (!out || !c->serializer->extend_min (out))) return_trace (false); + + out->version = version; + if (hAxis && !out->hAxis.serialize_subset (c, hAxis, this)) + return_trace (false); + + if (vAxis && !out->vAxis.serialize_subset (c, vAxis, this)) + return_trace (false); + + if (has_var_store ()) + { + if (!c->serializer->allocate_size<Offset32To<ItemVariationStore>> (Offset32To<ItemVariationStore>::static_size)) + return_trace (false); + return_trace (out->varStore.serialize_subset (c, varStore, this, c->plan->base_varstore_inner_maps.as_array ())); + } + + return_trace (true); + } bool get_baseline (hb_font_t *font, hb_tag_t baseline_tag, @@ -487,7 +759,7 @@ struct BASE &min_coord, &max_coord)) return false; - const VariationStore &var_store = get_var_store (); + const ItemVariationStore &var_store = get_var_store (); if (likely (min && min_coord)) *min = min_coord->get_coord (font, var_store, direction); if (likely (max && max_coord)) *max = max_coord->get_coord (font, var_store, direction); return true; @@ -510,7 +782,7 @@ struct BASE * of BASE table (may be NULL) */ Offset16To<Axis>vAxis; /* Offset to vertical Axis table, from beginning * of BASE table (may be NULL) */ - Offset32To<VariationStore> + Offset32To<ItemVariationStore> varStore; /* Offset to the table of Item Variation * Store--from beginning of BASE * header (may be NULL). Introduced diff --git a/thirdparty/harfbuzz/src/hb-ot-layout-common.hh b/thirdparty/harfbuzz/src/hb-ot-layout-common.hh index 6b359cceb7..aba427368c 100644 --- a/thirdparty/harfbuzz/src/hb-ot-layout-common.hh +++ b/thirdparty/harfbuzz/src/hb-ot-layout-common.hh @@ -188,7 +188,7 @@ struct hb_subset_layout_context_t : unsigned lookup_index_count; }; -struct VariationStore; +struct ItemVariationStore; struct hb_collect_variation_indices_context_t : hb_dispatch_context_t<hb_collect_variation_indices_context_t> { @@ -3036,7 +3036,7 @@ struct VarData DEFINE_SIZE_ARRAY (6, regionIndices); }; -struct VariationStore +struct ItemVariationStore { friend struct item_variations_t; using cache_t = VarRegionList::cache_t; @@ -3141,7 +3141,7 @@ struct VariationStore } bool serialize (hb_serialize_context_t *c, - const VariationStore *src, + const ItemVariationStore *src, const hb_array_t <const hb_inc_bimap_t> &inner_maps) { TRACE_SERIALIZE (this); @@ -3197,7 +3197,7 @@ struct VariationStore return_trace (true); } - VariationStore *copy (hb_serialize_context_t *c) const + ItemVariationStore *copy (hb_serialize_context_t *c) const { TRACE_SERIALIZE (this); auto *out = c->start_embed (this); @@ -3227,7 +3227,7 @@ struct VariationStore return_trace (false); #endif - VariationStore *varstore_prime = c->serializer->start_embed<VariationStore> (); + ItemVariationStore *varstore_prime = c->serializer->start_embed<ItemVariationStore> (); if (unlikely (!varstore_prime)) return_trace (false); varstore_prime->serialize (c->serializer, this, inner_maps); @@ -4030,13 +4030,13 @@ struct VariationDevice private: hb_position_t get_x_delta (hb_font_t *font, - const VariationStore &store, - VariationStore::cache_t *store_cache = nullptr) const + const ItemVariationStore &store, + 
ItemVariationStore::cache_t *store_cache = nullptr) const { return font->em_scalef_x (get_delta (font, store, store_cache)); } hb_position_t get_y_delta (hb_font_t *font, - const VariationStore &store, - VariationStore::cache_t *store_cache = nullptr) const + const ItemVariationStore &store, + ItemVariationStore::cache_t *store_cache = nullptr) const { return font->em_scalef_y (get_delta (font, store, store_cache)); } VariationDevice* copy (hb_serialize_context_t *c, @@ -4070,10 +4070,10 @@ struct VariationDevice private: float get_delta (hb_font_t *font, - const VariationStore &store, - VariationStore::cache_t *store_cache = nullptr) const + const ItemVariationStore &store, + ItemVariationStore::cache_t *store_cache = nullptr) const { - return store.get_delta (varIdx, font->coords, font->num_coords, (VariationStore::cache_t *) store_cache); + return store.get_delta (varIdx, font->coords, font->num_coords, (ItemVariationStore::cache_t *) store_cache); } protected: @@ -4097,8 +4097,8 @@ struct DeviceHeader struct Device { hb_position_t get_x_delta (hb_font_t *font, - const VariationStore &store=Null (VariationStore), - VariationStore::cache_t *store_cache = nullptr) const + const ItemVariationStore &store=Null (ItemVariationStore), + ItemVariationStore::cache_t *store_cache = nullptr) const { switch (u.b.format) { @@ -4115,8 +4115,8 @@ struct Device } } hb_position_t get_y_delta (hb_font_t *font, - const VariationStore &store=Null (VariationStore), - VariationStore::cache_t *store_cache = nullptr) const + const ItemVariationStore &store=Null (ItemVariationStore), + ItemVariationStore::cache_t *store_cache = nullptr) const { switch (u.b.format) { diff --git a/thirdparty/harfbuzz/src/hb-ot-layout-gsubgpos.hh b/thirdparty/harfbuzz/src/hb-ot-layout-gsubgpos.hh index 499ad673e4..c65ea32b8a 100644 --- a/thirdparty/harfbuzz/src/hb-ot-layout-gsubgpos.hh +++ b/thirdparty/harfbuzz/src/hb-ot-layout-gsubgpos.hh @@ -708,8 +708,8 @@ struct hb_ot_apply_context_t : recurse_func_t recurse_func = nullptr; const GDEF &gdef; const GDEF::accelerator_t &gdef_accel; - const VariationStore &var_store; - VariationStore::cache_t *var_store_cache; + const ItemVariationStore &var_store; + ItemVariationStore::cache_t *var_store_cache; hb_set_digest_t digest; hb_direction_t direction; @@ -723,7 +723,6 @@ struct hb_ot_apply_context_t : bool auto_zwj = true; bool per_syllable = false; bool random = false; - uint32_t random_state = 1; unsigned new_syllables = (unsigned) -1; signed last_base = -1; // GPOS uses @@ -766,7 +765,7 @@ struct hb_ot_apply_context_t : ~hb_ot_apply_context_t () { #ifndef HB_NO_VAR - VariationStore::destroy_cache (var_store_cache); + ItemVariationStore::destroy_cache (var_store_cache); #endif } @@ -788,8 +787,8 @@ struct hb_ot_apply_context_t : uint32_t random_number () { /* http://www.cplusplus.com/reference/random/minstd_rand/ */ - random_state = random_state * 48271 % 2147483647; - return random_state; + buffer->random_state = buffer->random_state * 48271 % 2147483647; + return buffer->random_state; } bool match_properties_mark (hb_codepoint_t glyph, diff --git a/thirdparty/harfbuzz/src/hb-ot-layout.cc b/thirdparty/harfbuzz/src/hb-ot-layout.cc index 2eb8535db5..a4c13abadf 100644 --- a/thirdparty/harfbuzz/src/hb-ot-layout.cc +++ b/thirdparty/harfbuzz/src/hb-ot-layout.cc @@ -2127,7 +2127,7 @@ hb_ot_layout_get_font_extents (hb_font_t *font, hb_tag_t language_tag, hb_font_extents_t *extents) { - hb_position_t min, max; + hb_position_t min = 0, max = 0; if (font->face->table.BASE->get_min_max (font, 
direction, script_tag, language_tag, HB_TAG_NONE, &min, &max)) { diff --git a/thirdparty/harfbuzz/src/hb-ot-math-table.hh b/thirdparty/harfbuzz/src/hb-ot-math-table.hh index 32e497aef6..5839059fde 100644 --- a/thirdparty/harfbuzz/src/hb-ot-math-table.hh +++ b/thirdparty/harfbuzz/src/hb-ot-math-table.hh @@ -344,27 +344,20 @@ struct MathKern const MathValueRecord* kernValue = mathValueRecordsZ.arrayZ + heightCount; int sign = font->y_scale < 0 ? -1 : +1; - /* The description of the MathKern table is a ambiguous, but interpreting - * "between the two heights found at those indexes" for 0 < i < len as - * - * correctionHeight[i-1] < correction_height <= correctionHeight[i] - * - * makes the result consistent with the limit cases and we can just use the - * binary search algorithm of std::upper_bound: + /* According to OpenType spec (v1.9), except for the boundary cases, the index + * chosen for kern value should be i such that + * correctionHeight[i-1] <= correction_height < correctionHeight[i] + * We can use the binary search algorithm of std::upper_bound(). Or, we can + * use the internal hb_bsearch_impl. */ - unsigned int i = 0; - unsigned int count = heightCount; - while (count > 0) - { - unsigned int half = count / 2; - hb_position_t height = correctionHeight[i + half].get_y_value (font, this); - if (sign * height < sign * correction_height) - { - i += half + 1; - count -= half + 1; - } else - count = half; - } + unsigned int pos; + auto cmp = +[](const void* key, const void* p, + int sign, hb_font_t* font, const MathKern* mathKern) -> int { + return sign * *(hb_position_t*)key - sign * ((MathValueRecord*)p)->get_y_value(font, mathKern); + }; + unsigned int i = hb_bsearch_impl(&pos, correction_height, correctionHeight, + heightCount, MathValueRecord::static_size, + cmp, sign, font, this) ? pos + 1 : pos; return kernValue[i].get_x_value (font, this); } diff --git a/thirdparty/harfbuzz/src/hb-ot-os2-table.hh b/thirdparty/harfbuzz/src/hb-ot-os2-table.hh index 8c2e696f56..43b58d9bbf 100644 --- a/thirdparty/harfbuzz/src/hb-ot-os2-table.hh +++ b/thirdparty/harfbuzz/src/hb-ot-os2-table.hh @@ -223,7 +223,7 @@ struct OS2 } } - return num ? (unsigned) roundf (total_width / num) : 0; + return num ? 
(unsigned) roundf ((double) total_width / (double) num) : 0; } bool subset (hb_subset_context_t *c) const @@ -284,12 +284,12 @@ struct OS2 os2_prime->usWidthClass = width_class; } - if (c->plan->flags & HB_SUBSET_FLAGS_NO_PRUNE_UNICODE_RANGES) - return_trace (true); - os2_prime->usFirstCharIndex = hb_min (0xFFFFu, c->plan->unicodes.get_min ()); os2_prime->usLastCharIndex = hb_min (0xFFFFu, c->plan->unicodes.get_max ()); + if (c->plan->flags & HB_SUBSET_FLAGS_NO_PRUNE_UNICODE_RANGES) + return_trace (true); + _update_unicode_ranges (&c->plan->unicodes, os2_prime->ulUnicodeRange); return_trace (true); diff --git a/thirdparty/harfbuzz/src/hb-ot-shape.cc b/thirdparty/harfbuzz/src/hb-ot-shape.cc index 90f596ae79..148830022e 100644 --- a/thirdparty/harfbuzz/src/hb-ot-shape.cc +++ b/thirdparty/harfbuzz/src/hb-ot-shape.cc @@ -155,7 +155,7 @@ hb_ot_shape_planner_t::compile (hb_ot_shape_plan_t &plan, #endif bool has_gpos = !disable_gpos && hb_ot_layout_has_positioning (face); if (false) - ; + {} #ifndef HB_NO_AAT_SHAPE /* Prefer GPOS over kerx if GSUB is present; * https://github.com/harfbuzz/harfbuzz/issues/3008 */ @@ -167,15 +167,16 @@ hb_ot_shape_planner_t::compile (hb_ot_shape_plan_t &plan, if (!plan.apply_kerx && (!has_gpos_kern || !plan.apply_gpos)) { + if (false) {} #ifndef HB_NO_AAT_SHAPE - if (has_kerx) + else if (has_kerx) plan.apply_kerx = true; - else #endif #ifndef HB_NO_OT_KERN - if (hb_ot_layout_has_kerning (face)) + else if (hb_ot_layout_has_kerning (face)) plan.apply_kern = true; #endif + else {} } plan.apply_fallback_kern = !(plan.apply_gpos || plan.apply_kerx || plan.apply_kern); diff --git a/thirdparty/harfbuzz/src/hb-ot-shaper-arabic.cc b/thirdparty/harfbuzz/src/hb-ot-shaper-arabic.cc index 72dcc84df5..d70746ed2b 100644 --- a/thirdparty/harfbuzz/src/hb-ot-shaper-arabic.cc +++ b/thirdparty/harfbuzz/src/hb-ot-shaper-arabic.cc @@ -560,9 +560,9 @@ apply_stch (const hb_ot_shape_plan_t *plan HB_UNUSED, DEBUG_MSG (ARABIC, nullptr, "%s stretch at (%u,%u,%u)", step == MEASURE ? "measuring" : "cutting", context, start, end); - DEBUG_MSG (ARABIC, nullptr, "rest of word: count=%u width %d", start - context, w_total); - DEBUG_MSG (ARABIC, nullptr, "fixed tiles: count=%d width=%d", n_fixed, w_fixed); - DEBUG_MSG (ARABIC, nullptr, "repeating tiles: count=%d width=%d", n_repeating, w_repeating); + DEBUG_MSG (ARABIC, nullptr, "rest of word: count=%u width %" PRId32, start - context, w_total); + DEBUG_MSG (ARABIC, nullptr, "fixed tiles: count=%d width=%" PRId32, n_fixed, w_fixed); + DEBUG_MSG (ARABIC, nullptr, "repeating tiles: count=%d width=%" PRId32, n_repeating, w_repeating); /* Number of additional times to repeat each repeating tile. 
*/ int n_copies = 0; @@ -602,7 +602,7 @@ apply_stch (const hb_ot_shape_plan_t *plan HB_UNUSED, if (info[k - 1].arabic_shaping_action() == STCH_REPEATING) repeat += n_copies; - DEBUG_MSG (ARABIC, nullptr, "appending %u copies of glyph %u; j=%u", + DEBUG_MSG (ARABIC, nullptr, "appending %u copies of glyph %" PRIu32 "; j=%u", repeat, info[k - 1].codepoint, j); pos[k - 1].x_advance = 0; for (unsigned int n = 0; n < repeat; n++) diff --git a/thirdparty/harfbuzz/src/hb-ot-stat-table.hh b/thirdparty/harfbuzz/src/hb-ot-stat-table.hh index 58b3cd74df..e88c82a13c 100644 --- a/thirdparty/harfbuzz/src/hb-ot-stat-table.hh +++ b/thirdparty/harfbuzz/src/hb-ot-stat-table.hh @@ -349,7 +349,7 @@ struct AxisValueFormat4 struct AxisValue { - bool get_value (unsigned int axis_index) const + float get_value (unsigned int axis_index) const { switch (u.format) { @@ -357,7 +357,7 @@ struct AxisValue case 2: return u.format2.get_value (); case 3: return u.format3.get_value (); case 4: return u.format4.get_axis_record (axis_index).get_value (); - default:return 0; + default:return 0.f; } } @@ -485,7 +485,7 @@ struct STAT hb_array_t<const Offset16To<AxisValue>> axis_values = get_axis_value_offsets (); for (unsigned int i = 0; i < axis_values.length; i++) { - const AxisValue& axis_value = this+axis_values[i]; + const AxisValue& axis_value = this+offsetToAxisValueOffsets+axis_values[i]; if (axis_value.get_axis_index () == axis_index) { if (value) diff --git a/thirdparty/harfbuzz/src/hb-ot-tag-table.hh b/thirdparty/harfbuzz/src/hb-ot-tag-table.hh index 032a7c866c..db92f4664a 100644 --- a/thirdparty/harfbuzz/src/hb-ot-tag-table.hh +++ b/thirdparty/harfbuzz/src/hb-ot-tag-table.hh @@ -6,8 +6,8 @@ * * on files with these headers: * - * <meta name="updated_at" content="2022-09-30 11:47 PM" /> - * File-Date: 2023-08-02 + * <meta name="updated_at" content="2023-09-30 01:21 AM" /> + * File-Date: 2024-03-07 */ #ifndef HB_OT_TAG_TABLE_HH @@ -31,7 +31,7 @@ static const LangTag ot_languages2[] = { {HB_TAG('b','i',' ',' '), HB_TAG('B','I','S',' ')}, /* Bislama */ {HB_TAG('b','i',' ',' '), HB_TAG('C','P','P',' ')}, /* Bislama -> Creoles */ {HB_TAG('b','m',' ',' '), HB_TAG('B','M','B',' ')}, /* Bambara (Bamanankan) */ - {HB_TAG('b','n',' ',' '), HB_TAG('B','E','N',' ')}, /* Bengali */ + {HB_TAG('b','n',' ',' '), HB_TAG('B','E','N',' ')}, /* Bangla */ {HB_TAG('b','o',' ',' '), HB_TAG('T','I','B',' ')}, /* Tibetan */ {HB_TAG('b','r',' ',' '), HB_TAG('B','R','E',' ')}, /* Breton */ {HB_TAG('b','s',' ',' '), HB_TAG('B','O','S',' ')}, /* Bosnian */ @@ -64,7 +64,7 @@ static const LangTag ot_languages2[] = { {HB_TAG('f','r',' ',' '), HB_TAG('F','R','A',' ')}, /* French */ {HB_TAG('f','y',' ',' '), HB_TAG('F','R','I',' ')}, /* Western Frisian -> Frisian */ {HB_TAG('g','a',' ',' '), HB_TAG('I','R','I',' ')}, /* Irish */ - {HB_TAG('g','d',' ',' '), HB_TAG('G','A','E',' ')}, /* Scottish Gaelic (Gaelic) */ + {HB_TAG('g','d',' ',' '), HB_TAG('G','A','E',' ')}, /* Scottish Gaelic */ {HB_TAG('g','l',' ',' '), HB_TAG('G','A','L',' ')}, /* Galician */ {HB_TAG('g','n',' ',' '), HB_TAG('G','U','A',' ')}, /* Guarani [macrolanguage] */ {HB_TAG('g','u',' ',' '), HB_TAG('G','U','J',' ')}, /* Gujarati */ @@ -132,7 +132,7 @@ static const LangTag ot_languages2[] = { {HB_TAG('m','l',' ',' '), HB_TAG('M','A','L',' ')}, /* Malayalam -> Malayalam Traditional */ {HB_TAG('m','l',' ',' '), HB_TAG('M','L','R',' ')}, /* Malayalam -> Malayalam Reformed */ {HB_TAG('m','n',' ',' '), HB_TAG('M','N','G',' ')}, /* Mongolian [macrolanguage] */ - {HB_TAG('m','o',' ',' '), 
HB_TAG('M','O','L',' ')}, /* Moldavian (retired code) */ + {HB_TAG('m','o',' ',' '), HB_TAG('M','O','L',' ')}, /* Moldavian (retired code) -> Romanian (Moldova) */ {HB_TAG('m','o',' ',' '), HB_TAG('R','O','M',' ')}, /* Moldavian (retired code) -> Romanian */ {HB_TAG('m','r',' ',' '), HB_TAG('M','A','R',' ')}, /* Marathi */ {HB_TAG('m','s',' ',' '), HB_TAG('M','L','Y',' ')}, /* Malay [macrolanguage] */ @@ -153,7 +153,7 @@ static const LangTag ot_languages2[] = { {HB_TAG('o','c',' ',' '), HB_TAG('O','C','I',' ')}, /* Occitan (post 1500) */ {HB_TAG('o','j',' ',' '), HB_TAG('O','J','B',' ')}, /* Ojibwa [macrolanguage] -> Ojibway */ {HB_TAG('o','m',' ',' '), HB_TAG('O','R','O',' ')}, /* Oromo [macrolanguage] */ - {HB_TAG('o','r',' ',' '), HB_TAG('O','R','I',' ')}, /* Odia (formerly Oriya) [macrolanguage] */ + {HB_TAG('o','r',' ',' '), HB_TAG('O','R','I',' ')}, /* Odia [macrolanguage] */ {HB_TAG('o','s',' ',' '), HB_TAG('O','S','S',' ')}, /* Ossetian */ {HB_TAG('p','a',' ',' '), HB_TAG('P','A','N',' ')}, /* Punjabi */ {HB_TAG('p','i',' ',' '), HB_TAG('P','A','L',' ')}, /* Pali */ @@ -166,7 +166,7 @@ static const LangTag ot_languages2[] = { {HB_TAG('r','o',' ',' '), HB_TAG('R','O','M',' ')}, /* Romanian */ {HB_TAG('r','u',' ',' '), HB_TAG('R','U','S',' ')}, /* Russian */ {HB_TAG('r','w',' ',' '), HB_TAG('R','U','A',' ')}, /* Kinyarwanda */ - {HB_TAG('s','a',' ',' '), HB_TAG('S','A','N',' ')}, /* Sanskrit */ + {HB_TAG('s','a',' ',' '), HB_TAG('S','A','N',' ')}, /* Sanskrit [macrolanguage] */ {HB_TAG('s','c',' ',' '), HB_TAG('S','R','D',' ')}, /* Sardinian [macrolanguage] */ {HB_TAG('s','d',' ',' '), HB_TAG('S','N','D',' ')}, /* Sindhi */ {HB_TAG('s','e',' ',' '), HB_TAG('N','S','M',' ')}, /* Northern Sami */ @@ -465,6 +465,7 @@ static const LangTag ot_languages3[] = { {HB_TAG('c','l','d',' '), HB_TAG('S','Y','R',' ')}, /* Chaldean Neo-Aramaic -> Syriac */ {HB_TAG('c','l','e',' '), HB_TAG('C','C','H','N')}, /* Lealao Chinantec -> Chinantec */ {HB_TAG('c','l','j',' '), HB_TAG('Q','I','N',' ')}, /* Laitu Chin -> Chin */ + {HB_TAG('c','l','s',' '), HB_TAG('S','A','N',' ')}, /* Classical Sanskrit -> Sanskrit */ {HB_TAG('c','l','t',' '), HB_TAG('Q','I','N',' ')}, /* Lautu Chin -> Chin */ {HB_TAG('c','m','n',' '), HB_TAG('Z','H','S',' ')}, /* Mandarin Chinese -> Chinese, Simplified */ {HB_TAG('c','m','r',' '), HB_TAG('Q','I','N',' ')}, /* Mro-Khimi Chin -> Chin */ @@ -637,7 +638,7 @@ static const LangTag ot_languages3[] = { {HB_TAG('g','a','a',' '), HB_TAG('G','A','D',' ')}, /* Ga */ {HB_TAG('g','a','c',' '), HB_TAG('C','P','P',' ')}, /* Mixed Great Andamanese -> Creoles */ {HB_TAG('g','a','d',' '), HB_TAG_NONE }, /* Gaddang != Ga */ - {HB_TAG('g','a','e',' '), HB_TAG_NONE }, /* Guarequena != Scottish Gaelic (Gaelic) */ + {HB_TAG('g','a','e',' '), HB_TAG_NONE }, /* Guarequena != Scottish Gaelic */ /*{HB_TAG('g','a','g',' '), HB_TAG('G','A','G',' ')},*/ /* Gagauz */ {HB_TAG('g','a','l',' '), HB_TAG_NONE }, /* Galolen != Galician */ {HB_TAG('g','a','n',' '), HB_TAG('Z','H','S',' ')}, /* Gan Chinese -> Chinese, Simplified */ @@ -1160,7 +1161,7 @@ static const LangTag ot_languages3[] = { {HB_TAG('o','r','o',' '), HB_TAG_NONE }, /* Orokolo != Oromo */ {HB_TAG('o','r','r',' '), HB_TAG('I','J','O',' ')}, /* Oruma -> Ijo */ {HB_TAG('o','r','s',' '), HB_TAG('M','L','Y',' ')}, /* Orang Seletar -> Malay */ - {HB_TAG('o','r','y',' '), HB_TAG('O','R','I',' ')}, /* Odia (formerly Oriya) */ + {HB_TAG('o','r','y',' '), HB_TAG('O','R','I',' ')}, /* Odia */ {HB_TAG('o','t','w',' '), HB_TAG('O','J','B',' ')}, /* Ottawa -> 
Ojibway */ {HB_TAG('o','u','a',' '), HB_TAG('B','B','R',' ')}, /* Tagargrent -> Berber */ {HB_TAG('p','a','a',' '), HB_TAG_NONE }, /* Papuan [collection] != Palestinian Aramaic */ @@ -1395,7 +1396,7 @@ static const LangTag ot_languages3[] = { /*{HB_TAG('s','n','k',' '), HB_TAG('S','N','K',' ')},*/ /* Soninke */ {HB_TAG('s','o','g',' '), HB_TAG_NONE }, /* Sogdian != Sodo Gurage */ /*{HB_TAG('s','o','p',' '), HB_TAG('S','O','P',' ')},*/ /* Songe */ - {HB_TAG('s','p','v',' '), HB_TAG('O','R','I',' ')}, /* Sambalpuri -> Odia (formerly Oriya) */ + {HB_TAG('s','p','v',' '), HB_TAG('O','R','I',' ')}, /* Sambalpuri -> Odia */ {HB_TAG('s','p','y',' '), HB_TAG('K','A','L',' ')}, /* Sabaot -> Kalenjin */ {HB_TAG('s','r','b',' '), HB_TAG_NONE }, /* Sora != Serbian */ {HB_TAG('s','r','c',' '), HB_TAG('S','R','D',' ')}, /* Logudorese Sardinian -> Sardinian */ @@ -1533,6 +1534,7 @@ static const LangTag ot_languages3[] = { {HB_TAG('v','l','s',' '), HB_TAG('F','L','E',' ')}, /* Vlaams -> Dutch (Flemish) */ {HB_TAG('v','m','w',' '), HB_TAG('M','A','K',' ')}, /* Makhuwa */ /*{HB_TAG('v','r','o',' '), HB_TAG('V','R','O',' ')},*/ /* Võro */ + {HB_TAG('v','s','n',' '), HB_TAG('S','A','N',' ')}, /* Vedic Sanskrit -> Sanskrit */ {HB_TAG('w','a','g',' '), HB_TAG_NONE }, /* Wa'ema != Wagdi */ /*{HB_TAG('w','a','r',' '), HB_TAG('W','A','R',' ')},*/ /* Waray (Philippines) -> Waray-Waray */ {HB_TAG('w','b','m',' '), HB_TAG('W','A',' ',' ')}, /* Wa */ @@ -2643,7 +2645,7 @@ out: /* Romanian; Moldova */ unsigned int i; hb_tag_t possible_tags[] = { - HB_TAG('M','O','L',' '), /* Moldavian */ + HB_TAG('M','O','L',' '), /* Romanian (Moldova) */ HB_TAG('R','O','M',' '), /* Romanian */ }; for (i = 0; i < 2 && i < *count; i++) @@ -2920,7 +2922,7 @@ hb_ot_ambiguous_tag_to_language (hb_tag_t tag) return hb_language_from_string ("mn", -1); /* Mongolian [macrolanguage] */ case HB_TAG('M','N','K',' '): /* Maninka */ return hb_language_from_string ("man", -1); /* Mandingo [macrolanguage] */ - case HB_TAG('M','O','L',' '): /* Moldavian */ + case HB_TAG('M','O','L',' '): /* Romanian (Moldova) */ return hb_language_from_string ("ro-MD", -1); /* Romanian; Moldova */ case HB_TAG('M','O','N','T'): /* Thailand Mon */ return hb_language_from_string ("mnw-TH", -1); /* Mon; Thailand */ @@ -2958,6 +2960,8 @@ hb_ot_ambiguous_tag_to_language (hb_tag_t tag) return hb_language_from_string ("ro", -1); /* Romanian */ case HB_TAG('R','O','Y',' '): /* Romany */ return hb_language_from_string ("rom", -1); /* Romany [macrolanguage] */ + case HB_TAG('S','A','N',' '): /* Sanskrit */ + return hb_language_from_string ("sa", -1); /* Sanskrit [macrolanguage] */ case HB_TAG('S','Q','I',' '): /* Albanian */ return hb_language_from_string ("sq", -1); /* Albanian [macrolanguage] */ case HB_TAG('S','R','B',' '): /* Serbian */ diff --git a/thirdparty/harfbuzz/src/hb-ot-tag.cc b/thirdparty/harfbuzz/src/hb-ot-tag.cc index 53b6b38f66..0c63756b14 100644 --- a/thirdparty/harfbuzz/src/hb-ot-tag.cc +++ b/thirdparty/harfbuzz/src/hb-ot-tag.cc @@ -547,7 +547,7 @@ hb_ot_tag_to_language (hb_tag_t tag) buf[3] = '-'; str += 4; } - snprintf (str, 16, "x-hbot-%08x", tag); + snprintf (str, 16, "x-hbot-%08" PRIx32, tag); return hb_language_from_string (&*buf, -1); } } diff --git a/thirdparty/harfbuzz/src/hb-ot-var-avar-table.hh b/thirdparty/harfbuzz/src/hb-ot-var-avar-table.hh index b2e5d87a3c..9149959d79 100644 --- a/thirdparty/harfbuzz/src/hb-ot-var-avar-table.hh +++ b/thirdparty/harfbuzz/src/hb-ot-var-avar-table.hh @@ -57,7 +57,7 @@ struct avarV2Tail protected: 
Offset32To<DeltaSetIndexMap> varIdxMap; /* Offset from the beginning of 'avar' table. */ - Offset32To<VariationStore> varStore; /* Offset from the beginning of 'avar' table. */ + Offset32To<ItemVariationStore> varStore; /* Offset from the beginning of 'avar' table. */ public: DEFINE_SIZE_STATIC (8); @@ -230,7 +230,7 @@ struct SegmentMaps : Array16Of<AxisValueMap> * duplicates here */ if (mapping.must_include ()) continue; - value_mappings.push (std::move (mapping)); + value_mappings.push (mapping); } AxisValueMap m; @@ -343,7 +343,7 @@ struct avar for (unsigned i = 0; i < coords_length; i++) coords[i] = out[i]; - OT::VariationStore::destroy_cache (var_store_cache); + OT::ItemVariationStore::destroy_cache (var_store_cache); #endif } diff --git a/thirdparty/harfbuzz/src/hb-ot-var-common.hh b/thirdparty/harfbuzz/src/hb-ot-var-common.hh index eff6df380f..379e164059 100644 --- a/thirdparty/harfbuzz/src/hb-ot-var-common.hh +++ b/thirdparty/harfbuzz/src/hb-ot-var-common.hh @@ -28,6 +28,7 @@ #include "hb-ot-layout-common.hh" #include "hb-priority-queue.hh" +#include "hb-subset-instancer-iup.hh" namespace OT { @@ -221,9 +222,9 @@ struct DeltaSetIndexMap }; -struct VarStoreInstancer +struct ItemVarStoreInstancer { - VarStoreInstancer (const VariationStore *varStore, + ItemVarStoreInstancer (const ItemVariationStore *varStore, const DeltaSetIndexMap *varIdxMap, hb_array_t<int> coords) : varStore (varStore), varIdxMap (varIdxMap), coords (coords) {} @@ -235,7 +236,7 @@ struct VarStoreInstancer float operator() (uint32_t varIdx, unsigned short offset = 0) const { return coords ? varStore->get_delta (varIdxMap ? varIdxMap->map (VarIdx::add (varIdx, offset)) : varIdx + offset, coords) : 0; } - const VariationStore *varStore; + const ItemVariationStore *varStore; const DeltaSetIndexMap *varIdxMap; hb_array_t<int> coords; }; @@ -460,7 +461,7 @@ struct tuple_delta_t tuple_delta_t () = default; tuple_delta_t (const tuple_delta_t& o) = default; - friend void swap (tuple_delta_t& a, tuple_delta_t& b) + friend void swap (tuple_delta_t& a, tuple_delta_t& b) noexcept { hb_swap (a.axis_tuples, b.axis_tuples); hb_swap (a.indices, b.indices); @@ -471,10 +472,10 @@ struct tuple_delta_t hb_swap (a.compiled_peak_coords, b.compiled_peak_coords); } - tuple_delta_t (tuple_delta_t&& o) : tuple_delta_t () + tuple_delta_t (tuple_delta_t&& o) noexcept : tuple_delta_t () { hb_swap (*this, o); } - tuple_delta_t& operator = (tuple_delta_t&& o) + tuple_delta_t& operator = (tuple_delta_t&& o) noexcept { hb_swap (*this, o); return *this; @@ -609,7 +610,9 @@ struct tuple_delta_t const hb_map_t& axes_old_index_tag_map, const hb_hashmap_t<const hb_vector_t<char>*, unsigned>* shared_tuples_idx_map) { - if (!compiled_deltas) return false; + /* compiled_deltas could be empty after iup delta optimization, we can skip + * compiling this tuple and return true */ + if (!compiled_deltas) return true; unsigned cur_axis_count = axes_index_map.get_population (); /* allocate enough memory: 1 peak + 2 intermediate coords + fixed header size */ @@ -723,22 +726,28 @@ struct tuple_delta_t } bool compile_deltas () + { return compile_deltas (indices, deltas_x, deltas_y, compiled_deltas); } + + bool compile_deltas (const hb_vector_t<bool> &point_indices, + const hb_vector_t<float> &x_deltas, + const hb_vector_t<float> &y_deltas, + hb_vector_t<char> &compiled_deltas /* OUT */) { hb_vector_t<int> rounded_deltas; - if (unlikely (!rounded_deltas.alloc (indices.length))) + if (unlikely (!rounded_deltas.alloc (point_indices.length))) return false; - for 
(unsigned i = 0; i < indices.length; i++) + for (unsigned i = 0; i < point_indices.length; i++) { - if (!indices[i]) continue; - int rounded_delta = (int) roundf (deltas_x[i]); + if (!point_indices[i]) continue; + int rounded_delta = (int) roundf (x_deltas.arrayZ[i]); rounded_deltas.push (rounded_delta); } - if (!rounded_deltas) return false; + if (!rounded_deltas) return true; /* allocate enough memories 3 * num_deltas */ unsigned alloc_len = 3 * rounded_deltas.length; - if (deltas_y) + if (y_deltas) alloc_len *= 2; if (unlikely (!compiled_deltas.resize (alloc_len))) return false; @@ -746,14 +755,14 @@ struct tuple_delta_t unsigned i = 0; unsigned encoded_len = encode_delta_run (i, compiled_deltas.as_array (), rounded_deltas); - if (deltas_y) + if (y_deltas) { - /* reuse the rounded_deltas vector, check that deltas_y have the same num of deltas as deltas_x */ + /* reuse the rounded_deltas vector, check that y_deltas have the same num of deltas as x_deltas */ unsigned j = 0; - for (unsigned idx = 0; idx < indices.length; idx++) + for (unsigned idx = 0; idx < point_indices.length; idx++) { - if (!indices[idx]) continue; - int rounded_delta = (int) roundf (deltas_y[idx]); + if (!point_indices[idx]) continue; + int rounded_delta = (int) roundf (y_deltas.arrayZ[idx]); if (j >= rounded_deltas.length) return false; @@ -761,7 +770,7 @@ struct tuple_delta_t } if (j != rounded_deltas.length) return false; - /* reset i because we reuse rounded_deltas for deltas_y */ + /* reset i because we reuse rounded_deltas for y_deltas */ i = 0; encoded_len += encode_delta_run (i, compiled_deltas.as_array ().sub_array (encoded_len), rounded_deltas); } @@ -1020,6 +1029,171 @@ struct tuple_delta_t return true; } + bool optimize (const contour_point_vector_t& contour_points, + bool is_composite, + float tolerance = 0.5f) + { + unsigned count = contour_points.length; + if (deltas_x.length != count || + deltas_y.length != count) + return false; + + hb_vector_t<bool> opt_indices; + hb_vector_t<int> rounded_x_deltas, rounded_y_deltas; + + if (unlikely (!rounded_x_deltas.alloc (count) || + !rounded_y_deltas.alloc (count))) + return false; + + for (unsigned i = 0; i < count; i++) + { + int rounded_x_delta = (int) roundf (deltas_x.arrayZ[i]); + int rounded_y_delta = (int) roundf (deltas_y.arrayZ[i]); + rounded_x_deltas.push (rounded_x_delta); + rounded_y_deltas.push (rounded_y_delta); + } + + if (!iup_delta_optimize (contour_points, rounded_x_deltas, rounded_y_deltas, opt_indices, tolerance)) + return false; + + unsigned ref_count = 0; + for (bool ref_flag : opt_indices) + ref_count += ref_flag; + + if (ref_count == count) return true; + + hb_vector_t<float> opt_deltas_x, opt_deltas_y; + bool is_comp_glyph_wo_deltas = (is_composite && ref_count == 0); + if (is_comp_glyph_wo_deltas) + { + if (unlikely (!opt_deltas_x.resize (count) || + !opt_deltas_y.resize (count))) + return false; + + opt_indices.arrayZ[0] = true; + for (unsigned i = 1; i < count; i++) + opt_indices.arrayZ[i] = false; + } + + hb_vector_t<char> opt_point_data; + if (!compile_point_set (opt_indices, opt_point_data)) + return false; + hb_vector_t<char> opt_deltas_data; + if (!compile_deltas (opt_indices, + is_comp_glyph_wo_deltas ? opt_deltas_x : deltas_x, + is_comp_glyph_wo_deltas ? 
opt_deltas_y : deltas_y, + opt_deltas_data)) + return false; + + hb_vector_t<char> point_data; + if (!compile_point_set (indices, point_data)) + return false; + hb_vector_t<char> deltas_data; + if (!compile_deltas (indices, deltas_x, deltas_y, deltas_data)) + return false; + + if (opt_point_data.length + opt_deltas_data.length < point_data.length + deltas_data.length) + { + indices.fini (); + indices = std::move (opt_indices); + + if (is_comp_glyph_wo_deltas) + { + deltas_x.fini (); + deltas_x = std::move (opt_deltas_x); + + deltas_y.fini (); + deltas_y = std::move (opt_deltas_y); + } + } + return !indices.in_error () && !deltas_x.in_error () && !deltas_y.in_error (); + } + + static bool compile_point_set (const hb_vector_t<bool> &point_indices, + hb_vector_t<char>& compiled_points /* OUT */) + { + unsigned num_points = 0; + for (bool i : point_indices) + if (i) num_points++; + + /* when iup optimization is enabled, num of referenced points could be 0 */ + if (!num_points) return true; + + unsigned indices_length = point_indices.length; + /* If the points set consists of all points in the glyph, it's encoded with a + * single zero byte */ + if (num_points == indices_length) + return compiled_points.resize (1); + + /* allocate enough memories: 2 bytes for count + 3 bytes for each point */ + unsigned num_bytes = 2 + 3 *num_points; + if (unlikely (!compiled_points.resize (num_bytes, false))) + return false; + + unsigned pos = 0; + /* binary data starts with the total number of reference points */ + if (num_points < 0x80) + compiled_points.arrayZ[pos++] = num_points; + else + { + compiled_points.arrayZ[pos++] = ((num_points >> 8) | 0x80); + compiled_points.arrayZ[pos++] = num_points & 0xFF; + } + + const unsigned max_run_length = 0x7F; + unsigned i = 0; + unsigned last_value = 0; + unsigned num_encoded = 0; + while (i < indices_length && num_encoded < num_points) + { + unsigned run_length = 0; + unsigned header_pos = pos; + compiled_points.arrayZ[pos++] = 0; + + bool use_byte_encoding = false; + bool new_run = true; + while (i < indices_length && num_encoded < num_points && + run_length <= max_run_length) + { + // find out next referenced point index + while (i < indices_length && !point_indices[i]) + i++; + + if (i >= indices_length) break; + + unsigned cur_value = i; + unsigned delta = cur_value - last_value; + + if (new_run) + { + use_byte_encoding = (delta <= 0xFF); + new_run = false; + } + + if (use_byte_encoding && delta > 0xFF) + break; + + if (use_byte_encoding) + compiled_points.arrayZ[pos++] = delta; + else + { + compiled_points.arrayZ[pos++] = delta >> 8; + compiled_points.arrayZ[pos++] = delta & 0xFF; + } + i++; + last_value = cur_value; + run_length++; + num_encoded++; + } + + if (use_byte_encoding) + compiled_points.arrayZ[header_pos] = run_length - 1; + else + compiled_points.arrayZ[header_pos] = (run_length - 1) | 0x80; + } + return compiled_points.resize (pos, false); + } + static float infer_delta (float target_val, float prev_val, float next_val, float prev_delta, float next_delta) { if (prev_val == next_val) @@ -1071,41 +1245,41 @@ struct TupleVariationData private: /* referenced point set->compiled point data map */ - hb_hashmap_t<const hb_vector_t<bool>*, hb_bytes_t> point_data_map; + hb_hashmap_t<const hb_vector_t<bool>*, hb_vector_t<char>> point_data_map; /* referenced point set-> count map, used in finding shared points */ hb_hashmap_t<const hb_vector_t<bool>*, unsigned> point_set_count_map; /* empty for non-gvar tuples. 
- * shared_points_bytes is just a copy of some value in the point_data_map, + * shared_points_bytes is a pointer to some value in the point_data_map, * which will be freed during map destruction. Save it for serialization, so * no need to do find_shared_points () again */ - hb_bytes_t shared_points_bytes; + hb_vector_t<char> *shared_points_bytes = nullptr; /* total compiled byte size as TupleVariationData format, initialized to its * min_size: 4 */ unsigned compiled_byte_size = 4; + /* for gvar iup delta optimization: whether this is a composite glyph */ + bool is_composite = false; + public: tuple_variations_t () = default; tuple_variations_t (const tuple_variations_t&) = delete; tuple_variations_t& operator=(const tuple_variations_t&) = delete; tuple_variations_t (tuple_variations_t&&) = default; tuple_variations_t& operator=(tuple_variations_t&&) = default; - ~tuple_variations_t () { fini (); } - void fini () - { - for (auto _ : point_data_map.values ()) - _.fini (); - - point_set_count_map.fini (); - tuple_vars.fini (); - } + ~tuple_variations_t () = default; explicit operator bool () const { return bool (tuple_vars); } unsigned get_var_count () const { - unsigned count = tuple_vars.length; - if (shared_points_bytes.length) + unsigned count = 0; + /* when iup delta opt is enabled, compiled_deltas could be empty and we + * should skip this tuple */ + for (auto& tuple: tuple_vars) + if (tuple.compiled_deltas) count++; + + if (shared_points_bytes && shared_points_bytes->length) count |= TupleVarCount::SharedPointNumbers; return count; } @@ -1119,26 +1293,27 @@ struct TupleVariationData bool is_gvar, const hb_map_t *axes_old_index_tag_map, const hb_vector_t<unsigned> &shared_indices, - const hb_array_t<const F2DOT14> shared_tuples) + const hb_array_t<const F2DOT14> shared_tuples, + bool is_composite_glyph) { do { const HBUINT8 *p = iterator.get_serialized_data (); unsigned int length = iterator.current_tuple->get_data_size (); if (unlikely (!iterator.var_data_bytes.check_range (p, length))) - { fini (); return false; } + return false; hb_hashmap_t<hb_tag_t, Triple> axis_tuples; if (!iterator.current_tuple->unpack_axis_tuples (iterator.get_axis_count (), shared_tuples, axes_old_index_tag_map, axis_tuples) || axis_tuples.is_empty ()) - { fini (); return false; } + return false; hb_vector_t<unsigned> private_indices; bool has_private_points = iterator.current_tuple->has_private_points (); const HBUINT8 *end = p + length; if (has_private_points && !TupleVariationData::unpack_points (p, private_indices, end)) - { fini (); return false; } + return false; const hb_vector_t<unsigned> &indices = has_private_points ? 
private_indices : shared_indices; bool apply_to_all = (indices.length == 0); @@ -1148,24 +1323,24 @@ struct TupleVariationData if (unlikely (!deltas_x.resize (num_deltas, false) || !TupleVariationData::unpack_deltas (p, deltas_x, end))) - { fini (); return false; } + return false; hb_vector_t<int> deltas_y; if (is_gvar) { if (unlikely (!deltas_y.resize (num_deltas, false) || !TupleVariationData::unpack_deltas (p, deltas_y, end))) - { fini (); return false; } + return false; } tuple_delta_t var; var.axis_tuples = std::move (axis_tuples); if (unlikely (!var.indices.resize (point_count) || !var.deltas_x.resize (point_count, false))) - { fini (); return false; } + return false; if (is_gvar && unlikely (!var.deltas_y.resize (point_count, false))) - { fini (); return false; } + return false; for (unsigned i = 0; i < num_deltas; i++) { @@ -1178,6 +1353,8 @@ struct TupleVariationData } tuple_vars.push (std::move (var)); } while (iterator.move_to_next ()); + + is_composite = is_composite_glyph; return true; } @@ -1261,7 +1438,7 @@ struct TupleVariationData unsigned new_len = new_vars.length + out.length; if (unlikely (!new_vars.alloc (new_len, false))) - { fini (); return false;} + return false; for (unsigned i = 0; i < out.length; i++) new_vars.push (std::move (out[i])); @@ -1272,8 +1449,9 @@ struct TupleVariationData return true; } - /* merge tuple variations with overlapping tents */ - void merge_tuple_variations () + /* merge tuple variations with overlapping tents, if iup delta optimization + * is enabled, add default deltas to contour_points */ + bool merge_tuple_variations (contour_point_vector_t* contour_points = nullptr) { hb_vector_t<tuple_delta_t> new_vars; hb_hashmap_t<const hb_hashmap_t<hb_tag_t, Triple>*, unsigned> m; @@ -1281,7 +1459,15 @@ struct TupleVariationData for (const tuple_delta_t& var : tuple_vars) { /* if all axes are pinned, drop the tuple variation */ - if (var.axis_tuples.is_empty ()) continue; + if (var.axis_tuples.is_empty ()) + { + /* if iup_delta_optimize is enabled, add deltas to contour coords */ + if (contour_points && !contour_points->add_deltas (var.deltas_x, + var.deltas_y, + var.indices)) + return false; + continue; + } unsigned *idx; if (m.has (&(var.axis_tuples), &idx)) @@ -1291,98 +1477,14 @@ struct TupleVariationData else { new_vars.push (var); - m.set (&(var.axis_tuples), i); + if (!m.set (&(var.axis_tuples), i)) + return false; i++; } } tuple_vars.fini (); tuple_vars = std::move (new_vars); - } - - hb_bytes_t compile_point_set (const hb_vector_t<bool> &point_indices) - { - unsigned num_points = 0; - for (bool i : point_indices) - if (i) num_points++; - - unsigned indices_length = point_indices.length; - /* If the points set consists of all points in the glyph, it's encoded with a - * single zero byte */ - if (num_points == indices_length) - { - char *p = (char *) hb_calloc (1, sizeof (char)); - if (unlikely (!p)) return hb_bytes_t (); - - return hb_bytes_t (p, 1); - } - - /* allocate enough memories: 2 bytes for count + 3 bytes for each point */ - unsigned num_bytes = 2 + 3 *num_points; - char *p = (char *) hb_calloc (num_bytes, sizeof (char)); - if (unlikely (!p)) return hb_bytes_t (); - - unsigned pos = 0; - /* binary data starts with the total number of reference points */ - if (num_points < 0x80) - p[pos++] = num_points; - else - { - p[pos++] = ((num_points >> 8) | 0x80); - p[pos++] = num_points & 0xFF; - } - - const unsigned max_run_length = 0x7F; - unsigned i = 0; - unsigned last_value = 0; - unsigned num_encoded = 0; - while (i < indices_length 
&& num_encoded < num_points) - { - unsigned run_length = 0; - unsigned header_pos = pos; - p[pos++] = 0; - - bool use_byte_encoding = false; - bool new_run = true; - while (i < indices_length && num_encoded < num_points && - run_length <= max_run_length) - { - // find out next referenced point index - while (i < indices_length && !point_indices[i]) - i++; - - if (i >= indices_length) break; - - unsigned cur_value = i; - unsigned delta = cur_value - last_value; - - if (new_run) - { - use_byte_encoding = (delta <= 0xFF); - new_run = false; - } - - if (use_byte_encoding && delta > 0xFF) - break; - - if (use_byte_encoding) - p[pos++] = delta; - else - { - p[pos++] = delta >> 8; - p[pos++] = delta & 0xFF; - } - i++; - last_value = cur_value; - run_length++; - num_encoded++; - } - - if (use_byte_encoding) - p[header_pos] = run_length - 1; - else - p[header_pos] = (run_length - 1) | 0x80; - } - return hb_bytes_t (p, pos); + return true; } /* compile all point set and store byte data in a point_set->hb_bytes_t hashmap, @@ -1402,11 +1504,11 @@ struct TupleVariationData continue; } - hb_bytes_t compiled_data = compile_point_set (*points_set); - if (unlikely (compiled_data == hb_bytes_t ())) + hb_vector_t<char> compiled_point_data; + if (!tuple_delta_t::compile_point_set (*points_set, compiled_point_data)) return false; - if (!point_data_map.set (points_set, compiled_data) || + if (!point_data_map.set (points_set, std::move (compiled_point_data)) || !point_set_count_map.set (points_set, 1)) return false; } @@ -1414,31 +1516,33 @@ struct TupleVariationData } /* find shared points set which saves most bytes */ - hb_bytes_t find_shared_points () + void find_shared_points () { unsigned max_saved_bytes = 0; - hb_bytes_t res{}; - for (const auto& _ : point_data_map.iter ()) + for (const auto& _ : point_data_map.iter_ref ()) { const hb_vector_t<bool>* points_set = _.first; unsigned data_length = _.second.length; + if (!data_length) continue; unsigned *count; if (unlikely (!point_set_count_map.has (points_set, &count) || *count <= 1)) - return hb_bytes_t (); + { + shared_points_bytes = nullptr; + return; + } unsigned saved_bytes = data_length * ((*count) -1); if (saved_bytes > max_saved_bytes) { max_saved_bytes = saved_bytes; - res = _.second; + shared_points_bytes = &(_.second); } } - return res; } - bool calc_inferred_deltas (contour_point_vector_t& contour_points) + bool calc_inferred_deltas (const contour_point_vector_t& contour_points) { for (tuple_delta_t& var : tuple_vars) if (!var.calc_inferred_deltas (contour_points)) @@ -1447,10 +1551,21 @@ struct TupleVariationData return true; } + bool iup_optimize (const contour_point_vector_t& contour_points) + { + for (tuple_delta_t& var : tuple_vars) + { + if (!var.optimize (contour_points, is_composite)) + return false; + } + return true; + } + public: bool instantiate (const hb_hashmap_t<hb_tag_t, Triple>& normalized_axes_location, const hb_hashmap_t<hb_tag_t, TripleDistances>& axes_triple_distances, - contour_point_vector_t* contour_points = nullptr) + contour_point_vector_t* contour_points = nullptr, + bool optimize = false) { if (!tuple_vars) return true; if (!change_tuple_variations_axis_limits (normalized_axes_location, axes_triple_distances)) @@ -1460,7 +1575,14 @@ struct TupleVariationData if (!calc_inferred_deltas (*contour_points)) return false; - merge_tuple_variations (); + /* if iup delta opt is on, contour_points can't be null */ + if (optimize && !contour_points) + return false; + + if (!merge_tuple_variations (optimize ? 
contour_points : nullptr)) + return false; + + if (optimize && !iup_optimize (*contour_points)) return false; return !tuple_vars.in_error (); } @@ -1475,21 +1597,27 @@ struct TupleVariationData if (use_shared_points) { - shared_points_bytes = find_shared_points (); - compiled_byte_size += shared_points_bytes.length; + find_shared_points (); + if (shared_points_bytes) + compiled_byte_size += shared_points_bytes->length; } // compile delta and tuple var header for each tuple variation for (auto& tuple: tuple_vars) { const hb_vector_t<bool>* points_set = &(tuple.indices); - hb_bytes_t *points_data; + hb_vector_t<char> *points_data; if (unlikely (!point_data_map.has (points_set, &points_data))) return false; + /* when iup optimization is enabled, num of referenced points could be 0 + * and thus the compiled points bytes is empty, we should skip compiling + * this tuple */ + if (!points_data->length) + continue; if (!tuple.compile_deltas ()) return false; - unsigned points_data_length = (*points_data != shared_points_bytes) ? points_data->length : 0; + unsigned points_data_length = (points_data != shared_points_bytes) ? points_data->length : 0; if (!tuple.compile_tuple_var_header (axes_index_map, points_data_length, axes_old_index_tag_map, shared_tuples_idx_map)) return false; @@ -1513,18 +1641,24 @@ struct TupleVariationData bool serialize_var_data (hb_serialize_context_t *c, bool is_gvar) const { TRACE_SERIALIZE (this); - if (is_gvar) - shared_points_bytes.copy (c); + if (is_gvar && shared_points_bytes) + { + hb_bytes_t s (shared_points_bytes->arrayZ, shared_points_bytes->length); + s.copy (c); + } for (const auto& tuple: tuple_vars) { const hb_vector_t<bool>* points_set = &(tuple.indices); - hb_bytes_t *point_data; + hb_vector_t<char> *point_data; if (!point_data_map.has (points_set, &point_data)) return_trace (false); - if (!is_gvar || *point_data != shared_points_bytes) - point_data->copy (c); + if (!is_gvar || point_data != shared_points_bytes) + { + hb_bytes_t s (point_data->arrayZ, point_data->length); + s.copy (c); + } tuple.compiled_deltas.as_array ().copy (c); if (c->in_error ()) return_trace (false); @@ -1711,13 +1845,15 @@ struct TupleVariationData const hb_map_t *axes_old_index_tag_map, const hb_vector_t<unsigned> &shared_indices, const hb_array_t<const F2DOT14> shared_tuples, - tuple_variations_t& tuple_variations /* OUT */) const + tuple_variations_t& tuple_variations, /* OUT */ + bool is_composite_glyph = false) const { return tuple_variations.create_from_tuple_var_data (iterator, tupleVarCount, point_count, is_gvar, axes_old_index_tag_map, shared_indices, - shared_tuples); + shared_tuples, + is_composite_glyph); } bool serialize (hb_serialize_context_t *c, @@ -1831,7 +1967,7 @@ struct item_variations_t const hb_map_t& get_varidx_map () const { return varidx_map; } - bool instantiate (const VariationStore& varStore, + bool instantiate (const ItemVariationStore& varStore, const hb_subset_plan_t *plan, bool optimize=true, bool use_no_variation_idx=true, @@ -1845,7 +1981,7 @@ struct item_variations_t } /* keep below APIs public only for unit test: test-item-varstore */ - bool create_from_item_varstore (const VariationStore& varStore, + bool create_from_item_varstore (const ItemVariationStore& varStore, const hb_map_t& axes_old_index_tag_map, const hb_array_t <const hb_inc_bimap_t> inner_maps = hb_array_t<const hb_inc_bimap_t> ()) { diff --git a/thirdparty/harfbuzz/src/hb-ot-var-gvar-table.hh b/thirdparty/harfbuzz/src/hb-ot-var-gvar-table.hh index 1c7a1f6c1e..59aad57e37 100644 --- 
a/thirdparty/harfbuzz/src/hb-ot-var-gvar-table.hh +++ b/thirdparty/harfbuzz/src/hb-ot-var-gvar-table.hh @@ -101,10 +101,15 @@ struct glyph_variations_t continue; } + bool is_composite_glyph = false; +#ifdef HB_EXPERIMENTAL_API + is_composite_glyph = plan->composite_new_gids.has (new_gid); +#endif if (!p->decompile_tuple_variations (all_contour_points->length, true /* is_gvar */, iterator, &(plan->axes_old_index_tag_map), shared_indices, shared_tuples, - tuple_vars /* OUT */)) + tuple_vars, /* OUT */ + is_composite_glyph)) return false; glyph_variations.push (std::move (tuple_vars)); } @@ -114,13 +119,17 @@ struct glyph_variations_t bool instantiate (const hb_subset_plan_t *plan) { unsigned count = plan->new_to_old_gid_list.length; + bool iup_optimize = false; +#ifdef HB_EXPERIMENTAL_API + iup_optimize = plan->flags & HB_SUBSET_FLAGS_OPTIMIZE_IUP_DELTAS; +#endif for (unsigned i = 0; i < count; i++) { hb_codepoint_t new_gid = plan->new_to_old_gid_list[i].first; contour_point_vector_t *all_points; if (!plan->new_gid_contour_points_map.has (new_gid, &all_points)) return false; - if (!glyph_variations[i].instantiate (plan->axes_location, plan->axes_triple_distances, all_points)) + if (!glyph_variations[i].instantiate (plan->axes_location, plan->axes_triple_distances, all_points, iup_optimize)) return false; } return true; @@ -340,7 +349,8 @@ struct gvar const glyph_variations_t& glyph_vars, Iterator it, unsigned axis_count, - unsigned num_glyphs) const + unsigned num_glyphs, + bool force_long_offsets) const { TRACE_SERIALIZE (this); gvar *out = c->allocate_min<gvar> (); @@ -352,7 +362,7 @@ struct gvar out->glyphCountX = hb_min (0xFFFFu, num_glyphs); unsigned glyph_var_data_size = glyph_vars.compiled_byte_size (); - bool long_offset = glyph_var_data_size & ~0xFFFFu; + bool long_offset = glyph_var_data_size & ~0xFFFFu || force_long_offsets; out->flags = long_offset ? 1 : 0; HBUINT8 *glyph_var_data_offsets = c->allocate_size<HBUINT8> ((long_offset ? 4 : 2) * (num_glyphs + 1), false); @@ -393,7 +403,12 @@ struct gvar unsigned axis_count = c->plan->axes_index_map.get_population (); unsigned num_glyphs = c->plan->num_output_glyphs (); auto it = hb_iter (c->plan->new_to_old_gid_list); - return_trace (serialize (c->serializer, glyph_vars, it, axis_count, num_glyphs)); + + bool force_long_offsets = false; +#ifdef HB_EXPERIMENTAL_API + force_long_offsets = c->plan->flags & HB_SUBSET_FLAGS_IFTB_REQUIREMENTS; +#endif + return_trace (serialize (c->serializer, glyph_vars, it, axis_count, num_glyphs, force_long_offsets)); } bool subset (hb_subset_context_t *c) const @@ -429,7 +444,7 @@ struct gvar } bool long_offset = (subset_data_size & ~0xFFFFu); - #ifdef HB_EXPERIMENTAL_API +#ifdef HB_EXPERIMENTAL_API long_offset = long_offset || (c->plan->flags & HB_SUBSET_FLAGS_IFTB_REQUIREMENTS); #endif out->flags = long_offset ? 
1 : 0; diff --git a/thirdparty/harfbuzz/src/hb-ot-var-hvar-table.hh b/thirdparty/harfbuzz/src/hb-ot-var-hvar-table.hh index 53a4642d38..33a4e1a40e 100644 --- a/thirdparty/harfbuzz/src/hb-ot-var-hvar-table.hh +++ b/thirdparty/harfbuzz/src/hb-ot-var-hvar-table.hh @@ -188,7 +188,7 @@ struct hvarvvar_subset_plan_t ~hvarvvar_subset_plan_t() { fini (); } void init (const hb_array_t<const DeltaSetIndexMap *> &index_maps, - const VariationStore &_var_store, + const ItemVariationStore &_var_store, const hb_subset_plan_t *plan) { index_map_plans.resize (index_maps.length); @@ -263,7 +263,7 @@ struct hvarvvar_subset_plan_t hb_inc_bimap_t outer_map; hb_vector_t<hb_inc_bimap_t> inner_maps; hb_vector_t<index_map_subset_plan_t> index_map_plans; - const VariationStore *var_store; + const ItemVariationStore *var_store; protected: hb_vector_t<hb_set_t *> inner_sets; @@ -296,7 +296,7 @@ struct HVARVVAR rsbMap.sanitize (c, this)); } - const VariationStore& get_var_store () const + const ItemVariationStore& get_var_store () const { return this+varStore; } void listup_index_maps (hb_vector_t<const DeltaSetIndexMap *> &index_maps) const @@ -384,7 +384,7 @@ struct HVARVVAR float get_advance_delta_unscaled (hb_codepoint_t glyph, const int *coords, unsigned int coord_count, - VariationStore::cache_t *store_cache = nullptr) const + ItemVariationStore::cache_t *store_cache = nullptr) const { uint32_t varidx = (this+advMap).map (glyph); return (this+varStore).get_delta (varidx, @@ -405,7 +405,7 @@ struct HVARVVAR public: FixedVersion<>version; /* Version of the metrics variation table * initially set to 0x00010000u */ - Offset32To<VariationStore> + Offset32To<ItemVariationStore> varStore; /* Offset to item variation store table. */ Offset32To<DeltaSetIndexMap> advMap; /* Offset to advance var-idx mapping. */ diff --git a/thirdparty/harfbuzz/src/hb-ot-var-mvar-table.hh b/thirdparty/harfbuzz/src/hb-ot-var-mvar-table.hh index 6d69777618..1f0401d1d3 100644 --- a/thirdparty/harfbuzz/src/hb-ot-var-mvar-table.hh +++ b/thirdparty/harfbuzz/src/hb-ot-var-mvar-table.hh @@ -56,7 +56,7 @@ struct VariationValueRecord public: Tag valueTag; /* Four-byte tag identifying a font-wide measure. */ - VarIdx varIdx; /* Outer/inner index into VariationStore item. */ + VarIdx varIdx; /* Outer/inner index into ItemVariationStore item. */ public: DEFINE_SIZE_STATIC (8); @@ -106,7 +106,7 @@ struct MVAR out->valueRecordCount = valueRecordCount; item_variations_t item_vars; - const VariationStore& src_var_store = this+varStore; + const ItemVariationStore& src_var_store = this+varStore; if (!item_vars.instantiate (src_var_store, c->plan)) return_trace (false); @@ -159,7 +159,7 @@ protected: HBUINT16 valueRecordSize;/* The size in bytes of each value record — * must be greater than zero. */ HBUINT16 valueRecordCount;/* The number of value records — may be zero. */ - Offset16To<VariationStore> + Offset16To<ItemVariationStore> varStore; /* Offset to item variation store table. */ UnsizedArrayOf<HBUINT8> valuesZ; /* Array of value records. 
The records must be diff --git a/thirdparty/harfbuzz/src/hb-priority-queue.hh b/thirdparty/harfbuzz/src/hb-priority-queue.hh index 9b962a29d9..274d5df4c5 100644 --- a/thirdparty/harfbuzz/src/hb-priority-queue.hh +++ b/thirdparty/harfbuzz/src/hb-priority-queue.hh @@ -163,7 +163,7 @@ struct hb_priority_queue_t goto repeat; } - void swap (unsigned a, unsigned b) + void swap (unsigned a, unsigned b) noexcept { assert (a < heap.length); assert (b < heap.length); diff --git a/thirdparty/harfbuzz/src/hb-repacker.hh b/thirdparty/harfbuzz/src/hb-repacker.hh index e9cd376ad3..ed40f271cc 100644 --- a/thirdparty/harfbuzz/src/hb-repacker.hh +++ b/thirdparty/harfbuzz/src/hb-repacker.hh @@ -239,6 +239,54 @@ bool _try_isolating_subgraphs (const hb_vector_t<graph::overflow_record_t>& over } static inline +bool _resolve_shared_overflow(const hb_vector_t<graph::overflow_record_t>& overflows, + int overflow_index, + graph_t& sorted_graph) +{ + const graph::overflow_record_t& r = overflows[overflow_index]; + + // Find all of the parents in overflowing links that link to this + // same child node. We will then try duplicating the child node and + // re-assigning all of these parents to the duplicate. + hb_set_t parents; + parents.add(r.parent); + for (int i = overflow_index - 1; i >= 0; i--) { + const graph::overflow_record_t& r2 = overflows[i]; + if (r2.child == r.child) { + parents.add(r2.parent); + } + } + + unsigned result = sorted_graph.duplicate(&parents, r.child); + if (result == (unsigned) -1 && parents.get_population() > 2) { + // All links to the child are overflowing, so we can't include all + // in the duplication. Remove one parent from the duplication. + // Remove the lowest index parent, which will be the closest to the child. + parents.del(parents.get_min()); + result = sorted_graph.duplicate(&parents, r.child); + } + + if (result == (unsigned) -1) return result; + + if (parents.get_population() > 1) { + // If the duplicated node has more than one parent pre-emptively raise it's priority to the maximum. + // This will place it close to the parents. Node's with only one parent, don't need this as normal overflow + // resolution will raise priority if needed. + // + // Reasoning: most of the parents to this child are likely at the same layer in the graph. Duplicating + // the child will theoretically allow it to be placed closer to it's parents. However, due to the shortest + // distance sort by default it's placement will remain in the same layer, thus it will remain in roughly the + // same position (and distance from parents) as the original child node. The overflow resolution will attempt + // to move nodes closer, but only for non-shared nodes. Since this node is shared, it will simply be given + // further duplication which defeats the attempt to duplicate with multiple parents. To fix this we + // pre-emptively raise priority now which allows the duplicated node to pack into the same layer as it's parents. + sorted_graph.vertices_[result].give_max_priority(); + } + + return result; +} + +static inline bool _process_overflows (const hb_vector_t<graph::overflow_record_t>& overflows, hb_set_t& priority_bumped_parents, graph_t& sorted_graph) @@ -254,7 +302,7 @@ bool _process_overflows (const hb_vector_t<graph::overflow_record_t>& overflows, { // The child object is shared, we may be able to eliminate the overflow // by duplicating it. 
- if (sorted_graph.duplicate (r.parent, r.child) == (unsigned) -1) continue; + if (!_resolve_shared_overflow(overflows, i, sorted_graph)) continue; return true; } @@ -388,7 +436,7 @@ template<typename T> inline hb_blob_t* hb_resolve_overflows (const T& packed, hb_tag_t table_tag, - unsigned max_rounds = 20, + unsigned max_rounds = 32, bool recalculate_extensions = false) { graph_t sorted_graph (packed); if (sorted_graph.in_error ()) diff --git a/thirdparty/harfbuzz/src/hb-serialize.hh b/thirdparty/harfbuzz/src/hb-serialize.hh index 15eccb6a09..e988451eb3 100644 --- a/thirdparty/harfbuzz/src/hb-serialize.hh +++ b/thirdparty/harfbuzz/src/hb-serialize.hh @@ -91,7 +91,27 @@ struct hb_serialize_context_t } #endif - friend void swap (object_t& a, object_t& b) + bool add_virtual_link (objidx_t objidx) + { + if (!objidx) + return false; + + auto& link = *virtual_links.push (); + if (virtual_links.in_error ()) + return false; + + link.objidx = objidx; + // Remaining fields were previously zero'd by push(): + // link.width = 0; + // link.is_signed = 0; + // link.whence = 0; + // link.position = 0; + // link.bias = 0; + + return true; + } + + friend void swap (object_t& a, object_t& b) noexcept { hb_swap (a.head, b.head); hb_swap (a.tail, b.tail); @@ -156,9 +176,9 @@ struct hb_serialize_context_t object_t *next; auto all_links () const HB_AUTO_RETURN - (( hb_concat (this->real_links, this->virtual_links) )); + (( hb_concat (real_links, virtual_links) )); auto all_links_writer () HB_AUTO_RETURN - (( hb_concat (this->real_links.writer (), this->virtual_links.writer ()) )); + (( hb_concat (real_links.writer (), virtual_links.writer ()) )); }; struct snapshot_t @@ -469,16 +489,40 @@ struct hb_serialize_context_t assert (current); - auto& link = *current->virtual_links.push (); - if (current->virtual_links.in_error ()) + if (!current->add_virtual_link(objidx)) err (HB_SERIALIZE_ERROR_OTHER); + } - link.width = 0; - link.objidx = objidx; - link.is_signed = 0; - link.whence = 0; - link.position = 0; - link.bias = 0; + objidx_t last_added_child_index() const { + if (unlikely (in_error ())) return (objidx_t) -1; + + assert (current); + if (!bool(current->real_links)) { + return (objidx_t) -1; + } + + return current->real_links[current->real_links.length - 1].objidx; + } + + // For the current object ensure that the sub-table bytes for child objidx are always placed + // after the subtable bytes for any other existing children. This only ensures that the + // repacker will not move the target subtable before the other children + // (by adding virtual links). It is up to the caller to ensure the initial serialization + // order is correct. 
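The comment above describes the intent of the new repack_last () helper, whose body follows: it records ordering-only "virtual" links so that one child object is kept after all of its siblings when the repacker reorders subtables. As a rough illustration of why that works, here is a minimal standalone model; the graph, the names, and the Kahn-style sort below are illustrative only and are not HarfBuzz's serializer or repacker types.

```cpp
// Minimal standalone model (illustrative names only, not HarfBuzz types):
// virtual links add ordering constraints without emitting an offset, so
// giving every other child an edge to the target forces any topological
// packing order to place that target after all of its siblings.
#include <cstdio>
#include <vector>

int main ()
{
  // 0 = parent, {1,2,3} = children; we want 3 packed last.
  std::vector<std::vector<int>> edges = {
    {1, 2, 3},  // real links from the parent
    {3},        // virtual link: sibling 1 -> 3 (what repack_last would record)
    {3},        // virtual link: sibling 2 -> 3
    {}
  };

  // Kahn's algorithm: any valid order must respect the virtual edges too.
  std::vector<int> indeg (edges.size (), 0);
  for (auto &e : edges) for (int c : e) indeg[c]++;
  std::vector<int> ready, order;
  for (unsigned i = 0; i < edges.size (); i++) if (!indeg[i]) ready.push_back (i);
  while (!ready.empty ())
  {
    int v = ready.back (); ready.pop_back ();
    order.push_back (v);
    for (int c : edges[v]) if (!--indeg[c]) ready.push_back (c);
  }
  for (int v : order) printf ("%d ", v);  // e.g. "0 2 1 3": 3 is always last
  printf ("\n");
  return 0;
}
```

Because the virtual edges only participate in the ordering and never produce an offset, the constraint adds no bytes to the serialized output.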
+ void repack_last(objidx_t objidx) { + if (unlikely (in_error ())) return; + + if (!objidx) + return; + + assert (current); + for (auto& l : current->real_links) { + if (l.objidx == objidx) { + continue; + } + + packed[l.objidx]->add_virtual_link(objidx); + } } template <typename T> diff --git a/thirdparty/harfbuzz/src/hb-set.hh b/thirdparty/harfbuzz/src/hb-set.hh index ff2a170d2d..ce69ea2c9b 100644 --- a/thirdparty/harfbuzz/src/hb-set.hh +++ b/thirdparty/harfbuzz/src/hb-set.hh @@ -44,10 +44,10 @@ struct hb_sparseset_t ~hb_sparseset_t () { fini (); } hb_sparseset_t (const hb_sparseset_t& other) : hb_sparseset_t () { set (other); } - hb_sparseset_t (hb_sparseset_t&& other) : hb_sparseset_t () { s = std::move (other.s); } + hb_sparseset_t (hb_sparseset_t&& other) noexcept : hb_sparseset_t () { s = std::move (other.s); } hb_sparseset_t& operator = (const hb_sparseset_t& other) { set (other); return *this; } - hb_sparseset_t& operator = (hb_sparseset_t&& other) { s = std::move (other.s); return *this; } - friend void swap (hb_sparseset_t& a, hb_sparseset_t& b) { hb_swap (a.s, b.s); } + hb_sparseset_t& operator = (hb_sparseset_t&& other) noexcept { s = std::move (other.s); return *this; } + friend void swap (hb_sparseset_t& a, hb_sparseset_t& b) noexcept { hb_swap (a.s, b.s); } hb_sparseset_t (std::initializer_list<hb_codepoint_t> lst) : hb_sparseset_t () { @@ -166,7 +166,7 @@ struct hb_set_t : hb_sparseset_t<hb_bit_set_invertible_t> ~hb_set_t () = default; hb_set_t () : sparseset () {}; hb_set_t (const hb_set_t &o) : sparseset ((sparseset &) o) {}; - hb_set_t (hb_set_t&& o) : sparseset (std::move ((sparseset &) o)) {} + hb_set_t (hb_set_t&& o) noexcept : sparseset (std::move ((sparseset &) o)) {} hb_set_t& operator = (const hb_set_t&) = default; hb_set_t& operator = (hb_set_t&&) = default; hb_set_t (std::initializer_list<hb_codepoint_t> lst) : sparseset (lst) {} diff --git a/thirdparty/harfbuzz/src/hb-subset-cff2.cc b/thirdparty/harfbuzz/src/hb-subset-cff2.cc index abc108e571..eb5cb0c625 100644 --- a/thirdparty/harfbuzz/src/hb-subset-cff2.cc +++ b/thirdparty/harfbuzz/src/hb-subset-cff2.cc @@ -248,7 +248,7 @@ struct cff2_subr_subsetter_t : subr_subsetter_t<cff2_subr_subsetter_t, CFF2Subrs struct cff2_private_blend_encoder_param_t { cff2_private_blend_encoder_param_t (hb_serialize_context_t *c, - const CFF2VariationStore *varStore, + const CFF2ItemVariationStore *varStore, hb_array_t<int> normalized_coords) : c (c), varStore (varStore), normalized_coords (normalized_coords) {} @@ -284,7 +284,7 @@ struct cff2_private_blend_encoder_param_t unsigned ivs = 0; unsigned region_count = 0; hb_vector_t<float> scalars; - const CFF2VariationStore *varStore = nullptr; + const CFF2ItemVariationStore *varStore = nullptr; hb_array_t<int> normalized_coords; }; @@ -378,7 +378,7 @@ struct cff2_private_dict_blend_opset_t : dict_opset_t struct cff2_private_dict_op_serializer_t : op_serializer_t { cff2_private_dict_op_serializer_t (bool desubroutinize_, bool drop_hints_, bool pinned_, - const CFF::CFF2VariationStore* varStore_, + const CFF::CFF2ItemVariationStore* varStore_, hb_array_t<int> normalized_coords_) : desubroutinize (desubroutinize_), drop_hints (drop_hints_), pinned (pinned_), varStore (varStore_), normalized_coords (normalized_coords_) {} @@ -416,7 +416,7 @@ struct cff2_private_dict_op_serializer_t : op_serializer_t const bool desubroutinize; const bool drop_hints; const bool pinned; - const CFF::CFF2VariationStore* varStore; + const CFF::CFF2ItemVariationStore* varStore; hb_array_t<int> 
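The repack_last() helper above relies only on ordering constraints; a toy restatement of the idea, using stand-in types rather than hb's object/link structures:

#include <vector>

struct ToyObject
{
  std::vector<unsigned> real_children;     // children reachable through real offset links
  std::vector<unsigned> virtual_children;  // ordering-only constraints, no bytes written
};

// Force `target` to be packed after every other child of `current`: each sibling gets a
// virtual link to `target`, so the repacker's sort keeps the sibling's bytes first.
static void repack_last_sketch (std::vector<ToyObject>& packed, const ToyObject& current, unsigned target)
{
  for (unsigned child : current.real_children)
    if (child != target)
      packed[child].virtual_children.push_back (target);
}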
normalized_coords; }; @@ -628,10 +628,10 @@ OT::cff2::accelerator_subset_t::serialize (hb_serialize_context_t *c, } /* variation store */ - if (varStore != &Null (CFF2VariationStore) && + if (varStore != &Null (CFF2ItemVariationStore) && !plan.pinned) { - auto *dest = c->push<CFF2VariationStore> (); + auto *dest = c->push<CFF2ItemVariationStore> (); if (unlikely (!dest->serialize (c, varStore))) { c->pop_discard (); diff --git a/thirdparty/harfbuzz/src/hb-subset-input.cc b/thirdparty/harfbuzz/src/hb-subset-input.cc index 1e0a89a630..68a3e77788 100644 --- a/thirdparty/harfbuzz/src/hb-subset-input.cc +++ b/thirdparty/harfbuzz/src/hb-subset-input.cc @@ -24,6 +24,7 @@ * Google Author(s): Garret Rieger, Rod Sheeter, Behdad Esfahbod */ +#include "hb-subset-instancer-solver.hh" #include "hb-subset.hh" #include "hb-set.hh" #include "hb-utf.hh" @@ -50,7 +51,6 @@ hb_subset_input_t::hb_subset_input_t () HB_TAG ('k', 'e', 'r', 'n'), // Copied from fontTools: - HB_TAG ('B', 'A', 'S', 'E'), HB_TAG ('J', 'S', 'T', 'F'), HB_TAG ('D', 'S', 'I', 'G'), HB_TAG ('E', 'B', 'D', 'T'), @@ -418,6 +418,46 @@ hb_subset_input_keep_everything (hb_subset_input_t *input) #ifndef HB_NO_VAR /** + * hb_subset_input_pin_all_axes_to_default: (skip) + * @input: a #hb_subset_input_t object. + * @face: a #hb_face_t object. + * + * Pin all axes to default locations in the given subset input object. + * + * All axes in a font must be pinned. Additionally, `CFF2` table, if present, + * will be de-subroutinized. + * + * Return value: `true` if success, `false` otherwise + * + * Since: 8.3.1 + **/ +HB_EXTERN hb_bool_t +hb_subset_input_pin_all_axes_to_default (hb_subset_input_t *input, + hb_face_t *face) +{ + unsigned axis_count = hb_ot_var_get_axis_count (face); + if (!axis_count) return false; + + hb_ot_var_axis_info_t *axis_infos = (hb_ot_var_axis_info_t *) hb_calloc (axis_count, sizeof (hb_ot_var_axis_info_t)); + if (unlikely (!axis_infos)) return false; + + (void) hb_ot_var_get_axis_infos (face, 0, &axis_count, axis_infos); + + for (unsigned i = 0; i < axis_count; i++) + { + hb_tag_t axis_tag = axis_infos[i].tag; + float default_val = axis_infos[i].default_value; + if (!input->axes_location.set (axis_tag, Triple (default_val, default_val, default_val))) + { + hb_free (axis_infos); + return false; + } + } + hb_free (axis_infos); + return true; +} + +/** * hb_subset_input_pin_axis_to_default: (skip) * @input: a #hb_subset_input_t object. * @face: a #hb_face_t object. @@ -481,16 +521,13 @@ hb_subset_input_pin_axis_location (hb_subset_input_t *input, * @input: a #hb_subset_input_t object. * @face: a #hb_face_t object. * @axis_tag: Tag of the axis - * @axis_min_value: Minimum value of the axis variation range to set - * @axis_max_value: Maximum value of the axis variation range to set - * @axis_def_value: Default value of the axis variation range to set, in case of - * null, it'll be determined automatically + * @axis_min_value: Minimum value of the axis variation range to set, if NaN the existing min will be used. + * @axis_max_value: Maximum value of the axis variation range to set if NaN the existing max will be used. + * @axis_def_value: Default value of the axis variation range to set, if NaN the existing default will be used. * * Restricting the range of variation on an axis in the given subset input object. * New min/default/max values will be clamped if they're not within the fvar axis range. 
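The new hb_subset_input_pin_all_axes_to_default() slots into the usual subsetting flow; a minimal usage sketch against the public API (font loading and error handling elided):

#include <hb-subset.h>

static hb_face_t *instance_at_defaults (hb_face_t *face, const hb_set_t *unicodes)
{
  hb_subset_input_t *input = hb_subset_input_create_or_fail ();
  if (!input) return nullptr;

  hb_set_union (hb_subset_input_unicode_set (input), unicodes);

  // Pins every fvar axis to its default location; CFF2, if present, is de-subroutinized.
  if (!hb_subset_input_pin_all_axes_to_default (input, face))
  {
    hb_subset_input_destroy (input);
    return nullptr;
  }

  hb_face_t *instance = hb_subset_or_fail (face, input);
  hb_subset_input_destroy (input);
  return instance;
}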
- * If the new default value is null: - * If the fvar axis default value is within the new range, then new default - * value is the same as original default value. + * * If the fvar axis default value is not within the new range, the new default * value will be changed to the new min or max value, whichever is closer to the fvar * axis default. @@ -509,21 +546,57 @@ hb_subset_input_set_axis_range (hb_subset_input_t *input, hb_tag_t axis_tag, float axis_min_value, float axis_max_value, - float *axis_def_value /* IN, maybe NULL */) + float axis_def_value) { - if (axis_min_value > axis_max_value) - return false; - hb_ot_var_axis_info_t axis_info; if (!hb_ot_var_find_axis_info (face, axis_tag, &axis_info)) return false; - float new_min_val = hb_clamp(axis_min_value, axis_info.min_value, axis_info.max_value); - float new_max_val = hb_clamp(axis_max_value, axis_info.min_value, axis_info.max_value); - float new_default_val = axis_def_value ? *axis_def_value : axis_info.default_value; - new_default_val = hb_clamp(new_default_val, new_min_val, new_max_val); + float min = !std::isnan(axis_min_value) ? axis_min_value : axis_info.min_value; + float max = !std::isnan(axis_max_value) ? axis_max_value : axis_info.max_value; + float def = !std::isnan(axis_def_value) ? axis_def_value : axis_info.default_value; + + if (min > max) + return false; + + float new_min_val = hb_clamp(min, axis_info.min_value, axis_info.max_value); + float new_max_val = hb_clamp(max, axis_info.min_value, axis_info.max_value); + float new_default_val = hb_clamp(def, new_min_val, new_max_val); return input->axes_location.set (axis_tag, Triple (new_min_val, new_default_val, new_max_val)); } + +/** + * hb_subset_input_get_axis_range: (skip) + * @input: a #hb_subset_input_t object. + * @axis_tag: Tag of the axis + * @axis_min_value: Set to the previously configured minimum value of the axis variation range. + * @axis_max_value: Set to the previously configured maximum value of the axis variation range. + * @axis_def_value: Set to the previously configured default value of the axis variation range. + * + * Gets the axis range assigned by previous calls to hb_subset_input_set_axis_range. + * + * Return value: `true` if a range has been set for this axis tag, `false` otherwise. + * + * XSince: EXPERIMENTAL + **/ +HB_EXTERN hb_bool_t +hb_subset_input_get_axis_range (hb_subset_input_t *input, + hb_tag_t axis_tag, + float *axis_min_value, + float *axis_max_value, + float *axis_def_value) + +{ + Triple* triple; + if (!input->axes_location.has(axis_tag, &triple)) { + return false; + } + + *axis_min_value = triple->minimum; + *axis_def_value = triple->middle; + *axis_max_value = triple->maximum; + return true; +} #endif #endif diff --git a/thirdparty/harfbuzz/src/hb-subset-instancer-iup.cc b/thirdparty/harfbuzz/src/hb-subset-instancer-iup.cc new file mode 100644 index 0000000000..35a964d082 --- /dev/null +++ b/thirdparty/harfbuzz/src/hb-subset-instancer-iup.cc @@ -0,0 +1,532 @@ +/* + * Copyright © 2024 Google, Inc. + * + * This is part of HarfBuzz, a text shaping library. + * + * Permission is hereby granted, without written agreement and without + * license or royalty fees, to use, copy, modify, and distribute this + * software and its documentation for any purpose, provided that the + * above copyright notice and the following two paragraphs appear in + * all copies of this software. 
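With the signature change above, passing NaN for any of the three values keeps the corresponding fvar value; a usage sketch (both functions sit behind HB_EXPERIMENTAL_API):

#include <cmath>
#include <hb-subset.h>

static bool limit_weight (hb_subset_input_t *input, hb_face_t *face)
{
  // Restrict wght to [400, 700]; NAN keeps the font's own default, clamped into the new range.
  if (!hb_subset_input_set_axis_range (input, face, HB_TAG ('w','g','h','t'),
                                       400.f, 700.f, NAN))
    return false;

  float min_v, max_v, def_v;
  if (hb_subset_input_get_axis_range (input, HB_TAG ('w','g','h','t'),
                                      &min_v, &max_v, &def_v))
  {
    // min_v/max_v/def_v now hold the clamped triple stored on the input.
  }
  return true;
}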
+ * + * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR + * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES + * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN + * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, + * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS + * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO + * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. + */ + +#include "hb-subset-instancer-iup.hh" + +/* This file is a straight port of the following: + * + * https://github.com/fonttools/fonttools/blob/main/Lib/fontTools/varLib/iup.py + * + * Where that file returns optimzied deltas vector, we return optimized + * referenced point indices. + */ + +constexpr static unsigned MAX_LOOKBACK = 8; + +static void _iup_contour_bound_forced_set (const hb_array_t<const contour_point_t> contour_points, + const hb_array_t<const int> x_deltas, + const hb_array_t<const int> y_deltas, + hb_set_t& forced_set, /* OUT */ + float tolerance = 0.f) +{ + unsigned len = contour_points.length; + unsigned next_i = 0; + for (int i = len - 1; i >= 0; i--) + { + unsigned last_i = (len + i -1) % len; + for (unsigned j = 0; j < 2; j++) + { + float cj, lcj, ncj; + int dj, ldj, ndj; + if (j == 0) + { + cj = contour_points.arrayZ[i].x; + dj = x_deltas.arrayZ[i]; + lcj = contour_points.arrayZ[last_i].x; + ldj = x_deltas.arrayZ[last_i]; + ncj = contour_points.arrayZ[next_i].x; + ndj = x_deltas.arrayZ[next_i]; + } + else + { + cj = contour_points.arrayZ[i].y; + dj = y_deltas.arrayZ[i]; + lcj = contour_points.arrayZ[last_i].y; + ldj = y_deltas.arrayZ[last_i]; + ncj = contour_points.arrayZ[next_i].y; + ndj = y_deltas.arrayZ[next_i]; + } + + float c1, c2; + int d1, d2; + if (lcj <= ncj) + { + c1 = lcj; + c2 = ncj; + d1 = ldj; + d2 = ndj; + } + else + { + c1 = ncj; + c2 = lcj; + d1 = ndj; + d2 = ldj; + } + + bool force = false; + if (c1 == c2) + { + if (abs (d1 - d2) > tolerance && abs (dj) > tolerance) + force = true; + } + else if (c1 <= cj && cj <= c2) + { + if (!(hb_min (d1, d2) - tolerance <= dj && + dj <= hb_max (d1, d2) + tolerance)) + force = true; + } + else + { + if (d1 != d2) + { + if (cj < c1) + { + if (abs (dj) > tolerance && + abs (dj - d1) > tolerance && + ((dj - tolerance < d1) != (d1 < d2))) + force = true; + } + else + { + if (abs (dj) > tolerance && + abs (dj - d2) > tolerance && + ((d2 < dj + tolerance) != (d1 < d2))) + force = true; + } + } + } + + if (force) + { + forced_set.add (i); + break; + } + } + next_i = i; + } +} + +template <typename T, + hb_enable_if (hb_is_trivially_copyable (T))> +static bool rotate_array (const hb_array_t<const T>& org_array, + int k, + hb_vector_t<T>& out) +{ + unsigned n = org_array.length; + if (!n) return true; + if (unlikely (!out.resize (n, false))) + return false; + + unsigned item_size = hb_static_size (T); + if (k < 0) + k = n - (-k) % n; + else + k %= n; + + hb_memcpy ((void *) out.arrayZ, (const void *) (org_array.arrayZ + n - k), k * item_size); + hb_memcpy ((void *) (out.arrayZ + k), (const void *) org_array.arrayZ, (n - k) * item_size); + return true; +} + +static bool rotate_set (const hb_set_t& org_set, + int k, + unsigned n, + hb_set_t& out) +{ + if (!n) return false; + k %= n; + if (k < 0) + k = n + k; + + if (k == 0) + { + out.set (org_set); + } + else + { 
+ for (auto v : org_set) + out.add ((v + k) % n); + } + return !out.in_error (); +} + +/* Given two reference coordinates (start and end of contour_points array), + * output interpolated deltas for points in between */ +static bool _iup_segment (const hb_array_t<const contour_point_t> contour_points, + const hb_array_t<const int> x_deltas, + const hb_array_t<const int> y_deltas, + const contour_point_t& p1, const contour_point_t& p2, + int p1_dx, int p2_dx, + int p1_dy, int p2_dy, + hb_vector_t<float>& interp_x_deltas, /* OUT */ + hb_vector_t<float>& interp_y_deltas /* OUT */) +{ + unsigned n = contour_points.length; + if (unlikely (!interp_x_deltas.resize (n, false) || + !interp_y_deltas.resize (n, false))) + return false; + + for (unsigned j = 0; j < 2; j++) + { + float x1, x2, d1, d2; + float *out; + if (j == 0) + { + x1 = p1.x; + x2 = p2.x; + d1 = p1_dx; + d2 = p2_dx; + out = interp_x_deltas.arrayZ; + } + else + { + x1 = p1.y; + x2 = p2.y; + d1 = p1_dy; + d2 = p2_dy; + out = interp_y_deltas.arrayZ; + } + + if (x1 == x2) + { + if (d1 == d2) + { + for (unsigned i = 0; i < n; i++) + out[i] = d1; + } + else + { + for (unsigned i = 0; i < n; i++) + out[i] = 0.f; + } + continue; + } + + if (x1 > x2) + { + hb_swap (x1, x2); + hb_swap (d1, d2); + } + + float scale = (d2 - d1) / (x2 - x1); + for (unsigned i = 0; i < n; i++) + { + float x = j == 0 ? contour_points.arrayZ[i].x : contour_points.arrayZ[i].y; + float d; + if (x <= x1) + d = d1; + else if (x >= x2) + d = d2; + else + d = d1 + (x - x1) * scale; + + out[i] = d; + } + } + return true; +} + +static bool _can_iup_in_between (const hb_array_t<const contour_point_t> contour_points, + const hb_array_t<const int> x_deltas, + const hb_array_t<const int> y_deltas, + const contour_point_t& p1, const contour_point_t& p2, + int p1_dx, int p2_dx, + int p1_dy, int p2_dy, + float tolerance) +{ + hb_vector_t<float> interp_x_deltas, interp_y_deltas; + if (!_iup_segment (contour_points, x_deltas, y_deltas, + p1, p2, p1_dx, p2_dx, p1_dy, p2_dy, + interp_x_deltas, interp_y_deltas)) + return false; + + unsigned num = contour_points.length; + + for (unsigned i = 0; i < num; i++) + { + float dx = x_deltas.arrayZ[i] - interp_x_deltas.arrayZ[i]; + float dy = y_deltas.arrayZ[i] - interp_y_deltas.arrayZ[i]; + + if (sqrtf ((float)dx * dx + (float)dy * dy) > tolerance) + return false; + } + return true; +} + +static bool _iup_contour_optimize_dp (const contour_point_vector_t& contour_points, + const hb_vector_t<int>& x_deltas, + const hb_vector_t<int>& y_deltas, + const hb_set_t& forced_set, + float tolerance, + unsigned lookback, + hb_vector_t<unsigned>& costs, /* OUT */ + hb_vector_t<int>& chain /* OUT */) +{ + unsigned n = contour_points.length; + if (unlikely (!costs.resize (n, false) || + !chain.resize (n, false))) + return false; + + lookback = hb_min (lookback, MAX_LOOKBACK); + + for (unsigned i = 0; i < n; i++) + { + unsigned best_cost = (i == 0 ? 1 : costs.arrayZ[i-1] + 1); + + costs.arrayZ[i] = best_cost; + chain.arrayZ[i] = (i == 0 ? -1 : i - 1); + + if (i > 0 && forced_set.has (i - 1)) + continue; + + int lookback_index = hb_max ((int) i - (int) lookback + 1, -1); + for (int j = i - 2; j >= lookback_index; j--) + { + unsigned cost = j == -1 ? 1 : costs.arrayZ[j] + 1; + /* num points between i and j */ + unsigned num_points = i - j - 1; + unsigned p1 = (j == -1 ? 
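The per-axis rule implemented by _iup_segment reduces to one small function; a standalone restatement with plain floats (no hb types):

#include <utility>

// Delta for a point at coordinate x, interpolated from the reference points (x1, d1) and
// (x2, d2): clamped to the nearer reference outside the span, linear inside it. A degenerate
// span propagates the delta only when both references agree.
static float iup_interpolate (float x, float x1, float d1, float x2, float d2)
{
  if (x1 == x2) return d1 == d2 ? d1 : 0.f;
  if (x1 > x2) { std::swap (x1, x2); std::swap (d1, d2); }
  if (x <= x1) return d1;
  if (x >= x2) return d2;
  return d1 + (x - x1) * (d2 - d1) / (x2 - x1);
}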
n - 1 : j); + if (cost < best_cost && + _can_iup_in_between (contour_points.as_array ().sub_array (j + 1, num_points), + x_deltas.as_array ().sub_array (j + 1, num_points), + y_deltas.as_array ().sub_array (j + 1, num_points), + contour_points.arrayZ[p1], contour_points.arrayZ[i], + x_deltas.arrayZ[p1], x_deltas.arrayZ[i], + y_deltas.arrayZ[p1], y_deltas.arrayZ[i], + tolerance)) + { + best_cost = cost; + costs.arrayZ[i] = best_cost; + chain.arrayZ[i] = j; + } + + if (j > 0 && forced_set.has (j)) + break; + } + } + return true; +} + +static bool _iup_contour_optimize (const hb_array_t<const contour_point_t> contour_points, + const hb_array_t<const int> x_deltas, + const hb_array_t<const int> y_deltas, + hb_array_t<bool> opt_indices, /* OUT */ + float tolerance = 0.f) +{ + unsigned n = contour_points.length; + if (opt_indices.length != n || + x_deltas.length != n || + y_deltas.length != n) + return false; + + bool all_within_tolerance = true; + for (unsigned i = 0; i < n; i++) + { + int dx = x_deltas.arrayZ[i]; + int dy = y_deltas.arrayZ[i]; + if (sqrtf ((float)dx * dx + (float)dy * dy) > tolerance) + { + all_within_tolerance = false; + break; + } + } + + /* If all are within tolerance distance, do nothing, opt_indices is + * initilized to false */ + if (all_within_tolerance) + return true; + + /* If there's exactly one point, return it */ + if (n == 1) + { + opt_indices.arrayZ[0] = true; + return true; + } + + /* If all deltas are exactly the same, return just one (the first one) */ + bool all_deltas_are_equal = true; + for (unsigned i = 1; i < n; i++) + if (x_deltas.arrayZ[i] != x_deltas.arrayZ[0] || + y_deltas.arrayZ[i] != y_deltas.arrayZ[0]) + { + all_deltas_are_equal = false; + break; + } + + if (all_deltas_are_equal) + { + opt_indices.arrayZ[0] = true; + return true; + } + + /* else, solve the general problem using Dynamic Programming */ + hb_set_t forced_set; + _iup_contour_bound_forced_set (contour_points, x_deltas, y_deltas, forced_set, tolerance); + + if (!forced_set.is_empty ()) + { + int k = n - 1 - forced_set.get_max (); + if (k < 0) + return false; + + hb_vector_t<int> rot_x_deltas, rot_y_deltas; + contour_point_vector_t rot_points; + hb_set_t rot_forced_set; + if (!rotate_array (contour_points, k, rot_points) || + !rotate_array (x_deltas, k, rot_x_deltas) || + !rotate_array (y_deltas, k, rot_y_deltas) || + !rotate_set (forced_set, k, n, rot_forced_set)) + return false; + + hb_vector_t<unsigned> costs; + hb_vector_t<int> chain; + + if (!_iup_contour_optimize_dp (rot_points, rot_x_deltas, rot_y_deltas, + rot_forced_set, tolerance, n, + costs, chain)) + return false; + + hb_set_t solution; + int index = n - 1; + while (index != -1) + { + solution.add (index); + index = chain.arrayZ[index]; + } + + if (solution.is_empty () || + forced_set.get_population () > solution.get_population ()) + return false; + + for (unsigned i : solution) + opt_indices.arrayZ[i] = true; + + hb_vector_t<bool> rot_indices; + const hb_array_t<const bool> opt_indices_array (opt_indices.arrayZ, opt_indices.length); + rotate_array (opt_indices_array, -k, rot_indices); + + for (unsigned i = 0; i < n; i++) + opt_indices.arrayZ[i] = rot_indices.arrayZ[i]; + } + else + { + hb_vector_t<int> repeat_x_deltas, repeat_y_deltas; + contour_point_vector_t repeat_points; + + if (unlikely (!repeat_x_deltas.resize (n * 2, false) || + !repeat_y_deltas.resize (n * 2, false) || + !repeat_points.resize (n * 2, false))) + return false; + + unsigned contour_point_size = hb_static_size (contour_point_t); + for (unsigned i = 0; i < 
n; i++) + { + hb_memcpy ((void *) repeat_x_deltas.arrayZ, (const void *) x_deltas.arrayZ, n * sizeof (float)); + hb_memcpy ((void *) (repeat_x_deltas.arrayZ + n), (const void *) x_deltas.arrayZ, n * sizeof (float)); + + hb_memcpy ((void *) repeat_y_deltas.arrayZ, (const void *) y_deltas.arrayZ, n * sizeof (float)); + hb_memcpy ((void *) (repeat_y_deltas.arrayZ + n), (const void *) y_deltas.arrayZ, n * sizeof (float)); + + hb_memcpy ((void *) repeat_points.arrayZ, (const void *) contour_points.arrayZ, n * contour_point_size); + hb_memcpy ((void *) (repeat_points.arrayZ + n), (const void *) contour_points.arrayZ, n * contour_point_size); + } + + hb_vector_t<unsigned> costs; + hb_vector_t<int> chain; + if (!_iup_contour_optimize_dp (repeat_points, repeat_x_deltas, repeat_y_deltas, + forced_set, tolerance, n, + costs, chain)) + return false; + + unsigned best_cost = n + 1; + int len = costs.length; + hb_set_t best_sol; + for (int start = n - 1; start < len; start++) + { + hb_set_t solution; + int i = start; + int lookback = start - (int) n; + while (i > lookback) + { + solution.add (i % n); + i = chain.arrayZ[i]; + } + if (i == lookback) + { + unsigned cost_i = i < 0 ? 0 : costs.arrayZ[i]; + unsigned cost = costs.arrayZ[start] - cost_i; + if (cost <= best_cost) + { + best_sol.set (solution); + best_cost = cost; + } + } + } + + for (unsigned i = 0; i < n; i++) + if (best_sol.has (i)) + opt_indices.arrayZ[i] = true; + } + return true; +} + +bool iup_delta_optimize (const contour_point_vector_t& contour_points, + const hb_vector_t<int>& x_deltas, + const hb_vector_t<int>& y_deltas, + hb_vector_t<bool>& opt_indices, /* OUT */ + float tolerance) +{ + if (!opt_indices.resize (contour_points.length)) + return false; + + hb_vector_t<unsigned> end_points; + unsigned count = contour_points.length; + if (unlikely (!end_points.alloc (count))) + return false; + + for (unsigned i = 0; i < count - 4; i++) + if (contour_points.arrayZ[i].is_end_point) + end_points.push (i); + + /* phantom points */ + for (unsigned i = count - 4; i < count; i++) + end_points.push (i); + + if (end_points.in_error ()) return false; + + unsigned start = 0; + for (unsigned end : end_points) + { + unsigned len = end - start + 1; + if (!_iup_contour_optimize (contour_points.as_array ().sub_array (start, len), + x_deltas.as_array ().sub_array (start, len), + y_deltas.as_array ().sub_array (start, len), + opt_indices.as_array ().sub_array (start, len), + tolerance)) + return false; + start = end + 1; + } + return true; +} diff --git a/thirdparty/harfbuzz/src/hb-subset-instancer-iup.hh b/thirdparty/harfbuzz/src/hb-subset-instancer-iup.hh new file mode 100644 index 0000000000..7eac5935a4 --- /dev/null +++ b/thirdparty/harfbuzz/src/hb-subset-instancer-iup.hh @@ -0,0 +1,37 @@ +/* + * Copyright © 2024 Google, Inc. + * + * This is part of HarfBuzz, a text shaping library. + * + * Permission is hereby granted, without written agreement and without + * license or royalty fees, to use, copy, modify, and distribute this + * software and its documentation for any purpose, provided that the + * above copyright notice and the following two paragraphs appear in + * all copies of this software. + * + * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR + * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES + * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN + * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. 
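iup_delta_optimize() above reports which points must keep explicit deltas; a sketch of how a caller might turn that mask into a referenced-point list (std::vector standing in for hb_vector_t):

#include <vector>

static std::vector<unsigned> referenced_points (const std::vector<bool>& opt_indices)
{
  std::vector<unsigned> out;
  for (unsigned i = 0; i < opt_indices.size (); i++)
    if (opt_indices[i])
      out.push_back (i);   // deltas at these points are serialized; the rest are re-inferred by IUP
  return out;
}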
+ * + * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, + * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS + * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO + * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. + */ + +#ifndef HB_SUBSET_INSTANCER_IUP_HH +#define HB_SUBSET_INSTANCER_IUP_HH + +#include "hb-subset-plan.hh" +/* given contour points and deltas, optimize a set of referenced points within error + * tolerance. Returns optimized referenced point indices */ +HB_INTERNAL bool iup_delta_optimize (const contour_point_vector_t& contour_points, + const hb_vector_t<int>& x_deltas, + const hb_vector_t<int>& y_deltas, + hb_vector_t<bool>& opt_indices, /* OUT */ + float tolerance = 0.f); + +#endif /* HB_SUBSET_INSTANCER_IUP_HH */ diff --git a/thirdparty/harfbuzz/src/hb-subset-instancer-solver.cc b/thirdparty/harfbuzz/src/hb-subset-instancer-solver.cc index 4876bc4379..70783c0a0d 100644 --- a/thirdparty/harfbuzz/src/hb-subset-instancer-solver.cc +++ b/thirdparty/harfbuzz/src/hb-subset-instancer-solver.cc @@ -256,7 +256,10 @@ _solve (Triple tent, Triple axisLimit, bool negative = false) */ float newUpper = peak + (1 - gain) * (upper - peak); assert (axisMax <= newUpper); // Because outGain > gain - if (newUpper <= axisDef + (axisMax - axisDef) * 2) + /* Disabled because ots doesn't like us: + * https://github.com/fonttools/fonttools/issues/3350 */ + + if (false && (newUpper <= axisDef + (axisMax - axisDef) * 2)) { upper = newUpper; if (!negative && axisDef + (axisMax - axisDef) * MAX_F2DOT14 < upper) diff --git a/thirdparty/harfbuzz/src/hb-subset-plan-member-list.hh b/thirdparty/harfbuzz/src/hb-subset-plan-member-list.hh index 71da80e387..74416b92f9 100644 --- a/thirdparty/harfbuzz/src/hb-subset-plan-member-list.hh +++ b/thirdparty/harfbuzz/src/hb-subset-plan-member-list.hh @@ -140,6 +140,15 @@ HB_SUBSET_PLAN_MEMBER (mutable hb_vector_t<unsigned>, bounds_height_vec) //map: new_gid -> contour points vector HB_SUBSET_PLAN_MEMBER (mutable hb_hashmap_t E(<hb_codepoint_t, contour_point_vector_t>), new_gid_contour_points_map) +//new gids set for composite glyphs +HB_SUBSET_PLAN_MEMBER (hb_set_t, composite_new_gids) + +//Old BASE item variation index -> (New varidx, 0) mapping +HB_SUBSET_PLAN_MEMBER (hb_hashmap_t E(<unsigned, hb_pair_t E(<unsigned, int>)>), base_variation_idx_map) + +//BASE table varstore retained varidx mapping +HB_SUBSET_PLAN_MEMBER (hb_vector_t<hb_inc_bimap_t>, base_varstore_inner_maps) + #ifdef HB_EXPERIMENTAL_API // name table overrides map: hb_ot_name_record_ids_t-> name string new value or // None to indicate should remove diff --git a/thirdparty/harfbuzz/src/hb-subset-plan.cc b/thirdparty/harfbuzz/src/hb-subset-plan.cc index 5786223196..068fddaedd 100644 --- a/thirdparty/harfbuzz/src/hb-subset-plan.cc +++ b/thirdparty/harfbuzz/src/hb-subset-plan.cc @@ -32,6 +32,7 @@ #include "hb-ot-cmap-table.hh" #include "hb-ot-glyf-table.hh" +#include "hb-ot-layout-base-table.hh" #include "hb-ot-layout-gdef-table.hh" #include "hb-ot-layout-gpos-table.hh" #include "hb-ot-layout-gsub-table.hh" @@ -431,6 +432,52 @@ _collect_layout_variation_indices (hb_subset_plan_t* plan) gdef.destroy (); gpos.destroy (); } + +#ifndef HB_NO_BASE +/* used by BASE table only, delta is always set to 0 in the output map */ +static inline void +_remap_variation_indices (const hb_set_t& indices, + unsigned subtable_count, + hb_hashmap_t<unsigned, 
hb_pair_t<unsigned, int>>& variation_idx_delta_map /* OUT */) +{ + unsigned new_major = 0, new_minor = 0; + unsigned last_major = (indices.get_min ()) >> 16; + for (unsigned idx : indices) + { + uint16_t major = idx >> 16; + if (major >= subtable_count) break; + if (major != last_major) + { + new_minor = 0; + ++new_major; + } + + unsigned new_idx = (new_major << 16) + new_minor; + variation_idx_delta_map.set (idx, hb_pair_t<unsigned, int> (new_idx, 0)); + ++new_minor; + last_major = major; + } +} + +static inline void +_collect_base_variation_indices (hb_subset_plan_t* plan) +{ + hb_blob_ptr_t<OT::BASE> base = plan->source_table<OT::BASE> (); + if (!base->has_var_store ()) + { + base.destroy (); + return; + } + + hb_set_t varidx_set; + base->collect_variation_indices (plan, varidx_set); + unsigned subtable_count = base->get_var_store ().get_sub_table_count (); + base.destroy (); + + _remap_variation_indices (varidx_set, subtable_count, plan->base_variation_idx_map); + _generate_varstore_inner_maps (varidx_set, subtable_count, plan->base_varstore_inner_maps); +} +#endif #endif static inline void @@ -994,8 +1041,8 @@ _update_instance_metrics_map_from_cff2 (hb_subset_plan_t *plan) OT::cff2::accelerator_t cff2 (plan->source); if (!cff2.is_valid ()) return; - hb_font_t *font = nullptr; - if (unlikely (!plan->check_success (font = _get_hb_font_with_variations (plan)))) + hb_font_t *font = _get_hb_font_with_variations (plan); + if (unlikely (!plan->check_success (font != nullptr))) { hb_font_destroy (font); return; @@ -1073,8 +1120,8 @@ _update_instance_metrics_map_from_cff2 (hb_subset_plan_t *plan) static bool _get_instance_glyphs_contour_points (hb_subset_plan_t *plan) { - /* contour_points vector only needed for updating gvar table (infer delta) - * during partial instancing */ + /* contour_points vector only needed for updating gvar table (infer delta and + * iup delta optimization) during partial instancing */ if (plan->user_axes_location.is_empty () || plan->all_axes_pinned) return true; @@ -1092,10 +1139,17 @@ _get_instance_glyphs_contour_points (hb_subset_plan_t *plan) } hb_codepoint_t old_gid = _.second; - if (unlikely (!glyf.glyph_for_gid (old_gid).get_all_points_without_var (plan->source, all_points))) + auto glyph = glyf.glyph_for_gid (old_gid); + if (unlikely (!glyph.get_all_points_without_var (plan->source, all_points))) return false; if (unlikely (!plan->new_gid_contour_points_map.set (new_gid, all_points))) return false; + +#ifdef HB_EXPERIMENTAL_API + /* composite new gids are only needed by iup delta optimization */ + if ((plan->flags & HB_SUBSET_FLAGS_OPTIMIZE_IUP_DELTAS) && glyph.is_composite ()) + plan->composite_new_gids.add (new_gid); +#endif } return true; } @@ -1205,6 +1259,13 @@ hb_subset_plan_t::hb_subset_plan_t (hb_face_t *face, if (!drop_tables.has (HB_OT_TAG_GDEF)) _remap_used_mark_sets (this, used_mark_sets_map); +#ifndef HB_NO_VAR +#ifndef HB_NO_BASE + if (!drop_tables.has (HB_OT_TAG_BASE)) + _collect_base_variation_indices (this); +#endif +#endif + if (unlikely (in_error ())) return; diff --git a/thirdparty/harfbuzz/src/hb-subset-plan.hh b/thirdparty/harfbuzz/src/hb-subset-plan.hh index 1f19a58c1e..19a9fa6918 100644 --- a/thirdparty/harfbuzz/src/hb-subset-plan.hh +++ b/thirdparty/harfbuzz/src/hb-subset-plan.hh @@ -78,6 +78,13 @@ struct contour_point_t y = x * matrix[1] + y * matrix[3]; x = x_; } + + void add_delta (float delta_x, float delta_y) + { + x += delta_x; + y += delta_y; + } + HB_ALWAYS_INLINE void translate (const contour_point_t &p) { x += p.x; y += 
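_remap_variation_indices above packs the retained (major << 16) + minor indices into consecutive new values; the same compaction restated with standard containers (the real code additionally stores a delta of 0 alongside each new index):

#include <cstdint>
#include <map>
#include <set>

static std::map<uint32_t, uint32_t>
remap_varidxs (const std::set<uint32_t>& retained, unsigned subtable_count)
{
  std::map<uint32_t, uint32_t> result;
  unsigned new_major = 0, new_minor = 0;
  unsigned last_major = retained.empty () ? 0 : (*retained.begin () >> 16);
  for (uint32_t idx : retained)
  {
    unsigned major = idx >> 16;
    if (major >= subtable_count) break;          // iteration is sorted, so the rest are out of range too
    if (major != last_major) { ++new_major; new_minor = 0; }
    result[idx] = (new_major << 16) + new_minor++;
    last_major = major;
  }
  return result;
}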
p.y; } @@ -99,6 +106,22 @@ struct contour_point_vector_t : hb_vector_t<contour_point_t> unsigned count = a.length; hb_memcpy (arrayZ, a.arrayZ, count * sizeof (arrayZ[0])); } + + bool add_deltas (const hb_vector_t<float> deltas_x, + const hb_vector_t<float> deltas_y, + const hb_vector_t<bool> indices) + { + if (indices.length != deltas_x.length || + indices.length != deltas_y.length) + return false; + + for (unsigned i = 0; i < indices.length; i++) + { + if (!indices.arrayZ[i]) continue; + arrayZ[i].add_delta (deltas_x.arrayZ[i], deltas_y.arrayZ[i]); + } + return true; + } }; namespace OT { @@ -147,7 +170,7 @@ struct hb_subset_plan_t bool gsub_insert_catch_all_feature_variation_rec; bool gpos_insert_catch_all_feature_variation_rec; - // whether GDEF VarStore is retained + // whether GDEF ItemVariationStore is retained mutable bool has_gdef_varstore; #define HB_SUBSET_PLAN_MEMBER(Type, Name) Type Name; diff --git a/thirdparty/harfbuzz/src/hb-subset.cc b/thirdparty/harfbuzz/src/hb-subset.cc index 06e77dd8eb..f10ef54dbd 100644 --- a/thirdparty/harfbuzz/src/hb-subset.cc +++ b/thirdparty/harfbuzz/src/hb-subset.cc @@ -48,6 +48,7 @@ #include "hb-ot-cff2-table.hh" #include "hb-ot-vorg-table.hh" #include "hb-ot-name-table.hh" +#include "hb-ot-layout-base-table.hh" #include "hb-ot-layout-gsub-table.hh" #include "hb-ot-layout-gpos-table.hh" #include "hb-ot-var-avar-table.hh" @@ -503,6 +504,7 @@ _subset_table (hb_subset_plan_t *plan, case HB_OT_TAG_CBLC: return _subset<const OT::CBLC> (plan, buf); case HB_OT_TAG_CBDT: return true; /* skip CBDT, handled by CBLC */ case HB_OT_TAG_MATH: return _subset<const OT::MATH> (plan, buf); + case HB_OT_TAG_BASE: return _subset<const OT::BASE> (plan, buf); #ifndef HB_NO_SUBSET_CFF case HB_OT_TAG_CFF1: return _subset<const OT::cff1> (plan, buf); @@ -548,6 +550,7 @@ _subset_table (hb_subset_plan_t *plan, } #endif return _passthrough (plan, tag); + default: if (plan->flags & HB_SUBSET_FLAGS_PASSTHROUGH_UNRECOGNIZED) return _passthrough (plan, tag); diff --git a/thirdparty/harfbuzz/src/hb-subset.h b/thirdparty/harfbuzz/src/hb-subset.h index d79e7f762a..73dcae4660 100644 --- a/thirdparty/harfbuzz/src/hb-subset.h +++ b/thirdparty/harfbuzz/src/hb-subset.h @@ -76,6 +76,8 @@ typedef struct hb_subset_plan_t hb_subset_plan_t; * @HB_SUBSET_FLAGS_IFTB_REQUIREMENTS: If set enforce requirements on the output subset * to allow it to be used with incremental font transfer IFTB patches. Primarily, * this forces all outline data to use long (32 bit) offsets. Since: EXPERIMENTAL + * @HB_SUBSET_FLAGS_OPTIMIZE_IUP_DELTAS: If set perform IUP delta optimization on the + * remaining gvar table's deltas. Since: EXPERIMENTAL * * List of boolean properties that can be configured on the subset input. 
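contour_point_vector_t::add_deltas above only touches points whose flag is set; the same behaviour in a self-contained form (std::vector and Pt in place of the hb containers, with an extra length check for safety):

#include <vector>

struct Pt { float x, y; };

static bool add_deltas_sketch (std::vector<Pt>& points,
                               const std::vector<float>& dx,
                               const std::vector<float>& dy,
                               const std::vector<bool>& apply)
{
  if (apply.size () != dx.size () || apply.size () != dy.size () || apply.size () != points.size ())
    return false;
  for (size_t i = 0; i < apply.size (); i++)
    if (apply[i]) { points[i].x += dx[i]; points[i].y += dy[i]; }
  return true;
}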
* @@ -95,6 +97,7 @@ typedef enum { /*< flags >*/ HB_SUBSET_FLAGS_NO_LAYOUT_CLOSURE = 0x00000200u, #ifdef HB_EXPERIMENTAL_API HB_SUBSET_FLAGS_IFTB_REQUIREMENTS = 0x00000400u, + HB_SUBSET_FLAGS_OPTIMIZE_IUP_DELTAS = 0x00000800u, #endif } hb_subset_flags_t; @@ -171,6 +174,10 @@ hb_subset_input_set_flags (hb_subset_input_t *input, unsigned value); HB_EXTERN hb_bool_t +hb_subset_input_pin_all_axes_to_default (hb_subset_input_t *input, + hb_face_t *face); + +HB_EXTERN hb_bool_t hb_subset_input_pin_axis_to_default (hb_subset_input_t *input, hb_face_t *face, hb_tag_t axis_tag); @@ -183,12 +190,19 @@ hb_subset_input_pin_axis_location (hb_subset_input_t *input, #ifdef HB_EXPERIMENTAL_API HB_EXTERN hb_bool_t +hb_subset_input_get_axis_range (hb_subset_input_t *input, + hb_tag_t axis_tag, + float *axis_min_value, + float *axis_max_value, + float *axis_def_value); + +HB_EXTERN hb_bool_t hb_subset_input_set_axis_range (hb_subset_input_t *input, hb_face_t *face, hb_tag_t axis_tag, float axis_min_value, float axis_max_value, - float *axis_def_value); + float axis_def_value); HB_EXTERN hb_bool_t hb_subset_input_override_name_table (hb_subset_input_t *input, diff --git a/thirdparty/harfbuzz/src/hb-vector.hh b/thirdparty/harfbuzz/src/hb-vector.hh index dfe1b7d1c7..c0cc7063ff 100644 --- a/thirdparty/harfbuzz/src/hb-vector.hh +++ b/thirdparty/harfbuzz/src/hb-vector.hh @@ -78,7 +78,7 @@ struct hb_vector_t if (unlikely (in_error ())) return; copy_array (o); } - hb_vector_t (hb_vector_t &&o) + hb_vector_t (hb_vector_t &&o) noexcept { allocated = o.allocated; length = o.length; @@ -122,7 +122,7 @@ struct hb_vector_t resize (0); } - friend void swap (hb_vector_t& a, hb_vector_t& b) + friend void swap (hb_vector_t& a, hb_vector_t& b) noexcept { hb_swap (a.allocated, b.allocated); hb_swap (a.length, b.length); @@ -139,7 +139,7 @@ struct hb_vector_t return *this; } - hb_vector_t& operator = (hb_vector_t &&o) + hb_vector_t& operator = (hb_vector_t &&o) noexcept { hb_swap (*this, o); return *this; diff --git a/thirdparty/harfbuzz/src/hb-version.h b/thirdparty/harfbuzz/src/hb-version.h index b08dd1f09f..68681874ca 100644 --- a/thirdparty/harfbuzz/src/hb-version.h +++ b/thirdparty/harfbuzz/src/hb-version.h @@ -47,7 +47,7 @@ HB_BEGIN_DECLS * * The minor component of the library version available at compile-time. */ -#define HB_VERSION_MINOR 3 +#define HB_VERSION_MINOR 4 /** * HB_VERSION_MICRO: * @@ -60,7 +60,7 @@ HB_BEGIN_DECLS * * A string literal containing the library version available at compile-time. 
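HB_SUBSET_FLAGS_OPTIMIZE_IUP_DELTAS is an ordinary input flag; enabling it follows the usual get/set pattern (only available when HarfBuzz is built with HB_EXPERIMENTAL_API):

#include <hb-subset.h>

static void enable_iup_optimization (hb_subset_input_t *input)
{
  hb_subset_input_set_flags (input,
                             hb_subset_input_get_flags (input) | HB_SUBSET_FLAGS_OPTIMIZE_IUP_DELTAS);
}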
*/ -#define HB_VERSION_STRING "8.3.0" +#define HB_VERSION_STRING "8.4.0" /** * HB_VERSION_ATLEAST: diff --git a/thirdparty/harfbuzz/src/hb.hh b/thirdparty/harfbuzz/src/hb.hh index 972608d6a3..0ceeb99f50 100644 --- a/thirdparty/harfbuzz/src/hb.hh +++ b/thirdparty/harfbuzz/src/hb.hh @@ -64,6 +64,7 @@ #pragma GCC diagnostic error "-Wbitwise-instead-of-logical" #pragma GCC diagnostic error "-Wcast-align" #pragma GCC diagnostic error "-Wcast-function-type" +#pragma GCC diagnostic error "-Wcast-function-type-strict" #pragma GCC diagnostic error "-Wconstant-conversion" #pragma GCC diagnostic error "-Wcomma" #pragma GCC diagnostic error "-Wdelete-non-virtual-dtor" @@ -177,6 +178,11 @@ #define HB_EXTERN __declspec (dllexport) extern #endif +// https://github.com/harfbuzz/harfbuzz/pull/4619 +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS 1 +#endif + #include "hb.h" #define HB_H_IN #include "hb-ot.h" @@ -212,6 +218,12 @@ #include <winapifamily.h> #endif +#ifndef PRId32 +# define PRId32 "d" +# define PRIu32 "u" +# define PRIx32 "x" +#endif + #define HB_PASTE1(a,b) a##b #define HB_PASTE(a,b) HB_PASTE1(a,b) diff --git a/thirdparty/libktx/lib/dfdutils/createdfd.c b/thirdparty/libktx/lib/dfdutils/createdfd.c index 2a5af00c7e..eb64204189 100644 --- a/thirdparty/libktx/lib/dfdutils/createdfd.c +++ b/thirdparty/libktx/lib/dfdutils/createdfd.c @@ -94,6 +94,10 @@ static uint32_t setChannelFlags(uint32_t channel, enum VkSuffix suffix) channel |= KHR_DF_SAMPLE_DATATYPE_LINEAR; } break; + case s_S10_5: + channel |= + KHR_DF_SAMPLE_DATATYPE_SIGNED; + break; } return channel; } @@ -109,7 +113,6 @@ static void writeSample(uint32_t *DFD, int sampleNo, int channel, float f; } lower, upper; uint32_t *sample = DFD + 1 + KHR_DF_WORD_SAMPLESTART + sampleNo * KHR_DF_WORD_SAMPLEWORDS; - if (channel == 3) channel = KHR_DF_CHANNEL_RGBSDA_ALPHA; if (channel == 3) channel = KHR_DF_CHANNEL_RGBSDA_ALPHA; channel = setChannelFlags(channel, suffix); @@ -159,6 +162,10 @@ static void writeSample(uint32_t *DFD, int sampleNo, int channel, upper.f = 1.0f; lower.f = 0.0f; break; + case s_S10_5: + assert(bits == 16 && "Format with this suffix must be 16 bits per channel."); + upper.i = 32; + lower.i = ~upper.i + 1; // -32 } sample[KHR_DF_SAMPLEWORD_SAMPLELOWER] = lower.i; sample[KHR_DF_SAMPLEWORD_SAMPLEUPPER] = upper.i; @@ -230,8 +237,9 @@ uint32_t *createDFDUnpacked(int bigEndian, int numChannels, int bytes, * @param bits[] An array of length numChannels. * Each entry is the number of bits composing the channel, in * order starting at bit 0 of the packed type. - * @param paddings[] An array of length numChannels. - * Each entry is the number of padding bits after each channel. + * @param shiftBits[] An array of length numChannels. + * Each entry is the number of bits each channel is shifted + * and thus padded with insignificant bits. * @param channels[] An array of length numChannels. * Each entry enumerates the channel type: 0 = red, 1 = green, * 2 = blue, 15 = alpha, in order starting at bit 0 of the @@ -243,9 +251,9 @@ uint32_t *createDFDUnpacked(int bigEndian, int numChannels, int bytes, * @return A data format descriptor in malloc'd data. The caller is responsible * for freeing the descriptor. 
**/ -uint32_t *createDFDPackedPadded(int bigEndian, int numChannels, - int bits[], int paddings[], int channels[], - enum VkSuffix suffix) +uint32_t *createDFDPackedShifted(int bigEndian, int numChannels, + int bits[], int shiftBits[], int channels[], + enum VkSuffix suffix) { uint32_t *DFD = 0; if (numChannels == 6) { @@ -291,17 +299,18 @@ uint32_t *createDFDPackedPadded(int bigEndian, int numChannels, int sampleCounter; for (channelCounter = 0; channelCounter < numChannels; ++channelCounter) { beChannelStart[channelCounter] = totalBits; - totalBits += bits[channelCounter] + paddings[channelCounter]; + totalBits += shiftBits[channelCounter] + bits[channelCounter]; } BEMask = (totalBits - 1) & 0x18; for (channelCounter = 0; channelCounter < numChannels; ++channelCounter) { + bitOffset += shiftBits[channelCounter]; bitChannel[bitOffset ^ BEMask] = channelCounter; if (((bitOffset + bits[channelCounter] - 1) & ~7) != (bitOffset & ~7)) { /* Continuation sample */ bitChannel[((bitOffset + bits[channelCounter] - 1) & ~7) ^ BEMask] = channelCounter; numSamples++; } - bitOffset += bits[channelCounter] + paddings[channelCounter]; + bitOffset += bits[channelCounter]; } DFD = writeHeader(numSamples, totalBits >> 3, suffix, i_COLOR); @@ -343,16 +352,17 @@ uint32_t *createDFDPackedPadded(int bigEndian, int numChannels, int totalBits = 0; int bitOffset = 0; for (sampleCounter = 0; sampleCounter < numChannels; ++sampleCounter) { - totalBits += bits[sampleCounter] + paddings[sampleCounter]; + totalBits += shiftBits[sampleCounter] + bits[sampleCounter]; } /* One sample per channel */ DFD = writeHeader(numChannels, totalBits >> 3, suffix, i_COLOR); for (sampleCounter = 0; sampleCounter < numChannels; ++sampleCounter) { + bitOffset += shiftBits[sampleCounter]; writeSample(DFD, sampleCounter, channels[sampleCounter], bits[sampleCounter], bitOffset, 1, 1, suffix); - bitOffset += bits[sampleCounter] + paddings[sampleCounter]; + bitOffset += bits[sampleCounter]; } } return DFD; @@ -383,12 +393,12 @@ uint32_t *createDFDPacked(int bigEndian, int numChannels, int bits[], int channels[], enum VkSuffix suffix) { assert(numChannels <= 6); - int paddings[] = {0, 0, 0, 0, 0, 0}; - return createDFDPackedPadded(bigEndian, numChannels, bits, paddings, channels, suffix); + int shiftBits[] = {0, 0, 0, 0, 0, 0}; + return createDFDPackedShifted(bigEndian, numChannels, bits, shiftBits, channels, suffix); } uint32_t *createDFD422(int bigEndian, int numSamples, - int bits[], int paddings[], int channels[], + int bits[], int shiftBits[], int channels[], int position_xs[], int position_ys[], enum VkSuffix suffix) { assert(!bigEndian); (void) bigEndian; @@ -396,7 +406,7 @@ uint32_t *createDFD422(int bigEndian, int numSamples, int totalBits = 0; for (int i = 0; i < numSamples; ++i) - totalBits += bits[i] + paddings[i]; + totalBits += shiftBits[i] + bits[i]; assert(totalBits % 8 == 0); uint32_t BDFDSize = sizeof(uint32_t) * (KHR_DF_WORD_SAMPLESTART + numSamples * KHR_DF_WORD_SAMPLEWORDS); @@ -428,6 +438,7 @@ uint32_t *createDFD422(int bigEndian, int numSamples, int bitOffset = 0; for (int i = 0; i < numSamples; ++i) { + bitOffset += shiftBits[i]; KHR_DFDSETSVAL(BDFD, i, BITOFFSET, bitOffset); KHR_DFDSETSVAL(BDFD, i, BITLENGTH, bits[i] - 1); KHR_DFDSETSVAL(BDFD, i, CHANNELID, channels[i]); @@ -438,7 +449,7 @@ uint32_t *createDFD422(int bigEndian, int numSamples, KHR_DFDSETSVAL(BDFD, i, SAMPLEPOSITION3, 0); KHR_DFDSETSVAL(BDFD, i, SAMPLELOWER, 0); KHR_DFDSETSVAL(BDFD, i, SAMPLEUPPER, (1u << bits[i]) - 1u); - bitOffset += bits[i] + 
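An illustrative call to createDFDPackedShifted as declared above: one unsigned-normalized 10-bit red channel placed above 6 insignificant low bits in a 16-bit word (an R10X6-style layout; the values here are an example, not copied from the upstream generator):

#include <stdlib.h>
#include "dfd.h"

static uint32_t *make_r10x6_style_dfd (void)
{
  int bits[1]      = {10};
  int shiftBits[1] = {6};   /* 6 insignificant bits below the channel */
  int channels[1]  = {0};   /* 0 = red */
  /* Caller frees the malloc'd descriptor. */
  return createDFDPackedShifted (0 /* little-endian */, 1, bits, shiftBits, channels, s_UNORM);
}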
paddings[i]; + bitOffset += bits[i]; } return DFD; diff --git a/thirdparty/libktx/lib/dfdutils/dfd.h b/thirdparty/libktx/lib/dfdutils/dfd.h index 7f1fb74a6b..756490fc82 100644 --- a/thirdparty/libktx/lib/dfdutils/dfd.h +++ b/thirdparty/libktx/lib/dfdutils/dfd.h @@ -35,7 +35,8 @@ enum VkSuffix { s_SINT, /*!< Signed integer format. */ s_SFLOAT, /*!< Signed float format. */ s_UFLOAT, /*!< Unsigned float format. */ - s_SRGB /*!< sRGB normalized format. */ + s_SRGB, /*!< sRGB normalized format. */ + s_S10_5 /*!< 2's complement fixed-point; 5 fractional bits. */ }; /** Compression scheme, in Vulkan terms. */ @@ -68,15 +69,16 @@ typedef unsigned int uint32_t; #endif uint32_t* vk2dfd(enum VkFormat format); +enum VkFormat dfd2vk(uint32_t* dfd); /* Create a Data Format Descriptor for an unpacked format. */ uint32_t *createDFDUnpacked(int bigEndian, int numChannels, int bytes, int redBlueSwap, enum VkSuffix suffix); /* Create a Data Format Descriptor for a packed padded format. */ -uint32_t *createDFDPackedPadded(int bigEndian, int numChannels, - int bits[], int paddings[], int channels[], - enum VkSuffix suffix); +uint32_t *createDFDPackedShifted(int bigEndian, int numChannels, + int bits[], int shiftBits[], + int channels[], enum VkSuffix suffix); /* Create a Data Format Descriptor for a packed format. */ uint32_t *createDFDPacked(int bigEndian, int numChannels, @@ -85,7 +87,7 @@ uint32_t *createDFDPacked(int bigEndian, int numChannels, /* Create a Data Format Descriptor for a 4:2:2 format. */ uint32_t *createDFD422(int bigEndian, int numChannels, - int bits[], int paddings[], int channels[], + int bits[], int shiftBits[], int channels[], int position_xs[], int position_ys[], enum VkSuffix suffix); @@ -111,10 +113,11 @@ enum InterpretDFDResult { i_SRGB_FORMAT_BIT = 1u << 2u, /*!< sRGB transfer function. */ i_NORMALIZED_FORMAT_BIT = 1u << 3u, /*!< Normalized (UNORM or SNORM). */ i_SIGNED_FORMAT_BIT = 1u << 4u, /*!< Format is signed. */ - i_FLOAT_FORMAT_BIT = 1u << 5u, /*!< Format is floating point. */ - i_COMPRESSED_FORMAT_BIT = 1u << 6u, /*!< Format is block compressed (422). */ - i_YUVSDA_FORMAT_BIT = 1u << 7u, /*!< Color model is YUVSDA. */ - i_UNSUPPORTED_ERROR_BIT = 1u << 8u, /*!< Format not successfully interpreted. */ + i_FIXED_FORMAT_BIT = 1u << 5u, /*!< Format is a fixed-point representation. */ + i_FLOAT_FORMAT_BIT = 1u << 6u, /*!< Format is floating point. */ + i_COMPRESSED_FORMAT_BIT = 1u << 7u, /*!< Format is block compressed (422). */ + i_YUVSDA_FORMAT_BIT = 1u << 8u, /*!< Color model is YUVSDA. */ + i_UNSUPPORTED_ERROR_BIT = 1u << 9u, /*!< Format not successfully interpreted. */ /** "NONTRIVIAL_ENDIANNESS" means not big-endian, not little-endian * (a channel has bits that are not consecutive in either order). **/ i_UNSUPPORTED_NONTRIVIAL_ENDIANNESS = i_UNSUPPORTED_ERROR_BIT, @@ -198,9 +201,12 @@ getDFDComponentInfoUnpacked(const uint32_t* DFD, uint32_t* numComponents, /* Return the number of components described by a DFD. */ uint32_t getDFDNumComponents(const uint32_t* DFD); -/* Recreate and return the value of bytesPlane0 as it should be for the data +/* Reconstruct and return the value of bytesPlane0 as it should be for the data * post-inflation from variable-rate compression. */ +uint32_t +reconstructDFDBytesPlane0FromSamples(const uint32_t* DFD); +/* Deprecated. For backward compatibility. 
*/ void recreateBytesPlane0FromSampleInfo(const uint32_t* DFD, uint32_t* bytesPlane0); diff --git a/thirdparty/libktx/lib/dfdutils/dfd2vk.inl b/thirdparty/libktx/lib/dfdutils/dfd2vk.inl index 8b4de41597..a616566c74 100644 --- a/thirdparty/libktx/lib/dfdutils/dfd2vk.inl +++ b/thirdparty/libktx/lib/dfdutils/dfd2vk.inl @@ -4,12 +4,20 @@ /***************************** Do not edit. ***************************** Automatically generated by makedfd2vk.pl. *************************************************************************/ -if (KHR_DFDVAL(dfd + 1, MODEL) == KHR_DF_MODEL_RGBSDA) { +if (KHR_DFDVAL(dfd + 1, MODEL) == KHR_DF_MODEL_RGBSDA || KHR_DFDVAL(dfd + 1, MODEL) == KHR_DF_MODEL_YUVSDA) { enum InterpretDFDResult r; InterpretedDFDChannel R = {0,0}; InterpretedDFDChannel G = {0,0}; InterpretedDFDChannel B = {0,0}; InterpretedDFDChannel A = {0,0}; + /* interpretDFD channel overloadings for YUVSDA formats. These are + * different from the mapping used by Vulkan. */ + #define Y1 R + #define Y2 A + #define CB G + #define U G + #define CR B + #define V B uint32_t wordBytes; /* Special case exponent format */ @@ -44,8 +52,17 @@ if (KHR_DFDVAL(dfd + 1, MODEL) == KHR_DF_MODEL_RGBSDA) { else if (wordBytes == 2) { /* PACK16 */ if (A.size == 4) { if (R.offset == 12) return VK_FORMAT_R4G4B4A4_UNORM_PACK16; - else return VK_FORMAT_B4G4R4A4_UNORM_PACK16; - } else if (A.size == 0) { /* Three channels */ + else if (B.offset == 12) return VK_FORMAT_B4G4R4A4_UNORM_PACK16; + else if (A.offset == 12) { + if (R.offset == 8) return VK_FORMAT_A4R4G4B4_UNORM_PACK16; + else return VK_FORMAT_A4B4G4R4_UNORM_PACK16; + } + } else if (G.size == 0 && B.size == 0 && A.size == 0) { /* One channel */ + if (R.size == 10) + return VK_FORMAT_R10X6_UNORM_PACK16; + else if (R.size ==12) + return VK_FORMAT_R12X4_UNORM_PACK16; + } else if (A.size == 0) { /* Three channels */ if (B.offset == 0) return VK_FORMAT_R5G6B5_UNORM_PACK16; else return VK_FORMAT_B5G6R5_UNORM_PACK16; } else { /* Four channels, one-bit alpha */ @@ -54,7 +71,7 @@ if (KHR_DFDVAL(dfd + 1, MODEL) == KHR_DF_MODEL_RGBSDA) { if (B.offset == 10) return VK_FORMAT_A1B5G5R5_UNORM_PACK16_KHR; return VK_FORMAT_B5G5R5A1_UNORM_PACK16; } - } else if (wordBytes == 4) { /* PACK32 */ + } else if (wordBytes == 4) { /* PACK32 or 2PACK16 */ if (A.size == 8) { if ((r & i_SRGB_FORMAT_BIT)) return VK_FORMAT_A8B8G8R8_SRGB_PACK32; if ((r & i_NORMALIZED_FORMAT_BIT) && !(r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_A8B8G8R8_UNORM_PACK32; @@ -71,133 +88,171 @@ if (KHR_DFDVAL(dfd + 1, MODEL) == KHR_DF_MODEL_RGBSDA) { if ((r & i_NORMALIZED_FORMAT_BIT) && (r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_A2B10G10R10_SNORM_PACK32; if (!(r & i_NORMALIZED_FORMAT_BIT) && !(r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_A2B10G10R10_UINT_PACK32; if (!(r & i_NORMALIZED_FORMAT_BIT) && (r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_A2B10G10R10_SINT_PACK32; - } else if (R.size == 11) return VK_FORMAT_B10G11R11_UFLOAT_PACK32; + } else if (R.size == 11) { + return VK_FORMAT_B10G11R11_UFLOAT_PACK32; + } else if (R.size == 10 && G.size == 10 && B.size == 0) { + return VK_FORMAT_R10X6G10X6_UNORM_2PACK16; + } else if (R.size == 12 && G.size == 12 && B.size == 0) { + return VK_FORMAT_R12X4G12X4_UNORM_2PACK16; + } + } else if (wordBytes == 8) { /* 4PACK16 */ + if (r & i_YUVSDA_FORMAT_BIT) { + /* In Vulkan G = Y, R = Cr, B = Cb. 
*/ + if (Y1.size == 10 && Y1.offset == 6 && Y2.size == 10 && Y2.offset == 38) + return VK_FORMAT_G10X6B10X6G10X6R10X6_422_UNORM_4PACK16; + if (Y1.size == 10 && Y1.offset == 22 && Y2.size == 10 && Y2.offset == 54) + return VK_FORMAT_B10X6G10X6R10X6G10X6_422_UNORM_4PACK16; + if (Y1.size == 12 && Y1.offset == 4 && Y2.size == 12 && Y2.offset == 36) + return VK_FORMAT_G12X4B12X4G12X4R12X4_422_UNORM_4PACK16; + if (Y1.size == 12 && Y1.offset == 20 && Y2.size == 12 && Y2.offset == 52) + return VK_FORMAT_B12X4G12X4R12X4G12X4_422_UNORM_4PACK16; + } else { + if (R.size == 10) + return VK_FORMAT_R10X6G10X6B10X6A10X6_UNORM_4PACK16; + else if (R.size == 12) + return VK_FORMAT_R12X4G12X4B12X4A12X4_UNORM_4PACK16; + } } } else { /* Not a packed format */ - if (wordBytes == 1) { - if (A.size > 8 && R.size == 0 && G.size == 0 && B.size == 0 && (r & i_NORMALIZED_FORMAT_BIT) && !(r & i_SIGNED_FORMAT_BIT)) { - return VK_FORMAT_A8_UNORM_KHR; - } - if (A.size > 0) { /* 4 channels */ - if (R.offset == 0) { /* RGBA */ - if ((r & i_SRGB_FORMAT_BIT)) return VK_FORMAT_R8G8B8A8_SRGB; - if ((r & i_NORMALIZED_FORMAT_BIT) && !(r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R8G8B8A8_UNORM; - if ((r & i_NORMALIZED_FORMAT_BIT) && (r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R8G8B8A8_SNORM; - if (!(r & i_NORMALIZED_FORMAT_BIT) && !(r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R8G8B8A8_UINT; - if (!(r & i_NORMALIZED_FORMAT_BIT) && (r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R8G8B8A8_SINT; - } else { /* BGRA */ - if ((r & i_SRGB_FORMAT_BIT)) return VK_FORMAT_B8G8R8A8_SRGB; - if ((r & i_NORMALIZED_FORMAT_BIT) && !(r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_B8G8R8A8_UNORM; - if ((r & i_NORMALIZED_FORMAT_BIT) && (r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_B8G8R8A8_SNORM; - if (!(r & i_NORMALIZED_FORMAT_BIT) && !(r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_B8G8R8A8_UINT; - if (!(r & i_NORMALIZED_FORMAT_BIT) && (r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_B8G8R8A8_SINT; - } - } else if (B.size > 0) { /* 3 channels */ - if (R.offset == 0) { /* RGB */ - if ((r & i_SRGB_FORMAT_BIT)) return VK_FORMAT_R8G8B8_SRGB; - if ((r & i_NORMALIZED_FORMAT_BIT) && !(r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R8G8B8_UNORM; - if ((r & i_NORMALIZED_FORMAT_BIT) && (r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R8G8B8_SNORM; - if (!(r & i_NORMALIZED_FORMAT_BIT) && !(r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R8G8B8_UINT; - if (!(r & i_NORMALIZED_FORMAT_BIT) && (r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R8G8B8_SINT; - } else { /* BGR */ - if ((r & i_SRGB_FORMAT_BIT)) return VK_FORMAT_B8G8R8_SRGB; - if ((r & i_NORMALIZED_FORMAT_BIT) && !(r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_B8G8R8_UNORM; - if ((r & i_NORMALIZED_FORMAT_BIT) && (r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_B8G8R8_SNORM; - if (!(r & i_NORMALIZED_FORMAT_BIT) && !(r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_B8G8R8_UINT; - if (!(r & i_NORMALIZED_FORMAT_BIT) && (r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_B8G8R8_SINT; + if (r & i_YUVSDA_FORMAT_BIT) { + /* In Vulkan G = Y, R = Cr, B = Cb. 
*/ + if (Y1.size == 1 && Y1.offset == 0 && Y2.size == 1 && Y2.offset == 2) + return VK_FORMAT_G8B8G8R8_422_UNORM; + else if (Y1.size == 1 && Y1.offset == 1 && Y2.size == 1 && Y2.offset == 3) + return VK_FORMAT_B8G8R8G8_422_UNORM; + else if (Y1.size == 2 && Y1.offset == 0 && Y2.size == 2 && Y2.offset == 4) + return VK_FORMAT_G16B16G16R16_422_UNORM; + else if (Y1.size == 2 && Y1.offset == 2 && Y2.size == 2 && Y2.offset == 6) + return VK_FORMAT_B16G16R16G16_422_UNORM; + else + return VK_FORMAT_UNDEFINED; // Until support added. + } else { /* Not YUV */ + if (wordBytes == 1) { + if (A.size == 1 && R.size == 0 && G.size == 0 && B.size == 0 && (r & i_NORMALIZED_FORMAT_BIT) && !(r & i_SIGNED_FORMAT_BIT)) { + return VK_FORMAT_A8_UNORM_KHR; } - } else if (G.size > 0) { /* 2 channels */ + if (A.size > 0) { /* 4 channels */ + if (R.offset == 0) { /* RGBA */ + if ((r & i_SRGB_FORMAT_BIT)) return VK_FORMAT_R8G8B8A8_SRGB; + if ((r & i_NORMALIZED_FORMAT_BIT) && !(r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R8G8B8A8_UNORM; + if ((r & i_NORMALIZED_FORMAT_BIT) && (r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R8G8B8A8_SNORM; + if (!(r & i_NORMALIZED_FORMAT_BIT) && !(r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R8G8B8A8_UINT; + if (!(r & i_NORMALIZED_FORMAT_BIT) && (r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R8G8B8A8_SINT; + } else { /* BGRA */ + if ((r & i_SRGB_FORMAT_BIT)) return VK_FORMAT_B8G8R8A8_SRGB; + if ((r & i_NORMALIZED_FORMAT_BIT) && !(r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_B8G8R8A8_UNORM; + if ((r & i_NORMALIZED_FORMAT_BIT) && (r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_B8G8R8A8_SNORM; + if (!(r & i_NORMALIZED_FORMAT_BIT) && !(r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_B8G8R8A8_UINT; + if (!(r & i_NORMALIZED_FORMAT_BIT) && (r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_B8G8R8A8_SINT; + } + } else if (B.size > 0) { /* 3 channels */ + if (R.offset == 0) { /* RGB */ + if ((r & i_SRGB_FORMAT_BIT)) return VK_FORMAT_R8G8B8_SRGB; + if ((r & i_NORMALIZED_FORMAT_BIT) && !(r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R8G8B8_UNORM; + if ((r & i_NORMALIZED_FORMAT_BIT) && (r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R8G8B8_SNORM; + if (!(r & i_NORMALIZED_FORMAT_BIT) && !(r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R8G8B8_UINT; + if (!(r & i_NORMALIZED_FORMAT_BIT) && (r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R8G8B8_SINT; + } else { /* BGR */ + if ((r & i_SRGB_FORMAT_BIT)) return VK_FORMAT_B8G8R8_SRGB; + if ((r & i_NORMALIZED_FORMAT_BIT) && !(r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_B8G8R8_UNORM; + if ((r & i_NORMALIZED_FORMAT_BIT) && (r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_B8G8R8_SNORM; + if (!(r & i_NORMALIZED_FORMAT_BIT) && !(r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_B8G8R8_UINT; + if (!(r & i_NORMALIZED_FORMAT_BIT) && (r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_B8G8R8_SINT; + } + } else if (G.size > 0) { /* 2 channels */ if ((r & i_SRGB_FORMAT_BIT)) return VK_FORMAT_R8G8_SRGB; if ((r & i_NORMALIZED_FORMAT_BIT) && !(r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R8G8_UNORM; if ((r & i_NORMALIZED_FORMAT_BIT) && (r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R8G8_SNORM; if (!(r & i_NORMALIZED_FORMAT_BIT) && !(r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R8G8_UINT; if (!(r & i_NORMALIZED_FORMAT_BIT) && (r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R8G8_SINT; - } else { /* 1 channel */ + } else { /* 1 channel */ if ((r & i_SRGB_FORMAT_BIT)) return VK_FORMAT_R8_SRGB; if ((r & i_NORMALIZED_FORMAT_BIT) && !(r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R8_UNORM; if ((r & i_NORMALIZED_FORMAT_BIT) && (r & i_SIGNED_FORMAT_BIT)) 
return VK_FORMAT_R8_SNORM; if (!(r & i_NORMALIZED_FORMAT_BIT) && !(r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R8_UINT; if (!(r & i_NORMALIZED_FORMAT_BIT) && (r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R8_SINT; - } - } else if (wordBytes == 2) { - if (A.size > 0) { /* 4 channels */ - if (R.offset == 0) { /* RGBA */ - if ((r & i_FLOAT_FORMAT_BIT)) return VK_FORMAT_R16G16B16A16_SFLOAT; - if ((r & i_NORMALIZED_FORMAT_BIT) && !(r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R16G16B16A16_UNORM; - if ((r & i_NORMALIZED_FORMAT_BIT) && (r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R16G16B16A16_SNORM; - if (!(r & i_NORMALIZED_FORMAT_BIT) && !(r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R16G16B16A16_UINT; - if (!(r & i_NORMALIZED_FORMAT_BIT) && (r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R16G16B16A16_SINT; - } else { /* BGRA */ } - } else if (B.size > 0) { /* 3 channels */ - if (R.offset == 0) { /* RGB */ - if ((r & i_FLOAT_FORMAT_BIT)) return VK_FORMAT_R16G16B16_SFLOAT; - if ((r & i_NORMALIZED_FORMAT_BIT) && !(r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R16G16B16_UNORM; - if ((r & i_NORMALIZED_FORMAT_BIT) && (r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R16G16B16_SNORM; - if (!(r & i_NORMALIZED_FORMAT_BIT) && !(r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R16G16B16_UINT; - if (!(r & i_NORMALIZED_FORMAT_BIT) && (r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R16G16B16_SINT; - } else { /* BGR */ - } - } else if (G.size > 0) { /* 2 channels */ + } else if (wordBytes == 2) { + if ((r & i_FIXED_FORMAT_BIT) && R.size == 2 && G.size == 2) return VK_FORMAT_R16G16_S10_5_NV; + if (A.size > 0) { /* 4 channels */ + if (R.offset == 0) { /* RGBA */ + if ((r & i_FLOAT_FORMAT_BIT)) return VK_FORMAT_R16G16B16A16_SFLOAT; + if ((r & i_NORMALIZED_FORMAT_BIT) && !(r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R16G16B16A16_UNORM; + if ((r & i_NORMALIZED_FORMAT_BIT) && (r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R16G16B16A16_SNORM; + if (!(r & i_NORMALIZED_FORMAT_BIT) && !(r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R16G16B16A16_UINT; + if (!(r & i_NORMALIZED_FORMAT_BIT) && (r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R16G16B16A16_SINT; + } else { /* BGRA */ + } + } else if (B.size > 0) { /* 3 channels */ + if (R.offset == 0) { /* RGB */ + if ((r & i_FLOAT_FORMAT_BIT)) return VK_FORMAT_R16G16B16_SFLOAT; + if ((r & i_NORMALIZED_FORMAT_BIT) && !(r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R16G16B16_UNORM; + if ((r & i_NORMALIZED_FORMAT_BIT) && (r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R16G16B16_SNORM; + if (!(r & i_NORMALIZED_FORMAT_BIT) && !(r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R16G16B16_UINT; + if (!(r & i_NORMALIZED_FORMAT_BIT) && (r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R16G16B16_SINT; + } else { /* BGR */ + } + } else if (G.size > 0) { /* 2 channels */ if ((r & i_FLOAT_FORMAT_BIT)) return VK_FORMAT_R16G16_SFLOAT; if ((r & i_NORMALIZED_FORMAT_BIT) && !(r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R16G16_UNORM; if ((r & i_NORMALIZED_FORMAT_BIT) && (r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R16G16_SNORM; if (!(r & i_NORMALIZED_FORMAT_BIT) && !(r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R16G16_UINT; if (!(r & i_NORMALIZED_FORMAT_BIT) && (r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R16G16_SINT; - } else { /* 1 channel */ + } else { /* 1 channel */ if ((r & i_FLOAT_FORMAT_BIT)) return VK_FORMAT_R16_SFLOAT; if ((r & i_NORMALIZED_FORMAT_BIT) && !(r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R16_UNORM; if ((r & i_NORMALIZED_FORMAT_BIT) && (r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R16_SNORM; if (!(r & i_NORMALIZED_FORMAT_BIT) && !(r & 
i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R16_UINT; if (!(r & i_NORMALIZED_FORMAT_BIT) && (r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R16_SINT; - } - } else if (wordBytes == 4) { - if (A.size > 0) { /* 4 channels */ - if (R.offset == 0) { /* RGBA */ - if ((r & i_FLOAT_FORMAT_BIT)) return VK_FORMAT_R32G32B32A32_SFLOAT; - if (!(r & i_NORMALIZED_FORMAT_BIT) && !(r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R32G32B32A32_UINT; - if (!(r & i_NORMALIZED_FORMAT_BIT) && (r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R32G32B32A32_SINT; - } else { /* BGRA */ - } - } else if (B.size > 0) { /* 3 channels */ - if (R.offset == 0) { /* RGB */ - if ((r & i_FLOAT_FORMAT_BIT)) return VK_FORMAT_R32G32B32_SFLOAT; - if (!(r & i_NORMALIZED_FORMAT_BIT) && !(r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R32G32B32_UINT; - if (!(r & i_NORMALIZED_FORMAT_BIT) && (r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R32G32B32_SINT; - } else { /* BGR */ } - } else if (G.size > 0) { /* 2 channels */ + } else if (wordBytes == 4) { + if (A.size > 0) { /* 4 channels */ + if (R.offset == 0) { /* RGBA */ + if ((r & i_FLOAT_FORMAT_BIT)) return VK_FORMAT_R32G32B32A32_SFLOAT; + if (!(r & i_NORMALIZED_FORMAT_BIT) && !(r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R32G32B32A32_UINT; + if (!(r & i_NORMALIZED_FORMAT_BIT) && (r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R32G32B32A32_SINT; + } else { /* BGRA */ + } + } else if (B.size > 0) { /* 3 channels */ + if (R.offset == 0) { /* RGB */ + if ((r & i_FLOAT_FORMAT_BIT)) return VK_FORMAT_R32G32B32_SFLOAT; + if (!(r & i_NORMALIZED_FORMAT_BIT) && !(r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R32G32B32_UINT; + if (!(r & i_NORMALIZED_FORMAT_BIT) && (r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R32G32B32_SINT; + } else { /* BGR */ + } + } else if (G.size > 0) { /* 2 channels */ if ((r & i_FLOAT_FORMAT_BIT)) return VK_FORMAT_R32G32_SFLOAT; if (!(r & i_NORMALIZED_FORMAT_BIT) && !(r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R32G32_UINT; if (!(r & i_NORMALIZED_FORMAT_BIT) && (r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R32G32_SINT; - } else { /* 1 channel */ + } else { /* 1 channel */ if ((r & i_FLOAT_FORMAT_BIT)) return VK_FORMAT_R32_SFLOAT; if (!(r & i_NORMALIZED_FORMAT_BIT) && !(r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R32_UINT; if (!(r & i_NORMALIZED_FORMAT_BIT) && (r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R32_SINT; - } - } else if (wordBytes == 8) { - if (A.size > 0) { /* 4 channels */ - if (R.offset == 0) { /* RGBA */ - if ((r & i_FLOAT_FORMAT_BIT)) return VK_FORMAT_R64G64B64A64_SFLOAT; - if (!(r & i_NORMALIZED_FORMAT_BIT) && !(r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R64G64B64A64_UINT; - if (!(r & i_NORMALIZED_FORMAT_BIT) && (r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R64G64B64A64_SINT; - } else { /* BGRA */ - } - } else if (B.size > 0) { /* 3 channels */ - if (R.offset == 0) { /* RGB */ - if ((r & i_FLOAT_FORMAT_BIT)) return VK_FORMAT_R64G64B64_SFLOAT; - if (!(r & i_NORMALIZED_FORMAT_BIT) && !(r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R64G64B64_UINT; - if (!(r & i_NORMALIZED_FORMAT_BIT) && (r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R64G64B64_SINT; - } else { /* BGR */ } - } else if (G.size > 0) { /* 2 channels */ + } else if (wordBytes == 8) { + if (A.size > 0) { /* 4 channels */ + if (R.offset == 0) { /* RGBA */ + if ((r & i_FLOAT_FORMAT_BIT)) return VK_FORMAT_R64G64B64A64_SFLOAT; + if (!(r & i_NORMALIZED_FORMAT_BIT) && !(r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R64G64B64A64_UINT; + if (!(r & i_NORMALIZED_FORMAT_BIT) && (r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R64G64B64A64_SINT; + } else { /* BGRA */ 
+ } + } else if (B.size > 0) { /* 3 channels */ + if (R.offset == 0) { /* RGB */ + if ((r & i_FLOAT_FORMAT_BIT)) return VK_FORMAT_R64G64B64_SFLOAT; + if (!(r & i_NORMALIZED_FORMAT_BIT) && !(r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R64G64B64_UINT; + if (!(r & i_NORMALIZED_FORMAT_BIT) && (r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R64G64B64_SINT; + } else { /* BGR */ + } + } else if (G.size > 0) { /* 2 channels */ if ((r & i_FLOAT_FORMAT_BIT)) return VK_FORMAT_R64G64_SFLOAT; if (!(r & i_NORMALIZED_FORMAT_BIT) && !(r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R64G64_UINT; if (!(r & i_NORMALIZED_FORMAT_BIT) && (r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R64G64_SINT; - } else { /* 1 channel */ + } else { /* 1 channel */ if ((r & i_FLOAT_FORMAT_BIT)) return VK_FORMAT_R64_SFLOAT; if (!(r & i_NORMALIZED_FORMAT_BIT) && !(r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R64_UINT; if (!(r & i_NORMALIZED_FORMAT_BIT) && (r & i_SIGNED_FORMAT_BIT)) return VK_FORMAT_R64_SINT; + } } } } diff --git a/thirdparty/libktx/lib/dfdutils/interpretdfd.c b/thirdparty/libktx/lib/dfdutils/interpretdfd.c index 6d25f33a1b..b58a79fb43 100644 --- a/thirdparty/libktx/lib/dfdutils/interpretdfd.c +++ b/thirdparty/libktx/lib/dfdutils/interpretdfd.c @@ -29,18 +29,37 @@ static uint32_t bit_ceil(uint32_t x) { * @~English * @brief Interpret a Data Format Descriptor for a simple format. * - * @param DFD Pointer to a Data Format Descriptor to interpret, - described as 32-bit words in native endianness. - Note that this is the whole descriptor, not just - the basic descriptor block. - * @param R Information about the decoded red channel or the depth channel, if any. - * @param G Information about the decoded green channel or the stencil channel, if any. - * @param B Information about the decoded blue channel, if any. - * @param A Information about the decoded alpha channel, if any. - * @param wordBytes Byte size of the channels (unpacked) or total size (packed). + * Handles "simple" cases that can be translated to things a GPU can access. + * For simplicity, it ignores the compressed formats, which are generally a + * single sample (and I believe are all defined to be little-endian in their + * in-memory layout, even if some documentation confuses this). Focuses on + * the layout and ignores sRGB except for reporting if that is the transfer + * function by way of a bit in the returned value. + * + * @param[in] DFD Pointer to a Data Format Descriptor to interpret, + * described as 32-bit words in native endianness. + * Note that this is the whole descriptor, not just + * the basic descriptor block. + * @param R[in,out] Pointer to struct to receive information about the decoded + * red channel, the Y channel, if YUV, or the depth channel, + * if any. + * @param G[in,out] Pointer to struct to receive information about the decoded + * green channel, the U (Cb) channel, if YUV, or the stencil + * channel, if any. + * @param B[in,out] Pointer to struct to receive information about the decoded + * blue channel, if any or the V (Cr) channel, if YUV. + * @param A[in,out] Pointer to struct to receive information about the decoded + * alpha channel, if any or the second Y channel, if YUV and + * any. + * @param wordBytes[in,out] Pointer to a uint32_t to receive the byte size of + * the channels (unpacked) or total size (packed). * * @return An enumerant describing the decoded value, * or an error code in case of failure. 
+ * + * The mapping of YUV channels to the parameter names used here is based on + * the channel ids in @c khr_df.h and is different from the convention used + * in format names in the Vulkan specification where G == Y, R = Cr and B = Cb. **/ enum InterpretDFDResult interpretDFD(const uint32_t *DFD, InterpretedDFDChannel *R, @@ -49,14 +68,6 @@ enum InterpretDFDResult interpretDFD(const uint32_t *DFD, InterpretedDFDChannel *A, uint32_t *wordBytes) { - /* We specifically handle "simple" cases that can be translated */ - /* to things a GPU can access. For simplicity, we also ignore */ - /* the compressed formats, which are generally a single sample */ - /* (and I believe are all defined to be little-endian in their */ - /* in-memory layout, even if some documentation confuses this). */ - /* We also just worry about layout and ignore sRGB, since that's */ - /* trivial to extract anyway. */ - /* DFD points to the whole descriptor, not the basic descriptor block. */ /* Make everything else relative to the basic descriptor block. */ const uint32_t *BDFDB = DFD+1; @@ -78,7 +89,7 @@ enum InterpretDFDResult interpretDFD(const uint32_t *DFD, /* First rule out the multiple planes case (trivially) */ /* - that is, we check that only bytesPlane0 is non-zero. */ - /* This means we don't handle YUV even if the API could. */ + /* This means we don't handle multi-plane YUV, even if the API could. */ /* (We rely on KHR_DF_WORD_BYTESPLANE0..3 being the same and */ /* KHR_DF_WORD_BYTESPLANE4..7 being the same as a short cut.) */ if ((BDFDB[KHR_DF_WORD_BYTESPLANE0] & ~KHR_DF_MASK_BYTESPLANE0) @@ -104,6 +115,8 @@ enum InterpretDFDResult interpretDFD(const uint32_t *DFD, bool hasSigned = false; bool hasFloat = false; bool hasNormalized = false; + bool hasFixed = false; + khr_df_model_e model = KHR_DFDVAL(BDFDB, MODEL); // Note: We're ignoring 9995, which is weird and worth special-casing // rather than trying to generalise to all float formats. @@ -116,13 +129,23 @@ enum InterpretDFDResult interpretDFD(const uint32_t *DFD, // (i.e. set to the maximum bit value, and check min value) on // the assumption that we're looking at a format which *came* from // an API we can support. - const bool isNormalized = isFloat ? - *(float*) (void*) &BDFDB[KHR_DF_WORD_SAMPLESTART + + bool isFixed; + bool isNormalized; + if (isFloat) { + isNormalized = *(float*) (void*) &BDFDB[KHR_DF_WORD_SAMPLESTART + KHR_DF_WORD_SAMPLEWORDS * i + - KHR_DF_SAMPLEWORD_SAMPLEUPPER] != 1.0f : - KHR_DFDSVAL(BDFDB, i, SAMPLEUPPER) != 1U; - + KHR_DF_SAMPLEWORD_SAMPLEUPPER] != 1.0f; + isFixed = false; + } else { + uint32_t sampleUpper = KHR_DFDSVAL(BDFDB, i, SAMPLEUPPER); + uint32_t maxVal = 1U << KHR_DFDSVAL(BDFDB, i, BITLENGTH); + if (!isSigned) maxVal <<= 1; + maxVal--; + isFixed = 1U < sampleUpper && sampleUpper < maxVal; + isNormalized = !isFixed && sampleUpper != 1U; + } hasSigned |= isSigned; + hasFixed |= isFixed; hasFloat |= isFloat; // By our definition the normalizedness of a single bit channel (like in RGBA 5:5:5:1) // is ambiguous. Ignore these during normalized checks. @@ -132,9 +155,10 @@ enum InterpretDFDResult interpretDFD(const uint32_t *DFD, result |= hasSigned ? i_SIGNED_FORMAT_BIT : 0; result |= hasFloat ? i_FLOAT_FORMAT_BIT : 0; result |= hasNormalized ? i_NORMALIZED_FORMAT_BIT : 0; + result |= hasFixed ? 
i_FIXED_FORMAT_BIT : 0; // Checks based on color model - if (KHR_DFDVAL(BDFDB, MODEL) == KHR_DF_MODEL_YUVSDA) { + if (model == KHR_DF_MODEL_YUVSDA) { result |= i_NORMALIZED_FORMAT_BIT; result |= i_COMPRESSED_FORMAT_BIT; result |= i_YUVSDA_FORMAT_BIT; @@ -165,7 +189,7 @@ enum InterpretDFDResult interpretDFD(const uint32_t *DFD, *wordBytes = ((result & i_PACKED_FORMAT_BIT) ? 4 : 1) * bit_ceil(largestSampleSize) / 8; } else if (KHR_DFDVAL(BDFDB, MODEL) == KHR_DF_MODEL_RGBSDA) { - /* We only pay attention to sRGB. */ + /* Check if transfer is sRGB. */ if (KHR_DFDVAL(BDFDB, TRANSFER) == KHR_DF_TRANSFER_SRGB) result |= i_SRGB_FORMAT_BIT; /* We only support samples at coordinate 0,0,0,0. */ @@ -175,7 +199,11 @@ enum InterpretDFDResult interpretDFD(const uint32_t *DFD, if (KHR_DFDSVAL(BDFDB, sampleCounter, SAMPLEPOSITION_ALL)) return i_UNSUPPORTED_MULTIPLE_SAMPLE_LOCATIONS; } + } + if (model == KHR_DF_MODEL_RGBSDA || model == KHR_DF_MODEL_YUVSDA) { + /* The values of the DEPTH and STENCIL tokens are the same for */ + /* RGBSDA and YUVSDA. */ /* For Depth/Stencil formats mixed channels are allowed */ for (uint32_t sampleCounter = 0; sampleCounter < numSamples; ++sampleCounter) { switch (KHR_DFDSVAL(BDFDB, sampleCounter, CHANNELID)) { @@ -206,6 +234,9 @@ enum InterpretDFDResult interpretDFD(const uint32_t *DFD, } } + /* This all relies on the channel id values for RGB being equal to */ + /* those for YUV. */ + /* Remember: the canonical ordering of samples is to start with */ /* the lowest bit of the channel/location which touches bit 0 of */ /* the data, when the latter is concatenated in little-endian order, */ @@ -288,8 +319,20 @@ enum InterpretDFDResult interpretDFD(const uint32_t *DFD, currentByteOffset = sampleByteOffset; currentBitLength = sampleBitLength; if (sampleChannelPtr->size) { - /* Uh-oh, we've seen this channel before. */ - return i_UNSUPPORTED_NONTRIVIAL_ENDIANNESS; + if (model == KHR_DF_MODEL_YUVSDA && sampleChannel == KHR_DF_CHANNEL_YUVSDA_Y) { + if (sampleChannelPtr == R) { + /* We've got another Y channel. Record details in A. */ + if (A->size == 0) { + sampleChannelPtr = A; + } else { + /* Uh-oh, we've already got a second Y or an alpha channel. */ + return i_UNSUPPORTED_CHANNEL_TYPES; + } + } + } else { + /* Uh-oh, we've seen this channel before. */ + return i_UNSUPPORTED_NONTRIVIAL_ENDIANNESS; + } } /* For now, record the bit offset in little-endian terms, */ /* because we may not know to reverse it yet. */ @@ -378,8 +421,20 @@ enum InterpretDFDResult interpretDFD(const uint32_t *DFD, currentByteOffset = sampleByteOffset; currentByteLength = sampleByteLength; if (sampleChannelPtr->size) { - /* Uh-oh, we've seen this channel before. */ - return i_UNSUPPORTED_NONTRIVIAL_ENDIANNESS; + if (model == KHR_DF_MODEL_YUVSDA && sampleChannel == KHR_DF_CHANNEL_YUVSDA_Y) { + if (sampleChannelPtr == R) { + /* We've got another Y channel. Record details in A. */ + if (A->size == 0) { + sampleChannelPtr = A; + } else { + /* Uh-oh, we've already got a second Y or an alpha channel. */ + return i_UNSUPPORTED_CHANNEL_TYPES; + } + } + } else { + /* Uh-oh, we've seen this channel before. */ + return i_UNSUPPORTED_NONTRIVIAL_ENDIANNESS; + } } /* For now, record the byte offset in little-endian terms, */ /* because we may not know to reverse it yet. 
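The hunks above add fixed-point (SCALED) detection via the new i_FIXED_FORMAT_BIT and let a second Y sample of a 4:2:2 YUV format be recorded in the A channel. A minimal caller sketch, illustration only and not part of the patch, assuming the InterpretedDFDChannel struct and the enumerants that interpretDFD() uses, declared alongside it in the dfdutils headers, with dfd pointing at a complete DFD the application already holds (for example ktxTexture2::pDfd):

    #include <stdint.h>
    #include "dfd.h"   /* dfdutils declarations: interpretDFD, InterpretedDFDChannel, ... */

    static void describe_dfd(const uint32_t *dfd)
    {
        InterpretedDFDChannel R = {0}, G = {0}, B = {0}, A = {0};
        uint32_t wordBytes = 0;
        enum InterpretDFDResult r = interpretDFD(dfd, &R, &G, &B, &A, &wordBytes);
        /* Error handling omitted: on failure r is one of the i_UNSUPPORTED_*
           codes used in the hunks above. */
        if (r & i_YUVSDA_FORMAT_BIT) {
            /* R, G, B hold Y, Cb, Cr; for 4:2:2 layouts the second Y sample is in A. */
        } else if ((r & i_SRGB_FORMAT_BIT) && wordBytes == 1 && B.size > 0 && A.size == 0) {
            /* An 8-bit, 3-channel sRGB layout; R.offset distinguishes RGB from BGR,
               exactly as the VkFormat mapping earlier in this diff does. */
        }
    }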
*/ diff --git a/thirdparty/libktx/lib/dfdutils/queries.c b/thirdparty/libktx/lib/dfdutils/queries.c index 19488f9e33..66d77cf6ad 100644 --- a/thirdparty/libktx/lib/dfdutils/queries.c +++ b/thirdparty/libktx/lib/dfdutils/queries.c @@ -41,15 +41,15 @@ getDFDComponentInfoUnpacked(const uint32_t* DFD, uint32_t* numComponents, { const uint32_t *BDFDB = DFD+1; uint32_t numSamples = KHR_DFDSAMPLECOUNT(BDFDB); - uint32_t sampleCounter; + uint32_t sampleNumber; uint32_t currentChannel = ~0U; /* Don't start matched. */ /* This is specifically for unpacked formats which means the size of */ /* each component is the same. */ *numComponents = 0; - for (sampleCounter = 0; sampleCounter < numSamples; ++sampleCounter) { - uint32_t sampleByteLength = (KHR_DFDSVAL(BDFDB, sampleCounter, BITLENGTH) + 1) >> 3U; - uint32_t sampleChannel = KHR_DFDSVAL(BDFDB, sampleCounter, CHANNELID); + for (sampleNumber = 0; sampleNumber < numSamples; ++sampleNumber) { + uint32_t sampleByteLength = (KHR_DFDSVAL(BDFDB, sampleNumber, BITLENGTH) + 1) >> 3U; + uint32_t sampleChannel = KHR_DFDSVAL(BDFDB, sampleNumber, CHANNELID); if (sampleChannel == currentChannel) { /* Continuation of the same channel. */ @@ -85,10 +85,10 @@ uint32_t getDFDNumComponents(const uint32_t* DFD) uint32_t currentChannel = ~0U; /* Don't start matched. */ uint32_t numComponents = 0; uint32_t numSamples = KHR_DFDSAMPLECOUNT(BDFDB); - uint32_t sampleCounter; + uint32_t sampleNumber; - for (sampleCounter = 0; sampleCounter < numSamples; ++sampleCounter) { - uint32_t sampleChannel = KHR_DFDSVAL(BDFDB, sampleCounter, CHANNELID); + for (sampleNumber = 0; sampleNumber < numSamples; ++sampleNumber) { + uint32_t sampleChannel = KHR_DFDSVAL(BDFDB, sampleNumber, CHANNELID); if (sampleChannel != currentChannel) { numComponents++; currentChannel = sampleChannel; @@ -97,50 +97,83 @@ uint32_t getDFDNumComponents(const uint32_t* DFD) return numComponents; } + /** * @~English - * @brief Recreate the value of bytesPlane0 from sample info. + * @brief Reconstruct the value of bytesPlane0 from sample info. * - * This can be use to recreate the value of bytesPlane0 for data that - * has been variable-rate compressed so has bytesPlane0 = 0. For DFDs - * that are valid for KTX files. Little-endian data only and no multi-plane - * formats. + * Reconstruct the value for data that has been variable-rate compressed so + * has bytesPlane0 = 0. For DFDs that are valid for KTX files. Little-endian + * data only and no multi-plane formats. * * @param DFD Pointer to a Data Format Descriptor for which, * described as 32-bit words in native endianness. * Note that this is the whole descriptor, not just * the basic descriptor block. - * @param bytesPlane0 pointer to a 32-bit word in which the recreated - * value of bytesPlane0 will be written. */ -void -recreateBytesPlane0FromSampleInfo(const uint32_t* DFD, uint32_t* bytesPlane0) +uint32_t +reconstructDFDBytesPlane0FromSamples(const uint32_t* DFD) { const uint32_t *BDFDB = DFD+1; uint32_t numSamples = KHR_DFDSAMPLECOUNT(BDFDB); - uint32_t sampleCounter; + uint32_t sampleNumber; uint32_t bitsPlane0 = 0; - uint32_t* bitOffsets = malloc(sizeof(uint32_t) * numSamples); - memset(bitOffsets, -1, sizeof(uint32_t) * numSamples); - for (sampleCounter = 0; sampleCounter < numSamples; ++sampleCounter) { - uint32_t sampleBitOffset = KHR_DFDSVAL(BDFDB, sampleCounter, BITOFFSET); - /* The sample bitLength field stores the bit length - 1. 
*/ - uint32_t sampleBitLength = KHR_DFDSVAL(BDFDB, sampleCounter, BITLENGTH) + 1; - uint32_t i; - for (i = 0; i < numSamples; i++) { - if (sampleBitOffset == bitOffsets[i]) { - // This sample is being repeated as in e.g. RGB9E5. - break; + int32_t largestOffset = 0; + uint32_t sampleNumberWithLargestOffset = 0; + + // Special case these depth{,-stencil} formats. The unused bits are + // in the MSBs so have no visibility in the DFD therefore the max offset + // algorithm below returns a value that is too small. + if (KHR_DFDSVAL(BDFDB, 0, CHANNELID) == KHR_DF_CHANNEL_COMMON_DEPTH) { + if (numSamples == 1) { + if (KHR_DFDSVAL(BDFDB, 0, BITLENGTH) + 1 == 24) { + // X8_D24_UNORM_PACK32, + return 4; + } + } else if (numSamples == 2) { + if (KHR_DFDSVAL(BDFDB, 0, BITLENGTH) + 1 == 16) { + // D16_UNORM_S8_UINT + return 4; + } + if (KHR_DFDSVAL(BDFDB, 0, BITLENGTH) + 1 == 32 + && KHR_DFDSVAL(BDFDB, 1, CHANNELID) == KHR_DF_CHANNEL_COMMON_STENCIL) { + // D32_SFLOAT_S8_UINT + return 8; + } + } - if (i == numSamples) { - // Previously unseen bitOffset. Bump size. - bitsPlane0 += sampleBitLength; - bitOffsets[sampleCounter] = sampleBitOffset; + } + for (sampleNumber = 0; sampleNumber < numSamples; ++sampleNumber) { + int32_t sampleBitOffset = KHR_DFDSVAL(BDFDB, sampleNumber, BITOFFSET); + if (sampleBitOffset > largestOffset) { + largestOffset = sampleBitOffset; + sampleNumberWithLargestOffset = sampleNumber; + } } - free(bitOffsets); - *bytesPlane0 = bitsPlane0 >> 3U; + + /* The sample bitLength field stores the bit length - 1. */ + uint32_t sampleBitLength = KHR_DFDSVAL(BDFDB, sampleNumberWithLargestOffset, BITLENGTH) + 1; + bitsPlane0 = largestOffset + sampleBitLength; + return bitsPlane0 >> 3U; } +/** + * @~English + * @brief Reconstruct the value of bytesPlane0 from sample info. + * + * @see reconstructDFDBytesPlane0FromSamples for details. + * @deprecated For backward compatibility only. Use + * reconstructDFDBytesPlane0FromSamples. + * + * @param DFD Pointer to a Data Format Descriptor for which, + * described as 32-bit words in native endianness. + * Note that this is the whole descriptor, not just + * the basic descriptor block. + * @param bytesPlane0 pointer to a 32-bit word in which the recreated + * value of bytesPlane0 will be written.
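A standalone toy sketch, not libktx code, of the max-offset reconstruction that reconstructDFDBytesPlane0FromSamples() above performs (the struct and helper names here are hypothetical, and the depth/stencil special cases are omitted):

    #include <stdint.h>
    #include <stdio.h>

    struct toy_sample { uint32_t bitOffset; uint32_t bitLength; };

    /* bytesPlane0 = bytes spanned up to and including the sample with the
       largest bit offset, the same rule the new function applies. */
    static uint32_t toy_bytes_plane0(const struct toy_sample *s, uint32_t n)
    {
        uint32_t largestOffset = 0, which = 0;
        for (uint32_t i = 0; i < n; ++i) {
            if (s[i].bitOffset > largestOffset) {
                largestOffset = s[i].bitOffset;
                which = i;
            }
        }
        return (largestOffset + s[which].bitLength) >> 3U; /* bits -> bytes */
    }

    int main(void)
    {
        /* R8G8B8A8: four 8-bit samples at bit offsets 0, 8, 16, 24 -> 4 bytes. */
        struct toy_sample rgba8[4] = { {0, 8}, {8, 8}, {16, 8}, {24, 8} };
        printf("%u\n", (unsigned)toy_bytes_plane0(rgba8, 4));
        return 0;
    }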
+ */ +void +recreateBytesPlane0FromSampleInfo(const uint32_t* DFD, uint32_t* bytesPlane0) +{ + *bytesPlane0 = reconstructDFDBytesPlane0FromSamples(DFD); +} diff --git a/thirdparty/libktx/lib/dfdutils/vk2dfd.inl b/thirdparty/libktx/lib/dfdutils/vk2dfd.inl index 25c7a2c238..3398441e8c 100644 --- a/thirdparty/libktx/lib/dfdutils/vk2dfd.inl +++ b/thirdparty/libktx/lib/dfdutils/vk2dfd.inl @@ -277,68 +277,68 @@ case VK_FORMAT_ASTC_12x10_SRGB_BLOCK: return createDFDCompressed(c_ASTC, 12, 10, case VK_FORMAT_ASTC_12x12_UNORM_BLOCK: return createDFDCompressed(c_ASTC, 12, 12, 1, s_UNORM); case VK_FORMAT_ASTC_12x12_SRGB_BLOCK: return createDFDCompressed(c_ASTC, 12, 12, 1, s_SRGB); case VK_FORMAT_G8B8G8R8_422_UNORM: { - int channels[] = {0, 1, 0, 2}; int bits[] = {8, 8, 8, 8}; int paddings[] = {0, 0, 0, 0}; + int channels[] = {0, 1, 0, 2}; int bits[] = {8, 8, 8, 8}; int shiftBits[] = {0, 0, 0, 0}; int position_xs[] = {64, 64, 192, 64}; int position_ys[] = {128, 128, 128, 128}; - return createDFD422(0, 4, bits, paddings, channels, position_xs, position_ys, s_UNORM); + return createDFD422(0, 4, bits, shiftBits, channels, position_xs, position_ys, s_UNORM); } case VK_FORMAT_B8G8R8G8_422_UNORM: { - int channels[] = {1, 0, 2, 0}; int bits[] = {8, 8, 8, 8}; int paddings[] = {0, 0, 0, 0}; + int channels[] = {1, 0, 2, 0}; int bits[] = {8, 8, 8, 8}; int shiftBits[] = {0, 0, 0, 0}; int position_xs[] = {64, 64, 64, 192}; int position_ys[] = {128, 128, 128, 128}; - return createDFD422(0, 4, bits, paddings, channels, position_xs, position_ys, s_UNORM); + return createDFD422(0, 4, bits, shiftBits, channels, position_xs, position_ys, s_UNORM); } case VK_FORMAT_R10X6_UNORM_PACK16: { - int channels[] = {0}; int bits[] = {10}; int paddings[] = {6}; - return createDFDPackedPadded(0, 1, bits, paddings, channels, s_UNORM); + int channels[] = {0}; int bits[] = {10}; int shiftBits[] = {6}; + return createDFDPackedShifted(0, 1, bits, shiftBits, channels, s_UNORM); } case VK_FORMAT_R10X6G10X6_UNORM_2PACK16: { - int channels[] = {0, 1}; int bits[] = {10, 10}; int paddings[] = {6, 6}; - return createDFDPackedPadded(0, 2, bits, paddings, channels, s_UNORM); + int channels[] = {0, 1}; int bits[] = {10, 10}; int shiftBits[] = {6, 6}; + return createDFDPackedShifted(0, 2, bits, shiftBits, channels, s_UNORM); } case VK_FORMAT_R10X6G10X6B10X6A10X6_UNORM_4PACK16: { - int channels[] = {0, 1, 2, 3}; int bits[] = {10, 10, 10, 10}; int paddings[] = {6, 6, 6, 6}; - return createDFDPackedPadded(0, 4, bits, paddings, channels, s_UNORM); + int channels[] = {0, 1, 2, 3}; int bits[] = {10, 10, 10, 10}; int shiftBits[] = {6, 6, 6, 6}; + return createDFDPackedShifted(0, 4, bits, shiftBits, channels, s_UNORM); } case VK_FORMAT_G10X6B10X6G10X6R10X6_422_UNORM_4PACK16: { - int channels[] = {0, 1, 0, 2}; int bits[] = {10, 10, 10, 10}; int paddings[] = {6, 6, 6, 6}; + int channels[] = {0, 1, 0, 2}; int bits[] = {10, 10, 10, 10}; int shiftBits[] = {6, 6, 6, 6}; int position_xs[] = {64, 64, 192, 64}; int position_ys[] = {128, 128, 128, 128}; - return createDFD422(0, 4, bits, paddings, channels, position_xs, position_ys, s_UNORM); + return createDFD422(0, 4, bits, shiftBits, channels, position_xs, position_ys, s_UNORM); } case VK_FORMAT_B10X6G10X6R10X6G10X6_422_UNORM_4PACK16: { - int channels[] = {1, 0, 2, 0}; int bits[] = {10, 10, 10, 10}; int paddings[] = {6, 6, 6, 6}; + int channels[] = {1, 0, 2, 0}; int bits[] = {10, 10, 10, 10}; int shiftBits[] = {6, 6, 6, 6}; int position_xs[] = {64, 64, 64, 192}; int position_ys[] = {128, 128, 128, 128}; - return 
createDFD422(0, 4, bits, paddings, channels, position_xs, position_ys, s_UNORM); + return createDFD422(0, 4, bits, shiftBits, channels, position_xs, position_ys, s_UNORM); } case VK_FORMAT_R12X4_UNORM_PACK16: { - int channels[] = {0}; int bits[] = {12}; int paddings[] = {4}; - return createDFDPackedPadded(0, 1, bits, paddings, channels, s_UNORM); + int channels[] = {0}; int bits[] = {12}; int shiftBits[] = {4}; + return createDFDPackedShifted(0, 1, bits, shiftBits, channels, s_UNORM); } case VK_FORMAT_R12X4G12X4_UNORM_2PACK16: { - int channels[] = {0, 1}; int bits[] = {12, 12}; int paddings[] = {4, 4}; - return createDFDPackedPadded(0, 2, bits, paddings, channels, s_UNORM); + int channels[] = {0, 1}; int bits[] = {12, 12}; int shiftBits[] = {4, 4}; + return createDFDPackedShifted(0, 2, bits, shiftBits, channels, s_UNORM); } case VK_FORMAT_R12X4G12X4B12X4A12X4_UNORM_4PACK16: { - int channels[] = {0, 1, 2, 3}; int bits[] = {12, 12, 12, 12}; int paddings[] = {4, 4, 4, 4}; - return createDFDPackedPadded(0, 4, bits, paddings, channels, s_UNORM); + int channels[] = {0, 1, 2, 3}; int bits[] = {12, 12, 12, 12}; int shiftBits[] = {4, 4, 4, 4}; + return createDFDPackedShifted(0, 4, bits, shiftBits, channels, s_UNORM); } case VK_FORMAT_G12X4B12X4G12X4R12X4_422_UNORM_4PACK16: { - int channels[] = {0, 1, 0, 2}; int bits[] = {12, 12, 12, 12}; int paddings[] = {4, 4, 4, 4}; + int channels[] = {0, 1, 0, 2}; int bits[] = {12, 12, 12, 12}; int shiftBits[] = {4, 4, 4, 4}; int position_xs[] = {64, 64, 192, 64}; int position_ys[] = {128, 128, 128, 128}; - return createDFD422(0, 4, bits, paddings, channels, position_xs, position_ys, s_UNORM); + return createDFD422(0, 4, bits, shiftBits, channels, position_xs, position_ys, s_UNORM); } case VK_FORMAT_B12X4G12X4R12X4G12X4_422_UNORM_4PACK16: { - int channels[] = {1, 0, 2, 0}; int bits[] = {12, 12, 12, 12}; int paddings[] = {4, 4, 4, 4}; + int channels[] = {1, 0, 2, 0}; int bits[] = {12, 12, 12, 12}; int shiftBits[] = {4, 4, 4, 4}; int position_xs[] = {64, 64, 64, 192}; int position_ys[] = {128, 128, 128, 128}; - return createDFD422(0, 4, bits, paddings, channels, position_xs, position_ys, s_UNORM); + return createDFD422(0, 4, bits, shiftBits, channels, position_xs, position_ys, s_UNORM); } case VK_FORMAT_G16B16G16R16_422_UNORM: { - int channels[] = {0, 1, 0, 2}; int bits[] = {16, 16, 16, 16}; int paddings[] = {0, 0, 0, 0}; + int channels[] = {0, 1, 0, 2}; int bits[] = {16, 16, 16, 16}; int shiftBits[] = {0, 0, 0, 0}; int position_xs[] = {64, 64, 192, 64}; int position_ys[] = {128, 128, 128, 128}; - return createDFD422(0, 4, bits, paddings, channels, position_xs, position_ys, s_UNORM); + return createDFD422(0, 4, bits, shiftBits, channels, position_xs, position_ys, s_UNORM); } case VK_FORMAT_B16G16R16G16_422_UNORM: { - int channels[] = {1, 0, 2, 0}; int bits[] = {16, 16, 16, 16}; int paddings[] = {0, 0, 0, 0}; + int channels[] = {1, 0, 2, 0}; int bits[] = {16, 16, 16, 16}; int shiftBits[] = {0, 0, 0, 0}; int position_xs[] = {64, 64, 64, 192}; int position_ys[] = {128, 128, 128, 128}; - return createDFD422(0, 4, bits, paddings, channels, position_xs, position_ys, s_UNORM); + return createDFD422(0, 4, bits, shiftBits, channels, position_xs, position_ys, s_UNORM); } case VK_FORMAT_A4R4G4B4_UNORM_PACK16: { int channels[] = {2,1,0,3}; int bits[] = {4,4,4,4}; @@ -402,6 +402,7 @@ case VK_FORMAT_ASTC_6x6x6_UNORM_BLOCK_EXT: return createDFDCompressed(c_ASTC, 6, case VK_FORMAT_ASTC_6x6x6_SRGB_BLOCK_EXT: return createDFDCompressed(c_ASTC, 6, 6, 6, s_SRGB); case 
VK_FORMAT_ASTC_6x6x6_SFLOAT_BLOCK_EXT: return createDFDCompressed(c_ASTC, 6, 6, 6, s_SFLOAT); #endif +case VK_FORMAT_R16G16_S10_5_NV: return createDFDUnpacked(0, 2, 2, 0, s_S10_5); case VK_FORMAT_A1B5G5R5_UNORM_PACK16_KHR: { int channels[] = {0,1,2,3}; int bits[] = {5,5,5,1}; return createDFDPacked(0, 4, bits, channels, s_UNORM); diff --git a/thirdparty/libktx/lib/gl_format.h b/thirdparty/libktx/lib/gl_format.h index 3986bc430c..44abd2c6d4 100644 --- a/thirdparty/libktx/lib/gl_format.h +++ b/thirdparty/libktx/lib/gl_format.h @@ -1,4 +1,4 @@ -/* +/* ================================================================================================ Description : OpenGL formats/types and properties. @@ -70,7 +70,6 @@ static inline GLenum glGetFormatFromInternalFormat( const GLenum internalFormat static inline GLenum glGetTypeFromInternalFormat( const GLenum internalFormat ); static inline void glGetFormatSize( const GLenum internalFormat, GlFormatSize * pFormatSize ); static inline unsigned int glGetTypeSizeFromType( const GLenum type ); -static inline GLenum glGetInternalFormatFromVkFormat ( VkFormat format ); MODIFICATIONS for use in libktx =============================== @@ -79,7 +78,6 @@ MODIFICATIONS for use in libktx 2019.3.09 #if 0 around GL type declarations. 〃 2019.5.30 Use common ktxFormatSize to return results. 〃 2019.5.30 Return blockSizeInBits 0 for default case of glGetFormatSize. 〃 -2019.5.30 Added glGetInternalFormatFromVkFormat. 〃 ================================================================================================ */ @@ -101,6 +99,7 @@ MODIFICATIONS for use in libktx #endif // __cplusplus #endif + /* =========================================================================== Avoid warnings or even errors when using strict C99. 
"Redefinition of @@ -2436,219 +2435,4 @@ static inline void glGetFormatSize( const GLenum internalFormat, ktxFormatSize * } } -static inline GLint glGetInternalFormatFromVkFormat( VkFormat vkFormat ) -{ - switch ( vkFormat ) - { - // - // 8 bits per component - // - case VK_FORMAT_R8_UNORM: return GL_R8; // 1-component, 8-bit unsigned normalized - case VK_FORMAT_R8G8_UNORM: return GL_RG8; // 2-component, 8-bit unsigned normalized - case VK_FORMAT_R8G8B8_UNORM: return GL_RGB8; // 3-component, 8-bit unsigned normalized - case VK_FORMAT_R8G8B8A8_UNORM: return GL_RGBA8; // 4-component, 8-bit unsigned normalized - - case VK_FORMAT_R8_SNORM: return GL_R8_SNORM; // 1-component, 8-bit signed normalized - case VK_FORMAT_R8G8_SNORM: return GL_RG8_SNORM; // 2-component, 8-bit signed normalized - case VK_FORMAT_R8G8B8_SNORM: return GL_RGB8_SNORM; // 3-component, 8-bit signed normalized - case VK_FORMAT_R8G8B8A8_SNORM: return GL_RGBA8_SNORM; // 4-component, 8-bit signed normalized - - case VK_FORMAT_R8_UINT: return GL_R8UI; // 1-component, 8-bit unsigned integer - case VK_FORMAT_R8G8_UINT: return GL_RG8UI; // 2-component, 8-bit unsigned integer - case VK_FORMAT_R8G8B8_UINT: return GL_RGB8UI; // 3-component, 8-bit unsigned integer - case VK_FORMAT_R8G8B8A8_UINT: return GL_RGBA8UI; // 4-component, 8-bit unsigned integer - - case VK_FORMAT_R8_SINT: return GL_R8I; // 1-component, 8-bit signed integer - case VK_FORMAT_R8G8_SINT: return GL_RG8I; // 2-component, 8-bit signed integer - case VK_FORMAT_R8G8B8_SINT: return GL_RGB8I; // 3-component, 8-bit signed integer - case VK_FORMAT_R8G8B8A8_SINT: return GL_RGBA8I; // 4-component, 8-bit signed integer - - case VK_FORMAT_R8_SRGB: return GL_SR8; // 1-component, 8-bit sRGB - case VK_FORMAT_R8G8_SRGB: return GL_SRG8; // 2-component, 8-bit sRGB - case VK_FORMAT_R8G8B8_SRGB: return GL_SRGB8; // 3-component, 8-bit sRGB - case VK_FORMAT_R8G8B8A8_SRGB: return GL_SRGB8_ALPHA8; // 4-component, 8-bit sRGB - - // - // 16 bits per component - // - case VK_FORMAT_R16_UNORM: return GL_R16; // 1-component, 16-bit unsigned normalized - case VK_FORMAT_R16G16_UNORM: return GL_RG16; // 2-component, 16-bit unsigned normalized - case VK_FORMAT_R16G16B16_UNORM: return GL_RGB16; // 3-component, 16-bit unsigned normalized - case VK_FORMAT_R16G16B16A16_UNORM: return GL_RGBA16; // 4-component, 16-bit unsigned normalized - - case VK_FORMAT_R16_SNORM: return GL_R16_SNORM; // 1-component, 16-bit signed normalized - case VK_FORMAT_R16G16_SNORM: return GL_RG16_SNORM; // 2-component, 16-bit signed normalized - case VK_FORMAT_R16G16B16_SNORM: return GL_RGB16_SNORM; // 3-component, 16-bit signed normalized - case VK_FORMAT_R16G16B16A16_SNORM: return GL_RGBA16_SNORM; // 4-component, 16-bit signed normalized - - case VK_FORMAT_R16_UINT: return GL_R16UI; // 1-component, 16-bit unsigned integer - case VK_FORMAT_R16G16_UINT: return GL_RG16UI; // 2-component, 16-bit unsigned integer - case VK_FORMAT_R16G16B16_UINT: return GL_RGB16UI; // 3-component, 16-bit unsigned integer - case VK_FORMAT_R16G16B16A16_UINT: return GL_RGBA16UI; // 4-component, 16-bit unsigned integer - - case VK_FORMAT_R16_SINT: return GL_R16I; // 1-component, 16-bit signed integer - case VK_FORMAT_R16G16_SINT: return GL_RG16I; // 2-component, 16-bit signed integer - case VK_FORMAT_R16G16B16_SINT: return GL_RGB16I; // 3-component, 16-bit signed integer - case VK_FORMAT_R16G16B16A16_SINT: return GL_RGBA16I; // 4-component, 16-bit signed integer - - case VK_FORMAT_R16_SFLOAT: return GL_R16F; // 1-component, 16-bit floating-point - case 
VK_FORMAT_R16G16_SFLOAT: return GL_RG16F; // 2-component, 16-bit floating-point - case VK_FORMAT_R16G16B16_SFLOAT: return GL_RGB16F; // 3-component, 16-bit floating-point - case VK_FORMAT_R16G16B16A16_SFLOAT: return GL_RGBA16F; // 4-component, 16-bit floating-point - - // - // 32 bits per component - // - case VK_FORMAT_R32_UINT: return GL_R32UI; // 1-component, 32-bit unsigned integer - case VK_FORMAT_R32G32_UINT: return GL_RG32UI; // 2-component, 32-bit unsigned integer - case VK_FORMAT_R32G32B32_UINT: return GL_RGB32UI; // 3-component, 32-bit unsigned integer - case VK_FORMAT_R32G32B32A32_UINT: return GL_RGBA32UI; // 4-component, 32-bit unsigned integer - - case VK_FORMAT_R32_SINT: return GL_R32I; // 1-component, 32-bit signed integer - case VK_FORMAT_R32G32_SINT: return GL_RG32I; // 2-component, 32-bit signed integer - case VK_FORMAT_R32G32B32_SINT: return GL_RGB32I; // 3-component, 32-bit signed integer - case VK_FORMAT_R32G32B32A32_SINT: return GL_RGBA32I; // 4-component, 32-bit signed integer - - case VK_FORMAT_R32_SFLOAT: return GL_R32F; // 1-component, 32-bit floating-point - case VK_FORMAT_R32G32_SFLOAT: return GL_RG32F; // 2-component, 32-bit floating-point - case VK_FORMAT_R32G32B32_SFLOAT: return GL_RGB32F; // 3-component, 32-bit floating-point - case VK_FORMAT_R32G32B32A32_SFLOAT: return GL_RGBA32F; // 4-component, 32-bit floating-point - - // - // Packed - // - case VK_FORMAT_R5G5B5A1_UNORM_PACK16: return GL_RGB5; // 3-component 5:5:5, unsigned normalized - case VK_FORMAT_R5G6B5_UNORM_PACK16: return GL_RGB565; // 3-component 5:6:5, unsigned normalized - case VK_FORMAT_R4G4B4A4_UNORM_PACK16: return GL_RGBA4; // 4-component 4:4:4:4, unsigned normalized - case VK_FORMAT_A1R5G5B5_UNORM_PACK16: return GL_RGB5_A1; // 4-component 5:5:5:1, unsigned normalized - case VK_FORMAT_A2R10G10B10_UNORM_PACK32: return GL_RGB10_A2; // 4-component 10:10:10:2, unsigned normalized - case VK_FORMAT_A2R10G10B10_UINT_PACK32: return GL_RGB10_A2UI; // 4-component 10:10:10:2, unsigned integer - case VK_FORMAT_B10G11R11_UFLOAT_PACK32: return GL_R11F_G11F_B10F; // 3-component 11:11:10, floating-point - case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32: return GL_RGB9_E5; // 3-component/exp 9:9:9/5, floating-point - - // - // S3TC/DXT/BC - // - - case VK_FORMAT_BC1_RGB_UNORM_BLOCK: return GL_COMPRESSED_RGB_S3TC_DXT1_EXT; // line through 3D space, 4x4 blocks, unsigned normalized - case VK_FORMAT_BC1_RGBA_UNORM_BLOCK: return GL_COMPRESSED_RGBA_S3TC_DXT1_EXT; // line through 3D space plus 1-bit alpha, 4x4 blocks, unsigned normalized - case VK_FORMAT_BC2_UNORM_BLOCK: return GL_COMPRESSED_RGBA_S3TC_DXT3_EXT; // line through 3D space plus line through 1D space, 4x4 blocks, unsigned normalized - case VK_FORMAT_BC3_UNORM_BLOCK: return GL_COMPRESSED_RGBA_S3TC_DXT5_EXT; // line through 3D space plus 4-bit alpha, 4x4 blocks, unsigned normalized - - case VK_FORMAT_BC1_RGB_SRGB_BLOCK: return GL_COMPRESSED_SRGB_S3TC_DXT1_EXT; // line through 3D space, 4x4 blocks, sRGB - case VK_FORMAT_BC1_RGBA_SRGB_BLOCK: return GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT; // line through 3D space plus 1-bit alpha, 4x4 blocks, sRGB - case VK_FORMAT_BC2_SRGB_BLOCK: return GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT; // line through 3D space plus line through 1D space, 4x4 blocks, sRGB - case VK_FORMAT_BC3_SRGB_BLOCK: return GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT; // line through 3D space plus 4-bit alpha, 4x4 blocks, sRGB - - case VK_FORMAT_BC4_UNORM_BLOCK: return GL_COMPRESSED_RED_RGTC1; // line through 1D space, 4x4 blocks, unsigned normalized - case 
VK_FORMAT_BC5_UNORM_BLOCK: return GL_COMPRESSED_RG_RGTC2; // two lines through 1D space, 4x4 blocks, unsigned normalized - case VK_FORMAT_BC4_SNORM_BLOCK: return GL_COMPRESSED_SIGNED_RED_RGTC1; // line through 1D space, 4x4 blocks, signed normalized - case VK_FORMAT_BC5_SNORM_BLOCK: return GL_COMPRESSED_SIGNED_RG_RGTC2; // two lines through 1D space, 4x4 blocks, signed normalized - - case VK_FORMAT_BC6H_UFLOAT_BLOCK: return GL_COMPRESSED_RGB_BPTC_UNSIGNED_FLOAT; // 3-component, 4x4 blocks, unsigned floating-point - case VK_FORMAT_BC6H_SFLOAT_BLOCK: return GL_COMPRESSED_RGB_BPTC_SIGNED_FLOAT; // 3-component, 4x4 blocks, signed floating-point - case VK_FORMAT_BC7_UNORM_BLOCK: return GL_COMPRESSED_RGBA_BPTC_UNORM; // 4-component, 4x4 blocks, unsigned normalized - case VK_FORMAT_BC7_SRGB_BLOCK: return GL_COMPRESSED_SRGB_ALPHA_BPTC_UNORM; // 4-component, 4x4 blocks, sRGB - - // - // ETC - // - case VK_FORMAT_ETC2_R8G8B8_UNORM_BLOCK: return GL_COMPRESSED_RGB8_ETC2; // 3-component ETC2, 4x4 blocks, unsigned normalized - case VK_FORMAT_ETC2_R8G8B8A1_UNORM_BLOCK: return GL_COMPRESSED_RGB8_PUNCHTHROUGH_ALPHA1_ETC2; // 4-component ETC2 with 1-bit alpha, 4x4 blocks, unsigned normalized - case VK_FORMAT_ETC2_R8G8B8A8_UNORM_BLOCK: return GL_COMPRESSED_RGBA8_ETC2_EAC; // 4-component ETC2, 4x4 blocks, unsigned normalized - - case VK_FORMAT_ETC2_R8G8B8_SRGB_BLOCK: return GL_COMPRESSED_SRGB8_ETC2; // 3-component ETC2, 4x4 blocks, sRGB - case VK_FORMAT_ETC2_R8G8B8A1_SRGB_BLOCK: return GL_COMPRESSED_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2; // 4-component ETC2 with 1-bit alpha, 4x4 blocks, sRGB - case VK_FORMAT_ETC2_R8G8B8A8_SRGB_BLOCK: return GL_COMPRESSED_SRGB8_ALPHA8_ETC2_EAC; // 4-component ETC2, 4x4 blocks, sRGB - - case VK_FORMAT_EAC_R11_UNORM_BLOCK: return GL_COMPRESSED_R11_EAC; // 1-component ETC, 4x4 blocks, unsigned normalized - case VK_FORMAT_EAC_R11G11_UNORM_BLOCK: return GL_COMPRESSED_RG11_EAC; // 2-component ETC, 4x4 blocks, unsigned normalized - case VK_FORMAT_EAC_R11_SNORM_BLOCK: return GL_COMPRESSED_SIGNED_R11_EAC; // 1-component ETC, 4x4 blocks, signed normalized - case VK_FORMAT_EAC_R11G11_SNORM_BLOCK: return GL_COMPRESSED_SIGNED_RG11_EAC; // 2-component ETC, 4x4 blocks, signed normalized - - // - // PVRTC - // - case VK_FORMAT_PVRTC1_2BPP_UNORM_BLOCK_IMG: return GL_COMPRESSED_RGBA_PVRTC_2BPPV1_IMG; // 3- or 4-component PVRTC, 16x8 blocks, unsigned normalized - case VK_FORMAT_PVRTC1_4BPP_UNORM_BLOCK_IMG: return GL_COMPRESSED_RGBA_PVRTC_4BPPV1_IMG; // 3- or 4-component PVRTC, 8x8 blocks, unsigned normalized - case VK_FORMAT_PVRTC2_2BPP_UNORM_BLOCK_IMG: return GL_COMPRESSED_RGBA_PVRTC_2BPPV2_IMG; // 3- or 4-component PVRTC, 16x8 blocks, unsigned normalized - case VK_FORMAT_PVRTC2_4BPP_UNORM_BLOCK_IMG: return GL_COMPRESSED_RGBA_PVRTC_4BPPV2_IMG; // 3- or 4-component PVRTC, 4x4 blocks, unsigned normalized - - case VK_FORMAT_PVRTC1_2BPP_SRGB_BLOCK_IMG: return GL_COMPRESSED_SRGB_ALPHA_PVRTC_2BPPV1_EXT; // 4-component PVRTC, 16x8 blocks, sRGB - case VK_FORMAT_PVRTC1_4BPP_SRGB_BLOCK_IMG: return GL_COMPRESSED_SRGB_ALPHA_PVRTC_4BPPV1_EXT; // 4-component PVRTC, 8x8 blocks, sRGB - case VK_FORMAT_PVRTC2_2BPP_SRGB_BLOCK_IMG: return GL_COMPRESSED_SRGB_ALPHA_PVRTC_2BPPV2_IMG; // 4-component PVRTC, 8x4 blocks, sRGB - case VK_FORMAT_PVRTC2_4BPP_SRGB_BLOCK_IMG: return GL_COMPRESSED_SRGB_ALPHA_PVRTC_4BPPV2_IMG; // 4-component PVRTC, 4x4 blocks, sRGB - - // - // ASTC - // - case VK_FORMAT_ASTC_4x4_UNORM_BLOCK: return GL_COMPRESSED_RGBA_ASTC_4x4_KHR; // 4-component ASTC, 4x4 blocks, unsigned normalized - case 
VK_FORMAT_ASTC_5x4_UNORM_BLOCK: return GL_COMPRESSED_RGBA_ASTC_5x4_KHR; // 4-component ASTC, 5x4 blocks, unsigned normalized - case VK_FORMAT_ASTC_5x5_UNORM_BLOCK: return GL_COMPRESSED_RGBA_ASTC_5x5_KHR; // 4-component ASTC, 5x5 blocks, unsigned normalized - case VK_FORMAT_ASTC_6x5_UNORM_BLOCK: return GL_COMPRESSED_RGBA_ASTC_6x5_KHR; // 4-component ASTC, 6x5 blocks, unsigned normalized - case VK_FORMAT_ASTC_6x6_UNORM_BLOCK: return GL_COMPRESSED_RGBA_ASTC_6x6_KHR; // 4-component ASTC, 6x6 blocks, unsigned normalized - case VK_FORMAT_ASTC_8x5_UNORM_BLOCK: return GL_COMPRESSED_RGBA_ASTC_8x5_KHR; // 4-component ASTC, 8x5 blocks, unsigned normalized - case VK_FORMAT_ASTC_8x6_UNORM_BLOCK: return GL_COMPRESSED_RGBA_ASTC_8x6_KHR; // 4-component ASTC, 8x6 blocks, unsigned normalized - case VK_FORMAT_ASTC_8x8_UNORM_BLOCK: return GL_COMPRESSED_RGBA_ASTC_8x8_KHR; // 4-component ASTC, 8x8 blocks, unsigned normalized - case VK_FORMAT_ASTC_10x5_UNORM_BLOCK: return GL_COMPRESSED_RGBA_ASTC_10x5_KHR; // 4-component ASTC, 10x5 blocks, unsigned normalized - case VK_FORMAT_ASTC_10x6_UNORM_BLOCK: return GL_COMPRESSED_RGBA_ASTC_10x6_KHR; // 4-component ASTC, 10x6 blocks, unsigned normalized - case VK_FORMAT_ASTC_10x8_UNORM_BLOCK: return GL_COMPRESSED_RGBA_ASTC_10x8_KHR; // 4-component ASTC, 10x8 blocks, unsigned normalized - case VK_FORMAT_ASTC_10x10_UNORM_BLOCK: return GL_COMPRESSED_RGBA_ASTC_10x10_KHR; // 4-component ASTC, 10x10 blocks, unsigned normalized - case VK_FORMAT_ASTC_12x10_UNORM_BLOCK: return GL_COMPRESSED_RGBA_ASTC_12x10_KHR; // 4-component ASTC, 12x10 blocks, unsigned normalized - case VK_FORMAT_ASTC_12x12_UNORM_BLOCK: return GL_COMPRESSED_RGBA_ASTC_12x12_KHR; // 4-component ASTC, 12x12 blocks, unsigned normalized - - case VK_FORMAT_ASTC_4x4_SRGB_BLOCK: return GL_COMPRESSED_SRGB8_ALPHA8_ASTC_4x4_KHR; // 4-component ASTC, 4x4 blocks, sRGB - case VK_FORMAT_ASTC_5x4_SRGB_BLOCK: return GL_COMPRESSED_SRGB8_ALPHA8_ASTC_5x4_KHR; // 4-component ASTC, 5x4 blocks, sRGB - case VK_FORMAT_ASTC_5x5_SRGB_BLOCK: return GL_COMPRESSED_SRGB8_ALPHA8_ASTC_5x5_KHR; // 4-component ASTC, 5x5 blocks, sRGB - case VK_FORMAT_ASTC_6x5_SRGB_BLOCK: return GL_COMPRESSED_SRGB8_ALPHA8_ASTC_6x5_KHR; // 4-component ASTC, 6x5 blocks, sRGB - case VK_FORMAT_ASTC_6x6_SRGB_BLOCK: return GL_COMPRESSED_SRGB8_ALPHA8_ASTC_6x6_KHR; // 4-component ASTC, 6x6 blocks, sRGB - case VK_FORMAT_ASTC_8x5_SRGB_BLOCK: return GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x5_KHR; // 4-component ASTC, 8x5 blocks, sRGB - case VK_FORMAT_ASTC_8x6_SRGB_BLOCK: return GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x6_KHR; // 4-component ASTC, 8x6 blocks, sRGB - case VK_FORMAT_ASTC_8x8_SRGB_BLOCK: return GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x8_KHR; // 4-component ASTC, 8x8 blocks, sRGB - case VK_FORMAT_ASTC_10x5_SRGB_BLOCK: return GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x5_KHR; // 4-component ASTC, 10x5 blocks, sRGB - case VK_FORMAT_ASTC_10x6_SRGB_BLOCK: return GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x6_KHR; // 4-component ASTC, 10x6 blocks, sRGB - case VK_FORMAT_ASTC_10x8_SRGB_BLOCK: return GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x8_KHR; // 4-component ASTC, 10x8 blocks, sRGB - case VK_FORMAT_ASTC_10x10_SRGB_BLOCK: return GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x10_KHR; // 4-component ASTC, 10x10 blocks, sRGB - case VK_FORMAT_ASTC_12x10_SRGB_BLOCK: return GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x10_KHR; // 4-component ASTC, 12x10 blocks, sRGB - case VK_FORMAT_ASTC_12x12_SRGB_BLOCK: return GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x12_KHR; // 4-component ASTC, 12x12 blocks, sRGB - - // XXX FIXME Update once Vulkan ASTC HDR & 3D 
extensions are released. -#if 0 - case VK_FORMAT_UNDEFINED: return GL_COMPRESSED_RGBA_ASTC_3x3x3_OES; // 4-component ASTC, 3x3x3 blocks, unsigned normalized - case VK_FORMAT_UNDEFINED: return GL_COMPRESSED_RGBA_ASTC_4x3x3_OES; // 4-component ASTC, 4x3x3 blocks, unsigned normalized - case VK_FORMAT_UNDEFINED: return GL_COMPRESSED_RGBA_ASTC_4x4x3_OES; // 4-component ASTC, 4x4x3 blocks, unsigned normalized - case VK_FORMAT_UNDEFINED: return GL_COMPRESSED_RGBA_ASTC_4x4x4_OES; // 4-component ASTC, 4x4x4 blocks, unsigned normalized - case VK_FORMAT_UNDEFINED: return GL_COMPRESSED_RGBA_ASTC_5x4x4_OES; // 4-component ASTC, 5x4x4 blocks, unsigned normalized - case VK_FORMAT_UNDEFINED: return GL_COMPRESSED_RGBA_ASTC_5x5x4_OES; // 4-component ASTC, 5x5x4 blocks, unsigned normalized - case VK_FORMAT_UNDEFINED: return GL_COMPRESSED_RGBA_ASTC_5x5x5_OES; // 4-component ASTC, 5x5x5 blocks, unsigned normalized - case VK_FORMAT_UNDEFINED: return GL_COMPRESSED_RGBA_ASTC_6x5x5_OES; // 4-component ASTC, 6x5x5 blocks, unsigned normalized - case VK_FORMAT_UNDEFINED: return GL_COMPRESSED_RGBA_ASTC_6x6x5_OES; // 4-component ASTC, 6x6x5 blocks, unsigned normalized - case VK_FORMAT_UNDEFINED: return GL_COMPRESSED_RGBA_ASTC_6x6x6_OES; // 4-component ASTC, 6x6x6 blocks, unsigned normalized - - case VK_FORMAT_UNDEFINED: return GL_COMPRESSED_SRGB8_ALPHA8_ASTC_3x3x3_OES; // 4-component ASTC, 3x3x3 blocks, sRGB - case VK_FORMAT_UNDEFINED: return GL_COMPRESSED_SRGB8_ALPHA8_ASTC_4x3x3_OES; // 4-component ASTC, 4x3x3 blocks, sRGB - case VK_FORMAT_UNDEFINED: return GL_COMPRESSED_SRGB8_ALPHA8_ASTC_4x4x3_OES; // 4-component ASTC, 4x4x3 blocks, sRGB - case VK_FORMAT_UNDEFINED: return GL_COMPRESSED_SRGB8_ALPHA8_ASTC_4x4x4_OES; // 4-component ASTC, 4x4x4 blocks, sRGB - case VK_FORMAT_UNDEFINED: return GL_COMPRESSED_SRGB8_ALPHA8_ASTC_5x4x4_OES; // 4-component ASTC, 5x4x4 blocks, sRGB - case VK_FORMAT_UNDEFINED: return GL_COMPRESSED_SRGB8_ALPHA8_ASTC_5x5x4_OES; // 4-component ASTC, 5x5x4 blocks, sRGB - case VK_FORMAT_UNDEFINED: return GL_COMPRESSED_SRGB8_ALPHA8_ASTC_5x5x5_OES; // 4-component ASTC, 5x5x5 blocks, sRGB - case VK_FORMAT_UNDEFINED: return GL_COMPRESSED_SRGB8_ALPHA8_ASTC_6x5x5_OES; // 4-component ASTC, 6x5x5 blocks, sRGB - case VK_FORMAT_UNDEFINED: return GL_COMPRESSED_SRGB8_ALPHA8_ASTC_6x6x5_OES; // 4-component ASTC, 6x6x5 blocks, sRGB - case VK_FORMAT_UNDEFINED: return GL_COMPRESSED_SRGB8_ALPHA8_ASTC_6x6x6_OES; // 4-component ASTC, 6x6x6 blocks, sRGB -#endif - - // - // Depth/stencil - // - case VK_FORMAT_D16_UNORM: return GL_DEPTH_COMPONENT16; - case VK_FORMAT_X8_D24_UNORM_PACK32: return GL_DEPTH_COMPONENT24; - case VK_FORMAT_D32_SFLOAT: return GL_DEPTH_COMPONENT32F; - case VK_FORMAT_S8_UINT: return GL_STENCIL_INDEX8; - case VK_FORMAT_D24_UNORM_S8_UINT: return GL_DEPTH24_STENCIL8; - case VK_FORMAT_D32_SFLOAT_S8_UINT: return GL_DEPTH32F_STENCIL8; - - default: return GL_INVALID_VALUE; - } -} - #endif // !GL_FORMAT_H diff --git a/thirdparty/libktx/lib/texture2.c b/thirdparty/libktx/lib/texture2.c index 173e5026d9..014612fb57 100644 --- a/thirdparty/libktx/lib/texture2.c +++ b/thirdparty/libktx/lib/texture2.c @@ -20,6 +20,7 @@ #define _CRT_SECURE_NO_WARNINGS #endif +#include <assert.h> #include <stdlib.h> #include <string.h> #include <math.h> @@ -34,12 +35,13 @@ #include "memstream.h" #include "texture2.h" #include "unused.h" -#include "vk_format.h" // FIXME: Test this #define and put it in a header somewhere. 
//#define IS_BIG_ENDIAN (1 == *(unsigned char *)&(const int){0x01000000ul}) #define IS_BIG_ENDIAN 0 +extern uint32_t vkFormatTypeSize(VkFormat format); + struct ktxTexture_vtbl ktxTexture2_vtbl; struct ktxTexture_vtblInt ktxTexture2_vtblInt; @@ -217,18 +219,18 @@ ktx_uint32_t e5b9g9r9_ufloat_comparator[e5b9g9r9_bdbwordcount] = { #endif /** -* @private -* @~English -* @brief Initialize a ktxFormatSize object from the info in a DFD. -* -* This is used instead of referring to the DFD directly so code dealing -* with format info can be common to KTX 1 & 2. -* -* @param[in] This pointer the ktxTexture2 whose DFD to use. -* @param[in] fi pointer to the ktxFormatSize object to initialize. -* -* @return KTX_TRUE on success, otherwise KTX_FALSE. -*/ + * @private + * @~English + * @brief Initialize a ktxFormatSize object from the info in a DFD. + * + * This is used instead of referring to the DFD directly so code dealing + * with format info can be common to KTX 1 & 2. + * + * @param[in] This pointer the ktxFormatSize to initialize. + * @param[in] pDFD pointer to the DFD whose data to use. + * + * @return KTX_TRUE on success, otherwise KTX_FALSE. + */ bool ktxFormatSize_initFromDfd(ktxFormatSize* This, ktx_uint32_t* pDfd) { @@ -308,9 +310,10 @@ ktxFormatSize_initFromDfd(ktxFormatSize* This, ktx_uint32_t* pDfd) // the following reasons. (1) in v2 files levelIndex is always used to // calculate data size and, of course, for the level offsets. (2) Finer // grain access to supercompressed data than levels is not possible. - uint32_t blockByteLength; - recreateBytesPlane0FromSampleInfo(pDfd, &blockByteLength); - This->blockSizeInBits = blockByteLength * 8; + // + // The value set here is applied to the DFD after the data has been + // inflated during loading. + This->blockSizeInBits = reconstructDFDBytesPlane0FromSamples(pDfd) * 8; } return true; } @@ -427,34 +430,11 @@ ktxTexture2_construct(ktxTexture2* This, ktxTextureCreateInfo* createInfo, This->vkFormat = createInfo->vkFormat; - // Ideally we'd set all these things in ktxFormatSize_initFromDfd - // but This->_protected is not allocated until ktxTexture_construct; - if (This->isCompressed && (formatSize.flags & KTX_FORMAT_SIZE_YUVSDA_BIT) == 0) { - This->_protected->_typeSize = 1; - } else if (formatSize.flags & (KTX_FORMAT_SIZE_DEPTH_BIT | KTX_FORMAT_SIZE_STENCIL_BIT)) { - switch (createInfo->vkFormat) { - case VK_FORMAT_S8_UINT: - This->_protected->_typeSize = 1; - break; - case VK_FORMAT_D16_UNORM: // [[fallthrough]]; - case VK_FORMAT_D16_UNORM_S8_UINT: - This->_protected->_typeSize = 2; - break; - case VK_FORMAT_X8_D24_UNORM_PACK32: // [[fallthrough]]; - case VK_FORMAT_D24_UNORM_S8_UINT: // [[fallthrough]]; - case VK_FORMAT_D32_SFLOAT: // [[fallthrough]]; - case VK_FORMAT_D32_SFLOAT_S8_UINT: - This->_protected->_typeSize = 4; - break; - } - } else if (formatSize.flags & KTX_FORMAT_SIZE_PACKED_BIT) { - This->_protected->_typeSize = formatSize.blockSizeInBits / 8; - } else { - // Unpacked and uncompressed - uint32_t numComponents; - getDFDComponentInfoUnpacked(This->pDfd, &numComponents, - &This->_protected->_typeSize); - } + // The typeSize cannot be reconstructed just from the DFD as the BDFD + // does not capture the packing expressed by the [m]PACK[n] layout + // information in the VkFormat, so we calculate the typeSize directly + // from the vkFormat + This->_protected->_typeSize = vkFormatTypeSize(createInfo->vkFormat); This->supercompressionScheme = KTX_SS_NONE; diff --git a/thirdparty/libktx/lib/vk_format.h 
b/thirdparty/libktx/lib/vk_format.h index 18adf33b59..ab8a1ecbb6 100644 --- a/thirdparty/libktx/lib/vk_format.h +++ b/thirdparty/libktx/lib/vk_format.h @@ -401,6 +401,7 @@ static inline VkFormat vkGetFormatFromOpenGLFormat( const GLenum format, const G return VK_FORMAT_UNDEFINED; } +#if defined(NEED_VK_GET_FORMAT_FROM_OPENGL_TYPE) static inline VkFormat vkGetFormatFromOpenGLType( const GLenum type, const GLuint numComponents, const GLboolean normalized ) { switch ( type ) @@ -566,6 +567,7 @@ static inline VkFormat vkGetFormatFromOpenGLType( const GLenum type, const GLuin return VK_FORMAT_UNDEFINED; } +#endif static inline VkFormat vkGetFormatFromOpenGLInternalFormat( const GLenum internalFormat ) { @@ -823,6 +825,7 @@ static inline VkFormat vkGetFormatFromOpenGLInternalFormat( const GLenum interna } } +#if defined(NEED_VK_GET_FORMAT_SIZE) static inline void vkGetFormatSize( const VkFormat format, ktxFormatSize * pFormatSize ) { pFormatSize->minBlocksX = pFormatSize->minBlocksY = 1; @@ -1384,5 +1387,6 @@ static inline void vkGetFormatSize( const VkFormat format, ktxFormatSize * pForm break; } } +#endif #endif // !VK_FORMAT_H diff --git a/thirdparty/libktx/lib/vkformat_check.c b/thirdparty/libktx/lib/vkformat_check.c index e987b3d76a..5b86e3c8fe 100644 --- a/thirdparty/libktx/lib/vkformat_check.c +++ b/thirdparty/libktx/lib/vkformat_check.c @@ -30,13 +30,8 @@ isProhibitedFormat(VkFormat format) case VK_FORMAT_R8G8B8A8_SSCALED: case VK_FORMAT_B8G8R8A8_USCALED: case VK_FORMAT_B8G8R8A8_SSCALED: - case VK_FORMAT_A8B8G8R8_UNORM_PACK32: - case VK_FORMAT_A8B8G8R8_SNORM_PACK32: case VK_FORMAT_A8B8G8R8_USCALED_PACK32: case VK_FORMAT_A8B8G8R8_SSCALED_PACK32: - case VK_FORMAT_A8B8G8R8_UINT_PACK32: - case VK_FORMAT_A8B8G8R8_SINT_PACK32: - case VK_FORMAT_A8B8G8R8_SRGB_PACK32: case VK_FORMAT_A2R10G10B10_USCALED_PACK32: case VK_FORMAT_A2R10G10B10_SSCALED_PACK32: case VK_FORMAT_A2B10G10R10_USCALED_PACK32: diff --git a/thirdparty/libktx/lib/vkformat_typesize.c b/thirdparty/libktx/lib/vkformat_typesize.c new file mode 100644 index 0000000000..92fb468045 --- /dev/null +++ b/thirdparty/libktx/lib/vkformat_typesize.c @@ -0,0 +1,584 @@ + +/***************************** Do not edit. ***************************** + Automatically generated from vulkan_core.h version 267 by mkvkformatfiles. + *************************************************************************/ + +/* +** Copyright 2015-2023 The Khronos Group Inc. 
+** +** SPDX-License-Identifier: Apache-2.0 +*/ + + +#include <stdint.h> + +#include "vkformat_enum.h" + +uint32_t +vkFormatTypeSize(VkFormat format) +{ + switch (format) { + case VK_FORMAT_UNDEFINED: + return 1; + case VK_FORMAT_R4G4_UNORM_PACK8: + return 1; + case VK_FORMAT_R4G4B4A4_UNORM_PACK16: + return 2; + case VK_FORMAT_B4G4R4A4_UNORM_PACK16: + return 2; + case VK_FORMAT_R5G6B5_UNORM_PACK16: + return 2; + case VK_FORMAT_B5G6R5_UNORM_PACK16: + return 2; + case VK_FORMAT_R5G5B5A1_UNORM_PACK16: + return 2; + case VK_FORMAT_B5G5R5A1_UNORM_PACK16: + return 2; + case VK_FORMAT_A1R5G5B5_UNORM_PACK16: + return 2; + case VK_FORMAT_R8_UNORM: + return 1; + case VK_FORMAT_R8_SNORM: + return 1; + case VK_FORMAT_R8_USCALED: + return 1; + case VK_FORMAT_R8_SSCALED: + return 1; + case VK_FORMAT_R8_UINT: + return 1; + case VK_FORMAT_R8_SINT: + return 1; + case VK_FORMAT_R8_SRGB: + return 1; + case VK_FORMAT_R8G8_UNORM: + return 1; + case VK_FORMAT_R8G8_SNORM: + return 1; + case VK_FORMAT_R8G8_USCALED: + return 1; + case VK_FORMAT_R8G8_SSCALED: + return 1; + case VK_FORMAT_R8G8_UINT: + return 1; + case VK_FORMAT_R8G8_SINT: + return 1; + case VK_FORMAT_R8G8_SRGB: + return 1; + case VK_FORMAT_R8G8B8_UNORM: + return 1; + case VK_FORMAT_R8G8B8_SNORM: + return 1; + case VK_FORMAT_R8G8B8_USCALED: + return 1; + case VK_FORMAT_R8G8B8_SSCALED: + return 1; + case VK_FORMAT_R8G8B8_UINT: + return 1; + case VK_FORMAT_R8G8B8_SINT: + return 1; + case VK_FORMAT_R8G8B8_SRGB: + return 1; + case VK_FORMAT_B8G8R8_UNORM: + return 1; + case VK_FORMAT_B8G8R8_SNORM: + return 1; + case VK_FORMAT_B8G8R8_USCALED: + return 1; + case VK_FORMAT_B8G8R8_SSCALED: + return 1; + case VK_FORMAT_B8G8R8_UINT: + return 1; + case VK_FORMAT_B8G8R8_SINT: + return 1; + case VK_FORMAT_B8G8R8_SRGB: + return 1; + case VK_FORMAT_R8G8B8A8_UNORM: + return 1; + case VK_FORMAT_R8G8B8A8_SNORM: + return 1; + case VK_FORMAT_R8G8B8A8_USCALED: + return 1; + case VK_FORMAT_R8G8B8A8_SSCALED: + return 1; + case VK_FORMAT_R8G8B8A8_UINT: + return 1; + case VK_FORMAT_R8G8B8A8_SINT: + return 1; + case VK_FORMAT_R8G8B8A8_SRGB: + return 1; + case VK_FORMAT_B8G8R8A8_UNORM: + return 1; + case VK_FORMAT_B8G8R8A8_SNORM: + return 1; + case VK_FORMAT_B8G8R8A8_USCALED: + return 1; + case VK_FORMAT_B8G8R8A8_SSCALED: + return 1; + case VK_FORMAT_B8G8R8A8_UINT: + return 1; + case VK_FORMAT_B8G8R8A8_SINT: + return 1; + case VK_FORMAT_B8G8R8A8_SRGB: + return 1; + case VK_FORMAT_A8B8G8R8_UNORM_PACK32: + return 4; + case VK_FORMAT_A8B8G8R8_SNORM_PACK32: + return 4; + case VK_FORMAT_A8B8G8R8_USCALED_PACK32: + return 4; + case VK_FORMAT_A8B8G8R8_SSCALED_PACK32: + return 4; + case VK_FORMAT_A8B8G8R8_UINT_PACK32: + return 4; + case VK_FORMAT_A8B8G8R8_SINT_PACK32: + return 4; + case VK_FORMAT_A8B8G8R8_SRGB_PACK32: + return 4; + case VK_FORMAT_A2R10G10B10_UNORM_PACK32: + return 4; + case VK_FORMAT_A2R10G10B10_SNORM_PACK32: + return 4; + case VK_FORMAT_A2R10G10B10_USCALED_PACK32: + return 4; + case VK_FORMAT_A2R10G10B10_SSCALED_PACK32: + return 4; + case VK_FORMAT_A2R10G10B10_UINT_PACK32: + return 4; + case VK_FORMAT_A2R10G10B10_SINT_PACK32: + return 4; + case VK_FORMAT_A2B10G10R10_UNORM_PACK32: + return 4; + case VK_FORMAT_A2B10G10R10_SNORM_PACK32: + return 4; + case VK_FORMAT_A2B10G10R10_USCALED_PACK32: + return 4; + case VK_FORMAT_A2B10G10R10_SSCALED_PACK32: + return 4; + case VK_FORMAT_A2B10G10R10_UINT_PACK32: + return 4; + case VK_FORMAT_A2B10G10R10_SINT_PACK32: + return 4; + case VK_FORMAT_R16_UNORM: + return 2; + case VK_FORMAT_R16_SNORM: + return 2; + case 
VK_FORMAT_R16_USCALED: + return 2; + case VK_FORMAT_R16_SSCALED: + return 2; + case VK_FORMAT_R16_UINT: + return 2; + case VK_FORMAT_R16_SINT: + return 2; + case VK_FORMAT_R16_SFLOAT: + return 2; + case VK_FORMAT_R16G16_UNORM: + return 2; + case VK_FORMAT_R16G16_SNORM: + return 2; + case VK_FORMAT_R16G16_USCALED: + return 2; + case VK_FORMAT_R16G16_SSCALED: + return 2; + case VK_FORMAT_R16G16_UINT: + return 2; + case VK_FORMAT_R16G16_SINT: + return 2; + case VK_FORMAT_R16G16_SFLOAT: + return 2; + case VK_FORMAT_R16G16B16_UNORM: + return 2; + case VK_FORMAT_R16G16B16_SNORM: + return 2; + case VK_FORMAT_R16G16B16_USCALED: + return 2; + case VK_FORMAT_R16G16B16_SSCALED: + return 2; + case VK_FORMAT_R16G16B16_UINT: + return 2; + case VK_FORMAT_R16G16B16_SINT: + return 2; + case VK_FORMAT_R16G16B16_SFLOAT: + return 2; + case VK_FORMAT_R16G16B16A16_UNORM: + return 2; + case VK_FORMAT_R16G16B16A16_SNORM: + return 2; + case VK_FORMAT_R16G16B16A16_USCALED: + return 2; + case VK_FORMAT_R16G16B16A16_SSCALED: + return 2; + case VK_FORMAT_R16G16B16A16_UINT: + return 2; + case VK_FORMAT_R16G16B16A16_SINT: + return 2; + case VK_FORMAT_R16G16B16A16_SFLOAT: + return 2; + case VK_FORMAT_R32_UINT: + return 4; + case VK_FORMAT_R32_SINT: + return 4; + case VK_FORMAT_R32_SFLOAT: + return 4; + case VK_FORMAT_R32G32_UINT: + return 4; + case VK_FORMAT_R32G32_SINT: + return 4; + case VK_FORMAT_R32G32_SFLOAT: + return 4; + case VK_FORMAT_R32G32B32_UINT: + return 4; + case VK_FORMAT_R32G32B32_SINT: + return 4; + case VK_FORMAT_R32G32B32_SFLOAT: + return 4; + case VK_FORMAT_R32G32B32A32_UINT: + return 4; + case VK_FORMAT_R32G32B32A32_SINT: + return 4; + case VK_FORMAT_R32G32B32A32_SFLOAT: + return 4; + case VK_FORMAT_R64_UINT: + return 8; + case VK_FORMAT_R64_SINT: + return 8; + case VK_FORMAT_R64_SFLOAT: + return 8; + case VK_FORMAT_R64G64_UINT: + return 8; + case VK_FORMAT_R64G64_SINT: + return 8; + case VK_FORMAT_R64G64_SFLOAT: + return 8; + case VK_FORMAT_R64G64B64_UINT: + return 8; + case VK_FORMAT_R64G64B64_SINT: + return 8; + case VK_FORMAT_R64G64B64_SFLOAT: + return 8; + case VK_FORMAT_R64G64B64A64_UINT: + return 8; + case VK_FORMAT_R64G64B64A64_SINT: + return 8; + case VK_FORMAT_R64G64B64A64_SFLOAT: + return 8; + case VK_FORMAT_B10G11R11_UFLOAT_PACK32: + return 4; + case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32: + return 4; + case VK_FORMAT_D16_UNORM: + return 2; + case VK_FORMAT_X8_D24_UNORM_PACK32: + return 4; + case VK_FORMAT_D32_SFLOAT: + return 4; + case VK_FORMAT_S8_UINT: + return 1; + case VK_FORMAT_D16_UNORM_S8_UINT: + return 2; + case VK_FORMAT_D24_UNORM_S8_UINT: + return 4; + case VK_FORMAT_D32_SFLOAT_S8_UINT: + return 4; + case VK_FORMAT_BC1_RGB_UNORM_BLOCK: + return 1; + case VK_FORMAT_BC1_RGB_SRGB_BLOCK: + return 1; + case VK_FORMAT_BC1_RGBA_UNORM_BLOCK: + return 1; + case VK_FORMAT_BC1_RGBA_SRGB_BLOCK: + return 1; + case VK_FORMAT_BC2_UNORM_BLOCK: + return 1; + case VK_FORMAT_BC2_SRGB_BLOCK: + return 1; + case VK_FORMAT_BC3_UNORM_BLOCK: + return 1; + case VK_FORMAT_BC3_SRGB_BLOCK: + return 1; + case VK_FORMAT_BC4_UNORM_BLOCK: + return 1; + case VK_FORMAT_BC4_SNORM_BLOCK: + return 1; + case VK_FORMAT_BC5_UNORM_BLOCK: + return 1; + case VK_FORMAT_BC5_SNORM_BLOCK: + return 1; + case VK_FORMAT_BC6H_UFLOAT_BLOCK: + return 1; + case VK_FORMAT_BC6H_SFLOAT_BLOCK: + return 1; + case VK_FORMAT_BC7_UNORM_BLOCK: + return 1; + case VK_FORMAT_BC7_SRGB_BLOCK: + return 1; + case VK_FORMAT_ETC2_R8G8B8_UNORM_BLOCK: + return 1; + case VK_FORMAT_ETC2_R8G8B8_SRGB_BLOCK: + return 1; + case 
VK_FORMAT_ETC2_R8G8B8A1_UNORM_BLOCK: + return 1; + case VK_FORMAT_ETC2_R8G8B8A1_SRGB_BLOCK: + return 1; + case VK_FORMAT_ETC2_R8G8B8A8_UNORM_BLOCK: + return 1; + case VK_FORMAT_ETC2_R8G8B8A8_SRGB_BLOCK: + return 1; + case VK_FORMAT_EAC_R11_UNORM_BLOCK: + return 1; + case VK_FORMAT_EAC_R11_SNORM_BLOCK: + return 1; + case VK_FORMAT_EAC_R11G11_UNORM_BLOCK: + return 1; + case VK_FORMAT_EAC_R11G11_SNORM_BLOCK: + return 1; + case VK_FORMAT_ASTC_4x4_UNORM_BLOCK: + return 1; + case VK_FORMAT_ASTC_4x4_SRGB_BLOCK: + return 1; + case VK_FORMAT_ASTC_5x4_UNORM_BLOCK: + return 1; + case VK_FORMAT_ASTC_5x4_SRGB_BLOCK: + return 1; + case VK_FORMAT_ASTC_5x5_UNORM_BLOCK: + return 1; + case VK_FORMAT_ASTC_5x5_SRGB_BLOCK: + return 1; + case VK_FORMAT_ASTC_6x5_UNORM_BLOCK: + return 1; + case VK_FORMAT_ASTC_6x5_SRGB_BLOCK: + return 1; + case VK_FORMAT_ASTC_6x6_UNORM_BLOCK: + return 1; + case VK_FORMAT_ASTC_6x6_SRGB_BLOCK: + return 1; + case VK_FORMAT_ASTC_8x5_UNORM_BLOCK: + return 1; + case VK_FORMAT_ASTC_8x5_SRGB_BLOCK: + return 1; + case VK_FORMAT_ASTC_8x6_UNORM_BLOCK: + return 1; + case VK_FORMAT_ASTC_8x6_SRGB_BLOCK: + return 1; + case VK_FORMAT_ASTC_8x8_UNORM_BLOCK: + return 1; + case VK_FORMAT_ASTC_8x8_SRGB_BLOCK: + return 1; + case VK_FORMAT_ASTC_10x5_UNORM_BLOCK: + return 1; + case VK_FORMAT_ASTC_10x5_SRGB_BLOCK: + return 1; + case VK_FORMAT_ASTC_10x6_UNORM_BLOCK: + return 1; + case VK_FORMAT_ASTC_10x6_SRGB_BLOCK: + return 1; + case VK_FORMAT_ASTC_10x8_UNORM_BLOCK: + return 1; + case VK_FORMAT_ASTC_10x8_SRGB_BLOCK: + return 1; + case VK_FORMAT_ASTC_10x10_UNORM_BLOCK: + return 1; + case VK_FORMAT_ASTC_10x10_SRGB_BLOCK: + return 1; + case VK_FORMAT_ASTC_12x10_UNORM_BLOCK: + return 1; + case VK_FORMAT_ASTC_12x10_SRGB_BLOCK: + return 1; + case VK_FORMAT_ASTC_12x12_UNORM_BLOCK: + return 1; + case VK_FORMAT_ASTC_12x12_SRGB_BLOCK: + return 1; + case VK_FORMAT_G8B8G8R8_422_UNORM: + return 1; + case VK_FORMAT_B8G8R8G8_422_UNORM: + return 1; + case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM: + return 1; + case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM: + return 1; + case VK_FORMAT_G8_B8_R8_3PLANE_422_UNORM: + return 1; + case VK_FORMAT_G8_B8R8_2PLANE_422_UNORM: + return 1; + case VK_FORMAT_G8_B8_R8_3PLANE_444_UNORM: + return 1; + case VK_FORMAT_R10X6_UNORM_PACK16: + return 2; + case VK_FORMAT_R10X6G10X6_UNORM_2PACK16: + return 2; + case VK_FORMAT_R10X6G10X6B10X6A10X6_UNORM_4PACK16: + return 2; + case VK_FORMAT_G10X6B10X6G10X6R10X6_422_UNORM_4PACK16: + return 2; + case VK_FORMAT_B10X6G10X6R10X6G10X6_422_UNORM_4PACK16: + return 2; + case VK_FORMAT_G10X6_B10X6_R10X6_3PLANE_420_UNORM_3PACK16: + return 2; + case VK_FORMAT_G10X6_B10X6R10X6_2PLANE_420_UNORM_3PACK16: + return 2; + case VK_FORMAT_G10X6_B10X6_R10X6_3PLANE_422_UNORM_3PACK16: + return 2; + case VK_FORMAT_G10X6_B10X6R10X6_2PLANE_422_UNORM_3PACK16: + return 2; + case VK_FORMAT_G10X6_B10X6_R10X6_3PLANE_444_UNORM_3PACK16: + return 2; + case VK_FORMAT_R12X4_UNORM_PACK16: + return 2; + case VK_FORMAT_R12X4G12X4_UNORM_2PACK16: + return 2; + case VK_FORMAT_R12X4G12X4B12X4A12X4_UNORM_4PACK16: + return 2; + case VK_FORMAT_G12X4B12X4G12X4R12X4_422_UNORM_4PACK16: + return 2; + case VK_FORMAT_B12X4G12X4R12X4G12X4_422_UNORM_4PACK16: + return 2; + case VK_FORMAT_G12X4_B12X4_R12X4_3PLANE_420_UNORM_3PACK16: + return 2; + case VK_FORMAT_G12X4_B12X4R12X4_2PLANE_420_UNORM_3PACK16: + return 2; + case VK_FORMAT_G12X4_B12X4_R12X4_3PLANE_422_UNORM_3PACK16: + return 2; + case VK_FORMAT_G12X4_B12X4R12X4_2PLANE_422_UNORM_3PACK16: + return 2; + case 
VK_FORMAT_G12X4_B12X4_R12X4_3PLANE_444_UNORM_3PACK16: + return 2; + case VK_FORMAT_G16B16G16R16_422_UNORM: + return 2; + case VK_FORMAT_B16G16R16G16_422_UNORM: + return 2; + case VK_FORMAT_G16_B16_R16_3PLANE_420_UNORM: + return 2; + case VK_FORMAT_G16_B16R16_2PLANE_420_UNORM: + return 2; + case VK_FORMAT_G16_B16_R16_3PLANE_422_UNORM: + return 2; + case VK_FORMAT_G16_B16R16_2PLANE_422_UNORM: + return 2; + case VK_FORMAT_G16_B16_R16_3PLANE_444_UNORM: + return 2; + case VK_FORMAT_G8_B8R8_2PLANE_444_UNORM: + return 1; + case VK_FORMAT_G10X6_B10X6R10X6_2PLANE_444_UNORM_3PACK16: + return 2; + case VK_FORMAT_G12X4_B12X4R12X4_2PLANE_444_UNORM_3PACK16: + return 2; + case VK_FORMAT_G16_B16R16_2PLANE_444_UNORM: + return 2; + case VK_FORMAT_A4R4G4B4_UNORM_PACK16: + return 2; + case VK_FORMAT_A4B4G4R4_UNORM_PACK16: + return 2; + case VK_FORMAT_ASTC_4x4_SFLOAT_BLOCK: + return 1; + case VK_FORMAT_ASTC_5x4_SFLOAT_BLOCK: + return 1; + case VK_FORMAT_ASTC_5x5_SFLOAT_BLOCK: + return 1; + case VK_FORMAT_ASTC_6x5_SFLOAT_BLOCK: + return 1; + case VK_FORMAT_ASTC_6x6_SFLOAT_BLOCK: + return 1; + case VK_FORMAT_ASTC_8x5_SFLOAT_BLOCK: + return 1; + case VK_FORMAT_ASTC_8x6_SFLOAT_BLOCK: + return 1; + case VK_FORMAT_ASTC_8x8_SFLOAT_BLOCK: + return 1; + case VK_FORMAT_ASTC_10x5_SFLOAT_BLOCK: + return 1; + case VK_FORMAT_ASTC_10x6_SFLOAT_BLOCK: + return 1; + case VK_FORMAT_ASTC_10x8_SFLOAT_BLOCK: + return 1; + case VK_FORMAT_ASTC_10x10_SFLOAT_BLOCK: + return 1; + case VK_FORMAT_ASTC_12x10_SFLOAT_BLOCK: + return 1; + case VK_FORMAT_ASTC_12x12_SFLOAT_BLOCK: + return 1; + case VK_FORMAT_PVRTC1_2BPP_UNORM_BLOCK_IMG: + return 1; + case VK_FORMAT_PVRTC1_4BPP_UNORM_BLOCK_IMG: + return 1; + case VK_FORMAT_PVRTC2_2BPP_UNORM_BLOCK_IMG: + return 1; + case VK_FORMAT_PVRTC2_4BPP_UNORM_BLOCK_IMG: + return 1; + case VK_FORMAT_PVRTC1_2BPP_SRGB_BLOCK_IMG: + return 1; + case VK_FORMAT_PVRTC1_4BPP_SRGB_BLOCK_IMG: + return 1; + case VK_FORMAT_PVRTC2_2BPP_SRGB_BLOCK_IMG: + return 1; + case VK_FORMAT_PVRTC2_4BPP_SRGB_BLOCK_IMG: + return 1; + case VK_FORMAT_ASTC_3x3x3_UNORM_BLOCK_EXT: + return 1; + case VK_FORMAT_ASTC_3x3x3_SRGB_BLOCK_EXT: + return 1; + case VK_FORMAT_ASTC_3x3x3_SFLOAT_BLOCK_EXT: + return 1; + case VK_FORMAT_ASTC_4x3x3_UNORM_BLOCK_EXT: + return 1; + case VK_FORMAT_ASTC_4x3x3_SRGB_BLOCK_EXT: + return 1; + case VK_FORMAT_ASTC_4x3x3_SFLOAT_BLOCK_EXT: + return 1; + case VK_FORMAT_ASTC_4x4x3_UNORM_BLOCK_EXT: + return 1; + case VK_FORMAT_ASTC_4x4x3_SRGB_BLOCK_EXT: + return 1; + case VK_FORMAT_ASTC_4x4x3_SFLOAT_BLOCK_EXT: + return 1; + case VK_FORMAT_ASTC_4x4x4_UNORM_BLOCK_EXT: + return 1; + case VK_FORMAT_ASTC_4x4x4_SRGB_BLOCK_EXT: + return 1; + case VK_FORMAT_ASTC_4x4x4_SFLOAT_BLOCK_EXT: + return 1; + case VK_FORMAT_ASTC_5x4x4_UNORM_BLOCK_EXT: + return 1; + case VK_FORMAT_ASTC_5x4x4_SRGB_BLOCK_EXT: + return 1; + case VK_FORMAT_ASTC_5x4x4_SFLOAT_BLOCK_EXT: + return 1; + case VK_FORMAT_ASTC_5x5x4_UNORM_BLOCK_EXT: + return 1; + case VK_FORMAT_ASTC_5x5x4_SRGB_BLOCK_EXT: + return 1; + case VK_FORMAT_ASTC_5x5x4_SFLOAT_BLOCK_EXT: + return 1; + case VK_FORMAT_ASTC_5x5x5_UNORM_BLOCK_EXT: + return 1; + case VK_FORMAT_ASTC_5x5x5_SRGB_BLOCK_EXT: + return 1; + case VK_FORMAT_ASTC_5x5x5_SFLOAT_BLOCK_EXT: + return 1; + case VK_FORMAT_ASTC_6x5x5_UNORM_BLOCK_EXT: + return 1; + case VK_FORMAT_ASTC_6x5x5_SRGB_BLOCK_EXT: + return 1; + case VK_FORMAT_ASTC_6x5x5_SFLOAT_BLOCK_EXT: + return 1; + case VK_FORMAT_ASTC_6x6x5_UNORM_BLOCK_EXT: + return 1; + case VK_FORMAT_ASTC_6x6x5_SRGB_BLOCK_EXT: + return 1; + case VK_FORMAT_ASTC_6x6x5_SFLOAT_BLOCK_EXT: + 
return 1; + case VK_FORMAT_ASTC_6x6x6_UNORM_BLOCK_EXT: + return 1; + case VK_FORMAT_ASTC_6x6x6_SRGB_BLOCK_EXT: + return 1; + case VK_FORMAT_ASTC_6x6x6_SFLOAT_BLOCK_EXT: + return 1; + case VK_FORMAT_R16G16_S10_5_NV: + return 2; + case VK_FORMAT_A1B5G5R5_UNORM_PACK16_KHR: + return 2; + case VK_FORMAT_A8_UNORM_KHR: + return 1; + default: + return 0; + } +} diff --git a/thirdparty/libktx/patches/godot.patch b/thirdparty/libktx/patches/godot.patch index 28db86cf9b..13468813fb 100644 --- a/thirdparty/libktx/patches/godot.patch +++ b/thirdparty/libktx/patches/godot.patch @@ -16,7 +16,7 @@ index ca68545e4a..d7ecb7a0fd 100644 #undef DECLARE_PRIVATE #undef DECLARE_PROTECTED diff --git a/thirdparty/libktx/lib/dfdutils/vk2dfd.inl b/thirdparty/libktx/lib/dfdutils/vk2dfd.inl -index 85d53202a5..25c7a2c238 100644 +index 5104c8fcb4..3398441e8c 100644 --- a/thirdparty/libktx/lib/dfdutils/vk2dfd.inl +++ b/thirdparty/libktx/lib/dfdutils/vk2dfd.inl @@ -370,6 +370,7 @@ case VK_FORMAT_PVRTC1_2BPP_SRGB_BLOCK_IMG: return createDFDCompressed(c_PVRTC, 8 @@ -32,9 +32,9 @@ index 85d53202a5..25c7a2c238 100644 case VK_FORMAT_ASTC_6x6x6_SRGB_BLOCK_EXT: return createDFDCompressed(c_ASTC, 6, 6, 6, s_SRGB); case VK_FORMAT_ASTC_6x6x6_SFLOAT_BLOCK_EXT: return createDFDCompressed(c_ASTC, 6, 6, 6, s_SFLOAT); +#endif + case VK_FORMAT_R16G16_S10_5_NV: return createDFDUnpacked(0, 2, 2, 0, s_S10_5); case VK_FORMAT_A1B5G5R5_UNORM_PACK16_KHR: { int channels[] = {0,1,2,3}; int bits[] = {5,5,5,1}; - return createDFDPacked(0, 4, bits, channels, s_UNORM); diff --git a/thirdparty/libktx/lib/miniz_wrapper.cpp b/thirdparty/libktx/lib/miniz_wrapper.cpp index 07920c4809..cbd7da540a 100644 --- a/thirdparty/libktx/lib/miniz_wrapper.cpp diff --git a/thirdparty/mbedtls/include/mbedtls/aesni.h b/thirdparty/mbedtls/include/mbedtls/aesni.h index b636c100ae..93f067304d 100644 --- a/thirdparty/mbedtls/include/mbedtls/aesni.h +++ b/thirdparty/mbedtls/include/mbedtls/aesni.h @@ -46,7 +46,7 @@ * macros that may change in future releases. */ #undef MBEDTLS_AESNI_HAVE_INTRINSICS -#if defined(_MSC_VER) +#if defined(_MSC_VER) && !defined(__clang__) /* Visual Studio supports AESNI intrinsics since VS 2008 SP1. We only support * VS 2013 and up for other reasons anyway, so no need to check the version. */ #define MBEDTLS_AESNI_HAVE_INTRINSICS @@ -54,7 +54,7 @@ /* GCC-like compilers: currently, we only support intrinsics if the requisite * target flag is enabled when building the library (e.g. `gcc -mpclmul -msse2` * or `clang -maes -mpclmul`). */ -#if defined(__GNUC__) && defined(__AES__) && defined(__PCLMUL__) +#if (defined(__GNUC__) || defined(__clang__)) && defined(__AES__) && defined(__PCLMUL__) #define MBEDTLS_AESNI_HAVE_INTRINSICS #endif diff --git a/thirdparty/mbedtls/include/mbedtls/config.h b/thirdparty/mbedtls/include/mbedtls/config.h index ac2146ea11..4842fd494c 100644 --- a/thirdparty/mbedtls/include/mbedtls/config.h +++ b/thirdparty/mbedtls/include/mbedtls/config.h @@ -1571,6 +1571,26 @@ //#define MBEDTLS_PSA_INJECT_ENTROPY /** + * \def MBEDTLS_PSA_ASSUME_EXCLUSIVE_BUFFERS + * + * Assume all buffers passed to PSA functions are owned exclusively by the + * PSA function and are not stored in shared memory. + * + * This option may be enabled if all buffers passed to any PSA function reside + * in memory that is accessible only to the PSA function during its execution. 
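Illustrative aside on the new lib/vkformat_typesize.c above (not part of the patch): vkFormatTypeSize() returns the byte size of a format's underlying component word, not the per-texel size; libktx presumably feeds this into the KTX2 header's typeSize field, which drives endianness conversion. A minimal sketch in C, with the return values taken directly from the switch above:

    #include <stdint.h>
    #include "vkformat_enum.h"

    uint32_t vkFormatTypeSize(VkFormat format);   /* declared by vkformat_typesize.c above */

    static void type_size_examples(void)
    {
        uint32_t a = vkFormatTypeSize(VK_FORMAT_R16G16B16A16_SFLOAT);      /* 2: 16-bit components */
        uint32_t b = vkFormatTypeSize(VK_FORMAT_A2B10G10R10_UNORM_PACK32); /* 4: one packed 32-bit word */
        uint32_t c = vkFormatTypeSize(VK_FORMAT_BC7_UNORM_BLOCK);          /* 1: compressed blocks are byte-oriented */
        (void)a; (void)b; (void)c;
    }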
+ * + * This option MUST be disabled whenever buffer arguments are in memory shared + * with an untrusted party, for example where arguments to PSA calls are passed + * across a trust boundary. + * + * \note Enabling this option reduces memory usage and code size. + * + * \note Enabling this option causes overlap of input and output buffers + * not to be supported by PSA functions. + */ +//#define MBEDTLS_PSA_ASSUME_EXCLUSIVE_BUFFERS + +/** * \def MBEDTLS_RSA_NO_CRT * * Do not use the Chinese Remainder Theorem diff --git a/thirdparty/mbedtls/include/mbedtls/ecp.h b/thirdparty/mbedtls/include/mbedtls/ecp.h index e4e40c003c..33ea14d7e2 100644 --- a/thirdparty/mbedtls/include/mbedtls/ecp.h +++ b/thirdparty/mbedtls/include/mbedtls/ecp.h @@ -1265,6 +1265,8 @@ int mbedtls_ecp_gen_key(mbedtls_ecp_group_id grp_id, mbedtls_ecp_keypair *key, /** * \brief This function reads an elliptic curve private key. * + * \note This function does not support Curve448 yet. + * * \param grp_id The ECP group identifier. * \param key The destination key. * \param buf The buffer containing the binary representation of the @@ -1286,17 +1288,43 @@ int mbedtls_ecp_read_key(mbedtls_ecp_group_id grp_id, mbedtls_ecp_keypair *key, /** * \brief This function exports an elliptic curve private key. * + * \note Note that although this function accepts an output + * buffer that is smaller or larger than the key, most key + * import interfaces require the output to have exactly + * key's nominal length. It is generally simplest to + * pass the key's nominal length as \c buflen, after + * checking that the output buffer is large enough. + * See the description of the \p buflen parameter for + * how to calculate the nominal length. + * + * \note If the private key was not set in \p key, + * the output is unspecified. Future versions + * may return an error in that case. + * + * \note This function does not support Curve448 yet. + * * \param key The private key. * \param buf The output buffer for containing the binary representation - * of the key. (Big endian integer for Weierstrass curves, byte - * string for Montgomery curves.) + * of the key. + * For Weierstrass curves, this is the big-endian + * representation, padded with null bytes at the beginning + * to reach \p buflen bytes. + * For Montgomery curves, this is the standard byte string + * representation (which is little-endian), padded with + * null bytes at the end to reach \p buflen bytes. * \param buflen The total length of the buffer in bytes. + * The length of the output is + * (`grp->nbits` + 7) / 8 bytes + * where `grp->nbits` is the private key size in bits. + * For Weierstrass keys, if the output buffer is smaller, + * leading zeros are trimmed to fit if possible. For + * Montgomery keys, the output buffer must always be large + * enough for the nominal length. * * \return \c 0 on success. - * \return #MBEDTLS_ERR_ECP_BUFFER_TOO_SMALL if the \p key - representation is larger than the available space in \p buf. - * \return #MBEDTLS_ERR_ECP_FEATURE_UNAVAILABLE if the operation for - * the group is not implemented. + * \return #MBEDTLS_ERR_ECP_BUFFER_TOO_SMALL or + * #MBEDTLS_ERR_MPI_BUFFER_TOO_SMALL if the \p key + * representation is larger than the available space in \p buf. * \return Another negative error code on different kinds of failure. 
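Illustrative aside on the expanded mbedtls_ecp_write_key() documentation above (hypothetical caller code, not part of the patch): the nominal output length is (grp->nbits + 7) / 8 bytes, and passing exactly that value as buflen is the simplest pattern that works for both Weierstrass and Montgomery keys.

    mbedtls_ecp_keypair key;   /* assumed already initialized and filled, e.g. by mbedtls_ecp_gen_key() */
    unsigned char out[66];     /* 66 bytes is enough for curves up to 521 bits */
    size_t nominal_len = (key.grp.nbits + 7) / 8;             /* nominal length as described above */
    int ret = mbedtls_ecp_write_key(&key, out, nominal_len);  /* 0 on success, *_BUFFER_TOO_SMALL if it does not fit */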
 */
 int mbedtls_ecp_write_key(mbedtls_ecp_keypair *key,
diff --git a/thirdparty/mbedtls/include/mbedtls/net_sockets.h b/thirdparty/mbedtls/include/mbedtls/net_sockets.h
index 2d3fe3f949..1a12c9c803 100644
--- a/thirdparty/mbedtls/include/mbedtls/net_sockets.h
+++ b/thirdparty/mbedtls/include/mbedtls/net_sockets.h
@@ -140,7 +140,7 @@ int mbedtls_net_bind(mbedtls_net_context *ctx, const char *bind_ip, const char *
  * \param client_ctx Will contain the connected client socket
  * \param client_ip Will contain the client IP address, can be NULL
  * \param buf_size Size of the client_ip buffer
- * \param ip_len Will receive the size of the client IP written,
+ * \param cip_len Will receive the size of the client IP written,
  *                can be NULL if client_ip is null
  *
  * \return 0 if successful, or
@@ -153,7 +153,7 @@ int mbedtls_net_bind(mbedtls_net_context *ctx, const char *bind_ip, const char *
  */
 int mbedtls_net_accept(mbedtls_net_context *bind_ctx,
                        mbedtls_net_context *client_ctx,
-                       void *client_ip, size_t buf_size, size_t *ip_len);
+                       void *client_ip, size_t buf_size, size_t *cip_len);
 /**
  * \brief          Check and wait for the context to be ready for read/write
diff --git a/thirdparty/mbedtls/include/mbedtls/version.h b/thirdparty/mbedtls/include/mbedtls/version.h
index 0533bca681..bbe76b1739 100644
--- a/thirdparty/mbedtls/include/mbedtls/version.h
+++ b/thirdparty/mbedtls/include/mbedtls/version.h
@@ -26,16 +26,16 @@
  */
 #define MBEDTLS_VERSION_MAJOR  2
 #define MBEDTLS_VERSION_MINOR  28
-#define MBEDTLS_VERSION_PATCH  7
+#define MBEDTLS_VERSION_PATCH  8
 /**
  * The single version number has the following structure:
  *    MMNNPP00
  *    Major version | Minor version | Patch version
  */
-#define MBEDTLS_VERSION_NUMBER         0x021C0700
-#define MBEDTLS_VERSION_STRING         "2.28.7"
-#define MBEDTLS_VERSION_STRING_FULL    "Mbed TLS 2.28.7"
+#define MBEDTLS_VERSION_NUMBER         0x021C0800
+#define MBEDTLS_VERSION_STRING         "2.28.8"
+#define MBEDTLS_VERSION_STRING_FULL    "Mbed TLS 2.28.8"
 #if defined(MBEDTLS_VERSION_C)
diff --git a/thirdparty/mbedtls/library/aes.c b/thirdparty/mbedtls/library/aes.c
index 24d7ab92fb..836367cea7 100644
--- a/thirdparty/mbedtls/library/aes.c
+++ b/thirdparty/mbedtls/library/aes.c
@@ -322,7 +322,7 @@ static const uint32_t RT3[256] = { RT };
 /*
  * Round constants
  */
-static const uint32_t RCON[10] =
+static const uint32_t round_constants[10] =
 {
     0x00000001, 0x00000002, 0x00000004, 0x00000008,
     0x00000010, 0x00000020, 0x00000040, 0x00000080,
@@ -369,7 +369,7 @@ static uint32_t RT3[256];
 /*
  * Round constants
  */
-static uint32_t RCON[10];
+static uint32_t round_constants[10];
 /*
  * Tables generation code
@@ -399,7 +399,7 @@ static void aes_gen_tables(void)
      * calculate the round constants
      */
     for (i = 0, x = 1; i < 10; i++) {
-        RCON[i] = (uint32_t) x;
+        round_constants[i] = (uint32_t) x;
         x = MBEDTLS_BYTE_0(XTIME(x));
     }
@@ -625,7 +625,7 @@ int mbedtls_aes_setkey_enc(mbedtls_aes_context *ctx, const unsigned char *key,
         case 10:
             for (i = 0; i < 10; i++, RK += 4) {
-                RK[4] = RK[0] ^ RCON[i] ^
+                RK[4] = RK[0] ^ round_constants[i] ^
                     ((uint32_t) FSb[MBEDTLS_BYTE_1(RK[3])]) ^
                     ((uint32_t) FSb[MBEDTLS_BYTE_2(RK[3])] << 8) ^
                     ((uint32_t) FSb[MBEDTLS_BYTE_3(RK[3])] << 16) ^
@@ -640,7 +640,7 @@ int mbedtls_aes_setkey_enc(mbedtls_aes_context *ctx, const unsigned char *key,
         case 12:
             for (i = 0; i < 8; i++, RK += 6) {
-                RK[6] = RK[0] ^ RCON[i] ^
+                RK[6] = RK[0] ^ round_constants[i] ^
                     ((uint32_t) FSb[MBEDTLS_BYTE_1(RK[5])]) ^
                     ((uint32_t) FSb[MBEDTLS_BYTE_2(RK[5])] << 8) ^
                     ((uint32_t) FSb[MBEDTLS_BYTE_3(RK[5])] << 16) ^
@@ -657,7 +657,7 @@ int
mbedtls_aes_setkey_enc(mbedtls_aes_context *ctx, const unsigned char *key, case 14: for (i = 0; i < 7; i++, RK += 8) { - RK[8] = RK[0] ^ RCON[i] ^ + RK[8] = RK[0] ^ round_constants[i] ^ ((uint32_t) FSb[MBEDTLS_BYTE_1(RK[7])]) ^ ((uint32_t) FSb[MBEDTLS_BYTE_2(RK[7])] << 8) ^ ((uint32_t) FSb[MBEDTLS_BYTE_3(RK[7])] << 16) ^ diff --git a/thirdparty/mbedtls/library/aesni.c b/thirdparty/mbedtls/library/aesni.c index dd84c2b4ea..74bae91f5e 100644 --- a/thirdparty/mbedtls/library/aesni.c +++ b/thirdparty/mbedtls/library/aesni.c @@ -27,10 +27,12 @@ #if defined(MBEDTLS_AESNI_HAVE_CODE) #if MBEDTLS_AESNI_HAVE_CODE == 2 -#if !defined(_WIN32) +#if defined(__GNUC__) #include <cpuid.h> -#else +#elif defined(_MSC_VER) #include <intrin.h> +#else +#error "`__cpuid` required by MBEDTLS_AESNI_C is not supported by the compiler" #endif #include <immintrin.h> #endif @@ -45,7 +47,7 @@ int mbedtls_aesni_has_support(unsigned int what) if (!done) { #if MBEDTLS_AESNI_HAVE_CODE == 2 - static unsigned info[4] = { 0, 0, 0, 0 }; + static int info[4] = { 0, 0, 0, 0 }; #if defined(_MSC_VER) __cpuid(info, 1); #else @@ -179,7 +181,7 @@ void mbedtls_aesni_gcm_mult(unsigned char c[16], const unsigned char a[16], const unsigned char b[16]) { - __m128i aa, bb, cc, dd; + __m128i aa = { 0 }, bb = { 0 }, cc, dd; /* The inputs are in big-endian order, so byte-reverse them */ for (size_t i = 0; i < 16; i++) { diff --git a/thirdparty/mbedtls/library/common.h b/thirdparty/mbedtls/library/common.h index bf18d725cc..49e2c97ea0 100644 --- a/thirdparty/mbedtls/library/common.h +++ b/thirdparty/mbedtls/library/common.h @@ -350,4 +350,31 @@ static inline const unsigned char *mbedtls_buffer_offset_const( #define MBEDTLS_STATIC_ASSERT(expr, msg) #endif +/* Suppress compiler warnings for unused functions and variables. */ +#if !defined(MBEDTLS_MAYBE_UNUSED) && defined(__has_attribute) +# if __has_attribute(unused) +# define MBEDTLS_MAYBE_UNUSED __attribute__((unused)) +# endif +#endif +#if !defined(MBEDTLS_MAYBE_UNUSED) && defined(__GNUC__) +# define MBEDTLS_MAYBE_UNUSED __attribute__((unused)) +#endif +#if !defined(MBEDTLS_MAYBE_UNUSED) && defined(__IAR_SYSTEMS_ICC__) && defined(__VER__) +/* IAR does support __attribute__((unused)), but only if the -e flag (extended language support) + * is given; the pragma always works. + * Unfortunately the pragma affects the rest of the file where it is used, but this is harmless. + * Check for version 5.2 or later - this pragma may be supported by earlier versions, but I wasn't + * able to find documentation). 
+ */ +# if (__VER__ >= 5020000) +# define MBEDTLS_MAYBE_UNUSED _Pragma("diag_suppress=Pe177") +# endif +#endif +#if !defined(MBEDTLS_MAYBE_UNUSED) && defined(_MSC_VER) +# define MBEDTLS_MAYBE_UNUSED __pragma(warning(suppress:4189)) +#endif +#if !defined(MBEDTLS_MAYBE_UNUSED) +# define MBEDTLS_MAYBE_UNUSED +#endif + #endif /* MBEDTLS_LIBRARY_COMMON_H */ diff --git a/thirdparty/mbedtls/library/ecp.c b/thirdparty/mbedtls/library/ecp.c index 31a6b9e305..cfe02b0d2c 100644 --- a/thirdparty/mbedtls/library/ecp.c +++ b/thirdparty/mbedtls/library/ecp.c @@ -927,7 +927,7 @@ int mbedtls_ecp_point_read_binary(const mbedtls_ecp_group *grp, size_t plen; ECP_VALIDATE_RET(grp != NULL); ECP_VALIDATE_RET(pt != NULL); - ECP_VALIDATE_RET(buf != NULL); + ECP_VALIDATE_RET(ilen == 0 || buf != NULL); if (ilen < 1) { return MBEDTLS_ERR_ECP_BAD_INPUT_DATA; @@ -996,7 +996,7 @@ int mbedtls_ecp_tls_read_point(const mbedtls_ecp_group *grp, ECP_VALIDATE_RET(grp != NULL); ECP_VALIDATE_RET(pt != NULL); ECP_VALIDATE_RET(buf != NULL); - ECP_VALIDATE_RET(*buf != NULL); + ECP_VALIDATE_RET(buf_len == 0 || *buf != NULL); /* * We must have at least two bytes (1 for length, at least one for data) @@ -1068,7 +1068,7 @@ int mbedtls_ecp_tls_read_group(mbedtls_ecp_group *grp, mbedtls_ecp_group_id grp_id; ECP_VALIDATE_RET(grp != NULL); ECP_VALIDATE_RET(buf != NULL); - ECP_VALIDATE_RET(*buf != NULL); + ECP_VALIDATE_RET(len == 0 || *buf != NULL); if ((ret = mbedtls_ecp_tls_read_group_id(&grp_id, buf, len)) != 0) { return ret; @@ -1088,7 +1088,7 @@ int mbedtls_ecp_tls_read_group_id(mbedtls_ecp_group_id *grp, const mbedtls_ecp_curve_info *curve_info; ECP_VALIDATE_RET(grp != NULL); ECP_VALIDATE_RET(buf != NULL); - ECP_VALIDATE_RET(*buf != NULL); + ECP_VALIDATE_RET(len == 0 || *buf != NULL); /* * We expect at least three bytes (see below) @@ -2614,8 +2614,8 @@ static int ecp_mul_mxz(mbedtls_ecp_group *grp, mbedtls_ecp_point *R, /* RP.X might be slightly larger than P, so reduce it */ MOD_ADD(RP.X); + /* Randomize coordinates of the starting point */ #if defined(MBEDTLS_ECP_NO_INTERNAL_RNG) - /* Derandomize coordinates of the starting point */ if (f_rng == NULL) { have_rng = 0; } @@ -3358,10 +3358,10 @@ cleanup: int mbedtls_ecp_write_key(mbedtls_ecp_keypair *key, unsigned char *buf, size_t buflen) { - int ret = MBEDTLS_ERR_ECP_FEATURE_UNAVAILABLE; + int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED; ECP_VALIDATE_RET(key != NULL); - ECP_VALIDATE_RET(buf != NULL); + ECP_VALIDATE_RET(buflen == 0 || buf != NULL); #if defined(MBEDTLS_ECP_MONTGOMERY_ENABLED) if (mbedtls_ecp_get_type(&key->grp) == MBEDTLS_ECP_TYPE_MONTGOMERY) { diff --git a/thirdparty/mbedtls/library/ecp_curves.c b/thirdparty/mbedtls/library/ecp_curves.c index c7565cce5d..61a1046f3a 100644 --- a/thirdparty/mbedtls/library/ecp_curves.c +++ b/thirdparty/mbedtls/library/ecp_curves.c @@ -535,10 +535,10 @@ static inline void ecp_mpi_load(mbedtls_mpi *X, const mbedtls_mpi_uint *p, size_ */ static inline void ecp_mpi_set1(mbedtls_mpi *X) { - static mbedtls_mpi_uint one[] = { 1 }; + static const mbedtls_mpi_uint one[] = { 1 }; X->s = 1; X->n = 1; - X->p = one; + X->p = (mbedtls_mpi_uint *) one; /* X->p will not be modified so the cast is safe */ } /* @@ -1348,7 +1348,7 @@ cleanup: */ #define P_KOBLITZ_MAX (256 / 8 / sizeof(mbedtls_mpi_uint)) // Max limbs in P #define P_KOBLITZ_R (8 / sizeof(mbedtls_mpi_uint)) // Limbs in R -static inline int ecp_mod_koblitz(mbedtls_mpi *N, mbedtls_mpi_uint *Rp, size_t p_limbs, +static inline int ecp_mod_koblitz(mbedtls_mpi *N, const mbedtls_mpi_uint *Rp, 
size_t p_limbs, size_t adjust, size_t shift, mbedtls_mpi_uint mask) { int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED; @@ -1362,7 +1362,7 @@ static inline int ecp_mod_koblitz(mbedtls_mpi *N, mbedtls_mpi_uint *Rp, size_t p /* Init R */ R.s = 1; - R.p = Rp; + R.p = (mbedtls_mpi_uint *) Rp; /* R.p will not be modified so the cast is safe */ R.n = P_KOBLITZ_R; /* Common setup for M */ @@ -1433,7 +1433,7 @@ cleanup: */ static int ecp_mod_p192k1(mbedtls_mpi *N) { - static mbedtls_mpi_uint Rp[] = { + static const mbedtls_mpi_uint Rp[] = { MBEDTLS_BYTES_TO_T_UINT_8(0xC9, 0x11, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00) }; @@ -1450,7 +1450,7 @@ static int ecp_mod_p192k1(mbedtls_mpi *N) */ static int ecp_mod_p224k1(mbedtls_mpi *N) { - static mbedtls_mpi_uint Rp[] = { + static const mbedtls_mpi_uint Rp[] = { MBEDTLS_BYTES_TO_T_UINT_8(0x93, 0x1A, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00) }; @@ -1472,7 +1472,7 @@ static int ecp_mod_p224k1(mbedtls_mpi *N) */ static int ecp_mod_p256k1(mbedtls_mpi *N) { - static mbedtls_mpi_uint Rp[] = { + static const mbedtls_mpi_uint Rp[] = { MBEDTLS_BYTES_TO_T_UINT_8(0xD1, 0x03, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00) }; diff --git a/thirdparty/mbedtls/library/entropy_poll.c b/thirdparty/mbedtls/library/entropy_poll.c index 4c5184686e..727f848b93 100644 --- a/thirdparty/mbedtls/library/entropy_poll.c +++ b/thirdparty/mbedtls/library/entropy_poll.c @@ -5,7 +5,7 @@ * SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later */ -#if defined(__linux__) && !defined(_GNU_SOURCE) +#if defined(__linux__) || defined(__midipix__) && !defined(_GNU_SOURCE) /* Ensure that syscall() is available even when compiling with -std=c99 */ #define _GNU_SOURCE #endif diff --git a/thirdparty/mbedtls/library/gcm.c b/thirdparty/mbedtls/library/gcm.c index 86d5fa2b5f..d3e773278f 100644 --- a/thirdparty/mbedtls/library/gcm.c +++ b/thirdparty/mbedtls/library/gcm.c @@ -241,7 +241,7 @@ int mbedtls_gcm_starts(mbedtls_gcm_context *ctx, uint64_t iv_bits; GCM_VALIDATE_RET(ctx != NULL); - GCM_VALIDATE_RET(iv != NULL); + GCM_VALIDATE_RET(iv_len == 0 || iv != NULL); GCM_VALIDATE_RET(add_len == 0 || add != NULL); /* IV and AD are limited to 2^64 bits, so 2^61 bytes */ @@ -433,7 +433,7 @@ int mbedtls_gcm_crypt_and_tag(mbedtls_gcm_context *ctx, int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED; GCM_VALIDATE_RET(ctx != NULL); - GCM_VALIDATE_RET(iv != NULL); + GCM_VALIDATE_RET(iv_len == 0 || iv != NULL); GCM_VALIDATE_RET(add_len == 0 || add != NULL); GCM_VALIDATE_RET(length == 0 || input != NULL); GCM_VALIDATE_RET(length == 0 || output != NULL); @@ -470,7 +470,7 @@ int mbedtls_gcm_auth_decrypt(mbedtls_gcm_context *ctx, int diff; GCM_VALIDATE_RET(ctx != NULL); - GCM_VALIDATE_RET(iv != NULL); + GCM_VALIDATE_RET(iv_len == 0 || iv != NULL); GCM_VALIDATE_RET(add_len == 0 || add != NULL); GCM_VALIDATE_RET(tag != NULL); GCM_VALIDATE_RET(length == 0 || input != NULL); diff --git a/thirdparty/mbedtls/library/net_sockets.c b/thirdparty/mbedtls/library/net_sockets.c index 8140eeade4..5d985ef001 100644 --- a/thirdparty/mbedtls/library/net_sockets.c +++ b/thirdparty/mbedtls/library/net_sockets.c @@ -321,7 +321,7 @@ static int net_would_block(const mbedtls_net_context *ctx) */ int mbedtls_net_accept(mbedtls_net_context *bind_ctx, mbedtls_net_context *client_ctx, - void *client_ip, size_t buf_size, size_t *ip_len) + void *client_ip, size_t buf_size, size_t *cip_len) { int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED; int type; @@ -404,22 +404,22 @@ int mbedtls_net_accept(mbedtls_net_context *bind_ctx, if (client_ip != NULL) { if 
(client_addr.ss_family == AF_INET) { struct sockaddr_in *addr4 = (struct sockaddr_in *) &client_addr; - *ip_len = sizeof(addr4->sin_addr.s_addr); + *cip_len = sizeof(addr4->sin_addr.s_addr); - if (buf_size < *ip_len) { + if (buf_size < *cip_len) { return MBEDTLS_ERR_NET_BUFFER_TOO_SMALL; } - memcpy(client_ip, &addr4->sin_addr.s_addr, *ip_len); + memcpy(client_ip, &addr4->sin_addr.s_addr, *cip_len); } else { struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *) &client_addr; - *ip_len = sizeof(addr6->sin6_addr.s6_addr); + *cip_len = sizeof(addr6->sin6_addr.s6_addr); - if (buf_size < *ip_len) { + if (buf_size < *cip_len) { return MBEDTLS_ERR_NET_BUFFER_TOO_SMALL; } - memcpy(client_ip, &addr6->sin6_addr.s6_addr, *ip_len); + memcpy(client_ip, &addr6->sin6_addr.s6_addr, *cip_len); } } diff --git a/thirdparty/mbedtls/library/pk_wrap.c b/thirdparty/mbedtls/library/pk_wrap.c index 14c6d3f99c..dd460a6a0c 100644 --- a/thirdparty/mbedtls/library/pk_wrap.c +++ b/thirdparty/mbedtls/library/pk_wrap.c @@ -53,7 +53,23 @@ static int rsa_can_do(mbedtls_pk_type_t type) static size_t rsa_get_bitlen(const void *ctx) { const mbedtls_rsa_context *rsa = (const mbedtls_rsa_context *) ctx; - return 8 * mbedtls_rsa_get_len(rsa); + /* Unfortunately, the rsa.h interface does not have a direct way + * to access the bit-length that works with MBEDTLS_RSA_ALT. + * So we have to do a little work here. + */ + mbedtls_mpi N; + mbedtls_mpi_init(&N); + int ret = mbedtls_rsa_export(rsa, &N, NULL, NULL, NULL, NULL); + /* If the export fails for some reason (e.g. the RSA_ALT implementation + * does not support export, or there is not enough memory), + * we have no way of returning an error from this function. + * As a fallback, return the byte-length converted in bits, which is + * the correct value if the modulus size is a multiple of 8 bits, which + * is very often the case in practice. */ + size_t bitlen = (ret == 0 ? mbedtls_mpi_bitlen(&N) : + 8 * mbedtls_rsa_get_len(rsa)); + mbedtls_mpi_free(&N); + return bitlen; } static int rsa_verify_wrap(void *ctx, mbedtls_md_type_t md_alg, diff --git a/thirdparty/mbedtls/library/pkcs12.c b/thirdparty/mbedtls/library/pkcs12.c index 712488233f..55de216edb 100644 --- a/thirdparty/mbedtls/library/pkcs12.c +++ b/thirdparty/mbedtls/library/pkcs12.c @@ -244,21 +244,22 @@ int mbedtls_pkcs12_pbe_ext(mbedtls_asn1_buf *pbe_params, int mode, } #if defined(MBEDTLS_CIPHER_MODE_WITH_PADDING) - /* PKCS12 uses CBC with PKCS7 padding */ - - mbedtls_cipher_padding_t padding = MBEDTLS_PADDING_PKCS7; + { + /* PKCS12 uses CBC with PKCS7 padding */ + mbedtls_cipher_padding_t padding = MBEDTLS_PADDING_PKCS7; #if !defined(MBEDTLS_CIPHER_PADDING_PKCS7) - /* For historical reasons, when decrypting, this function works when - * decrypting even when support for PKCS7 padding is disabled. In this - * case, it ignores the padding, and so will never report a - * password mismatch. - */ - if (mode == MBEDTLS_PKCS12_PBE_DECRYPT) { - padding = MBEDTLS_PADDING_NONE; - } + /* For historical reasons, when decrypting, this function works when + * decrypting even when support for PKCS7 padding is disabled. In this + * case, it ignores the padding, and so will never report a + * password mismatch. 
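Illustrative aside on the rsa_get_bitlen() change above (not part of the patch): the exact bit length is now taken from the modulus, since 8 * mbedtls_rsa_get_len() is only correct when the modulus size is a multiple of 8 bits. The same computation at the caller level, error handling elided, with rsa assumed to be a populated mbedtls_rsa_context:

    mbedtls_mpi N;
    mbedtls_mpi_init(&N);
    size_t bits = 0;
    if (mbedtls_rsa_export(&rsa, &N, NULL, NULL, NULL, NULL) == 0) {
        bits = mbedtls_mpi_bitlen(&N);   /* exact modulus bit length, e.g. 2048 */
    }
    mbedtls_mpi_free(&N);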
+ */ + if (mode == MBEDTLS_PKCS12_PBE_DECRYPT) { + padding = MBEDTLS_PADDING_NONE; + } #endif - if ((ret = mbedtls_cipher_set_padding_mode(&cipher_ctx, padding)) != 0) { - goto exit; + if ((ret = mbedtls_cipher_set_padding_mode(&cipher_ctx, padding)) != 0) { + goto exit; + } } #endif /* MBEDTLS_CIPHER_MODE_WITH_PADDING */ diff --git a/thirdparty/mbedtls/library/pkcs5.c b/thirdparty/mbedtls/library/pkcs5.c index 8e5b751a38..90703c45f9 100644 --- a/thirdparty/mbedtls/library/pkcs5.c +++ b/thirdparty/mbedtls/library/pkcs5.c @@ -239,23 +239,25 @@ int mbedtls_pkcs5_pbes2_ext(const mbedtls_asn1_buf *pbe_params, int mode, } #if defined(MBEDTLS_CIPHER_MODE_WITH_PADDING) - /* PKCS5 uses CBC with PKCS7 padding (which is the same as - * "PKCS5 padding" except that it's typically only called PKCS5 - * with 64-bit-block ciphers). - */ - mbedtls_cipher_padding_t padding = MBEDTLS_PADDING_PKCS7; + { + /* PKCS5 uses CBC with PKCS7 padding (which is the same as + * "PKCS5 padding" except that it's typically only called PKCS5 + * with 64-bit-block ciphers). + */ + mbedtls_cipher_padding_t padding = MBEDTLS_PADDING_PKCS7; #if !defined(MBEDTLS_CIPHER_PADDING_PKCS7) - /* For historical reasons, when decrypting, this function works when - * decrypting even when support for PKCS7 padding is disabled. In this - * case, it ignores the padding, and so will never report a - * password mismatch. - */ - if (mode == MBEDTLS_DECRYPT) { - padding = MBEDTLS_PADDING_NONE; - } + /* For historical reasons, when decrypting, this function works when + * decrypting even when support for PKCS7 padding is disabled. In this + * case, it ignores the padding, and so will never report a + * password mismatch. + */ + if (mode == MBEDTLS_DECRYPT) { + padding = MBEDTLS_PADDING_NONE; + } #endif - if ((ret = mbedtls_cipher_set_padding_mode(&cipher_ctx, padding)) != 0) { - goto exit; + if ((ret = mbedtls_cipher_set_padding_mode(&cipher_ctx, padding)) != 0) { + goto exit; + } } #endif /* MBEDTLS_CIPHER_MODE_WITH_PADDING */ if ((ret = mbedtls_cipher_crypt(&cipher_ctx, iv, enc_scheme_params.len, diff --git a/thirdparty/mbedtls/library/pkwrite.c b/thirdparty/mbedtls/library/pkwrite.c index fafcf0e1a7..534290df4e 100644 --- a/thirdparty/mbedtls/library/pkwrite.c +++ b/thirdparty/mbedtls/library/pkwrite.c @@ -559,38 +559,49 @@ end_of_export: int mbedtls_pk_write_pubkey_pem(mbedtls_pk_context *key, unsigned char *buf, size_t size) { int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED; - unsigned char output_buf[PUB_DER_MAX_BYTES]; + unsigned char *output_buf = NULL; + output_buf = mbedtls_calloc(1, PUB_DER_MAX_BYTES); + if (output_buf == NULL) { + return MBEDTLS_ERR_PK_ALLOC_FAILED; + } size_t olen = 0; PK_VALIDATE_RET(key != NULL); PK_VALIDATE_RET(buf != NULL || size == 0); if ((ret = mbedtls_pk_write_pubkey_der(key, output_buf, - sizeof(output_buf))) < 0) { - return ret; + PUB_DER_MAX_BYTES)) < 0) { + goto cleanup; } if ((ret = mbedtls_pem_write_buffer(PEM_BEGIN_PUBLIC_KEY, PEM_END_PUBLIC_KEY, - output_buf + sizeof(output_buf) - ret, + output_buf + PUB_DER_MAX_BYTES - ret, ret, buf, size, &olen)) != 0) { - return ret; + goto cleanup; } - return 0; + ret = 0; +cleanup: + mbedtls_free(output_buf); + return ret; } int mbedtls_pk_write_key_pem(mbedtls_pk_context *key, unsigned char *buf, size_t size) { int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED; - unsigned char output_buf[PRV_DER_MAX_BYTES]; + unsigned char *output_buf = NULL; + output_buf = mbedtls_calloc(1, PRV_DER_MAX_BYTES); + if (output_buf == NULL) { + return MBEDTLS_ERR_PK_ALLOC_FAILED; + } 
const char *begin, *end; size_t olen = 0; PK_VALIDATE_RET(key != NULL); PK_VALIDATE_RET(buf != NULL || size == 0); - if ((ret = mbedtls_pk_write_key_der(key, output_buf, sizeof(output_buf))) < 0) { - return ret; + if ((ret = mbedtls_pk_write_key_der(key, output_buf, PRV_DER_MAX_BYTES)) < 0) { + goto cleanup; } #if defined(MBEDTLS_RSA_C) @@ -605,15 +616,22 @@ int mbedtls_pk_write_key_pem(mbedtls_pk_context *key, unsigned char *buf, size_t end = PEM_END_PRIVATE_KEY_EC; } else #endif - return MBEDTLS_ERR_PK_FEATURE_UNAVAILABLE; + { + ret = MBEDTLS_ERR_PK_FEATURE_UNAVAILABLE; + goto cleanup; + } if ((ret = mbedtls_pem_write_buffer(begin, end, - output_buf + sizeof(output_buf) - ret, + output_buf + PRV_DER_MAX_BYTES - ret, ret, buf, size, &olen)) != 0) { - return ret; + goto cleanup; } - return 0; + ret = 0; +cleanup: + mbedtls_platform_zeroize(output_buf, PRV_DER_MAX_BYTES); + mbedtls_free(output_buf); + return ret; } #endif /* MBEDTLS_PEM_WRITE_C */ diff --git a/thirdparty/mbedtls/library/platform_util.c b/thirdparty/mbedtls/library/platform_util.c index a86b07fa3f..df34167a8f 100644 --- a/thirdparty/mbedtls/library/platform_util.c +++ b/thirdparty/mbedtls/library/platform_util.c @@ -66,10 +66,10 @@ void mbedtls_platform_zeroize(void *buf, size_t len) #include <time.h> #if !defined(_WIN32) && (defined(unix) || \ defined(__unix) || defined(__unix__) || (defined(__APPLE__) && \ - defined(__MACH__))) + defined(__MACH__)) || defined(__midipix__)) #include <unistd.h> #endif /* !_WIN32 && (unix || __unix || __unix__ || - * (__APPLE__ && __MACH__)) */ + * (__APPLE__ && __MACH__)) || __midipix__ */ #if !((defined(_POSIX_VERSION) && _POSIX_VERSION >= 200809L) || \ (defined(_POSIX_THREAD_SAFE_FUNCTIONS) && \ diff --git a/thirdparty/mbedtls/library/ssl_tls.c b/thirdparty/mbedtls/library/ssl_tls.c index 1a2bc7bc9e..c667a2923b 100644 --- a/thirdparty/mbedtls/library/ssl_tls.c +++ b/thirdparty/mbedtls/library/ssl_tls.c @@ -992,8 +992,7 @@ static int ssl_populate_transform(mbedtls_ssl_transform *transform, !defined(MBEDTLS_SSL_EXPORT_KEYS) && \ !defined(MBEDTLS_SSL_DTLS_CONNECTION_ID) && \ !defined(MBEDTLS_DEBUG_C) - ssl = NULL; /* make sure we don't use it except for those cases */ - (void) ssl; + (void) ssl; /* ssl is unused except for those cases */ #endif /* @@ -5205,6 +5204,12 @@ const mbedtls_ssl_session *mbedtls_ssl_get_session_pointer(const mbedtls_ssl_con #define SSL_SERIALIZED_SESSION_CONFIG_CRT 0 #endif /* MBEDTLS_X509_CRT_PARSE_C */ +#if defined(MBEDTLS_SSL_KEEP_PEER_CERTIFICATE) +#define SSL_SERIALIZED_SESSION_CONFIG_KEEP_PEER_CRT 1 +#else +#define SSL_SERIALIZED_SESSION_CONFIG_KEEP_PEER_CRT 0 +#endif /* MBEDTLS_SSL_SESSION_TICKETS */ + #if defined(MBEDTLS_SSL_CLI_C) && defined(MBEDTLS_SSL_SESSION_TICKETS) #define SSL_SERIALIZED_SESSION_CONFIG_CLIENT_TICKET 1 #else @@ -5242,6 +5247,7 @@ const mbedtls_ssl_session *mbedtls_ssl_get_session_pointer(const mbedtls_ssl_con #define SSL_SERIALIZED_SESSION_CONFIG_TRUNC_HMAC_BIT 4 #define SSL_SERIALIZED_SESSION_CONFIG_ETM_BIT 5 #define SSL_SERIALIZED_SESSION_CONFIG_TICKET_BIT 6 +#define SSL_SERIALIZED_SESSION_CONFIG_KEEP_PEER_CRT_BIT 7 #define SSL_SERIALIZED_SESSION_CONFIG_BITFLAG \ ((uint16_t) ( \ @@ -5253,9 +5259,11 @@ const mbedtls_ssl_session *mbedtls_ssl_get_session_pointer(const mbedtls_ssl_con (SSL_SERIALIZED_SESSION_CONFIG_TRUNC_HMAC << \ SSL_SERIALIZED_SESSION_CONFIG_TRUNC_HMAC_BIT) | \ (SSL_SERIALIZED_SESSION_CONFIG_ETM << SSL_SERIALIZED_SESSION_CONFIG_ETM_BIT) | \ - (SSL_SERIALIZED_SESSION_CONFIG_TICKET << 
SSL_SERIALIZED_SESSION_CONFIG_TICKET_BIT))) + (SSL_SERIALIZED_SESSION_CONFIG_TICKET << SSL_SERIALIZED_SESSION_CONFIG_TICKET_BIT) | \ + (SSL_SERIALIZED_SESSION_CONFIG_KEEP_PEER_CRT << \ + SSL_SERIALIZED_SESSION_CONFIG_KEEP_PEER_CRT_BIT))) -static unsigned char ssl_serialized_session_header[] = { +static const unsigned char ssl_serialized_session_header[] = { MBEDTLS_VERSION_MAJOR, MBEDTLS_VERSION_MINOR, MBEDTLS_VERSION_PATCH, @@ -5279,19 +5287,36 @@ static unsigned char ssl_serialized_session_header[] = { * // the setting of those compile-time * // configuration options which influence * // the structure of mbedtls_ssl_session. - * uint64 start_time; - * uint8 ciphersuite[2]; // defined by the standard - * uint8 compression; // 0 or 1 - * uint8 session_id_len; // at most 32 - * opaque session_id[32]; - * opaque master[48]; // fixed length in the standard - * uint32 verify_result; - * opaque peer_cert<0..2^24-1>; // length 0 means no peer cert - * opaque ticket<0..2^24-1>; // length 0 means no ticket - * uint32 ticket_lifetime; - * uint8 mfl_code; // up to 255 according to standard - * uint8 trunc_hmac; // 0 or 1 - * uint8 encrypt_then_mac; // 0 or 1 + * #if defined(MBEDTLS_HAVE_TIME) + * uint64 start_time; + * #endif + * uint8 ciphersuite[2]; // defined by the standard + * uint8 compression; // 0 or 1 + * uint8 session_id_len; // at most 32 + * opaque session_id[32]; + * opaque master[48]; // fixed length in the standard + * uint32 verify_result; + * #if defined(MBEDTLS_X509_CRT_PARSE_C) + * #if defined(MBEDTLS_SSL_KEEP_PEER_CERTIFICATE) + * opaque peer_cert<0..2^24-1>; // length 0 means no peer cert + * #else + * uint8 peer_cert_digest_type; + * opaque peer_cert_digest<0..2^8-1> + * #endif + * #endif + * #if defined(MBEDTLS_SSL_SESSION_TICKETS) && defined(MBEDTLS_SSL_CLI_C) + * opaque ticket<0..2^24-1>; // length 0 means no ticket + * uint32 ticket_lifetime; + * #endif + * #if defined(MBEDTLS_SSL_MAX_FRAGMENT_LENGTH) + * uint8 mfl_code; // up to 255 according to standard + * #endif + * #if defined(MBEDTLS_SSL_TRUNCATED_HMAC) + * uint8 trunc_hmac; // 0 or 1 + * #endif + * #if defined(MBEDTLS_SSL_ENCRYPT_THEN_MAC) + * uint8 encrypt_then_mac; // 0 or 1 + * #endif * * The order is the same as in the definition of the structure, except * verify_result is put before peer_cert so that all mandatory fields come @@ -6124,7 +6149,7 @@ void mbedtls_ssl_session_free(mbedtls_ssl_session *session) (SSL_SERIALIZED_CONTEXT_CONFIG_ALPN << SSL_SERIALIZED_CONTEXT_CONFIG_ALPN_BIT) | \ 0u)) -static unsigned char ssl_serialized_context_header[] = { +static const unsigned char ssl_serialized_context_header[] = { MBEDTLS_VERSION_MAJOR, MBEDTLS_VERSION_MINOR, MBEDTLS_VERSION_PATCH, @@ -6655,7 +6680,7 @@ static int ssl_context_load(mbedtls_ssl_context *ssl, /* alpn_chosen should point to an item in the configured list */ for (cur = ssl->conf->alpn_list; *cur != NULL; cur++) { if (strlen(*cur) == alpn_len && - memcmp(p, cur, alpn_len) == 0) { + memcmp(p, *cur, alpn_len) == 0) { ssl->alpn_chosen = *cur; break; } @@ -6822,7 +6847,7 @@ void mbedtls_ssl_config_init(mbedtls_ssl_config *conf) } #if defined(MBEDTLS_KEY_EXCHANGE_WITH_CERT_ENABLED) -static int ssl_preset_default_hashes[] = { +static const int ssl_preset_default_hashes[] = { #if defined(MBEDTLS_SHA512_C) MBEDTLS_MD_SHA512, #endif @@ -6840,14 +6865,14 @@ static int ssl_preset_default_hashes[] = { }; #endif -static int ssl_preset_suiteb_ciphersuites[] = { +static const int ssl_preset_suiteb_ciphersuites[] = { MBEDTLS_TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256, 
MBEDTLS_TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384, 0 }; #if defined(MBEDTLS_KEY_EXCHANGE_WITH_CERT_ENABLED) -static int ssl_preset_suiteb_hashes[] = { +static const int ssl_preset_suiteb_hashes[] = { MBEDTLS_MD_SHA256, MBEDTLS_MD_SHA384, MBEDTLS_MD_NONE @@ -6855,7 +6880,7 @@ static int ssl_preset_suiteb_hashes[] = { #endif #if defined(MBEDTLS_ECP_C) -static mbedtls_ecp_group_id ssl_preset_suiteb_curves[] = { +static const mbedtls_ecp_group_id ssl_preset_suiteb_curves[] = { #if defined(MBEDTLS_ECP_DP_SECP256R1_ENABLED) MBEDTLS_ECP_DP_SECP256R1, #endif diff --git a/thirdparty/mbedtls/library/timing.c b/thirdparty/mbedtls/library/timing.c index 7ef9f473b5..ec2cff1084 100644 --- a/thirdparty/mbedtls/library/timing.c +++ b/thirdparty/mbedtls/library/timing.c @@ -403,6 +403,7 @@ int mbedtls_timing_self_test(int verbose) uint32_t a = 0, b = 0; mbedtls_timing_delay_context ctx; + memset(&ctx, 0, sizeof(ctx)); if (verbose != 0) { mbedtls_printf(" TIMING tests note: will take some time!\n"); } diff --git a/thirdparty/mbedtls/library/version_features.c b/thirdparty/mbedtls/library/version_features.c index 779325744b..6f663b12a7 100644 --- a/thirdparty/mbedtls/library/version_features.c +++ b/thirdparty/mbedtls/library/version_features.c @@ -456,6 +456,9 @@ static const char * const features[] = { #if defined(MBEDTLS_PSA_INJECT_ENTROPY) "MBEDTLS_PSA_INJECT_ENTROPY", #endif /* MBEDTLS_PSA_INJECT_ENTROPY */ +#if defined(MBEDTLS_PSA_ASSUME_EXCLUSIVE_BUFFERS) + "MBEDTLS_PSA_ASSUME_EXCLUSIVE_BUFFERS", +#endif /* MBEDTLS_PSA_ASSUME_EXCLUSIVE_BUFFERS */ #if defined(MBEDTLS_RSA_NO_CRT) "MBEDTLS_RSA_NO_CRT", #endif /* MBEDTLS_RSA_NO_CRT */ diff --git a/thirdparty/miniupnpc/src/miniupnpcstrings.h b/thirdparty/miniupnpc/src/miniupnpcstrings.h index bd4dd317c1..f5730111af 100644 --- a/thirdparty/miniupnpc/src/miniupnpcstrings.h +++ b/thirdparty/miniupnpc/src/miniupnpcstrings.h @@ -2,7 +2,7 @@ #define MINIUPNPCSTRINGS_H_INCLUDED #define OS_STRING "Godot Engine/1.0" -#define MINIUPNPC_VERSION_STRING "2.2.6" +#define MINIUPNPC_VERSION_STRING "2.2.7" #if 0 /* according to "UPnP Device Architecture 1.0" */ diff --git a/thirdparty/thorvg/inc/config.h b/thirdparty/thorvg/inc/config.h index 67716b66cc..31baceca67 100644 --- a/thirdparty/thorvg/inc/config.h +++ b/thirdparty/thorvg/inc/config.h @@ -10,5 +10,5 @@ // For internal debugging: //#define THORVG_LOG_ENABLED -#define THORVG_VERSION_STRING "0.12.9" +#define THORVG_VERSION_STRING "0.12.10" #endif diff --git a/thirdparty/thorvg/src/common/tvgMath.cpp b/thirdparty/thorvg/src/common/tvgMath.cpp index 42bc2cf4aa..37a8879cb5 100644 --- a/thirdparty/thorvg/src/common/tvgMath.cpp +++ b/thirdparty/thorvg/src/common/tvgMath.cpp @@ -71,7 +71,7 @@ void mathRotate(Matrix* m, float degree) { if (degree == 0.0f) return; - auto radian = degree / 180.0f * M_PI; + auto radian = degree / 180.0f * MATH_PI; auto cosVal = cosf(radian); auto sinVal = sinf(radian); diff --git a/thirdparty/thorvg/src/common/tvgMath.h b/thirdparty/thorvg/src/common/tvgMath.h index 7f6708262b..32f4e6b7d1 100644 --- a/thirdparty/thorvg/src/common/tvgMath.h +++ b/thirdparty/thorvg/src/common/tvgMath.h @@ -70,7 +70,7 @@ static inline bool mathEqual(const Matrix& a, const Matrix& b) static inline bool mathRightAngle(const Matrix* m) { auto radian = fabsf(atan2f(m->e21, m->e11)); - if (radian < FLT_EPSILON || mathEqual(radian, float(M_PI_2)) || mathEqual(radian, float(M_PI))) return true; + if (radian < FLT_EPSILON || mathEqual(radian, MATH_PI2) || mathEqual(radian, MATH_PI)) return true; return false; } diff --git 
a/thirdparty/thorvg/src/loaders/svg/tvgSvgLoader.cpp b/thirdparty/thorvg/src/loaders/svg/tvgSvgLoader.cpp index 7a4f544539..f2fbc07b4a 100644 --- a/thirdparty/thorvg/src/loaders/svg/tvgSvgLoader.cpp +++ b/thirdparty/thorvg/src/loaders/svg/tvgSvgLoader.cpp @@ -743,10 +743,10 @@ static Matrix* _parseTransformationMatrix(const char* value) } else goto error; } else if (state == MatrixState::Rotate) { //Transform to signed. - points[0] = fmod(points[0], 360); - if (points[0] < 0) points[0] += 360; - auto c = cosf(points[0] * (M_PI / 180.0)); - auto s = sinf(points[0] * (M_PI / 180.0)); + points[0] = fmodf(points[0], 360.0f); + if (points[0] < 0) points[0] += 360.0f; + auto c = cosf(points[0] * (MATH_PI / 180.0f)); + auto s = sinf(points[0] * (MATH_PI / 180.0f)); if (ptCount == 1) { Matrix tmp = { c, -s, 0, s, c, 0, 0, 0, 1 }; *matrix = mathMultiply(matrix, &tmp); @@ -769,12 +769,12 @@ static Matrix* _parseTransformationMatrix(const char* value) *matrix = mathMultiply(matrix, &tmp); } else if (state == MatrixState::SkewX) { if (ptCount != 1) goto error; - auto deg = tanf(points[0] * (M_PI / 180.0)); + auto deg = tanf(points[0] * (MATH_PI / 180.0f)); Matrix tmp = { 1, deg, 0, 0, 1, 0, 0, 0, 1 }; *matrix = mathMultiply(matrix, &tmp); } else if (state == MatrixState::SkewY) { if (ptCount != 1) goto error; - auto deg = tanf(points[0] * (M_PI / 180.0)); + auto deg = tanf(points[0] * (MATH_PI / 180.0f)); Matrix tmp = { 1, 0, 0, deg, 1, 0, 0, 0, 1 }; *matrix = mathMultiply(matrix, &tmp); } @@ -1919,6 +1919,19 @@ static SvgNode* _findNodeById(SvgNode *node, const char* id) } +static SvgNode* _findParentById(SvgNode* node, char* id, SvgNode* doc) +{ + SvgNode *parent = node->parent; + while (parent != nullptr && parent != doc) { + if (parent->id && !strcmp(parent->id, id)) { + return parent; + } + parent = parent->parent; + } + return nullptr; +} + + static constexpr struct { const char* tag; @@ -1959,8 +1972,12 @@ static bool _attrParseUseNode(void* data, const char* key, const char* value) defs = _getDefsNode(node); nodeFrom = _findNodeById(defs, id); if (nodeFrom) { - _cloneNode(nodeFrom, node, 0); - if (nodeFrom->type == SvgNodeType::Symbol) use->symbol = nodeFrom; + if (!_findParentById(node, id, loader->doc)) { + _cloneNode(nodeFrom, node, 0); + if (nodeFrom->type == SvgNodeType::Symbol) use->symbol = nodeFrom; + } else { + TVGLOG("SVG", "%s is ancestor element. 
This reference is invalid.", id); + } free(id); } else { //some svg export software include <defs> element at the end of the file @@ -2669,7 +2686,7 @@ static void _inheritGradient(SvgLoaderData* loader, SvgStyleGradient* to, SvgSty if (to->transform) memcpy(to->transform, from->transform, sizeof(Matrix)); } - if (to->type == SvgGradientType::Linear && from->type == SvgGradientType::Linear) { + if (to->type == SvgGradientType::Linear) { for (unsigned int i = 0; i < sizeof(linear_tags) / sizeof(linear_tags[0]); i++) { bool coordSet = to->flags & linear_tags[i].flag; if (!(to->flags & linear_tags[i].flag) && (from->flags & linear_tags[i].flag)) { @@ -2686,7 +2703,7 @@ static void _inheritGradient(SvgLoaderData* loader, SvgStyleGradient* to, SvgSty linear_tags[i].tagInheritedRecalc(loader, to->linear, to->userSpace); } } - } else if (to->type == SvgGradientType::Radial && from->type == SvgGradientType::Radial) { + } else if (to->type == SvgGradientType::Radial) { for (unsigned int i = 0; i < sizeof(radialTags) / sizeof(radialTags[0]); i++) { bool coordSet = (to->flags & radialTags[i].flag); if (!(to->flags & radialTags[i].flag) && (from->flags & radialTags[i].flag)) { @@ -2696,10 +2713,16 @@ static void _inheritGradient(SvgLoaderData* loader, SvgStyleGradient* to, SvgSty //GradUnits not set directly, coord set if (!gradUnitSet && coordSet) { radialTags[i].tagRecalc(loader, to->radial, to->userSpace); + //If fx and fy are not set, set cx and cy. + if (!strcmp(radialTags[i].tag, "cx") && !(to->flags & SvgGradientFlags::Fx)) to->radial->fx = to->radial->cx; + if (!strcmp(radialTags[i].tag, "cy") && !(to->flags & SvgGradientFlags::Fy)) to->radial->fy = to->radial->cy; } //GradUnits set, coord not set directly if (to->userSpace == from->userSpace) continue; if (gradUnitSet && !coordSet) { + //If fx and fx are not set, do not call recalc. + if (!strcmp(radialTags[i].tag, "fx") && !(to->flags & SvgGradientFlags::Fx)) continue; + if (!strcmp(radialTags[i].tag, "fy") && !(to->flags & SvgGradientFlags::Fy)) continue; radialTags[i].tagInheritedRecalc(loader, to->radial, to->userSpace); } } @@ -3018,9 +3041,13 @@ static void _clonePostponedNodes(Array<SvgNodeIdPair>* cloneNodes, SvgNode* doc) auto defs = _getDefsNode(nodeIdPair.node); auto nodeFrom = _findNodeById(defs, nodeIdPair.id); if (!nodeFrom) nodeFrom = _findNodeById(doc, nodeIdPair.id); - _cloneNode(nodeFrom, nodeIdPair.node, 0); - if (nodeFrom && nodeFrom->type == SvgNodeType::Symbol && nodeIdPair.node->type == SvgNodeType::Use) { - nodeIdPair.node->node.use.symbol = nodeFrom; + if (!_findParentById(nodeIdPair.node, nodeIdPair.id, doc)) { + _cloneNode(nodeFrom, nodeIdPair.node, 0); + if (nodeFrom && nodeFrom->type == SvgNodeType::Symbol && nodeIdPair.node->type == SvgNodeType::Use) { + nodeIdPair.node->node.use.symbol = nodeFrom; + } + } else { + TVGLOG("SVG", "%s is ancestor element. 
This reference is invalid.", nodeIdPair.id); } free(nodeIdPair.id); } diff --git a/thirdparty/thorvg/src/loaders/svg/tvgSvgPath.cpp b/thirdparty/thorvg/src/loaders/svg/tvgSvgPath.cpp index f9780749a9..691cde1fc5 100644 --- a/thirdparty/thorvg/src/loaders/svg/tvgSvgPath.cpp +++ b/thirdparty/thorvg/src/loaders/svg/tvgSvgPath.cpp @@ -126,7 +126,7 @@ void _pathAppendArcTo(Array<PathCommand>* cmds, Array<Point>* pts, Point* cur, P rx = fabsf(rx); ry = fabsf(ry); - angle = angle * M_PI / 180.0f; + angle = angle * MATH_PI / 180.0f; cosPhi = cosf(angle); sinPhi = sinf(angle); dx2 = (sx - x) / 2.0f; @@ -195,24 +195,24 @@ void _pathAppendArcTo(Array<PathCommand>* cmds, Array<Point>* pts, Point* cur, P //http://www.euclideanspace.com/maths/algebra/vectors/angleBetween/index.htm //Note: atan2 (0.0, 1.0) == 0.0 at = atan2(((y1p - cyp) / ry), ((x1p - cxp) / rx)); - theta1 = (at < 0.0f) ? 2.0f * M_PI + at : at; + theta1 = (at < 0.0f) ? 2.0f * MATH_PI + at : at; nat = atan2(((-y1p - cyp) / ry), ((-x1p - cxp) / rx)); - deltaTheta = (nat < at) ? 2.0f * M_PI - at + nat : nat - at; + deltaTheta = (nat < at) ? 2.0f * MATH_PI - at + nat : nat - at; if (sweep) { //Ensure delta theta < 0 or else add 360 degrees - if (deltaTheta < 0.0f) deltaTheta += (float)(2.0f * M_PI); + if (deltaTheta < 0.0f) deltaTheta += 2.0f * MATH_PI; } else { //Ensure delta theta > 0 or else substract 360 degrees - if (deltaTheta > 0.0f) deltaTheta -= (float)(2.0f * M_PI); + if (deltaTheta > 0.0f) deltaTheta -= 2.0f * MATH_PI; } //Add several cubic bezier to approximate the arc //(smaller than 90 degrees) //We add one extra segment because we want something //Smaller than 90deg (i.e. not 90 itself) - segments = static_cast<int>(fabsf(deltaTheta / float(M_PI_2)) + 1.0f); + segments = static_cast<int>(fabsf(deltaTheta / MATH_PI2) + 1.0f); delta = deltaTheta / segments; //http://www.stillhq.com/ctpfaq/2001/comp.text.pdf-faq-2001-04.txt (section 2.13) diff --git a/thirdparty/thorvg/src/renderer/sw_engine/tvgSwShape.cpp b/thirdparty/thorvg/src/renderer/sw_engine/tvgSwShape.cpp index 03261a4b7f..5e79bb6ffa 100644 --- a/thirdparty/thorvg/src/renderer/sw_engine/tvgSwShape.cpp +++ b/thirdparty/thorvg/src/renderer/sw_engine/tvgSwShape.cpp @@ -28,6 +28,7 @@ /* Internal Class Implementation */ /************************************************************************/ + struct Line { Point pt1; @@ -55,23 +56,24 @@ static void _lineSplitAt(const Line& cur, float at, Line& left, Line& right) } -static void _outlineEnd(SwOutline& outline) +static bool _outlineEnd(SwOutline& outline) { - if (outline.pts.empty()) return; + //Make a contour if lineTo/curveTo without calling close/moveTo beforehand. + if (outline.pts.empty()) return false; outline.cntrs.push(outline.pts.count - 1); outline.closed.push(false); + return false; } -static void _outlineMoveTo(SwOutline& outline, const Point* to, const Matrix* transform) +static bool _outlineMoveTo(SwOutline& outline, const Point* to, const Matrix* transform, bool closed = false) { - if (outline.pts.count > 0) { - outline.cntrs.push(outline.pts.count - 1); - outline.closed.push(false); - } + //make it a contour, if the last contour is not closed yet. 
+ if (!closed) _outlineEnd(outline); outline.pts.push(mathTransform(to, transform)); outline.types.push(SW_CURVE_TYPE_POINT); + return false; } @@ -95,20 +97,22 @@ static void _outlineCubicTo(SwOutline& outline, const Point* ctrl1, const Point* } -static void _outlineClose(SwOutline& outline) +static bool _outlineClose(SwOutline& outline) { - uint32_t i = 0; - + uint32_t i; if (outline.cntrs.count > 0) i = outline.cntrs.last() + 1; - else i = 0; //First Path + else i = 0; //Make sure there is at least one point in the current path - if (outline.pts.count == i) return; + if (outline.pts.count == i) return false; //Close the path outline.pts.push(outline.pts[i]); + outline.cntrs.push(outline.pts.count - 1); outline.types.push(SW_CURVE_TYPE_POINT); outline.closed.push(true); + + return true; } @@ -306,7 +310,7 @@ static SwOutline* _genDashOutline(const RenderShape* rshape, const Matrix* trans bool isOdd = dash.cnt % 2; if (isOdd) patternLength *= 2; - offset = fmod(offset, patternLength); + offset = fmodf(offset, patternLength); if (offset < 0) offset += patternLength; for (size_t i = 0; i < dash.cnt * (1 + (size_t)isOdd); ++i, ++offIdx) { @@ -425,25 +429,28 @@ static bool _genOutline(SwShape* shape, const RenderShape* rshape, const Matrix* shape->outline = mpoolReqOutline(mpool, tid); auto outline = shape->outline; + bool closed = false; //Generate Outlines while (cmdCnt-- > 0) { switch (*cmds) { case PathCommand::Close: { - _outlineClose(*outline); + if (!closed) closed = _outlineClose(*outline); break; } case PathCommand::MoveTo: { - _outlineMoveTo(*outline, pts, transform); + closed = _outlineMoveTo(*outline, pts, transform, closed); ++pts; break; } case PathCommand::LineTo: { + if (closed) closed = _outlineEnd(*outline); _outlineLineTo(*outline, pts, transform); ++pts; break; } case PathCommand::CubicTo: { + if (closed) closed = _outlineEnd(*outline); _outlineCubicTo(*outline, pts, pts + 1, pts + 2, transform); pts += 3; break; @@ -452,7 +459,7 @@ static bool _genOutline(SwShape* shape, const RenderShape* rshape, const Matrix* ++cmds; } - _outlineEnd(*outline); + if (!closed) _outlineEnd(*outline); outline->fillRule = rshape->rule; shape->outline = outline; diff --git a/thirdparty/thorvg/src/renderer/tvgPaint.cpp b/thirdparty/thorvg/src/renderer/tvgPaint.cpp index 13ec4183d6..227ce10a0d 100644 --- a/thirdparty/thorvg/src/renderer/tvgPaint.cpp +++ b/thirdparty/thorvg/src/renderer/tvgPaint.cpp @@ -41,20 +41,26 @@ } -static bool _compFastTrack(Paint* cmpTarget, const RenderTransform* pTransform, RenderTransform* rTransform, RenderRegion& viewport) + +static Result _compFastTrack(Paint* cmpTarget, const RenderTransform* pTransform, RenderTransform* rTransform, RenderRegion& viewport) { /* Access Shape class by Paint is bad... but it's ok still it's an internal usage. */ auto shape = static_cast<Shape*>(cmpTarget); //Rectangle Candidates? 
const Point* pts; - if (shape->pathCoords(&pts) != 4) return false; + auto ptsCnt = shape->pathCoords(&pts); + + //nothing to clip + if (ptsCnt == 0) return Result::InvalidArguments; + + if (ptsCnt != 4) return Result::InsufficientCondition; if (rTransform) rTransform->update(); //No rotation and no skewing - if (pTransform && (!mathRightAngle(&pTransform->m) || mathSkewed(&pTransform->m))) return false; - if (rTransform && (!mathRightAngle(&rTransform->m) || mathSkewed(&rTransform->m))) return false; + if (pTransform && (!mathRightAngle(&pTransform->m) || mathSkewed(&pTransform->m))) return Result::InsufficientCondition; + if (rTransform && (!mathRightAngle(&rTransform->m) || mathSkewed(&rTransform->m))) return Result::InsufficientCondition; //Perpendicular Rectangle? auto pt1 = pts + 0; @@ -99,10 +105,9 @@ static bool _compFastTrack(Paint* cmpTarget, const RenderTransform* pTransform, if (viewport.w < 0) viewport.w = 0; if (viewport.h < 0) viewport.h = 0; - return true; + return Result::Success; } - - return false; + return Result::InsufficientCondition; } @@ -235,7 +240,7 @@ RenderData Paint::Impl::update(RenderMethod* renderer, const RenderTransform* pT /* 1. Composition Pre Processing */ RenderData trd = nullptr; //composite target render data RenderRegion viewport; - bool compFastTrack = false; + Result compFastTrack = Result::InsufficientCondition; bool childClipper = false; if (compData) { @@ -260,7 +265,7 @@ RenderData Paint::Impl::update(RenderMethod* renderer, const RenderTransform* pT } if (tryFastTrack) { RenderRegion viewport2; - if ((compFastTrack = _compFastTrack(target, pTransform, target->pImpl->rTransform, viewport2))) { + if ((compFastTrack = _compFastTrack(target, pTransform, target->pImpl->rTransform, viewport2)) == Result::Success) { viewport = renderer->viewport(); viewport2.intersect(viewport); renderer->viewport(viewport2); @@ -268,7 +273,7 @@ RenderData Paint::Impl::update(RenderMethod* renderer, const RenderTransform* pT } } } - if (!compFastTrack) { + if (compFastTrack == Result::InsufficientCondition) { childClipper = compData->method == CompositeMethod::ClipPath ? true : false; trd = target->pImpl->update(renderer, pTransform, clips, 255, pFlag, childClipper); if (childClipper) clips.push(trd); @@ -285,7 +290,7 @@ RenderData Paint::Impl::update(RenderMethod* renderer, const RenderTransform* pT PAINT_METHOD(rd, update(renderer, &outTransform, clips, opacity, newFlag, clipper)); /* 3. Composition Post Processing */ - if (compFastTrack) renderer->viewport(viewport); + if (compFastTrack == Result::Success) renderer->viewport(viewport); else if (childClipper) clips.pop(); return rd; diff --git a/thirdparty/thorvg/src/renderer/tvgShape.cpp b/thirdparty/thorvg/src/renderer/tvgShape.cpp index ab1f378b47..a42060e241 100644 --- a/thirdparty/thorvg/src/renderer/tvgShape.cpp +++ b/thirdparty/thorvg/src/renderer/tvgShape.cpp @@ -164,7 +164,7 @@ Result Shape::appendArc(float cx, float cy, float radius, float startAngle, floa } for (int i = 0; i < nCurves; ++i) { - auto endAngle = startAngle + ((i != nCurves - 1) ? float(M_PI_2) * sweepSign : fract); + auto endAngle = startAngle + ((i != nCurves - 1) ? 
MATH_PI2 * sweepSign : fract); Point end = {radius * cosf(endAngle), radius * sinf(endAngle)}; //variables needed to calculate bezier control points diff --git a/thirdparty/thorvg/update-thorvg.sh b/thirdparty/thorvg/update-thorvg.sh index 7a754c09b9..afe05e629e 100755 --- a/thirdparty/thorvg/update-thorvg.sh +++ b/thirdparty/thorvg/update-thorvg.sh @@ -1,6 +1,6 @@ #!/bin/bash -e -VERSION=0.12.9 +VERSION=0.12.10 cd thirdparty/thorvg/ || true rm -rf AUTHORS LICENSE inc/ src/ *.zip *.tar.gz tmp/ diff --git a/thirdparty/zstd/common/allocations.h b/thirdparty/zstd/common/allocations.h index a3153c4bac..5e89955010 100644 --- a/thirdparty/zstd/common/allocations.h +++ b/thirdparty/zstd/common/allocations.h @@ -14,7 +14,7 @@ #define ZSTD_DEPS_NEED_MALLOC #include "zstd_deps.h" /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */ -#include "mem.h" /* MEM_STATIC */ +#include "compiler.h" /* MEM_STATIC */ #define ZSTD_STATIC_LINKING_ONLY #include "../zstd.h" /* ZSTD_customMem */ diff --git a/thirdparty/zstd/common/bitstream.h b/thirdparty/zstd/common/bitstream.h index 72b0b3df22..676044989c 100644 --- a/thirdparty/zstd/common/bitstream.h +++ b/thirdparty/zstd/common/bitstream.h @@ -90,19 +90,20 @@ MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC); /*-******************************************** * bitStream decoding API (read backward) **********************************************/ +typedef size_t BitContainerType; typedef struct { - size_t bitContainer; + BitContainerType bitContainer; unsigned bitsConsumed; const char* ptr; const char* start; const char* limitPtr; } BIT_DStream_t; -typedef enum { BIT_DStream_unfinished = 0, - BIT_DStream_endOfBuffer = 1, - BIT_DStream_completed = 2, - BIT_DStream_overflow = 3 } BIT_DStream_status; /* result of BIT_reloadDStream() */ - /* 1,2,4,8 would be better for bitmap combinations, but slows down performance a bit ... :( */ +typedef enum { BIT_DStream_unfinished = 0, /* fully refilled */ + BIT_DStream_endOfBuffer = 1, /* still some bits left in bitstream */ + BIT_DStream_completed = 2, /* bitstream entirely consumed, bit-exact */ + BIT_DStream_overflow = 3 /* user requested more bits than present in bitstream */ + } BIT_DStream_status; /* result of BIT_reloadDStream() */ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize); MEM_STATIC size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits); @@ -112,7 +113,7 @@ MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* bitD); /* Start by invoking BIT_initDStream(). * A chunk of the bitStream is then stored into a local register. -* Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (size_t). +* Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (BitContainerType). * You can then retrieve bitFields stored into the local register, **in reverse order**. * Local register is explicitly reloaded from memory by the BIT_reloadDStream() method. * A reload guarantee a minimum of ((8*sizeof(bitD->bitContainer))-7) bits when its result is BIT_DStream_unfinished. 
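The bitstream hunk above documents the read-backward decoding flow: BIT_initDStream() loads a chunk of the stream into a local register, BIT_readBits() then extracts fields in reverse write order, and BIT_reloadDStream() explicitly refills the register, guaranteeing at least ((8*sizeof(bitContainer))-7) bits whenever it reports BIT_DStream_unfinished. Purely as an illustration of that flow (this snippet is not part of the patch; the wrapper function, its name and its parameters are invented, and it assumes zstd's internal headers are available on the include path), a caller inside the zstd tree could drive the API roughly like this:

```c
/* Illustration only: driving the backward bitstream decoding API declared above.
 * Assumes zstd's internal headers are reachable; readFieldsBackward() is a
 * made-up helper, not a function from this patch. */
#include "error_private.h"  /* ERR_isError */
#include "bitstream.h"      /* BIT_DStream_t, BIT_initDStream, BIT_readBits, BIT_reloadDStream */

/* Reads `count` fields of `nbBits` bits each, last-written field first. */
static size_t readFieldsBackward(const void* src, size_t srcSize,
                                 unsigned nbBits, size_t* out, size_t count)
{
    BIT_DStream_t bitD;
    size_t const initResult = BIT_initDStream(&bitD, src, srcSize);
    size_t n;
    if (ERR_isError(initResult)) return initResult;   /* e.g. srcSize == 0 */
    for (n = 0; n < count; n++) {
        out[n] = BIT_readBits(&bitD, nbBits);          /* consume from the local register */
        if (BIT_reloadDStream(&bitD) == BIT_DStream_overflow)
            break;                                     /* asked for more bits than the stream holds */
    }
    /* BIT_endOfDStream() reports whether the stream was consumed bit-exactly. */
    return BIT_endOfDStream(&bitD) ? n : 0;
}
```

Per the status comments added in the hunk, only BIT_DStream_overflow is a hard failure; endOfBuffer and completed merely describe how much of the source remains, which is why the sketch above only breaks on overflow.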
@@ -162,7 +163,7 @@ MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC, return 0; } -MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits) +FORCE_INLINE_TEMPLATE size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits) { #if defined(STATIC_BMI2) && STATIC_BMI2 == 1 && !defined(ZSTD_NO_INTRINSICS) return _bzhi_u64(bitContainer, nbBits); @@ -267,22 +268,22 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si bitD->bitContainer = *(const BYTE*)(bitD->start); switch(srcSize) { - case 7: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16); + case 7: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16); ZSTD_FALLTHROUGH; - case 6: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24); + case 6: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24); ZSTD_FALLTHROUGH; - case 5: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32); + case 5: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32); ZSTD_FALLTHROUGH; - case 4: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[3]) << 24; + case 4: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[3]) << 24; ZSTD_FALLTHROUGH; - case 3: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[2]) << 16; + case 3: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[2]) << 16; ZSTD_FALLTHROUGH; - case 2: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[1]) << 8; + case 2: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[1]) << 8; ZSTD_FALLTHROUGH; default: break; @@ -297,12 +298,12 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si return srcSize; } -MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getUpperBits(size_t bitContainer, U32 const start) +FORCE_INLINE_TEMPLATE size_t BIT_getUpperBits(BitContainerType bitContainer, U32 const start) { return bitContainer >> start; } -MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getMiddleBits(size_t bitContainer, U32 const start, U32 const nbBits) +FORCE_INLINE_TEMPLATE size_t BIT_getMiddleBits(BitContainerType bitContainer, U32 const start, U32 const nbBits) { U32 const regMask = sizeof(bitContainer)*8 - 1; /* if start > regMask, bitstream is corrupted, and result is undefined */ @@ -325,7 +326,7 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getMiddleBits(size_t bitContainer, U32 c * On 32-bits, maxNbBits==24. * On 64-bits, maxNbBits==56. * @return : value extracted */ -MEM_STATIC FORCE_INLINE_ATTR size_t BIT_lookBits(const BIT_DStream_t* bitD, U32 nbBits) +FORCE_INLINE_TEMPLATE size_t BIT_lookBits(const BIT_DStream_t* bitD, U32 nbBits) { /* arbitrate between double-shift and shift+mask */ #if 1 @@ -348,7 +349,7 @@ MEM_STATIC size_t BIT_lookBitsFast(const BIT_DStream_t* bitD, U32 nbBits) return (bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> (((regMask+1)-nbBits) & regMask); } -MEM_STATIC FORCE_INLINE_ATTR void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits) +FORCE_INLINE_TEMPLATE void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits) { bitD->bitsConsumed += nbBits; } @@ -357,7 +358,7 @@ MEM_STATIC FORCE_INLINE_ATTR void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits) * Read (consume) next n bits from local register and update. 
* Pay attention to not read more than nbBits contained into local register. * @return : extracted value. */ -MEM_STATIC FORCE_INLINE_ATTR size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits) +FORCE_INLINE_TEMPLATE size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits) { size_t const value = BIT_lookBits(bitD, nbBits); BIT_skipBits(bitD, nbBits); @@ -374,6 +375,21 @@ MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits) return value; } +/*! BIT_reloadDStream_internal() : + * Simple variant of BIT_reloadDStream(), with two conditions: + * 1. bitstream is valid : bitsConsumed <= sizeof(bitD->bitContainer)*8 + * 2. look window is valid after shifted down : bitD->ptr >= bitD->start + */ +MEM_STATIC BIT_DStream_status BIT_reloadDStream_internal(BIT_DStream_t* bitD) +{ + assert(bitD->bitsConsumed <= sizeof(bitD->bitContainer)*8); + bitD->ptr -= bitD->bitsConsumed >> 3; + assert(bitD->ptr >= bitD->start); + bitD->bitsConsumed &= 7; + bitD->bitContainer = MEM_readLEST(bitD->ptr); + return BIT_DStream_unfinished; +} + /*! BIT_reloadDStreamFast() : * Similar to BIT_reloadDStream(), but with two differences: * 1. bitsConsumed <= sizeof(bitD->bitContainer)*8 must hold! @@ -384,31 +400,35 @@ MEM_STATIC BIT_DStream_status BIT_reloadDStreamFast(BIT_DStream_t* bitD) { if (UNLIKELY(bitD->ptr < bitD->limitPtr)) return BIT_DStream_overflow; - assert(bitD->bitsConsumed <= sizeof(bitD->bitContainer)*8); - bitD->ptr -= bitD->bitsConsumed >> 3; - bitD->bitsConsumed &= 7; - bitD->bitContainer = MEM_readLEST(bitD->ptr); - return BIT_DStream_unfinished; + return BIT_reloadDStream_internal(bitD); } /*! BIT_reloadDStream() : * Refill `bitD` from buffer previously set in BIT_initDStream() . - * This function is safe, it guarantees it will not read beyond src buffer. + * This function is safe, it guarantees it will not never beyond src buffer. * @return : status of `BIT_DStream_t` internal register. 
* when status == BIT_DStream_unfinished, internal register is filled with at least 25 or 57 bits */ -MEM_STATIC FORCE_INLINE_ATTR BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD) +FORCE_INLINE_TEMPLATE BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD) { - if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8)) /* overflow detected, like end of stream */ + /* note : once in overflow mode, a bitstream remains in this mode until it's reset */ + if (UNLIKELY(bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))) { + static const BitContainerType zeroFilled = 0; + bitD->ptr = (const char*)&zeroFilled; /* aliasing is allowed for char */ + /* overflow detected, erroneous scenario or end of stream: no update */ return BIT_DStream_overflow; + } + + assert(bitD->ptr >= bitD->start); if (bitD->ptr >= bitD->limitPtr) { - return BIT_reloadDStreamFast(bitD); + return BIT_reloadDStream_internal(bitD); } if (bitD->ptr == bitD->start) { + /* reached end of bitStream => no update */ if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return BIT_DStream_endOfBuffer; return BIT_DStream_completed; } - /* start < ptr < limitPtr */ + /* start < ptr < limitPtr => cautious update */ { U32 nbBytes = bitD->bitsConsumed >> 3; BIT_DStream_status result = BIT_DStream_unfinished; if (bitD->ptr - nbBytes < bitD->start) { diff --git a/thirdparty/zstd/common/compiler.h b/thirdparty/zstd/common/compiler.h index 73f8d01998..31880ecbe1 100644 --- a/thirdparty/zstd/common/compiler.h +++ b/thirdparty/zstd/common/compiler.h @@ -11,6 +11,8 @@ #ifndef ZSTD_COMPILER_H #define ZSTD_COMPILER_H +#include <stddef.h> + #include "portability_macros.h" /*-******************************************************* @@ -51,12 +53,19 @@ # define WIN_CDECL #endif +/* UNUSED_ATTR tells the compiler it is okay if the function is unused. */ +#if defined(__GNUC__) +# define UNUSED_ATTR __attribute__((unused)) +#else +# define UNUSED_ATTR +#endif + /** * FORCE_INLINE_TEMPLATE is used to define C "templates", which take constant * parameters. They must be inlined for the compiler to eliminate the constant * branches. */ -#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR +#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR UNUSED_ATTR /** * HINT_INLINE is used to help the compiler generate better code. It is *not* * used for "templates", so it can be tweaked based on the compilers @@ -71,14 +80,28 @@ #if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 4 && __GNUC_MINOR__ >= 8 && __GNUC__ < 5 # define HINT_INLINE static INLINE_KEYWORD #else -# define HINT_INLINE static INLINE_KEYWORD FORCE_INLINE_ATTR +# define HINT_INLINE FORCE_INLINE_TEMPLATE #endif -/* UNUSED_ATTR tells the compiler it is okay if the function is unused. */ +/* "soft" inline : + * The compiler is free to select if it's a good idea to inline or not. + * The main objective is to silence compiler warnings + * when a defined function in included but not used. + * + * Note : this macro is prefixed `MEM_` because it used to be provided by `mem.h` unit. + * Updating the prefix is probably preferable, but requires a fairly large codemod, + * since this name is used everywhere. 
+ */ +#ifndef MEM_STATIC /* already defined in Linux Kernel mem.h */ #if defined(__GNUC__) -# define UNUSED_ATTR __attribute__((unused)) +# define MEM_STATIC static __inline UNUSED_ATTR +#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# define MEM_STATIC static inline +#elif defined(_MSC_VER) +# define MEM_STATIC static __inline #else -# define UNUSED_ATTR +# define MEM_STATIC static /* this version may generate warnings for unused static functions; disable the relevant warning */ +#endif #endif /* force no inlining */ @@ -109,10 +132,10 @@ /* prefetch * can be disabled, by declaring NO_PREFETCH build macro */ #if defined(NO_PREFETCH) -# define PREFETCH_L1(ptr) (void)(ptr) /* disabled */ -# define PREFETCH_L2(ptr) (void)(ptr) /* disabled */ +# define PREFETCH_L1(ptr) do { (void)(ptr); } while (0) /* disabled */ +# define PREFETCH_L2(ptr) do { (void)(ptr); } while (0) /* disabled */ #else -# if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86)) /* _mm_prefetch() is not defined outside of x86/x64 */ +# if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86)) && !defined(_M_ARM64EC) /* _mm_prefetch() is not defined outside of x86/x64 */ # include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */ # define PREFETCH_L1(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0) # define PREFETCH_L2(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T1) @@ -120,24 +143,25 @@ # define PREFETCH_L1(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) # define PREFETCH_L2(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 2 /* locality */) # elif defined(__aarch64__) -# define PREFETCH_L1(ptr) __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr))) -# define PREFETCH_L2(ptr) __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr))) +# define PREFETCH_L1(ptr) do { __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr))); } while (0) +# define PREFETCH_L2(ptr) do { __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr))); } while (0) # else -# define PREFETCH_L1(ptr) (void)(ptr) /* disabled */ -# define PREFETCH_L2(ptr) (void)(ptr) /* disabled */ +# define PREFETCH_L1(ptr) do { (void)(ptr); } while (0) /* disabled */ +# define PREFETCH_L2(ptr) do { (void)(ptr); } while (0) /* disabled */ # endif #endif /* NO_PREFETCH */ #define CACHELINE_SIZE 64 -#define PREFETCH_AREA(p, s) { \ - const char* const _ptr = (const char*)(p); \ - size_t const _size = (size_t)(s); \ - size_t _pos; \ - for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) { \ - PREFETCH_L2(_ptr + _pos); \ - } \ -} +#define PREFETCH_AREA(p, s) \ + do { \ + const char* const _ptr = (const char*)(p); \ + size_t const _size = (size_t)(s); \ + size_t _pos; \ + for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) { \ + PREFETCH_L2(_ptr + _pos); \ + } \ + } while (0) /* vectorization * older GCC (pre gcc-4.3 picked as the cutoff) uses a different syntax, @@ -166,9 +190,9 @@ #endif #if __has_builtin(__builtin_unreachable) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5))) -# define ZSTD_UNREACHABLE { assert(0), __builtin_unreachable(); } +# define ZSTD_UNREACHABLE do { assert(0), __builtin_unreachable(); } while (0) #else -# define ZSTD_UNREACHABLE { assert(0); } +# define ZSTD_UNREACHABLE do { assert(0); } while (0) #endif /* disable warnings */ @@ -281,6 +305,74 @@ * Sanitizer *****************************************************************/ +/** + * Zstd relies on pointer overflow in its decompressor. 
+ * We add this attribute to functions that rely on pointer overflow. + */ +#ifndef ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +# if __has_attribute(no_sanitize) +# if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 8 + /* gcc < 8 only has signed-integer-overlow which triggers on pointer overflow */ +# define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR __attribute__((no_sanitize("signed-integer-overflow"))) +# else + /* older versions of clang [3.7, 5.0) will warn that pointer-overflow is ignored. */ +# define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR __attribute__((no_sanitize("pointer-overflow"))) +# endif +# else +# define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +# endif +#endif + +/** + * Helper function to perform a wrapped pointer difference without trigging + * UBSAN. + * + * @returns lhs - rhs with wrapping + */ +MEM_STATIC +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +ptrdiff_t ZSTD_wrappedPtrDiff(unsigned char const* lhs, unsigned char const* rhs) +{ + return lhs - rhs; +} + +/** + * Helper function to perform a wrapped pointer add without triggering UBSAN. + * + * @return ptr + add with wrapping + */ +MEM_STATIC +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +unsigned char const* ZSTD_wrappedPtrAdd(unsigned char const* ptr, ptrdiff_t add) +{ + return ptr + add; +} + +/** + * Helper function to perform a wrapped pointer subtraction without triggering + * UBSAN. + * + * @return ptr - sub with wrapping + */ +MEM_STATIC +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +unsigned char const* ZSTD_wrappedPtrSub(unsigned char const* ptr, ptrdiff_t sub) +{ + return ptr - sub; +} + +/** + * Helper function to add to a pointer that works around C's undefined behavior + * of adding 0 to NULL. + * + * @returns `ptr + add` except it defines `NULL + 0 == NULL`. + */ +MEM_STATIC +unsigned char* ZSTD_maybeNullPtrAdd(unsigned char* ptr, ptrdiff_t add) +{ + return add > 0 ? ptr + add : ptr; +} + /* Issue #3240 reports an ASAN failure on an llvm-mingw build. Out of an * abundance of caution, disable our custom poisoning on mingw. */ #ifdef __MINGW32__ diff --git a/thirdparty/zstd/common/cpu.h b/thirdparty/zstd/common/cpu.h index 8bc34a36da..0e684d9ad8 100644 --- a/thirdparty/zstd/common/cpu.h +++ b/thirdparty/zstd/common/cpu.h @@ -35,6 +35,7 @@ MEM_STATIC ZSTD_cpuid_t ZSTD_cpuid(void) { U32 f7b = 0; U32 f7c = 0; #if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) +#if !defined(__clang__) int reg[4]; __cpuid((int*)reg, 0); { @@ -50,6 +51,41 @@ MEM_STATIC ZSTD_cpuid_t ZSTD_cpuid(void) { f7c = (U32)reg[2]; } } +#else + /* Clang compiler has a bug (fixed in https://reviews.llvm.org/D101338) in + * which the `__cpuid` intrinsic does not save and restore `rbx` as it needs + * to due to being a reserved register. So in that case, do the `cpuid` + * ourselves. Clang supports inline assembly anyway. 
+ */ + U32 n; + __asm__( + "pushq %%rbx\n\t" + "cpuid\n\t" + "popq %%rbx\n\t" + : "=a"(n) + : "a"(0) + : "rcx", "rdx"); + if (n >= 1) { + U32 f1a; + __asm__( + "pushq %%rbx\n\t" + "cpuid\n\t" + "popq %%rbx\n\t" + : "=a"(f1a), "=c"(f1c), "=d"(f1d) + : "a"(1) + :); + } + if (n >= 7) { + __asm__( + "pushq %%rbx\n\t" + "cpuid\n\t" + "movq %%rbx, %%rax\n\t" + "popq %%rbx" + : "=a"(f7b), "=c"(f7c) + : "a"(7), "c"(0) + : "rdx"); + } +#endif #elif defined(__i386__) && defined(__PIC__) && !defined(__clang__) && defined(__GNUC__) /* The following block like the normal cpuid branch below, but gcc * reserves ebx for use of its pic register so we must specially diff --git a/thirdparty/zstd/common/debug.c b/thirdparty/zstd/common/debug.c index ebf7bfccfa..9d0b7d229c 100644 --- a/thirdparty/zstd/common/debug.c +++ b/thirdparty/zstd/common/debug.c @@ -21,4 +21,10 @@ #include "debug.h" +#if !defined(ZSTD_LINUX_KERNEL) || (DEBUGLEVEL>=2) +/* We only use this when DEBUGLEVEL>=2, but we get -Werror=pedantic errors if a + * translation unit is empty. So remove this from Linux kernel builds, but + * otherwise just leave it in. + */ int g_debuglevel = DEBUGLEVEL; +#endif diff --git a/thirdparty/zstd/common/debug.h b/thirdparty/zstd/common/debug.h index 0e9817ea6d..a16b69e574 100644 --- a/thirdparty/zstd/common/debug.h +++ b/thirdparty/zstd/common/debug.h @@ -85,18 +85,27 @@ extern int g_debuglevel; /* the variable is only declared, It's useful when enabling very verbose levels on selective conditions (such as position in src) */ -# define RAWLOG(l, ...) { \ - if (l<=g_debuglevel) { \ - ZSTD_DEBUG_PRINT(__VA_ARGS__); \ - } } -# define DEBUGLOG(l, ...) { \ - if (l<=g_debuglevel) { \ - ZSTD_DEBUG_PRINT(__FILE__ ": " __VA_ARGS__); \ - ZSTD_DEBUG_PRINT(" \n"); \ - } } +# define RAWLOG(l, ...) \ + do { \ + if (l<=g_debuglevel) { \ + ZSTD_DEBUG_PRINT(__VA_ARGS__); \ + } \ + } while (0) + +#define STRINGIFY(x) #x +#define TOSTRING(x) STRINGIFY(x) +#define LINE_AS_STRING TOSTRING(__LINE__) + +# define DEBUGLOG(l, ...) \ + do { \ + if (l<=g_debuglevel) { \ + ZSTD_DEBUG_PRINT(__FILE__ ":" LINE_AS_STRING ": " __VA_ARGS__); \ + ZSTD_DEBUG_PRINT(" \n"); \ + } \ + } while (0) #else -# define RAWLOG(l, ...) {} /* disabled */ -# define DEBUGLOG(l, ...) {} /* disabled */ +# define RAWLOG(l, ...) do { } while (0) /* disabled */ +# define DEBUGLOG(l, ...) do { } while (0) /* disabled */ #endif diff --git a/thirdparty/zstd/common/error_private.h b/thirdparty/zstd/common/error_private.h index 325daad404..0156010c74 100644 --- a/thirdparty/zstd/common/error_private.h +++ b/thirdparty/zstd/common/error_private.h @@ -60,8 +60,13 @@ ERR_STATIC unsigned ERR_isError(size_t code) { return (code > ERROR(maxCode)); } ERR_STATIC ERR_enum ERR_getErrorCode(size_t code) { if (!ERR_isError(code)) return (ERR_enum)0; return (ERR_enum) (0-code); } /* check and forward error code */ -#define CHECK_V_F(e, f) size_t const e = f; if (ERR_isError(e)) return e -#define CHECK_F(f) { CHECK_V_F(_var_err__, f); } +#define CHECK_V_F(e, f) \ + size_t const e = f; \ + do { \ + if (ERR_isError(e)) \ + return e; \ + } while (0) +#define CHECK_F(f) do { CHECK_V_F(_var_err__, f); } while (0) /*-**************************************** @@ -95,10 +100,12 @@ void _force_has_format_string(const char *format, ...) { * We want to force this function invocation to be syntactically correct, but * we don't want to force runtime evaluation of its arguments. */ -#define _FORCE_HAS_FORMAT_STRING(...) 
\ - if (0) { \ - _force_has_format_string(__VA_ARGS__); \ - } +#define _FORCE_HAS_FORMAT_STRING(...) \ + do { \ + if (0) { \ + _force_has_format_string(__VA_ARGS__); \ + } \ + } while (0) #define ERR_QUOTE(str) #str @@ -109,48 +116,50 @@ void _force_has_format_string(const char *format, ...) { * In order to do that (particularly, printing the conditional that failed), * this can't just wrap RETURN_ERROR(). */ -#define RETURN_ERROR_IF(cond, err, ...) \ - if (cond) { \ - RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s", \ - __FILE__, __LINE__, ERR_QUOTE(cond), ERR_QUOTE(ERROR(err))); \ - _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ - RAWLOG(3, ": " __VA_ARGS__); \ - RAWLOG(3, "\n"); \ - return ERROR(err); \ - } +#define RETURN_ERROR_IF(cond, err, ...) \ + do { \ + if (cond) { \ + RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s", \ + __FILE__, __LINE__, ERR_QUOTE(cond), ERR_QUOTE(ERROR(err))); \ + _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ + RAWLOG(3, ": " __VA_ARGS__); \ + RAWLOG(3, "\n"); \ + return ERROR(err); \ + } \ + } while (0) /** * Unconditionally return the specified error. * * In debug modes, prints additional information. */ -#define RETURN_ERROR(err, ...) \ - do { \ - RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", \ - __FILE__, __LINE__, ERR_QUOTE(ERROR(err))); \ - _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ - RAWLOG(3, ": " __VA_ARGS__); \ - RAWLOG(3, "\n"); \ - return ERROR(err); \ - } while(0); +#define RETURN_ERROR(err, ...) \ + do { \ + RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", \ + __FILE__, __LINE__, ERR_QUOTE(ERROR(err))); \ + _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ + RAWLOG(3, ": " __VA_ARGS__); \ + RAWLOG(3, "\n"); \ + return ERROR(err); \ + } while(0) /** * If the provided expression evaluates to an error code, returns that error code. * * In debug modes, prints additional information. */ -#define FORWARD_IF_ERROR(err, ...) \ - do { \ - size_t const err_code = (err); \ - if (ERR_isError(err_code)) { \ - RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s", \ - __FILE__, __LINE__, ERR_QUOTE(err), ERR_getErrorName(err_code)); \ - _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ - RAWLOG(3, ": " __VA_ARGS__); \ - RAWLOG(3, "\n"); \ - return err_code; \ - } \ - } while(0); +#define FORWARD_IF_ERROR(err, ...) 
\ + do { \ + size_t const err_code = (err); \ + if (ERR_isError(err_code)) { \ + RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s", \ + __FILE__, __LINE__, ERR_QUOTE(err), ERR_getErrorName(err_code)); \ + _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ + RAWLOG(3, ": " __VA_ARGS__); \ + RAWLOG(3, "\n"); \ + return err_code; \ + } \ + } while(0) #if defined (__cplusplus) } diff --git a/thirdparty/zstd/common/fse.h b/thirdparty/zstd/common/fse.h index 02a1f0bc53..2ae128e60d 100644 --- a/thirdparty/zstd/common/fse.h +++ b/thirdparty/zstd/common/fse.h @@ -229,6 +229,7 @@ If there is an error, the function will return an error code, which can be teste #endif /* FSE_H */ + #if defined(FSE_STATIC_LINKING_ONLY) && !defined(FSE_H_FSE_STATIC_LINKING_ONLY) #define FSE_H_FSE_STATIC_LINKING_ONLY @@ -464,13 +465,13 @@ MEM_STATIC void FSE_encodeSymbol(BIT_CStream_t* bitC, FSE_CState_t* statePtr, un FSE_symbolCompressionTransform const symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol]; const U16* const stateTable = (const U16*)(statePtr->stateTable); U32 const nbBitsOut = (U32)((statePtr->value + symbolTT.deltaNbBits) >> 16); - BIT_addBits(bitC, statePtr->value, nbBitsOut); + BIT_addBits(bitC, (size_t)statePtr->value, nbBitsOut); statePtr->value = stateTable[ (statePtr->value >> nbBitsOut) + symbolTT.deltaFindState]; } MEM_STATIC void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* statePtr) { - BIT_addBits(bitC, statePtr->value, statePtr->stateLog); + BIT_addBits(bitC, (size_t)statePtr->value, statePtr->stateLog); BIT_flushBits(bitC); } diff --git a/thirdparty/zstd/common/fse_decompress.c b/thirdparty/zstd/common/fse_decompress.c index 1e1c9f92d6..0dcc4640d0 100644 --- a/thirdparty/zstd/common/fse_decompress.c +++ b/thirdparty/zstd/common/fse_decompress.c @@ -22,8 +22,7 @@ #define FSE_STATIC_LINKING_ONLY #include "fse.h" #include "error_private.h" -#define ZSTD_DEPS_NEED_MALLOC -#include "zstd_deps.h" +#include "zstd_deps.h" /* ZSTD_memcpy */ #include "bits.h" /* ZSTD_highbit32 */ @@ -84,7 +83,7 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo symbolNext[s] = 1; } else { if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0; - symbolNext[s] = normalizedCounter[s]; + symbolNext[s] = (U16)normalizedCounter[s]; } } } ZSTD_memcpy(dt, &DTableH, sizeof(DTableH)); } @@ -99,8 +98,7 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo * all symbols have counts <= 8. We ensure we have 8 bytes at the end of * our buffer to handle the over-write. */ - { - U64 const add = 0x0101010101010101ull; + { U64 const add = 0x0101010101010101ull; size_t pos = 0; U64 sv = 0; U32 s; @@ -111,9 +109,8 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo for (i = 8; i < n; i += 8) { MEM_write64(spread + pos + i, sv); } - pos += n; - } - } + pos += (size_t)n; + } } /* Now we spread those positions across the table. * The benefit of doing it in two stages is that we avoid the * variable size inner loop, which caused lots of branch misses. 
@@ -232,12 +229,12 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompress_usingDTable_generic( break; } } - return op-ostart; + assert(op >= ostart); + return (size_t)(op-ostart); } typedef struct { short ncount[FSE_MAX_SYMBOL_VALUE + 1]; - FSE_DTable dtable[1]; /* Dynamically sized */ } FSE_DecompressWksp; @@ -252,13 +249,18 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompress_wksp_body( unsigned tableLog; unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE; FSE_DecompressWksp* const wksp = (FSE_DecompressWksp*)workSpace; + size_t const dtablePos = sizeof(FSE_DecompressWksp) / sizeof(FSE_DTable); + FSE_DTable* const dtable = (FSE_DTable*)workSpace + dtablePos; - DEBUG_STATIC_ASSERT((FSE_MAX_SYMBOL_VALUE + 1) % 2 == 0); + FSE_STATIC_ASSERT((FSE_MAX_SYMBOL_VALUE + 1) % 2 == 0); if (wkspSize < sizeof(*wksp)) return ERROR(GENERIC); + /* correct offset to dtable depends on this property */ + FSE_STATIC_ASSERT(sizeof(FSE_DecompressWksp) % sizeof(FSE_DTable) == 0); + /* normal FSE decoding mode */ - { - size_t const NCountLength = FSE_readNCount_bmi2(wksp->ncount, &maxSymbolValue, &tableLog, istart, cSrcSize, bmi2); + { size_t const NCountLength = + FSE_readNCount_bmi2(wksp->ncount, &maxSymbolValue, &tableLog, istart, cSrcSize, bmi2); if (FSE_isError(NCountLength)) return NCountLength; if (tableLog > maxLog) return ERROR(tableLog_tooLarge); assert(NCountLength <= cSrcSize); @@ -271,16 +273,16 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompress_wksp_body( workSpace = (BYTE*)workSpace + sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog); wkspSize -= sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog); - CHECK_F( FSE_buildDTable_internal(wksp->dtable, wksp->ncount, maxSymbolValue, tableLog, workSpace, wkspSize) ); + CHECK_F( FSE_buildDTable_internal(dtable, wksp->ncount, maxSymbolValue, tableLog, workSpace, wkspSize) ); { - const void* ptr = wksp->dtable; + const void* ptr = dtable; const FSE_DTableHeader* DTableH = (const FSE_DTableHeader*)ptr; const U32 fastMode = DTableH->fastMode; /* select fast mode (static) */ - if (fastMode) return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, wksp->dtable, 1); - return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, wksp->dtable, 0); + if (fastMode) return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, dtable, 1); + return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, dtable, 0); } } diff --git a/thirdparty/zstd/common/huf.h b/thirdparty/zstd/common/huf.h index 73d1ee5654..99bf85d6f4 100644 --- a/thirdparty/zstd/common/huf.h +++ b/thirdparty/zstd/common/huf.h @@ -197,9 +197,22 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void /** HUF_getNbBitsFromCTable() : * Read nbBits from CTable symbolTable, for symbol `symbolValue` presumed <= HUF_SYMBOLVALUE_MAX - * Note 1 : is not inlined, as HUF_CElt definition is private */ + * Note 1 : If symbolValue > HUF_readCTableHeader(symbolTable).maxSymbolValue, returns 0 + * Note 2 : is not inlined, as HUF_CElt definition is private + */ U32 HUF_getNbBitsFromCTable(const HUF_CElt* symbolTable, U32 symbolValue); +typedef struct { + BYTE tableLog; + BYTE maxSymbolValue; + BYTE unused[sizeof(size_t) - 2]; +} HUF_CTableHeader; + +/** HUF_readCTableHeader() : + * @returns The header from the CTable specifying the tableLog and the maxSymbolValue. + */ +HUF_CTableHeader HUF_readCTableHeader(HUF_CElt const* ctable); + /* * HUF_decompress() does the following: * 1. 
select the decompression algorithm (X1, X2) based on pre-computed heuristics diff --git a/thirdparty/zstd/common/mem.h b/thirdparty/zstd/common/mem.h index 98dd47a047..096f4be519 100644 --- a/thirdparty/zstd/common/mem.h +++ b/thirdparty/zstd/common/mem.h @@ -31,15 +31,6 @@ extern "C" { # include <stdlib.h> /* _byteswap_ulong */ # include <intrin.h> /* _byteswap_* */ #endif -#if defined(__GNUC__) -# define MEM_STATIC static __inline __attribute__((unused)) -#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) -# define MEM_STATIC static inline -#elif defined(_MSC_VER) -# define MEM_STATIC static __inline -#else -# define MEM_STATIC static /* this version may generate warnings for unused static functions; disable the relevant warning */ -#endif /*-************************************************************** * Basic Types diff --git a/thirdparty/zstd/common/pool.c b/thirdparty/zstd/common/pool.c index d5ca5a7808..3adcefc9a5 100644 --- a/thirdparty/zstd/common/pool.c +++ b/thirdparty/zstd/common/pool.c @@ -223,7 +223,7 @@ static int POOL_resize_internal(POOL_ctx* ctx, size_t numThreads) { ZSTD_pthread_t* const threadPool = (ZSTD_pthread_t*)ZSTD_customCalloc(numThreads * sizeof(ZSTD_pthread_t), ctx->customMem); if (!threadPool) return 1; /* replace existing thread pool */ - ZSTD_memcpy(threadPool, ctx->threads, ctx->threadCapacity * sizeof(*threadPool)); + ZSTD_memcpy(threadPool, ctx->threads, ctx->threadCapacity * sizeof(ZSTD_pthread_t)); ZSTD_customFree(ctx->threads, ctx->customMem); ctx->threads = threadPool; /* Initialize additional threads */ diff --git a/thirdparty/zstd/common/pool.h b/thirdparty/zstd/common/pool.h index eb22ff509f..cca4de73a8 100644 --- a/thirdparty/zstd/common/pool.h +++ b/thirdparty/zstd/common/pool.h @@ -47,7 +47,7 @@ void POOL_joinJobs(POOL_ctx* ctx); /*! POOL_resize() : * Expands or shrinks pool's number of threads. * This is more efficient than releasing + creating a new context, - * since it tries to preserve and re-use existing threads. + * since it tries to preserve and reuse existing threads. * `numThreads` must be at least 1. * @return : 0 when resize was successful, * !0 (typically 1) if there is an error. 
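A recurring mechanical change across the zstd hunks above (PREFETCH_L1/L2, PREFETCH_AREA, ZSTD_UNREACHABLE, RAWLOG, DEBUGLOG, CHECK_F, RETURN_ERROR_IF, RETURN_ERROR, FORWARD_IF_ERROR) is wrapping multi-statement macros in `do { ... } while (0)`. The toy program below is not taken from zstd and uses invented macro names; it only shows the if/else hazard that the wrapper removes:

```c
#include <stdio.h>

/* Braces-only form: expands to a compound statement. The semicolon the caller
 * naturally writes after the macro then sits between that block and a
 * following `else`, so the `else` no longer binds to its `if` and the
 * translation unit fails to compile. */
#define LOG_BAD(msg)  { puts(msg); fflush(stdout); }

/* do { ... } while (0) form: behaves like a single statement and consumes the
 * caller's trailing semicolon, so it nests safely inside if/else. */
#define LOG_GOOD(msg) do { puts(msg); fflush(stdout); } while (0)

int main(void)
{
    int err = 0;
    /* Replacing LOG_GOOD with LOG_BAD here would break the build for the
     * reason described above. */
    if (err)
        LOG_GOOD("failure");
    else
        LOG_GOOD("ok");
    return 0;
}
```

That is the whole rationale for the churn in debug.h and error_private.h shown above: the macro bodies are unchanged, only their statement-level hygiene improves.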
diff --git a/thirdparty/zstd/common/portability_macros.h b/thirdparty/zstd/common/portability_macros.h index 8fd6ea82d1..e50314a78e 100644 --- a/thirdparty/zstd/common/portability_macros.h +++ b/thirdparty/zstd/common/portability_macros.h @@ -68,6 +68,8 @@ /* Mark the internal assembly functions as hidden */ #ifdef __ELF__ # define ZSTD_HIDE_ASM_FUNCTION(func) .hidden func +#elif defined(__APPLE__) +# define ZSTD_HIDE_ASM_FUNCTION(func) .private_extern func #else # define ZSTD_HIDE_ASM_FUNCTION(func) #endif diff --git a/thirdparty/zstd/common/threading.c b/thirdparty/zstd/common/threading.c index ca155b9b9d..25bb8b9810 100644 --- a/thirdparty/zstd/common/threading.c +++ b/thirdparty/zstd/common/threading.c @@ -73,10 +73,12 @@ int ZSTD_pthread_create(ZSTD_pthread_t* thread, const void* unused, ZSTD_thread_params_t thread_param; (void)unused; + if (thread==NULL) return -1; + *thread = NULL; + thread_param.start_routine = start_routine; thread_param.arg = arg; thread_param.initialized = 0; - *thread = NULL; /* Setup thread initialization synchronization */ if(ZSTD_pthread_cond_init(&thread_param.initialized_cond, NULL)) { @@ -91,7 +93,7 @@ int ZSTD_pthread_create(ZSTD_pthread_t* thread, const void* unused, /* Spawn thread */ *thread = (HANDLE)_beginthreadex(NULL, 0, worker, &thread_param, 0, NULL); - if (!thread) { + if (*thread==NULL) { ZSTD_pthread_mutex_destroy(&thread_param.initialized_mutex); ZSTD_pthread_cond_destroy(&thread_param.initialized_cond); return errno; @@ -137,6 +139,7 @@ int ZSTD_pthread_join(ZSTD_pthread_t thread) int ZSTD_pthread_mutex_init(ZSTD_pthread_mutex_t* mutex, pthread_mutexattr_t const* attr) { + assert(mutex != NULL); *mutex = (pthread_mutex_t*)ZSTD_malloc(sizeof(pthread_mutex_t)); if (!*mutex) return 1; @@ -145,6 +148,7 @@ int ZSTD_pthread_mutex_init(ZSTD_pthread_mutex_t* mutex, pthread_mutexattr_t con int ZSTD_pthread_mutex_destroy(ZSTD_pthread_mutex_t* mutex) { + assert(mutex != NULL); if (!*mutex) return 0; { @@ -156,6 +160,7 @@ int ZSTD_pthread_mutex_destroy(ZSTD_pthread_mutex_t* mutex) int ZSTD_pthread_cond_init(ZSTD_pthread_cond_t* cond, pthread_condattr_t const* attr) { + assert(cond != NULL); *cond = (pthread_cond_t*)ZSTD_malloc(sizeof(pthread_cond_t)); if (!*cond) return 1; @@ -164,6 +169,7 @@ int ZSTD_pthread_cond_init(ZSTD_pthread_cond_t* cond, pthread_condattr_t const* int ZSTD_pthread_cond_destroy(ZSTD_pthread_cond_t* cond) { + assert(cond != NULL); if (!*cond) return 0; { diff --git a/thirdparty/zstd/common/xxhash.c b/thirdparty/zstd/common/xxhash.c index fd237c9062..052cd52282 100644 --- a/thirdparty/zstd/common/xxhash.c +++ b/thirdparty/zstd/common/xxhash.c @@ -1,24 +1,18 @@ /* - * xxHash - Fast Hash algorithm - * Copyright (c) Meta Platforms, Inc. and affiliates. - * - * You can contact the author at : - * - xxHash homepage: https://cyan4973.github.io/xxHash/ - * - xxHash source repository : https://github.com/Cyan4973/xxHash + * xxHash - Extremely Fast Hash algorithm + * Copyright (c) Yann Collet - Meta Platforms, Inc * * This source code is licensed under both the BSD-style license (found in the * LICENSE file in the root directory of this source tree) and the GPLv2 (found * in the COPYING file in the root directory of this source tree). * You may select, at your option, one of the above-listed licenses. 
-*/ - - + */ /* * xxhash.c instantiates functions defined in xxhash.h */ -#define XXH_STATIC_LINKING_ONLY /* access advanced declarations */ -#define XXH_IMPLEMENTATION /* access definitions */ +#define XXH_STATIC_LINKING_ONLY /* access advanced declarations */ +#define XXH_IMPLEMENTATION /* access definitions */ #include "xxhash.h" diff --git a/thirdparty/zstd/common/xxhash.h b/thirdparty/zstd/common/xxhash.h index b8b73290bb..e59e44267c 100644 --- a/thirdparty/zstd/common/xxhash.h +++ b/thirdparty/zstd/common/xxhash.h @@ -1,17 +1,15 @@ /* - * xxHash - Fast Hash algorithm - * Copyright (c) Meta Platforms, Inc. and affiliates. - * - * You can contact the author at : - * - xxHash homepage: https://cyan4973.github.io/xxHash/ - * - xxHash source repository : https://github.com/Cyan4973/xxHash + * xxHash - Extremely Fast Hash algorithm + * Header File + * Copyright (c) Yann Collet - Meta Platforms, Inc * * This source code is licensed under both the BSD-style license (found in the * LICENSE file in the root directory of this source tree) and the GPLv2 (found * in the COPYING file in the root directory of this source tree). * You may select, at your option, one of the above-listed licenses. -*/ + */ +/* Local adaptations for Zstandard */ #ifndef XXH_NO_XXH3 # define XXH_NO_XXH3 @@ -24,46 +22,210 @@ /*! * @mainpage xxHash * + * xxHash is an extremely fast non-cryptographic hash algorithm, working at RAM speed + * limits. + * + * It is proposed in four flavors, in three families: + * 1. @ref XXH32_family + * - Classic 32-bit hash function. Simple, compact, and runs on almost all + * 32-bit and 64-bit systems. + * 2. @ref XXH64_family + * - Classic 64-bit adaptation of XXH32. Just as simple, and runs well on most + * 64-bit systems (but _not_ 32-bit systems). + * 3. @ref XXH3_family + * - Modern 64-bit and 128-bit hash function family which features improved + * strength and performance across the board, especially on smaller data. + * It benefits greatly from SIMD and 64-bit without requiring it. + * + * Benchmarks + * --- + * The reference system uses an Intel i7-9700K CPU, and runs Ubuntu x64 20.04. + * The open source benchmark program is compiled with clang v10.0 using -O3 flag. 
+ * + * | Hash Name | ISA ext | Width | Large Data Speed | Small Data Velocity | + * | -------------------- | ------- | ----: | ---------------: | ------------------: | + * | XXH3_64bits() | @b AVX2 | 64 | 59.4 GB/s | 133.1 | + * | MeowHash | AES-NI | 128 | 58.2 GB/s | 52.5 | + * | XXH3_128bits() | @b AVX2 | 128 | 57.9 GB/s | 118.1 | + * | CLHash | PCLMUL | 64 | 37.1 GB/s | 58.1 | + * | XXH3_64bits() | @b SSE2 | 64 | 31.5 GB/s | 133.1 | + * | XXH3_128bits() | @b SSE2 | 128 | 29.6 GB/s | 118.1 | + * | RAM sequential read | | N/A | 28.0 GB/s | N/A | + * | ahash | AES-NI | 64 | 22.5 GB/s | 107.2 | + * | City64 | | 64 | 22.0 GB/s | 76.6 | + * | T1ha2 | | 64 | 22.0 GB/s | 99.0 | + * | City128 | | 128 | 21.7 GB/s | 57.7 | + * | FarmHash | AES-NI | 64 | 21.3 GB/s | 71.9 | + * | XXH64() | | 64 | 19.4 GB/s | 71.0 | + * | SpookyHash | | 64 | 19.3 GB/s | 53.2 | + * | Mum | | 64 | 18.0 GB/s | 67.0 | + * | CRC32C | SSE4.2 | 32 | 13.0 GB/s | 57.9 | + * | XXH32() | | 32 | 9.7 GB/s | 71.9 | + * | City32 | | 32 | 9.1 GB/s | 66.0 | + * | Blake3* | @b AVX2 | 256 | 4.4 GB/s | 8.1 | + * | Murmur3 | | 32 | 3.9 GB/s | 56.1 | + * | SipHash* | | 64 | 3.0 GB/s | 43.2 | + * | Blake3* | @b SSE2 | 256 | 2.4 GB/s | 8.1 | + * | HighwayHash | | 64 | 1.4 GB/s | 6.0 | + * | FNV64 | | 64 | 1.2 GB/s | 62.7 | + * | Blake2* | | 256 | 1.1 GB/s | 5.1 | + * | SHA1* | | 160 | 0.8 GB/s | 5.6 | + * | MD5* | | 128 | 0.6 GB/s | 7.8 | + * @note + * - Hashes which require a specific ISA extension are noted. SSE2 is also noted, + * even though it is mandatory on x64. + * - Hashes with an asterisk are cryptographic. Note that MD5 is non-cryptographic + * by modern standards. + * - Small data velocity is a rough average of algorithm's efficiency for small + * data. For more accurate information, see the wiki. + * - More benchmarks and strength tests are found on the wiki: + * https://github.com/Cyan4973/xxHash/wiki + * + * Usage + * ------ + * All xxHash variants use a similar API. Changing the algorithm is a trivial + * substitution. + * + * @pre + * For functions which take an input and length parameter, the following + * requirements are assumed: + * - The range from [`input`, `input + length`) is valid, readable memory. + * - The only exception is if the `length` is `0`, `input` may be `NULL`. + * - For C++, the objects must have the *TriviallyCopyable* property, as the + * functions access bytes directly as if it was an array of `unsigned char`. + * + * @anchor single_shot_example + * **Single Shot** + * + * These functions are stateless functions which hash a contiguous block of memory, + * immediately returning the result. They are the easiest and usually the fastest + * option. + * + * XXH32(), XXH64(), XXH3_64bits(), XXH3_128bits() + * + * @code{.c} + * #include <string.h> + * #include "xxhash.h" + * + * // Example for a function which hashes a null terminated string with XXH32(). + * XXH32_hash_t hash_string(const char* string, XXH32_hash_t seed) + * { + * // NULL pointers are only valid if the length is zero + * size_t length = (string == NULL) ? 0 : strlen(string); + * return XXH32(string, length, seed); + * } + * @endcode + * + * + * @anchor streaming_example + * **Streaming** + * + * These groups of functions allow incremental hashing of unknown size, even + * more than what would fit in a size_t. 
+ * + * XXH32_reset(), XXH64_reset(), XXH3_64bits_reset(), XXH3_128bits_reset() + * + * @code{.c} + * #include <stdio.h> + * #include <assert.h> + * #include "xxhash.h" + * // Example for a function which hashes a FILE incrementally with XXH3_64bits(). + * XXH64_hash_t hashFile(FILE* f) + * { + * // Allocate a state struct. Do not just use malloc() or new. + * XXH3_state_t* state = XXH3_createState(); + * assert(state != NULL && "Out of memory!"); + * // Reset the state to start a new hashing session. + * XXH3_64bits_reset(state); + * char buffer[4096]; + * size_t count; + * // Read the file in chunks + * while ((count = fread(buffer, 1, sizeof(buffer), f)) != 0) { + * // Run update() as many times as necessary to process the data + * XXH3_64bits_update(state, buffer, count); + * } + * // Retrieve the finalized hash. This will not change the state. + * XXH64_hash_t result = XXH3_64bits_digest(state); + * // Free the state. Do not use free(). + * XXH3_freeState(state); + * return result; + * } + * @endcode + * + * Streaming functions generate the xxHash value from an incremental input. + * This method is slower than single-call functions, due to state management. + * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized. + * + * An XXH state must first be allocated using `XXH*_createState()`. + * + * Start a new hash by initializing the state with a seed using `XXH*_reset()`. + * + * Then, feed the hash state by calling `XXH*_update()` as many times as necessary. + * + * The function returns an error code, with 0 meaning OK, and any other value + * meaning there is an error. + * + * Finally, a hash value can be produced anytime, by using `XXH*_digest()`. + * This function returns the nn-bits hash as an int or long long. + * + * It's still possible to continue inserting input into the hash state after a + * digest, and generate new hash values later on by invoking `XXH*_digest()`. + * + * When done, release the state using `XXH*_freeState()`. + * + * + * @anchor canonical_representation_example + * **Canonical Representation** + * + * The default return values from XXH functions are unsigned 32, 64 and 128 bit + * integers. + * This the simplest and fastest format for further post-processing. + * + * However, this leaves open the question of what is the order on the byte level, + * since little and big endian conventions will store the same number differently. + * + * The canonical representation settles this issue by mandating big-endian + * convention, the same convention as human-readable numbers (large digits first). + * + * When writing hash values to storage, sending them over a network, or printing + * them, it's highly recommended to use the canonical representation to ensure + * portability across a wider range of systems, present and future. + * + * The following functions allow transformation of hash values to and from + * canonical format. 
+ * + * XXH32_canonicalFromHash(), XXH32_hashFromCanonical(), + * XXH64_canonicalFromHash(), XXH64_hashFromCanonical(), + * XXH128_canonicalFromHash(), XXH128_hashFromCanonical(), + * + * @code{.c} + * #include <stdio.h> + * #include "xxhash.h" + * + * // Example for a function which prints XXH32_hash_t in human readable format + * void printXxh32(XXH32_hash_t hash) + * { + * XXH32_canonical_t cano; + * XXH32_canonicalFromHash(&cano, hash); + * size_t i; + * for(i = 0; i < sizeof(cano.digest); ++i) { + * printf("%02x", cano.digest[i]); + * } + * printf("\n"); + * } + * + * // Example for a function which converts XXH32_canonical_t to XXH32_hash_t + * XXH32_hash_t convertCanonicalToXxh32(XXH32_canonical_t cano) + * { + * XXH32_hash_t hash = XXH32_hashFromCanonical(&cano); + * return hash; + * } + * @endcode + * + * * @file xxhash.h * xxHash prototypes and implementation */ -/* TODO: update */ -/* Notice extracted from xxHash homepage: - -xxHash is an extremely fast hash algorithm, running at RAM speed limits. -It also successfully passes all tests from the SMHasher suite. - -Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz) - -Name Speed Q.Score Author -xxHash 5.4 GB/s 10 -CrapWow 3.2 GB/s 2 Andrew -MurmurHash 3a 2.7 GB/s 10 Austin Appleby -SpookyHash 2.0 GB/s 10 Bob Jenkins -SBox 1.4 GB/s 9 Bret Mulvey -Lookup3 1.2 GB/s 9 Bob Jenkins -SuperFastHash 1.2 GB/s 1 Paul Hsieh -CityHash64 1.05 GB/s 10 Pike & Alakuijala -FNV 0.55 GB/s 5 Fowler, Noll, Vo -CRC32 0.43 GB/s 9 -MD5-32 0.33 GB/s 10 Ronald L. Rivest -SHA1-32 0.28 GB/s 10 - -Q.Score is a measure of quality of the hash function. -It depends on successfully passing SMHasher test set. -10 is a perfect score. - -Note: SMHasher's CRC32 implementation is not the fastest one. -Other speed-oriented implementations can be faster, -especially in combination with PCLMUL instruction: -https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html?showComment=1552696407071#c3490092340461170735 - -A 64-bit version, named XXH64, is available since r35. -It offers much better speed, but for 64-bit applications only. -Name Speed on 64 bits Speed on 32 bits -XXH64 13.8 GB/s 1.9 GB/s -XXH32 6.8 GB/s 6.0 GB/s -*/ #if defined (__cplusplus) extern "C" { @@ -73,21 +235,80 @@ extern "C" { * INLINE mode ******************************/ /*! - * XXH_INLINE_ALL (and XXH_PRIVATE_API) + * @defgroup public Public API + * Contains details on the public xxHash functions. + * @{ + */ +#ifdef XXH_DOXYGEN +/*! + * @brief Gives access to internal state declaration, required for static allocation. + * + * Incompatible with dynamic linking, due to risks of ABI changes. + * + * Usage: + * @code{.c} + * #define XXH_STATIC_LINKING_ONLY + * #include "xxhash.h" + * @endcode + */ +# define XXH_STATIC_LINKING_ONLY +/* Do not undef XXH_STATIC_LINKING_ONLY for Doxygen */ + +/*! + * @brief Gives access to internal definitions. + * + * Usage: + * @code{.c} + * #define XXH_STATIC_LINKING_ONLY + * #define XXH_IMPLEMENTATION + * #include "xxhash.h" + * @endcode + */ +# define XXH_IMPLEMENTATION +/* Do not undef XXH_IMPLEMENTATION for Doxygen */ + +/*! + * @brief Exposes the implementation and marks all functions as `inline`. + * * Use these build macros to inline xxhash into the target unit. 
* Inlining improves performance on small inputs, especially when the length is * expressed as a compile-time constant: * - * https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html + * https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html * * It also keeps xxHash symbols private to the unit, so they are not exported. * * Usage: + * @code{.c} * #define XXH_INLINE_ALL * #include "xxhash.h" - * + * @endcode * Do not compile and link xxhash.o as a separate object, as it is not useful. */ +# define XXH_INLINE_ALL +# undef XXH_INLINE_ALL +/*! + * @brief Exposes the implementation without marking functions as inline. + */ +# define XXH_PRIVATE_API +# undef XXH_PRIVATE_API +/*! + * @brief Emulate a namespace by transparently prefixing all symbols. + * + * If you want to include _and expose_ xxHash functions from within your own + * library, but also want to avoid symbol collisions with other libraries which + * may also include xxHash, you can use @ref XXH_NAMESPACE to automatically prefix + * any public symbol from xxhash library with the value of @ref XXH_NAMESPACE + * (therefore, avoid empty or numeric values). + * + * Note that no change is required within the calling program as long as it + * includes `xxhash.h`: Regular symbol names will be automatically translated + * by this header. + */ +# define XXH_NAMESPACE /* YOUR NAME HERE */ +# undef XXH_NAMESPACE +#endif + #if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) \ && !defined(XXH_INLINE_ALL_31684351384) /* this section should be traversed only once */ @@ -202,21 +423,13 @@ extern "C" { # undef XXHASH_H_STATIC_13879238742 #endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */ - - /* **************************************************************** * Stable API *****************************************************************/ #ifndef XXHASH_H_5627135585666179 #define XXHASH_H_5627135585666179 1 - -/*! - * @defgroup public Public API - * Contains details on the public xxHash functions. - * @{ - */ -/* specific declaration modes for Windows */ +/*! @brief Marks a global symbol. */ #if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API) # if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT)) # ifdef XXH_EXPORT @@ -229,24 +442,6 @@ extern "C" { # endif #endif -#ifdef XXH_DOXYGEN -/*! - * @brief Emulate a namespace by transparently prefixing all symbols. - * - * If you want to include _and expose_ xxHash functions from within your own - * library, but also want to avoid symbol collisions with other libraries which - * may also include xxHash, you can use XXH_NAMESPACE to automatically prefix - * any public symbol from xxhash library with the value of XXH_NAMESPACE - * (therefore, avoid empty or numeric values). - * - * Note that no change is required within the calling program as long as it - * includes `xxhash.h`: Regular symbol names will be automatically translated - * by this header. 
- */ -# define XXH_NAMESPACE /* YOUR NAME HERE */ -# undef XXH_NAMESPACE -#endif - #ifdef XXH_NAMESPACE # define XXH_CAT(A,B) A##B # define XXH_NAME2(A,B) XXH_CAT(A,B) @@ -307,11 +502,39 @@ extern "C" { /* ************************************* +* Compiler specifics +***************************************/ + +/* specific declaration modes for Windows */ +#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API) +# if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT)) +# ifdef XXH_EXPORT +# define XXH_PUBLIC_API __declspec(dllexport) +# elif XXH_IMPORT +# define XXH_PUBLIC_API __declspec(dllimport) +# endif +# else +# define XXH_PUBLIC_API /* do nothing */ +# endif +#endif + +#if defined (__GNUC__) +# define XXH_CONSTF __attribute__((const)) +# define XXH_PUREF __attribute__((pure)) +# define XXH_MALLOCF __attribute__((malloc)) +#else +# define XXH_CONSTF /* disable */ +# define XXH_PUREF +# define XXH_MALLOCF +#endif + +/* ************************************* * Version ***************************************/ #define XXH_VERSION_MAJOR 0 #define XXH_VERSION_MINOR 8 -#define XXH_VERSION_RELEASE 1 +#define XXH_VERSION_RELEASE 2 +/*! @brief Version number, encoded as two digits each */ #define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE) /*! @@ -320,16 +543,22 @@ extern "C" { * This is mostly useful when xxHash is compiled as a shared library, * since the returned value comes from the library, as opposed to header file. * - * @return `XXH_VERSION_NUMBER` of the invoked library. + * @return @ref XXH_VERSION_NUMBER of the invoked library. */ -XXH_PUBLIC_API unsigned XXH_versionNumber (void); +XXH_PUBLIC_API XXH_CONSTF unsigned XXH_versionNumber (void); /* **************************** * Common basic types ******************************/ #include <stddef.h> /* size_t */ -typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode; +/*! + * @brief Exit code for the streaming API. + */ +typedef enum { + XXH_OK = 0, /*!< OK */ + XXH_ERROR /*!< Error */ +} XXH_errorcode; /*-********************************************************************** @@ -346,44 +575,44 @@ typedef uint32_t XXH32_hash_t; #elif !defined (__VMS) \ && (defined (__cplusplus) \ || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) -# include <stdint.h> +# ifdef _AIX +# include <inttypes.h> +# else +# include <stdint.h> +# endif typedef uint32_t XXH32_hash_t; #else # include <limits.h> # if UINT_MAX == 0xFFFFFFFFUL typedef unsigned int XXH32_hash_t; +# elif ULONG_MAX == 0xFFFFFFFFUL + typedef unsigned long XXH32_hash_t; # else -# if ULONG_MAX == 0xFFFFFFFFUL - typedef unsigned long XXH32_hash_t; -# else -# error "unsupported platform: need a 32-bit type" -# endif +# error "unsupported platform: need a 32-bit type" # endif #endif /*! * @} * - * @defgroup xxh32_family XXH32 family + * @defgroup XXH32_family XXH32 family * @ingroup public * Contains functions used in the classic 32-bit xxHash algorithm. * * @note * XXH32 is useful for older platforms, with no or poor 64-bit performance. - * Note that @ref xxh3_family provides competitive speed - * for both 32-bit and 64-bit systems, and offers true 64/128 bit hash results. + * Note that the @ref XXH3_family provides competitive speed for both 32-bit + * and 64-bit systems, and offers true 64/128 bit hash results. 
* - * @see @ref xxh64_family, @ref xxh3_family : Other xxHash families - * @see @ref xxh32_impl for implementation details + * @see @ref XXH64_family, @ref XXH3_family : Other xxHash families + * @see @ref XXH32_impl for implementation details * @{ */ /*! * @brief Calculates the 32-bit hash of @p input using xxHash32. * - * Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark): 5.4 GB/s - * * @param input The block of data to be hashed, at least @p length bytes in size. * @param length The length of @p input, in bytes. * @param seed The 32-bit seed to alter the hash's output predictably. @@ -393,66 +622,13 @@ typedef uint32_t XXH32_hash_t; * readable, contiguous memory. However, if @p length is `0`, @p input may be * `NULL`. In C++, this also must be *TriviallyCopyable*. * - * @return The calculated 32-bit hash value. - * - * @see - * XXH64(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128(): - * Direct equivalents for the other variants of xxHash. - * @see - * XXH32_createState(), XXH32_update(), XXH32_digest(): Streaming version. - */ -XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed); - -/*! - * Streaming functions generate the xxHash value from an incremental input. - * This method is slower than single-call functions, due to state management. - * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized. - * - * An XXH state must first be allocated using `XXH*_createState()`. - * - * Start a new hash by initializing the state with a seed using `XXH*_reset()`. - * - * Then, feed the hash state by calling `XXH*_update()` as many times as necessary. - * - * The function returns an error code, with 0 meaning OK, and any other value - * meaning there is an error. - * - * Finally, a hash value can be produced anytime, by using `XXH*_digest()`. - * This function returns the nn-bits hash as an int or long long. - * - * It's still possible to continue inserting input into the hash state after a - * digest, and generate new hash values later on by invoking `XXH*_digest()`. - * - * When done, release the state using `XXH*_freeState()`. - * - * Example code for incrementally hashing a file: - * @code{.c} - * #include <stdio.h> - * #include <xxhash.h> - * #define BUFFER_SIZE 256 + * @return The calculated 32-bit xxHash32 value. * - * // Note: XXH64 and XXH3 use the same interface. - * XXH32_hash_t - * hashFile(FILE* stream) - * { - * XXH32_state_t* state; - * unsigned char buf[BUFFER_SIZE]; - * size_t amt; - * XXH32_hash_t hash; - * - * state = XXH32_createState(); // Create a state - * assert(state != NULL); // Error check here - * XXH32_reset(state, 0xbaad5eed); // Reset state with our seed - * while ((amt = fread(buf, 1, sizeof(buf), stream)) != 0) { - * XXH32_update(state, buf, amt); // Hash the file in chunks - * } - * hash = XXH32_digest(state); // Finalize the hash - * XXH32_freeState(state); // Clean up - * return hash; - * } - * @endcode + * @see @ref single_shot_example "Single Shot Example" for an example. */ +XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed); +#ifndef XXH_NO_STREAM /*! * @typedef struct XXH32_state_s XXH32_state_t * @brief The opaque state struct for the XXH32 streaming API. @@ -464,16 +640,21 @@ typedef struct XXH32_state_s XXH32_state_t; /*! * @brief Allocates an @ref XXH32_state_t. * - * Must be freed with XXH32_freeState(). - * @return An allocated XXH32_state_t on success, `NULL` on failure. 
+ * @return An allocated pointer of @ref XXH32_state_t on success. + * @return `NULL` on failure. + * + * @note Must be freed with XXH32_freeState(). */ -XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void); +XXH_PUBLIC_API XXH_MALLOCF XXH32_state_t* XXH32_createState(void); /*! * @brief Frees an @ref XXH32_state_t. * - * Must be allocated with XXH32_createState(). * @param statePtr A pointer to an @ref XXH32_state_t allocated with @ref XXH32_createState(). - * @return XXH_OK. + * + * @return @ref XXH_OK. + * + * @note @p statePtr must be allocated with XXH32_createState(). + * */ XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr); /*! @@ -489,23 +670,22 @@ XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_ /*! * @brief Resets an @ref XXH32_state_t to begin a new hash. * - * This function resets and seeds a state. Call it before @ref XXH32_update(). - * * @param statePtr The state struct to reset. * @param seed The 32-bit seed to alter the hash result predictably. * * @pre * @p statePtr must not be `NULL`. * - * @return @ref XXH_OK on success, @ref XXH_ERROR on failure. + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note This function resets and seeds a state. Call it before @ref XXH32_update(). */ XXH_PUBLIC_API XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, XXH32_hash_t seed); /*! * @brief Consumes a block of @p input to an @ref XXH32_state_t. * - * Call this to incrementally consume blocks of data. - * * @param statePtr The state struct to update. * @param input The block of data to be hashed, at least @p length bytes in size. * @param length The length of @p input, in bytes. @@ -517,47 +697,32 @@ XXH_PUBLIC_API XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, XXH32_hash_t * readable, contiguous memory. However, if @p length is `0`, @p input may be * `NULL`. In C++, this also must be *TriviallyCopyable*. * - * @return @ref XXH_OK on success, @ref XXH_ERROR on failure. + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note Call this to incrementally consume blocks of data. */ XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length); /*! * @brief Returns the calculated hash value from an @ref XXH32_state_t. * - * @note - * Calling XXH32_digest() will not affect @p statePtr, so you can update, - * digest, and update again. - * * @param statePtr The state struct to calculate the hash from. * * @pre * @p statePtr must not be `NULL`. * - * @return The calculated xxHash32 value from that state. + * @return The calculated 32-bit xxHash32 value from that state. + * + * @note + * Calling XXH32_digest() will not affect @p statePtr, so you can update, + * digest, and update again. */ -XXH_PUBLIC_API XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr); +XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr); +#endif /* !XXH_NO_STREAM */ /******* Canonical representation *******/ -/* - * The default return values from XXH functions are unsigned 32 and 64 bit - * integers. - * This the simplest and fastest format for further post-processing. - * - * However, this leaves open the question of what is the order on the byte level, - * since little and big endian conventions will store the same number differently. - * - * The canonical representation settles this issue by mandating big-endian - * convention, the same convention as human-readable numbers (large digits first). 
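 *
 * A minimal sketch of the streaming flow described above, feeding a file into
 * XXH32 and then storing the digest in canonical (big endian) form; the
 * function name, BUFFER_SIZE and the seed of 0 are arbitrary choices for
 * illustration, and error handling is reduced to an assert:
 * @code{.c}
 * #include <assert.h>
 * #include <stdio.h>
 * #include "xxhash.h"
 * #define BUFFER_SIZE 256
 *
 * XXH32_hash_t hashFileCanonical(FILE* stream, XXH32_canonical_t* out)
 * {
 *     unsigned char buf[BUFFER_SIZE];
 *     size_t amt;
 *     XXH32_hash_t hash;
 *     XXH32_state_t* state = XXH32_createState();  // allocate a state
 *     assert(state != NULL);
 *     XXH32_reset(state, 0);                       // seed the state
 *     while ((amt = fread(buf, 1, sizeof(buf), stream)) != 0) {
 *         XXH32_update(state, buf, amt);           // consume chunk by chunk
 *     }
 *     hash = XXH32_digest(state);                  // finalize
 *     XXH32_canonicalFromHash(out, hash);          // big endian bytes for storage
 *     XXH32_freeState(state);
 *     return hash;
 * }
 * @endcode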
- * - * When writing hash values to storage, sending them over a network, or printing - * them, it's highly recommended to use the canonical representation to ensure - * portability across a wider range of systems, present and future. - * - * The following functions allow transformation of hash values to and from - * canonical format. - */ - /*! * @brief Canonical (big endian) representation of @ref XXH32_hash_t. */ @@ -568,11 +733,13 @@ typedef struct { /*! * @brief Converts an @ref XXH32_hash_t to a big endian @ref XXH32_canonical_t. * - * @param dst The @ref XXH32_canonical_t pointer to be stored to. + * @param dst The @ref XXH32_canonical_t pointer to be stored to. * @param hash The @ref XXH32_hash_t to be converted. * * @pre * @p dst must not be `NULL`. + * + * @see @ref canonical_representation_example "Canonical Representation Example" */ XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash); @@ -585,44 +752,75 @@ XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t * @p src must not be `NULL`. * * @return The converted hash. + * + * @see @ref canonical_representation_example "Canonical Representation Example" */ -XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src); +XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src); +/*! @cond Doxygen ignores this part */ #ifdef __has_attribute # define XXH_HAS_ATTRIBUTE(x) __has_attribute(x) #else # define XXH_HAS_ATTRIBUTE(x) 0 #endif +/*! @endcond */ + +/*! @cond Doxygen ignores this part */ +/* + * C23 __STDC_VERSION__ number hasn't been specified yet. For now + * leave as `201711L` (C17 + 1). + * TODO: Update to correct value when its been specified. + */ +#define XXH_C23_VN 201711L +/*! @endcond */ +/*! @cond Doxygen ignores this part */ /* C-language Attributes are added in C23. */ -#if defined(__STDC_VERSION__) && (__STDC_VERSION__ > 201710L) && defined(__has_c_attribute) +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= XXH_C23_VN) && defined(__has_c_attribute) # define XXH_HAS_C_ATTRIBUTE(x) __has_c_attribute(x) #else # define XXH_HAS_C_ATTRIBUTE(x) 0 #endif +/*! @endcond */ +/*! @cond Doxygen ignores this part */ #if defined(__cplusplus) && defined(__has_cpp_attribute) # define XXH_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x) #else # define XXH_HAS_CPP_ATTRIBUTE(x) 0 #endif +/*! @endcond */ +/*! @cond Doxygen ignores this part */ /* -Define XXH_FALLTHROUGH macro for annotating switch case with the 'fallthrough' attribute -introduced in CPP17 and C23. -CPP17 : https://en.cppreference.com/w/cpp/language/attributes/fallthrough -C23 : https://en.cppreference.com/w/c/language/attributes/fallthrough -*/ -#if XXH_HAS_C_ATTRIBUTE(x) -# define XXH_FALLTHROUGH [[fallthrough]] -#elif XXH_HAS_CPP_ATTRIBUTE(x) + * Define XXH_FALLTHROUGH macro for annotating switch case with the 'fallthrough' attribute + * introduced in CPP17 and C23. + * CPP17 : https://en.cppreference.com/w/cpp/language/attributes/fallthrough + * C23 : https://en.cppreference.com/w/c/language/attributes/fallthrough + */ +#if XXH_HAS_C_ATTRIBUTE(fallthrough) || XXH_HAS_CPP_ATTRIBUTE(fallthrough) # define XXH_FALLTHROUGH [[fallthrough]] #elif XXH_HAS_ATTRIBUTE(__fallthrough__) -# define XXH_FALLTHROUGH __attribute__ ((fallthrough)) +# define XXH_FALLTHROUGH __attribute__ ((__fallthrough__)) #else -# define XXH_FALLTHROUGH +# define XXH_FALLTHROUGH /* fallthrough */ #endif +/*! @endcond */ + +/*! 
@cond Doxygen ignores this part */ +/* + * Define XXH_NOESCAPE for annotated pointers in public API. + * https://clang.llvm.org/docs/AttributeReference.html#noescape + * As of writing this, only supported by clang. + */ +#if XXH_HAS_ATTRIBUTE(noescape) +# define XXH_NOESCAPE __attribute__((noescape)) +#else +# define XXH_NOESCAPE +#endif +/*! @endcond */ + /*! * @} @@ -644,7 +842,11 @@ typedef uint64_t XXH64_hash_t; #elif !defined (__VMS) \ && (defined (__cplusplus) \ || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) -# include <stdint.h> +# ifdef _AIX +# include <inttypes.h> +# else +# include <stdint.h> +# endif typedef uint64_t XXH64_hash_t; #else # include <limits.h> @@ -660,7 +862,7 @@ typedef uint64_t XXH64_hash_t; /*! * @} * - * @defgroup xxh64_family XXH64 family + * @defgroup XXH64_family XXH64 family * @ingroup public * @{ * Contains functions used in the classic 64-bit xxHash algorithm. @@ -671,13 +873,9 @@ typedef uint64_t XXH64_hash_t; * It provides better speed for systems with vector processing capabilities. */ - /*! * @brief Calculates the 64-bit hash of @p input using xxHash64. * - * This function usually runs faster on 64-bit systems, but slower on 32-bit - * systems (see benchmark). - * * @param input The block of data to be hashed, at least @p length bytes in size. * @param length The length of @p input, in bytes. * @param seed The 64-bit seed to alter the hash's output predictably. @@ -687,41 +885,145 @@ typedef uint64_t XXH64_hash_t; * readable, contiguous memory. However, if @p length is `0`, @p input may be * `NULL`. In C++, this also must be *TriviallyCopyable*. * - * @return The calculated 64-bit hash. + * @return The calculated 64-bit xxHash64 value. * - * @see - * XXH32(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128(): - * Direct equivalents for the other variants of xxHash. - * @see - * XXH64_createState(), XXH64_update(), XXH64_digest(): Streaming version. + * @see @ref single_shot_example "Single Shot Example" for an example. */ -XXH_PUBLIC_API XXH64_hash_t XXH64(const void* input, size_t length, XXH64_hash_t seed); +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed); /******* Streaming *******/ +#ifndef XXH_NO_STREAM /*! * @brief The opaque state struct for the XXH64 streaming API. * * @see XXH64_state_s for details. */ typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */ -XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void); + +/*! + * @brief Allocates an @ref XXH64_state_t. + * + * @return An allocated pointer of @ref XXH64_state_t on success. + * @return `NULL` on failure. + * + * @note Must be freed with XXH64_freeState(). + */ +XXH_PUBLIC_API XXH_MALLOCF XXH64_state_t* XXH64_createState(void); + +/*! + * @brief Frees an @ref XXH64_state_t. + * + * @param statePtr A pointer to an @ref XXH64_state_t allocated with @ref XXH64_createState(). + * + * @return @ref XXH_OK. + * + * @note @p statePtr must be allocated with XXH64_createState(). + */ XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr); -XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dst_state, const XXH64_state_t* src_state); -XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH64_state_t* statePtr, XXH64_hash_t seed); -XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length); -XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* statePtr); +/*! + * @brief Copies one @ref XXH64_state_t to another. 
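 *
 * One way this can be used (an illustrative sketch, with error checks omitted
 * and the helper name invented for the example): hash a shared prefix once,
 * then fork the state so two inputs that differ only in their suffix do not
 * re-hash the prefix:
 * @code{.c}
 * #include "xxhash.h"
 *
 * void hash_two_variants(const void* prefix, size_t prefixLen,
 *                        const void* tailA, size_t tailALen,
 *                        const void* tailB, size_t tailBLen,
 *                        XXH64_hash_t* outA, XXH64_hash_t* outB)
 * {
 *     XXH64_state_t* a = XXH64_createState();
 *     XXH64_state_t* b = XXH64_createState();
 *     XXH64_reset(a, 0);
 *     XXH64_update(a, prefix, prefixLen);  // common prefix hashed once
 *     XXH64_copyState(b, a);               // b now mirrors a
 *     XXH64_update(a, tailA, tailALen);
 *     XXH64_update(b, tailB, tailBLen);
 *     *outA = XXH64_digest(a);
 *     *outB = XXH64_digest(b);
 *     XXH64_freeState(a);
 *     XXH64_freeState(b);
 * }
 * @endcode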
+ * + * @param dst_state The state to copy to. + * @param src_state The state to copy from. + * @pre + * @p dst_state and @p src_state must not be `NULL` and must not overlap. + */ +XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dst_state, const XXH64_state_t* src_state); + +/*! + * @brief Resets an @ref XXH64_state_t to begin a new hash. + * + * @param statePtr The state struct to reset. + * @param seed The 64-bit seed to alter the hash result predictably. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note This function resets and seeds a state. Call it before @ref XXH64_update(). + */ +XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed); +/*! + * @brief Consumes a block of @p input to an @ref XXH64_state_t. + * + * @param statePtr The state struct to update. + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * + * @pre + * @p statePtr must not be `NULL`. + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note Call this to incrementally consume blocks of data. + */ +XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH_NOESCAPE XXH64_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length); + +/*! + * @brief Returns the calculated hash value from an @ref XXH64_state_t. + * + * @param statePtr The state struct to calculate the hash from. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return The calculated 64-bit xxHash64 value from that state. + * + * @note + * Calling XXH64_digest() will not affect @p statePtr, so you can update, + * digest, and update again. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_digest (XXH_NOESCAPE const XXH64_state_t* statePtr); +#endif /* !XXH_NO_STREAM */ /******* Canonical representation *******/ + +/*! + * @brief Canonical (big endian) representation of @ref XXH64_hash_t. + */ typedef struct { unsigned char digest[sizeof(XXH64_hash_t)]; } XXH64_canonical_t; -XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash); -XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src); + +/*! + * @brief Converts an @ref XXH64_hash_t to a big endian @ref XXH64_canonical_t. + * + * @param dst The @ref XXH64_canonical_t pointer to be stored to. + * @param hash The @ref XXH64_hash_t to be converted. + * + * @pre + * @p dst must not be `NULL`. + * + * @see @ref canonical_representation_example "Canonical Representation Example" + */ +XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash); + +/*! + * @brief Converts an @ref XXH64_canonical_t to a native @ref XXH64_hash_t. + * + * @param src The @ref XXH64_canonical_t to convert. + * + * @pre + * @p src must not be `NULL`. + * + * @return The converted hash. + * + * @see @ref canonical_representation_example "Canonical Representation Example" + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src); #ifndef XXH_NO_XXH3 + /*! 
* @} * ************************************************************************ - * @defgroup xxh3_family XXH3 family + * @defgroup XXH3_family XXH3 family * @ingroup public * @{ * @@ -741,16 +1043,26 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src * * XXH3's speed benefits greatly from SIMD and 64-bit arithmetic, * but does not require it. - * Any 32-bit and 64-bit targets that can run XXH32 smoothly - * can run XXH3 at competitive speeds, even without vector support. - * Further details are explained in the implementation. - * - * Optimized implementations are provided for AVX512, AVX2, SSE2, NEON, POWER8, - * ZVector and scalar targets. This can be controlled via the XXH_VECTOR macro. + * Most 32-bit and 64-bit targets that can run XXH32 smoothly can run XXH3 + * at competitive speeds, even without vector support. Further details are + * explained in the implementation. + * + * XXH3 has a fast scalar implementation, but it also includes accelerated SIMD + * implementations for many common platforms: + * - AVX512 + * - AVX2 + * - SSE2 + * - ARM NEON + * - WebAssembly SIMD128 + * - POWER8 VSX + * - s390x ZVector + * This can be controlled via the @ref XXH_VECTOR macro, but it automatically + * selects the best version according to predefined macros. For the x86 family, an + * automatic runtime dispatcher is included separately in @ref xxh_x86dispatch.c. * * XXH3 implementation is portable: * it has a generic C90 formulation that can be compiled on any platform, - * all implementations generage exactly the same hash value on all platforms. + * all implementations generate exactly the same hash value on all platforms. * Starting from v0.8.0, it's also labelled "stable", meaning that * any future version will also generate the same hash value. * @@ -762,24 +1074,59 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src * * The API supports one-shot hashing, streaming mode, and custom secrets. */ - /*-********************************************************************** * XXH3 64-bit variant ************************************************************************/ -/* XXH3_64bits(): - * default 64-bit variant, using default secret and default seed of 0. - * It's the fastest variant. */ -XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* data, size_t len); +/*! + * @brief Calculates 64-bit unseeded variant of XXH3 hash of @p input. + * + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return The calculated 64-bit XXH3 hash value. + * + * @note + * This is equivalent to @ref XXH3_64bits_withSeed() with a seed of `0`, however + * it may have slightly better performance due to constant propagation of the + * defaults. + * + * @see + * XXH3_64bits_withSeed(), XXH3_64bits_withSecret(): other seeding variants + * @see @ref single_shot_example "Single Shot Example" for an example. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length); -/* - * XXH3_64bits_withSeed(): - * This variant generates a custom secret on the fly - * based on default secret altered using the `seed` value. +/*! + * @brief Calculates 64-bit seeded variant of XXH3 hash of @p input. 
+ * + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * @param seed The 64-bit seed to alter the hash result predictably. + * + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return The calculated 64-bit XXH3 hash value. + * + * @note + * seed == 0 produces the same results as @ref XXH3_64bits(). + * + * This variant generates a custom secret on the fly based on default secret + * altered using the @p seed value. + * * While this operation is decently fast, note that it's not completely free. - * Note: seed==0 produces the same results as XXH3_64bits(). + * + * @see @ref single_shot_example "Single Shot Example" for an example. */ -XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void* data, size_t len, XXH64_hash_t seed); +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed); /*! * The bare minimum size for a custom secret. @@ -790,27 +1137,43 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void* data, size_t len, X */ #define XXH3_SECRET_SIZE_MIN 136 -/* - * XXH3_64bits_withSecret(): +/*! + * @brief Calculates 64-bit variant of XXH3 with a custom "secret". + * + * @param data The block of data to be hashed, at least @p len bytes in size. + * @param len The length of @p data, in bytes. + * @param secret The secret data. + * @param secretSize The length of @p secret, in bytes. + * + * @return The calculated 64-bit XXH3 hash value. + * + * @pre + * The memory between @p data and @p data + @p len must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p data may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * * It's possible to provide any blob of bytes as a "secret" to generate the hash. * This makes it more difficult for an external actor to prepare an intentional collision. - * The main condition is that secretSize *must* be large enough (>= XXH3_SECRET_SIZE_MIN). + * The main condition is that @p secretSize *must* be large enough (>= @ref XXH3_SECRET_SIZE_MIN). * However, the quality of the secret impacts the dispersion of the hash algorithm. * Therefore, the secret _must_ look like a bunch of random bytes. * Avoid "trivial" or structured data such as repeated sequences or a text document. * Whenever in doubt about the "randomness" of the blob of bytes, - * consider employing "XXH3_generateSecret()" instead (see below). + * consider employing @ref XXH3_generateSecret() instead (see below). * It will generate a proper high entropy secret derived from the blob of bytes. * Another advantage of using XXH3_generateSecret() is that * it guarantees that all bits within the initial blob of bytes * will impact every bit of the output. * This is not necessarily the case when using the blob of bytes directly * because, when hashing _small_ inputs, only a portion of the secret is employed. + * + * @see @ref single_shot_example "Single Shot Example" for an example. 
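 *
 * As a rough sketch (the function name and sizes are only illustrative), a
 * secret is typically derived once with XXH3_generateSecret() and then
 * reused for every call:
 * @code{.c}
 * #define XXH_STATIC_LINKING_ONLY   // XXH3_generateSecret() is unstable API
 * #include "xxhash.h"
 *
 * XXH64_hash_t hash_with_secret(const void* data, size_t len,
 *                               const void* seedBlob, size_t seedBlobLen)
 * {
 *     unsigned char secret[XXH3_SECRET_SIZE_MIN];
 *     // Condition the caller-provided blob into a high-entropy secret.
 *     XXH3_generateSecret(secret, sizeof(secret), seedBlob, seedBlobLen);
 *     return XXH3_64bits_withSecret(data, len, secret, sizeof(secret));
 * }
 * @endcode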
*/ -XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize); +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize); /******* Streaming *******/ +#ifndef XXH_NO_STREAM /* * Streaming requires state maintenance. * This operation costs memory and CPU. @@ -819,40 +1182,124 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void* data, size_t len, */ /*! - * @brief The state struct for the XXH3 streaming API. + * @brief The opaque state struct for the XXH3 streaming API. * * @see XXH3_state_s for details. */ typedef struct XXH3_state_s XXH3_state_t; -XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void); +XXH_PUBLIC_API XXH_MALLOCF XXH3_state_t* XXH3_createState(void); XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr); -XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state); -/* - * XXH3_64bits_reset(): - * Initialize with default parameters. - * digest will be equivalent to `XXH3_64bits()`. +/*! + * @brief Copies one @ref XXH3_state_t to another. + * + * @param dst_state The state to copy to. + * @param src_state The state to copy from. + * @pre + * @p dst_state and @p src_state must not be `NULL` and must not overlap. */ -XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t* statePtr); -/* - * XXH3_64bits_reset_withSeed(): - * Generate a custom secret from `seed`, and store it into `statePtr`. - * digest will be equivalent to `XXH3_64bits_withSeed()`. +XXH_PUBLIC_API void XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state); + +/*! + * @brief Resets an @ref XXH3_state_t to begin a new hash. + * + * @param statePtr The state struct to reset. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note + * - This function resets `statePtr` and generate a secret with default parameters. + * - Call this function before @ref XXH3_64bits_update(). + * - Digest will be equivalent to `XXH3_64bits()`. + * */ -XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed); -/* - * XXH3_64bits_reset_withSecret(): - * `secret` is referenced, it _must outlive_ the hash streaming session. - * Similar to one-shot API, `secretSize` must be >= `XXH3_SECRET_SIZE_MIN`, +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr); + +/*! + * @brief Resets an @ref XXH3_state_t with 64-bit seed to begin a new hash. + * + * @param statePtr The state struct to reset. + * @param seed The 64-bit seed to alter the hash result predictably. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note + * - This function resets `statePtr` and generate a secret from `seed`. + * - Call this function before @ref XXH3_64bits_update(). + * - Digest will be equivalent to `XXH3_64bits_withSeed()`. + * + */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed); + +/*! + * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash. + * + * @param statePtr The state struct to reset. + * @param secret The secret data. + * @param secretSize The length of @p secret, in bytes. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success. 
+ * @return @ref XXH_ERROR on failure. + * + * @note + * `secret` is referenced, it _must outlive_ the hash streaming session. + * + * Similar to one-shot API, `secretSize` must be >= @ref XXH3_SECRET_SIZE_MIN, * and the quality of produced hash values depends on secret's entropy * (secret's content should look like a bunch of random bytes). * When in doubt about the randomness of a candidate `secret`, * consider employing `XXH3_generateSecret()` instead (see below). */ -XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize); +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize); -XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH3_state_t* statePtr, const void* input, size_t length); -XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* statePtr); +/*! + * @brief Consumes a block of @p input to an @ref XXH3_state_t. + * + * @param statePtr The state struct to update. + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * + * @pre + * @p statePtr must not be `NULL`. + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note Call this to incrementally consume blocks of data. + */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length); + +/*! + * @brief Returns the calculated XXH3 64-bit hash value from an @ref XXH3_state_t. + * + * @param statePtr The state struct to calculate the hash from. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return The calculated XXH3 64-bit hash value from that state. + * + * @note + * Calling XXH3_64bits_digest() will not affect @p statePtr, so you can update, + * digest, and update again. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr); +#endif /* !XXH_NO_STREAM */ /* note : canonical representation of XXH3 is the same as XXH64 * since they both produce XXH64_hash_t values */ @@ -873,11 +1320,76 @@ typedef struct { XXH64_hash_t high64; /*!< `value >> 64` */ } XXH128_hash_t; -XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* data, size_t len); -XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void* data, size_t len, XXH64_hash_t seed); -XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize); +/*! + * @brief Calculates 128-bit unseeded variant of XXH3 of @p data. + * + * @param data The block of data to be hashed, at least @p length bytes in size. + * @param len The length of @p data, in bytes. + * + * @return The calculated 128-bit variant of XXH3 value. + * + * The 128-bit variant of XXH3 has more strength, but it has a bit of overhead + * for shorter inputs. + * + * This is equivalent to @ref XXH3_128bits_withSeed() with a seed of `0`, however + * it may have slightly better performance due to constant propagation of the + * defaults. + * + * @see XXH3_128bits_withSeed(), XXH3_128bits_withSecret(): other seeding variants + * @see @ref single_shot_example "Single Shot Example" for an example. 
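 *
 * A minimal usage sketch (the buffer below is an arbitrary placeholder):
 * @code{.c}
 * #include <stdio.h>
 * #include "xxhash.h"
 *
 * int main(void)
 * {
 *     const char data[] = "example payload";
 *     XXH128_hash_t h = XXH3_128bits(data, sizeof(data) - 1);
 *     // Print in the usual "high then low" 32-digit hexadecimal form.
 *     printf("%016llx%016llx\n",
 *            (unsigned long long)h.high64, (unsigned long long)h.low64);
 *     return 0;
 * }
 * @endcode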
+ */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* data, size_t len); +/*! @brief Calculates 128-bit seeded variant of XXH3 hash of @p data. + * + * @param data The block of data to be hashed, at least @p length bytes in size. + * @param len The length of @p data, in bytes. + * @param seed The 64-bit seed to alter the hash result predictably. + * + * @return The calculated 128-bit variant of XXH3 value. + * + * @note + * seed == 0 produces the same results as @ref XXH3_64bits(). + * + * This variant generates a custom secret on the fly based on default secret + * altered using the @p seed value. + * + * While this operation is decently fast, note that it's not completely free. + * + * @see XXH3_128bits(), XXH3_128bits_withSecret(): other seeding variants + * @see @ref single_shot_example "Single Shot Example" for an example. + */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSeed(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed); +/*! + * @brief Calculates 128-bit variant of XXH3 with a custom "secret". + * + * @param data The block of data to be hashed, at least @p len bytes in size. + * @param len The length of @p data, in bytes. + * @param secret The secret data. + * @param secretSize The length of @p secret, in bytes. + * + * @return The calculated 128-bit variant of XXH3 value. + * + * It's possible to provide any blob of bytes as a "secret" to generate the hash. + * This makes it more difficult for an external actor to prepare an intentional collision. + * The main condition is that @p secretSize *must* be large enough (>= @ref XXH3_SECRET_SIZE_MIN). + * However, the quality of the secret impacts the dispersion of the hash algorithm. + * Therefore, the secret _must_ look like a bunch of random bytes. + * Avoid "trivial" or structured data such as repeated sequences or a text document. + * Whenever in doubt about the "randomness" of the blob of bytes, + * consider employing @ref XXH3_generateSecret() instead (see below). + * It will generate a proper high entropy secret derived from the blob of bytes. + * Another advantage of using XXH3_generateSecret() is that + * it guarantees that all bits within the initial blob of bytes + * will impact every bit of the output. + * This is not necessarily the case when using the blob of bytes directly + * because, when hashing _small_ inputs, only a portion of the secret is employed. + * + * @see @ref single_shot_example "Single Shot Example" for an example. + */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize); /******* Streaming *******/ +#ifndef XXH_NO_STREAM /* * Streaming requires state maintenance. * This operation costs memory and CPU. @@ -890,39 +1402,163 @@ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void* data, size_t le * All reset and streaming functions have same meaning as their 64-bit counterpart. */ -XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t* statePtr); -XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed); -XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize); +/*! + * @brief Resets an @ref XXH3_state_t to begin a new hash. + * + * @param statePtr The state struct to reset. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. 
+ * + * @note + * - This function resets `statePtr` and generate a secret with default parameters. + * - Call it before @ref XXH3_128bits_update(). + * - Digest will be equivalent to `XXH3_128bits()`. + */ +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr); + +/*! + * @brief Resets an @ref XXH3_state_t with 64-bit seed to begin a new hash. + * + * @param statePtr The state struct to reset. + * @param seed The 64-bit seed to alter the hash result predictably. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note + * - This function resets `statePtr` and generate a secret from `seed`. + * - Call it before @ref XXH3_128bits_update(). + * - Digest will be equivalent to `XXH3_128bits_withSeed()`. + */ +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed); +/*! + * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash. + * + * @param statePtr The state struct to reset. + * @param secret The secret data. + * @param secretSize The length of @p secret, in bytes. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * `secret` is referenced, it _must outlive_ the hash streaming session. + * Similar to one-shot API, `secretSize` must be >= @ref XXH3_SECRET_SIZE_MIN, + * and the quality of produced hash values depends on secret's entropy + * (secret's content should look like a bunch of random bytes). + * When in doubt about the randomness of a candidate `secret`, + * consider employing `XXH3_generateSecret()` instead (see below). + */ +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize); + +/*! + * @brief Consumes a block of @p input to an @ref XXH3_state_t. + * + * Call this to incrementally consume blocks of data. + * + * @param statePtr The state struct to update. + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + */ +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length); -XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH3_state_t* statePtr, const void* input, size_t length); -XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* statePtr); +/*! + * @brief Returns the calculated XXH3 128-bit hash value from an @ref XXH3_state_t. + * + * @param statePtr The state struct to calculate the hash from. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return The calculated XXH3 128-bit hash value from that state. + * + * @note + * Calling XXH3_128bits_digest() will not affect @p statePtr, so you can update, + * digest, and update again. + * + */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr); +#endif /* !XXH_NO_STREAM */ /* Following helper functions make it possible to compare XXH128_hast_t values. 
* Since XXH128_hash_t is a structure, this capability is not offered by the language. * Note: For better performance, these functions can be inlined using XXH_INLINE_ALL */ /*! - * XXH128_isEqual(): - * Return: 1 if `h1` and `h2` are equal, 0 if they are not. + * @brief Check equality of two XXH128_hash_t values + * + * @param h1 The 128-bit hash value. + * @param h2 Another 128-bit hash value. + * + * @return `1` if `h1` and `h2` are equal. + * @return `0` if they are not. */ -XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2); +XXH_PUBLIC_API XXH_PUREF int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2); /*! - * XXH128_cmp(): + * @brief Compares two @ref XXH128_hash_t * * This comparator is compatible with stdlib's `qsort()`/`bsearch()`. * - * return: >0 if *h128_1 > *h128_2 - * =0 if *h128_1 == *h128_2 - * <0 if *h128_1 < *h128_2 + * @param h128_1 Left-hand side value + * @param h128_2 Right-hand side value + * + * @return >0 if @p h128_1 > @p h128_2 + * @return =0 if @p h128_1 == @p h128_2 + * @return <0 if @p h128_1 < @p h128_2 */ -XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2); +XXH_PUBLIC_API XXH_PUREF int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2); /******* Canonical representation *******/ typedef struct { unsigned char digest[sizeof(XXH128_hash_t)]; } XXH128_canonical_t; -XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash); -XXH_PUBLIC_API XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t* src); + + +/*! + * @brief Converts an @ref XXH128_hash_t to a big endian @ref XXH128_canonical_t. + * + * @param dst The @ref XXH128_canonical_t pointer to be stored to. + * @param hash The @ref XXH128_hash_t to be converted. + * + * @pre + * @p dst must not be `NULL`. + * @see @ref canonical_representation_example "Canonical Representation Example" + */ +XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash); + +/*! + * @brief Converts an @ref XXH128_canonical_t to a native @ref XXH128_hash_t. + * + * @param src The @ref XXH128_canonical_t to convert. + * + * @pre + * @p src must not be `NULL`. + * + * @return The converted hash. + * @see @ref canonical_representation_example "Canonical Representation Example" + */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src); #endif /* !XXH_NO_XXH3 */ @@ -996,7 +1632,6 @@ struct XXH64_state_s { XXH64_hash_t reserved64; /*!< Reserved field. Do not read or write to it. */ }; /* typedef'd to XXH64_state_t */ - #ifndef XXH_NO_XXH3 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* >= C11 */ @@ -1032,6 +1667,7 @@ struct XXH64_state_s { #define XXH3_INTERNALBUFFER_SIZE 256 /*! + * @internal * @brief Default size of the secret buffer (and @ref XXH3_kSecret). * * This is the size used in @ref XXH3_kSecret and the seeded functions. @@ -1064,7 +1700,7 @@ struct XXH64_state_s { */ struct XXH3_state_s { XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]); - /*!< The 8 accumulators. Similar to `vN` in @ref XXH32_state_s::v1 and @ref XXH64_state_s */ + /*!< The 8 accumulators. See @ref XXH32_state_s::v and @ref XXH64_state_s::v */ XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]); /*!< Used to store a custom secret generated from a seed. 
*/ XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]); @@ -1104,69 +1740,148 @@ struct XXH3_state_s { * Note that this doesn't prepare the state for a streaming operation, * it's still necessary to use XXH3_NNbits_reset*() afterwards. */ -#define XXH3_INITSTATE(XXH3_state_ptr) { (XXH3_state_ptr)->seed = 0; } +#define XXH3_INITSTATE(XXH3_state_ptr) \ + do { \ + XXH3_state_t* tmp_xxh3_state_ptr = (XXH3_state_ptr); \ + tmp_xxh3_state_ptr->seed = 0; \ + tmp_xxh3_state_ptr->extSecret = NULL; \ + } while(0) -/* XXH128() : - * simple alias to pre-selected XXH3_128bits variant +/*! + * @brief Calculates the 128-bit hash of @p data using XXH3. + * + * @param data The block of data to be hashed, at least @p len bytes in size. + * @param len The length of @p data, in bytes. + * @param seed The 64-bit seed to alter the hash's output predictably. + * + * @pre + * The memory between @p data and @p data + @p len must be valid, + * readable, contiguous memory. However, if @p len is `0`, @p data may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return The calculated 128-bit XXH3 value. + * + * @see @ref single_shot_example "Single Shot Example" for an example. */ -XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t seed); +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed); /* === Experimental API === */ /* Symbols defined below must be considered tied to a specific library version. */ -/* - * XXH3_generateSecret(): +/*! + * @brief Derive a high-entropy secret from any user-defined content, named customSeed. + * + * @param secretBuffer A writable buffer for derived high-entropy secret data. + * @param secretSize Size of secretBuffer, in bytes. Must be >= XXH3_SECRET_DEFAULT_SIZE. + * @param customSeed A user-defined content. + * @param customSeedSize Size of customSeed, in bytes. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. * - * Derive a high-entropy secret from any user-defined content, named customSeed. * The generated secret can be used in combination with `*_withSecret()` functions. - * The `_withSecret()` variants are useful to provide a higher level of protection than 64-bit seed, - * as it becomes much more difficult for an external actor to guess how to impact the calculation logic. + * The `_withSecret()` variants are useful to provide a higher level of protection + * than 64-bit seed, as it becomes much more difficult for an external actor to + * guess how to impact the calculation logic. * * The function accepts as input a custom seed of any length and any content, - * and derives from it a high-entropy secret of length @secretSize - * into an already allocated buffer @secretBuffer. - * @secretSize must be >= XXH3_SECRET_SIZE_MIN + * and derives from it a high-entropy secret of length @p secretSize into an + * already allocated buffer @p secretBuffer. * * The generated secret can then be used with any `*_withSecret()` variant. - * Functions `XXH3_128bits_withSecret()`, `XXH3_64bits_withSecret()`, - * `XXH3_128bits_reset_withSecret()` and `XXH3_64bits_reset_withSecret()` + * The functions @ref XXH3_128bits_withSecret(), @ref XXH3_64bits_withSecret(), + * @ref XXH3_128bits_reset_withSecret() and @ref XXH3_64bits_reset_withSecret() * are part of this list. 
They all accept a `secret` parameter - * which must be large enough for implementation reasons (>= XXH3_SECRET_SIZE_MIN) + * which must be large enough for implementation reasons (>= @ref XXH3_SECRET_SIZE_MIN) * _and_ feature very high entropy (consist of random-looking bytes). - * These conditions can be a high bar to meet, so - * XXH3_generateSecret() can be employed to ensure proper quality. + * These conditions can be a high bar to meet, so @ref XXH3_generateSecret() can + * be employed to ensure proper quality. + * + * @p customSeed can be anything. It can have any size, even small ones, + * and its content can be anything, even "poor entropy" sources such as a bunch + * of zeroes. The resulting `secret` will nonetheless provide all required qualities. * - * customSeed can be anything. It can have any size, even small ones, - * and its content can be anything, even "poor entropy" sources such as a bunch of zeroes. - * The resulting `secret` will nonetheless provide all required qualities. + * @pre + * - @p secretSize must be >= @ref XXH3_SECRET_SIZE_MIN + * - When @p customSeedSize > 0, supplying NULL as customSeed is undefined behavior. * - * When customSeedSize > 0, supplying NULL as customSeed is undefined behavior. + * Example code: + * @code{.c} + * #include <stdio.h> + * #include <stdlib.h> + * #include <string.h> + * #define XXH_STATIC_LINKING_ONLY // expose unstable API + * #include "xxhash.h" + * // Hashes argv[2] using the entropy from argv[1]. + * int main(int argc, char* argv[]) + * { + * char secret[XXH3_SECRET_SIZE_MIN]; + * if (argv != 3) { return 1; } + * XXH3_generateSecret(secret, sizeof(secret), argv[1], strlen(argv[1])); + * XXH64_hash_t h = XXH3_64bits_withSecret( + * argv[2], strlen(argv[2]), + * secret, sizeof(secret) + * ); + * printf("%016llx\n", (unsigned long long) h); + * } + * @endcode */ -XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(void* secretBuffer, size_t secretSize, const void* customSeed, size_t customSeedSize); +XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize); - -/* - * XXH3_generateSecret_fromSeed(): - * - * Generate the same secret as the _withSeed() variants. +/*! + * @brief Generate the same secret as the _withSeed() variants. * - * The resulting secret has a length of XXH3_SECRET_DEFAULT_SIZE (necessarily). - * @secretBuffer must be already allocated, of size at least XXH3_SECRET_DEFAULT_SIZE bytes. + * @param secretBuffer A writable buffer of @ref XXH3_SECRET_SIZE_MIN bytes + * @param seed The 64-bit seed to alter the hash result predictably. * * The generated secret can be used in combination with *`*_withSecret()` and `_withSecretandSeed()` variants. - * This generator is notably useful in combination with `_withSecretandSeed()`, - * as a way to emulate a faster `_withSeed()` variant. + * + * Example C++ `std::string` hash class: + * @code{.cpp} + * #include <string> + * #define XXH_STATIC_LINKING_ONLY // expose unstable API + * #include "xxhash.h" + * // Slow, seeds each time + * class HashSlow { + * XXH64_hash_t seed; + * public: + * HashSlow(XXH64_hash_t s) : seed{s} {} + * size_t operator()(const std::string& x) const { + * return size_t{XXH3_64bits_withSeed(x.c_str(), x.length(), seed)}; + * } + * }; + * // Fast, caches the seeded secret for future uses. 
+ * class HashFast { + * unsigned char secret[XXH3_SECRET_SIZE_MIN]; + * public: + * HashFast(XXH64_hash_t s) { + * XXH3_generateSecret_fromSeed(secret, seed); + * } + * size_t operator()(const std::string& x) const { + * return size_t{ + * XXH3_64bits_withSecret(x.c_str(), x.length(), secret, sizeof(secret)) + * }; + * } + * }; + * @endcode */ -XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_t seed); +XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(XXH_NOESCAPE void* secretBuffer, XXH64_hash_t seed); -/* - * *_withSecretandSeed() : +/*! + * @brief Calculates 64/128-bit seeded variant of XXH3 hash of @p data. + * + * @param data The block of data to be hashed, at least @p len bytes in size. + * @param len The length of @p data, in bytes. + * @param secret The secret data. + * @param secretSize The length of @p secret, in bytes. + * @param seed The 64-bit seed to alter the hash result predictably. + * * These variants generate hash values using either - * @seed for "short" keys (< XXH3_MIDSIZE_MAX = 240 bytes) - * or @secret for "large" keys (>= XXH3_MIDSIZE_MAX). + * @p seed for "short" keys (< @ref XXH3_MIDSIZE_MAX = 240 bytes) + * or @p secret for "large" keys (>= @ref XXH3_MIDSIZE_MAX). * * This generally benefits speed, compared to `_withSeed()` or `_withSecret()`. * `_withSeed()` has to generate the secret on the fly for "large" keys. @@ -1175,7 +1890,7 @@ XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_ * which requires more instructions than _withSeed() variants. * Therefore, _withSecretandSeed variant combines the best of both worlds. * - * When @secret has been generated by XXH3_generateSecret_fromSeed(), + * When @p secret has been generated by XXH3_generateSecret_fromSeed(), * this variant produces *exactly* the same results as `_withSeed()` variant, * hence offering only a pure speed benefit on "large" input, * by skipping the need to regenerate the secret for every large input. @@ -1184,33 +1899,71 @@ XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_ * for example with XXH3_64bits(), which then becomes the seed, * and then employ both the seed and the secret in _withSecretandSeed(). * On top of speed, an added benefit is that each bit in the secret - * has a 50% chance to swap each bit in the output, - * via its impact to the seed. + * has a 50% chance to swap each bit in the output, via its impact to the seed. + * * This is not guaranteed when using the secret directly in "small data" scenarios, * because only portions of the secret are employed for small data. */ -XXH_PUBLIC_API XXH64_hash_t -XXH3_64bits_withSecretandSeed(const void* data, size_t len, - const void* secret, size_t secretSize, +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t +XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* data, size_t len, + XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed); - -XXH_PUBLIC_API XXH128_hash_t -XXH3_128bits_withSecretandSeed(const void* data, size_t len, - const void* secret, size_t secretSize, +/*! + * @brief Calculates 128-bit seeded variant of XXH3 hash of @p data. + * + * @param input The block of data to be hashed, at least @p len bytes in size. + * @param length The length of @p data, in bytes. + * @param secret The secret data. + * @param secretSize The length of @p secret, in bytes. + * @param seed64 The 64-bit seed to alter the hash result predictably. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. 
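 *
 * A rough sketch of the pattern described above (the names, the seed value
 * and the one-time init function are only illustrative): derive the secret
 * from the seed once, then reuse both for every input:
 * @code{.c}
 * #define XXH_STATIC_LINKING_ONLY   // the secret helpers are unstable API
 * #include "xxhash.h"
 *
 * static unsigned char g_secret[XXH3_SECRET_DEFAULT_SIZE];
 * static XXH64_hash_t  g_seed = 12345;                 // arbitrary seed
 *
 * void init_hashing(void)
 * {
 *     XXH3_generateSecret_fromSeed(g_secret, g_seed);  // done once
 * }
 *
 * XXH64_hash_t hash_item(const void* data, size_t len)
 * {
 *     // Matches XXH3_64bits_withSeed(data, len, g_seed), but skips
 *     // regenerating the secret for every "large" input.
 *     return XXH3_64bits_withSecretandSeed(data, len,
 *                                          g_secret, sizeof(g_secret), g_seed);
 * }
 * @endcode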
+ * + * @see XXH3_64bits_withSecretandSeed() + */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t +XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length, + XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed64); - +#ifndef XXH_NO_STREAM +/*! + * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash. + * + * @param statePtr A pointer to an @ref XXH3_state_t allocated with @ref XXH3_createState(). + * @param secret The secret data. + * @param secretSize The length of @p secret, in bytes. + * @param seed64 The 64-bit seed to alter the hash result predictably. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @see XXH3_64bits_withSecretandSeed() + */ XXH_PUBLIC_API XXH_errorcode -XXH3_64bits_reset_withSecretandSeed(XXH3_state_t* statePtr, - const void* secret, size_t secretSize, +XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, + XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed64); - +/*! + * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash. + * + * @param statePtr A pointer to an @ref XXH3_state_t allocated with @ref XXH3_createState(). + * @param secret The secret data. + * @param secretSize The length of @p secret, in bytes. + * @param seed64 The 64-bit seed to alter the hash result predictably. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @see XXH3_64bits_withSecretandSeed() + */ XXH_PUBLIC_API XXH_errorcode -XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr, - const void* secret, size_t secretSize, +XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, + XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed64); +#endif /* !XXH_NO_STREAM */ - -#endif /* XXH_NO_XXH3 */ +#endif /* !XXH_NO_XXH3 */ #endif /* XXH_NO_LONG_LONG */ #if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) # define XXH_IMPLEMENTATION @@ -1264,7 +2017,7 @@ XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr, /*! * @brief Define this to disable 64-bit code. * - * Useful if only using the @ref xxh32_family and you have a strict C90 compiler. + * Useful if only using the @ref XXH32_family and you have a strict C90 compiler. */ # define XXH_NO_LONG_LONG # undef XXH_NO_LONG_LONG /* don't actually */ @@ -1287,7 +2040,7 @@ XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr, * Use `memcpy()`. Safe and portable. Note that most modern compilers will * eliminate the function call and treat it as an unaligned access. * - * - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((packed))` + * - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((aligned(1)))` * @par * Depends on compiler extensions and is therefore not portable. * This method is safe _if_ your compiler supports it, @@ -1307,7 +2060,7 @@ XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr, * inline small `memcpy()` calls, and it might also be faster on big-endian * systems which lack a native byteswap instruction. However, some compilers * will emit literal byteshifts even if the target supports unaligned access. - * . + * * * @warning * Methods 1 and 2 rely on implementation-defined behavior. Use these with @@ -1321,6 +2074,34 @@ XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr, # define XXH_FORCE_MEMORY_ACCESS 0 /*! + * @def XXH_SIZE_OPT + * @brief Controls how much xxHash optimizes for size. + * + * xxHash, when compiled, tends to result in a rather large binary size. 
This + * is mostly due to heavy usage to forced inlining and constant folding of the + * @ref XXH3_family to increase performance. + * + * However, some developers prefer size over speed. This option can + * significantly reduce the size of the generated code. When using the `-Os` + * or `-Oz` options on GCC or Clang, this is defined to 1 by default, + * otherwise it is defined to 0. + * + * Most of these size optimizations can be controlled manually. + * + * This is a number from 0-2. + * - `XXH_SIZE_OPT` == 0: Default. xxHash makes no size optimizations. Speed + * comes first. + * - `XXH_SIZE_OPT` == 1: Default for `-Os` and `-Oz`. xxHash is more + * conservative and disables hacks that increase code size. It implies the + * options @ref XXH_NO_INLINE_HINTS == 1, @ref XXH_FORCE_ALIGN_CHECK == 0, + * and @ref XXH3_NEON_LANES == 8 if they are not already defined. + * - `XXH_SIZE_OPT` == 2: xxHash tries to make itself as small as possible. + * Performance may cry. For example, the single shot functions just use the + * streaming API. + */ +# define XXH_SIZE_OPT 0 + +/*! * @def XXH_FORCE_ALIGN_CHECK * @brief If defined to non-zero, adds a special path for aligned inputs (XXH32() * and XXH64() only). @@ -1341,9 +2122,11 @@ XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr, * * In these cases, the alignment check can be removed by setting this macro to 0. * Then the code will always use unaligned memory access. - * Align check is automatically disabled on x86, x64 & arm64, + * Align check is automatically disabled on x86, x64, ARM64, and some ARM chips * which are platforms known to offer good unaligned memory accesses performance. * + * It is also disabled by default when @ref XXH_SIZE_OPT >= 1. + * * This option does not affect XXH3 (only XXH32 and XXH64). */ # define XXH_FORCE_ALIGN_CHECK 0 @@ -1365,12 +2148,29 @@ XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr, * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the * compiler full control on whether to inline or not. * - * When not optimizing (-O0), optimizing for size (-Os, -Oz), or using - * -fno-inline with GCC or Clang, this will automatically be defined. + * When not optimizing (-O0), using `-fno-inline` with GCC or Clang, or if + * @ref XXH_SIZE_OPT >= 1, this will automatically be defined. */ # define XXH_NO_INLINE_HINTS 0 /*! + * @def XXH3_INLINE_SECRET + * @brief Determines whether to inline the XXH3 withSecret code. + * + * When the secret size is known, the compiler can improve the performance + * of XXH3_64bits_withSecret() and XXH3_128bits_withSecret(). + * + * However, if the secret size is not known, it doesn't have any benefit. This + * happens when xxHash is compiled into a global symbol. Therefore, if + * @ref XXH_INLINE_ALL is *not* defined, this will be defined to 0. + * + * Additionally, this defaults to 0 on GCC 12+, which has an issue with function pointers + * that are *sometimes* force inline on -Og, and it is impossible to automatically + * detect this optimization level. + */ +# define XXH3_INLINE_SECRET 0 + +/*! * @def XXH32_ENDJMP * @brief Whether to use a jump for `XXH32_finalize`. * @@ -1391,34 +2191,45 @@ XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr, */ # define XXH_OLD_NAMES # undef XXH_OLD_NAMES /* don't actually use, it is ugly. */ + +/*! + * @def XXH_NO_STREAM + * @brief Disables the streaming API. 
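 *
 * Like the other build macros above, this is normally defined project-wide or
 * just before the header is included; a sketch (which macros to set depends
 * entirely on the project):
 * @code{.c}
 * // Either on the compiler command line, e.g.
 * //   -DXXH_NO_STREAM -DXXH_SIZE_OPT=2
 * // or before the header is pulled in:
 * #define XXH_NO_STREAM     // drop the streaming API
 * #define XXH_SIZE_OPT 2    // favour size over speed
 * #include "xxhash.h"
 * @endcode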
+ * + * When xxHash is not inlined and the streaming functions are not used, disabling + * the streaming functions can improve code size significantly, especially with + * the @ref XXH3_family which tends to make constant folded copies of itself. + */ +# define XXH_NO_STREAM +# undef XXH_NO_STREAM /* don't actually */ #endif /* XXH_DOXYGEN */ /*! * @} */ #ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ - /* prefer __packed__ structures (method 1) for gcc on armv7+ and mips */ -# if !defined(__clang__) && \ -( \ - (defined(__INTEL_COMPILER) && !defined(_WIN32)) || \ - ( \ - defined(__GNUC__) && ( \ - (defined(__ARM_ARCH) && __ARM_ARCH >= 7) || \ - ( \ - defined(__mips__) && \ - (__mips <= 5 || __mips_isa_rev < 6) && \ - (!defined(__mips16) || defined(__mips_mips16e2)) \ - ) \ - ) \ - ) \ -) + /* prefer __packed__ structures (method 1) for GCC + * < ARMv7 with unaligned access (e.g. Raspbian armhf) still uses byte shifting, so we use memcpy + * which for some reason does unaligned loads. */ +# if defined(__GNUC__) && !(defined(__ARM_ARCH) && __ARM_ARCH < 7 && defined(__ARM_FEATURE_UNALIGNED)) # define XXH_FORCE_MEMORY_ACCESS 1 # endif #endif +#ifndef XXH_SIZE_OPT + /* default to 1 for -Os or -Oz */ +# if (defined(__GNUC__) || defined(__clang__)) && defined(__OPTIMIZE_SIZE__) +# define XXH_SIZE_OPT 1 +# else +# define XXH_SIZE_OPT 0 +# endif +#endif + #ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */ -# if defined(__i386) || defined(__x86_64__) || defined(__aarch64__) \ - || defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64) /* visual */ + /* don't check on sizeopt, x86, aarch64, or arm when unaligned access is available */ +# if XXH_SIZE_OPT >= 1 || \ + defined(__i386) || defined(__x86_64__) || defined(__aarch64__) || defined(__ARM_FEATURE_UNALIGNED) \ + || defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64) || defined(_M_ARM) /* visual */ # define XXH_FORCE_ALIGN_CHECK 0 # else # define XXH_FORCE_ALIGN_CHECK 1 @@ -1426,14 +2237,22 @@ XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr, #endif #ifndef XXH_NO_INLINE_HINTS -# if defined(__OPTIMIZE_SIZE__) /* -Os, -Oz */ \ - || defined(__NO_INLINE__) /* -O0, -fno-inline */ +# if XXH_SIZE_OPT >= 1 || defined(__NO_INLINE__) /* -O0, -fno-inline */ # define XXH_NO_INLINE_HINTS 1 # else # define XXH_NO_INLINE_HINTS 0 # endif #endif +#ifndef XXH3_INLINE_SECRET +# if (defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 12) \ + || !defined(XXH_INLINE_ALL) +# define XXH3_INLINE_SECRET 0 +# else +# define XXH3_INLINE_SECRET 1 +# endif +#endif + #ifndef XXH32_ENDJMP /* generally preferable for performance */ # define XXH32_ENDJMP 0 @@ -1448,13 +2267,56 @@ XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr, /* ************************************* * Includes & Memory related functions ***************************************/ -/* Modify the local functions below should you wish to use some other memory routines */ -/* for ZSTD_malloc(), ZSTD_free() */ -#define ZSTD_DEPS_NEED_MALLOC -#include "zstd_deps.h" /* size_t, ZSTD_malloc, ZSTD_free, ZSTD_memcpy */ -static void* XXH_malloc(size_t s) { return ZSTD_malloc(s); } -static void XXH_free (void* p) { ZSTD_free(p); } -static void* XXH_memcpy(void* dest, const void* src, size_t size) { return ZSTD_memcpy(dest,src,size); } +#if defined(XXH_NO_STREAM) +/* nothing */ +#elif defined(XXH_NO_STDLIB) + +/* When requesting to disable any mention of stdlib, + * the library loses the ability to invoked malloc / free. 
+ * In practice, it means that functions like `XXH*_createState()` + * will always fail, and return NULL. + * This flag is useful in situations where + * xxhash.h is integrated into some kernel, embedded or limited environment + * without access to dynamic allocation. + */ + +static XXH_CONSTF void* XXH_malloc(size_t s) { (void)s; return NULL; } +static void XXH_free(void* p) { (void)p; } + +#else + +/* + * Modify the local functions below should you wish to use + * different memory routines for malloc() and free() + */ +#include <stdlib.h> + +/*! + * @internal + * @brief Modify this function to use a different routine than malloc(). + */ +static XXH_MALLOCF void* XXH_malloc(size_t s) { return malloc(s); } + +/*! + * @internal + * @brief Modify this function to use a different routine than free(). + */ +static void XXH_free(void* p) { free(p); } + +#endif /* XXH_NO_STDLIB */ + +#include <string.h> + +/*! + * @internal + * @brief Modify this function to use a different routine than memcpy(). + */ +static void* XXH_memcpy(void* dest, const void* src, size_t size) +{ + return memcpy(dest,src,size); +} + +#include <limits.h> /* ULLONG_MAX */ /* ************************************* @@ -1487,6 +2349,11 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size) { return ZSTD_ # define XXH_NO_INLINE static #endif +#if XXH3_INLINE_SECRET +# define XXH3_WITH_SECRET_INLINE XXH_FORCE_INLINE +#else +# define XXH3_WITH_SECRET_INLINE XXH_NO_INLINE +#endif /* ************************************* @@ -1512,14 +2379,17 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size) { return ZSTD_ # include <assert.h> /* note: can still be disabled with NDEBUG */ # define XXH_ASSERT(c) assert(c) #else -# define XXH_ASSERT(c) ((void)0) +# if defined(__INTEL_COMPILER) +# define XXH_ASSERT(c) XXH_ASSUME((unsigned char) (c)) +# else +# define XXH_ASSERT(c) XXH_ASSUME(c) +# endif #endif /* note: use after variable declarations */ #ifndef XXH_STATIC_ASSERT # if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11 */ -# include <assert.h> -# define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { static_assert((c),m); } while(0) +# define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { _Static_assert((c),m); } while(0) # elif defined(__cplusplus) && (__cplusplus >= 201103L) /* C++11 */ # define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { static_assert((c),m); } while(0) # else @@ -1534,7 +2404,7 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size) { return ZSTD_ * @brief Used to prevent unwanted optimizations for @p var. * * It uses an empty GCC inline assembly statement with a register constraint - * which forces @p var into a general purpose register (e.g. eax, ebx, ecx + * which forces @p var into a general purpose register (eg eax, ebx, ecx * on x86) and marks it as modified. * * This is used in a few places to avoid unwanted autovectorization (e.g. @@ -1545,18 +2415,30 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size) { return ZSTD_ * XXH3_initCustomSecret_scalar(). */ #if defined(__GNUC__) || defined(__clang__) -# define XXH_COMPILER_GUARD(var) __asm__ __volatile__("" : "+r" (var)) +# define XXH_COMPILER_GUARD(var) __asm__("" : "+r" (var)) #else # define XXH_COMPILER_GUARD(var) ((void)0) #endif +/* Specifically for NEON vectors which use the "w" constraint, on + * Clang. 
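Editorial aside, not part of the upstream patch: XXH_COMPILER_GUARD above is an empty inline-asm statement whose only effect is telling GCC/Clang that the variable is read and may be rewritten in a general-purpose register; it emits no instructions but acts as an optimization barrier. A standalone sketch of the same technique (the function and constants are illustrative only):

static unsigned mix_without_autovec(unsigned acc, unsigned input)
{
    acc += input * 2654435761u;
#if defined(__GNUC__) || defined(__clang__)
    /* opaque to the optimizer: blocks constant folding and autovectorization across this point */
    __asm__("" : "+r" (acc));
#endif
    return acc * 2246822519u;
}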
*/ +#if defined(__clang__) && defined(__ARM_ARCH) && !defined(__wasm__) +# define XXH_COMPILER_GUARD_CLANG_NEON(var) __asm__("" : "+w" (var)) +#else +# define XXH_COMPILER_GUARD_CLANG_NEON(var) ((void)0) +#endif + /* ************************************* * Basic Types ***************************************/ #if !defined (__VMS) \ && (defined (__cplusplus) \ || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) -# include <stdint.h> +# ifdef _AIX +# include <inttypes.h> +# else +# include <stdint.h> +# endif typedef uint8_t xxh_u8; #else typedef unsigned char xxh_u8; @@ -1564,6 +2446,7 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size) { return ZSTD_ typedef XXH32_hash_t xxh_u32; #ifdef XXH_OLD_NAMES +# warning "XXH_OLD_NAMES is planned to be removed starting v0.9. If the program depends on it, consider moving away from it by employing newer type names directly" # define BYTE xxh_u8 # define U8 xxh_u8 # define U32 xxh_u32 @@ -1637,18 +2520,19 @@ static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; #elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) /* - * __pack instructions are safer but compiler specific, hence potentially - * problematic for some compilers. - * - * Currently only defined for GCC and ICC. + * __attribute__((aligned(1))) is supported by gcc and clang. Originally the + * documentation claimed that it only increased the alignment, but actually it + * can decrease it on gcc, clang, and icc: + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502, + * https://gcc.godbolt.org/z/xYez1j67Y. */ #ifdef XXH_OLD_NAMES typedef union { xxh_u32 u32; } __attribute__((packed)) unalign; #endif static xxh_u32 XXH_read32(const void* ptr) { - typedef union { xxh_u32 u32; } __attribute__((packed)) xxh_unalign; - return ((const xxh_unalign*)ptr)->u32; + typedef __attribute__((aligned(1))) xxh_u32 xxh_unalign32; + return *((const xxh_unalign32*)ptr); } #else @@ -1731,6 +2615,51 @@ static int XXH_isLittleEndian(void) # define XXH_HAS_BUILTIN(x) 0 #endif + + +/* + * C23 and future versions have standard "unreachable()". + * Once it has been implemented reliably we can add it as an + * additional case: + * + * ``` + * #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= XXH_C23_VN) + * # include <stddef.h> + * # ifdef unreachable + * # define XXH_UNREACHABLE() unreachable() + * # endif + * #endif + * ``` + * + * Note C++23 also has std::unreachable() which can be detected + * as follows: + * ``` + * #if defined(__cpp_lib_unreachable) && (__cpp_lib_unreachable >= 202202L) + * # include <utility> + * # define XXH_UNREACHABLE() std::unreachable() + * #endif + * ``` + * NB: `__cpp_lib_unreachable` is defined in the `<version>` header. + * We don't use that as including `<utility>` in `extern "C"` blocks + * doesn't work on GCC12 + */ + +#if XXH_HAS_BUILTIN(__builtin_unreachable) +# define XXH_UNREACHABLE() __builtin_unreachable() + +#elif defined(_MSC_VER) +# define XXH_UNREACHABLE() __assume(0) + +#else +# define XXH_UNREACHABLE() +#endif + +#if XXH_HAS_BUILTIN(__builtin_assume) +# define XXH_ASSUME(c) __builtin_assume(c) +#else +# define XXH_ASSUME(c) if (!(c)) { XXH_UNREACHABLE(); } +#endif + /*! * @internal * @def XXH_rotl32(x,r) @@ -1853,8 +2782,10 @@ XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; } *********************************************************************/ /*! 
* @} - * @defgroup xxh32_impl XXH32 implementation + * @defgroup XXH32_impl XXH32 implementation * @ingroup impl + * + * Details on the XXH32 implementation. * @{ */ /* #define instead of static const, to be used as initializers */ @@ -1888,7 +2819,7 @@ static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input) acc += input * XXH_PRIME32_2; acc = XXH_rotl32(acc, 13); acc *= XXH_PRIME32_1; -#if (defined(__SSE4_1__) || defined(__aarch64__)) && !defined(XXH_ENABLE_AUTOVECTORIZE) +#if (defined(__SSE4_1__) || defined(__aarch64__) || defined(__wasm_simd128__)) && !defined(XXH_ENABLE_AUTOVECTORIZE) /* * UGLY HACK: * A compiler fence is the only thing that prevents GCC and Clang from @@ -1918,9 +2849,12 @@ static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input) * can load data, while v3 can multiply. SSE forces them to operate * together. * - * This is also enabled on AArch64, as Clang autovectorizes it incorrectly - * and it is pointless writing a NEON implementation that is basically the - * same speed as scalar for XXH32. + * This is also enabled on AArch64, as Clang is *very aggressive* in vectorizing + * the loop. NEON is only faster on the A53, and with the newer cores, it is less + * than half the speed. + * + * Additionally, this is used on WASM SIMD128 because it JITs to the same + * SIMD instructions and has the same issue. */ XXH_COMPILER_GUARD(acc); #endif @@ -1934,17 +2868,17 @@ static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input) * The final mix ensures that all input bits have a chance to impact any bit in * the output digest, resulting in an unbiased distribution. * - * @param h32 The hash to avalanche. + * @param hash The hash to avalanche. * @return The avalanched hash. */ -static xxh_u32 XXH32_avalanche(xxh_u32 h32) +static xxh_u32 XXH32_avalanche(xxh_u32 hash) { - h32 ^= h32 >> 15; - h32 *= XXH_PRIME32_2; - h32 ^= h32 >> 13; - h32 *= XXH_PRIME32_3; - h32 ^= h32 >> 16; - return(h32); + hash ^= hash >> 15; + hash *= XXH_PRIME32_2; + hash ^= hash >> 13; + hash *= XXH_PRIME32_3; + hash ^= hash >> 16; + return hash; } #define XXH_get32bits(p) XXH_readLE32_align(p, align) @@ -1957,24 +2891,25 @@ static xxh_u32 XXH32_avalanche(xxh_u32 h32) * This final stage will digest them to ensure that all input bytes are present * in the final mix. * - * @param h32 The hash to finalize. + * @param hash The hash to finalize. * @param ptr The pointer to the remaining input. * @param len The remaining length, modulo 16. * @param align Whether @p ptr is aligned. * @return The finalized hash. + * @see XXH64_finalize(). 
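Editorial aside, not part of the upstream patch: "avalanche" in the comments above means that flipping a single input bit should flip roughly half of the 32 output bits. A quick, hypothetical way to observe this through the public API:

#include <stdio.h>
#include "xxhash.h"

static unsigned popcount32(XXH32_hash_t v)
{
    unsigned n = 0;
    for (; v != 0; v >>= 1) n += (unsigned)(v & 1);
    return n;
}

static void avalanche_demo(void)
{
    unsigned char a[4] = { 0x00, 0x11, 0x22, 0x33 };
    unsigned char b[4] = { 0x01, 0x11, 0x22, 0x33 };   /* differs from a in exactly one bit */
    XXH32_hash_t const ha = XXH32(a, sizeof(a), 0);
    XXH32_hash_t const hb = XXH32(b, sizeof(b), 0);
    printf("output bits changed: %u of 32\n", popcount32(ha ^ hb));   /* typically around 16 */
}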
*/ -static xxh_u32 -XXH32_finalize(xxh_u32 h32, const xxh_u8* ptr, size_t len, XXH_alignment align) +static XXH_PUREF xxh_u32 +XXH32_finalize(xxh_u32 hash, const xxh_u8* ptr, size_t len, XXH_alignment align) { -#define XXH_PROCESS1 do { \ - h32 += (*ptr++) * XXH_PRIME32_5; \ - h32 = XXH_rotl32(h32, 11) * XXH_PRIME32_1; \ +#define XXH_PROCESS1 do { \ + hash += (*ptr++) * XXH_PRIME32_5; \ + hash = XXH_rotl32(hash, 11) * XXH_PRIME32_1; \ } while (0) -#define XXH_PROCESS4 do { \ - h32 += XXH_get32bits(ptr) * XXH_PRIME32_3; \ - ptr += 4; \ - h32 = XXH_rotl32(h32, 17) * XXH_PRIME32_4; \ +#define XXH_PROCESS4 do { \ + hash += XXH_get32bits(ptr) * XXH_PRIME32_3; \ + ptr += 4; \ + hash = XXH_rotl32(hash, 17) * XXH_PRIME32_4; \ } while (0) if (ptr==NULL) XXH_ASSERT(len == 0); @@ -1990,49 +2925,49 @@ XXH32_finalize(xxh_u32 h32, const xxh_u8* ptr, size_t len, XXH_alignment align) XXH_PROCESS1; --len; } - return XXH32_avalanche(h32); + return XXH32_avalanche(hash); } else { switch(len&15) /* or switch(bEnd - p) */ { case 12: XXH_PROCESS4; - XXH_FALLTHROUGH; + XXH_FALLTHROUGH; /* fallthrough */ case 8: XXH_PROCESS4; - XXH_FALLTHROUGH; + XXH_FALLTHROUGH; /* fallthrough */ case 4: XXH_PROCESS4; - return XXH32_avalanche(h32); + return XXH32_avalanche(hash); case 13: XXH_PROCESS4; - XXH_FALLTHROUGH; + XXH_FALLTHROUGH; /* fallthrough */ case 9: XXH_PROCESS4; - XXH_FALLTHROUGH; + XXH_FALLTHROUGH; /* fallthrough */ case 5: XXH_PROCESS4; XXH_PROCESS1; - return XXH32_avalanche(h32); + return XXH32_avalanche(hash); case 14: XXH_PROCESS4; - XXH_FALLTHROUGH; + XXH_FALLTHROUGH; /* fallthrough */ case 10: XXH_PROCESS4; - XXH_FALLTHROUGH; + XXH_FALLTHROUGH; /* fallthrough */ case 6: XXH_PROCESS4; XXH_PROCESS1; XXH_PROCESS1; - return XXH32_avalanche(h32); + return XXH32_avalanche(hash); case 15: XXH_PROCESS4; - XXH_FALLTHROUGH; + XXH_FALLTHROUGH; /* fallthrough */ case 11: XXH_PROCESS4; - XXH_FALLTHROUGH; + XXH_FALLTHROUGH; /* fallthrough */ case 7: XXH_PROCESS4; - XXH_FALLTHROUGH; + XXH_FALLTHROUGH; /* fallthrough */ case 3: XXH_PROCESS1; - XXH_FALLTHROUGH; + XXH_FALLTHROUGH; /* fallthrough */ case 2: XXH_PROCESS1; - XXH_FALLTHROUGH; + XXH_FALLTHROUGH; /* fallthrough */ case 1: XXH_PROCESS1; - XXH_FALLTHROUGH; - case 0: return XXH32_avalanche(h32); + XXH_FALLTHROUGH; /* fallthrough */ + case 0: return XXH32_avalanche(hash); } XXH_ASSERT(0); - return h32; /* reaching this point is deemed impossible */ + return hash; /* reaching this point is deemed impossible */ } } @@ -2052,7 +2987,7 @@ XXH32_finalize(xxh_u32 h32, const xxh_u8* ptr, size_t len, XXH_alignment align) * @param align Whether @p input is aligned. * @return The calculated hash. */ -XXH_FORCE_INLINE xxh_u32 +XXH_FORCE_INLINE XXH_PUREF xxh_u32 XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align) { xxh_u32 h32; @@ -2085,10 +3020,10 @@ XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment return XXH32_finalize(h32, input, len&15, align); } -/*! @ingroup xxh32_family */ +/*! @ingroup XXH32_family */ XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed) { -#if 0 +#if !defined(XXH_NO_STREAM) && XXH_SIZE_OPT >= 2 /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ XXH32_state_t state; XXH32_reset(&state, seed); @@ -2107,27 +3042,26 @@ XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t s /******* Hash streaming *******/ -/*! - * @ingroup xxh32_family - */ +#ifndef XXH_NO_STREAM +/*! 
@ingroup XXH32_family */ XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void) { return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t)); } -/*! @ingroup xxh32_family */ +/*! @ingroup XXH32_family */ XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr) { XXH_free(statePtr); return XXH_OK; } -/*! @ingroup xxh32_family */ +/*! @ingroup XXH32_family */ XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState) { XXH_memcpy(dstState, srcState, sizeof(*dstState)); } -/*! @ingroup xxh32_family */ +/*! @ingroup XXH32_family */ XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed) { XXH_ASSERT(statePtr != NULL); @@ -2140,7 +3074,7 @@ XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t s } -/*! @ingroup xxh32_family */ +/*! @ingroup XXH32_family */ XXH_PUBLIC_API XXH_errorcode XXH32_update(XXH32_state_t* state, const void* input, size_t len) { @@ -2195,7 +3129,7 @@ XXH32_update(XXH32_state_t* state, const void* input, size_t len) } -/*! @ingroup xxh32_family */ +/*! @ingroup XXH32_family */ XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t* state) { xxh_u32 h32; @@ -2213,31 +3147,18 @@ XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t* state) return XXH32_finalize(h32, (const xxh_u8*)state->mem32, state->memsize, XXH_aligned); } - +#endif /* !XXH_NO_STREAM */ /******* Canonical representation *******/ -/*! - * @ingroup xxh32_family - * The default return values from XXH functions are unsigned 32 and 64 bit - * integers. - * - * The canonical representation uses big endian convention, the same convention - * as human-readable numbers (large digits first). - * - * This way, hash values can be written into a file or buffer, remaining - * comparable across different systems. - * - * The following functions allow transformation of hash values to and from their - * canonical format. - */ +/*! @ingroup XXH32_family */ XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash) { - /* XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t)); */ + XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t)); if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash); XXH_memcpy(dst, &hash, sizeof(*dst)); } -/*! @ingroup xxh32_family */ +/*! @ingroup XXH32_family */ XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src) { return XXH_readBE32(src); @@ -2278,18 +3199,19 @@ static xxh_u64 XXH_read64(const void* memPtr) #elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) /* - * __pack instructions are safer, but compiler specific, hence potentially - * problematic for some compilers. - * - * Currently only defined for GCC and ICC. + * __attribute__((aligned(1))) is supported by gcc and clang. Originally the + * documentation claimed that it only increased the alignment, but actually it + * can decrease it on gcc, clang, and icc: + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502, + * https://gcc.godbolt.org/z/xYez1j67Y. 
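Editorial aside, not part of the upstream patch: the comment above describes the aligned(1) typedef route for unaligned reads; memcpy() is the fully standard alternative this header also supports. A minimal sketch of both, assuming GCC/Clang for the attribute version:

#include <stdint.h>
#include <string.h>

static uint32_t read32_via_memcpy(const void* p)
{
    uint32_t v;
    memcpy(&v, p, sizeof(v));   /* always well-defined; compilers usually lower this to a single load */
    return v;
}

#if defined(__GNUC__) || defined(__clang__)
typedef __attribute__((aligned(1))) uint32_t unaligned_u32;   /* lowers the type's alignment requirement to 1 */

static uint32_t read32_via_attribute(const void* p)
{
    return *(const unaligned_u32*)p;   /* plain dereference, but the compiler knows p may be misaligned */
}
#endif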
*/ #ifdef XXH_OLD_NAMES typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64; #endif static xxh_u64 XXH_read64(const void* ptr) { - typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) xxh_unalign64; - return ((const xxh_unalign64*)ptr)->u64; + typedef __attribute__((aligned(1))) xxh_u64 xxh_unalign64; + return *((const xxh_unalign64*)ptr); } #else @@ -2380,8 +3302,10 @@ XXH_readLE64_align(const void* ptr, XXH_alignment align) /******* xxh64 *******/ /*! * @} - * @defgroup xxh64_impl XXH64 implementation + * @defgroup XXH64_impl XXH64 implementation * @ingroup impl + * + * Details on the XXH64 implementation. * @{ */ /* #define rather that static const, to be used as initializers */ @@ -2399,11 +3323,29 @@ XXH_readLE64_align(const void* ptr, XXH_alignment align) # define PRIME64_5 XXH_PRIME64_5 #endif +/*! @copydoc XXH32_round */ static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input) { acc += input * XXH_PRIME64_2; acc = XXH_rotl64(acc, 31); acc *= XXH_PRIME64_1; +#if (defined(__AVX512F__)) && !defined(XXH_ENABLE_AUTOVECTORIZE) + /* + * DISABLE AUTOVECTORIZATION: + * A compiler fence is used to prevent GCC and Clang from + * autovectorizing the XXH64 loop (pragmas and attributes don't work for some + * reason) without globally disabling AVX512. + * + * Autovectorization of XXH64 tends to be detrimental, + * though the exact outcome may change depending on exact cpu and compiler version. + * For information, it has been reported as detrimental for Skylake-X, + * but possibly beneficial for Zen4. + * + * The default is to disable auto-vectorization, + * but you can select to enable it instead using `XXH_ENABLE_AUTOVECTORIZE` build variable. + */ + XXH_COMPILER_GUARD(acc); +#endif return acc; } @@ -2415,43 +3357,59 @@ static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val) return acc; } -static xxh_u64 XXH64_avalanche(xxh_u64 h64) +/*! @copydoc XXH32_avalanche */ +static xxh_u64 XXH64_avalanche(xxh_u64 hash) { - h64 ^= h64 >> 33; - h64 *= XXH_PRIME64_2; - h64 ^= h64 >> 29; - h64 *= XXH_PRIME64_3; - h64 ^= h64 >> 32; - return h64; + hash ^= hash >> 33; + hash *= XXH_PRIME64_2; + hash ^= hash >> 29; + hash *= XXH_PRIME64_3; + hash ^= hash >> 32; + return hash; } #define XXH_get64bits(p) XXH_readLE64_align(p, align) -static xxh_u64 -XXH64_finalize(xxh_u64 h64, const xxh_u8* ptr, size_t len, XXH_alignment align) +/*! + * @internal + * @brief Processes the last 0-31 bytes of @p ptr. + * + * There may be up to 31 bytes remaining to consume from the input. + * This final stage will digest them to ensure that all input bytes are present + * in the final mix. + * + * @param hash The hash to finalize. + * @param ptr The pointer to the remaining input. + * @param len The remaining length, modulo 32. + * @param align Whether @p ptr is aligned. + * @return The finalized hash + * @see XXH32_finalize(). 
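Editorial aside, not part of the upstream patch: the one-shot XXH64() and the streaming API that follow produce identical digests for the same input, which is what makes the streaming state machinery a drop-in for chunked data. A hypothetical check (same_result is illustrative only):

#include <stddef.h>
#include "xxhash.h"

static int same_result(const void* data, size_t len)
{
    XXH64_hash_t const oneshot = XXH64(data, len, 0);
    XXH64_state_t* const s = XXH64_createState();
    XXH64_hash_t streamed;
    if (s == NULL) return -1;                  /* allocation failed */
    XXH64_reset(s, 0);
    XXH64_update(s, data, len / 2);            /* feed the input in two chunks */
    XXH64_update(s, (const char*)data + len / 2, len - len / 2);
    streamed = XXH64_digest(s);
    XXH64_freeState(s);
    return oneshot == streamed;                /* expected to be 1 */
}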
+ */ +static XXH_PUREF xxh_u64 +XXH64_finalize(xxh_u64 hash, const xxh_u8* ptr, size_t len, XXH_alignment align) { if (ptr==NULL) XXH_ASSERT(len == 0); len &= 31; while (len >= 8) { xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr)); ptr += 8; - h64 ^= k1; - h64 = XXH_rotl64(h64,27) * XXH_PRIME64_1 + XXH_PRIME64_4; + hash ^= k1; + hash = XXH_rotl64(hash,27) * XXH_PRIME64_1 + XXH_PRIME64_4; len -= 8; } if (len >= 4) { - h64 ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1; + hash ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1; ptr += 4; - h64 = XXH_rotl64(h64, 23) * XXH_PRIME64_2 + XXH_PRIME64_3; + hash = XXH_rotl64(hash, 23) * XXH_PRIME64_2 + XXH_PRIME64_3; len -= 4; } while (len > 0) { - h64 ^= (*ptr++) * XXH_PRIME64_5; - h64 = XXH_rotl64(h64, 11) * XXH_PRIME64_1; + hash ^= (*ptr++) * XXH_PRIME64_5; + hash = XXH_rotl64(hash, 11) * XXH_PRIME64_1; --len; } - return XXH64_avalanche(h64); + return XXH64_avalanche(hash); } #ifdef XXH_OLD_NAMES @@ -2464,7 +3422,15 @@ XXH64_finalize(xxh_u64 h64, const xxh_u8* ptr, size_t len, XXH_alignment align) # undef XXH_PROCESS8_64 #endif -XXH_FORCE_INLINE xxh_u64 +/*! + * @internal + * @brief The implementation for @ref XXH64(). + * + * @param input , len , seed Directly passed from @ref XXH64(). + * @param align Whether @p input is aligned. + * @return The calculated hash. + */ +XXH_FORCE_INLINE XXH_PUREF xxh_u64 XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align) { xxh_u64 h64; @@ -2501,10 +3467,10 @@ XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment } -/*! @ingroup xxh64_family */ -XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t len, XXH64_hash_t seed) +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API XXH64_hash_t XXH64 (XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed) { -#if 0 +#if !defined(XXH_NO_STREAM) && XXH_SIZE_OPT >= 2 /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ XXH64_state_t state; XXH64_reset(&state, seed); @@ -2522,27 +3488,27 @@ XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t len, XXH64_hash_t s } /******* Hash Streaming *******/ - -/*! @ingroup xxh64_family*/ +#ifndef XXH_NO_STREAM +/*! @ingroup XXH64_family*/ XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void) { return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t)); } -/*! @ingroup xxh64_family */ +/*! @ingroup XXH64_family */ XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr) { XXH_free(statePtr); return XXH_OK; } -/*! @ingroup xxh64_family */ -XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dstState, const XXH64_state_t* srcState) +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dstState, const XXH64_state_t* srcState) { XXH_memcpy(dstState, srcState, sizeof(*dstState)); } -/*! @ingroup xxh64_family */ -XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, XXH64_hash_t seed) +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed) { XXH_ASSERT(statePtr != NULL); memset(statePtr, 0, sizeof(*statePtr)); @@ -2553,9 +3519,9 @@ XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, XXH64_hash_t s return XXH_OK; } -/*! @ingroup xxh64_family */ +/*! 
@ingroup XXH64_family */ XXH_PUBLIC_API XXH_errorcode -XXH64_update (XXH64_state_t* state, const void* input, size_t len) +XXH64_update (XXH_NOESCAPE XXH64_state_t* state, XXH_NOESCAPE const void* input, size_t len) { if (input==NULL) { XXH_ASSERT(len == 0); @@ -2605,8 +3571,8 @@ XXH64_update (XXH64_state_t* state, const void* input, size_t len) } -/*! @ingroup xxh64_family */ -XXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t* state) +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API XXH64_hash_t XXH64_digest(XXH_NOESCAPE const XXH64_state_t* state) { xxh_u64 h64; @@ -2624,20 +3590,20 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t* state) return XXH64_finalize(h64, (const xxh_u8*)state->mem64, (size_t)state->total_len, XXH_aligned); } - +#endif /* !XXH_NO_STREAM */ /******* Canonical representation *******/ -/*! @ingroup xxh64_family */ -XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash) +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash) { - /* XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t)); */ + XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t)); if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash); XXH_memcpy(dst, &hash, sizeof(*dst)); } -/*! @ingroup xxh64_family */ -XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src) +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src) { return XXH_readBE64(src); } @@ -2650,7 +3616,7 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src ************************************************************************ */ /*! * @} - * @defgroup xxh3_impl XXH3 implementation + * @defgroup XXH3_impl XXH3 implementation * @ingroup impl * @{ */ @@ -2658,11 +3624,19 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src /* === Compiler specifics === */ #if ((defined(sun) || defined(__sun)) && __cplusplus) /* Solaris includes __STDC_VERSION__ with C++. Tested with GCC 5.5 */ -# define XXH_RESTRICT /* disable */ +# define XXH_RESTRICT /* disable */ #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */ # define XXH_RESTRICT restrict +#elif (defined (__GNUC__) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))) \ + || (defined (__clang__)) \ + || (defined (_MSC_VER) && (_MSC_VER >= 1400)) \ + || (defined (__INTEL_COMPILER) && (__INTEL_COMPILER >= 1300)) +/* + * There are a LOT more compilers that recognize __restrict but this + * covers the major ones. 
+ */ +# define XXH_RESTRICT __restrict #else -/* Note: it might be useful to define __restrict or __restrict__ for some C++ compilers */ # define XXH_RESTRICT /* disable */ #endif @@ -2676,10 +3650,26 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src # define XXH_unlikely(x) (x) #endif +#ifndef XXH_HAS_INCLUDE +# ifdef __has_include +/* + * Not defined as XXH_HAS_INCLUDE(x) (function-like) because + * this causes segfaults in Apple Clang 4.2 (on Mac OS X 10.7 Lion) + */ +# define XXH_HAS_INCLUDE __has_include +# else +# define XXH_HAS_INCLUDE(x) 0 +# endif +#endif + #if defined(__GNUC__) || defined(__clang__) +# if defined(__ARM_FEATURE_SVE) +# include <arm_sve.h> +# endif # if defined(__ARM_NEON__) || defined(__ARM_NEON) \ - || defined(__aarch64__) || defined(_M_ARM) \ - || defined(_M_ARM64) || defined(_M_ARM64EC) + || (defined(_M_ARM) && _M_ARM >= 7) \ + || defined(_M_ARM64) || defined(_M_ARM64EC) \ + || (defined(__wasm_simd128__) && XXH_HAS_INCLUDE(<arm_neon.h>)) /* WASM SIMD128 via SIMDe */ # define inline __inline__ /* circumvent a clang bug */ # include <arm_neon.h> # undef inline @@ -2790,7 +3780,7 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src * Note that these are actually implemented as macros. * * If this is not defined, it is detected automatically. - * @ref XXH_X86DISPATCH overrides this. + * internal macro XXH_X86DISPATCH overrides this. */ enum XXH_VECTOR_TYPE /* fake enum */ { XXH_SCALAR = 0, /*!< Portable scalar version */ @@ -2802,8 +3792,13 @@ enum XXH_VECTOR_TYPE /* fake enum */ { */ XXH_AVX2 = 2, /*!< AVX2 for Haswell and Bulldozer */ XXH_AVX512 = 3, /*!< AVX512 for Skylake and Icelake */ - XXH_NEON = 4, /*!< NEON for most ARMv7-A and all AArch64 */ + XXH_NEON = 4, /*!< + * NEON for most ARMv7-A, all AArch64, and WASM SIMD128 + * via the SIMDeverywhere polyfill provided with the + * Emscripten SDK. + */ XXH_VSX = 5, /*!< VSX and ZVector for POWER8/z13 (64-bit) */ + XXH_SVE = 6, /*!< SVE for some ARMv8-A and ARMv9-A */ }; /*! * @ingroup tuning @@ -2825,12 +3820,16 @@ enum XXH_VECTOR_TYPE /* fake enum */ { # define XXH_AVX512 3 # define XXH_NEON 4 # define XXH_VSX 5 +# define XXH_SVE 6 #endif #ifndef XXH_VECTOR /* can be defined on command line */ -# if ( \ +# if defined(__ARM_FEATURE_SVE) +# define XXH_VECTOR XXH_SVE +# elif ( \ defined(__ARM_NEON__) || defined(__ARM_NEON) /* gcc */ \ || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) /* msvc */ \ + || (defined(__wasm_simd128__) && XXH_HAS_INCLUDE(<arm_neon.h>)) /* wasm simd128 via SIMDe */ \ ) && ( \ defined(_WIN32) || defined(__LITTLE_ENDIAN__) /* little endian only */ \ || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \ @@ -2851,6 +3850,17 @@ enum XXH_VECTOR_TYPE /* fake enum */ { # endif #endif +/* __ARM_FEATURE_SVE is only supported by GCC & Clang. */ +#if (XXH_VECTOR == XXH_SVE) && !defined(__ARM_FEATURE_SVE) +# ifdef _MSC_VER +# pragma warning(once : 4606) +# else +# warning "__ARM_FEATURE_SVE isn't supported. Use SCALAR instead." +# endif +# undef XXH_VECTOR +# define XXH_VECTOR XXH_SCALAR +#endif + /* * Controls the alignment of the accumulator, * for compatibility with aligned vector loads, which are usually faster. 
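Editorial aside, not part of the upstream patch: the canonical helpers in this hunk store digests big-endian (most significant byte first), so a hash written to disk or sent over the network reads back identically on any platform. A round-trip sketch:

#include "xxhash.h"

static int canonical_roundtrip_ok(XXH64_hash_t h)
{
    XXH64_canonical_t canon;
    XXH64_canonicalFromHash(&canon, h);            /* serialize: fixed byte order regardless of host endianness */
    return XXH64_hashFromCanonical(&canon) == h;   /* deserialize: lossless, so this is always 1 */
}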
@@ -2870,16 +3880,26 @@ enum XXH_VECTOR_TYPE /* fake enum */ { # define XXH_ACC_ALIGN 16 # elif XXH_VECTOR == XXH_AVX512 /* avx512 */ # define XXH_ACC_ALIGN 64 +# elif XXH_VECTOR == XXH_SVE /* sve */ +# define XXH_ACC_ALIGN 64 # endif #endif #if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 \ || XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512 # define XXH_SEC_ALIGN XXH_ACC_ALIGN +#elif XXH_VECTOR == XXH_SVE +# define XXH_SEC_ALIGN XXH_ACC_ALIGN #else # define XXH_SEC_ALIGN 8 #endif +#if defined(__GNUC__) || defined(__clang__) +# define XXH_ALIASING __attribute__((may_alias)) +#else +# define XXH_ALIASING /* nothing */ +#endif + /* * UGLY HACK: * GCC usually generates the best code with -O3 for xxHash. @@ -2903,153 +3923,126 @@ enum XXH_VECTOR_TYPE /* fake enum */ { */ #if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ - && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */ + && defined(__OPTIMIZE__) && XXH_SIZE_OPT <= 0 /* respect -O0 and -Os */ # pragma GCC push_options # pragma GCC optimize("-O2") #endif - #if XXH_VECTOR == XXH_NEON + /* - * NEON's setup for vmlal_u32 is a little more complicated than it is on - * SSE2, AVX2, and VSX. - * - * While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an upcast. - * - * To do the same operation, the 128-bit 'Q' register needs to be split into - * two 64-bit 'D' registers, performing this operation:: - * - * [ a | b ] - * | '---------. .--------' | - * | x | - * | .---------' '--------. | - * [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[ a >> 32 | b >> 32 ] - * - * Due to significant changes in aarch64, the fastest method for aarch64 is - * completely different than the fastest method for ARMv7-A. + * UGLY HACK: While AArch64 GCC on Linux does not seem to care, on macOS, GCC -O3 + * optimizes out the entire hashLong loop because of the aliasing violation. * - * ARMv7-A treats D registers as unions overlaying Q registers, so modifying - * D11 will modify the high half of Q5. This is similar to how modifying AH - * will only affect bits 8-15 of AX on x86. - * - * VZIP takes two registers, and puts even lanes in one register and odd lanes - * in the other. - * - * On ARMv7-A, this strangely modifies both parameters in place instead of - * taking the usual 3-operand form. - * - * Therefore, if we want to do this, we can simply use a D-form VZIP.32 on the - * lower and upper halves of the Q register to end up with the high and low - * halves where we want - all in one instruction. - * - * vzip.32 d10, d11 @ d10 = { d10[0], d11[0] }; d11 = { d10[1], d11[1] } - * - * Unfortunately we need inline assembly for this: Instructions modifying two - * registers at once is not possible in GCC or Clang's IR, and they have to - * create a copy. - * - * aarch64 requires a different approach. - * - * In order to make it easier to write a decent compiler for aarch64, many - * quirks were removed, such as conditional execution. - * - * NEON was also affected by this. - * - * aarch64 cannot access the high bits of a Q-form register, and writes to a - * D-form register zero the high bits, similar to how writes to W-form scalar - * registers (or DWORD registers on x86_64) work. - * - * The formerly free vget_high intrinsics now require a vext (with a few - * exceptions) - * - * Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the equivalent - * of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to only modify one - * operand. 
- * - * The equivalent of the VZIP.32 on the lower and upper halves would be this - * mess: - * - * ext v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1] } - * zip1 v1.2s, v0.2s, v2.2s // v1 = { v0[0], v2[0] } - * zip2 v0.2s, v0.2s, v1.2s // v0 = { v0[1], v2[1] } + * However, GCC is also inefficient at load-store optimization with vld1q/vst1q, + * so the only option is to mark it as aliasing. + */ +typedef uint64x2_t xxh_aliasing_uint64x2_t XXH_ALIASING; + +/*! + * @internal + * @brief `vld1q_u64` but faster and alignment-safe. * - * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64 (SHRN): + * On AArch64, unaligned access is always safe, but on ARMv7-a, it is only + * *conditionally* safe (`vld1` has an alignment bit like `movdq[ua]` in x86). * - * shrn v1.2s, v0.2d, #32 // v1 = (uint32x2_t)(v0 >> 32); - * xtn v0.2s, v0.2d // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF); + * GCC for AArch64 sees `vld1q_u8` as an intrinsic instead of a load, so it + * prohibits load-store optimizations. Therefore, a direct dereference is used. * - * This is available on ARMv7-A, but is less efficient than a single VZIP.32. + * Otherwise, `vld1q_u8` is used with `vreinterpretq_u8_u64` to do a safe + * unaligned load. */ +#if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__) +XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr) /* silence -Wcast-align */ +{ + return *(xxh_aliasing_uint64x2_t const *)ptr; +} +#else +XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr) +{ + return vreinterpretq_u64_u8(vld1q_u8((uint8_t const*)ptr)); +} +#endif /*! - * Function-like macro: - * void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t &outHi) - * { - * outLo = (uint32x2_t)(in & 0xFFFFFFFF); - * outHi = (uint32x2_t)(in >> 32); - * in = UNDEFINED; - * } + * @internal + * @brief `vmlal_u32` on low and high halves of a vector. + * + * This is a workaround for AArch64 GCC < 11 which implemented arm_neon.h with + * inline assembly and were therefore incapable of merging the `vget_{low, high}_u32` + * with `vmlal_u32`. 
*/ -# if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \ - && (defined(__GNUC__) || defined(__clang__)) \ - && (defined(__arm__) || defined(__thumb__) || defined(_M_ARM)) -# define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \ - do { \ - /* Undocumented GCC/Clang operand modifier: %e0 = lower D half, %f0 = upper D half */ \ - /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486 */ \ - /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 */ \ - __asm__("vzip.32 %e0, %f0" : "+w" (in)); \ - (outLo) = vget_low_u32 (vreinterpretq_u32_u64(in)); \ - (outHi) = vget_high_u32(vreinterpretq_u32_u64(in)); \ - } while (0) -# else -# define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \ - do { \ - (outLo) = vmovn_u64 (in); \ - (outHi) = vshrn_n_u64 ((in), 32); \ - } while (0) -# endif +#if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 11 +XXH_FORCE_INLINE uint64x2_t +XXH_vmlal_low_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs) +{ + /* Inline assembly is the only way */ + __asm__("umlal %0.2d, %1.2s, %2.2s" : "+w" (acc) : "w" (lhs), "w" (rhs)); + return acc; +} +XXH_FORCE_INLINE uint64x2_t +XXH_vmlal_high_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs) +{ + /* This intrinsic works as expected */ + return vmlal_high_u32(acc, lhs, rhs); +} +#else +/* Portable intrinsic versions */ +XXH_FORCE_INLINE uint64x2_t +XXH_vmlal_low_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs) +{ + return vmlal_u32(acc, vget_low_u32(lhs), vget_low_u32(rhs)); +} +/*! @copydoc XXH_vmlal_low_u32 + * Assume the compiler converts this to vmlal_high_u32 on aarch64 */ +XXH_FORCE_INLINE uint64x2_t +XXH_vmlal_high_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs) +{ + return vmlal_u32(acc, vget_high_u32(lhs), vget_high_u32(rhs)); +} +#endif /*! * @ingroup tuning * @brief Controls the NEON to scalar ratio for XXH3 * - * On AArch64 when not optimizing for size, XXH3 will run 6 lanes using NEON and - * 2 lanes on scalar by default. + * This can be set to 2, 4, 6, or 8. * - * This can be set to 2, 4, 6, or 8. ARMv7 will default to all 8 NEON lanes, as the - * emulated 64-bit arithmetic is too slow. + * ARM Cortex CPUs are _very_ sensitive to how their pipelines are used. * - * Modern ARM CPUs are _very_ sensitive to how their pipelines are used. + * For example, the Cortex-A73 can dispatch 3 micro-ops per cycle, but only 2 of those + * can be NEON. If you are only using NEON instructions, you are only using 2/3 of the CPU + * bandwidth. * - * For example, the Cortex-A73 can dispatch 3 micro-ops per cycle, but it can't - * have more than 2 NEON (F0/F1) micro-ops. If you are only using NEON instructions, - * you are only using 2/3 of the CPU bandwidth. - * - * This is even more noticeable on the more advanced cores like the A76 which + * This is even more noticeable on the more advanced cores like the Cortex-A76 which * can dispatch 8 micro-ops per cycle, but still only 2 NEON micro-ops at once. * - * Therefore, @ref XXH3_NEON_LANES lanes will be processed using NEON, and the - * remaining lanes will use scalar instructions. This improves the bandwidth - * and also gives the integer pipelines something to do besides twiddling loop - * counters and pointers. + * Therefore, to make the most out of the pipeline, it is beneficial to run 6 NEON lanes + * and 2 scalar lanes, which is chosen by default. + * + * This does not apply to Apple processors or 32-bit processors, which run better with + * full NEON. These will default to 8. 
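Editorial aside, not part of the upstream patch: XXH3_NEON_LANES is an ordinary tuning macro, so a build that knows its target (for example an in-order core where full NEON wins) can override it, assuming the implementation is compiled with the define visible (XXH_INLINE_ALL or the library's own build flags):

/* hypothetical consumer translation unit */
#define XXH3_NEON_LANES 8     /* use all 8 lanes on NEON, as Apple cores and 32-bit ARM do by default */
#define XXH_INLINE_ALL
#include "xxhash.h"
/* or equivalently on the command line: -DXXH3_NEON_LANES=8 */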
Additionally, size-optimized builds run 8 lanes. * * This change benefits CPUs with large micro-op buffers without negatively affecting - * other CPUs: + * most other CPUs: * * | Chipset | Dispatch type | NEON only | 6:2 hybrid | Diff. | * |:----------------------|:--------------------|----------:|-----------:|------:| * | Snapdragon 730 (A76) | 2 NEON/8 micro-ops | 8.8 GB/s | 10.1 GB/s | ~16% | * | Snapdragon 835 (A73) | 2 NEON/3 micro-ops | 5.1 GB/s | 5.3 GB/s | ~5% | * | Marvell PXA1928 (A53) | In-order dual-issue | 1.9 GB/s | 1.9 GB/s | 0% | + * | Apple M1 | 4 NEON/8 micro-ops | 37.3 GB/s | 36.1 GB/s | ~-3% | * * It also seems to fix some bad codegen on GCC, making it almost as fast as clang. * + * When using WASM SIMD128, if this is 2 or 6, SIMDe will scalarize 2 of the lanes meaning + * it effectively becomes worse 4. + * * @see XXH3_accumulate_512_neon() */ # ifndef XXH3_NEON_LANES # if (defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) \ - && !defined(__OPTIMIZE_SIZE__) + && !defined(__APPLE__) && XXH_SIZE_OPT <= 0 # define XXH3_NEON_LANES 6 # else # define XXH3_NEON_LANES XXH_ACC_NB @@ -3066,27 +4059,42 @@ enum XXH_VECTOR_TYPE /* fake enum */ { * inconsistent intrinsics, spotty coverage, and multiple endiannesses. */ #if XXH_VECTOR == XXH_VSX +/* Annoyingly, these headers _may_ define three macros: `bool`, `vector`, + * and `pixel`. This is a problem for obvious reasons. + * + * These keywords are unnecessary; the spec literally says they are + * equivalent to `__bool`, `__vector`, and `__pixel` and may be undef'd + * after including the header. + * + * We use pragma push_macro/pop_macro to keep the namespace clean. */ +# pragma push_macro("bool") +# pragma push_macro("vector") +# pragma push_macro("pixel") +/* silence potential macro redefined warnings */ +# undef bool +# undef vector +# undef pixel + # if defined(__s390x__) # include <s390intrin.h> # else -/* gcc's altivec.h can have the unwanted consequence to unconditionally - * #define bool, vector, and pixel keywords, - * with bad consequences for programs already using these keywords for other purposes. - * The paragraph defining these macros is skipped when __APPLE_ALTIVEC__ is defined. - * __APPLE_ALTIVEC__ is _generally_ defined automatically by the compiler, - * but it seems that, in some cases, it isn't. - * Force the build macro to be defined, so that keywords are not altered. - */ -# if defined(__GNUC__) && !defined(__APPLE_ALTIVEC__) -# define __APPLE_ALTIVEC__ -# endif # include <altivec.h> # endif +/* Restore the original macro values, if applicable. */ +# pragma pop_macro("pixel") +# pragma pop_macro("vector") +# pragma pop_macro("bool") + typedef __vector unsigned long long xxh_u64x2; typedef __vector unsigned char xxh_u8x16; typedef __vector unsigned xxh_u32x4; +/* + * UGLY HACK: Similar to aarch64 macOS GCC, s390x GCC has the same aliasing issue. 
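Editorial aside, not part of the upstream patch: the XXH_ALIASING / may_alias trick used for these vector typedefs tells GCC and Clang that accesses through the type may alias objects of any other type, which is what keeps -O3 from discarding the accumulator updates. A scalar sketch of the attribute:

#include <stdint.h>

#if defined(__GNUC__) || defined(__clang__)
typedef uint64_t xxh_aliasing_u64 __attribute__((may_alias));

static uint64_t load_through_alias(const void* p)
{
    /* exempt from type-based alias analysis, so this load cannot be
       reordered past stores made through differently-typed pointers */
    return *(const xxh_aliasing_u64*)p;
}
#endif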
+ */ +typedef xxh_u64x2 xxh_aliasing_u64x2 XXH_ALIASING; + # ifndef XXH_VSX_BE # if defined(__BIG_ENDIAN__) \ || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) @@ -3138,8 +4146,9 @@ XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr) /* s390x is always big endian, no issue on this platform */ # define XXH_vec_mulo vec_mulo # define XXH_vec_mule vec_mule -# elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw) +# elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw) && !defined(__ibmxl__) /* Clang has a better way to control this, we can just use the builtin which doesn't swap. */ + /* The IBM XL Compiler (which defined __clang__) only implements the vec_* operations */ # define XXH_vec_mulo __builtin_altivec_vmulouw # define XXH_vec_mule __builtin_altivec_vmuleuw # else @@ -3160,13 +4169,28 @@ XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b) # endif /* XXH_vec_mulo, XXH_vec_mule */ #endif /* XXH_VECTOR == XXH_VSX */ +#if XXH_VECTOR == XXH_SVE +#define ACCRND(acc, offset) \ +do { \ + svuint64_t input_vec = svld1_u64(mask, xinput + offset); \ + svuint64_t secret_vec = svld1_u64(mask, xsecret + offset); \ + svuint64_t mixed = sveor_u64_x(mask, secret_vec, input_vec); \ + svuint64_t swapped = svtbl_u64(input_vec, kSwap); \ + svuint64_t mixed_lo = svextw_u64_x(mask, mixed); \ + svuint64_t mixed_hi = svlsr_n_u64_x(mask, mixed, 32); \ + svuint64_t mul = svmad_u64_x(mask, mixed_lo, mixed_hi, swapped); \ + acc = svadd_u64_x(mask, acc, mul); \ +} while (0) +#endif /* XXH_VECTOR == XXH_SVE */ /* prefetch * can be disabled, by declaring XXH_NO_PREFETCH build macro */ #if defined(XXH_NO_PREFETCH) # define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ #else -# if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) /* _mm_prefetch() not defined outside of x86/x64 */ +# if XXH_SIZE_OPT >= 1 +# define XXH_PREFETCH(ptr) (void)(ptr) +# elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) /* _mm_prefetch() not defined outside of x86/x64 */ # include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */ # define XXH_PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0) # elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) ) @@ -3203,6 +4227,8 @@ XXH_ALIGN(64) static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = { 0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e, }; +static const xxh_u64 PRIME_MX1 = 0x165667919E3779F9ULL; /*!< 0b0001011001010110011001111001000110011110001101110111100111111001 */ +static const xxh_u64 PRIME_MX2 = 0x9FB21C651E98DF25ULL; /*!< 0b1001111110110010000111000110010100011110100110001101111100100101 */ #ifdef XXH_OLD_NAMES # define kSecret XXH3_kSecret @@ -3394,7 +4420,7 @@ XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs) } /*! Seems to produce slightly better code on GCC for some reason. 
*/ -XXH_FORCE_INLINE xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift) +XXH_FORCE_INLINE XXH_CONSTF xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift) { XXH_ASSERT(0 <= shift && shift < 64); return v64 ^ (v64 >> shift); @@ -3407,7 +4433,7 @@ XXH_FORCE_INLINE xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift) static XXH64_hash_t XXH3_avalanche(xxh_u64 h64) { h64 = XXH_xorshift64(h64, 37); - h64 *= 0x165667919E3779F9ULL; + h64 *= PRIME_MX1; h64 = XXH_xorshift64(h64, 32); return h64; } @@ -3421,9 +4447,9 @@ static XXH64_hash_t XXH3_rrmxmx(xxh_u64 h64, xxh_u64 len) { /* this mix is inspired by Pelle Evensen's rrmxmx */ h64 ^= XXH_rotl64(h64, 49) ^ XXH_rotl64(h64, 24); - h64 *= 0x9FB21C651E98DF25ULL; + h64 *= PRIME_MX2; h64 ^= (h64 >> 35) + len ; - h64 *= 0x9FB21C651E98DF25ULL; + h64 *= PRIME_MX2; return XXH_xorshift64(h64, 28); } @@ -3461,7 +4487,7 @@ static XXH64_hash_t XXH3_rrmxmx(xxh_u64 h64, xxh_u64 len) * * This adds an extra layer of strength for custom secrets. */ -XXH_FORCE_INLINE XXH64_hash_t +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) { XXH_ASSERT(input != NULL); @@ -3483,7 +4509,7 @@ XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_h } } -XXH_FORCE_INLINE XXH64_hash_t +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) { XXH_ASSERT(input != NULL); @@ -3499,7 +4525,7 @@ XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_h } } -XXH_FORCE_INLINE XXH64_hash_t +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) { XXH_ASSERT(input != NULL); @@ -3516,7 +4542,7 @@ XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_ } } -XXH_FORCE_INLINE XXH64_hash_t +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) { XXH_ASSERT(len <= 16); @@ -3586,7 +4612,7 @@ XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input, } /* For mid range keys, XXH3 uses a Mum-hash variant. */ -XXH_FORCE_INLINE XXH64_hash_t +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len, const xxh_u8* XXH_RESTRICT secret, size_t secretSize, XXH64_hash_t seed) @@ -3595,6 +4621,14 @@ XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len, XXH_ASSERT(16 < len && len <= 128); { xxh_u64 acc = len * XXH_PRIME64_1; +#if XXH_SIZE_OPT >= 1 + /* Smaller and cleaner, but slightly slower. */ + unsigned int i = (unsigned int)(len - 1) / 32; + do { + acc += XXH3_mix16B(input+16 * i, secret+32*i, seed); + acc += XXH3_mix16B(input+len-16*(i+1), secret+32*i+16, seed); + } while (i-- != 0); +#else if (len > 32) { if (len > 64) { if (len > 96) { @@ -3609,14 +4643,17 @@ XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len, } acc += XXH3_mix16B(input+0, secret+0, seed); acc += XXH3_mix16B(input+len-16, secret+16, seed); - +#endif return XXH3_avalanche(acc); } } +/*! + * @brief Maximum size of "short" key in bytes. 
+ */ #define XXH3_MIDSIZE_MAX 240 -XXH_NO_INLINE XXH64_hash_t +XXH_NO_INLINE XXH_PUREF XXH64_hash_t XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len, const xxh_u8* XXH_RESTRICT secret, size_t secretSize, XXH64_hash_t seed) @@ -3628,13 +4665,17 @@ XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len, #define XXH3_MIDSIZE_LASTOFFSET 17 { xxh_u64 acc = len * XXH_PRIME64_1; - int const nbRounds = (int)len / 16; - int i; + xxh_u64 acc_end; + unsigned int const nbRounds = (unsigned int)len / 16; + unsigned int i; + XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); for (i=0; i<8; i++) { acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed); } - acc = XXH3_avalanche(acc); + /* last bytes */ + acc_end = XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed); XXH_ASSERT(nbRounds >= 8); + acc = XXH3_avalanche(acc); #if defined(__clang__) /* Clang */ \ && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ @@ -3661,11 +4702,13 @@ XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len, #pragma clang loop vectorize(disable) #endif for (i=8 ; i < nbRounds; i++) { - acc += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed); + /* + * Prevents clang for unrolling the acc loop and interleaving with this one. + */ + XXH_COMPILER_GUARD(acc); + acc_end += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed); } - /* last bytes */ - acc += XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed); - return XXH3_avalanche(acc); + return XXH3_avalanche(acc + acc_end); } } @@ -3681,6 +4724,47 @@ XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len, # define ACC_NB XXH_ACC_NB #endif +#ifndef XXH_PREFETCH_DIST +# ifdef __clang__ +# define XXH_PREFETCH_DIST 320 +# else +# if (XXH_VECTOR == XXH_AVX512) +# define XXH_PREFETCH_DIST 512 +# else +# define XXH_PREFETCH_DIST 384 +# endif +# endif /* __clang__ */ +#endif /* XXH_PREFETCH_DIST */ + +/* + * These macros are to generate an XXH3_accumulate() function. + * The two arguments select the name suffix and target attribute. + * + * The name of this symbol is XXH3_accumulate_<name>() and it calls + * XXH3_accumulate_512_<name>(). + * + * It may be useful to hand implement this function if the compiler fails to + * optimize the inline function. 
+ */ +#define XXH3_ACCUMULATE_TEMPLATE(name) \ +void \ +XXH3_accumulate_##name(xxh_u64* XXH_RESTRICT acc, \ + const xxh_u8* XXH_RESTRICT input, \ + const xxh_u8* XXH_RESTRICT secret, \ + size_t nbStripes) \ +{ \ + size_t n; \ + for (n = 0; n < nbStripes; n++ ) { \ + const xxh_u8* const in = input + n*XXH_STRIPE_LEN; \ + XXH_PREFETCH(in + XXH_PREFETCH_DIST); \ + XXH3_accumulate_512_##name( \ + acc, \ + in, \ + secret + n*XXH_SECRET_CONSUME_RATE); \ + } \ +} + + XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64) { if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64); @@ -3749,7 +4833,7 @@ XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc, /* data_key = data_vec ^ key_vec; */ __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec); /* data_key_lo = data_key >> 32; */ - __m512i const data_key_lo = _mm512_shuffle_epi32 (data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1)); + __m512i const data_key_lo = _mm512_srli_epi64 (data_key, 32); /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ __m512i const product = _mm512_mul_epu32 (data_key, data_key_lo); /* xacc[0] += swap(data_vec); */ @@ -3759,6 +4843,7 @@ XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc, *xacc = _mm512_add_epi64(product, sum); } } +XXH_FORCE_INLINE XXH_TARGET_AVX512 XXH3_ACCUMULATE_TEMPLATE(avx512) /* * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing. @@ -3792,13 +4877,12 @@ XXH3_scrambleAcc_avx512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) /* xacc[0] ^= (xacc[0] >> 47) */ __m512i const acc_vec = *xacc; __m512i const shifted = _mm512_srli_epi64 (acc_vec, 47); - __m512i const data_vec = _mm512_xor_si512 (acc_vec, shifted); /* xacc[0] ^= secret; */ __m512i const key_vec = _mm512_loadu_si512 (secret); - __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec); + __m512i const data_key = _mm512_ternarylogic_epi32(key_vec, acc_vec, shifted, 0x96 /* key_vec ^ acc_vec ^ shifted */); /* xacc[0] *= XXH_PRIME32_1; */ - __m512i const data_key_hi = _mm512_shuffle_epi32 (data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1)); + __m512i const data_key_hi = _mm512_srli_epi64 (data_key, 32); __m512i const prod_lo = _mm512_mul_epu32 (data_key, prime32); __m512i const prod_hi = _mm512_mul_epu32 (data_key_hi, prime32); *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32)); @@ -3813,7 +4897,8 @@ XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64) XXH_ASSERT(((size_t)customSecret & 63) == 0); (void)(&XXH_writeLE64); { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i); - __m512i const seed = _mm512_mask_set1_epi64(_mm512_set1_epi64((xxh_i64)seed64), 0xAA, (xxh_i64)(0U - seed64)); + __m512i const seed_pos = _mm512_set1_epi64((xxh_i64)seed64); + __m512i const seed = _mm512_mask_sub_epi64(seed_pos, 0xAA, _mm512_set1_epi8(0), seed_pos); const __m512i* const src = (const __m512i*) ((const void*) XXH3_kSecret); __m512i* const dest = ( __m512i*) customSecret; @@ -3821,14 +4906,7 @@ XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64) XXH_ASSERT(((size_t)src & 63) == 0); /* control alignment */ XXH_ASSERT(((size_t)dest & 63) == 0); for (i=0; i < nbRounds; ++i) { - /* GCC has a bug, _mm512_stream_load_si512 accepts 'void*', not 'void const*', - * this will warn "discards 'const' qualifier". 
*/ - union { - const __m512i* cp; - void* p; - } remote_const_void; - remote_const_void.cp = src + i; - dest[i] = _mm512_add_epi64(_mm512_stream_load_si512(remote_const_void.p), seed); + dest[i] = _mm512_add_epi64(_mm512_load_si512(src + i), seed); } } } @@ -3864,7 +4942,7 @@ XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc, /* data_key = data_vec ^ key_vec; */ __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); /* data_key_lo = data_key >> 32; */ - __m256i const data_key_lo = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + __m256i const data_key_lo = _mm256_srli_epi64 (data_key, 32); /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ __m256i const product = _mm256_mul_epu32 (data_key, data_key_lo); /* xacc[i] += swap(data_vec); */ @@ -3874,6 +4952,7 @@ XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc, xacc[i] = _mm256_add_epi64(product, sum); } } } +XXH_FORCE_INLINE XXH_TARGET_AVX2 XXH3_ACCUMULATE_TEMPLATE(avx2) XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) @@ -3896,7 +4975,7 @@ XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); /* xacc[i] *= XXH_PRIME32_1; */ - __m256i const data_key_hi = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + __m256i const data_key_hi = _mm256_srli_epi64 (data_key, 32); __m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32); __m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32); xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32)); @@ -3928,12 +5007,12 @@ XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(void* XXH_RESTR XXH_ASSERT(((size_t)dest & 31) == 0); /* GCC -O2 need unroll loop manually */ - dest[0] = _mm256_add_epi64(_mm256_stream_load_si256(src+0), seed); - dest[1] = _mm256_add_epi64(_mm256_stream_load_si256(src+1), seed); - dest[2] = _mm256_add_epi64(_mm256_stream_load_si256(src+2), seed); - dest[3] = _mm256_add_epi64(_mm256_stream_load_si256(src+3), seed); - dest[4] = _mm256_add_epi64(_mm256_stream_load_si256(src+4), seed); - dest[5] = _mm256_add_epi64(_mm256_stream_load_si256(src+5), seed); + dest[0] = _mm256_add_epi64(_mm256_load_si256(src+0), seed); + dest[1] = _mm256_add_epi64(_mm256_load_si256(src+1), seed); + dest[2] = _mm256_add_epi64(_mm256_load_si256(src+2), seed); + dest[3] = _mm256_add_epi64(_mm256_load_si256(src+3), seed); + dest[4] = _mm256_add_epi64(_mm256_load_si256(src+4), seed); + dest[5] = _mm256_add_epi64(_mm256_load_si256(src+5), seed); } } @@ -3980,6 +5059,7 @@ XXH3_accumulate_512_sse2( void* XXH_RESTRICT acc, xacc[i] = _mm_add_epi64(product, sum); } } } +XXH_FORCE_INLINE XXH_TARGET_SSE2 XXH3_ACCUMULATE_TEMPLATE(sse2) XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_scrambleAcc_sse2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) @@ -4058,14 +5138,28 @@ XXH3_scalarScrambleRound(void* XXH_RESTRICT acc, /*! * @internal - * @brief The bulk processing loop for NEON. + * @brief The bulk processing loop for NEON and WASM SIMD128. * * The NEON code path is actually partially scalar when running on AArch64. This * is to optimize the pipelining and can have up to 15% speedup depending on the * CPU, and it also mitigates some GCC codegen issues. * * @see XXH3_NEON_LANES for configuring this and details about this optimization. 
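Editorial aside, not part of the upstream patch: stripped of the SIMD plumbing, every 64-bit lane of the accumulate step performs the same two operations, which may make the NEON discussion below easier to follow. A hedged scalar sketch of one lane pair (little-endian reads and bounds handling omitted):

#include <stdint.h>

static void accumulate_lane_pair(uint64_t acc[2],
                                 const uint64_t input[2],
                                 const uint64_t secret[2])
{
    int lane;
    for (lane = 0; lane < 2; lane++) {
        uint64_t const mixed = input[lane] ^ secret[lane];
        acc[lane ^ 1] += input[lane];                              /* "swap": add the data into the adjacent lane */
        acc[lane]     += (mixed & 0xFFFFFFFFULL) * (mixed >> 32);  /* 32x32->64 multiply of the mixed halves */
    }
}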
+ * + * NEON's 32-bit to 64-bit long multiply takes a half vector of 32-bit + * integers instead of the other platforms which mask full 64-bit vectors, + * so the setup is more complicated than just shifting right. + * + * Additionally, there is an optimization for 4 lanes at once noted below. + * + * Since, as stated, the most optimal amount of lanes for Cortexes is 6, + * there needs to be *three* versions of the accumulate operation used + * for the remaining 2 lanes. + * + * WASM's SIMD128 uses SIMDe's arm_neon.h polyfill because the intrinsics overlap + * nearly perfectly. */ + XXH_FORCE_INLINE void XXH3_accumulate_512_neon( void* XXH_RESTRICT acc, const void* XXH_RESTRICT input, @@ -4073,101 +5167,182 @@ XXH3_accumulate_512_neon( void* XXH_RESTRICT acc, { XXH_ASSERT((((size_t)acc) & 15) == 0); XXH_STATIC_ASSERT(XXH3_NEON_LANES > 0 && XXH3_NEON_LANES <= XXH_ACC_NB && XXH3_NEON_LANES % 2 == 0); - { - uint64x2_t* const xacc = (uint64x2_t *) acc; + { /* GCC for darwin arm64 does not like aliasing here */ + xxh_aliasing_uint64x2_t* const xacc = (xxh_aliasing_uint64x2_t*) acc; /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */ - uint8_t const* const xinput = (const uint8_t *) input; - uint8_t const* const xsecret = (const uint8_t *) secret; + uint8_t const* xinput = (const uint8_t *) input; + uint8_t const* xsecret = (const uint8_t *) secret; size_t i; - /* NEON for the first few lanes (these loops are normally interleaved) */ - for (i=0; i < XXH3_NEON_LANES / 2; i++) { +#ifdef __wasm_simd128__ + /* + * On WASM SIMD128, Clang emits direct address loads when XXH3_kSecret + * is constant propagated, which results in it converting it to this + * inside the loop: + * + * a = v128.load(XXH3_kSecret + 0 + $secret_offset, offset = 0) + * b = v128.load(XXH3_kSecret + 16 + $secret_offset, offset = 0) + * ... + * + * This requires a full 32-bit address immediate (and therefore a 6 byte + * instruction) as well as an add for each offset. + * + * Putting an asm guard prevents it from folding (at the cost of losing + * the alignment hint), and uses the free offset in `v128.load` instead + * of adding secret_offset each time which overall reduces code size by + * about a kilobyte and improves performance. + */ + XXH_COMPILER_GUARD(xsecret); +#endif + /* Scalar lanes use the normal scalarRound routine */ + for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) { + XXH3_scalarRound(acc, input, secret, i); + } + i = 0; + /* 4 NEON lanes at a time. 
*/ + for (; i+1 < XXH3_NEON_LANES / 2; i+=2) { /* data_vec = xinput[i]; */ - uint8x16_t data_vec = vld1q_u8(xinput + (i * 16)); + uint64x2_t data_vec_1 = XXH_vld1q_u64(xinput + (i * 16)); + uint64x2_t data_vec_2 = XXH_vld1q_u64(xinput + ((i+1) * 16)); /* key_vec = xsecret[i]; */ - uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16)); - uint64x2_t data_key; - uint32x2_t data_key_lo, data_key_hi; - /* xacc[i] += swap(data_vec); */ - uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec); - uint64x2_t const swapped = vextq_u64(data64, data64, 1); - xacc[i] = vaddq_u64 (xacc[i], swapped); + uint64x2_t key_vec_1 = XXH_vld1q_u64(xsecret + (i * 16)); + uint64x2_t key_vec_2 = XXH_vld1q_u64(xsecret + ((i+1) * 16)); + /* data_swap = swap(data_vec) */ + uint64x2_t data_swap_1 = vextq_u64(data_vec_1, data_vec_1, 1); + uint64x2_t data_swap_2 = vextq_u64(data_vec_2, data_vec_2, 1); /* data_key = data_vec ^ key_vec; */ - data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec)); - /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF); - * data_key_hi = (uint32x2_t) (data_key >> 32); - * data_key = UNDEFINED; */ - XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi); - /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */ - xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi); + uint64x2_t data_key_1 = veorq_u64(data_vec_1, key_vec_1); + uint64x2_t data_key_2 = veorq_u64(data_vec_2, key_vec_2); + /* + * If we reinterpret the 64x2 vectors as 32x4 vectors, we can use a + * de-interleave operation for 4 lanes in 1 step with `vuzpq_u32` to + * get one vector with the low 32 bits of each lane, and one vector + * with the high 32 bits of each lane. + * + * The intrinsic returns a double vector because the original ARMv7-a + * instruction modified both arguments in place. AArch64 and SIMD128 emit + * two instructions from this intrinsic. + * + * [ dk11L | dk11H | dk12L | dk12H ] -> [ dk11L | dk12L | dk21L | dk22L ] + * [ dk21L | dk21H | dk22L | dk22H ] -> [ dk11H | dk12H | dk21H | dk22H ] + */ + uint32x4x2_t unzipped = vuzpq_u32( + vreinterpretq_u32_u64(data_key_1), + vreinterpretq_u32_u64(data_key_2) + ); + /* data_key_lo = data_key & 0xFFFFFFFF */ + uint32x4_t data_key_lo = unzipped.val[0]; + /* data_key_hi = data_key >> 32 */ + uint32x4_t data_key_hi = unzipped.val[1]; + /* + * Then, we can split the vectors horizontally and multiply which, as for most + * widening intrinsics, have a variant that works on both high half vectors + * for free on AArch64. A similar instruction is available on SIMD128. + * + * sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi + */ + uint64x2_t sum_1 = XXH_vmlal_low_u32(data_swap_1, data_key_lo, data_key_hi); + uint64x2_t sum_2 = XXH_vmlal_high_u32(data_swap_2, data_key_lo, data_key_hi); + /* + * Clang reorders + * a += b * c; // umlal swap.2d, dkl.2s, dkh.2s + * c += a; // add acc.2d, acc.2d, swap.2d + * to + * c += a; // add acc.2d, acc.2d, swap.2d + * c += b * c; // umlal acc.2d, dkl.2s, dkh.2s + * + * While it would make sense in theory since the addition is faster, + * for reasons likely related to umlal being limited to certain NEON + * pipelines, this is worse. A compiler guard fixes this. + */ + XXH_COMPILER_GUARD_CLANG_NEON(sum_1); + XXH_COMPILER_GUARD_CLANG_NEON(sum_2); + /* xacc[i] = acc_vec + sum; */ + xacc[i] = vaddq_u64(xacc[i], sum_1); + xacc[i+1] = vaddq_u64(xacc[i+1], sum_2); } - /* Scalar for the remainder. This may be a zero iteration loop. 
*/ - for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) { - XXH3_scalarRound(acc, input, secret, i); + /* Operate on the remaining NEON lanes 2 at a time. */ + for (; i < XXH3_NEON_LANES / 2; i++) { + /* data_vec = xinput[i]; */ + uint64x2_t data_vec = XXH_vld1q_u64(xinput + (i * 16)); + /* key_vec = xsecret[i]; */ + uint64x2_t key_vec = XXH_vld1q_u64(xsecret + (i * 16)); + /* acc_vec_2 = swap(data_vec) */ + uint64x2_t data_swap = vextq_u64(data_vec, data_vec, 1); + /* data_key = data_vec ^ key_vec; */ + uint64x2_t data_key = veorq_u64(data_vec, key_vec); + /* For two lanes, just use VMOVN and VSHRN. */ + /* data_key_lo = data_key & 0xFFFFFFFF; */ + uint32x2_t data_key_lo = vmovn_u64(data_key); + /* data_key_hi = data_key >> 32; */ + uint32x2_t data_key_hi = vshrn_n_u64(data_key, 32); + /* sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi; */ + uint64x2_t sum = vmlal_u32(data_swap, data_key_lo, data_key_hi); + /* Same Clang workaround as before */ + XXH_COMPILER_GUARD_CLANG_NEON(sum); + /* xacc[i] = acc_vec + sum; */ + xacc[i] = vaddq_u64 (xacc[i], sum); } } } +XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(neon) XXH_FORCE_INLINE void XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) { XXH_ASSERT((((size_t)acc) & 15) == 0); - { uint64x2_t* xacc = (uint64x2_t*) acc; + { xxh_aliasing_uint64x2_t* xacc = (xxh_aliasing_uint64x2_t*) acc; uint8_t const* xsecret = (uint8_t const*) secret; - uint32x2_t prime = vdup_n_u32 (XXH_PRIME32_1); size_t i; - /* NEON for the first few lanes (these loops are normally interleaved) */ + /* WASM uses operator overloads and doesn't need these. */ +#ifndef __wasm_simd128__ + /* { prime32_1, prime32_1 } */ + uint32x2_t const kPrimeLo = vdup_n_u32(XXH_PRIME32_1); + /* { 0, prime32_1, 0, prime32_1 } */ + uint32x4_t const kPrimeHi = vreinterpretq_u32_u64(vdupq_n_u64((xxh_u64)XXH_PRIME32_1 << 32)); +#endif + + /* AArch64 uses both scalar and neon at the same time */ + for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) { + XXH3_scalarScrambleRound(acc, secret, i); + } for (i=0; i < XXH3_NEON_LANES / 2; i++) { /* xacc[i] ^= (xacc[i] >> 47); */ uint64x2_t acc_vec = xacc[i]; - uint64x2_t shifted = vshrq_n_u64 (acc_vec, 47); - uint64x2_t data_vec = veorq_u64 (acc_vec, shifted); + uint64x2_t shifted = vshrq_n_u64(acc_vec, 47); + uint64x2_t data_vec = veorq_u64(acc_vec, shifted); /* xacc[i] ^= xsecret[i]; */ - uint8x16_t key_vec = vld1q_u8 (xsecret + (i * 16)); - uint64x2_t data_key = veorq_u64 (data_vec, vreinterpretq_u64_u8(key_vec)); - + uint64x2_t key_vec = XXH_vld1q_u64(xsecret + (i * 16)); + uint64x2_t data_key = veorq_u64(data_vec, key_vec); /* xacc[i] *= XXH_PRIME32_1 */ - uint32x2_t data_key_lo, data_key_hi; - /* data_key_lo = (uint32x2_t) (xacc[i] & 0xFFFFFFFF); - * data_key_hi = (uint32x2_t) (xacc[i] >> 32); - * xacc[i] = UNDEFINED; */ - XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi); - { /* - * prod_hi = (data_key >> 32) * XXH_PRIME32_1; - * - * Avoid vmul_u32 + vshll_n_u32 since Clang 6 and 7 will - * incorrectly "optimize" this: - * tmp = vmul_u32(vmovn_u64(a), vmovn_u64(b)); - * shifted = vshll_n_u32(tmp, 32); - * to this: - * tmp = "vmulq_u64"(a, b); // no such thing! - * shifted = vshlq_n_u64(tmp, 32); - * - * However, unlike SSE, Clang lacks a 64-bit multiply routine - * for NEON, and it scalarizes two 64-bit multiplies instead. - * - * vmull_u32 has the same timing as vmul_u32, and it avoids - * this bug completely. 
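For reference, a scalar sketch of the scramble step that the NEON and WASM paths above vectorize, with the 64x32 multiply decomposed the same way the intrinsics do. XXH_PRIME32_1 is 0x9E3779B1 upstream; that value is treated as an assumption here:

    #include <stdint.h>

    #define PRIME32_1 0x9E3779B1U   /* XXH_PRIME32_1 upstream */

    /* One 64-bit lane of scrambleAcc: xorshift by 47, mix in the secret, then
     * multiply by the 32-bit prime as lo(x)*p + ((hi(x)*p) << 32), the shape the
     * NEON code implements with vmulq_u32 / vmlal_u32. */
    static uint64_t scramble_lane(uint64_t acc, uint64_t secret_word)
    {
        uint64_t data_key;
        acc ^= acc >> 47;
        data_key = acc ^ secret_word;
        return (data_key & 0xFFFFFFFFULL) * PRIME32_1
             + (((data_key >> 32) * PRIME32_1) << 32);
    }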
- * See https://bugs.llvm.org/show_bug.cgi?id=39967 - */ - uint64x2_t prod_hi = vmull_u32 (data_key_hi, prime); - /* xacc[i] = prod_hi << 32; */ - xacc[i] = vshlq_n_u64(prod_hi, 32); - /* xacc[i] += (prod_hi & 0xFFFFFFFF) * XXH_PRIME32_1; */ - xacc[i] = vmlal_u32(xacc[i], data_key_lo, prime); - } - } - /* Scalar for the remainder. This may be a zero iteration loop. */ - for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) { - XXH3_scalarScrambleRound(acc, secret, i); +#ifdef __wasm_simd128__ + /* SIMD128 has multiply by u64x2, use it instead of expanding and scalarizing */ + xacc[i] = data_key * XXH_PRIME32_1; +#else + /* + * Expanded version with portable NEON intrinsics + * + * lo(x) * lo(y) + (hi(x) * lo(y) << 32) + * + * prod_hi = hi(data_key) * lo(prime) << 32 + * + * Since we only need 32 bits of this multiply a trick can be used, reinterpreting the vector + * as a uint32x4_t and multiplying by { 0, prime, 0, prime } to cancel out the unwanted bits + * and avoid the shift. + */ + uint32x4_t prod_hi = vmulq_u32 (vreinterpretq_u32_u64(data_key), kPrimeHi); + /* Extract low bits for vmlal_u32 */ + uint32x2_t data_key_lo = vmovn_u64(data_key); + /* xacc[i] = prod_hi + lo(data_key) * XXH_PRIME32_1; */ + xacc[i] = vmlal_u32(vreinterpretq_u64_u32(prod_hi), data_key_lo, kPrimeLo); +#endif } } } - #endif #if (XXH_VECTOR == XXH_VSX) @@ -4178,23 +5353,23 @@ XXH3_accumulate_512_vsx( void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) { /* presumed aligned */ - unsigned int* const xacc = (unsigned int*) acc; - xxh_u64x2 const* const xinput = (xxh_u64x2 const*) input; /* no alignment restriction */ - xxh_u64x2 const* const xsecret = (xxh_u64x2 const*) secret; /* no alignment restriction */ + xxh_aliasing_u64x2* const xacc = (xxh_aliasing_u64x2*) acc; + xxh_u8 const* const xinput = (xxh_u8 const*) input; /* no alignment restriction */ + xxh_u8 const* const xsecret = (xxh_u8 const*) secret; /* no alignment restriction */ xxh_u64x2 const v32 = { 32, 32 }; size_t i; for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) { /* data_vec = xinput[i]; */ - xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + i); + xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + 16*i); /* key_vec = xsecret[i]; */ - xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i); + xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + 16*i); xxh_u64x2 const data_key = data_vec ^ key_vec; /* shuffled = (data_key << 32) | (data_key >> 32); */ xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32); /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */ xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled); /* acc_vec = xacc[i]; */ - xxh_u64x2 acc_vec = (xxh_u64x2)vec_xl(0, xacc + 4 * i); + xxh_u64x2 acc_vec = xacc[i]; acc_vec += product; /* swap high and low halves */ @@ -4203,18 +5378,18 @@ XXH3_accumulate_512_vsx( void* XXH_RESTRICT acc, #else acc_vec += vec_xxpermdi(data_vec, data_vec, 2); #endif - /* xacc[i] = acc_vec; */ - vec_xst((xxh_u32x4)acc_vec, 0, xacc + 4 * i); + xacc[i] = acc_vec; } } +XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(vsx) XXH_FORCE_INLINE void XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) { XXH_ASSERT((((size_t)acc) & 15) == 0); - { xxh_u64x2* const xacc = (xxh_u64x2*) acc; - const xxh_u64x2* const xsecret = (const xxh_u64x2*) secret; + { xxh_aliasing_u64x2* const xacc = (xxh_aliasing_u64x2*) acc; + const xxh_u8* const xsecret = (const xxh_u8*) secret; /* constants */ xxh_u64x2 const v32 = { 32, 32 }; xxh_u64x2 const v47 = { 47, 47 }; 
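Both the NEON and VSX hunks here switch the accumulator pointer to an "aliasing" vector type so that `xacc[i]` can be read and written directly (the VSX path previously went through `vec_xl`/`vec_xst`). The typedefs themselves are defined earlier in the header; presumably they are ordinary vector types marked `may_alias`, roughly as in this sketch (names and attribute spelling are illustrative, not the library's exact definitions):

    #include <stdint.h>

    /* A 2 x u64 vector type permitted to alias anything, so casting the u64
     * accumulator array (assumed suitably aligned) to it and dereferencing
     * does not violate strict aliasing. GCC/Clang vector-extension spelling. */
    typedef uint64_t aliasing_u64x2 __attribute__((vector_size(16), may_alias));

    static void add_lane(void* acc, uint64_t p0, uint64_t p1)
    {
        aliasing_u64x2* const xacc = (aliasing_u64x2*)acc;
        aliasing_u64x2 const product = { p0, p1 };
        xacc[0] += product;   /* replaces an explicit load / modify / store */
    }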
@@ -4226,7 +5401,7 @@ XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47); /* xacc[i] ^= xsecret[i]; */ - xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i); + xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + 16*i); xxh_u64x2 const data_key = data_vec ^ key_vec; /* xacc[i] *= XXH_PRIME32_1 */ @@ -4240,8 +5415,148 @@ XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) #endif +#if (XXH_VECTOR == XXH_SVE) + +XXH_FORCE_INLINE void +XXH3_accumulate_512_sve( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + uint64_t *xacc = (uint64_t *)acc; + const uint64_t *xinput = (const uint64_t *)(const void *)input; + const uint64_t *xsecret = (const uint64_t *)(const void *)secret; + svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1); + uint64_t element_count = svcntd(); + if (element_count >= 8) { + svbool_t mask = svptrue_pat_b64(SV_VL8); + svuint64_t vacc = svld1_u64(mask, xacc); + ACCRND(vacc, 0); + svst1_u64(mask, xacc, vacc); + } else if (element_count == 2) { /* sve128 */ + svbool_t mask = svptrue_pat_b64(SV_VL2); + svuint64_t acc0 = svld1_u64(mask, xacc + 0); + svuint64_t acc1 = svld1_u64(mask, xacc + 2); + svuint64_t acc2 = svld1_u64(mask, xacc + 4); + svuint64_t acc3 = svld1_u64(mask, xacc + 6); + ACCRND(acc0, 0); + ACCRND(acc1, 2); + ACCRND(acc2, 4); + ACCRND(acc3, 6); + svst1_u64(mask, xacc + 0, acc0); + svst1_u64(mask, xacc + 2, acc1); + svst1_u64(mask, xacc + 4, acc2); + svst1_u64(mask, xacc + 6, acc3); + } else { + svbool_t mask = svptrue_pat_b64(SV_VL4); + svuint64_t acc0 = svld1_u64(mask, xacc + 0); + svuint64_t acc1 = svld1_u64(mask, xacc + 4); + ACCRND(acc0, 0); + ACCRND(acc1, 4); + svst1_u64(mask, xacc + 0, acc0); + svst1_u64(mask, xacc + 4, acc1); + } +} + +XXH_FORCE_INLINE void +XXH3_accumulate_sve(xxh_u64* XXH_RESTRICT acc, + const xxh_u8* XXH_RESTRICT input, + const xxh_u8* XXH_RESTRICT secret, + size_t nbStripes) +{ + if (nbStripes != 0) { + uint64_t *xacc = (uint64_t *)acc; + const uint64_t *xinput = (const uint64_t *)(const void *)input; + const uint64_t *xsecret = (const uint64_t *)(const void *)secret; + svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1); + uint64_t element_count = svcntd(); + if (element_count >= 8) { + svbool_t mask = svptrue_pat_b64(SV_VL8); + svuint64_t vacc = svld1_u64(mask, xacc + 0); + do { + /* svprfd(svbool_t, void *, enum svfprop); */ + svprfd(mask, xinput + 128, SV_PLDL1STRM); + ACCRND(vacc, 0); + xinput += 8; + xsecret += 1; + nbStripes--; + } while (nbStripes != 0); + + svst1_u64(mask, xacc + 0, vacc); + } else if (element_count == 2) { /* sve128 */ + svbool_t mask = svptrue_pat_b64(SV_VL2); + svuint64_t acc0 = svld1_u64(mask, xacc + 0); + svuint64_t acc1 = svld1_u64(mask, xacc + 2); + svuint64_t acc2 = svld1_u64(mask, xacc + 4); + svuint64_t acc3 = svld1_u64(mask, xacc + 6); + do { + svprfd(mask, xinput + 128, SV_PLDL1STRM); + ACCRND(acc0, 0); + ACCRND(acc1, 2); + ACCRND(acc2, 4); + ACCRND(acc3, 6); + xinput += 8; + xsecret += 1; + nbStripes--; + } while (nbStripes != 0); + + svst1_u64(mask, xacc + 0, acc0); + svst1_u64(mask, xacc + 2, acc1); + svst1_u64(mask, xacc + 4, acc2); + svst1_u64(mask, xacc + 6, acc3); + } else { + svbool_t mask = svptrue_pat_b64(SV_VL4); + svuint64_t acc0 = svld1_u64(mask, xacc + 0); + svuint64_t acc1 = svld1_u64(mask, xacc + 4); + do { + svprfd(mask, xinput + 128, SV_PLDL1STRM); + ACCRND(acc0, 0); + ACCRND(acc1, 4); + xinput += 
8; + xsecret += 1; + nbStripes--; + } while (nbStripes != 0); + + svst1_u64(mask, xacc + 0, acc0); + svst1_u64(mask, xacc + 4, acc1); + } + } +} + +#endif + /* scalar variants - universal */ +#if defined(__aarch64__) && (defined(__GNUC__) || defined(__clang__)) +/* + * In XXH3_scalarRound(), GCC and Clang have a similar codegen issue, where they + * emit an excess mask and a full 64-bit multiply-add (MADD X-form). + * + * While this might not seem like much, as AArch64 is a 64-bit architecture, only + * big Cortex designs have a full 64-bit multiplier. + * + * On the little cores, the smaller 32-bit multiplier is used, and full 64-bit + * multiplies expand to 2-3 multiplies in microcode. This has a major penalty + * of up to 4 latency cycles and 2 stall cycles in the multiply pipeline. + * + * Thankfully, AArch64 still provides the 32-bit long multiply-add (UMADDL) which does + * not have this penalty and does the mask automatically. + */ +XXH_FORCE_INLINE xxh_u64 +XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc) +{ + xxh_u64 ret; + /* note: %x = 64-bit register, %w = 32-bit register */ + __asm__("umaddl %x0, %w1, %w2, %x3" : "=r" (ret) : "r" (lhs), "r" (rhs), "r" (acc)); + return ret; +} +#else +XXH_FORCE_INLINE xxh_u64 +XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc) +{ + return XXH_mult32to64((xxh_u32)lhs, (xxh_u32)rhs) + acc; +} +#endif + /*! * @internal * @brief Scalar round for @ref XXH3_accumulate_512_scalar(). @@ -4264,7 +5579,7 @@ XXH3_scalarRound(void* XXH_RESTRICT acc, xxh_u64 const data_val = XXH_readLE64(xinput + lane * 8); xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + lane * 8); xacc[lane ^ 1] += data_val; /* swap adjacent lanes */ - xacc[lane] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32); + xacc[lane] = XXH_mult32to64_add64(data_key /* & 0xFFFFFFFF */, data_key >> 32, xacc[lane]); } } @@ -4278,10 +5593,18 @@ XXH3_accumulate_512_scalar(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) { size_t i; + /* ARM GCC refuses to unroll this loop, resulting in a 24% slowdown on ARMv6. */ +#if defined(__GNUC__) && !defined(__clang__) \ + && (defined(__arm__) || defined(__thumb2__)) \ + && defined(__ARM_FEATURE_UNALIGNED) /* no unaligned access just wastes bytes */ \ + && XXH_SIZE_OPT <= 0 +# pragma GCC unroll 8 +#endif for (i=0; i < XXH_ACC_NB; i++) { XXH3_scalarRound(acc, input, secret, i); } } +XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(scalar) /*! * @internal @@ -4333,10 +5656,10 @@ XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64) const xxh_u8* kSecretPtr = XXH3_kSecret; XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0); -#if defined(__clang__) && defined(__aarch64__) +#if defined(__GNUC__) && defined(__aarch64__) /* * UGLY HACK: - * Clang generates a bunch of MOV/MOVK pairs for aarch64, and they are + * GCC and Clang generate a bunch of MOV/MOVK pairs for aarch64, and they are * placed sequentially, in order, at the top of the unrolled loop. 
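Several hunks in this patch lean on `XXH_COMPILER_GUARD` and `XXH_COMPILER_GUARD_CLANG_NEON` to stop the compiler from constant-folding a pointer or reordering a vector operation. The macros are defined earlier in the header; presumably they are empty inline-asm statements that pretend to modify their operand, along these lines (a sketch, not the header's exact text):

    /* Pretend to clobber `var` so the compiler must keep it in a register and
     * cannot fold it into an immediate address or reorder work around it. */
    #define COMPILER_GUARD(var)      __asm__("" : "+r" (var))
    /* NEON/vector variant: "+w" pins the value to a SIMD/FP register instead. */
    #define COMPILER_GUARD_NEON(var) __asm__("" : "+w" (var))

    /* Usage mirroring the WASM hunk above:
     *     const uint8_t* xsecret = (const uint8_t*)secret;
     *     COMPILER_GUARD(xsecret);   // address no longer a known constant
     */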
* * While MOVK is great for generating constants (2 cycles for a 64-bit @@ -4351,7 +5674,7 @@ XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64) * ADD * SUB STR * STR - * By forcing loads from memory (as the asm line causes Clang to assume + * By forcing loads from memory (as the asm line causes the compiler to assume * that XXH3_kSecretPtr has been changed), the pipelines are used more * efficiently: * I L S @@ -4368,17 +5691,11 @@ XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64) */ XXH_COMPILER_GUARD(kSecretPtr); #endif - /* - * Note: in debug mode, this overrides the asm optimization - * and Clang will emit MOVK chains again. - */ - XXH_ASSERT(kSecretPtr == XXH3_kSecret); - { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16; int i; for (i=0; i < nbRounds; i++) { /* - * The asm hack causes Clang to assume that kSecretPtr aliases with + * The asm hack causes the compiler to assume that kSecretPtr aliases with * customSecret, and on aarch64, this prevented LDP from merging two * loads together for free. Putting the loads together before the stores * properly generates LDP. @@ -4391,7 +5708,7 @@ XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64) } -typedef void (*XXH3_f_accumulate_512)(void* XXH_RESTRICT, const void*, const void*); +typedef void (*XXH3_f_accumulate)(xxh_u64* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, size_t); typedef void (*XXH3_f_scrambleAcc)(void* XXH_RESTRICT, const void*); typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64); @@ -4399,82 +5716,63 @@ typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64); #if (XXH_VECTOR == XXH_AVX512) #define XXH3_accumulate_512 XXH3_accumulate_512_avx512 +#define XXH3_accumulate XXH3_accumulate_avx512 #define XXH3_scrambleAcc XXH3_scrambleAcc_avx512 #define XXH3_initCustomSecret XXH3_initCustomSecret_avx512 #elif (XXH_VECTOR == XXH_AVX2) #define XXH3_accumulate_512 XXH3_accumulate_512_avx2 +#define XXH3_accumulate XXH3_accumulate_avx2 #define XXH3_scrambleAcc XXH3_scrambleAcc_avx2 #define XXH3_initCustomSecret XXH3_initCustomSecret_avx2 #elif (XXH_VECTOR == XXH_SSE2) #define XXH3_accumulate_512 XXH3_accumulate_512_sse2 +#define XXH3_accumulate XXH3_accumulate_sse2 #define XXH3_scrambleAcc XXH3_scrambleAcc_sse2 #define XXH3_initCustomSecret XXH3_initCustomSecret_sse2 #elif (XXH_VECTOR == XXH_NEON) #define XXH3_accumulate_512 XXH3_accumulate_512_neon +#define XXH3_accumulate XXH3_accumulate_neon #define XXH3_scrambleAcc XXH3_scrambleAcc_neon #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar #elif (XXH_VECTOR == XXH_VSX) #define XXH3_accumulate_512 XXH3_accumulate_512_vsx +#define XXH3_accumulate XXH3_accumulate_vsx #define XXH3_scrambleAcc XXH3_scrambleAcc_vsx #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar +#elif (XXH_VECTOR == XXH_SVE) +#define XXH3_accumulate_512 XXH3_accumulate_512_sve +#define XXH3_accumulate XXH3_accumulate_sve +#define XXH3_scrambleAcc XXH3_scrambleAcc_scalar +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + #else /* scalar */ #define XXH3_accumulate_512 XXH3_accumulate_512_scalar +#define XXH3_accumulate XXH3_accumulate_scalar #define XXH3_scrambleAcc XXH3_scrambleAcc_scalar #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar #endif - - -#ifndef XXH_PREFETCH_DIST -# ifdef __clang__ -# define XXH_PREFETCH_DIST 320 -# else -# if (XXH_VECTOR == XXH_AVX512) -# define XXH_PREFETCH_DIST 512 -# else -# define XXH_PREFETCH_DIST 384 -# endif -# 
endif /* __clang__ */ -#endif /* XXH_PREFETCH_DIST */ - -/* - * XXH3_accumulate() - * Loops over XXH3_accumulate_512(). - * Assumption: nbStripes will not overflow the secret size - */ -XXH_FORCE_INLINE void -XXH3_accumulate( xxh_u64* XXH_RESTRICT acc, - const xxh_u8* XXH_RESTRICT input, - const xxh_u8* XXH_RESTRICT secret, - size_t nbStripes, - XXH3_f_accumulate_512 f_acc512) -{ - size_t n; - for (n = 0; n < nbStripes; n++ ) { - const xxh_u8* const in = input + n*XXH_STRIPE_LEN; - XXH_PREFETCH(in + XXH_PREFETCH_DIST); - f_acc512(acc, - in, - secret + n*XXH_SECRET_CONSUME_RATE); - } -} +#if XXH_SIZE_OPT >= 1 /* don't do SIMD for initialization */ +# undef XXH3_initCustomSecret +# define XXH3_initCustomSecret XXH3_initCustomSecret_scalar +#endif XXH_FORCE_INLINE void XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT input, size_t len, const xxh_u8* XXH_RESTRICT secret, size_t secretSize, - XXH3_f_accumulate_512 f_acc512, + XXH3_f_accumulate f_acc, XXH3_f_scrambleAcc f_scramble) { size_t const nbStripesPerBlock = (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE; @@ -4486,7 +5784,7 @@ XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc, XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); for (n = 0; n < nb_blocks; n++) { - XXH3_accumulate(acc, input + n*block_len, secret, nbStripesPerBlock, f_acc512); + f_acc(acc, input + n*block_len, secret, nbStripesPerBlock); f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN); } @@ -4494,12 +5792,12 @@ XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc, XXH_ASSERT(len > XXH_STRIPE_LEN); { size_t const nbStripes = ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN; XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE)); - XXH3_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes, f_acc512); + f_acc(acc, input + nb_blocks*block_len, secret, nbStripes); /* last stripe */ { const xxh_u8* const p = input + len - XXH_STRIPE_LEN; #define XXH_SECRET_LASTACC_START 7 /* not aligned on 8, last secret is different from acc & scrambler */ - f_acc512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START); + XXH3_accumulate_512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START); } } } @@ -4544,12 +5842,12 @@ XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secre XXH_FORCE_INLINE XXH64_hash_t XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len, const void* XXH_RESTRICT secret, size_t secretSize, - XXH3_f_accumulate_512 f_acc512, + XXH3_f_accumulate f_acc, XXH3_f_scrambleAcc f_scramble) { XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC; - XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, f_acc512, f_scramble); + XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, f_acc, f_scramble); /* converge into final hash */ XXH_STATIC_ASSERT(sizeof(acc) == 64); @@ -4563,13 +5861,15 @@ XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len, * It's important for performance to transmit secret's size (when it's static) * so that the compiler can properly optimize the vectorized loop. * This makes a big performance difference for "medium" keys (<1 KB) when using AVX instruction set. + * When the secret size is unknown, or on GCC 12 where the mix of NO_INLINE and FORCE_INLINE + * breaks -Og, this is XXH_NO_INLINE. 
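The generic `XXH3_accumulate()` wrapper (and its `XXH_PREFETCH_DIST` tuning) removed above is replaced by per-ISA `XXH3_accumulate_<isa>` functions stamped out by `XXH3_ACCUMULATE_TEMPLATE(...)`, defined earlier in the header. Presumably each instantiation keeps the shape of the deleted loop, roughly as below (a sketch under that assumption; the constants are upstream's usual values and the 512-bit kernel is only declared, prefetching omitted):

    #include <stddef.h>
    #include <stdint.h>

    #define STRIPE_LEN          64   /* XXH_STRIPE_LEN upstream */
    #define SECRET_CONSUME_RATE  8   /* XXH_SECRET_CONSUME_RATE upstream */

    /* The 512-bit kernel, e.g. XXH3_accumulate_512_scalar() shown earlier. */
    extern void accumulate_512(uint64_t* acc, const uint8_t* input, const uint8_t* secret);

    /* Shape of one generated accumulate wrapper: loop over the kernel, one
     * stripe at a time, advancing the secret by 8 bytes per stripe, exactly
     * like the removed generic XXH3_accumulate() did. */
    static void accumulate(uint64_t* acc, const uint8_t* input,
                           const uint8_t* secret, size_t nbStripes)
    {
        size_t n;
        for (n = 0; n < nbStripes; n++) {
            accumulate_512(acc,
                           input  + n * STRIPE_LEN,
                           secret + n * SECRET_CONSUME_RATE);
        }
    }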
*/ -XXH_FORCE_INLINE XXH64_hash_t +XXH3_WITH_SECRET_INLINE XXH64_hash_t XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT input, size_t len, XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen) { (void)seed64; - return XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate_512, XXH3_scrambleAcc); + return XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate, XXH3_scrambleAcc); } /* @@ -4578,12 +5878,12 @@ XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT input, size_t len, * Note that inside this no_inline function, we do inline the internal loop, * and provide a statically defined secret size to allow optimization of vector loop. */ -XXH_NO_INLINE XXH64_hash_t +XXH_NO_INLINE XXH_PUREF XXH64_hash_t XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len, XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen) { (void)seed64; (void)secret; (void)secretLen; - return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate_512, XXH3_scrambleAcc); + return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate, XXH3_scrambleAcc); } /* @@ -4600,18 +5900,20 @@ XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len, XXH_FORCE_INLINE XXH64_hash_t XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len, XXH64_hash_t seed, - XXH3_f_accumulate_512 f_acc512, + XXH3_f_accumulate f_acc, XXH3_f_scrambleAcc f_scramble, XXH3_f_initCustomSecret f_initSec) { +#if XXH_SIZE_OPT <= 0 if (seed == 0) return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), - f_acc512, f_scramble); + f_acc, f_scramble); +#endif { XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; f_initSec(secret, seed); return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret), - f_acc512, f_scramble); + f_acc, f_scramble); } } @@ -4619,12 +5921,12 @@ XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len, * It's important for performance that XXH3_hashLong is not inlined. */ XXH_NO_INLINE XXH64_hash_t -XXH3_hashLong_64b_withSeed(const void* input, size_t len, - XXH64_hash_t seed, const xxh_u8* secret, size_t secretLen) +XXH3_hashLong_64b_withSeed(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed, const xxh_u8* XXH_RESTRICT secret, size_t secretLen) { (void)secret; (void)secretLen; return XXH3_hashLong_64b_withSeed_internal(input, len, seed, - XXH3_accumulate_512, XXH3_scrambleAcc, XXH3_initCustomSecret); + XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret); } @@ -4656,37 +5958,37 @@ XXH3_64bits_internal(const void* XXH_RESTRICT input, size_t len, /* === Public entry point === */ -/*! @ingroup xxh3_family */ -XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* input, size_t len) +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length) { - return XXH3_64bits_internal(input, len, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_default); + return XXH3_64bits_internal(input, length, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_default); } -/*! @ingroup xxh3_family */ +/*! 
@ingroup XXH3_family */ XXH_PUBLIC_API XXH64_hash_t -XXH3_64bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize) +XXH3_64bits_withSecret(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize) { - return XXH3_64bits_internal(input, len, 0, secret, secretSize, XXH3_hashLong_64b_withSecret); + return XXH3_64bits_internal(input, length, 0, secret, secretSize, XXH3_hashLong_64b_withSecret); } -/*! @ingroup xxh3_family */ +/*! @ingroup XXH3_family */ XXH_PUBLIC_API XXH64_hash_t -XXH3_64bits_withSeed(const void* input, size_t len, XXH64_hash_t seed) +XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed) { - return XXH3_64bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed); + return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed); } XXH_PUBLIC_API XXH64_hash_t -XXH3_64bits_withSecretandSeed(const void* input, size_t len, const void* secret, size_t secretSize, XXH64_hash_t seed) +XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed) { - if (len <= XXH3_MIDSIZE_MAX) - return XXH3_64bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL); - return XXH3_hashLong_64b_withSecret(input, len, seed, (const xxh_u8*)secret, secretSize); + if (length <= XXH3_MIDSIZE_MAX) + return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL); + return XXH3_hashLong_64b_withSecret(input, length, seed, (const xxh_u8*)secret, secretSize); } /* === XXH3 streaming === */ - +#ifndef XXH_NO_STREAM /* * Malloc's a pointer that is always aligned to align. * @@ -4710,7 +6012,7 @@ XXH3_64bits_withSecretandSeed(const void* input, size_t len, const void* secret, * * Align must be a power of 2 and 8 <= align <= 128. */ -static void* XXH_alignedMalloc(size_t s, size_t align) +static XXH_MALLOCF void* XXH_alignedMalloc(size_t s, size_t align) { XXH_ASSERT(align <= 128 && align >= 8); /* range check */ XXH_ASSERT((align & (align-1)) == 0); /* power of 2 */ @@ -4752,7 +6054,15 @@ static void XXH_alignedFree(void* p) XXH_free(base); } } -/*! @ingroup xxh3_family */ +/*! @ingroup XXH3_family */ +/*! + * @brief Allocate an @ref XXH3_state_t. + * + * @return An allocated pointer of @ref XXH3_state_t on success. + * @return `NULL` on failure. + * + * @note Must be freed with XXH3_freeState(). + */ XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void) { XXH3_state_t* const state = (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64); @@ -4761,16 +6071,25 @@ XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void) return state; } -/*! @ingroup xxh3_family */ +/*! @ingroup XXH3_family */ +/*! + * @brief Frees an @ref XXH3_state_t. + * + * @param statePtr A pointer to an @ref XXH3_state_t allocated with @ref XXH3_createState(). + * + * @return @ref XXH_OK. + * + * @note Must be allocated with XXH3_createState(). + */ XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr) { XXH_alignedFree(statePtr); return XXH_OK; } -/*! @ingroup xxh3_family */ +/*! 
@ingroup XXH3_family */ XXH_PUBLIC_API void -XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state) +XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state) { XXH_memcpy(dst_state, src_state, sizeof(*dst_state)); } @@ -4802,18 +6121,18 @@ XXH3_reset_internal(XXH3_state_t* statePtr, statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE; } -/*! @ingroup xxh3_family */ +/*! @ingroup XXH3_family */ XXH_PUBLIC_API XXH_errorcode -XXH3_64bits_reset(XXH3_state_t* statePtr) +XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr) { if (statePtr == NULL) return XXH_ERROR; XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE); return XXH_OK; } -/*! @ingroup xxh3_family */ +/*! @ingroup XXH3_family */ XXH_PUBLIC_API XXH_errorcode -XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize) +XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize) { if (statePtr == NULL) return XXH_ERROR; XXH3_reset_internal(statePtr, 0, secret, secretSize); @@ -4822,9 +6141,9 @@ XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t return XXH_OK; } -/*! @ingroup xxh3_family */ +/*! @ingroup XXH3_family */ XXH_PUBLIC_API XXH_errorcode -XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed) +XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed) { if (statePtr == NULL) return XXH_ERROR; if (seed==0) return XXH3_64bits_reset(statePtr); @@ -4834,9 +6153,9 @@ XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed) return XXH_OK; } -/*! @ingroup xxh3_family */ +/*! @ingroup XXH3_family */ XXH_PUBLIC_API XXH_errorcode -XXH3_64bits_reset_withSecretandSeed(XXH3_state_t* statePtr, const void* secret, size_t secretSize, XXH64_hash_t seed64) +XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed64) { if (statePtr == NULL) return XXH_ERROR; if (secret == NULL) return XXH_ERROR; @@ -4846,35 +6165,61 @@ XXH3_64bits_reset_withSecretandSeed(XXH3_state_t* statePtr, const void* secret, return XXH_OK; } -/* Note : when XXH3_consumeStripes() is invoked, - * there must be a guarantee that at least one more byte must be consumed from input - * so that the function can blindly consume all stripes using the "normal" secret segment */ -XXH_FORCE_INLINE void +/*! + * @internal + * @brief Processes a large input for XXH3_update() and XXH3_digest_long(). + * + * Unlike XXH3_hashLong_internal_loop(), this can process data that overlaps a block. 
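`XXH_alignedMalloc()` above, used so an `XXH3_state_t` starts on a 64-byte boundary, documents a contract of power-of-two alignment between 8 and 128. A common way to meet that contract is the over-allocate-and-record-the-offset scheme sketched below; this is a self-contained illustration, not necessarily the library's exact code:

    #include <stdlib.h>

    /* Over-allocate by `align`, bump to the next boundary, and stash the
     * distance in the byte just before the returned pointer. */
    static void* aligned_malloc(size_t s, size_t align)
    {
        unsigned char* const base = (unsigned char*)malloc(s + align);
        if (base == NULL) return NULL;
        {
            size_t const offset = align - ((size_t)base & (align - 1)); /* 1..align */
            unsigned char* const ptr = base + offset;
            ptr[-1] = (unsigned char)offset;   /* fits: align <= 128 < 256 */
            return ptr;
        }
    }

    static void aligned_free(void* p)
    {
        if (p != NULL) {
            unsigned char* const ptr = (unsigned char*)p;
            free(ptr - ptr[-1]);
        }
    }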
+ * + * @param acc Pointer to the 8 accumulator lanes + * @param nbStripesSoFarPtr In/out pointer to the number of leftover stripes in the block* + * @param nbStripesPerBlock Number of stripes in a block + * @param input Input pointer + * @param nbStripes Number of stripes to process + * @param secret Secret pointer + * @param secretLimit Offset of the last block in @p secret + * @param f_acc Pointer to an XXH3_accumulate implementation + * @param f_scramble Pointer to an XXH3_scrambleAcc implementation + * @return Pointer past the end of @p input after processing + */ +XXH_FORCE_INLINE const xxh_u8 * XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc, size_t* XXH_RESTRICT nbStripesSoFarPtr, size_t nbStripesPerBlock, const xxh_u8* XXH_RESTRICT input, size_t nbStripes, const xxh_u8* XXH_RESTRICT secret, size_t secretLimit, - XXH3_f_accumulate_512 f_acc512, + XXH3_f_accumulate f_acc, XXH3_f_scrambleAcc f_scramble) { - XXH_ASSERT(nbStripes <= nbStripesPerBlock); /* can handle max 1 scramble per invocation */ - XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock); - if (nbStripesPerBlock - *nbStripesSoFarPtr <= nbStripes) { - /* need a scrambling operation */ - size_t const nbStripesToEndofBlock = nbStripesPerBlock - *nbStripesSoFarPtr; - size_t const nbStripesAfterBlock = nbStripes - nbStripesToEndofBlock; - XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripesToEndofBlock, f_acc512); - f_scramble(acc, secret + secretLimit); - XXH3_accumulate(acc, input + nbStripesToEndofBlock * XXH_STRIPE_LEN, secret, nbStripesAfterBlock, f_acc512); - *nbStripesSoFarPtr = nbStripesAfterBlock; - } else { - XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes, f_acc512); + const xxh_u8* initialSecret = secret + *nbStripesSoFarPtr * XXH_SECRET_CONSUME_RATE; + /* Process full blocks */ + if (nbStripes >= (nbStripesPerBlock - *nbStripesSoFarPtr)) { + /* Process the initial partial block... */ + size_t nbStripesThisIter = nbStripesPerBlock - *nbStripesSoFarPtr; + + do { + /* Accumulate and scramble */ + f_acc(acc, input, initialSecret, nbStripesThisIter); + f_scramble(acc, secret + secretLimit); + input += nbStripesThisIter * XXH_STRIPE_LEN; + nbStripes -= nbStripesThisIter; + /* Then continue the loop with the full block size */ + nbStripesThisIter = nbStripesPerBlock; + initialSecret = secret; + } while (nbStripes >= nbStripesPerBlock); + *nbStripesSoFarPtr = 0; + } + /* Process a partial block */ + if (nbStripes > 0) { + f_acc(acc, input, initialSecret, nbStripes); + input += nbStripes * XXH_STRIPE_LEN; *nbStripesSoFarPtr += nbStripes; } + /* Return end pointer */ + return input; } #ifndef XXH3_STREAM_USE_STACK -# ifndef __clang__ /* clang doesn't need additional stack space */ +# if XXH_SIZE_OPT <= 0 && !defined(__clang__) /* clang doesn't need additional stack space */ # define XXH3_STREAM_USE_STACK 1 # endif #endif @@ -4884,7 +6229,7 @@ XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc, XXH_FORCE_INLINE XXH_errorcode XXH3_update(XXH3_state_t* XXH_RESTRICT const state, const xxh_u8* XXH_RESTRICT input, size_t len, - XXH3_f_accumulate_512 f_acc512, + XXH3_f_accumulate f_acc, XXH3_f_scrambleAcc f_scramble) { if (input==NULL) { @@ -4900,7 +6245,8 @@ XXH3_update(XXH3_state_t* XXH_RESTRICT const state, * when operating accumulators directly into state. * Operating into stack space seems to enable proper optimization. 
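Taken together, the streaming entry points annotated in this section (create, reset, update, digest, free) are used as follows; a minimal sketch assuming the header is included as xxhash.h, with the exact include path depending on how the library is vendored:

    #include <stdio.h>
    #include "xxhash.h"

    int main(void)
    {
        XXH3_state_t* const state = XXH3_createState();
        if (state == NULL) return 1;
        if (XXH3_64bits_reset(state) != XXH_OK) { XXH3_freeState(state); return 1; }

        /* Feed data in arbitrary-sized chunks; small chunks are buffered,
         * larger ones stream through XXH3_consumeStripes() as shown above. */
        XXH3_64bits_update(state, "hello ", 6);
        XXH3_64bits_update(state, "world",  5);

        printf("%016llx\n", (unsigned long long)XXH3_64bits_digest(state));
        XXH3_freeState(state);
        return 0;
    }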
* clang, on the other hand, doesn't seem to need this trick */ - XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[8]; memcpy(acc, state->acc, sizeof(acc)); + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[8]; + XXH_memcpy(acc, state->acc, sizeof(acc)); #else xxh_u64* XXH_RESTRICT const acc = state->acc; #endif @@ -4908,7 +6254,7 @@ XXH3_update(XXH3_state_t* XXH_RESTRICT const state, XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE); /* small input : just fill in tmp buffer */ - if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) { + if (len <= XXH3_INTERNALBUFFER_SIZE - state->bufferedSize) { XXH_memcpy(state->buffer + state->bufferedSize, input, len); state->bufferedSize += (XXH32_hash_t)len; return XXH_OK; @@ -4930,57 +6276,20 @@ XXH3_update(XXH3_state_t* XXH_RESTRICT const state, &state->nbStripesSoFar, state->nbStripesPerBlock, state->buffer, XXH3_INTERNALBUFFER_STRIPES, secret, state->secretLimit, - f_acc512, f_scramble); + f_acc, f_scramble); state->bufferedSize = 0; } XXH_ASSERT(input < bEnd); - - /* large input to consume : ingest per full block */ - if ((size_t)(bEnd - input) > state->nbStripesPerBlock * XXH_STRIPE_LEN) { + if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) { size_t nbStripes = (size_t)(bEnd - 1 - input) / XXH_STRIPE_LEN; - XXH_ASSERT(state->nbStripesPerBlock >= state->nbStripesSoFar); - /* join to current block's end */ - { size_t const nbStripesToEnd = state->nbStripesPerBlock - state->nbStripesSoFar; - XXH_ASSERT(nbStripesToEnd <= nbStripes); - XXH3_accumulate(acc, input, secret + state->nbStripesSoFar * XXH_SECRET_CONSUME_RATE, nbStripesToEnd, f_acc512); - f_scramble(acc, secret + state->secretLimit); - state->nbStripesSoFar = 0; - input += nbStripesToEnd * XXH_STRIPE_LEN; - nbStripes -= nbStripesToEnd; - } - /* consume per entire blocks */ - while(nbStripes >= state->nbStripesPerBlock) { - XXH3_accumulate(acc, input, secret, state->nbStripesPerBlock, f_acc512); - f_scramble(acc, secret + state->secretLimit); - input += state->nbStripesPerBlock * XXH_STRIPE_LEN; - nbStripes -= state->nbStripesPerBlock; - } - /* consume last partial block */ - XXH3_accumulate(acc, input, secret, nbStripes, f_acc512); - input += nbStripes * XXH_STRIPE_LEN; - XXH_ASSERT(input < bEnd); /* at least some bytes left */ - state->nbStripesSoFar = nbStripes; - /* buffer predecessor of last partial stripe */ - XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN); - XXH_ASSERT(bEnd - input <= XXH_STRIPE_LEN); - } else { - /* content to consume <= block size */ - /* Consume input by a multiple of internal buffer size */ - if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) { - const xxh_u8* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE; - do { - XXH3_consumeStripes(acc, + input = XXH3_consumeStripes(acc, &state->nbStripesSoFar, state->nbStripesPerBlock, - input, XXH3_INTERNALBUFFER_STRIPES, - secret, state->secretLimit, - f_acc512, f_scramble); - input += XXH3_INTERNALBUFFER_SIZE; - } while (input<limit); - /* buffer predecessor of last partial stripe */ - XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN); - } - } + input, nbStripes, + secret, state->secretLimit, + f_acc, f_scramble); + XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN); + } /* Some remaining input (always) : buffer it */ XXH_ASSERT(input < bEnd); XXH_ASSERT(bEnd - input <= XXH3_INTERNALBUFFER_SIZE); @@ -4989,19 +6298,19 @@ XXH3_update(XXH3_state_t* XXH_RESTRICT const state, 
state->bufferedSize = (XXH32_hash_t)(bEnd-input); #if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1 /* save stack accumulators into state */ - memcpy(state->acc, acc, sizeof(acc)); + XXH_memcpy(state->acc, acc, sizeof(acc)); #endif } return XXH_OK; } -/*! @ingroup xxh3_family */ +/*! @ingroup XXH3_family */ XXH_PUBLIC_API XXH_errorcode -XXH3_64bits_update(XXH3_state_t* state, const void* input, size_t len) +XXH3_64bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len) { return XXH3_update(state, (const xxh_u8*)input, len, - XXH3_accumulate_512, XXH3_scrambleAcc); + XXH3_accumulate, XXH3_scrambleAcc); } @@ -5010,37 +6319,40 @@ XXH3_digest_long (XXH64_hash_t* acc, const XXH3_state_t* state, const unsigned char* secret) { + xxh_u8 lastStripe[XXH_STRIPE_LEN]; + const xxh_u8* lastStripePtr; + /* * Digest on a local copy. This way, the state remains unaltered, and it can * continue ingesting more input afterwards. */ XXH_memcpy(acc, state->acc, sizeof(state->acc)); if (state->bufferedSize >= XXH_STRIPE_LEN) { + /* Consume remaining stripes then point to remaining data in buffer */ size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN; size_t nbStripesSoFar = state->nbStripesSoFar; XXH3_consumeStripes(acc, &nbStripesSoFar, state->nbStripesPerBlock, state->buffer, nbStripes, secret, state->secretLimit, - XXH3_accumulate_512, XXH3_scrambleAcc); - /* last stripe */ - XXH3_accumulate_512(acc, - state->buffer + state->bufferedSize - XXH_STRIPE_LEN, - secret + state->secretLimit - XXH_SECRET_LASTACC_START); + XXH3_accumulate, XXH3_scrambleAcc); + lastStripePtr = state->buffer + state->bufferedSize - XXH_STRIPE_LEN; } else { /* bufferedSize < XXH_STRIPE_LEN */ - xxh_u8 lastStripe[XXH_STRIPE_LEN]; + /* Copy to temp buffer */ size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize; XXH_ASSERT(state->bufferedSize > 0); /* there is always some input buffered */ XXH_memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize); XXH_memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize); - XXH3_accumulate_512(acc, - lastStripe, - secret + state->secretLimit - XXH_SECRET_LASTACC_START); + lastStripePtr = lastStripe; } + /* Last stripe */ + XXH3_accumulate_512(acc, + lastStripePtr, + secret + state->secretLimit - XXH_SECRET_LASTACC_START); } -/*! @ingroup xxh3_family */ -XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state) +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* state) { const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret; if (state->totalLen > XXH3_MIDSIZE_MAX) { @@ -5056,7 +6368,7 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state) return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen), secret, state->secretLimit + XXH_STRIPE_LEN); } - +#endif /* !XXH_NO_STREAM */ /* ========================================== @@ -5076,7 +6388,7 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state) * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64). */ -XXH_FORCE_INLINE XXH128_hash_t +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) { /* A doubled version of 1to3_64b with different constants. 
*/ @@ -5105,7 +6417,7 @@ XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_ } } -XXH_FORCE_INLINE XXH128_hash_t +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) { XXH_ASSERT(input != NULL); @@ -5125,14 +6437,14 @@ XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_ m128.low64 ^= (m128.high64 >> 3); m128.low64 = XXH_xorshift64(m128.low64, 35); - m128.low64 *= 0x9FB21C651E98DF25ULL; + m128.low64 *= PRIME_MX2; m128.low64 = XXH_xorshift64(m128.low64, 28); m128.high64 = XXH3_avalanche(m128.high64); return m128; } } -XXH_FORCE_INLINE XXH128_hash_t +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) { XXH_ASSERT(input != NULL); @@ -5207,7 +6519,7 @@ XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64 /* * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN */ -XXH_FORCE_INLINE XXH128_hash_t +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) { XXH_ASSERT(len <= 16); @@ -5238,7 +6550,7 @@ XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2, } -XXH_FORCE_INLINE XXH128_hash_t +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len, const xxh_u8* XXH_RESTRICT secret, size_t secretSize, XXH64_hash_t seed) @@ -5249,6 +6561,16 @@ XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len, { XXH128_hash_t acc; acc.low64 = len * XXH_PRIME64_1; acc.high64 = 0; + +#if XXH_SIZE_OPT >= 1 + { + /* Smaller, but slightly slower. */ + unsigned int i = (unsigned int)(len - 1) / 32; + do { + acc = XXH128_mix32B(acc, input+16*i, input+len-16*(i+1), secret+32*i, seed); + } while (i-- != 0); + } +#else if (len > 32) { if (len > 64) { if (len > 96) { @@ -5259,6 +6581,7 @@ XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len, acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed); } acc = XXH128_mix32B(acc, input, input+len-16, secret, seed); +#endif { XXH128_hash_t h128; h128.low64 = acc.low64 + acc.high64; h128.high64 = (acc.low64 * XXH_PRIME64_1) @@ -5271,7 +6594,7 @@ XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len, } } -XXH_NO_INLINE XXH128_hash_t +XXH_NO_INLINE XXH_PUREF XXH128_hash_t XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len, const xxh_u8* XXH_RESTRICT secret, size_t secretSize, XXH64_hash_t seed) @@ -5280,25 +6603,34 @@ XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len, XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); { XXH128_hash_t acc; - int const nbRounds = (int)len / 32; - int i; + unsigned i; acc.low64 = len * XXH_PRIME64_1; acc.high64 = 0; - for (i=0; i<4; i++) { + /* + * We set as `i` as offset + 32. We do this so that unchanged + * `len` can be used as upper bound. This reaches a sweet spot + * where both x86 and aarch64 get simple agen and good codegen + * for the loop. + */ + for (i = 32; i < 160; i += 32) { acc = XXH128_mix32B(acc, - input + (32 * i), - input + (32 * i) + 16, - secret + (32 * i), + input + i - 32, + input + i - 16, + secret + i - 32, seed); } acc.low64 = XXH3_avalanche(acc.low64); acc.high64 = XXH3_avalanche(acc.high64); - XXH_ASSERT(nbRounds >= 4); - for (i=4 ; i < nbRounds; i++) { + /* + * NB: `i <= len` will duplicate the last 32-bytes if + * len % 32 was zero. 
This is an unfortunate necessity to keep + * the hash result stable. + */ + for (i=160; i <= len; i += 32) { acc = XXH128_mix32B(acc, - input + (32 * i), - input + (32 * i) + 16, - secret + XXH3_MIDSIZE_STARTOFFSET + (32 * (i - 4)), + input + i - 32, + input + i - 16, + secret + XXH3_MIDSIZE_STARTOFFSET + i - 160, seed); } /* last bytes */ @@ -5306,7 +6638,7 @@ XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len, input + len - 16, input + len - 32, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16, - 0ULL - seed); + (XXH64_hash_t)0 - seed); { XXH128_hash_t h128; h128.low64 = acc.low64 + acc.high64; @@ -5323,12 +6655,12 @@ XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len, XXH_FORCE_INLINE XXH128_hash_t XXH3_hashLong_128b_internal(const void* XXH_RESTRICT input, size_t len, const xxh_u8* XXH_RESTRICT secret, size_t secretSize, - XXH3_f_accumulate_512 f_acc512, + XXH3_f_accumulate f_acc, XXH3_f_scrambleAcc f_scramble) { XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC; - XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize, f_acc512, f_scramble); + XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize, f_acc, f_scramble); /* converge into final hash */ XXH_STATIC_ASSERT(sizeof(acc) == 64); @@ -5346,47 +6678,50 @@ XXH3_hashLong_128b_internal(const void* XXH_RESTRICT input, size_t len, } /* - * It's important for performance that XXH3_hashLong is not inlined. + * It's important for performance that XXH3_hashLong() is not inlined. */ -XXH_NO_INLINE XXH128_hash_t +XXH_NO_INLINE XXH_PUREF XXH128_hash_t XXH3_hashLong_128b_default(const void* XXH_RESTRICT input, size_t len, XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen) { (void)seed64; (void)secret; (void)secretLen; return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), - XXH3_accumulate_512, XXH3_scrambleAcc); + XXH3_accumulate, XXH3_scrambleAcc); } /* - * It's important for performance to pass @secretLen (when it's static) + * It's important for performance to pass @p secretLen (when it's static) * to the compiler, so that it can properly optimize the vectorized loop. + * + * When the secret size is unknown, or on GCC 12 where the mix of NO_INLINE and FORCE_INLINE + * breaks -Og, this is XXH_NO_INLINE. 
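The 129..240-byte path above replaces the round counter with a byte offset: `i` advances in steps of 32 and `i <= len` bounds the second loop. The short check below confirms the new offsets match the old `32*i` / `32*(i-4)` arithmetic for every length in range; XXH3_MIDSIZE_STARTOFFSET is assumed to be 3 as in upstream xxHash, and the first, fixed four-round loop maps the same way:

    #include <assert.h>
    #include <stddef.h>
    #include <stdio.h>

    #define MIDSIZE_STARTOFFSET 3   /* XXH3_MIDSIZE_STARTOFFSET, assumed as upstream */

    int main(void)
    {
        size_t len;
        for (len = 129; len <= 240; len++) {
            size_t old_in[8], old_sec[8], new_in[8], new_sec[8];
            size_t n_old = 0, n_new = 0, i;
            size_t const nbRounds = len / 32;
            for (i = 4; i < nbRounds; i++) {        /* old round-counted loop */
                old_in[n_old]  = 32 * i;
                old_sec[n_old] = MIDSIZE_STARTOFFSET + 32 * (i - 4);
                n_old++;
            }
            for (i = 160; i <= len; i += 32) {      /* new byte-offset loop */
                new_in[n_new]  = i - 32;
                new_sec[n_new] = MIDSIZE_STARTOFFSET + (i - 160);
                n_new++;
            }
            assert(n_old == n_new);
            for (i = 0; i < n_old; i++)
                assert(old_in[i] == new_in[i] && old_sec[i] == new_sec[i]);
        }
        puts("old and new loops visit identical offsets for len = 129..240");
        return 0;
    }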
*/ -XXH_FORCE_INLINE XXH128_hash_t +XXH3_WITH_SECRET_INLINE XXH128_hash_t XXH3_hashLong_128b_withSecret(const void* XXH_RESTRICT input, size_t len, XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen) { (void)seed64; return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, secretLen, - XXH3_accumulate_512, XXH3_scrambleAcc); + XXH3_accumulate, XXH3_scrambleAcc); } XXH_FORCE_INLINE XXH128_hash_t XXH3_hashLong_128b_withSeed_internal(const void* XXH_RESTRICT input, size_t len, XXH64_hash_t seed64, - XXH3_f_accumulate_512 f_acc512, + XXH3_f_accumulate f_acc, XXH3_f_scrambleAcc f_scramble, XXH3_f_initCustomSecret f_initSec) { if (seed64 == 0) return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), - f_acc512, f_scramble); + f_acc, f_scramble); { XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; f_initSec(secret, seed64); return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, sizeof(secret), - f_acc512, f_scramble); + f_acc, f_scramble); } } @@ -5399,7 +6734,7 @@ XXH3_hashLong_128b_withSeed(const void* input, size_t len, { (void)secret; (void)secretLen; return XXH3_hashLong_128b_withSeed_internal(input, len, seed64, - XXH3_accumulate_512, XXH3_scrambleAcc, XXH3_initCustomSecret); + XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret); } typedef XXH128_hash_t (*XXH3_hashLong128_f)(const void* XXH_RESTRICT, size_t, @@ -5429,94 +6764,93 @@ XXH3_128bits_internal(const void* input, size_t len, /* === Public XXH128 API === */ -/*! @ingroup xxh3_family */ -XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* input, size_t len) +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* input, size_t len) { return XXH3_128bits_internal(input, len, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_128b_default); } -/*! @ingroup xxh3_family */ +/*! @ingroup XXH3_family */ XXH_PUBLIC_API XXH128_hash_t -XXH3_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize) +XXH3_128bits_withSecret(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize) { return XXH3_128bits_internal(input, len, 0, (const xxh_u8*)secret, secretSize, XXH3_hashLong_128b_withSecret); } -/*! @ingroup xxh3_family */ +/*! @ingroup XXH3_family */ XXH_PUBLIC_API XXH128_hash_t -XXH3_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed) +XXH3_128bits_withSeed(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed) { return XXH3_128bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_128b_withSeed); } -/*! @ingroup xxh3_family */ +/*! @ingroup XXH3_family */ XXH_PUBLIC_API XXH128_hash_t -XXH3_128bits_withSecretandSeed(const void* input, size_t len, const void* secret, size_t secretSize, XXH64_hash_t seed) +XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed) { if (len <= XXH3_MIDSIZE_MAX) return XXH3_128bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL); return XXH3_hashLong_128b_withSecret(input, len, seed, secret, secretSize); } -/*! @ingroup xxh3_family */ +/*! 
@ingroup XXH3_family */ XXH_PUBLIC_API XXH128_hash_t -XXH128(const void* input, size_t len, XXH64_hash_t seed) +XXH128(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed) { return XXH3_128bits_withSeed(input, len, seed); } /* === XXH3 128-bit streaming === */ - +#ifndef XXH_NO_STREAM /* * All initialization and update functions are identical to 64-bit streaming variant. * The only difference is the finalization routine. */ -/*! @ingroup xxh3_family */ +/*! @ingroup XXH3_family */ XXH_PUBLIC_API XXH_errorcode -XXH3_128bits_reset(XXH3_state_t* statePtr) +XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr) { return XXH3_64bits_reset(statePtr); } -/*! @ingroup xxh3_family */ +/*! @ingroup XXH3_family */ XXH_PUBLIC_API XXH_errorcode -XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize) +XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize) { return XXH3_64bits_reset_withSecret(statePtr, secret, secretSize); } -/*! @ingroup xxh3_family */ +/*! @ingroup XXH3_family */ XXH_PUBLIC_API XXH_errorcode -XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed) +XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed) { return XXH3_64bits_reset_withSeed(statePtr, seed); } -/*! @ingroup xxh3_family */ +/*! @ingroup XXH3_family */ XXH_PUBLIC_API XXH_errorcode -XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr, const void* secret, size_t secretSize, XXH64_hash_t seed) +XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed) { return XXH3_64bits_reset_withSecretandSeed(statePtr, secret, secretSize, seed); } -/*! @ingroup xxh3_family */ +/*! @ingroup XXH3_family */ XXH_PUBLIC_API XXH_errorcode -XXH3_128bits_update(XXH3_state_t* state, const void* input, size_t len) +XXH3_128bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len) { - return XXH3_update(state, (const xxh_u8*)input, len, - XXH3_accumulate_512, XXH3_scrambleAcc); + return XXH3_64bits_update(state, input, len); } -/*! @ingroup xxh3_family */ -XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* state) +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* state) { const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret; if (state->totalLen > XXH3_MIDSIZE_MAX) { @@ -5540,13 +6874,13 @@ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* state) return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen), secret, state->secretLimit + XXH_STRIPE_LEN); } - +#endif /* !XXH_NO_STREAM */ /* 128-bit utility functions */ #include <string.h> /* memcmp, memcpy */ /* return : 1 is equal, 0 if different */ -/*! @ingroup xxh3_family */ +/*! @ingroup XXH3_family */ XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2) { /* note : XXH128_hash_t is compact, it has no padding byte */ @@ -5554,11 +6888,11 @@ XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2) } /* This prototype is compatible with stdlib's qsort(). - * return : >0 if *h128_1 > *h128_2 - * <0 if *h128_1 < *h128_2 - * =0 if *h128_1 == *h128_2 */ -/*! 
@ingroup xxh3_family */ -XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2) + * @return : >0 if *h128_1 > *h128_2 + * <0 if *h128_1 < *h128_2 + * =0 if *h128_1 == *h128_2 */ +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2) { XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1; XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2; @@ -5570,9 +6904,9 @@ XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2) /*====== Canonical representation ======*/ -/*! @ingroup xxh3_family */ +/*! @ingroup XXH3_family */ XXH_PUBLIC_API void -XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash) +XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash) { XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t)); if (XXH_CPU_LITTLE_ENDIAN) { @@ -5583,9 +6917,9 @@ XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash) XXH_memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64)); } -/*! @ingroup xxh3_family */ +/*! @ingroup XXH3_family */ XXH_PUBLIC_API XXH128_hash_t -XXH128_hashFromCanonical(const XXH128_canonical_t* src) +XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src) { XXH128_hash_t h; h.high64 = XXH_readBE64(src); @@ -5607,9 +6941,9 @@ XXH_FORCE_INLINE void XXH3_combine16(void* dst, XXH128_hash_t h128) XXH_writeLE64( (char*)dst+8, XXH_readLE64((char*)dst+8) ^ h128.high64 ); } -/*! @ingroup xxh3_family */ +/*! @ingroup XXH3_family */ XXH_PUBLIC_API XXH_errorcode -XXH3_generateSecret(void* secretBuffer, size_t secretSize, const void* customSeed, size_t customSeedSize) +XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize) { #if (XXH_DEBUGLEVEL >= 1) XXH_ASSERT(secretBuffer != NULL); @@ -5652,9 +6986,9 @@ XXH3_generateSecret(void* secretBuffer, size_t secretSize, const void* customSee return XXH_OK; } -/*! @ingroup xxh3_family */ +/*! @ingroup XXH3_family */ XXH_PUBLIC_API void -XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_t seed) +XXH3_generateSecret_fromSeed(XXH_NOESCAPE void* secretBuffer, XXH64_hash_t seed) { XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; XXH3_initCustomSecret(secret, seed); @@ -5667,7 +7001,7 @@ XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_t seed) /* Pop our optimization override from above */ #if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ - && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */ + && defined(__OPTIMIZE__) && XXH_SIZE_OPT <= 0 /* respect -O0 and -Os */ # pragma GCC pop_options #endif @@ -5682,5 +7016,5 @@ XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_t seed) #if defined (__cplusplus) -} +} /* extern "C" */ #endif diff --git a/thirdparty/zstd/common/zstd_internal.h b/thirdparty/zstd/common/zstd_internal.h index 1f942f27bf..ecb9cfba87 100644 --- a/thirdparty/zstd/common/zstd_internal.h +++ b/thirdparty/zstd/common/zstd_internal.h @@ -178,7 +178,7 @@ static void ZSTD_copy8(void* dst, const void* src) { ZSTD_memcpy(dst, src, 8); #endif } -#define COPY8(d,s) { ZSTD_copy8(d,s); d+=8; s+=8; } +#define COPY8(d,s) do { ZSTD_copy8(d,s); d+=8; s+=8; } while (0) /* Need to use memmove here since the literal buffer can now be located within the dst buffer. 
In circumstances where the op "catches up" to where the @@ -198,7 +198,7 @@ static void ZSTD_copy16(void* dst, const void* src) { ZSTD_memcpy(dst, copy16_buf, 16); #endif } -#define COPY16(d,s) { ZSTD_copy16(d,s); d+=16; s+=16; } +#define COPY16(d,s) do { ZSTD_copy16(d,s); d+=16; s+=16; } while (0) #define WILDCOPY_OVERLENGTH 32 #define WILDCOPY_VECLEN 16 @@ -227,7 +227,7 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e if (ovtype == ZSTD_overlap_src_before_dst && diff < WILDCOPY_VECLEN) { /* Handle short offset copies. */ do { - COPY8(op, ip) + COPY8(op, ip); } while (op < oend); } else { assert(diff >= WILDCOPY_VECLEN || diff <= -WILDCOPY_VECLEN); @@ -366,13 +366,13 @@ typedef struct { /*! ZSTD_getcBlockSize() : * Provides the size of compressed block from block header `src` */ -/* Used by: decompress, fullbench (does not get its definition from here) */ +/* Used by: decompress, fullbench */ size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, blockProperties_t* bpPtr); /*! ZSTD_decodeSeqHeaders() : * decode sequence header from src */ -/* Used by: decompress, fullbench (does not get its definition from here) */ +/* Used by: zstd_decompress_block, fullbench */ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, const void* src, size_t srcSize); diff --git a/thirdparty/zstd/compress/fse_compress.c b/thirdparty/zstd/compress/fse_compress.c index 5d3770808d..1ce3cf16ac 100644 --- a/thirdparty/zstd/compress/fse_compress.c +++ b/thirdparty/zstd/compress/fse_compress.c @@ -25,7 +25,7 @@ #include "../common/error_private.h" #define ZSTD_DEPS_NEED_MALLOC #define ZSTD_DEPS_NEED_MATH64 -#include "../common/zstd_deps.h" /* ZSTD_malloc, ZSTD_free, ZSTD_memcpy, ZSTD_memset */ +#include "../common/zstd_deps.h" /* ZSTD_memset */ #include "../common/bits.h" /* ZSTD_highbit32 */ @@ -225,8 +225,8 @@ size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog) size_t const maxHeaderSize = (((maxSymbolValue+1) * tableLog + 4 /* bitCount initialized at 4 */ + 2 /* first two symbols may use one additional bit each */) / 8) - + 1 /* round up to whole nb bytes */ - + 2 /* additional two bytes for bitstream flush */; + + 1 /* round up to whole nb bytes */ + + 2 /* additional two bytes for bitstream flush */; return maxSymbolValue ? maxHeaderSize : FSE_NCOUNTBOUND; /* maxSymbolValue==0 ? use default */ } @@ -255,7 +255,7 @@ FSE_writeNCount_generic (void* header, size_t headerBufferSize, /* Init */ remaining = tableSize+1; /* +1 for extra accuracy */ threshold = tableSize; - nbBits = tableLog+1; + nbBits = (int)tableLog+1; while ((symbol < alphabetSize) && (remaining>1)) { /* stops at 1 */ if (previousIs0) { @@ -274,7 +274,7 @@ FSE_writeNCount_generic (void* header, size_t headerBufferSize, } while (symbol >= start+3) { start+=3; - bitStream += 3 << bitCount; + bitStream += 3U << bitCount; bitCount += 2; } bitStream += (symbol-start) << bitCount; @@ -294,7 +294,7 @@ FSE_writeNCount_generic (void* header, size_t headerBufferSize, count++; /* +1 for extra accuracy */ if (count>=threshold) count += max; /* [0..max[ [max..threshold[ (...) 
[threshold+max 2*threshold[ */ - bitStream += count << bitCount; + bitStream += (U32)count << bitCount; bitCount += nbBits; bitCount -= (count<max); previousIs0 = (count==1); @@ -322,7 +322,8 @@ FSE_writeNCount_generic (void* header, size_t headerBufferSize, out[1] = (BYTE)(bitStream>>8); out+= (bitCount+7) /8; - return (out-ostart); + assert(out >= ostart); + return (size_t)(out-ostart); } diff --git a/thirdparty/zstd/compress/huf_compress.c b/thirdparty/zstd/compress/huf_compress.c index 29871877a7..ea00072320 100644 --- a/thirdparty/zstd/compress/huf_compress.c +++ b/thirdparty/zstd/compress/huf_compress.c @@ -220,6 +220,25 @@ static void HUF_setValue(HUF_CElt* elt, size_t value) } } +HUF_CTableHeader HUF_readCTableHeader(HUF_CElt const* ctable) +{ + HUF_CTableHeader header; + ZSTD_memcpy(&header, ctable, sizeof(header)); + return header; +} + +static void HUF_writeCTableHeader(HUF_CElt* ctable, U32 tableLog, U32 maxSymbolValue) +{ + HUF_CTableHeader header; + HUF_STATIC_ASSERT(sizeof(ctable[0]) == sizeof(header)); + ZSTD_memset(&header, 0, sizeof(header)); + assert(tableLog < 256); + header.tableLog = (BYTE)tableLog; + assert(maxSymbolValue < 256); + header.maxSymbolValue = (BYTE)maxSymbolValue; + ZSTD_memcpy(ctable, &header, sizeof(header)); +} + typedef struct { HUF_CompressWeightsWksp wksp; BYTE bitsToWeight[HUF_TABLELOG_MAX + 1]; /* precomputed conversion table */ @@ -237,6 +256,9 @@ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, HUF_STATIC_ASSERT(HUF_CTABLE_WORKSPACE_SIZE >= sizeof(HUF_WriteCTableWksp)); + assert(HUF_readCTableHeader(CTable).maxSymbolValue == maxSymbolValue); + assert(HUF_readCTableHeader(CTable).tableLog == huffLog); + /* check conditions */ if (workspaceSize < sizeof(HUF_WriteCTableWksp)) return ERROR(GENERIC); if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge); @@ -283,7 +305,9 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void if (tableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); if (nbSymbols > *maxSymbolValuePtr+1) return ERROR(maxSymbolValue_tooSmall); - CTable[0] = tableLog; + *maxSymbolValuePtr = nbSymbols - 1; + + HUF_writeCTableHeader(CTable, tableLog, *maxSymbolValuePtr); /* Prepare base value per rank */ { U32 n, nextRankStart = 0; @@ -315,7 +339,6 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void { U32 n; for (n=0; n<nbSymbols; n++) HUF_setValue(ct + n, valPerRank[HUF_getNbBits(ct[n])]++); } } - *maxSymbolValuePtr = nbSymbols - 1; return readSize; } @@ -323,6 +346,8 @@ U32 HUF_getNbBitsFromCTable(HUF_CElt const* CTable, U32 symbolValue) { const HUF_CElt* const ct = CTable + 1; assert(symbolValue <= HUF_SYMBOLVALUE_MAX); + if (symbolValue > HUF_readCTableHeader(CTable).maxSymbolValue) + return 0; return (U32)HUF_getNbBits(ct[symbolValue]); } @@ -723,7 +748,8 @@ static void HUF_buildCTableFromTree(HUF_CElt* CTable, nodeElt const* huffNode, i HUF_setNbBits(ct + huffNode[n].byte, huffNode[n].nbBits); /* push nbBits per symbol, symbol order */ for (n=0; n<alphabetSize; n++) HUF_setValue(ct + n, valPerRank[HUF_getNbBits(ct[n])]++); /* assign value within rank, symbol order */ - CTable[0] = maxNbBits; + + HUF_writeCTableHeader(CTable, maxNbBits, maxSymbolValue); } size_t @@ -776,13 +802,20 @@ size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, } int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) { - HUF_CElt const* ct = CTable + 1; - int bad = 0; - int s; - for (s = 0; 
s <= (int)maxSymbolValue; ++s) { - bad |= (count[s] != 0) & (HUF_getNbBits(ct[s]) == 0); - } - return !bad; + HUF_CTableHeader header = HUF_readCTableHeader(CTable); + HUF_CElt const* ct = CTable + 1; + int bad = 0; + int s; + + assert(header.tableLog <= HUF_TABLELOG_ABSOLUTEMAX); + + if (header.maxSymbolValue < maxSymbolValue) + return 0; + + for (s = 0; s <= (int)maxSymbolValue; ++s) { + bad |= (count[s] != 0) & (HUF_getNbBits(ct[s]) == 0); + } + return !bad; } size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); } @@ -1024,17 +1057,17 @@ HUF_compress1X_usingCTable_internal_body(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) { - U32 const tableLog = (U32)CTable[0]; + U32 const tableLog = HUF_readCTableHeader(CTable).tableLog; HUF_CElt const* ct = CTable + 1; const BYTE* ip = (const BYTE*) src; BYTE* const ostart = (BYTE*)dst; BYTE* const oend = ostart + dstSize; - BYTE* op = ostart; HUF_CStream_t bitC; /* init */ if (dstSize < 8) return 0; /* not enough space to compress */ - { size_t const initErr = HUF_initCStream(&bitC, op, (size_t)(oend-op)); + { BYTE* op = ostart; + size_t const initErr = HUF_initCStream(&bitC, op, (size_t)(oend-op)); if (HUF_isError(initErr)) return 0; } if (dstSize < HUF_tightCompressBound(srcSize, (size_t)tableLog) || tableLog > 11) @@ -1255,7 +1288,7 @@ unsigned HUF_optimalTableLog( { BYTE* dst = (BYTE*)workSpace + sizeof(HUF_WriteCTableWksp); size_t dstSize = wkspSize - sizeof(HUF_WriteCTableWksp); - size_t maxBits, hSize, newSize; + size_t hSize, newSize; const unsigned symbolCardinality = HUF_cardinality(count, maxSymbolValue); const unsigned minTableLog = HUF_minTableLog(symbolCardinality); size_t optSize = ((size_t) ~0) - 1; @@ -1266,12 +1299,14 @@ unsigned HUF_optimalTableLog( /* Search until size increases */ for (optLogGuess = minTableLog; optLogGuess <= maxTableLog; optLogGuess++) { DEBUGLOG(7, "checking for huffLog=%u", optLogGuess); - maxBits = HUF_buildCTable_wksp(table, count, maxSymbolValue, optLogGuess, workSpace, wkspSize); - if (ERR_isError(maxBits)) continue; - if (maxBits < optLogGuess && optLogGuess > minTableLog) break; + { size_t maxBits = HUF_buildCTable_wksp(table, count, maxSymbolValue, optLogGuess, workSpace, wkspSize); + if (ERR_isError(maxBits)) continue; + + if (maxBits < optLogGuess && optLogGuess > minTableLog) break; - hSize = HUF_writeCTable_wksp(dst, dstSize, table, maxSymbolValue, (U32)maxBits, workSpace, wkspSize); + hSize = HUF_writeCTable_wksp(dst, dstSize, table, maxSymbolValue, (U32)maxBits, workSpace, wkspSize); + } if (ERR_isError(hSize)) continue; @@ -1372,12 +1407,6 @@ HUF_compress_internal (void* dst, size_t dstSize, huffLog = (U32)maxBits; DEBUGLOG(6, "bit distribution completed (%zu symbols)", showCTableBits(table->CTable + 1, maxSymbolValue+1)); } - /* Zero unused symbols in CTable, so we can check it for validity */ - { - size_t const ctableSize = HUF_CTABLE_SIZE_ST(maxSymbolValue); - size_t const unusedSize = sizeof(table->CTable) - ctableSize * sizeof(HUF_CElt); - ZSTD_memset(table->CTable + ctableSize, 0, unusedSize); - } /* Write table description header */ { CHECK_V_F(hSize, HUF_writeCTable_wksp(op, dstSize, table->CTable, maxSymbolValue, huffLog, @@ -1420,7 +1449,7 @@ size_t HUF_compress1X_repeat (void* dst, size_t dstSize, /* HUF_compress4X_repeat(): * compress input using 4 streams. 
* consider skipping quickly - * re-use an existing huffman compression table */ + * reuse an existing huffman compression table */ size_t HUF_compress4X_repeat (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned huffLog, diff --git a/thirdparty/zstd/compress/zstd_compress.c b/thirdparty/zstd/compress/zstd_compress.c index d6133e70b4..9284e2a480 100644 --- a/thirdparty/zstd/compress/zstd_compress.c +++ b/thirdparty/zstd/compress/zstd_compress.c @@ -178,6 +178,7 @@ static void ZSTD_freeCCtxContent(ZSTD_CCtx* cctx) size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx) { + DEBUGLOG(3, "ZSTD_freeCCtx (address: %p)", (void*)cctx); if (cctx==NULL) return 0; /* support free on NULL */ RETURN_ERROR_IF(cctx->staticSize, memory_allocation, "not compatible with static CCtx"); @@ -649,10 +650,11 @@ static size_t ZSTD_cParam_clampBounds(ZSTD_cParameter cParam, int* value) return 0; } -#define BOUNDCHECK(cParam, val) { \ - RETURN_ERROR_IF(!ZSTD_cParam_withinBounds(cParam,val), \ - parameter_outOfBound, "Param out of bounds"); \ -} +#define BOUNDCHECK(cParam, val) \ + do { \ + RETURN_ERROR_IF(!ZSTD_cParam_withinBounds(cParam,val), \ + parameter_outOfBound, "Param out of bounds"); \ + } while (0) static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) @@ -868,7 +870,7 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, #else FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(param, &value), ""); CCtxParams->nbWorkers = value; - return CCtxParams->nbWorkers; + return (size_t)(CCtxParams->nbWorkers); #endif case ZSTD_c_jobSize : @@ -892,7 +894,7 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, #else FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(ZSTD_c_overlapLog, &value), ""); CCtxParams->overlapLog = value; - return CCtxParams->overlapLog; + return (size_t)CCtxParams->overlapLog; #endif case ZSTD_c_rsyncable : @@ -902,7 +904,7 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, #else FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(ZSTD_c_overlapLog, &value), ""); CCtxParams->rsyncable = value; - return CCtxParams->rsyncable; + return (size_t)CCtxParams->rsyncable; #endif case ZSTD_c_enableDedicatedDictSearch : @@ -939,8 +941,10 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, return CCtxParams->ldmParams.hashRateLog; case ZSTD_c_targetCBlockSize : - if (value!=0) /* 0 ==> default */ + if (value!=0) { /* 0 ==> default */ + value = MAX(value, ZSTD_TARGETCBLOCKSIZE_MIN); BOUNDCHECK(ZSTD_c_targetCBlockSize, value); + } CCtxParams->targetCBlockSize = (U32)value; return CCtxParams->targetCBlockSize; @@ -968,7 +972,7 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, case ZSTD_c_validateSequences: BOUNDCHECK(ZSTD_c_validateSequences, value); CCtxParams->validateSequences = value; - return CCtxParams->validateSequences; + return (size_t)CCtxParams->validateSequences; case ZSTD_c_useBlockSplitter: BOUNDCHECK(ZSTD_c_useBlockSplitter, value); @@ -983,7 +987,7 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, case ZSTD_c_deterministicRefPrefix: BOUNDCHECK(ZSTD_c_deterministicRefPrefix, value); CCtxParams->deterministicRefPrefix = !!value; - return CCtxParams->deterministicRefPrefix; + return (size_t)CCtxParams->deterministicRefPrefix; case ZSTD_c_prefetchCDictTables: BOUNDCHECK(ZSTD_c_prefetchCDictTables, value); @@ -993,7 +997,7 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, case ZSTD_c_enableSeqProducerFallback: BOUNDCHECK(ZSTD_c_enableSeqProducerFallback, value); 
CCtxParams->enableMatchFinderFallback = value; - return CCtxParams->enableMatchFinderFallback; + return (size_t)CCtxParams->enableMatchFinderFallback; case ZSTD_c_maxBlockSize: if (value!=0) /* 0 ==> default */ @@ -1363,7 +1367,6 @@ size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset) RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, "Reset parameters is only possible during init stage."); ZSTD_clearAllDicts(cctx); - ZSTD_memset(&cctx->externalMatchCtx, 0, sizeof(cctx->externalMatchCtx)); return ZSTD_CCtxParams_reset(&cctx->requestedParams); } return 0; @@ -1391,11 +1394,12 @@ size_t ZSTD_checkCParams(ZSTD_compressionParameters cParams) static ZSTD_compressionParameters ZSTD_clampCParams(ZSTD_compressionParameters cParams) { -# define CLAMP_TYPE(cParam, val, type) { \ - ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam); \ - if ((int)val<bounds.lowerBound) val=(type)bounds.lowerBound; \ - else if ((int)val>bounds.upperBound) val=(type)bounds.upperBound; \ - } +# define CLAMP_TYPE(cParam, val, type) \ + do { \ + ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam); \ + if ((int)val<bounds.lowerBound) val=(type)bounds.lowerBound; \ + else if ((int)val>bounds.upperBound) val=(type)bounds.upperBound; \ + } while (0) # define CLAMP(cParam, val) CLAMP_TYPE(cParam, val, unsigned) CLAMP(ZSTD_c_windowLog, cParams.windowLog); CLAMP(ZSTD_c_chainLog, cParams.chainLog); @@ -1467,6 +1471,48 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX-1); assert(ZSTD_checkCParams(cPar)==0); + /* Cascade the selected strategy down to the next-highest one built into + * this binary. */ +#ifdef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR + if (cPar.strategy == ZSTD_btultra2) { + cPar.strategy = ZSTD_btultra; + } + if (cPar.strategy == ZSTD_btultra) { + cPar.strategy = ZSTD_btopt; + } +#endif +#ifdef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR + if (cPar.strategy == ZSTD_btopt) { + cPar.strategy = ZSTD_btlazy2; + } +#endif +#ifdef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR + if (cPar.strategy == ZSTD_btlazy2) { + cPar.strategy = ZSTD_lazy2; + } +#endif +#ifdef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR + if (cPar.strategy == ZSTD_lazy2) { + cPar.strategy = ZSTD_lazy; + } +#endif +#ifdef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR + if (cPar.strategy == ZSTD_lazy) { + cPar.strategy = ZSTD_greedy; + } +#endif +#ifdef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR + if (cPar.strategy == ZSTD_greedy) { + cPar.strategy = ZSTD_dfast; + } +#endif +#ifdef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR + if (cPar.strategy == ZSTD_dfast) { + cPar.strategy = ZSTD_fast; + cPar.targetLength = 0; + } +#endif + switch (mode) { case ZSTD_cpm_unknown: case ZSTD_cpm_noAttachDict: @@ -1617,8 +1663,8 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, + ZSTD_cwksp_aligned_alloc_size((MaxLL+1) * sizeof(U32)) + ZSTD_cwksp_aligned_alloc_size((MaxOff+1) * sizeof(U32)) + ZSTD_cwksp_aligned_alloc_size((1<<Litbits) * sizeof(U32)) - + ZSTD_cwksp_aligned_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t)) - + ZSTD_cwksp_aligned_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t)); + + ZSTD_cwksp_aligned_alloc_size(ZSTD_OPT_SIZE * sizeof(ZSTD_match_t)) + + ZSTD_cwksp_aligned_alloc_size(ZSTD_OPT_SIZE * sizeof(ZSTD_optimal_t)); size_t const lazyAdditionalSpace = ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder) ? ZSTD_cwksp_aligned_alloc_size(hSize) : 0; @@ -1707,7 +1753,7 @@ size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params) * be needed. 
However, we still allocate two 0-sized buffers, which can * take space under ASAN. */ return ZSTD_estimateCCtxSize_usingCCtxParams_internal( - &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN, params->useSequenceProducer, params->maxBlockSize); + &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN, ZSTD_hasExtSeqProd(params), params->maxBlockSize); } size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams) @@ -1768,7 +1814,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params) return ZSTD_estimateCCtxSize_usingCCtxParams_internal( &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, inBuffSize, outBuffSize, - ZSTD_CONTENTSIZE_UNKNOWN, params->useSequenceProducer, params->maxBlockSize); + ZSTD_CONTENTSIZE_UNKNOWN, ZSTD_hasExtSeqProd(params), params->maxBlockSize); } } @@ -2001,8 +2047,8 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms, ms->opt.litLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxLL+1) * sizeof(unsigned)); ms->opt.matchLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxML+1) * sizeof(unsigned)); ms->opt.offCodeFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxOff+1) * sizeof(unsigned)); - ms->opt.matchTable = (ZSTD_match_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t)); - ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t)); + ms->opt.matchTable = (ZSTD_match_t*)ZSTD_cwksp_reserve_aligned(ws, ZSTD_OPT_SIZE * sizeof(ZSTD_match_t)); + ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, ZSTD_OPT_SIZE * sizeof(ZSTD_optimal_t)); } ms->cParams = *cParams; @@ -2074,7 +2120,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, { size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params->cParams.windowLog), pledgedSrcSize)); size_t const blockSize = MIN(params->maxBlockSize, windowSize); - size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, params->cParams.minMatch, params->useSequenceProducer); + size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, params->cParams.minMatch, ZSTD_hasExtSeqProd(params)); size_t const buffOutSize = (zbuff == ZSTDb_buffered && params->outBufferMode == ZSTD_bm_buffered) ? 
ZSTD_compressBound(blockSize) + 1 : 0; @@ -2091,8 +2137,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, size_t const neededSpace = ZSTD_estimateCCtxSize_usingCCtxParams_internal( ¶ms->cParams, ¶ms->ldmParams, zc->staticSize != 0, params->useRowMatchFinder, - buffInSize, buffOutSize, pledgedSrcSize, params->useSequenceProducer, params->maxBlockSize); - int resizeWorkspace; + buffInSize, buffOutSize, pledgedSrcSize, ZSTD_hasExtSeqProd(params), params->maxBlockSize); FORWARD_IF_ERROR(neededSpace, "cctx size estimate failed!"); @@ -2101,7 +2146,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, { /* Check if workspace is large enough, alloc a new one if needed */ int const workspaceTooSmall = ZSTD_cwksp_sizeof(ws) < neededSpace; int const workspaceWasteful = ZSTD_cwksp_check_wasteful(ws, neededSpace); - resizeWorkspace = workspaceTooSmall || workspaceWasteful; + int resizeWorkspace = workspaceTooSmall || workspaceWasteful; DEBUGLOG(4, "Need %zu B workspace", neededSpace); DEBUGLOG(4, "windowSize: %zu - blockSize: %zu", windowSize, blockSize); @@ -2176,10 +2221,10 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, } /* reserve space for block-level external sequences */ - if (params->useSequenceProducer) { + if (ZSTD_hasExtSeqProd(params)) { size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize); - zc->externalMatchCtx.seqBufferCapacity = maxNbExternalSeq; - zc->externalMatchCtx.seqBuffer = + zc->extSeqBufCapacity = maxNbExternalSeq; + zc->extSeqBuf = (ZSTD_Sequence*)ZSTD_cwksp_reserve_aligned(ws, maxNbExternalSeq * sizeof(ZSTD_Sequence)); } @@ -2564,7 +2609,7 @@ ZSTD_reduceTable_internal (U32* const table, U32 const size, U32 const reducerVa assert(size < (1U<<31)); /* can be casted to int */ #if ZSTD_MEMORY_SANITIZER && !defined (ZSTD_MSAN_DONT_POISON_WORKSPACE) - /* To validate that the table re-use logic is sound, and that we don't + /* To validate that the table reuse logic is sound, and that we don't * access table space that we haven't cleaned, we re-"poison" the table * space every time we mark it dirty. 
* @@ -2992,40 +3037,43 @@ ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_paramS static const ZSTD_blockCompressor blockCompressor[4][ZSTD_STRATEGY_MAX+1] = { { ZSTD_compressBlock_fast /* default for 0 */, ZSTD_compressBlock_fast, - ZSTD_compressBlock_doubleFast, - ZSTD_compressBlock_greedy, - ZSTD_compressBlock_lazy, - ZSTD_compressBlock_lazy2, - ZSTD_compressBlock_btlazy2, - ZSTD_compressBlock_btopt, - ZSTD_compressBlock_btultra, - ZSTD_compressBlock_btultra2 }, + ZSTD_COMPRESSBLOCK_DOUBLEFAST, + ZSTD_COMPRESSBLOCK_GREEDY, + ZSTD_COMPRESSBLOCK_LAZY, + ZSTD_COMPRESSBLOCK_LAZY2, + ZSTD_COMPRESSBLOCK_BTLAZY2, + ZSTD_COMPRESSBLOCK_BTOPT, + ZSTD_COMPRESSBLOCK_BTULTRA, + ZSTD_COMPRESSBLOCK_BTULTRA2 + }, { ZSTD_compressBlock_fast_extDict /* default for 0 */, ZSTD_compressBlock_fast_extDict, - ZSTD_compressBlock_doubleFast_extDict, - ZSTD_compressBlock_greedy_extDict, - ZSTD_compressBlock_lazy_extDict, - ZSTD_compressBlock_lazy2_extDict, - ZSTD_compressBlock_btlazy2_extDict, - ZSTD_compressBlock_btopt_extDict, - ZSTD_compressBlock_btultra_extDict, - ZSTD_compressBlock_btultra_extDict }, + ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT, + ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT, + ZSTD_COMPRESSBLOCK_LAZY_EXTDICT, + ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT, + ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT, + ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT, + ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT, + ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT + }, { ZSTD_compressBlock_fast_dictMatchState /* default for 0 */, ZSTD_compressBlock_fast_dictMatchState, - ZSTD_compressBlock_doubleFast_dictMatchState, - ZSTD_compressBlock_greedy_dictMatchState, - ZSTD_compressBlock_lazy_dictMatchState, - ZSTD_compressBlock_lazy2_dictMatchState, - ZSTD_compressBlock_btlazy2_dictMatchState, - ZSTD_compressBlock_btopt_dictMatchState, - ZSTD_compressBlock_btultra_dictMatchState, - ZSTD_compressBlock_btultra_dictMatchState }, + ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE, + ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE, + ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE, + ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE, + ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE, + ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE, + ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE, + ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE + }, { NULL /* default for 0 */, NULL, NULL, - ZSTD_compressBlock_greedy_dedicatedDictSearch, - ZSTD_compressBlock_lazy_dedicatedDictSearch, - ZSTD_compressBlock_lazy2_dedicatedDictSearch, + ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH, + ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH, + ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH, NULL, NULL, NULL, @@ -3038,18 +3086,26 @@ ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_paramS DEBUGLOG(4, "Selected block compressor: dictMode=%d strat=%d rowMatchfinder=%d", (int)dictMode, (int)strat, (int)useRowMatchFinder); if (ZSTD_rowMatchFinderUsed(strat, useRowMatchFinder)) { static const ZSTD_blockCompressor rowBasedBlockCompressors[4][3] = { - { ZSTD_compressBlock_greedy_row, - ZSTD_compressBlock_lazy_row, - ZSTD_compressBlock_lazy2_row }, - { ZSTD_compressBlock_greedy_extDict_row, - ZSTD_compressBlock_lazy_extDict_row, - ZSTD_compressBlock_lazy2_extDict_row }, - { ZSTD_compressBlock_greedy_dictMatchState_row, - ZSTD_compressBlock_lazy_dictMatchState_row, - ZSTD_compressBlock_lazy2_dictMatchState_row }, - { ZSTD_compressBlock_greedy_dedicatedDictSearch_row, - ZSTD_compressBlock_lazy_dedicatedDictSearch_row, - ZSTD_compressBlock_lazy2_dedicatedDictSearch_row } + { + ZSTD_COMPRESSBLOCK_GREEDY_ROW, + ZSTD_COMPRESSBLOCK_LAZY_ROW, + 
ZSTD_COMPRESSBLOCK_LAZY2_ROW + }, + { + ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW, + ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW, + ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW + }, + { + ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW, + ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW, + ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW + }, + { + ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW, + ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW, + ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW + } }; DEBUGLOG(4, "Selecting a row-based matchfinder"); assert(useRowMatchFinder != ZSTD_ps_auto); @@ -3192,7 +3248,7 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) /* External matchfinder + LDM is technically possible, just not implemented yet. * We need to revisit soon and implement it. */ RETURN_ERROR_IF( - zc->appliedParams.useSequenceProducer, + ZSTD_hasExtSeqProd(&zc->appliedParams), parameter_combination_unsupported, "Long-distance matching with external sequence producer enabled is not currently supported." ); @@ -3211,7 +3267,7 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) /* External matchfinder + LDM is technically possible, just not implemented yet. * We need to revisit soon and implement it. */ RETURN_ERROR_IF( - zc->appliedParams.useSequenceProducer, + ZSTD_hasExtSeqProd(&zc->appliedParams), parameter_combination_unsupported, "Long-distance matching with external sequence producer enabled is not currently supported." ); @@ -3230,18 +3286,18 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) zc->appliedParams.useRowMatchFinder, src, srcSize); assert(ldmSeqStore.pos == ldmSeqStore.size); - } else if (zc->appliedParams.useSequenceProducer) { + } else if (ZSTD_hasExtSeqProd(&zc->appliedParams)) { assert( - zc->externalMatchCtx.seqBufferCapacity >= ZSTD_sequenceBound(srcSize) + zc->extSeqBufCapacity >= ZSTD_sequenceBound(srcSize) ); - assert(zc->externalMatchCtx.mFinder != NULL); + assert(zc->appliedParams.extSeqProdFunc != NULL); { U32 const windowSize = (U32)1 << zc->appliedParams.cParams.windowLog; - size_t const nbExternalSeqs = (zc->externalMatchCtx.mFinder)( - zc->externalMatchCtx.mState, - zc->externalMatchCtx.seqBuffer, - zc->externalMatchCtx.seqBufferCapacity, + size_t const nbExternalSeqs = (zc->appliedParams.extSeqProdFunc)( + zc->appliedParams.extSeqProdState, + zc->extSeqBuf, + zc->extSeqBufCapacity, src, srcSize, NULL, 0, /* dict and dictSize, currently not supported */ zc->appliedParams.compressionLevel, @@ -3249,21 +3305,21 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) ); size_t const nbPostProcessedSeqs = ZSTD_postProcessSequenceProducerResult( - zc->externalMatchCtx.seqBuffer, + zc->extSeqBuf, nbExternalSeqs, - zc->externalMatchCtx.seqBufferCapacity, + zc->extSeqBufCapacity, srcSize ); /* Return early if there is no error, since we don't need to worry about last literals */ if (!ZSTD_isError(nbPostProcessedSeqs)) { ZSTD_sequencePosition seqPos = {0,0,0}; - size_t const seqLenSum = ZSTD_fastSequenceLengthSum(zc->externalMatchCtx.seqBuffer, nbPostProcessedSeqs); + size_t const seqLenSum = ZSTD_fastSequenceLengthSum(zc->extSeqBuf, nbPostProcessedSeqs); RETURN_ERROR_IF(seqLenSum > srcSize, externalSequences_invalid, "External sequences imply too large a block!"); FORWARD_IF_ERROR( ZSTD_copySequencesToSeqStoreExplicitBlockDelim( zc, &seqPos, - zc->externalMatchCtx.seqBuffer, nbPostProcessedSeqs, + zc->extSeqBuf, nbPostProcessedSeqs, src, srcSize, 
zc->appliedParams.searchForExternalRepcodes ), @@ -3280,9 +3336,11 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) } /* Fallback to software matchfinder */ - { ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, - zc->appliedParams.useRowMatchFinder, - dictMode); + { ZSTD_blockCompressor const blockCompressor = + ZSTD_selectBlockCompressor( + zc->appliedParams.cParams.strategy, + zc->appliedParams.useRowMatchFinder, + dictMode); ms->ldmSeqStore = NULL; DEBUGLOG( 5, @@ -3292,9 +3350,10 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize); } } } else { /* not long range mode and no external matchfinder */ - ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, - zc->appliedParams.useRowMatchFinder, - dictMode); + ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor( + zc->appliedParams.cParams.strategy, + zc->appliedParams.useRowMatchFinder, + dictMode); ms->ldmSeqStore = NULL; lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize); } @@ -3304,29 +3363,38 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) return ZSTDbss_compress; } -static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) +static size_t ZSTD_copyBlockSequences(SeqCollector* seqCollector, const seqStore_t* seqStore, const U32 prevRepcodes[ZSTD_REP_NUM]) { - const seqStore_t* seqStore = ZSTD_getSeqStore(zc); - const seqDef* seqStoreSeqs = seqStore->sequencesStart; - size_t seqStoreSeqSize = seqStore->sequences - seqStoreSeqs; - size_t seqStoreLiteralsSize = (size_t)(seqStore->lit - seqStore->litStart); - size_t literalsRead = 0; - size_t lastLLSize; + const seqDef* inSeqs = seqStore->sequencesStart; + const size_t nbInSequences = seqStore->sequences - inSeqs; + const size_t nbInLiterals = (size_t)(seqStore->lit - seqStore->litStart); - ZSTD_Sequence* outSeqs = &zc->seqCollector.seqStart[zc->seqCollector.seqIndex]; + ZSTD_Sequence* outSeqs = seqCollector->seqIndex == 0 ? 
seqCollector->seqStart : seqCollector->seqStart + seqCollector->seqIndex; + const size_t nbOutSequences = nbInSequences + 1; + size_t nbOutLiterals = 0; + repcodes_t repcodes; size_t i; - repcodes_t updatedRepcodes; - assert(zc->seqCollector.seqIndex + 1 < zc->seqCollector.maxSequences); - /* Ensure we have enough space for last literals "sequence" */ - assert(zc->seqCollector.maxSequences >= seqStoreSeqSize + 1); - ZSTD_memcpy(updatedRepcodes.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); - for (i = 0; i < seqStoreSeqSize; ++i) { - U32 rawOffset = seqStoreSeqs[i].offBase - ZSTD_REP_NUM; - outSeqs[i].litLength = seqStoreSeqs[i].litLength; - outSeqs[i].matchLength = seqStoreSeqs[i].mlBase + MINMATCH; + /* Bounds check that we have enough space for every input sequence + * and the block delimiter + */ + assert(seqCollector->seqIndex <= seqCollector->maxSequences); + RETURN_ERROR_IF( + nbOutSequences > (size_t)(seqCollector->maxSequences - seqCollector->seqIndex), + dstSize_tooSmall, + "Not enough space to copy sequences"); + + ZSTD_memcpy(&repcodes, prevRepcodes, sizeof(repcodes)); + for (i = 0; i < nbInSequences; ++i) { + U32 rawOffset; + outSeqs[i].litLength = inSeqs[i].litLength; + outSeqs[i].matchLength = inSeqs[i].mlBase + MINMATCH; outSeqs[i].rep = 0; + /* Handle the possible single length >= 64K + * There can only be one because we add MINMATCH to every match length, + * and blocks are at most 128K. + */ if (i == seqStore->longLengthPos) { if (seqStore->longLengthType == ZSTD_llt_literalLength) { outSeqs[i].litLength += 0x10000; @@ -3335,41 +3403,55 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) } } - if (seqStoreSeqs[i].offBase <= ZSTD_REP_NUM) { - /* Derive the correct offset corresponding to a repcode */ - outSeqs[i].rep = seqStoreSeqs[i].offBase; + /* Determine the raw offset given the offBase, which may be a repcode. */ + if (OFFBASE_IS_REPCODE(inSeqs[i].offBase)) { + const U32 repcode = OFFBASE_TO_REPCODE(inSeqs[i].offBase); + assert(repcode > 0); + outSeqs[i].rep = repcode; if (outSeqs[i].litLength != 0) { - rawOffset = updatedRepcodes.rep[outSeqs[i].rep - 1]; + rawOffset = repcodes.rep[repcode - 1]; } else { - if (outSeqs[i].rep == 3) { - rawOffset = updatedRepcodes.rep[0] - 1; + if (repcode == 3) { + assert(repcodes.rep[0] > 1); + rawOffset = repcodes.rep[0] - 1; } else { - rawOffset = updatedRepcodes.rep[outSeqs[i].rep]; + rawOffset = repcodes.rep[repcode]; } } + } else { + rawOffset = OFFBASE_TO_OFFSET(inSeqs[i].offBase); } outSeqs[i].offset = rawOffset; - /* seqStoreSeqs[i].offset == offCode+1, and ZSTD_updateRep() expects offCode - so we provide seqStoreSeqs[i].offset - 1 */ - ZSTD_updateRep(updatedRepcodes.rep, - seqStoreSeqs[i].offBase, - seqStoreSeqs[i].litLength == 0); - literalsRead += outSeqs[i].litLength; + + /* Update repcode history for the sequence */ + ZSTD_updateRep(repcodes.rep, + inSeqs[i].offBase, + inSeqs[i].litLength == 0); + + nbOutLiterals += outSeqs[i].litLength; } /* Insert last literals (if any exist) in the block as a sequence with ml == off == 0. * If there are no last literals, then we'll emit (of: 0, ml: 0, ll: 0), which is a marker * for the block boundary, according to the API. 
*/ - assert(seqStoreLiteralsSize >= literalsRead); - lastLLSize = seqStoreLiteralsSize - literalsRead; - outSeqs[i].litLength = (U32)lastLLSize; - outSeqs[i].matchLength = outSeqs[i].offset = outSeqs[i].rep = 0; - seqStoreSeqSize++; - zc->seqCollector.seqIndex += seqStoreSeqSize; + assert(nbInLiterals >= nbOutLiterals); + { + const size_t lastLLSize = nbInLiterals - nbOutLiterals; + outSeqs[nbInSequences].litLength = (U32)lastLLSize; + outSeqs[nbInSequences].matchLength = 0; + outSeqs[nbInSequences].offset = 0; + assert(nbOutSequences == nbInSequences + 1); + } + seqCollector->seqIndex += nbOutSequences; + assert(seqCollector->seqIndex <= seqCollector->maxSequences); + + return 0; } size_t ZSTD_sequenceBound(size_t srcSize) { - return (srcSize / ZSTD_MINMATCH_MIN) + 1; + const size_t maxNbSeq = (srcSize / ZSTD_MINMATCH_MIN) + 1; + const size_t maxNbDelims = (srcSize / ZSTD_BLOCKSIZE_MAX_MIN) + 1; + return maxNbSeq + maxNbDelims; } size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, @@ -3378,6 +3460,16 @@ size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, const size_t dstCapacity = ZSTD_compressBound(srcSize); void* dst = ZSTD_customMalloc(dstCapacity, ZSTD_defaultCMem); SeqCollector seqCollector; + { + int targetCBlockSize; + FORWARD_IF_ERROR(ZSTD_CCtx_getParameter(zc, ZSTD_c_targetCBlockSize, &targetCBlockSize), ""); + RETURN_ERROR_IF(targetCBlockSize != 0, parameter_unsupported, "targetCBlockSize != 0"); + } + { + int nbWorkers; + FORWARD_IF_ERROR(ZSTD_CCtx_getParameter(zc, ZSTD_c_nbWorkers, &nbWorkers), ""); + RETURN_ERROR_IF(nbWorkers != 0, parameter_unsupported, "nbWorkers != 0"); + } RETURN_ERROR_IF(dst == NULL, memory_allocation, "NULL pointer!"); @@ -3387,8 +3479,12 @@ size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, seqCollector.maxSequences = outSeqsSize; zc->seqCollector = seqCollector; - ZSTD_compress2(zc, dst, dstCapacity, src, srcSize); - ZSTD_customFree(dst, ZSTD_defaultCMem); + { + const size_t ret = ZSTD_compress2(zc, dst, dstCapacity, src, srcSize); + ZSTD_customFree(dst, ZSTD_defaultCMem); + FORWARD_IF_ERROR(ret, "ZSTD_compress2 failed"); + } + assert(zc->seqCollector.seqIndex <= ZSTD_sequenceBound(srcSize)); return zc->seqCollector.seqIndex; } @@ -3981,8 +4077,9 @@ ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, cSeqsSize = 1; } + /* Sequence collection not supported when block splitting */ if (zc->seqCollector.collectSequences) { - ZSTD_copyBlockSequences(zc); + FORWARD_IF_ERROR(ZSTD_copyBlockSequences(&zc->seqCollector, seqStore, dRepOriginal.rep), "copyBlockSequences failed"); ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); return 0; } @@ -4204,6 +4301,7 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc, if (bss == ZSTDbss_noCompress) { if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; + RETURN_ERROR_IF(zc->seqCollector.collectSequences, sequenceProducer_failed, "Uncompressible block"); cSize = ZSTD_noCompressBlock(dst, dstCapacity, src, srcSize, lastBlock); FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); DEBUGLOG(4, "ZSTD_compressBlock_splitBlock: Nocompress block"); @@ -4236,11 +4334,15 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* zc, { const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize); FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed"); - if (bss == ZSTDbss_noCompress) { cSize = 0; goto out; } + if (bss == ZSTDbss_noCompress) { + 
RETURN_ERROR_IF(zc->seqCollector.collectSequences, sequenceProducer_failed, "Uncompressible block"); + cSize = 0; + goto out; + } } if (zc->seqCollector.collectSequences) { - ZSTD_copyBlockSequences(zc); + FORWARD_IF_ERROR(ZSTD_copyBlockSequences(&zc->seqCollector, ZSTD_getSeqStore(zc), zc->blockState.prevCBlock->rep), "copyBlockSequences failed"); ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); return 0; } @@ -4553,19 +4655,15 @@ size_t ZSTD_writeLastEmptyBlock(void* dst, size_t dstCapacity) } } -size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq) +void ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq) { - RETURN_ERROR_IF(cctx->stage != ZSTDcs_init, stage_wrong, - "wrong cctx stage"); - RETURN_ERROR_IF(cctx->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable, - parameter_unsupported, - "incompatible with ldm"); + assert(cctx->stage == ZSTDcs_init); + assert(nbSeq == 0 || cctx->appliedParams.ldmParams.enableLdm != ZSTD_ps_enable); cctx->externSeqStore.seq = seq; cctx->externSeqStore.size = nbSeq; cctx->externSeqStore.capacity = nbSeq; cctx->externSeqStore.pos = 0; cctx->externSeqStore.posInSequence = 0; - return 0; } @@ -4760,12 +4858,19 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, ZSTD_fillHashTable(ms, iend, dtlm, tfp); break; case ZSTD_dfast: +#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR ZSTD_fillDoubleHashTable(ms, iend, dtlm, tfp); +#else + assert(0); /* shouldn't be called: cparams should've been adjusted. */ +#endif break; case ZSTD_greedy: case ZSTD_lazy: case ZSTD_lazy2: +#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) assert(srcSize >= HASH_READ_SIZE); if (ms->dedicatedDictSearch) { assert(ms->chainTable != NULL); @@ -4782,14 +4887,23 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, DEBUGLOG(4, "Using chain-based hash table for lazy dict"); } } +#else + assert(0); /* shouldn't be called: cparams should've been adjusted. */ +#endif break; case ZSTD_btlazy2: /* we want the dictionary table fully sorted */ case ZSTD_btopt: case ZSTD_btultra: case ZSTD_btultra2: +#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR) assert(srcSize >= HASH_READ_SIZE); ZSTD_updateTree(ms, iend-HASH_READ_SIZE, iend); +#else + assert(0); /* shouldn't be called: cparams should've been adjusted. */ +#endif break; default: @@ -4836,11 +4950,10 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace, /* We only set the loaded table as valid if it contains all non-zero * weights. 
Otherwise, we set it to check */ - if (!hasZeroWeights) + if (!hasZeroWeights && maxSymbolValue == 255) bs->entropy.huf.repeatMode = HUF_repeat_valid; RETURN_ERROR_IF(HUF_isError(hufHeaderSize), dictionary_corrupted, ""); - RETURN_ERROR_IF(maxSymbolValue < 255, dictionary_corrupted, ""); dictPtr += hufHeaderSize; } @@ -5107,14 +5220,13 @@ static size_t ZSTD_writeEpilogue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity) { BYTE* const ostart = (BYTE*)dst; BYTE* op = ostart; - size_t fhSize = 0; DEBUGLOG(4, "ZSTD_writeEpilogue"); RETURN_ERROR_IF(cctx->stage == ZSTDcs_created, stage_wrong, "init missing"); /* special case : empty frame */ if (cctx->stage == ZSTDcs_init) { - fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams, 0, 0); + size_t fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams, 0, 0); FORWARD_IF_ERROR(fhSize, "ZSTD_writeFrameHeader failed"); dstCapacity -= fhSize; op += fhSize; @@ -5124,8 +5236,9 @@ static size_t ZSTD_writeEpilogue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity) if (cctx->stage != ZSTDcs_ending) { /* write one last empty block, make it the "last" block */ U32 const cBlockHeader24 = 1 /* last block */ + (((U32)bt_raw)<<1) + 0; - RETURN_ERROR_IF(dstCapacity<4, dstSize_tooSmall, "no room for epilogue"); - MEM_writeLE32(op, cBlockHeader24); + ZSTD_STATIC_ASSERT(ZSTD_BLOCKHEADERSIZE == 3); + RETURN_ERROR_IF(dstCapacity<3, dstSize_tooSmall, "no room for epilogue"); + MEM_writeLE24(op, cBlockHeader24); op += ZSTD_blockHeaderSize; dstCapacity -= ZSTD_blockHeaderSize; } @@ -5455,7 +5568,7 @@ ZSTD_CDict* ZSTD_createCDict_advanced2( cctxParams.useRowMatchFinder, cctxParams.enableDedicatedDictSearch, customMem); - if (ZSTD_isError( ZSTD_initCDict_internal(cdict, + if (!cdict || ZSTD_isError( ZSTD_initCDict_internal(cdict, dict, dictSize, dictLoadMethod, dictContentType, cctxParams) )) { @@ -5879,7 +5992,7 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, if (zcs->appliedParams.inBufferMode == ZSTD_bm_stable) { assert(input->pos >= zcs->stableIn_notConsumed); input->pos -= zcs->stableIn_notConsumed; - ip -= zcs->stableIn_notConsumed; + if (ip) ip -= zcs->stableIn_notConsumed; zcs->stableIn_notConsumed = 0; } if (zcs->appliedParams.inBufferMode == ZSTD_bm_buffered) { @@ -6138,7 +6251,7 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, #ifdef ZSTD_MULTITHREAD /* If external matchfinder is enabled, make sure to fail before checking job size (for consistency) */ RETURN_ERROR_IF( - params.useSequenceProducer == 1 && params.nbWorkers >= 1, + ZSTD_hasExtSeqProd(¶ms) && params.nbWorkers >= 1, parameter_combination_unsupported, "External sequence producer isn't supported with nbWorkers >= 1" ); @@ -6430,7 +6543,7 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, if (cctx->appliedParams.validateSequences) { seqPos->posInSrc += litLength + matchLength; FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc, - cctx->appliedParams.cParams.windowLog, dictSize, cctx->appliedParams.useSequenceProducer), + cctx->appliedParams.cParams.windowLog, dictSize, ZSTD_hasExtSeqProd(&cctx->appliedParams)), "Sequence validation failed"); } RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid, @@ -6568,7 +6681,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* if (cctx->appliedParams.validateSequences) { seqPos->posInSrc += litLength + matchLength; 
FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc, - cctx->appliedParams.cParams.windowLog, dictSize, cctx->appliedParams.useSequenceProducer), + cctx->appliedParams.cParams.windowLog, dictSize, ZSTD_hasExtSeqProd(&cctx->appliedParams)), "Sequence validation failed"); } DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); @@ -7014,19 +7127,27 @@ ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeH } void ZSTD_registerSequenceProducer( - ZSTD_CCtx* zc, void* mState, - ZSTD_sequenceProducer_F* mFinder + ZSTD_CCtx* zc, + void* extSeqProdState, + ZSTD_sequenceProducer_F extSeqProdFunc +) { + assert(zc != NULL); + ZSTD_CCtxParams_registerSequenceProducer( + &zc->requestedParams, extSeqProdState, extSeqProdFunc + ); +} + +void ZSTD_CCtxParams_registerSequenceProducer( + ZSTD_CCtx_params* params, + void* extSeqProdState, + ZSTD_sequenceProducer_F extSeqProdFunc ) { - if (mFinder != NULL) { - ZSTD_externalMatchCtx emctx; - emctx.mState = mState; - emctx.mFinder = mFinder; - emctx.seqBuffer = NULL; - emctx.seqBufferCapacity = 0; - zc->externalMatchCtx = emctx; - zc->requestedParams.useSequenceProducer = 1; + assert(params != NULL); + if (extSeqProdFunc != NULL) { + params->extSeqProdFunc = extSeqProdFunc; + params->extSeqProdState = extSeqProdState; } else { - ZSTD_memset(&zc->externalMatchCtx, 0, sizeof(zc->externalMatchCtx)); - zc->requestedParams.useSequenceProducer = 0; + params->extSeqProdFunc = NULL; + params->extSeqProdState = NULL; } } diff --git a/thirdparty/zstd/compress/zstd_compress_internal.h b/thirdparty/zstd/compress/zstd_compress_internal.h index 10f68d010e..e41d7b78ec 100644 --- a/thirdparty/zstd/compress/zstd_compress_internal.h +++ b/thirdparty/zstd/compress/zstd_compress_internal.h @@ -39,7 +39,7 @@ extern "C" { It's not a big deal though : candidate will just be sorted again. Additionally, candidate position 1 will be lost. But candidate 1 cannot hide a large tree of candidates, so it's a minimal loss. - The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be mishandled after table re-use with a different strategy. + The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be mishandled after table reuse with a different strategy. 
This constant is required by ZSTD_compressBlock_btlazy2() and ZSTD_reduceTable_internal() */ @@ -159,23 +159,24 @@ typedef struct { UNUSED_ATTR static const rawSeqStore_t kNullRawSeqStore = {NULL, 0, 0, 0, 0}; typedef struct { - int price; - U32 off; - U32 mlen; - U32 litlen; - U32 rep[ZSTD_REP_NUM]; + int price; /* price from beginning of segment to this position */ + U32 off; /* offset of previous match */ + U32 mlen; /* length of previous match */ + U32 litlen; /* nb of literals since previous match */ + U32 rep[ZSTD_REP_NUM]; /* offset history after previous match */ } ZSTD_optimal_t; typedef enum { zop_dynamic=0, zop_predef } ZSTD_OptPrice_e; +#define ZSTD_OPT_SIZE (ZSTD_OPT_NUM+3) typedef struct { /* All tables are allocated inside cctx->workspace by ZSTD_resetCCtx_internal() */ unsigned* litFreq; /* table of literals statistics, of size 256 */ unsigned* litLengthFreq; /* table of litLength statistics, of size (MaxLL+1) */ unsigned* matchLengthFreq; /* table of matchLength statistics, of size (MaxML+1) */ unsigned* offCodeFreq; /* table of offCode statistics, of size (MaxOff+1) */ - ZSTD_match_t* matchTable; /* list of found matches, of size ZSTD_OPT_NUM+1 */ - ZSTD_optimal_t* priceTable; /* All positions tracked by optimal parser, of size ZSTD_OPT_NUM+1 */ + ZSTD_match_t* matchTable; /* list of found matches, of size ZSTD_OPT_SIZE */ + ZSTD_optimal_t* priceTable; /* All positions tracked by optimal parser, of size ZSTD_OPT_SIZE */ U32 litSum; /* nb of literals */ U32 litLengthSum; /* nb of litLength codes */ @@ -228,7 +229,7 @@ struct ZSTD_matchState_t { U32 rowHashLog; /* For row-based matchfinder: Hashlog based on nb of rows in the hashTable.*/ BYTE* tagTable; /* For row-based matchFinder: A row-based table containing the hashes and head index. */ U32 hashCache[ZSTD_ROW_HASH_CACHE_SIZE]; /* For row-based matchFinder: a cache of hashes to improve speed */ - U64 hashSalt; /* For row-based matchFinder: salts the hash for re-use of tag table */ + U64 hashSalt; /* For row-based matchFinder: salts the hash for reuse of tag table */ U32 hashSaltEntropy; /* For row-based matchFinder: collects entropy for salt generation */ U32* hashTable; @@ -360,10 +361,11 @@ struct ZSTD_CCtx_params_s { * if the external matchfinder returns an error code. */ int enableMatchFinderFallback; - /* Indicates whether an external matchfinder has been referenced. - * Users can't set this externally. - * It is set internally in ZSTD_registerSequenceProducer(). */ - int useSequenceProducer; + /* Parameters for the external sequence producer API. + * Users set these parameters through ZSTD_registerSequenceProducer(). + * It is not possible to set these parameters individually through the public API. */ + void* extSeqProdState; + ZSTD_sequenceProducer_F extSeqProdFunc; /* Adjust the max block size*/ size_t maxBlockSize; @@ -401,14 +403,6 @@ typedef struct { ZSTD_entropyCTablesMetadata_t entropyMetadata; } ZSTD_blockSplitCtx; -/* Context for block-level external matchfinder API */ -typedef struct { - void* mState; - ZSTD_sequenceProducer_F* mFinder; - ZSTD_Sequence* seqBuffer; - size_t seqBufferCapacity; -} ZSTD_externalMatchCtx; - struct ZSTD_CCtx_s { ZSTD_compressionStage_e stage; int cParamsChanged; /* == 1 if cParams(except wlog) or compression level are changed in requestedParams. Triggers transmission of new params to ZSTDMT (if available) then reset to 0. 
*/ @@ -479,8 +473,9 @@ struct ZSTD_CCtx_s { /* Workspace for block splitter */ ZSTD_blockSplitCtx blockSplitCtx; - /* Workspace for external matchfinder */ - ZSTD_externalMatchCtx externalMatchCtx; + /* Buffer for output from external sequence producer */ + ZSTD_Sequence* extSeqBuf; + size_t extSeqBufCapacity; }; typedef enum { ZSTD_dtlm_fast, ZSTD_dtlm_full } ZSTD_dictTableLoadMethod_e; @@ -1053,7 +1048,9 @@ MEM_STATIC U32 ZSTD_window_needOverflowCorrection(ZSTD_window_t const window, * The least significant cycleLog bits of the indices must remain the same, * which may be 0. Every index up to maxDist in the past must be valid. */ -MEM_STATIC U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog, +MEM_STATIC +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog, U32 maxDist, void const* src) { /* preemptive overflow correction: @@ -1246,7 +1243,9 @@ MEM_STATIC void ZSTD_window_init(ZSTD_window_t* window) { * forget about the extDict. Handles overlap of the prefix and extDict. * Returns non-zero if the segment is contiguous. */ -MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window, +MEM_STATIC +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +U32 ZSTD_window_update(ZSTD_window_t* window, void const* src, size_t srcSize, int forceNonContiguous) { @@ -1467,11 +1466,10 @@ size_t ZSTD_writeLastEmptyBlock(void* dst, size_t dstCapacity); * This cannot be used when long range matching is enabled. * Zstd will use these sequences, and pass the literals to a secondary block * compressor. - * @return : An error code on failure. * NOTE: seqs are not verified! Invalid sequences can cause out-of-bounds memory * access and data corruption. */ -size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq); +void ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq); /** ZSTD_cycleLog() : * condition for correct operation : hashLog > 1 */ @@ -1509,6 +1507,10 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch); +/* Returns 1 if an external sequence producer is registered, otherwise returns 0. */ +MEM_STATIC int ZSTD_hasExtSeqProd(const ZSTD_CCtx_params* params) { + return params->extSeqProdFunc != NULL; +} /* =============================================================== * Deprecated definitions that are still used internally to avoid diff --git a/thirdparty/zstd/compress/zstd_compress_superblock.c b/thirdparty/zstd/compress/zstd_compress_superblock.c index 638c4acbe7..628a2dccd0 100644 --- a/thirdparty/zstd/compress/zstd_compress_superblock.c +++ b/thirdparty/zstd/compress/zstd_compress_superblock.c @@ -76,8 +76,8 @@ ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, } { int const flags = bmi2 ? HUF_flags_bmi2 : 0; - const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, oend-op, literals, litSize, hufTable, flags) - : HUF_compress4X_usingCTable(op, oend-op, literals, litSize, hufTable, flags); + const size_t cSize = singleStream ? 
HUF_compress1X_usingCTable(op, (size_t)(oend-op), literals, litSize, hufTable, flags) + : HUF_compress4X_usingCTable(op, (size_t)(oend-op), literals, litSize, hufTable, flags); op += cSize; cLitSize += cSize; if (cSize == 0 || ERR_isError(cSize)) { @@ -102,7 +102,7 @@ ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, switch(lhSize) { case 3: /* 2 - 2 - 10 - 10 */ - { U32 const lhc = hType + ((!singleStream) << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<14); + { U32 const lhc = hType + ((U32)(!singleStream) << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<14); MEM_writeLE24(ostart, lhc); break; } @@ -122,30 +122,30 @@ ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, } *entropyWritten = 1; DEBUGLOG(5, "Compressed literals: %u -> %u", (U32)litSize, (U32)(op-ostart)); - return op-ostart; + return (size_t)(op-ostart); } static size_t ZSTD_seqDecompressedSize(seqStore_t const* seqStore, - const seqDef* sequences, size_t nbSeq, - size_t litSize, int lastSequence) + const seqDef* sequences, size_t nbSeqs, + size_t litSize, int lastSubBlock) { - const seqDef* const sstart = sequences; - const seqDef* const send = sequences + nbSeq; - const seqDef* sp = sstart; size_t matchLengthSum = 0; size_t litLengthSum = 0; - (void)(litLengthSum); /* suppress unused variable warning on some environments */ - while (send-sp > 0) { - ZSTD_sequenceLength const seqLen = ZSTD_getSequenceLength(seqStore, sp); + size_t n; + for (n=0; n<nbSeqs; n++) { + const ZSTD_sequenceLength seqLen = ZSTD_getSequenceLength(seqStore, sequences+n); litLengthSum += seqLen.litLength; matchLengthSum += seqLen.matchLength; - sp++; } - assert(litLengthSum <= litSize); - if (!lastSequence) { + DEBUGLOG(5, "ZSTD_seqDecompressedSize: %u sequences from %p: %u literals + %u matchlength", + (unsigned)nbSeqs, (const void*)sequences, + (unsigned)litLengthSum, (unsigned)matchLengthSum); + if (!lastSubBlock) assert(litLengthSum == litSize); - } + else + assert(litLengthSum <= litSize); + (void)litLengthSum; return matchLengthSum + litSize; } @@ -180,14 +180,14 @@ ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables, /* Sequences Header */ RETURN_ERROR_IF((oend-op) < 3 /*max nbSeq Size*/ + 1 /*seqHead*/, dstSize_tooSmall, ""); - if (nbSeq < 0x7F) + if (nbSeq < 128) *op++ = (BYTE)nbSeq; else if (nbSeq < LONGNBSEQ) op[0] = (BYTE)((nbSeq>>8) + 0x80), op[1] = (BYTE)nbSeq, op+=2; else op[0]=0xFF, MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)), op+=3; if (nbSeq==0) { - return op - ostart; + return (size_t)(op - ostart); } /* seqHead : flags for FSE encoding type */ @@ -209,7 +209,7 @@ ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables, } { size_t const bitstreamSize = ZSTD_encodeSequences( - op, oend - op, + op, (size_t)(oend - op), fseTables->matchlengthCTable, mlCode, fseTables->offcodeCTable, ofCode, fseTables->litlengthCTable, llCode, @@ -253,7 +253,7 @@ ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables, #endif *entropyWritten = 1; - return op - ostart; + return (size_t)(op - ostart); } /** ZSTD_compressSubBlock() : @@ -279,7 +279,8 @@ static size_t ZSTD_compressSubBlock(const ZSTD_entropyCTables_t* entropy, litSize, nbSeq, writeLitEntropy, writeSeqEntropy, lastBlock); { size_t cLitSize = ZSTD_compressSubBlock_literal((const HUF_CElt*)entropy->huf.CTable, &entropyMetadata->hufMetadata, literals, litSize, - op, oend-op, bmi2, writeLitEntropy, litEntropyWritten); + op, (size_t)(oend-op), + bmi2, writeLitEntropy, litEntropyWritten); FORWARD_IF_ERROR(cLitSize, "ZSTD_compressSubBlock_literal failed"); if 
(cLitSize == 0) return 0; op += cLitSize; @@ -289,18 +290,18 @@ static size_t ZSTD_compressSubBlock(const ZSTD_entropyCTables_t* entropy, sequences, nbSeq, llCode, mlCode, ofCode, cctxParams, - op, oend-op, + op, (size_t)(oend-op), bmi2, writeSeqEntropy, seqEntropyWritten); FORWARD_IF_ERROR(cSeqSize, "ZSTD_compressSubBlock_sequences failed"); if (cSeqSize == 0) return 0; op += cSeqSize; } /* Write block header */ - { size_t cSize = (op-ostart)-ZSTD_blockHeaderSize; + { size_t cSize = (size_t)(op-ostart) - ZSTD_blockHeaderSize; U32 const cBlockHeader24 = lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); MEM_writeLE24(ostart, cBlockHeader24); } - return op-ostart; + return (size_t)(op-ostart); } static size_t ZSTD_estimateSubBlockSize_literal(const BYTE* literals, size_t litSize, @@ -389,7 +390,11 @@ static size_t ZSTD_estimateSubBlockSize_sequences(const BYTE* ofCodeTable, return cSeqSizeEstimate + sequencesSectionHeaderSize; } -static size_t ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize, +typedef struct { + size_t estLitSize; + size_t estBlockSize; +} EstimatedBlockSize; +static EstimatedBlockSize ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize, const BYTE* ofCodeTable, const BYTE* llCodeTable, const BYTE* mlCodeTable, @@ -397,15 +402,17 @@ static size_t ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize, const ZSTD_entropyCTables_t* entropy, const ZSTD_entropyCTablesMetadata_t* entropyMetadata, void* workspace, size_t wkspSize, - int writeLitEntropy, int writeSeqEntropy) { - size_t cSizeEstimate = 0; - cSizeEstimate += ZSTD_estimateSubBlockSize_literal(literals, litSize, - &entropy->huf, &entropyMetadata->hufMetadata, - workspace, wkspSize, writeLitEntropy); - cSizeEstimate += ZSTD_estimateSubBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable, + int writeLitEntropy, int writeSeqEntropy) +{ + EstimatedBlockSize ebs; + ebs.estLitSize = ZSTD_estimateSubBlockSize_literal(literals, litSize, + &entropy->huf, &entropyMetadata->hufMetadata, + workspace, wkspSize, writeLitEntropy); + ebs.estBlockSize = ZSTD_estimateSubBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable, nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, workspace, wkspSize, writeSeqEntropy); - return cSizeEstimate + ZSTD_blockHeaderSize; + ebs.estBlockSize += ebs.estLitSize + ZSTD_blockHeaderSize; + return ebs; } static int ZSTD_needSequenceEntropyTables(ZSTD_fseCTablesMetadata_t const* fseMetadata) @@ -419,13 +426,56 @@ static int ZSTD_needSequenceEntropyTables(ZSTD_fseCTablesMetadata_t const* fseMe return 0; } +static size_t countLiterals(seqStore_t const* seqStore, const seqDef* sp, size_t seqCount) +{ + size_t n, total = 0; + assert(sp != NULL); + for (n=0; n<seqCount; n++) { + total += ZSTD_getSequenceLength(seqStore, sp+n).litLength; + } + DEBUGLOG(6, "countLiterals for %zu sequences from %p => %zu bytes", seqCount, (const void*)sp, total); + return total; +} + +#define BYTESCALE 256 + +static size_t sizeBlockSequences(const seqDef* sp, size_t nbSeqs, + size_t targetBudget, size_t avgLitCost, size_t avgSeqCost, + int firstSubBlock) +{ + size_t n, budget = 0, inSize=0; + /* entropy headers */ + size_t const headerSize = (size_t)firstSubBlock * 120 * BYTESCALE; /* generous estimate */ + assert(firstSubBlock==0 || firstSubBlock==1); + budget += headerSize; + + /* first sequence => at least one sequence*/ + budget += sp[0].litLength * avgLitCost + avgSeqCost; + if (budget > targetBudget) return 1; + inSize = sp[0].litLength + (sp[0].mlBase+MINMATCH); + + /* loop 
over sequences */ + for (n=1; n<nbSeqs; n++) { + size_t currentCost = sp[n].litLength * avgLitCost + avgSeqCost; + budget += currentCost; + inSize += sp[n].litLength + (sp[n].mlBase+MINMATCH); + /* stop when sub-block budget is reached */ + if ( (budget > targetBudget) + /* though continue to expand until the sub-block is deemed compressible */ + && (budget < inSize * BYTESCALE) ) + break; + } + + return n; +} + /** ZSTD_compressSubBlock_multi() : * Breaks super-block into multiple sub-blocks and compresses them. - * Entropy will be written to the first block. - * The following blocks will use repeat mode to compress. - * All sub-blocks are compressed blocks (no raw or rle blocks). - * @return : compressed size of the super block (which is multiple ZSTD blocks) - * Or 0 if it failed to compress. */ + * Entropy will be written into the first block. + * The following blocks use repeat_mode to compress. + * Sub-blocks are all compressed, except the last one when beneficial. + * @return : compressed size of the super block (which features multiple ZSTD blocks) + * or 0 if it failed to compress. */ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr, const ZSTD_compressedBlockState_t* prevCBlock, ZSTD_compressedBlockState_t* nextCBlock, @@ -438,10 +488,12 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr, { const seqDef* const sstart = seqStorePtr->sequencesStart; const seqDef* const send = seqStorePtr->sequences; - const seqDef* sp = sstart; + const seqDef* sp = sstart; /* tracks progresses within seqStorePtr->sequences */ + size_t const nbSeqs = (size_t)(send - sstart); const BYTE* const lstart = seqStorePtr->litStart; const BYTE* const lend = seqStorePtr->lit; const BYTE* lp = lstart; + size_t const nbLiterals = (size_t)(lend - lstart); BYTE const* ip = (BYTE const*)src; BYTE const* const iend = ip + srcSize; BYTE* const ostart = (BYTE*)dst; @@ -450,96 +502,152 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr, const BYTE* llCodePtr = seqStorePtr->llCode; const BYTE* mlCodePtr = seqStorePtr->mlCode; const BYTE* ofCodePtr = seqStorePtr->ofCode; - size_t targetCBlockSize = cctxParams->targetCBlockSize; - size_t litSize, seqCount; - int writeLitEntropy = entropyMetadata->hufMetadata.hType == set_compressed; + size_t const minTarget = ZSTD_TARGETCBLOCKSIZE_MIN; /* enforce minimum size, to reduce undesirable side effects */ + size_t const targetCBlockSize = MAX(minTarget, cctxParams->targetCBlockSize); + int writeLitEntropy = (entropyMetadata->hufMetadata.hType == set_compressed); int writeSeqEntropy = 1; - int lastSequence = 0; - - DEBUGLOG(5, "ZSTD_compressSubBlock_multi (litSize=%u, nbSeq=%u)", - (unsigned)(lend-lp), (unsigned)(send-sstart)); - - litSize = 0; - seqCount = 0; - do { - size_t cBlockSizeEstimate = 0; - if (sstart == send) { - lastSequence = 1; - } else { - const seqDef* const sequence = sp + seqCount; - lastSequence = sequence == send - 1; - litSize += ZSTD_getSequenceLength(seqStorePtr, sequence).litLength; - seqCount++; - } - if (lastSequence) { - assert(lp <= lend); - assert(litSize <= (size_t)(lend - lp)); - litSize = (size_t)(lend - lp); + + DEBUGLOG(5, "ZSTD_compressSubBlock_multi (srcSize=%u, litSize=%u, nbSeq=%u)", + (unsigned)srcSize, (unsigned)(lend-lstart), (unsigned)(send-sstart)); + + /* let's start by a general estimation for the full block */ + if (nbSeqs > 0) { + EstimatedBlockSize const ebs = + ZSTD_estimateSubBlockSize(lp, nbLiterals, + ofCodePtr, llCodePtr, mlCodePtr, nbSeqs, + 
&nextCBlock->entropy, entropyMetadata, + workspace, wkspSize, + writeLitEntropy, writeSeqEntropy); + /* quick estimation */ + size_t const avgLitCost = nbLiterals ? (ebs.estLitSize * BYTESCALE) / nbLiterals : BYTESCALE; + size_t const avgSeqCost = ((ebs.estBlockSize - ebs.estLitSize) * BYTESCALE) / nbSeqs; + const size_t nbSubBlocks = MAX((ebs.estBlockSize + (targetCBlockSize/2)) / targetCBlockSize, 1); + size_t n, avgBlockBudget, blockBudgetSupp=0; + avgBlockBudget = (ebs.estBlockSize * BYTESCALE) / nbSubBlocks; + DEBUGLOG(5, "estimated fullblock size=%u bytes ; avgLitCost=%.2f ; avgSeqCost=%.2f ; targetCBlockSize=%u, nbSubBlocks=%u ; avgBlockBudget=%.0f bytes", + (unsigned)ebs.estBlockSize, (double)avgLitCost/BYTESCALE, (double)avgSeqCost/BYTESCALE, + (unsigned)targetCBlockSize, (unsigned)nbSubBlocks, (double)avgBlockBudget/BYTESCALE); + /* simplification: if estimates states that the full superblock doesn't compress, just bail out immediately + * this will result in the production of a single uncompressed block covering @srcSize.*/ + if (ebs.estBlockSize > srcSize) return 0; + + /* compress and write sub-blocks */ + assert(nbSubBlocks>0); + for (n=0; n < nbSubBlocks-1; n++) { + /* determine nb of sequences for current sub-block + nbLiterals from next sequence */ + size_t const seqCount = sizeBlockSequences(sp, (size_t)(send-sp), + avgBlockBudget + blockBudgetSupp, avgLitCost, avgSeqCost, n==0); + /* if reached last sequence : break to last sub-block (simplification) */ + assert(seqCount <= (size_t)(send-sp)); + if (sp + seqCount == send) break; + assert(seqCount > 0); + /* compress sub-block */ + { int litEntropyWritten = 0; + int seqEntropyWritten = 0; + size_t litSize = countLiterals(seqStorePtr, sp, seqCount); + const size_t decompressedSize = + ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, 0); + size_t const cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata, + sp, seqCount, + lp, litSize, + llCodePtr, mlCodePtr, ofCodePtr, + cctxParams, + op, (size_t)(oend-op), + bmi2, writeLitEntropy, writeSeqEntropy, + &litEntropyWritten, &seqEntropyWritten, + 0); + FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed"); + + /* check compressibility, update state components */ + if (cSize > 0 && cSize < decompressedSize) { + DEBUGLOG(5, "Committed sub-block compressing %u bytes => %u bytes", + (unsigned)decompressedSize, (unsigned)cSize); + assert(ip + decompressedSize <= iend); + ip += decompressedSize; + lp += litSize; + op += cSize; + llCodePtr += seqCount; + mlCodePtr += seqCount; + ofCodePtr += seqCount; + /* Entropy only needs to be written once */ + if (litEntropyWritten) { + writeLitEntropy = 0; + } + if (seqEntropyWritten) { + writeSeqEntropy = 0; + } + sp += seqCount; + blockBudgetSupp = 0; + } } + /* otherwise : do not compress yet, coalesce current sub-block with following one */ } - /* I think there is an optimization opportunity here. - * Calling ZSTD_estimateSubBlockSize for every sequence can be wasteful - * since it recalculates estimate from scratch. - * For example, it would recount literal distribution and symbol codes every time. 
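To make the fixed-point accounting above concrete, here is a rough self-contained sketch (not part of the upstream sources; all figures are invented for illustration). It reuses the same formulas as the hunk above: costs are expressed in 1/BYTESCALE (1/256) of a byte, so avgLitCost is a per-literal cost and avgSeqCost a per-sequence cost in those units; the real code additionally clamps nbSubBlocks to at least 1 via MAX.

    #include <stddef.h>
    #include <stdio.h>

    #define BYTESCALE 256   /* fixed-point scale: costs counted in 1/256th of a byte */

    int main(void)
    {
        /* invented example figures for one super-block */
        size_t const estLitSize       = 2000;  /* estimated compressed size of all literals */
        size_t const estBlockSize     = 5200;  /* estimated compressed size of the full block */
        size_t const nbLiterals       = 8000;
        size_t const nbSeqs           = 900;
        size_t const targetCBlockSize = 1340;  /* requested sub-block size */

        /* same formulas as in the diff above */
        size_t const avgLitCost     = (estLitSize * BYTESCALE) / nbLiterals;               /* 64  => 0.25 byte per literal */
        size_t const avgSeqCost     = ((estBlockSize - estLitSize) * BYTESCALE) / nbSeqs;  /* 910 => ~3.55 bytes per sequence */
        size_t const nbSubBlocks    = (estBlockSize + (targetCBlockSize / 2)) / targetCBlockSize; /* rounded division => 4 */
        size_t const avgBlockBudget = (estBlockSize * BYTESCALE) / nbSubBlocks;            /* scaled budget per sub-block */

        printf("avgLitCost=%zu avgSeqCost=%zu nbSubBlocks=%zu avgBlockBudget=%.0f bytes\n",
               avgLitCost, avgSeqCost, nbSubBlocks, (double)avgBlockBudget / BYTESCALE);
        return 0;
    }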
- */ - cBlockSizeEstimate = ZSTD_estimateSubBlockSize(lp, litSize, ofCodePtr, llCodePtr, mlCodePtr, seqCount, - &nextCBlock->entropy, entropyMetadata, - workspace, wkspSize, writeLitEntropy, writeSeqEntropy); - if (cBlockSizeEstimate > targetCBlockSize || lastSequence) { - int litEntropyWritten = 0; - int seqEntropyWritten = 0; - const size_t decompressedSize = ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, lastSequence); - const size_t cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata, - sp, seqCount, - lp, litSize, - llCodePtr, mlCodePtr, ofCodePtr, - cctxParams, - op, oend-op, - bmi2, writeLitEntropy, writeSeqEntropy, - &litEntropyWritten, &seqEntropyWritten, - lastBlock && lastSequence); - FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed"); - if (cSize > 0 && cSize < decompressedSize) { - DEBUGLOG(5, "Committed the sub-block"); - assert(ip + decompressedSize <= iend); - ip += decompressedSize; - sp += seqCount; - lp += litSize; - op += cSize; - llCodePtr += seqCount; - mlCodePtr += seqCount; - ofCodePtr += seqCount; - litSize = 0; - seqCount = 0; - /* Entropy only needs to be written once */ - if (litEntropyWritten) { - writeLitEntropy = 0; - } - if (seqEntropyWritten) { - writeSeqEntropy = 0; - } + } /* if (nbSeqs > 0) */ + + /* write last block */ + DEBUGLOG(5, "Generate last sub-block: %u sequences remaining", (unsigned)(send - sp)); + { int litEntropyWritten = 0; + int seqEntropyWritten = 0; + size_t litSize = (size_t)(lend - lp); + size_t seqCount = (size_t)(send - sp); + const size_t decompressedSize = + ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, 1); + size_t const cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata, + sp, seqCount, + lp, litSize, + llCodePtr, mlCodePtr, ofCodePtr, + cctxParams, + op, (size_t)(oend-op), + bmi2, writeLitEntropy, writeSeqEntropy, + &litEntropyWritten, &seqEntropyWritten, + lastBlock); + FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed"); + + /* update pointers, the nb of literals borrowed from next sequence must be preserved */ + if (cSize > 0 && cSize < decompressedSize) { + DEBUGLOG(5, "Last sub-block compressed %u bytes => %u bytes", + (unsigned)decompressedSize, (unsigned)cSize); + assert(ip + decompressedSize <= iend); + ip += decompressedSize; + lp += litSize; + op += cSize; + llCodePtr += seqCount; + mlCodePtr += seqCount; + ofCodePtr += seqCount; + /* Entropy only needs to be written once */ + if (litEntropyWritten) { + writeLitEntropy = 0; + } + if (seqEntropyWritten) { + writeSeqEntropy = 0; } + sp += seqCount; } - } while (!lastSequence); + } + + if (writeLitEntropy) { - DEBUGLOG(5, "ZSTD_compressSubBlock_multi has literal entropy tables unwritten"); + DEBUGLOG(5, "Literal entropy tables were never written"); ZSTD_memcpy(&nextCBlock->entropy.huf, &prevCBlock->entropy.huf, sizeof(prevCBlock->entropy.huf)); } if (writeSeqEntropy && ZSTD_needSequenceEntropyTables(&entropyMetadata->fseMetadata)) { /* If we haven't written our entropy tables, then we've violated our contract and * must emit an uncompressed block. 
*/ - DEBUGLOG(5, "ZSTD_compressSubBlock_multi has sequence entropy tables unwritten"); + DEBUGLOG(5, "Sequence entropy tables were never written => cancel, emit an uncompressed block"); return 0; } + if (ip < iend) { - size_t const cSize = ZSTD_noCompressBlock(op, oend - op, ip, iend - ip, lastBlock); - DEBUGLOG(5, "ZSTD_compressSubBlock_multi last sub-block uncompressed, %zu bytes", (size_t)(iend - ip)); + /* some data left : last part of the block sent uncompressed */ + size_t const rSize = (size_t)((iend - ip)); + size_t const cSize = ZSTD_noCompressBlock(op, (size_t)(oend - op), ip, rSize, lastBlock); + DEBUGLOG(5, "Generate last uncompressed sub-block of %u bytes", (unsigned)(rSize)); FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); assert(cSize != 0); op += cSize; /* We have to regenerate the repcodes because we've skipped some sequences */ if (sp < send) { - seqDef const* seq; + const seqDef* seq; repcodes_t rep; ZSTD_memcpy(&rep, prevCBlock->rep, sizeof(rep)); for (seq = sstart; seq < sp; ++seq) { @@ -548,14 +656,17 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr, ZSTD_memcpy(nextCBlock->rep, &rep, sizeof(rep)); } } - DEBUGLOG(5, "ZSTD_compressSubBlock_multi compressed"); - return op-ostart; + + DEBUGLOG(5, "ZSTD_compressSubBlock_multi compressed all subBlocks: total compressed size = %u", + (unsigned)(op-ostart)); + return (size_t)(op-ostart); } size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc, void* dst, size_t dstCapacity, - void const* src, size_t srcSize, - unsigned lastBlock) { + const void* src, size_t srcSize, + unsigned lastBlock) +{ ZSTD_entropyCTablesMetadata_t entropyMetadata; FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(&zc->seqStore, diff --git a/thirdparty/zstd/compress/zstd_cwksp.h b/thirdparty/zstd/compress/zstd_cwksp.h index cc7fb1c715..3eddbd334e 100644 --- a/thirdparty/zstd/compress/zstd_cwksp.h +++ b/thirdparty/zstd/compress/zstd_cwksp.h @@ -192,6 +192,7 @@ MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) { { intptr_t const offset = __msan_test_shadow(ws->initOnceStart, (U8*)ZSTD_cwksp_initialAllocStart(ws) - (U8*)ws->initOnceStart); + (void)offset; #if defined(ZSTD_MSAN_PRINT) if(offset!=-1) { __msan_print_shadow((U8*)ws->initOnceStart + offset - 8, 32); @@ -433,7 +434,7 @@ MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes) /** * Aligned on 64 bytes. These buffers have the special property that - * their values remain constrained, allowing us to re-use them without + * their values remain constrained, allowing us to reuse them without * memset()-ing them. */ MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) @@ -525,7 +526,7 @@ MEM_STATIC void ZSTD_cwksp_mark_tables_dirty(ZSTD_cwksp* ws) DEBUGLOG(4, "cwksp: ZSTD_cwksp_mark_tables_dirty"); #if ZSTD_MEMORY_SANITIZER && !defined (ZSTD_MSAN_DONT_POISON_WORKSPACE) - /* To validate that the table re-use logic is sound, and that we don't + /* To validate that the table reuse logic is sound, and that we don't * access table space that we haven't cleaned, we re-"poison" the table * space every time we mark it dirty. 
* Since tableValidEnd space and initOnce space may overlap we don't poison @@ -602,9 +603,9 @@ MEM_STATIC void ZSTD_cwksp_clear(ZSTD_cwksp* ws) { DEBUGLOG(4, "cwksp: clearing!"); #if ZSTD_MEMORY_SANITIZER && !defined (ZSTD_MSAN_DONT_POISON_WORKSPACE) - /* To validate that the context re-use logic is sound, and that we don't + /* To validate that the context reuse logic is sound, and that we don't * access stuff that this compression hasn't initialized, we re-"poison" - * the workspace except for the areas in which we expect memory re-use + * the workspace except for the areas in which we expect memory reuse * without initialization (objects, valid tables area and init once * memory). */ { @@ -635,6 +636,15 @@ MEM_STATIC void ZSTD_cwksp_clear(ZSTD_cwksp* ws) { ZSTD_cwksp_assert_internal_consistency(ws); } +MEM_STATIC size_t ZSTD_cwksp_sizeof(const ZSTD_cwksp* ws) { + return (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->workspace); +} + +MEM_STATIC size_t ZSTD_cwksp_used(const ZSTD_cwksp* ws) { + return (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->workspace) + + (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->allocStart); +} + /** * The provided workspace takes ownership of the buffer [start, start+size). * Any existing values in the workspace are ignored (the previously managed @@ -666,6 +676,11 @@ MEM_STATIC size_t ZSTD_cwksp_create(ZSTD_cwksp* ws, size_t size, ZSTD_customMem MEM_STATIC void ZSTD_cwksp_free(ZSTD_cwksp* ws, ZSTD_customMem customMem) { void *ptr = ws->workspace; DEBUGLOG(4, "cwksp: freeing workspace"); +#if ZSTD_MEMORY_SANITIZER && !defined(ZSTD_MSAN_DONT_POISON_WORKSPACE) + if (ptr != NULL && customMem.customFree != NULL) { + __msan_unpoison(ptr, ZSTD_cwksp_sizeof(ws)); + } +#endif ZSTD_memset(ws, 0, sizeof(ZSTD_cwksp)); ZSTD_customFree(ptr, customMem); } @@ -679,15 +694,6 @@ MEM_STATIC void ZSTD_cwksp_move(ZSTD_cwksp* dst, ZSTD_cwksp* src) { ZSTD_memset(src, 0, sizeof(ZSTD_cwksp)); } -MEM_STATIC size_t ZSTD_cwksp_sizeof(const ZSTD_cwksp* ws) { - return (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->workspace); -} - -MEM_STATIC size_t ZSTD_cwksp_used(const ZSTD_cwksp* ws) { - return (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->workspace) - + (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->allocStart); -} - MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) { return ws->allocFailed; } diff --git a/thirdparty/zstd/compress/zstd_double_fast.c b/thirdparty/zstd/compress/zstd_double_fast.c index 0ad88ffc7b..a4e9c50d3b 100644 --- a/thirdparty/zstd/compress/zstd_double_fast.c +++ b/thirdparty/zstd/compress/zstd_double_fast.c @@ -11,7 +11,11 @@ #include "zstd_compress_internal.h" #include "zstd_double_fast.h" -static void ZSTD_fillDoubleHashTableForCDict(ZSTD_matchState_t* ms, +#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR + +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +void ZSTD_fillDoubleHashTableForCDict(ZSTD_matchState_t* ms, void const* end, ZSTD_dictTableLoadMethod_e dtlm) { const ZSTD_compressionParameters* const cParams = &ms->cParams; @@ -47,7 +51,9 @@ static void ZSTD_fillDoubleHashTableForCDict(ZSTD_matchState_t* ms, } } } -static void ZSTD_fillDoubleHashTableForCCtx(ZSTD_matchState_t* ms, +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +void ZSTD_fillDoubleHashTableForCCtx(ZSTD_matchState_t* ms, void const* end, ZSTD_dictTableLoadMethod_e dtlm) { const ZSTD_compressionParameters* const cParams = &ms->cParams; @@ -95,6 +101,7 @@ void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR size_t 
ZSTD_compressBlock_doubleFast_noDict_generic( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize, U32 const mls /* template */) @@ -305,6 +312,7 @@ _match_stored: FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize, @@ -348,8 +356,8 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( if (ms->prefetchCDictTables) { size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32); size_t const chainTableBytes = (((size_t)1) << dictCParams->chainLog) * sizeof(U32); - PREFETCH_AREA(dictHashLong, hashTableBytes) - PREFETCH_AREA(dictHashSmall, chainTableBytes) + PREFETCH_AREA(dictHashLong, hashTableBytes); + PREFETCH_AREA(dictHashSmall, chainTableBytes); } /* init */ @@ -589,7 +597,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState( } -static size_t ZSTD_compressBlock_doubleFast_extDict_generic( +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t ZSTD_compressBlock_doubleFast_extDict_generic( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize, U32 const mls /* template */) @@ -756,3 +766,5 @@ size_t ZSTD_compressBlock_doubleFast_extDict( return ZSTD_compressBlock_doubleFast_extDict_7(ms, seqStore, rep, src, srcSize); } } + +#endif /* ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR */ diff --git a/thirdparty/zstd/compress/zstd_double_fast.h b/thirdparty/zstd/compress/zstd_double_fast.h index 6f0047c4ba..ce6ed8c97f 100644 --- a/thirdparty/zstd/compress/zstd_double_fast.h +++ b/thirdparty/zstd/compress/zstd_double_fast.h @@ -18,9 +18,12 @@ extern "C" { #include "../common/mem.h" /* U32 */ #include "zstd_compress_internal.h" /* ZSTD_CCtx, size_t */ +#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR + void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, void const* end, ZSTD_dictTableLoadMethod_e dtlm, ZSTD_tableFillPurpose_e tfp); + size_t ZSTD_compressBlock_doubleFast( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); @@ -31,6 +34,14 @@ size_t ZSTD_compressBlock_doubleFast_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); +#define ZSTD_COMPRESSBLOCK_DOUBLEFAST ZSTD_compressBlock_doubleFast +#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE ZSTD_compressBlock_doubleFast_dictMatchState +#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT ZSTD_compressBlock_doubleFast_extDict +#else +#define ZSTD_COMPRESSBLOCK_DOUBLEFAST NULL +#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE NULL +#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT NULL +#endif /* ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR */ #if defined (__cplusplus) } diff --git a/thirdparty/zstd/compress/zstd_fast.c b/thirdparty/zstd/compress/zstd_fast.c index 5f2c6a2eda..6c4554cfca 100644 --- a/thirdparty/zstd/compress/zstd_fast.c +++ b/thirdparty/zstd/compress/zstd_fast.c @@ -11,7 +11,9 @@ #include "zstd_compress_internal.h" /* ZSTD_hashPtr, ZSTD_count, ZSTD_storeSeq */ #include "zstd_fast.h" -static void ZSTD_fillHashTableForCDict(ZSTD_matchState_t* ms, +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +void ZSTD_fillHashTableForCDict(ZSTD_matchState_t* ms, const void* const end, ZSTD_dictTableLoadMethod_e dtlm) { @@ -46,7 +48,9 @@ static void ZSTD_fillHashTableForCDict(ZSTD_matchState_t* ms, } } } } } -static void ZSTD_fillHashTableForCCtx(ZSTD_matchState_t* ms, +static 
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +void ZSTD_fillHashTableForCCtx(ZSTD_matchState_t* ms, const void* const end, ZSTD_dictTableLoadMethod_e dtlm) { @@ -139,8 +143,9 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, * * This is also the work we do at the beginning to enter the loop initially. */ -FORCE_INLINE_TEMPLATE size_t -ZSTD_compressBlock_fast_noDict_generic( +FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t ZSTD_compressBlock_fast_noDict_generic( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize, U32 const mls, U32 const hasStep) @@ -456,6 +461,7 @@ size_t ZSTD_compressBlock_fast( } FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR size_t ZSTD_compressBlock_fast_dictMatchState_generic( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize, U32 const mls, U32 const hasStep) @@ -502,7 +508,7 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( if (ms->prefetchCDictTables) { size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32); - PREFETCH_AREA(dictHashTable, hashTableBytes) + PREFETCH_AREA(dictHashTable, hashTableBytes); } /* init */ @@ -681,7 +687,9 @@ size_t ZSTD_compressBlock_fast_dictMatchState( } -static size_t ZSTD_compressBlock_fast_extDict_generic( +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t ZSTD_compressBlock_fast_extDict_generic( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize, U32 const mls, U32 const hasStep) { diff --git a/thirdparty/zstd/compress/zstd_lazy.c b/thirdparty/zstd/compress/zstd_lazy.c index 5ba88e8678..67dd55fdb8 100644 --- a/thirdparty/zstd/compress/zstd_lazy.c +++ b/thirdparty/zstd/compress/zstd_lazy.c @@ -12,6 +12,11 @@ #include "zstd_lazy.h" #include "../common/bits.h" /* ZSTD_countTrailingZeros64 */ +#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) + #define kLazySkippingStep 8 @@ -19,8 +24,9 @@ * Binary Tree search ***************************************/ -static void -ZSTD_updateDUBT(ZSTD_matchState_t* ms, +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +void ZSTD_updateDUBT(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend, U32 mls) { @@ -63,8 +69,9 @@ ZSTD_updateDUBT(ZSTD_matchState_t* ms, * sort one already inserted but unsorted position * assumption : curr >= btlow == (curr - btmask) * doesn't fail */ -static void -ZSTD_insertDUBT1(const ZSTD_matchState_t* ms, +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +void ZSTD_insertDUBT1(const ZSTD_matchState_t* ms, U32 curr, const BYTE* inputEnd, U32 nbCompares, U32 btLow, const ZSTD_dictMode_e dictMode) @@ -152,8 +159,9 @@ ZSTD_insertDUBT1(const ZSTD_matchState_t* ms, } -static size_t -ZSTD_DUBT_findBetterDictMatch ( +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t ZSTD_DUBT_findBetterDictMatch ( const ZSTD_matchState_t* ms, const BYTE* const ip, const BYTE* const iend, size_t* offsetPtr, @@ -230,8 +238,9 @@ ZSTD_DUBT_findBetterDictMatch ( } -static size_t -ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, const BYTE* const ip, const BYTE* const iend, size_t* offBasePtr, U32 const mls, @@ -381,8 +390,9 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, /** ZSTD_BtFindBestMatch() : Tree updater, providing best match */ -FORCE_INLINE_TEMPLATE size_t 
-ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms, +FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms, const BYTE* const ip, const BYTE* const iLimit, size_t* offBasePtr, const U32 mls /* template */, @@ -617,7 +627,9 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb /* Update chains up to ip (excluded) Assumption : always within prefix (i.e. not within extDict) */ -FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( +FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +U32 ZSTD_insertAndFindFirstIndex_internal( ZSTD_matchState_t* ms, const ZSTD_compressionParameters* const cParams, const BYTE* ip, U32 const mls, U32 const lazySkipping) @@ -651,6 +663,7 @@ U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) { /* inlining is important to hardwire a hot branch (template emulation) */ FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR size_t ZSTD_HcFindBestMatch( ZSTD_matchState_t* ms, const BYTE* const ip, const BYTE* const iLimit, @@ -819,7 +832,9 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, BYTE const* t * Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries, * but not beyond iLimit. */ -FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base, +FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base, U32 const rowLog, U32 const mls, U32 idx, const BYTE* const iLimit) { @@ -845,7 +860,9 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const B * Returns the hash of base + idx, and replaces the hash in the hash cache with the byte at * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable. */ -FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable, +FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable, BYTE const* tagTable, BYTE const* base, U32 idx, U32 const hashLog, U32 const rowLog, U32 const mls, @@ -863,10 +880,12 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTab /* ZSTD_row_update_internalImpl(): * Updates the hash table with positions starting from updateStartIdx until updateEndIdx. */ -FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms, - U32 updateStartIdx, U32 const updateEndIdx, - U32 const mls, U32 const rowLog, - U32 const rowMask, U32 const useCache) +FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms, + U32 updateStartIdx, U32 const updateEndIdx, + U32 const mls, U32 const rowLog, + U32 const rowMask, U32 const useCache) { U32* const hashTable = ms->hashTable; BYTE* const tagTable = ms->tagTable; @@ -892,9 +911,11 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms, * Inserts the byte at ip into the appropriate position in the hash table, and updates ms->nextToUpdate. * Skips sections of long matches as is necessary. 
*/ -FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip, - U32 const mls, U32 const rowLog, - U32 const rowMask, U32 const useCache) +FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip, + U32 const mls, U32 const rowLog, + U32 const rowMask, U32 const useCache) { U32 idx = ms->nextToUpdate; const BYTE* const base = ms->window.base; @@ -1102,20 +1123,21 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 headGr /* The high-level approach of the SIMD row based match finder is as follows: * - Figure out where to insert the new entry: - * - Generate a hash from a byte along with an additional 1-byte "short hash". The additional byte is our "tag" - * - The hashTable is effectively split into groups or "rows" of 16 or 32 entries of U32, and the hash determines + * - Generate a hash for the current input position and split it into one byte of tag and `rowHashLog` bits of index. + * - The hash is salted by a value that changes on every context reset, so when the same table is used + * we will avoid collisions that would otherwise slow us down by introducing phantom matches. + * - The hashTable is effectively split into groups or "rows" of 15 or 31 entries of U32, and the index determines which row to insert into. - * - Determine the correct position within the row to insert the entry into. Each row of 16 or 32 can - * be considered as a circular buffer with a "head" index that resides in the tagTable. - * - Also insert the "tag" into the equivalent row and position in the tagTable. - * - Note: The tagTable has 17 or 33 1-byte entries per row, due to 16 or 32 tags, and 1 "head" entry. - * The 17 or 33 entry rows are spaced out to occur every 32 or 64 bytes, respectively, - * for alignment/performance reasons, leaving some bytes unused. - * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte "short hash" and + * - Determine the correct position within the row to insert the entry into. Each row of 15 or 31 can + * be considered as a circular buffer with a "head" index that resides in the tagTable (overall 16 or 32 bytes + * per row). + * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte tag calculated for the position and generate a bitfield that we can cycle through to check the collisions in the hash table. + * - Pick the longest match. + * - Insert the tag into the equivalent row and position in the tagTable.
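As a rough illustration of the tag/row split described above (this sketch is not upstream code: the hash value, salt, and rowLog are invented, and the real implementation derives the row offset from hashLog bits rather than a simple mask), a salted hash is cut into a low 1-byte tag, which is compared with SIMD against the row's stored tags, while the remaining bits pick the row to search.

    #include <stdint.h>
    #include <stdio.h>

    #define ROW_HASH_TAG_BITS 8   /* mirrors ZSTD_ROW_HASH_TAG_BITS in zstd_lazy.h */

    int main(void)
    {
        uint32_t const rowLog = 5;            /* 32-entry rows, picked for the example */
        uint32_t const hash   = 0x0004C3A7u;  /* assumed output of the salted hash for one position */

        uint32_t const tag = hash & ((1u << ROW_HASH_TAG_BITS) - 1);             /* low 8 bits => 0xA7 */
        uint32_t const row = (hash >> ROW_HASH_TAG_BITS) & ((1u << rowLog) - 1); /* next bits  => row 3 (simplified) */

        /* the tag is what gets compared with SIMD against the row's stored tags;
         * the row index selects which group of hash-table entries to search */
        printf("tag=0x%02X row=%u\n", (unsigned)tag, (unsigned)row);
        return 0;
    }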
*/ FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR size_t ZSTD_RowFindBestMatch( ZSTD_matchState_t* ms, const BYTE* const ip, const BYTE* const iLimit, @@ -1489,8 +1511,9 @@ FORCE_INLINE_TEMPLATE size_t ZSTD_searchMax( * Common parser - lazy strategy *********************************/ -FORCE_INLINE_TEMPLATE size_t -ZSTD_compressBlock_lazy_generic( +FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t ZSTD_compressBlock_lazy_generic( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], const void* src, size_t srcSize, @@ -1754,152 +1777,163 @@ _storeSequence: /* Return the last literals size */ return (size_t)(iend - anchor); } +#endif /* build exclusions */ -size_t ZSTD_compressBlock_btlazy2( +#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR +size_t ZSTD_compressBlock_greedy( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict); + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict); } -size_t ZSTD_compressBlock_lazy2( +size_t ZSTD_compressBlock_greedy_dictMatchState( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict); + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState); } -size_t ZSTD_compressBlock_lazy( +size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict); + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch); } -size_t ZSTD_compressBlock_greedy( +size_t ZSTD_compressBlock_greedy_row( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict); + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict); } -size_t ZSTD_compressBlock_btlazy2_dictMatchState( +size_t ZSTD_compressBlock_greedy_dictMatchState_row( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState); + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState); } -size_t ZSTD_compressBlock_lazy2_dictMatchState( +size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState); + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch); } +#endif -size_t ZSTD_compressBlock_lazy_dictMatchState( +#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR +size_t ZSTD_compressBlock_lazy( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_generic(ms, 
seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState); + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict); } -size_t ZSTD_compressBlock_greedy_dictMatchState( +size_t ZSTD_compressBlock_lazy_dictMatchState( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState); + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState); } - -size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( +size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch); + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch); } -size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( +size_t ZSTD_compressBlock_lazy_row( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch); + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict); } -size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( +size_t ZSTD_compressBlock_lazy_dictMatchState_row( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch); + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState); } -/* Row-based matchfinder */ -size_t ZSTD_compressBlock_lazy2_row( +size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict); + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch); } +#endif -size_t ZSTD_compressBlock_lazy_row( +#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR +size_t ZSTD_compressBlock_lazy2( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict); + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict); } -size_t ZSTD_compressBlock_greedy_row( +size_t ZSTD_compressBlock_lazy2_dictMatchState( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict); + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState); } -size_t ZSTD_compressBlock_lazy2_dictMatchState_row( +size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, 
search_rowHash, 2, ZSTD_dictMatchState); + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch); } -size_t ZSTD_compressBlock_lazy_dictMatchState_row( +size_t ZSTD_compressBlock_lazy2_row( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState); + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict); } -size_t ZSTD_compressBlock_greedy_dictMatchState_row( +size_t ZSTD_compressBlock_lazy2_dictMatchState_row( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState); + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState); } - size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dedicatedDictSearch); } +#endif -size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( +#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR +size_t ZSTD_compressBlock_btlazy2( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch); + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict); } -size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( +size_t ZSTD_compressBlock_btlazy2_dictMatchState( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch); + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState); } +#endif +#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR size_t ZSTD_compressBlock_lazy_extDict_generic( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], @@ -2101,8 +2135,9 @@ _storeSequence: /* Return the last literals size */ return (size_t)(iend - anchor); } +#endif /* build exclusions */ - +#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR size_t ZSTD_compressBlock_greedy_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) @@ -2110,48 +2145,55 @@ size_t ZSTD_compressBlock_greedy_extDict( return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0); } -size_t ZSTD_compressBlock_lazy_extDict( +size_t ZSTD_compressBlock_greedy_extDict_row( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) - { - return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1); + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0); } +#endif -size_t 
ZSTD_compressBlock_lazy2_extDict( +#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR +size_t ZSTD_compressBlock_lazy_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2); + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1); } -size_t ZSTD_compressBlock_btlazy2_extDict( +size_t ZSTD_compressBlock_lazy_extDict_row( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2); + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1); } +#endif -size_t ZSTD_compressBlock_greedy_extDict_row( +#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR +size_t ZSTD_compressBlock_lazy2_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) + { - return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0); + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2); } -size_t ZSTD_compressBlock_lazy_extDict_row( +size_t ZSTD_compressBlock_lazy2_extDict_row( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) - { - return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1); + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2); } +#endif -size_t ZSTD_compressBlock_lazy2_extDict_row( +#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR +size_t ZSTD_compressBlock_btlazy2_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) + { - return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2); + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2); } +#endif diff --git a/thirdparty/zstd/compress/zstd_lazy.h b/thirdparty/zstd/compress/zstd_lazy.h index 3bde67331e..3635813bdd 100644 --- a/thirdparty/zstd/compress/zstd_lazy.h +++ b/thirdparty/zstd/compress/zstd_lazy.h @@ -27,98 +27,173 @@ extern "C" { #define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */ +#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip); void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip); void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip); void ZSTD_preserveUnsortedMark (U32* const table, U32 const size, U32 const reducerValue); /*! used in ZSTD_reduceIndex(). 
preemptively increase value of ZSTD_DUBT_UNSORTED_MARK */ +#endif -size_t ZSTD_compressBlock_btlazy2( +#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR +size_t ZSTD_compressBlock_greedy( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -size_t ZSTD_compressBlock_lazy2( +size_t ZSTD_compressBlock_greedy_row( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -size_t ZSTD_compressBlock_lazy( +size_t ZSTD_compressBlock_greedy_dictMatchState( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -size_t ZSTD_compressBlock_greedy( +size_t ZSTD_compressBlock_greedy_dictMatchState_row( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -size_t ZSTD_compressBlock_lazy2_row( +size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -size_t ZSTD_compressBlock_lazy_row( +size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -size_t ZSTD_compressBlock_greedy_row( +size_t ZSTD_compressBlock_greedy_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); - -size_t ZSTD_compressBlock_btlazy2_dictMatchState( +size_t ZSTD_compressBlock_greedy_extDict_row( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -size_t ZSTD_compressBlock_lazy2_dictMatchState( + +#define ZSTD_COMPRESSBLOCK_GREEDY ZSTD_compressBlock_greedy +#define ZSTD_COMPRESSBLOCK_GREEDY_ROW ZSTD_compressBlock_greedy_row +#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE ZSTD_compressBlock_greedy_dictMatchState +#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW ZSTD_compressBlock_greedy_dictMatchState_row +#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH ZSTD_compressBlock_greedy_dedicatedDictSearch +#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_greedy_dedicatedDictSearch_row +#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT ZSTD_compressBlock_greedy_extDict +#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW ZSTD_compressBlock_greedy_extDict_row +#else +#define ZSTD_COMPRESSBLOCK_GREEDY NULL +#define ZSTD_COMPRESSBLOCK_GREEDY_ROW NULL +#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE NULL +#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW NULL +#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH NULL +#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW NULL +#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT NULL +#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW NULL +#endif + +#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR +size_t ZSTD_compressBlock_lazy( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -size_t ZSTD_compressBlock_lazy_dictMatchState( +size_t ZSTD_compressBlock_lazy_row( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -size_t ZSTD_compressBlock_greedy_dictMatchState( +size_t ZSTD_compressBlock_lazy_dictMatchState( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -size_t ZSTD_compressBlock_lazy2_dictMatchState_row( +size_t ZSTD_compressBlock_lazy_dictMatchState_row( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, 
size_t srcSize); -size_t ZSTD_compressBlock_lazy_dictMatchState_row( +size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -size_t ZSTD_compressBlock_greedy_dictMatchState_row( +size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); - -size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( +size_t ZSTD_compressBlock_lazy_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( +size_t ZSTD_compressBlock_lazy_extDict_row( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( + +#define ZSTD_COMPRESSBLOCK_LAZY ZSTD_compressBlock_lazy +#define ZSTD_COMPRESSBLOCK_LAZY_ROW ZSTD_compressBlock_lazy_row +#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE ZSTD_compressBlock_lazy_dictMatchState +#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW ZSTD_compressBlock_lazy_dictMatchState_row +#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH ZSTD_compressBlock_lazy_dedicatedDictSearch +#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_lazy_dedicatedDictSearch_row +#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT ZSTD_compressBlock_lazy_extDict +#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW ZSTD_compressBlock_lazy_extDict_row +#else +#define ZSTD_COMPRESSBLOCK_LAZY NULL +#define ZSTD_COMPRESSBLOCK_LAZY_ROW NULL +#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE NULL +#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW NULL +#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH NULL +#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW NULL +#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT NULL +#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW NULL +#endif + +#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR +size_t ZSTD_compressBlock_lazy2( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( +size_t ZSTD_compressBlock_lazy2_row( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( +size_t ZSTD_compressBlock_lazy2_dictMatchState( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( +size_t ZSTD_compressBlock_lazy2_dictMatchState_row( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); - -size_t ZSTD_compressBlock_greedy_extDict( +size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -size_t ZSTD_compressBlock_lazy_extDict( +size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); size_t ZSTD_compressBlock_lazy2_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -size_t ZSTD_compressBlock_greedy_extDict_row( +size_t ZSTD_compressBlock_lazy2_extDict_row( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -size_t 
ZSTD_compressBlock_lazy_extDict_row( + +#define ZSTD_COMPRESSBLOCK_LAZY2 ZSTD_compressBlock_lazy2 +#define ZSTD_COMPRESSBLOCK_LAZY2_ROW ZSTD_compressBlock_lazy2_row +#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE ZSTD_compressBlock_lazy2_dictMatchState +#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW ZSTD_compressBlock_lazy2_dictMatchState_row +#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH ZSTD_compressBlock_lazy2_dedicatedDictSearch +#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_lazy2_dedicatedDictSearch_row +#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT ZSTD_compressBlock_lazy2_extDict +#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW ZSTD_compressBlock_lazy2_extDict_row +#else +#define ZSTD_COMPRESSBLOCK_LAZY2 NULL +#define ZSTD_COMPRESSBLOCK_LAZY2_ROW NULL +#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE NULL +#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW NULL +#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH NULL +#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW NULL +#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT NULL +#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW NULL +#endif + +#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR +size_t ZSTD_compressBlock_btlazy2( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -size_t ZSTD_compressBlock_lazy2_extDict_row( +size_t ZSTD_compressBlock_btlazy2_dictMatchState( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); size_t ZSTD_compressBlock_btlazy2_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); +#define ZSTD_COMPRESSBLOCK_BTLAZY2 ZSTD_compressBlock_btlazy2 +#define ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE ZSTD_compressBlock_btlazy2_dictMatchState +#define ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT ZSTD_compressBlock_btlazy2_extDict +#else +#define ZSTD_COMPRESSBLOCK_BTLAZY2 NULL +#define ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE NULL +#define ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT NULL +#endif + #if defined (__cplusplus) } diff --git a/thirdparty/zstd/compress/zstd_ldm.c b/thirdparty/zstd/compress/zstd_ldm.c index 3d74ff19e3..17c069fe1d 100644 --- a/thirdparty/zstd/compress/zstd_ldm.c +++ b/thirdparty/zstd/compress/zstd_ldm.c @@ -246,7 +246,11 @@ static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms, break; case ZSTD_dfast: +#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx); +#else + assert(0); /* shouldn't be called: cparams should've been adjusted. 
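The ZSTD_COMPRESSBLOCK_* macros above expand either to the real block compressor or to NULL when the corresponding ZSTD_EXCLUDE_*_BLOCK_COMPRESSOR flag is defined, so whichever table consumes them can still be built, with excluded strategies simply appearing as NULL entries. A minimal sketch of that pattern follows (simplified signature and names; not the actual zstd selector):

    #include <stddef.h>
    #include <stdio.h>

    /* simplified stand-in for the block-compressor function pointer type */
    typedef size_t (*blockCompressor_f)(const void* src, size_t srcSize);

    /* pretend both strategies were compiled out, so the macros resolve to NULL,
     * just like ZSTD_COMPRESSBLOCK_GREEDY / ZSTD_COMPRESSBLOCK_LAZY above */
    #define COMPRESSBLOCK_GREEDY NULL
    #define COMPRESSBLOCK_LAZY   NULL

    static const blockCompressor_f table[] = { COMPRESSBLOCK_GREEDY, COMPRESSBLOCK_LAZY };

    int main(void)
    {
        int strategy;
        for (strategy = 0; strategy < 2; strategy++) {
            if (table[strategy] == NULL)   /* an excluded strategy shows up as a NULL entry */
                printf("strategy %d is compiled out\n", strategy);
        }
        return 0;
    }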
*/ +#endif break; case ZSTD_greedy: @@ -318,7 +322,9 @@ static void ZSTD_ldm_limitTableUpdate(ZSTD_matchState_t* ms, const BYTE* anchor) } } -static size_t ZSTD_ldm_generateSequences_internal( +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t ZSTD_ldm_generateSequences_internal( ldmState_t* ldmState, rawSeqStore_t* rawSeqStore, ldmParams_t const* params, void const* src, size_t srcSize) { @@ -689,7 +695,6 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, /* maybeSplitSequence updates rawSeqStore->pos */ rawSeq const sequence = maybeSplitSequence(rawSeqStore, (U32)(iend - ip), minMatch); - int i; /* End signal */ if (sequence.offset == 0) break; @@ -702,6 +707,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, /* Run the block compressor */ DEBUGLOG(5, "pos %u : calling block compressor on segment of size %u", (unsigned)(ip-istart), sequence.litLength); { + int i; size_t const newLitLength = blockCompressor(ms, seqStore, rep, ip, sequence.litLength); ip += sequence.litLength; diff --git a/thirdparty/zstd/compress/zstd_opt.c b/thirdparty/zstd/compress/zstd_opt.c index f02a760946..e63073e5a4 100644 --- a/thirdparty/zstd/compress/zstd_opt.c +++ b/thirdparty/zstd/compress/zstd_opt.c @@ -12,6 +12,9 @@ #include "hist.h" #include "zstd_opt.h" +#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR) #define ZSTD_LITFREQ_ADD 2 /* scaling factor for litFreq, so that frequencies adapt faster to new stats */ #define ZSTD_MAX_PRICE (1<<30) @@ -264,6 +267,7 @@ static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength, const optState_t* const optPtr, int optLevel) { + DEBUGLOG(8, "ZSTD_rawLiteralsCost (%u literals)", litLength); if (litLength == 0) return 0; if (!ZSTD_compressedLiterals(optPtr)) @@ -402,9 +406,11 @@ MEM_STATIC U32 ZSTD_readMINMATCH(const void* memPtr, U32 length) /* Update hashTable3 up to ip (excluded) Assumption : always within prefix (i.e. not within extDict) */ -static U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms, - U32* nextToUpdate3, - const BYTE* const ip) +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms, + U32* nextToUpdate3, + const BYTE* const ip) { U32* const hashTable3 = ms->hashTable3; U32 const hashLog3 = ms->hashLog3; @@ -431,7 +437,9 @@ static U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms, * @param ip assumed <= iend-8 . 
* @param target The target of ZSTD_updateTree_internal() - we are filling to this position * @return : nb of positions added */ -static U32 ZSTD_insertBt1( +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +U32 ZSTD_insertBt1( const ZSTD_matchState_t* ms, const BYTE* const ip, const BYTE* const iend, U32 const target, @@ -550,6 +558,7 @@ static U32 ZSTD_insertBt1( } FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR void ZSTD_updateTree_internal( ZSTD_matchState_t* ms, const BYTE* const ip, const BYTE* const iend, @@ -558,7 +567,7 @@ void ZSTD_updateTree_internal( const BYTE* const base = ms->window.base; U32 const target = (U32)(ip - base); U32 idx = ms->nextToUpdate; - DEBUGLOG(6, "ZSTD_updateTree_internal, from %u to %u (dictMode:%u)", + DEBUGLOG(7, "ZSTD_updateTree_internal, from %u to %u (dictMode:%u)", idx, target, dictMode); while(idx < target) { @@ -575,7 +584,9 @@ void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend) { ZSTD_updateTree_internal(ms, ip, iend, ms->cParams.minMatch, ZSTD_noDict); } -FORCE_INLINE_TEMPLATE U32 +FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +U32 ZSTD_insertBtAndGetAllMatches ( ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */ ZSTD_matchState_t* ms, @@ -816,7 +827,9 @@ typedef U32 (*ZSTD_getAllMatchesFn)( U32 const ll0, U32 const lengthToBeat); -FORCE_INLINE_TEMPLATE U32 ZSTD_btGetAllMatches_internal( +FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +U32 ZSTD_btGetAllMatches_internal( ZSTD_match_t* matches, ZSTD_matchState_t* ms, U32* nextToUpdate3, @@ -1035,11 +1048,6 @@ ZSTD_optLdm_processMatchCandidate(ZSTD_optLdm_t* optLdm, * Optimal parser *********************************/ -static U32 ZSTD_totalLen(ZSTD_optimal_t sol) -{ - return sol.litlen + sol.mlen; -} - #if 0 /* debug */ static void @@ -1057,7 +1065,13 @@ listStats(const U32* table, int lastEltID) #endif -FORCE_INLINE_TEMPLATE size_t +#define LIT_PRICE(_p) (int)ZSTD_rawLiteralsCost(_p, 1, optStatePtr, optLevel) +#define LL_PRICE(_l) (int)ZSTD_litLengthPrice(_l, optStatePtr, optLevel) +#define LL_INCPRICE(_l) (LL_PRICE(_l) - LL_PRICE(_l-1)) + +FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], @@ -1083,10 +1097,10 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, ZSTD_optimal_t* const opt = optStatePtr->priceTable; ZSTD_match_t* const matches = optStatePtr->matchTable; - ZSTD_optimal_t lastSequence; + ZSTD_optimal_t lastStretch; ZSTD_optLdm_t optLdm; - ZSTD_memset(&lastSequence, 0, sizeof(ZSTD_optimal_t)); + ZSTD_memset(&lastStretch, 0, sizeof(ZSTD_optimal_t)); optLdm.seqStore = ms->ldmSeqStore ? *ms->ldmSeqStore : kNullRawSeqStore; optLdm.endPosInBlock = optLdm.startPosInBlock = optLdm.offset = 0; @@ -1108,19 +1122,31 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, U32 const ll0 = !litlen; U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, ip, iend, rep, ll0, minMatch); ZSTD_optLdm_processMatchCandidate(&optLdm, matches, &nbMatches, - (U32)(ip-istart), (U32)(iend - ip)); - if (!nbMatches) { ip++; continue; } + (U32)(ip-istart), (U32)(iend-ip)); + if (!nbMatches) { + DEBUGLOG(8, "no match found at cPos %u", (unsigned)(ip-istart)); + ip++; + continue; + } + + /* Match found: let's store this solution, and eventually find more candidates. + * During this forward pass, @opt is used to store stretches, + * defined as "a match followed by N literals". 
+ * Note how this is different from a Sequence, which is "N literals followed by a match". + * Storing stretches allows us to store different match predecessors + * for each literal position part of a literals run. */ /* initialize opt[0] */ - { U32 i ; for (i=0; i<ZSTD_REP_NUM; i++) opt[0].rep[i] = rep[i]; } - opt[0].mlen = 0; /* means is_a_literal */ + opt[0].mlen = 0; /* there are only literals so far */ opt[0].litlen = litlen; - /* We don't need to include the actual price of the literals because - * it is static for the duration of the forward pass, and is included - * in every price. We include the literal length to avoid negative - * prices when we subtract the previous literal length. + /* No need to include the actual price of the literals before the first match + * because it is static for the duration of the forward pass, and is included + * in every subsequent price. But, we include the literal length because + * the cost variation of litlen depends on the value of litlen. */ - opt[0].price = (int)ZSTD_litLengthPrice(litlen, optStatePtr, optLevel); + opt[0].price = LL_PRICE(litlen); + ZSTD_STATIC_ASSERT(sizeof(opt[0].rep[0]) == sizeof(rep[0])); + ZSTD_memcpy(&opt[0].rep, rep, sizeof(opt[0].rep)); /* large match -> immediate encoding */ { U32 const maxML = matches[nbMatches-1].len; @@ -1129,82 +1155,106 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, nbMatches, maxML, maxOffBase, (U32)(ip-prefixStart)); if (maxML > sufficient_len) { - lastSequence.litlen = litlen; - lastSequence.mlen = maxML; - lastSequence.off = maxOffBase; - DEBUGLOG(6, "large match (%u>%u), immediate encoding", + lastStretch.litlen = 0; + lastStretch.mlen = maxML; + lastStretch.off = maxOffBase; + DEBUGLOG(6, "large match (%u>%u) => immediate encoding", maxML, sufficient_len); cur = 0; - last_pos = ZSTD_totalLen(lastSequence); + last_pos = maxML; goto _shortestPath; } } /* set prices for first matches starting position == 0 */ assert(opt[0].price >= 0); - { U32 const literalsPrice = (U32)opt[0].price + ZSTD_litLengthPrice(0, optStatePtr, optLevel); - U32 pos; + { U32 pos; U32 matchNb; for (pos = 1; pos < minMatch; pos++) { - opt[pos].price = ZSTD_MAX_PRICE; /* mlen, litlen and price will be fixed during forward scanning */ + opt[pos].price = ZSTD_MAX_PRICE; + opt[pos].mlen = 0; + opt[pos].litlen = litlen + pos; } for (matchNb = 0; matchNb < nbMatches; matchNb++) { U32 const offBase = matches[matchNb].off; U32 const end = matches[matchNb].len; for ( ; pos <= end ; pos++ ) { - U32 const matchPrice = ZSTD_getMatchPrice(offBase, pos, optStatePtr, optLevel); - U32 const sequencePrice = literalsPrice + matchPrice; + int const matchPrice = (int)ZSTD_getMatchPrice(offBase, pos, optStatePtr, optLevel); + int const sequencePrice = opt[0].price + matchPrice; DEBUGLOG(7, "rPos:%u => set initial price : %.2f", - pos, ZSTD_fCost((int)sequencePrice)); + pos, ZSTD_fCost(sequencePrice)); opt[pos].mlen = pos; opt[pos].off = offBase; - opt[pos].litlen = litlen; - opt[pos].price = (int)sequencePrice; - } } + opt[pos].litlen = 0; /* end of match */ + opt[pos].price = sequencePrice + LL_PRICE(0); + } + } last_pos = pos-1; + opt[pos].price = ZSTD_MAX_PRICE; } } /* check further positions */ for (cur = 1; cur <= last_pos; cur++) { const BYTE* const inr = ip + cur; - assert(cur < ZSTD_OPT_NUM); - DEBUGLOG(7, "cPos:%zi==rPos:%u", inr-istart, cur) + assert(cur <= ZSTD_OPT_NUM); + DEBUGLOG(7, "cPos:%zi==rPos:%u", inr-istart, cur); /* Fix current position with one literal if cheaper */ - { U32 const litlen = (opt[cur-1].mlen == 
0) ? opt[cur-1].litlen + 1 : 1; + { U32 const litlen = opt[cur-1].litlen + 1; int const price = opt[cur-1].price - + (int)ZSTD_rawLiteralsCost(ip+cur-1, 1, optStatePtr, optLevel) - + (int)ZSTD_litLengthPrice(litlen, optStatePtr, optLevel) - - (int)ZSTD_litLengthPrice(litlen-1, optStatePtr, optLevel); + + LIT_PRICE(ip+cur-1) + + LL_INCPRICE(litlen); assert(price < 1000000000); /* overflow check */ if (price <= opt[cur].price) { + ZSTD_optimal_t const prevMatch = opt[cur]; DEBUGLOG(7, "cPos:%zi==rPos:%u : better price (%.2f<=%.2f) using literal (ll==%u) (hist:%u,%u,%u)", inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), litlen, opt[cur-1].rep[0], opt[cur-1].rep[1], opt[cur-1].rep[2]); - opt[cur].mlen = 0; - opt[cur].off = 0; + opt[cur] = opt[cur-1]; opt[cur].litlen = litlen; opt[cur].price = price; + if ( (optLevel >= 1) /* additional check only for higher modes */ + && (prevMatch.litlen == 0) /* replace a match */ + && (LL_INCPRICE(1) < 0) /* ll1 is cheaper than ll0 */ + && LIKELY(ip + cur < iend) + ) { + /* check next position, in case it would be cheaper */ + int with1literal = prevMatch.price + LIT_PRICE(ip+cur) + LL_INCPRICE(1); + int withMoreLiterals = price + LIT_PRICE(ip+cur) + LL_INCPRICE(litlen+1); + DEBUGLOG(7, "then at next rPos %u : match+1lit %.2f vs %ulits %.2f", + cur+1, ZSTD_fCost(with1literal), litlen+1, ZSTD_fCost(withMoreLiterals)); + if ( (with1literal < withMoreLiterals) + && (with1literal < opt[cur+1].price) ) { + /* update offset history - before it disappears */ + U32 const prev = cur - prevMatch.mlen; + repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, prevMatch.off, opt[prev].litlen==0); + assert(cur >= prevMatch.mlen); + DEBUGLOG(7, "==> match+1lit is cheaper (%.2f < %.2f) (hist:%u,%u,%u) !", + ZSTD_fCost(with1literal), ZSTD_fCost(withMoreLiterals), + newReps.rep[0], newReps.rep[1], newReps.rep[2] ); + opt[cur+1] = prevMatch; /* mlen & offbase */ + ZSTD_memcpy(opt[cur+1].rep, &newReps, sizeof(repcodes_t)); + opt[cur+1].litlen = 1; + opt[cur+1].price = with1literal; + if (last_pos < cur+1) last_pos = cur+1; + } + } } else { - DEBUGLOG(7, "cPos:%zi==rPos:%u : literal would cost more (%.2f>%.2f) (hist:%u,%u,%u)", - inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), - opt[cur].rep[0], opt[cur].rep[1], opt[cur].rep[2]); + DEBUGLOG(7, "cPos:%zi==rPos:%u : literal would cost more (%.2f>%.2f)", + inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price)); } } - /* Set the repcodes of the current position. We must do it here - * because we rely on the repcodes of the 2nd to last sequence being - * correct to set the next chunks repcodes during the backward - * traversal. + /* Offset history is not updated during match comparison. + * Do it here, now that the match is selected and confirmed. 
*/ ZSTD_STATIC_ASSERT(sizeof(opt[cur].rep) == sizeof(repcodes_t)); assert(cur >= opt[cur].mlen); - if (opt[cur].mlen != 0) { + if (opt[cur].litlen == 0) { + /* just finished a match => alter offset history */ U32 const prev = cur - opt[cur].mlen; - repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, opt[cur].off, opt[cur].litlen==0); + repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, opt[cur].off, opt[prev].litlen==0); ZSTD_memcpy(opt[cur].rep, &newReps, sizeof(repcodes_t)); - } else { - ZSTD_memcpy(opt[cur].rep, opt[cur - 1].rep, sizeof(repcodes_t)); } /* last match must start at a minimum distance of 8 from oend */ @@ -1214,15 +1264,14 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, if ( (optLevel==0) /*static_test*/ && (opt[cur+1].price <= opt[cur].price + (BITCOST_MULTIPLIER/2)) ) { - DEBUGLOG(7, "move to next rPos:%u : price is <=", cur+1); + DEBUGLOG(7, "skip current position : next rPos(%u) price is cheaper", cur+1); continue; /* skip unpromising positions; about ~+6% speed, -0.01 ratio */ } assert(opt[cur].price >= 0); - { U32 const ll0 = (opt[cur].mlen != 0); - U32 const litlen = (opt[cur].mlen == 0) ? opt[cur].litlen : 0; - U32 const previousPrice = (U32)opt[cur].price; - U32 const basePrice = previousPrice + ZSTD_litLengthPrice(0, optStatePtr, optLevel); + { U32 const ll0 = (opt[cur].litlen == 0); + int const previousPrice = opt[cur].price; + int const basePrice = previousPrice + LL_PRICE(0); U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, inr, iend, opt[cur].rep, ll0, minMatch); U32 matchNb; @@ -1234,18 +1283,17 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, continue; } - { U32 const maxML = matches[nbMatches-1].len; - DEBUGLOG(7, "cPos:%zi==rPos:%u, found %u matches, of maxLength=%u", - inr-istart, cur, nbMatches, maxML); - - if ( (maxML > sufficient_len) - || (cur + maxML >= ZSTD_OPT_NUM) ) { - lastSequence.mlen = maxML; - lastSequence.off = matches[nbMatches-1].off; - lastSequence.litlen = litlen; - cur -= (opt[cur].mlen==0) ? 
opt[cur].litlen : 0; /* last sequence is actually only literals, fix cur to last match - note : may underflow, in which case, it's first sequence, and it's okay */ - last_pos = cur + ZSTD_totalLen(lastSequence); - if (cur > ZSTD_OPT_NUM) cur = 0; /* underflow => first match */ + { U32 const longestML = matches[nbMatches-1].len; + DEBUGLOG(7, "cPos:%zi==rPos:%u, found %u matches, of longest ML=%u", + inr-istart, cur, nbMatches, longestML); + + if ( (longestML > sufficient_len) + || (cur + longestML >= ZSTD_OPT_NUM) + || (ip + cur + longestML >= iend) ) { + lastStretch.mlen = longestML; + lastStretch.off = matches[nbMatches-1].off; + lastStretch.litlen = 0; + last_pos = cur + longestML; goto _shortestPath; } } @@ -1257,19 +1305,24 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, U32 mlen; DEBUGLOG(7, "testing match %u => offBase=%4u, mlen=%2u, llen=%2u", - matchNb, matches[matchNb].off, lastML, litlen); + matchNb, matches[matchNb].off, lastML, opt[cur].litlen); for (mlen = lastML; mlen >= startML; mlen--) { /* scan downward */ U32 const pos = cur + mlen; - int const price = (int)basePrice + (int)ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel); + int const price = basePrice + (int)ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel); if ((pos > last_pos) || (price < opt[pos].price)) { DEBUGLOG(7, "rPos:%u (ml=%2u) => new better price (%.2f<%.2f)", pos, mlen, ZSTD_fCost(price), ZSTD_fCost(opt[pos].price)); - while (last_pos < pos) { opt[last_pos+1].price = ZSTD_MAX_PRICE; last_pos++; } /* fill empty positions */ + while (last_pos < pos) { + /* fill empty positions, for future comparisons */ + last_pos++; + opt[last_pos].price = ZSTD_MAX_PRICE; + opt[last_pos].litlen = !0; /* just needs to be != 0, to mean "not an end of match" */ + } opt[pos].mlen = mlen; opt[pos].off = offset; - opt[pos].litlen = litlen; + opt[pos].litlen = 0; opt[pos].price = price; } else { DEBUGLOG(7, "rPos:%u (ml=%2u) => new price is worse (%.2f>=%.2f)", @@ -1277,47 +1330,81 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, if (optLevel==0) break; /* early update abort; gets ~+10% speed for about -0.01 ratio loss */ } } } } + opt[last_pos+1].price = ZSTD_MAX_PRICE; } /* for (cur = 1; cur <= last_pos; cur++) */ - lastSequence = opt[last_pos]; - cur = last_pos > ZSTD_totalLen(lastSequence) ? last_pos - ZSTD_totalLen(lastSequence) : 0; /* single sequence, and it starts before `ip` */ - assert(cur < ZSTD_OPT_NUM); /* control overflow*/ + lastStretch = opt[last_pos]; + assert(cur >= lastStretch.mlen); + cur = last_pos - lastStretch.mlen; _shortestPath: /* cur, last_pos, best_mlen, best_off have to be set */ assert(opt[0].mlen == 0); + assert(last_pos >= lastStretch.mlen); + assert(cur == last_pos - lastStretch.mlen); - /* Set the next chunk's repcodes based on the repcodes of the beginning - * of the last match, and the last sequence. This avoids us having to - * update them while traversing the sequences. 
- */ - if (lastSequence.mlen != 0) { - repcodes_t const reps = ZSTD_newRep(opt[cur].rep, lastSequence.off, lastSequence.litlen==0); - ZSTD_memcpy(rep, &reps, sizeof(reps)); + if (lastStretch.mlen==0) { + /* no solution : all matches have been converted into literals */ + assert(lastStretch.litlen == (ip - anchor) + last_pos); + ip += last_pos; + continue; + } + assert(lastStretch.off > 0); + + /* Update offset history */ + if (lastStretch.litlen == 0) { + /* finishing on a match : update offset history */ + repcodes_t const reps = ZSTD_newRep(opt[cur].rep, lastStretch.off, opt[cur].litlen==0); + ZSTD_memcpy(rep, &reps, sizeof(repcodes_t)); } else { - ZSTD_memcpy(rep, opt[cur].rep, sizeof(repcodes_t)); + ZSTD_memcpy(rep, lastStretch.rep, sizeof(repcodes_t)); + assert(cur >= lastStretch.litlen); + cur -= lastStretch.litlen; } - { U32 const storeEnd = cur + 1; + /* Let's write the shortest path solution. + * It is stored in @opt in reverse order, + * starting from @storeEnd (==cur+2), + * effectively partially @opt overwriting. + * Content is changed too: + * - So far, @opt stored stretches, aka a match followed by literals + * - Now, it will store sequences, aka literals followed by a match + */ + { U32 const storeEnd = cur + 2; U32 storeStart = storeEnd; - U32 seqPos = cur; + U32 stretchPos = cur; DEBUGLOG(6, "start reverse traversal (last_pos:%u, cur:%u)", last_pos, cur); (void)last_pos; - assert(storeEnd < ZSTD_OPT_NUM); - DEBUGLOG(6, "last sequence copied into pos=%u (llen=%u,mlen=%u,ofc=%u)", - storeEnd, lastSequence.litlen, lastSequence.mlen, lastSequence.off); - opt[storeEnd] = lastSequence; - while (seqPos > 0) { - U32 const backDist = ZSTD_totalLen(opt[seqPos]); + assert(storeEnd < ZSTD_OPT_SIZE); + DEBUGLOG(6, "last stretch copied into pos=%u (llen=%u,mlen=%u,ofc=%u)", + storeEnd, lastStretch.litlen, lastStretch.mlen, lastStretch.off); + if (lastStretch.litlen > 0) { + /* last "sequence" is unfinished: just a bunch of literals */ + opt[storeEnd].litlen = lastStretch.litlen; + opt[storeEnd].mlen = 0; + storeStart = storeEnd-1; + opt[storeStart] = lastStretch; + } { + opt[storeEnd] = lastStretch; /* note: litlen will be fixed */ + storeStart = storeEnd; + } + while (1) { + ZSTD_optimal_t nextStretch = opt[stretchPos]; + opt[storeStart].litlen = nextStretch.litlen; + DEBUGLOG(6, "selected sequence (llen=%u,mlen=%u,ofc=%u)", + opt[storeStart].litlen, opt[storeStart].mlen, opt[storeStart].off); + if (nextStretch.mlen == 0) { + /* reaching beginning of segment */ + break; + } storeStart--; - DEBUGLOG(6, "sequence from rPos=%u copied into pos=%u (llen=%u,mlen=%u,ofc=%u)", - seqPos, storeStart, opt[seqPos].litlen, opt[seqPos].mlen, opt[seqPos].off); - opt[storeStart] = opt[seqPos]; - seqPos = (seqPos > backDist) ? 
seqPos - backDist : 0; + opt[storeStart] = nextStretch; /* note: litlen will be fixed */ + assert(nextStretch.litlen + nextStretch.mlen <= stretchPos); + stretchPos -= nextStretch.litlen + nextStretch.mlen; } /* save sequences */ - DEBUGLOG(6, "sending selected sequences into seqStore") + DEBUGLOG(6, "sending selected sequences into seqStore"); { U32 storePos; for (storePos=storeStart; storePos <= storeEnd; storePos++) { U32 const llen = opt[storePos].litlen; @@ -1339,6 +1426,9 @@ _shortestPath: /* cur, last_pos, best_mlen, best_off have to be set */ anchor += advance; ip = anchor; } } + DEBUGLOG(7, "new offset history : %u, %u, %u", rep[0], rep[1], rep[2]); + + /* update all costs */ ZSTD_setBasePrices(optStatePtr, optLevel); } } /* while (ip < ilimit) */ @@ -1346,21 +1436,27 @@ _shortestPath: /* cur, last_pos, best_mlen, best_off have to be set */ /* Return the last literals size */ return (size_t)(iend - anchor); } +#endif /* build exclusions */ +#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR static size_t ZSTD_compressBlock_opt0( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode) { return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /* optLevel */, dictMode); } +#endif +#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR static size_t ZSTD_compressBlock_opt2( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode) { return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /* optLevel */, dictMode); } +#endif +#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR size_t ZSTD_compressBlock_btopt( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], const void* src, size_t srcSize) @@ -1368,20 +1464,23 @@ size_t ZSTD_compressBlock_btopt( DEBUGLOG(5, "ZSTD_compressBlock_btopt"); return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_noDict); } +#endif +#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR /* ZSTD_initStats_ultra(): * make a first compression pass, just to seed stats with more accurate starting values. * only works on first block, with no dictionary and no ldm. * this function cannot error out, its narrow contract must be respected. */ -static void -ZSTD_initStats_ultra(ZSTD_matchState_t* ms, - seqStore_t* seqStore, - U32 rep[ZSTD_REP_NUM], - const void* src, size_t srcSize) +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +void ZSTD_initStats_ultra(ZSTD_matchState_t* ms, + seqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) { U32 tmpRep[ZSTD_REP_NUM]; /* updated rep codes will sink here */ ZSTD_memcpy(tmpRep, rep, sizeof(tmpRep)); @@ -1425,7 +1524,7 @@ size_t ZSTD_compressBlock_btultra2( * Consequently, this can only work if no data has been previously loaded in tables, * aka, no dictionary, no prefix, no ldm preprocessing. * The compression ratio gain is generally small (~0.5% on first block), - ** the cost is 2x cpu time on first block. */ + * the cost is 2x cpu time on first block. 
*/ assert(srcSize <= ZSTD_BLOCKSIZE_MAX); if ( (ms->opt.litLengthSum==0) /* first block */ && (seqStore->sequences == seqStore->sequencesStart) /* no ldm */ @@ -1438,7 +1537,9 @@ size_t ZSTD_compressBlock_btultra2( return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_noDict); } +#endif +#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR size_t ZSTD_compressBlock_btopt_dictMatchState( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], const void* src, size_t srcSize) @@ -1446,18 +1547,20 @@ size_t ZSTD_compressBlock_btopt_dictMatchState( return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState); } -size_t ZSTD_compressBlock_btultra_dictMatchState( +size_t ZSTD_compressBlock_btopt_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], const void* src, size_t srcSize) { - return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState); + return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_extDict); } +#endif -size_t ZSTD_compressBlock_btopt_extDict( +#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR +size_t ZSTD_compressBlock_btultra_dictMatchState( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], const void* src, size_t srcSize) { - return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_extDict); + return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState); } size_t ZSTD_compressBlock_btultra_extDict( @@ -1466,6 +1569,7 @@ size_t ZSTD_compressBlock_btultra_extDict( { return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_extDict); } +#endif /* note : no btultra2 variant for extDict nor dictMatchState, * because btultra2 is not meant to work with dictionaries diff --git a/thirdparty/zstd/compress/zstd_opt.h b/thirdparty/zstd/compress/zstd_opt.h index 342e5a3112..d4e7113157 100644 --- a/thirdparty/zstd/compress/zstd_opt.h +++ b/thirdparty/zstd/compress/zstd_opt.h @@ -17,30 +17,40 @@ extern "C" { #include "zstd_compress_internal.h" +#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR) /* used in ZSTD_loadDictionaryContent() */ void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend); +#endif +#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR size_t ZSTD_compressBlock_btopt( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -size_t ZSTD_compressBlock_btultra( +size_t ZSTD_compressBlock_btopt_dictMatchState( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -size_t ZSTD_compressBlock_btultra2( +size_t ZSTD_compressBlock_btopt_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); +#define ZSTD_COMPRESSBLOCK_BTOPT ZSTD_compressBlock_btopt +#define ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE ZSTD_compressBlock_btopt_dictMatchState +#define ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT ZSTD_compressBlock_btopt_extDict +#else +#define ZSTD_COMPRESSBLOCK_BTOPT NULL +#define ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE NULL +#define ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT NULL +#endif -size_t ZSTD_compressBlock_btopt_dictMatchState( +#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR +size_t ZSTD_compressBlock_btultra( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); size_t ZSTD_compressBlock_btultra_dictMatchState( 
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); - -size_t ZSTD_compressBlock_btopt_extDict( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize); size_t ZSTD_compressBlock_btultra_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); @@ -48,6 +58,20 @@ size_t ZSTD_compressBlock_btultra_extDict( /* note : no btultra2 variant for extDict nor dictMatchState, * because btultra2 is not meant to work with dictionaries * and is only specific for the first block (no prefix) */ +size_t ZSTD_compressBlock_btultra2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + +#define ZSTD_COMPRESSBLOCK_BTULTRA ZSTD_compressBlock_btultra +#define ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE ZSTD_compressBlock_btultra_dictMatchState +#define ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT ZSTD_compressBlock_btultra_extDict +#define ZSTD_COMPRESSBLOCK_BTULTRA2 ZSTD_compressBlock_btultra2 +#else +#define ZSTD_COMPRESSBLOCK_BTULTRA NULL +#define ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE NULL +#define ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT NULL +#define ZSTD_COMPRESSBLOCK_BTULTRA2 NULL +#endif #if defined (__cplusplus) } diff --git a/thirdparty/zstd/compress/zstdmt_compress.c b/thirdparty/zstd/compress/zstdmt_compress.c index 6786075569..86ccce3184 100644 --- a/thirdparty/zstd/compress/zstdmt_compress.c +++ b/thirdparty/zstd/compress/zstdmt_compress.c @@ -15,17 +15,13 @@ #endif -/* ====== Constants ====== */ -#define ZSTDMT_OVERLAPLOG_DEFAULT 0 - - /* ====== Dependencies ====== */ -#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */ +#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */ #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memset, INT_MAX, UINT_MAX */ #include "../common/mem.h" /* MEM_STATIC */ #include "../common/pool.h" /* threadpool */ #include "../common/threading.h" /* mutex */ -#include "zstd_compress_internal.h" /* MIN, ERROR, ZSTD_*, ZSTD_highbit32 */ +#include "zstd_compress_internal.h" /* MIN, ERROR, ZSTD_*, ZSTD_highbit32 */ #include "zstd_ldm.h" #include "zstdmt_compress.h" @@ -44,12 +40,13 @@ # include <unistd.h> # include <sys/times.h> -# define DEBUG_PRINTHEX(l,p,n) { \ - unsigned debug_u; \ - for (debug_u=0; debug_u<(n); debug_u++) \ - RAWLOG(l, "%02X ", ((const unsigned char*)(p))[debug_u]); \ - RAWLOG(l, " \n"); \ -} +# define DEBUG_PRINTHEX(l,p,n) \ + do { \ + unsigned debug_u; \ + for (debug_u=0; debug_u<(n); debug_u++) \ + RAWLOG(l, "%02X ", ((const unsigned char*)(p))[debug_u]); \ + RAWLOG(l, " \n"); \ + } while (0) static unsigned long long GetCurrentClockTimeMicroseconds(void) { @@ -61,25 +58,28 @@ static unsigned long long GetCurrentClockTimeMicroseconds(void) } } #define MUTEX_WAIT_TIME_DLEVEL 6 -#define ZSTD_PTHREAD_MUTEX_LOCK(mutex) { \ - if (DEBUGLEVEL >= MUTEX_WAIT_TIME_DLEVEL) { \ - unsigned long long const beforeTime = GetCurrentClockTimeMicroseconds(); \ - ZSTD_pthread_mutex_lock(mutex); \ - { unsigned long long const afterTime = GetCurrentClockTimeMicroseconds(); \ - unsigned long long const elapsedTime = (afterTime-beforeTime); \ - if (elapsedTime > 1000) { /* or whatever threshold you like; I'm using 1 millisecond here */ \ - DEBUGLOG(MUTEX_WAIT_TIME_DLEVEL, "Thread took %llu microseconds to acquire mutex %s \n", \ - elapsedTime, #mutex); \ - } } \ - } else { \ - 
ZSTD_pthread_mutex_lock(mutex); \ - } \ -} +#define ZSTD_PTHREAD_MUTEX_LOCK(mutex) \ + do { \ + if (DEBUGLEVEL >= MUTEX_WAIT_TIME_DLEVEL) { \ + unsigned long long const beforeTime = GetCurrentClockTimeMicroseconds(); \ + ZSTD_pthread_mutex_lock(mutex); \ + { unsigned long long const afterTime = GetCurrentClockTimeMicroseconds(); \ + unsigned long long const elapsedTime = (afterTime-beforeTime); \ + if (elapsedTime > 1000) { \ + /* or whatever threshold you like; I'm using 1 millisecond here */ \ + DEBUGLOG(MUTEX_WAIT_TIME_DLEVEL, \ + "Thread took %llu microseconds to acquire mutex %s \n", \ + elapsedTime, #mutex); \ + } } \ + } else { \ + ZSTD_pthread_mutex_lock(mutex); \ + } \ + } while (0) #else # define ZSTD_PTHREAD_MUTEX_LOCK(m) ZSTD_pthread_mutex_lock(m) -# define DEBUG_PRINTHEX(l,p,n) {} +# define DEBUG_PRINTHEX(l,p,n) do { } while (0) #endif @@ -100,18 +100,39 @@ typedef struct ZSTDMT_bufferPool_s { unsigned totalBuffers; unsigned nbBuffers; ZSTD_customMem cMem; - buffer_t bTable[1]; /* variable size */ + buffer_t* buffers; } ZSTDMT_bufferPool; +static void ZSTDMT_freeBufferPool(ZSTDMT_bufferPool* bufPool) +{ + DEBUGLOG(3, "ZSTDMT_freeBufferPool (address:%08X)", (U32)(size_t)bufPool); + if (!bufPool) return; /* compatibility with free on NULL */ + if (bufPool->buffers) { + unsigned u; + for (u=0; u<bufPool->totalBuffers; u++) { + DEBUGLOG(4, "free buffer %2u (address:%08X)", u, (U32)(size_t)bufPool->buffers[u].start); + ZSTD_customFree(bufPool->buffers[u].start, bufPool->cMem); + } + ZSTD_customFree(bufPool->buffers, bufPool->cMem); + } + ZSTD_pthread_mutex_destroy(&bufPool->poolMutex); + ZSTD_customFree(bufPool, bufPool->cMem); +} + static ZSTDMT_bufferPool* ZSTDMT_createBufferPool(unsigned maxNbBuffers, ZSTD_customMem cMem) { - ZSTDMT_bufferPool* const bufPool = (ZSTDMT_bufferPool*)ZSTD_customCalloc( - sizeof(ZSTDMT_bufferPool) + (maxNbBuffers-1) * sizeof(buffer_t), cMem); + ZSTDMT_bufferPool* const bufPool = + (ZSTDMT_bufferPool*)ZSTD_customCalloc(sizeof(ZSTDMT_bufferPool), cMem); if (bufPool==NULL) return NULL; if (ZSTD_pthread_mutex_init(&bufPool->poolMutex, NULL)) { ZSTD_customFree(bufPool, cMem); return NULL; } + bufPool->buffers = (buffer_t*)ZSTD_customCalloc(maxNbBuffers * sizeof(buffer_t), cMem); + if (bufPool->buffers==NULL) { + ZSTDMT_freeBufferPool(bufPool); + return NULL; + } bufPool->bufferSize = 64 KB; bufPool->totalBuffers = maxNbBuffers; bufPool->nbBuffers = 0; @@ -119,32 +140,19 @@ static ZSTDMT_bufferPool* ZSTDMT_createBufferPool(unsigned maxNbBuffers, ZSTD_cu return bufPool; } -static void ZSTDMT_freeBufferPool(ZSTDMT_bufferPool* bufPool) -{ - unsigned u; - DEBUGLOG(3, "ZSTDMT_freeBufferPool (address:%08X)", (U32)(size_t)bufPool); - if (!bufPool) return; /* compatibility with free on NULL */ - for (u=0; u<bufPool->totalBuffers; u++) { - DEBUGLOG(4, "free buffer %2u (address:%08X)", u, (U32)(size_t)bufPool->bTable[u].start); - ZSTD_customFree(bufPool->bTable[u].start, bufPool->cMem); - } - ZSTD_pthread_mutex_destroy(&bufPool->poolMutex); - ZSTD_customFree(bufPool, bufPool->cMem); -} - /* only works at initialization, not during compression */ static size_t ZSTDMT_sizeof_bufferPool(ZSTDMT_bufferPool* bufPool) { - size_t const poolSize = sizeof(*bufPool) - + (bufPool->totalBuffers - 1) * sizeof(buffer_t); + size_t const poolSize = sizeof(*bufPool); + size_t const arraySize = bufPool->totalBuffers * sizeof(buffer_t); unsigned u; size_t totalBufferSize = 0; ZSTD_pthread_mutex_lock(&bufPool->poolMutex); for (u=0; u<bufPool->totalBuffers; u++) - totalBufferSize += 
bufPool->bTable[u].capacity; + totalBufferSize += bufPool->buffers[u].capacity; ZSTD_pthread_mutex_unlock(&bufPool->poolMutex); - return poolSize + totalBufferSize; + return poolSize + arraySize + totalBufferSize; } /* ZSTDMT_setBufferSize() : @@ -187,9 +195,9 @@ static buffer_t ZSTDMT_getBuffer(ZSTDMT_bufferPool* bufPool) DEBUGLOG(5, "ZSTDMT_getBuffer: bSize = %u", (U32)bufPool->bufferSize); ZSTD_pthread_mutex_lock(&bufPool->poolMutex); if (bufPool->nbBuffers) { /* try to use an existing buffer */ - buffer_t const buf = bufPool->bTable[--(bufPool->nbBuffers)]; + buffer_t const buf = bufPool->buffers[--(bufPool->nbBuffers)]; size_t const availBufferSize = buf.capacity; - bufPool->bTable[bufPool->nbBuffers] = g_nullBuffer; + bufPool->buffers[bufPool->nbBuffers] = g_nullBuffer; if ((availBufferSize >= bSize) & ((availBufferSize>>3) <= bSize)) { /* large enough, but not too much */ DEBUGLOG(5, "ZSTDMT_getBuffer: provide buffer %u of size %u", @@ -250,14 +258,14 @@ static void ZSTDMT_releaseBuffer(ZSTDMT_bufferPool* bufPool, buffer_t buf) if (buf.start == NULL) return; /* compatible with release on NULL */ ZSTD_pthread_mutex_lock(&bufPool->poolMutex); if (bufPool->nbBuffers < bufPool->totalBuffers) { - bufPool->bTable[bufPool->nbBuffers++] = buf; /* stored for later use */ + bufPool->buffers[bufPool->nbBuffers++] = buf; /* stored for later use */ DEBUGLOG(5, "ZSTDMT_releaseBuffer: stored buffer of size %u in slot %u", (U32)buf.capacity, (U32)(bufPool->nbBuffers-1)); ZSTD_pthread_mutex_unlock(&bufPool->poolMutex); return; } ZSTD_pthread_mutex_unlock(&bufPool->poolMutex); - /* Reached bufferPool capacity (should not happen) */ + /* Reached bufferPool capacity (note: should not happen) */ DEBUGLOG(5, "ZSTDMT_releaseBuffer: pool capacity reached => freeing "); ZSTD_customFree(buf.start, bufPool->cMem); } @@ -350,16 +358,20 @@ typedef struct { int totalCCtx; int availCCtx; ZSTD_customMem cMem; - ZSTD_CCtx* cctx[1]; /* variable size */ + ZSTD_CCtx** cctxs; } ZSTDMT_CCtxPool; -/* note : all CCtx borrowed from the pool should be released back to the pool _before_ freeing the pool */ +/* note : all CCtx borrowed from the pool must be reverted back to the pool _before_ freeing the pool */ static void ZSTDMT_freeCCtxPool(ZSTDMT_CCtxPool* pool) { - int cid; - for (cid=0; cid<pool->totalCCtx; cid++) - ZSTD_freeCCtx(pool->cctx[cid]); /* note : compatible with free on NULL */ + if (!pool) return; ZSTD_pthread_mutex_destroy(&pool->poolMutex); + if (pool->cctxs) { + int cid; + for (cid=0; cid<pool->totalCCtx; cid++) + ZSTD_freeCCtx(pool->cctxs[cid]); /* free compatible with NULL */ + ZSTD_customFree(pool->cctxs, pool->cMem); + } ZSTD_customFree(pool, pool->cMem); } @@ -368,19 +380,24 @@ static void ZSTDMT_freeCCtxPool(ZSTDMT_CCtxPool* pool) static ZSTDMT_CCtxPool* ZSTDMT_createCCtxPool(int nbWorkers, ZSTD_customMem cMem) { - ZSTDMT_CCtxPool* const cctxPool = (ZSTDMT_CCtxPool*) ZSTD_customCalloc( - sizeof(ZSTDMT_CCtxPool) + (nbWorkers-1)*sizeof(ZSTD_CCtx*), cMem); + ZSTDMT_CCtxPool* const cctxPool = + (ZSTDMT_CCtxPool*) ZSTD_customCalloc(sizeof(ZSTDMT_CCtxPool), cMem); assert(nbWorkers > 0); if (!cctxPool) return NULL; if (ZSTD_pthread_mutex_init(&cctxPool->poolMutex, NULL)) { ZSTD_customFree(cctxPool, cMem); return NULL; } - cctxPool->cMem = cMem; cctxPool->totalCCtx = nbWorkers; + cctxPool->cctxs = (ZSTD_CCtx**)ZSTD_customCalloc(nbWorkers * sizeof(ZSTD_CCtx*), cMem); + if (!cctxPool->cctxs) { + ZSTDMT_freeCCtxPool(cctxPool); + return NULL; + } + cctxPool->cMem = cMem; + cctxPool->cctxs[0] = 
ZSTD_createCCtx_advanced(cMem); + if (!cctxPool->cctxs[0]) { ZSTDMT_freeCCtxPool(cctxPool); return NULL; } cctxPool->availCCtx = 1; /* at least one cctx for single-thread mode */ - cctxPool->cctx[0] = ZSTD_createCCtx_advanced(cMem); - if (!cctxPool->cctx[0]) { ZSTDMT_freeCCtxPool(cctxPool); return NULL; } DEBUGLOG(3, "cctxPool created, with %u workers", nbWorkers); return cctxPool; } @@ -402,16 +419,16 @@ static size_t ZSTDMT_sizeof_CCtxPool(ZSTDMT_CCtxPool* cctxPool) { ZSTD_pthread_mutex_lock(&cctxPool->poolMutex); { unsigned const nbWorkers = cctxPool->totalCCtx; - size_t const poolSize = sizeof(*cctxPool) - + (nbWorkers-1) * sizeof(ZSTD_CCtx*); - unsigned u; + size_t const poolSize = sizeof(*cctxPool); + size_t const arraySize = cctxPool->totalCCtx * sizeof(ZSTD_CCtx*); size_t totalCCtxSize = 0; + unsigned u; for (u=0; u<nbWorkers; u++) { - totalCCtxSize += ZSTD_sizeof_CCtx(cctxPool->cctx[u]); + totalCCtxSize += ZSTD_sizeof_CCtx(cctxPool->cctxs[u]); } ZSTD_pthread_mutex_unlock(&cctxPool->poolMutex); assert(nbWorkers > 0); - return poolSize + totalCCtxSize; + return poolSize + arraySize + totalCCtxSize; } } @@ -421,7 +438,7 @@ static ZSTD_CCtx* ZSTDMT_getCCtx(ZSTDMT_CCtxPool* cctxPool) ZSTD_pthread_mutex_lock(&cctxPool->poolMutex); if (cctxPool->availCCtx) { cctxPool->availCCtx--; - { ZSTD_CCtx* const cctx = cctxPool->cctx[cctxPool->availCCtx]; + { ZSTD_CCtx* const cctx = cctxPool->cctxs[cctxPool->availCCtx]; ZSTD_pthread_mutex_unlock(&cctxPool->poolMutex); return cctx; } } @@ -435,7 +452,7 @@ static void ZSTDMT_releaseCCtx(ZSTDMT_CCtxPool* pool, ZSTD_CCtx* cctx) if (cctx==NULL) return; /* compatibility with release on NULL */ ZSTD_pthread_mutex_lock(&pool->poolMutex); if (pool->availCCtx < pool->totalCCtx) - pool->cctx[pool->availCCtx++] = cctx; + pool->cctxs[pool->availCCtx++] = cctx; else { /* pool overflow : should not happen, since totalCCtx==nbWorkers */ DEBUGLOG(4, "CCtx pool overflow : free cctx"); @@ -601,11 +618,8 @@ static void ZSTDMT_serialState_update(serialState_t* serialState, ZSTD_pthread_mutex_unlock(&serialState->mutex); if (seqStore.size > 0) { - size_t const err = ZSTD_referenceExternalSequences( - jobCCtx, seqStore.seq, seqStore.size); + ZSTD_referenceExternalSequences(jobCCtx, seqStore.seq, seqStore.size); assert(serialState->params.ldmParams.enableLdm == ZSTD_ps_enable); - assert(!ZSTD_isError(err)); - (void)err; } } @@ -657,12 +671,13 @@ typedef struct { unsigned frameChecksumNeeded; /* used only by mtctx */ } ZSTDMT_jobDescription; -#define JOB_ERROR(e) { \ - ZSTD_PTHREAD_MUTEX_LOCK(&job->job_mutex); \ - job->cSize = e; \ - ZSTD_pthread_mutex_unlock(&job->job_mutex); \ - goto _endJob; \ -} +#define JOB_ERROR(e) \ + do { \ + ZSTD_PTHREAD_MUTEX_LOCK(&job->job_mutex); \ + job->cSize = e; \ + ZSTD_pthread_mutex_unlock(&job->job_mutex); \ + goto _endJob; \ + } while (0) /* ZSTDMT_compressionJob() is a POOL_function type */ static void ZSTDMT_compressionJob(void* jobDescription) @@ -1091,7 +1106,7 @@ ZSTD_frameProgression ZSTDMT_getFrameProgression(ZSTDMT_CCtx* mtctx) { unsigned jobNb; unsigned lastJobNb = mtctx->nextJobID + mtctx->jobReady; assert(mtctx->jobReady <= 1); DEBUGLOG(6, "ZSTDMT_getFrameProgression: jobs: from %u to <%u (jobReady:%u)", - mtctx->doneJobID, lastJobNb, mtctx->jobReady) + mtctx->doneJobID, lastJobNb, mtctx->jobReady); for (jobNb = mtctx->doneJobID ; jobNb < lastJobNb ; jobNb++) { unsigned const wJobID = jobNb & mtctx->jobIDMask; ZSTDMT_jobDescription* jobPtr = &mtctx->jobs[wJobID]; diff --git a/thirdparty/zstd/decompress/huf_decompress.c 
b/thirdparty/zstd/decompress/huf_decompress.c index 5b217ac586..f85dd0beea 100644 --- a/thirdparty/zstd/decompress/huf_decompress.c +++ b/thirdparty/zstd/decompress/huf_decompress.c @@ -34,6 +34,12 @@ * Macros ****************************************************************/ +#ifdef HUF_DISABLE_FAST_DECODE +# define HUF_ENABLE_FAST_DECODE 0 +#else +# define HUF_ENABLE_FAST_DECODE 1 +#endif + /* These two optional macros force the use one way or another of the two * Huffman decompression implementations. You can't force in both directions * at the same time. @@ -158,17 +164,18 @@ static size_t HUF_initFastDStream(BYTE const* ip) { * op [in/out] - The output pointers, must be updated to reflect what is written. * bits [in/out] - The bitstream containers, must be updated to reflect the current state. * dt [in] - The decoding table. - * ilimit [in] - The input limit, stop when any input pointer is below ilimit. + * ilowest [in] - The beginning of the valid range of the input. Decoders may read + * down to this pointer. It may be below iend[0]. * oend [in] - The end of the output stream. op[3] must not cross oend. * iend [in] - The end of each input stream. ip[i] may cross iend[i], - * as long as it is above ilimit, but that indicates corruption. + * as long as it is above ilowest, but that indicates corruption. */ typedef struct { BYTE const* ip[4]; BYTE* op[4]; U64 bits[4]; void const* dt; - BYTE const* ilimit; + BYTE const* ilowest; BYTE* oend; BYTE const* iend[4]; } HUF_DecompressFastArgs; @@ -186,9 +193,9 @@ static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* ds void const* dt = DTable + 1; U32 const dtLog = HUF_getDTableDesc(DTable).tableLog; - const BYTE* const ilimit = (const BYTE*)src + 6 + 8; + const BYTE* const istart = (const BYTE*)src; - BYTE* const oend = (BYTE*)dst + dstSize; + BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize); /* The fast decoding loop assumes 64-bit little-endian. * This condition is false on x32. @@ -196,6 +203,11 @@ static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* ds if (!MEM_isLittleEndian() || MEM_32bits()) return 0; + /* Avoid nullptr addition */ + if (dstSize == 0) + return 0; + assert(dst != NULL); + /* strict minimum : jump table + 1 byte per stream */ if (srcSize < 10) return ERROR(corruption_detected); @@ -209,7 +221,6 @@ static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* ds /* Read the jump table. */ { - const BYTE* const istart = (const BYTE*)src; size_t const length1 = MEM_readLE16(istart); size_t const length2 = MEM_readLE16(istart+2); size_t const length3 = MEM_readLE16(istart+4); @@ -221,10 +232,8 @@ static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* ds /* HUF_initFastDStream() requires this, and this small of an input * won't benefit from the ASM loop anyways. - * length1 must be >= 16 so that ip[0] >= ilimit before the loop - * starts. */ - if (length1 < 16 || length2 < 8 || length3 < 8 || length4 < 8) + if (length1 < 8 || length2 < 8 || length3 < 8 || length4 < 8) return 0; if (length4 > srcSize) return ERROR(corruption_detected); /* overflow */ } @@ -256,11 +265,12 @@ static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* ds args->bits[2] = HUF_initFastDStream(args->ip[2]); args->bits[3] = HUF_initFastDStream(args->ip[3]); - /* If ip[] >= ilimit, it is guaranteed to be safe to - * reload bits[]. It may be beyond its section, but is - * guaranteed to be valid (>= istart). 
- */ - args->ilimit = ilimit; + /* The decoders must be sure to never read beyond ilowest. + * This is lower than iend[0], but allowing decoders to read + * down to ilowest can allow an extra iteration or two in the + * fast loop. + */ + args->ilowest = istart; args->oend = oend; args->dt = dt; @@ -285,13 +295,31 @@ static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressFastArg assert(sizeof(size_t) == 8); bit->bitContainer = MEM_readLEST(args->ip[stream]); bit->bitsConsumed = ZSTD_countTrailingZeros64(args->bits[stream]); - bit->start = (const char*)args->iend[0]; + bit->start = (const char*)args->ilowest; bit->limitPtr = bit->start + sizeof(size_t); bit->ptr = (const char*)args->ip[stream]; return 0; } +/* Calls X(N) for each stream 0, 1, 2, 3. */ +#define HUF_4X_FOR_EACH_STREAM(X) \ + do { \ + X(0); \ + X(1); \ + X(2); \ + X(3); \ + } while (0) + +/* Calls X(N, var) for each stream 0, 1, 2, 3. */ +#define HUF_4X_FOR_EACH_STREAM_WITH_VAR(X, var) \ + do { \ + X(0, (var)); \ + X(1, (var)); \ + X(2, (var)); \ + X(3, (var)); \ + } while (0) + #ifndef HUF_FORCE_DECOMPRESS_X2 @@ -500,15 +528,19 @@ HUF_decodeSymbolX1(BIT_DStream_t* Dstream, const HUF_DEltX1* dt, const U32 dtLog } #define HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) \ - *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog) + do { *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog); } while (0) -#define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr) \ - if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ - HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) +#define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr) \ + do { \ + if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ + HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \ + } while (0) -#define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \ - if (MEM_64bits()) \ - HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) +#define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \ + do { \ + if (MEM_64bits()) \ + HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \ + } while (0) HINT_INLINE size_t HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX1* const dt, const U32 dtLog) @@ -546,7 +578,7 @@ HUF_decompress1X1_usingDTable_internal_body( const HUF_DTable* DTable) { BYTE* op = (BYTE*)dst; - BYTE* const oend = op + dstSize; + BYTE* const oend = ZSTD_maybeNullPtrAdd(op, dstSize); const void* dtPtr = DTable + 1; const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr; BIT_DStream_t bitD; @@ -574,6 +606,7 @@ HUF_decompress4X1_usingDTable_internal_body( { /* Check */ if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */ + if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */ { const BYTE* const istart = (const BYTE*) cSrc; BYTE* const ostart = (BYTE*) dst; @@ -609,7 +642,7 @@ HUF_decompress4X1_usingDTable_internal_body( if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ - if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */ + assert(dstSize >= 6); /* validated above */ CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); @@ -692,7 +725,7 @@ void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* BYTE* op[4]; U16 const* const dtable = (U16 const*)args->dt; BYTE* const oend = args->oend; - BYTE const* const ilimit = args->ilimit; + BYTE const* const ilowest = args->ilowest; /* Copy the 
arguments to local variables */ ZSTD_memcpy(&bits, &args->bits, sizeof(bits)); @@ -705,13 +738,12 @@ void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* for (;;) { BYTE* olimit; int stream; - int symbol; /* Assert loop preconditions */ #ifndef NDEBUG for (stream = 0; stream < 4; ++stream) { assert(op[stream] <= (stream == 3 ? oend : op[stream + 1])); - assert(ip[stream] >= ilimit); + assert(ip[stream] >= ilowest); } #endif /* Compute olimit */ @@ -721,7 +753,7 @@ void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* /* Each iteration consumes up to 11 bits * 5 = 55 bits < 7 bytes * per stream. */ - size_t const iiters = (size_t)(ip[0] - ilimit) / 7; + size_t const iiters = (size_t)(ip[0] - ilowest) / 7; /* We can safely run iters iterations before running bounds checks */ size_t const iters = MIN(oiters, iiters); size_t const symbols = iters * 5; @@ -732,8 +764,8 @@ void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* */ olimit = op[3] + symbols; - /* Exit fast decoding loop once we get close to the end. */ - if (op[3] + 20 > olimit) + /* Exit fast decoding loop once we reach the end. */ + if (op[3] == olimit) break; /* Exit the decoding loop if any input pointer has crossed the @@ -752,27 +784,42 @@ void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* } #endif +#define HUF_4X1_DECODE_SYMBOL(_stream, _symbol) \ + do { \ + int const index = (int)(bits[(_stream)] >> 53); \ + int const entry = (int)dtable[index]; \ + bits[(_stream)] <<= (entry & 0x3F); \ + op[(_stream)][(_symbol)] = (BYTE)((entry >> 8) & 0xFF); \ + } while (0) + +#define HUF_4X1_RELOAD_STREAM(_stream) \ + do { \ + int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \ + int const nbBits = ctz & 7; \ + int const nbBytes = ctz >> 3; \ + op[(_stream)] += 5; \ + ip[(_stream)] -= nbBytes; \ + bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \ + bits[(_stream)] <<= nbBits; \ + } while (0) + + /* Manually unroll the loop because compilers don't consistently + * unroll the inner loops, which destroys performance. 
+ */ do { /* Decode 5 symbols in each of the 4 streams */ - for (symbol = 0; symbol < 5; ++symbol) { - for (stream = 0; stream < 4; ++stream) { - int const index = (int)(bits[stream] >> 53); - int const entry = (int)dtable[index]; - bits[stream] <<= (entry & 63); - op[stream][symbol] = (BYTE)((entry >> 8) & 0xFF); - } - } - /* Reload the bitstreams */ - for (stream = 0; stream < 4; ++stream) { - int const ctz = ZSTD_countTrailingZeros64(bits[stream]); - int const nbBits = ctz & 7; - int const nbBytes = ctz >> 3; - op[stream] += 5; - ip[stream] -= nbBytes; - bits[stream] = MEM_read64(ip[stream]) | 1; - bits[stream] <<= nbBits; - } + HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 0); + HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 1); + HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 2); + HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 3); + HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 4); + + /* Reload each of the 4 the bitstreams */ + HUF_4X_FOR_EACH_STREAM(HUF_4X1_RELOAD_STREAM); } while (op[3] < olimit); + +#undef HUF_4X1_DECODE_SYMBOL +#undef HUF_4X1_RELOAD_STREAM } _out: @@ -797,8 +844,8 @@ HUF_decompress4X1_usingDTable_internal_fast( HUF_DecompressFastLoopFn loopFn) { void const* dt = DTable + 1; - const BYTE* const iend = (const BYTE*)cSrc + 6; - BYTE* const oend = (BYTE*)dst + dstSize; + BYTE const* const ilowest = (BYTE const*)cSrc; + BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize); HUF_DecompressFastArgs args; { size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); FORWARD_IF_ERROR(ret, "Failed to init fast loop args"); @@ -806,18 +853,22 @@ HUF_decompress4X1_usingDTable_internal_fast( return 0; } - assert(args.ip[0] >= args.ilimit); + assert(args.ip[0] >= args.ilowest); loopFn(&args); - /* Our loop guarantees that ip[] >= ilimit and that we haven't + /* Our loop guarantees that ip[] >= ilowest and that we haven't * overwritten any op[]. */ - assert(args.ip[0] >= iend); - assert(args.ip[1] >= iend); - assert(args.ip[2] >= iend); - assert(args.ip[3] >= iend); + assert(args.ip[0] >= ilowest); + assert(args.ip[0] >= ilowest); + assert(args.ip[1] >= ilowest); + assert(args.ip[2] >= ilowest); + assert(args.ip[3] >= ilowest); assert(args.op[3] <= oend); - (void)iend; + + assert(ilowest == args.ilowest); + assert(ilowest + 6 == args.iend[0]); + (void)ilowest; /* finish bit streams one by one. 
*/ { size_t const segmentSize = (dstSize+3) / 4; @@ -868,7 +919,7 @@ static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, } #endif - if (!(flags & HUF_flags_disableFast)) { + if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) { size_t const ret = HUF_decompress4X1_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn); if (ret != 0) return ret; @@ -1239,15 +1290,19 @@ HUF_decodeLastSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, c } #define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \ - ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) + do { ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); } while (0) -#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \ - if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ - ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) +#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \ + do { \ + if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ + ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \ + } while (0) -#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \ - if (MEM_64bits()) \ - ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) +#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \ + do { \ + if (MEM_64bits()) \ + ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \ + } while (0) HINT_INLINE size_t HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd, @@ -1307,7 +1362,7 @@ HUF_decompress1X2_usingDTable_internal_body( /* decode */ { BYTE* const ostart = (BYTE*) dst; - BYTE* const oend = ostart + dstSize; + BYTE* const oend = ZSTD_maybeNullPtrAdd(ostart, dstSize); const void* const dtPtr = DTable+1; /* force compiler to not use strict-aliasing */ const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr; DTableDesc const dtd = HUF_getDTableDesc(DTable); @@ -1332,6 +1387,7 @@ HUF_decompress4X2_usingDTable_internal_body( const HUF_DTable* DTable) { if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */ + if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */ { const BYTE* const istart = (const BYTE*) cSrc; BYTE* const ostart = (BYTE*) dst; @@ -1367,7 +1423,7 @@ HUF_decompress4X2_usingDTable_internal_body( if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ - if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */ + assert(dstSize >= 6 /* validated above */); CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); @@ -1472,7 +1528,7 @@ void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* BYTE* op[4]; BYTE* oend[4]; HUF_DEltX2 const* const dtable = (HUF_DEltX2 const*)args->dt; - BYTE const* const ilimit = args->ilimit; + BYTE const* const ilowest = args->ilowest; /* Copy the arguments to local registers. 
*/ ZSTD_memcpy(&bits, &args->bits, sizeof(bits)); @@ -1490,13 +1546,12 @@ void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* for (;;) { BYTE* olimit; int stream; - int symbol; /* Assert loop preconditions */ #ifndef NDEBUG for (stream = 0; stream < 4; ++stream) { assert(op[stream] <= oend[stream]); - assert(ip[stream] >= ilimit); + assert(ip[stream] >= ilowest); } #endif /* Compute olimit */ @@ -1509,7 +1564,7 @@ void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* * We also know that each input pointer is >= ip[0]. So we can run * iters loops before running out of input. */ - size_t iters = (size_t)(ip[0] - ilimit) / 7; + size_t iters = (size_t)(ip[0] - ilowest) / 7; /* Each iteration can produce up to 10 bytes of output per stream. * Each output stream my advance at different rates. So take the * minimum number of safe iterations among all the output streams. @@ -1527,8 +1582,8 @@ void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* */ olimit = op[3] + (iters * 5); - /* Exit the fast decoding loop if we are too close to the end. */ - if (op[3] + 10 > olimit) + /* Exit the fast decoding loop once we reach the end. */ + if (op[3] == olimit) break; /* Exit the decoding loop if any input pointer has crossed the @@ -1547,54 +1602,58 @@ void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* } #endif +#define HUF_4X2_DECODE_SYMBOL(_stream, _decode3) \ + do { \ + if ((_decode3) || (_stream) != 3) { \ + int const index = (int)(bits[(_stream)] >> 53); \ + HUF_DEltX2 const entry = dtable[index]; \ + MEM_write16(op[(_stream)], entry.sequence); \ + bits[(_stream)] <<= (entry.nbBits) & 0x3F; \ + op[(_stream)] += (entry.length); \ + } \ + } while (0) + +#define HUF_4X2_RELOAD_STREAM(_stream) \ + do { \ + HUF_4X2_DECODE_SYMBOL(3, 1); \ + { \ + int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \ + int const nbBits = ctz & 7; \ + int const nbBytes = ctz >> 3; \ + ip[(_stream)] -= nbBytes; \ + bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \ + bits[(_stream)] <<= nbBits; \ + } \ + } while (0) + + /* Manually unroll the loop because compilers don't consistently + * unroll the inner loops, which destroys performance. + */ do { - /* Do 5 table lookups for each of the first 3 streams */ - for (symbol = 0; symbol < 5; ++symbol) { - for (stream = 0; stream < 3; ++stream) { - int const index = (int)(bits[stream] >> 53); - HUF_DEltX2 const entry = dtable[index]; - MEM_write16(op[stream], entry.sequence); - bits[stream] <<= (entry.nbBits); - op[stream] += (entry.length); - } - } - /* Do 1 table lookup from the final stream */ - { - int const index = (int)(bits[3] >> 53); - HUF_DEltX2 const entry = dtable[index]; - MEM_write16(op[3], entry.sequence); - bits[3] <<= (entry.nbBits); - op[3] += (entry.length); - } - /* Do 4 table lookups from the final stream & reload bitstreams */ - for (stream = 0; stream < 4; ++stream) { - /* Do a table lookup from the final stream. - * This is interleaved with the reloading to reduce register - * pressure. This shouldn't be necessary, but compilers can - * struggle with codegen with high register pressure. - */ - { - int const index = (int)(bits[3] >> 53); - HUF_DEltX2 const entry = dtable[index]; - MEM_write16(op[3], entry.sequence); - bits[3] <<= (entry.nbBits); - op[3] += (entry.length); - } - /* Reload the bistreams. The final bitstream must be reloaded - * after the 5th symbol was decoded. 
- */ - { - int const ctz = ZSTD_countTrailingZeros64(bits[stream]); - int const nbBits = ctz & 7; - int const nbBytes = ctz >> 3; - ip[stream] -= nbBytes; - bits[stream] = MEM_read64(ip[stream]) | 1; - bits[stream] <<= nbBits; - } - } + /* Decode 5 symbols from each of the first 3 streams. + * The final stream will be decoded during the reload phase + * to reduce register pressure. + */ + HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); + HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); + HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); + HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); + HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); + + /* Decode one symbol from the final stream */ + HUF_4X2_DECODE_SYMBOL(3, 1); + + /* Decode 4 symbols from the final stream & reload bitstreams. + * The final stream is reloaded last, meaning that all 5 symbols + * are decoded from the final stream before it is reloaded. + */ + HUF_4X_FOR_EACH_STREAM(HUF_4X2_RELOAD_STREAM); } while (op[3] < olimit); } +#undef HUF_4X2_DECODE_SYMBOL +#undef HUF_4X2_RELOAD_STREAM + _out: /* Save the final values of each of the state variables back to args. */ @@ -1611,8 +1670,8 @@ HUF_decompress4X2_usingDTable_internal_fast( const HUF_DTable* DTable, HUF_DecompressFastLoopFn loopFn) { void const* dt = DTable + 1; - const BYTE* const iend = (const BYTE*)cSrc + 6; - BYTE* const oend = (BYTE*)dst + dstSize; + const BYTE* const ilowest = (const BYTE*)cSrc; + BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize); HUF_DecompressFastArgs args; { size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); @@ -1621,16 +1680,19 @@ HUF_decompress4X2_usingDTable_internal_fast( return 0; } - assert(args.ip[0] >= args.ilimit); + assert(args.ip[0] >= args.ilowest); loopFn(&args); /* note : op4 already verified within main loop */ - assert(args.ip[0] >= iend); - assert(args.ip[1] >= iend); - assert(args.ip[2] >= iend); - assert(args.ip[3] >= iend); + assert(args.ip[0] >= ilowest); + assert(args.ip[1] >= ilowest); + assert(args.ip[2] >= ilowest); + assert(args.ip[3] >= ilowest); assert(args.op[3] <= oend); - (void)iend; + + assert(ilowest == args.ilowest); + assert(ilowest + 6 == args.iend[0]); + (void)ilowest; /* finish bitStreams one by one */ { @@ -1679,7 +1741,7 @@ static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, } #endif - if (!(flags & HUF_flags_disableFast)) { + if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) { size_t const ret = HUF_decompress4X2_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn); if (ret != 0) return ret; diff --git a/thirdparty/zstd/decompress/huf_decompress_amd64.S b/thirdparty/zstd/decompress/huf_decompress_amd64.S index 671624fe34..78da291ee3 100644 --- a/thirdparty/zstd/decompress/huf_decompress_amd64.S +++ b/thirdparty/zstd/decompress/huf_decompress_amd64.S @@ -10,11 +10,32 @@ #include "../common/portability_macros.h" +#if defined(__ELF__) && defined(__GNUC__) /* Stack marking * ref: https://wiki.gentoo.org/wiki/Hardened/GNU_stack_quickstart */ -#if defined(__ELF__) && defined(__GNUC__) .section .note.GNU-stack,"",%progbits + +#if defined(__aarch64__) +/* Mark that this assembly supports BTI & PAC, because it is empty for aarch64. 
+ * See: https://github.com/facebook/zstd/issues/3841 + * See: https://gcc.godbolt.org/z/sqr5T4ffK + * See: https://lore.kernel.org/linux-arm-kernel/20200429211641.9279-8-broonie@kernel.org/ + * See: https://reviews.llvm.org/D62609 + */ +.pushsection .note.gnu.property, "a" +.p2align 3 +.long 4 /* size of the name - "GNU\0" */ +.long 0x10 /* size of descriptor */ +.long 0x5 /* NT_GNU_PROPERTY_TYPE_0 */ +.asciz "GNU" +.long 0xc0000000 /* pr_type - GNU_PROPERTY_AARCH64_FEATURE_1_AND */ +.long 4 /* pr_datasz - 4 bytes */ +.long 3 /* pr_data - GNU_PROPERTY_AARCH64_FEATURE_1_BTI | GNU_PROPERTY_AARCH64_FEATURE_1_PAC */ +.p2align 3 /* pr_padding - bring everything to 8 byte alignment */ +.popsection +#endif + #endif #if ZSTD_ENABLE_ASM_X86_64_BMI2 @@ -131,7 +152,7 @@ HUF_decompress4X1_usingDTable_internal_fast_asm_loop: movq 88(%rax), %bits3 movq 96(%rax), %dtable push %rax /* argument */ - push 104(%rax) /* ilimit */ + push 104(%rax) /* ilowest */ push 112(%rax) /* oend */ push %olimit /* olimit space */ @@ -156,11 +177,11 @@ HUF_decompress4X1_usingDTable_internal_fast_asm_loop: shrq $2, %r15 movq %ip0, %rax /* rax = ip0 */ - movq 40(%rsp), %rdx /* rdx = ilimit */ - subq %rdx, %rax /* rax = ip0 - ilimit */ - movq %rax, %rbx /* rbx = ip0 - ilimit */ + movq 40(%rsp), %rdx /* rdx = ilowest */ + subq %rdx, %rax /* rax = ip0 - ilowest */ + movq %rax, %rbx /* rbx = ip0 - ilowest */ - /* rdx = (ip0 - ilimit) / 7 */ + /* rdx = (ip0 - ilowest) / 7 */ movabsq $2635249153387078803, %rdx mulq %rdx subq %rdx, %rbx @@ -183,9 +204,8 @@ HUF_decompress4X1_usingDTable_internal_fast_asm_loop: /* If (op3 + 20 > olimit) */ movq %op3, %rax /* rax = op3 */ - addq $20, %rax /* rax = op3 + 20 */ - cmpq %rax, %olimit /* op3 + 20 > olimit */ - jb .L_4X1_exit + cmpq %rax, %olimit /* op3 == olimit */ + je .L_4X1_exit /* If (ip1 < ip0) go to exit */ cmpq %ip0, %ip1 @@ -316,7 +336,7 @@ HUF_decompress4X1_usingDTable_internal_fast_asm_loop: /* Restore stack (oend & olimit) */ pop %rax /* olimit */ pop %rax /* oend */ - pop %rax /* ilimit */ + pop %rax /* ilowest */ pop %rax /* arg */ /* Save ip / op / bits */ @@ -387,7 +407,7 @@ HUF_decompress4X2_usingDTable_internal_fast_asm_loop: movq 96(%rax), %dtable push %rax /* argument */ push %rax /* olimit */ - push 104(%rax) /* ilimit */ + push 104(%rax) /* ilowest */ movq 112(%rax), %rax push %rax /* oend3 */ @@ -414,9 +434,9 @@ HUF_decompress4X2_usingDTable_internal_fast_asm_loop: /* We can consume up to 7 input bytes each iteration. 
*/ movq %ip0, %rax /* rax = ip0 */ - movq 40(%rsp), %rdx /* rdx = ilimit */ - subq %rdx, %rax /* rax = ip0 - ilimit */ - movq %rax, %r15 /* r15 = ip0 - ilimit */ + movq 40(%rsp), %rdx /* rdx = ilowest */ + subq %rdx, %rax /* rax = ip0 - ilowest */ + movq %rax, %r15 /* r15 = ip0 - ilowest */ /* rdx = rax / 7 */ movabsq $2635249153387078803, %rdx @@ -426,7 +446,7 @@ HUF_decompress4X2_usingDTable_internal_fast_asm_loop: addq %r15, %rdx shrq $2, %rdx - /* r15 = (ip0 - ilimit) / 7 */ + /* r15 = (ip0 - ilowest) / 7 */ movq %rdx, %r15 /* r15 = min(r15, min(oend0 - op0, oend1 - op1, oend2 - op2, oend3 - op3) / 10) */ @@ -467,9 +487,8 @@ HUF_decompress4X2_usingDTable_internal_fast_asm_loop: /* If (op3 + 10 > olimit) */ movq %op3, %rax /* rax = op3 */ - addq $10, %rax /* rax = op3 + 10 */ - cmpq %rax, %olimit /* op3 + 10 > olimit */ - jb .L_4X2_exit + cmpq %rax, %olimit /* op3 == olimit */ + je .L_4X2_exit /* If (ip1 < ip0) go to exit */ cmpq %ip0, %ip1 @@ -537,7 +556,7 @@ HUF_decompress4X2_usingDTable_internal_fast_asm_loop: pop %rax /* oend1 */ pop %rax /* oend2 */ pop %rax /* oend3 */ - pop %rax /* ilimit */ + pop %rax /* ilowest */ pop %rax /* olimit */ pop %rax /* arg */ diff --git a/thirdparty/zstd/decompress/zstd_decompress.c b/thirdparty/zstd/decompress/zstd_decompress.c index 7bc2713429..2f03cf7b0c 100644 --- a/thirdparty/zstd/decompress/zstd_decompress.c +++ b/thirdparty/zstd/decompress/zstd_decompress.c @@ -55,18 +55,19 @@ /*-******************************************************* * Dependencies *********************************************************/ -#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */ #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ +#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */ +#include "../common/error_private.h" +#include "../common/zstd_internal.h" /* blockProperties_t */ #include "../common/mem.h" /* low level memory routines */ +#include "../common/bits.h" /* ZSTD_highbit32 */ #define FSE_STATIC_LINKING_ONLY #include "../common/fse.h" #include "../common/huf.h" #include "../common/xxhash.h" /* XXH64_reset, XXH64_update, XXH64_digest, XXH64 */ -#include "../common/zstd_internal.h" /* blockProperties_t */ #include "zstd_decompress_internal.h" /* ZSTD_DCtx */ #include "zstd_ddict.h" /* ZSTD_DDictDictContent */ #include "zstd_decompress_block.h" /* ZSTD_decompressBlock_internal */ -#include "../common/bits.h" /* ZSTD_highbit32 */ #if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1) # include "../legacy/zstd_legacy.h" @@ -245,6 +246,7 @@ static void ZSTD_DCtx_resetParameters(ZSTD_DCtx* dctx) dctx->forceIgnoreChecksum = ZSTD_d_validateChecksum; dctx->refMultipleDDicts = ZSTD_rmd_refSingleDDict; dctx->disableHufAsm = 0; + dctx->maxBlockSizeParam = 0; } static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx) @@ -265,6 +267,7 @@ static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx) #endif dctx->noForwardProgress = 0; dctx->oversizedDuration = 0; + dctx->isFrameDecompression = 1; #if DYNAMIC_BMI2 dctx->bmi2 = ZSTD_cpuSupportsBmi2(); #endif @@ -726,17 +729,17 @@ static ZSTD_frameSizeInfo ZSTD_errorFrameSizeInfo(size_t ret) return frameSizeInfo; } -static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize) +static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize, ZSTD_format_e format) { ZSTD_frameSizeInfo frameSizeInfo; ZSTD_memset(&frameSizeInfo, 0, sizeof(ZSTD_frameSizeInfo)); #if 
defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1) - if (ZSTD_isLegacy(src, srcSize)) + if (format == ZSTD_f_zstd1 && ZSTD_isLegacy(src, srcSize)) return ZSTD_findFrameSizeInfoLegacy(src, srcSize); #endif - if ((srcSize >= ZSTD_SKIPPABLEHEADERSIZE) + if (format == ZSTD_f_zstd1 && (srcSize >= ZSTD_SKIPPABLEHEADERSIZE) && (MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { frameSizeInfo.compressedSize = readSkippableFrameSize(src, srcSize); assert(ZSTD_isError(frameSizeInfo.compressedSize) || @@ -750,7 +753,7 @@ static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize ZSTD_frameHeader zfh; /* Extract Frame Header */ - { size_t const ret = ZSTD_getFrameHeader(&zfh, src, srcSize); + { size_t const ret = ZSTD_getFrameHeader_advanced(&zfh, src, srcSize, format); if (ZSTD_isError(ret)) return ZSTD_errorFrameSizeInfo(ret); if (ret > 0) @@ -793,15 +796,17 @@ static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize } } +static size_t ZSTD_findFrameCompressedSize_advanced(const void *src, size_t srcSize, ZSTD_format_e format) { + ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, format); + return frameSizeInfo.compressedSize; +} + /** ZSTD_findFrameCompressedSize() : - * compatible with legacy mode - * `src` must point to the start of a ZSTD frame, ZSTD legacy frame, or skippable frame - * `srcSize` must be at least as large as the frame contained - * @return : the compressed size of the frame starting at `src` */ + * See docs in zstd.h + * Note: compatible with legacy mode */ size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize) { - ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize); - return frameSizeInfo.compressedSize; + return ZSTD_findFrameCompressedSize_advanced(src, srcSize, ZSTD_f_zstd1); } /** ZSTD_decompressBound() : @@ -815,7 +820,7 @@ unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize) unsigned long long bound = 0; /* Iterate over each frame */ while (srcSize > 0) { - ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize); + ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, ZSTD_f_zstd1); size_t const compressedSize = frameSizeInfo.compressedSize; unsigned long long const decompressedBound = frameSizeInfo.decompressedBound; if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR) @@ -835,7 +840,7 @@ size_t ZSTD_decompressionMargin(void const* src, size_t srcSize) /* Iterate over each frame */ while (srcSize > 0) { - ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize); + ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, ZSTD_f_zstd1); size_t const compressedSize = frameSizeInfo.compressedSize; unsigned long long const decompressedBound = frameSizeInfo.decompressedBound; ZSTD_frameHeader zfh; @@ -971,6 +976,10 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, ip += frameHeaderSize; remainingSrcSize -= frameHeaderSize; } + /* Shrink the blockSizeMax if enabled */ + if (dctx->maxBlockSizeParam != 0) + dctx->fParams.blockSizeMax = MIN(dctx->fParams.blockSizeMax, (unsigned)dctx->maxBlockSizeParam); + /* Loop on each block */ while (1) { BYTE* oBlockEnd = oend; @@ -1003,7 +1012,8 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, switch(blockProperties.blockType) { case bt_compressed: - decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oBlockEnd-op), ip, cBlockSize, /* 
frame */ 1, not_streaming); + assert(dctx->isFrameDecompression == 1); + decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oBlockEnd-op), ip, cBlockSize, not_streaming); break; case bt_raw : /* Use oend instead of oBlockEnd because this function is safe to overlap. It uses memmove. */ @@ -1016,12 +1026,14 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, default: RETURN_ERROR(corruption_detected, "invalid block type"); } - - if (ZSTD_isError(decodedSize)) return decodedSize; - if (dctx->validateChecksum) + FORWARD_IF_ERROR(decodedSize, "Block decompression failure"); + DEBUGLOG(5, "Decompressed block of dSize = %u", (unsigned)decodedSize); + if (dctx->validateChecksum) { XXH64_update(&dctx->xxhState, op, decodedSize); - if (decodedSize != 0) + } + if (decodedSize) /* support dst = NULL,0 */ { op += decodedSize; + } assert(ip != NULL); ip += cBlockSize; remainingSrcSize -= cBlockSize; @@ -1051,7 +1063,9 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, return (size_t)(op-ostart); } -static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, const void* dict, size_t dictSize, @@ -1071,7 +1085,7 @@ static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, while (srcSize >= ZSTD_startingInputLength(dctx->format)) { #if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1) - if (ZSTD_isLegacy(src, srcSize)) { + if (dctx->format == ZSTD_f_zstd1 && ZSTD_isLegacy(src, srcSize)) { size_t decodedSize; size_t const frameSize = ZSTD_findFrameCompressedSizeLegacy(src, srcSize); if (ZSTD_isError(frameSize)) return frameSize; @@ -1081,6 +1095,15 @@ static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, decodedSize = ZSTD_decompressLegacy(dst, dstCapacity, src, frameSize, dict, dictSize); if (ZSTD_isError(decodedSize)) return decodedSize; + { + unsigned long long const expectedSize = ZSTD_getFrameContentSize(src, srcSize); + RETURN_ERROR_IF(expectedSize == ZSTD_CONTENTSIZE_ERROR, corruption_detected, "Corrupted frame header!"); + if (expectedSize != ZSTD_CONTENTSIZE_UNKNOWN) { + RETURN_ERROR_IF(expectedSize != decodedSize, corruption_detected, + "Frame header size does not match decoded size!"); + } + } + assert(decodedSize <= dstCapacity); dst = (BYTE*)dst + decodedSize; dstCapacity -= decodedSize; @@ -1092,7 +1115,7 @@ static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, } #endif - if (srcSize >= 4) { + if (dctx->format == ZSTD_f_zstd1 && srcSize >= 4) { U32 const magicNumber = MEM_readLE32(src); DEBUGLOG(5, "reading magic number %08X", (unsigned)magicNumber); if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { @@ -1319,7 +1342,8 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c { case bt_compressed: DEBUGLOG(5, "ZSTD_decompressContinue: case bt_compressed"); - rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 1, is_streaming); + assert(dctx->isFrameDecompression == 1); + rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, is_streaming); dctx->expected = 0; /* Streaming not supported */ break; case bt_raw : @@ -1388,6 +1412,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c case ZSTDds_decodeSkippableHeader: assert(src != NULL); assert(srcSize <= ZSTD_SKIPPABLEHEADERSIZE); + assert(dctx->format != ZSTD_f_zstd1_magicless); ZSTD_memcpy(dctx->headerBuffer + 
(ZSTD_SKIPPABLEHEADERSIZE - srcSize), src, srcSize); /* complete skippable header */ dctx->expected = MEM_readLE32(dctx->headerBuffer + ZSTD_FRAMEIDSIZE); /* note : dctx->expected can grow seriously large, beyond local buffer size */ dctx->stage = ZSTDds_skipFrame; @@ -1548,6 +1573,7 @@ size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx) dctx->litEntropy = dctx->fseEntropy = 0; dctx->dictID = 0; dctx->bType = bt_reserved; + dctx->isFrameDecompression = 1; ZSTD_STATIC_ASSERT(sizeof(dctx->entropy.rep) == sizeof(repStartValue)); ZSTD_memcpy(dctx->entropy.rep, repStartValue, sizeof(repStartValue)); /* initial repcodes */ dctx->LLTptr = dctx->entropy.LLTable; @@ -1819,6 +1845,10 @@ ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam) bounds.lowerBound = 0; bounds.upperBound = 1; return bounds; + case ZSTD_d_maxBlockSize: + bounds.lowerBound = ZSTD_BLOCKSIZE_MAX_MIN; + bounds.upperBound = ZSTD_BLOCKSIZE_MAX; + return bounds; default:; } @@ -1863,6 +1893,9 @@ size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int* value case ZSTD_d_disableHuffmanAssembly: *value = (int)dctx->disableHufAsm; return 0; + case ZSTD_d_maxBlockSize: + *value = dctx->maxBlockSizeParam; + return 0; default:; } RETURN_ERROR(parameter_unsupported, ""); @@ -1900,6 +1933,10 @@ size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter dParam, int value CHECK_DBOUNDS(ZSTD_d_disableHuffmanAssembly, value); dctx->disableHufAsm = value != 0; return 0; + case ZSTD_d_maxBlockSize: + if (value != 0) CHECK_DBOUNDS(ZSTD_d_maxBlockSize, value); + dctx->maxBlockSizeParam = value; + return 0; default:; } RETURN_ERROR(parameter_unsupported, ""); @@ -1911,6 +1948,7 @@ size_t ZSTD_DCtx_reset(ZSTD_DCtx* dctx, ZSTD_ResetDirective reset) || (reset == ZSTD_reset_session_and_parameters) ) { dctx->streamStage = zdss_init; dctx->noForwardProgress = 0; + dctx->isFrameDecompression = 1; } if ( (reset == ZSTD_reset_parameters) || (reset == ZSTD_reset_session_and_parameters) ) { @@ -1927,11 +1965,17 @@ size_t ZSTD_sizeof_DStream(const ZSTD_DStream* dctx) return ZSTD_sizeof_DCtx(dctx); } -size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize) +static size_t ZSTD_decodingBufferSize_internal(unsigned long long windowSize, unsigned long long frameContentSize, size_t blockSizeMax) { - size_t const blockSize = (size_t) MIN(windowSize, ZSTD_BLOCKSIZE_MAX); - /* space is needed to store the litbuffer after the output of a given block without stomping the extDict of a previous run, as well as to cover both windows against wildcopy*/ - unsigned long long const neededRBSize = windowSize + blockSize + ZSTD_BLOCKSIZE_MAX + (WILDCOPY_OVERLENGTH * 2); + size_t const blockSize = MIN((size_t)MIN(windowSize, ZSTD_BLOCKSIZE_MAX), blockSizeMax); + /* We need blockSize + WILDCOPY_OVERLENGTH worth of buffer so that if a block + * ends at windowSize + WILDCOPY_OVERLENGTH + 1 bytes, we can start writing + * the block at the beginning of the output buffer, and maintain a full window. + * + * We need another blockSize worth of buffer so that we can store split + * literals at the end of the block without overwriting the extDict window. 
+ */ + unsigned long long const neededRBSize = windowSize + (blockSize * 2) + (WILDCOPY_OVERLENGTH * 2); unsigned long long const neededSize = MIN(frameContentSize, neededRBSize); size_t const minRBSize = (size_t) neededSize; RETURN_ERROR_IF((unsigned long long)minRBSize != neededSize, @@ -1939,6 +1983,11 @@ size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long return minRBSize; } +size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize) +{ + return ZSTD_decodingBufferSize_internal(windowSize, frameContentSize, ZSTD_BLOCKSIZE_MAX); +} + size_t ZSTD_estimateDStreamSize(size_t windowSize) { size_t const blockSize = MIN(windowSize, ZSTD_BLOCKSIZE_MAX); @@ -2134,12 +2183,12 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB if (zds->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN && zds->fParams.frameType != ZSTD_skippableFrame && (U64)(size_t)(oend-op) >= zds->fParams.frameContentSize) { - size_t const cSize = ZSTD_findFrameCompressedSize(istart, (size_t)(iend-istart)); + size_t const cSize = ZSTD_findFrameCompressedSize_advanced(istart, (size_t)(iend-istart), zds->format); if (cSize <= (size_t)(iend-istart)) { /* shortcut : using single-pass mode */ size_t const decompressedSize = ZSTD_decompress_usingDDict(zds, op, (size_t)(oend-op), istart, cSize, ZSTD_getDDict(zds)); if (ZSTD_isError(decompressedSize)) return decompressedSize; - DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()") + DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()"); assert(istart != NULL); ip = istart + cSize; op = op ? op + decompressedSize : op; /* can occur if frameContentSize = 0 (empty frame) */ @@ -2161,7 +2210,8 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB DEBUGLOG(4, "Consume header"); FORWARD_IF_ERROR(ZSTD_decompressBegin_usingDDict(zds, ZSTD_getDDict(zds)), ""); - if ((MEM_readLE32(zds->headerBuffer) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */ + if (zds->format == ZSTD_f_zstd1 + && (MEM_readLE32(zds->headerBuffer) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */ zds->expected = MEM_readLE32(zds->headerBuffer + ZSTD_FRAMEIDSIZE); zds->stage = ZSTDds_skipFrame; } else { @@ -2177,11 +2227,13 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB zds->fParams.windowSize = MAX(zds->fParams.windowSize, 1U << ZSTD_WINDOWLOG_ABSOLUTEMIN); RETURN_ERROR_IF(zds->fParams.windowSize > zds->maxWindowSize, frameParameter_windowTooLarge, ""); + if (zds->maxBlockSizeParam != 0) + zds->fParams.blockSizeMax = MIN(zds->fParams.blockSizeMax, (unsigned)zds->maxBlockSizeParam); /* Adapt buffer sizes to frame header instructions */ { size_t const neededInBuffSize = MAX(zds->fParams.blockSizeMax, 4 /* frame checksum */); size_t const neededOutBuffSize = zds->outBufferMode == ZSTD_bm_buffered - ? ZSTD_decodingBufferSize_min(zds->fParams.windowSize, zds->fParams.frameContentSize) + ? 
ZSTD_decodingBufferSize_internal(zds->fParams.windowSize, zds->fParams.frameContentSize, zds->fParams.blockSizeMax) : 0; ZSTD_DCtx_updateOversizedDuration(zds, neededInBuffSize, neededOutBuffSize); diff --git a/thirdparty/zstd/decompress/zstd_decompress_block.c b/thirdparty/zstd/decompress/zstd_decompress_block.c index 09896a931e..76d7332e88 100644 --- a/thirdparty/zstd/decompress/zstd_decompress_block.c +++ b/thirdparty/zstd/decompress/zstd_decompress_block.c @@ -51,6 +51,13 @@ static void ZSTD_copy4(void* dst, const void* src) { ZSTD_memcpy(dst, src, 4); } * Block decoding ***************************************************************/ +static size_t ZSTD_blockSizeMax(ZSTD_DCtx const* dctx) +{ + size_t const blockSizeMax = dctx->isFrameDecompression ? dctx->fParams.blockSizeMax : ZSTD_BLOCKSIZE_MAX; + assert(blockSizeMax <= ZSTD_BLOCKSIZE_MAX); + return blockSizeMax; +} + /*! ZSTD_getcBlockSize() : * Provides the size of compressed block from block header `src` */ size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, @@ -73,41 +80,49 @@ size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const size_t dstCapacity, const size_t litSize, const streaming_operation streaming, const size_t expectedWriteSize, const unsigned splitImmediately) { - if (streaming == not_streaming && dstCapacity > ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH) - { - /* room for litbuffer to fit without read faulting */ - dctx->litBuffer = (BYTE*)dst + ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH; + size_t const blockSizeMax = ZSTD_blockSizeMax(dctx); + assert(litSize <= blockSizeMax); + assert(dctx->isFrameDecompression || streaming == not_streaming); + assert(expectedWriteSize <= blockSizeMax); + if (streaming == not_streaming && dstCapacity > blockSizeMax + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH) { + /* If we aren't streaming, we can just put the literals after the output + * of the current block. We don't need to worry about overwriting the + * extDict of our window, because it doesn't exist. + * So if we have space after the end of the block, just put it there. + */ + dctx->litBuffer = (BYTE*)dst + blockSizeMax + WILDCOPY_OVERLENGTH; dctx->litBufferEnd = dctx->litBuffer + litSize; dctx->litBufferLocation = ZSTD_in_dst; - } - else if (litSize > ZSTD_LITBUFFEREXTRASIZE) - { - /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */ + } else if (litSize <= ZSTD_LITBUFFEREXTRASIZE) { + /* Literals fit entirely within the extra buffer, put them there to avoid + * having to split the literals. + */ + dctx->litBuffer = dctx->litExtraBuffer; + dctx->litBufferEnd = dctx->litBuffer + litSize; + dctx->litBufferLocation = ZSTD_not_in_dst; + } else { + assert(blockSizeMax > ZSTD_LITBUFFEREXTRASIZE); + /* Literals must be split between the output block and the extra lit + * buffer. We fill the extra lit buffer with the tail of the literals, + * and put the rest of the literals at the end of the block, with + * WILDCOPY_OVERLENGTH of buffer room to allow for overreads. + * This MUST not write more than our maxBlockSize beyond dst, because in + * streaming mode, that could overwrite part of our extDict window. 
+ */ if (splitImmediately) { /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */ dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH; dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE; - } - else { + } else { /* initially this will be stored entirely in dst during huffman decoding, it will partially be shifted to litExtraBuffer after */ dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize; dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize; } dctx->litBufferLocation = ZSTD_split; - } - else - { - /* fits entirely within litExtraBuffer, so no split is necessary */ - dctx->litBuffer = dctx->litExtraBuffer; - dctx->litBufferEnd = dctx->litBuffer + litSize; - dctx->litBufferLocation = ZSTD_not_in_dst; + assert(dctx->litBufferEnd <= (BYTE*)dst + expectedWriteSize); } } -/* Hidden declaration for fullbench */ -size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, - const void* src, size_t srcSize, - void* dst, size_t dstCapacity, const streaming_operation streaming); /*! ZSTD_decodeLiteralsBlock() : * Where it is possible to do so without being stomped by the output during decompression, the literals block will be stored * in the dstBuffer. If there is room to do so, it will be stored in full in the excess dst space after where the current @@ -116,7 +131,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, * * @return : nb of bytes read from src (< srcSize ) * note : symbol not declared but exposed for fullbench */ -size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, +static size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, const void* src, size_t srcSize, /* note : srcSize < BLOCKSIZE */ void* dst, size_t dstCapacity, const streaming_operation streaming) { @@ -125,6 +140,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, { const BYTE* const istart = (const BYTE*) src; symbolEncodingType_e const litEncType = (symbolEncodingType_e)(istart[0] & 3); + size_t const blockSizeMax = ZSTD_blockSizeMax(dctx); switch(litEncType) { @@ -140,7 +156,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, U32 const lhlCode = (istart[0] >> 2) & 3; U32 const lhc = MEM_readLE32(istart); size_t hufSuccess; - size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); + size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity); int const flags = 0 | (ZSTD_DCtx_get_bmi2(dctx) ? HUF_flags_bmi2 : 0) | (dctx->disableHufAsm ? 
HUF_flags_disableAsm : 0); @@ -167,7 +183,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, break; } RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); - RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, ""); + RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, ""); if (!singleStream) RETURN_ERROR_IF(litSize < MIN_LITERALS_FOR_4_STREAMS, literals_headerWrong, "Not enough literals (%zu) for the 4-streams mode (min %u)", @@ -214,10 +230,12 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, } if (dctx->litBufferLocation == ZSTD_split) { + assert(litSize > ZSTD_LITBUFFEREXTRASIZE); ZSTD_memcpy(dctx->litExtraBuffer, dctx->litBufferEnd - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE); ZSTD_memmove(dctx->litBuffer + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH, dctx->litBuffer, litSize - ZSTD_LITBUFFEREXTRASIZE); dctx->litBuffer += ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH; dctx->litBufferEnd -= WILDCOPY_OVERLENGTH; + assert(dctx->litBufferEnd <= (BYTE*)dst + blockSizeMax); } RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected, ""); @@ -232,7 +250,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, case set_basic: { size_t litSize, lhSize; U32 const lhlCode = ((istart[0]) >> 2) & 3; - size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); + size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity); switch(lhlCode) { case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */ @@ -251,6 +269,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, } RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); + RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, ""); RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, ""); ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1); if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) { /* risk reading beyond src buffer with wildcopy */ @@ -279,7 +298,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, case set_rle: { U32 const lhlCode = ((istart[0]) >> 2) & 3; size_t litSize, lhSize; - size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); + size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity); switch(lhlCode) { case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */ @@ -298,7 +317,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, break; } RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); - RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, ""); + RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, ""); RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, ""); ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1); if (dctx->litBufferLocation == ZSTD_split) @@ -320,6 +339,18 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, } } +/* Hidden declaration for fullbench */ +size_t ZSTD_decodeLiteralsBlock_wrapper(ZSTD_DCtx* dctx, + const void* src, size_t srcSize, + void* dst, size_t dstCapacity); +size_t ZSTD_decodeLiteralsBlock_wrapper(ZSTD_DCtx* dctx, + const void* src, size_t srcSize, + void* dst, size_t dstCapacity) +{ + dctx->isFrameDecompression = 0; + return ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, not_streaming); +} + /* Default FSE distribution tables. 
* These are pre-calculated FSE decoding tables using default distributions as defined in specification : * https://github.com/facebook/zstd/blob/release/doc/zstd_compression_format.md#default-distributions @@ -675,11 +706,6 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, /* SeqHead */ nbSeq = *ip++; - if (!nbSeq) { - *nbSeqPtr=0; - RETURN_ERROR_IF(srcSize != 1, srcSize_wrong, ""); - return 1; - } if (nbSeq > 0x7F) { if (nbSeq == 0xFF) { RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong, ""); @@ -692,8 +718,16 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, } *nbSeqPtr = nbSeq; + if (nbSeq == 0) { + /* No sequence : section ends immediately */ + RETURN_ERROR_IF(ip != iend, corruption_detected, + "extraneous data present in the Sequences section"); + return (size_t)(ip - istart); + } + /* FSE table descriptors */ RETURN_ERROR_IF(ip+1 > iend, srcSize_wrong, ""); /* minimum possible size: 1 byte for symbol encoding types */ + RETURN_ERROR_IF(*ip & 3, corruption_detected, ""); /* The last field, Reserved, must be all-zeroes. */ { symbolEncodingType_e const LLtype = (symbolEncodingType_e)(*ip >> 6); symbolEncodingType_e const OFtype = (symbolEncodingType_e)((*ip >> 4) & 3); symbolEncodingType_e const MLtype = (symbolEncodingType_e)((*ip >> 2) & 3); @@ -840,7 +874,7 @@ static void ZSTD_safecopy(BYTE* op, const BYTE* const oend_w, BYTE const* ip, pt /* ZSTD_safecopyDstBeforeSrc(): * This version allows overlap with dst before src, or handles the non-overlap case with dst after src * Kept separate from more common ZSTD_safecopy case to avoid performance impact to the safecopy common case */ -static void ZSTD_safecopyDstBeforeSrc(BYTE* op, BYTE const* ip, ptrdiff_t length) { +static void ZSTD_safecopyDstBeforeSrc(BYTE* op, const BYTE* ip, ptrdiff_t length) { ptrdiff_t const diff = op - ip; BYTE* const oend = op + length; @@ -869,6 +903,7 @@ static void ZSTD_safecopyDstBeforeSrc(BYTE* op, BYTE const* ip, ptrdiff_t length * to be optimized for many small sequences, since those fall into ZSTD_execSequence(). */ FORCE_NOINLINE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR size_t ZSTD_execSequenceEnd(BYTE* op, BYTE* const oend, seq_t sequence, const BYTE** litPtr, const BYTE* const litLimit, @@ -916,6 +951,7 @@ size_t ZSTD_execSequenceEnd(BYTE* op, * This version is intended to be used during instances where the litBuffer is still split. It is kept separate to avoid performance impact for the good case. 
*/ FORCE_NOINLINE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op, BYTE* const oend, const BYTE* const oend_w, seq_t sequence, const BYTE** litPtr, const BYTE* const litLimit, @@ -961,6 +997,7 @@ size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op, } HINT_INLINE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR size_t ZSTD_execSequence(BYTE* op, BYTE* const oend, seq_t sequence, const BYTE** litPtr, const BYTE* const litLimit, @@ -1059,6 +1096,7 @@ size_t ZSTD_execSequence(BYTE* op, } HINT_INLINE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR size_t ZSTD_execSequenceSplitLitBuffer(BYTE* op, BYTE* const oend, const BYTE* const oend_w, seq_t sequence, const BYTE** litPtr, const BYTE* const litLimit, @@ -1181,14 +1219,20 @@ ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16 typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=1 } ZSTD_longOffset_e; +/** + * ZSTD_decodeSequence(): + * @p longOffsets : tells the decoder to reload more bit while decoding large offsets + * only used in 32-bit mode + * @return : Sequence (litL + matchL + offset) + */ FORCE_INLINE_TEMPLATE seq_t -ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) +ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets, const int isLastSeq) { seq_t seq; /* - * ZSTD_seqSymbol is a structure with a total of 64 bits wide. So it can be - * loaded in one operation and extracted its fields by simply shifting or - * bit-extracting on aarch64. + * ZSTD_seqSymbol is a 64 bits wide structure. + * It can be loaded in one operation + * and its fields extracted by simply shifting or bit-extracting on aarch64. * GCC doesn't recognize this and generates more unnecessary ldr/ldrb/ldrh * operations that cause performance drop. This can be avoided by using this * ZSTD_memcpy hack. @@ -1261,7 +1305,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) } else { offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1); { size_t temp = (offset==3) ? 
seqState->prevOffset[0] - 1 : seqState->prevOffset[offset]; - temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */ + temp -= !temp; /* 0 is not valid: input corrupted => force offset to -1 => corruption detected at execSequence */ if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1]; seqState->prevOffset[1] = seqState->prevOffset[0]; seqState->prevOffset[0] = offset = temp; @@ -1288,17 +1332,22 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u", (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); - ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits); /* <= 9 bits */ - ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits); /* <= 9 bits */ - if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ - ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits); /* <= 8 bits */ + if (!isLastSeq) { + /* don't update FSE state for last Sequence */ + ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits); /* <= 9 bits */ + ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits); /* <= 9 bits */ + if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ + ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits); /* <= 8 bits */ + BIT_reloadDStream(&seqState->DStream); + } } return seq; } -#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION -MEM_STATIC int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd) +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) +#if DEBUGLEVEL >= 1 +static int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd) { size_t const windowSize = dctx->fParams.windowSize; /* No dictionary used. */ @@ -1312,30 +1361,33 @@ MEM_STATIC int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefix /* Dictionary is active. */ return 1; } +#endif -MEM_STATIC void ZSTD_assertValidSequence( +static void ZSTD_assertValidSequence( ZSTD_DCtx const* dctx, BYTE const* op, BYTE const* oend, seq_t const seq, BYTE const* prefixStart, BYTE const* virtualStart) { #if DEBUGLEVEL >= 1 - size_t const windowSize = dctx->fParams.windowSize; - size_t const sequenceSize = seq.litLength + seq.matchLength; - BYTE const* const oLitEnd = op + seq.litLength; - DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u", - (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); - assert(op <= oend); - assert((size_t)(oend - op) >= sequenceSize); - assert(sequenceSize <= ZSTD_BLOCKSIZE_MAX); - if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) { - size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing); - /* Offset must be within the dictionary. */ - assert(seq.offset <= (size_t)(oLitEnd - virtualStart)); - assert(seq.offset <= windowSize + dictSize); - } else { - /* Offset must be within our window. 
*/ - assert(seq.offset <= windowSize); + if (dctx->isFrameDecompression) { + size_t const windowSize = dctx->fParams.windowSize; + size_t const sequenceSize = seq.litLength + seq.matchLength; + BYTE const* const oLitEnd = op + seq.litLength; + DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u", + (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); + assert(op <= oend); + assert((size_t)(oend - op) >= sequenceSize); + assert(sequenceSize <= ZSTD_blockSizeMax(dctx)); + if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) { + size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing); + /* Offset must be within the dictionary. */ + assert(seq.offset <= (size_t)(oLitEnd - virtualStart)); + assert(seq.offset <= windowSize + dictSize); + } else { + /* Offset must be within our window. */ + assert(seq.offset <= windowSize); + } } #else (void)dctx, (void)op, (void)oend, (void)seq, (void)prefixStart, (void)virtualStart; @@ -1351,23 +1403,21 @@ DONT_VECTORIZE ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, const void* seqStart, size_t seqSize, int nbSeq, - const ZSTD_longOffset_e isLongOffset, - const int frame) + const ZSTD_longOffset_e isLongOffset) { const BYTE* ip = (const BYTE*)seqStart; const BYTE* const iend = ip + seqSize; BYTE* const ostart = (BYTE*)dst; - BYTE* const oend = ostart + maxDstSize; + BYTE* const oend = ZSTD_maybeNullPtrAdd(ostart, maxDstSize); BYTE* op = ostart; const BYTE* litPtr = dctx->litPtr; const BYTE* litBufferEnd = dctx->litBufferEnd; const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); const BYTE* const vBase = (const BYTE*) (dctx->virtualStart); const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); - DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer"); - (void)frame; + DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer (%i seqs)", nbSeq); - /* Regen sequences */ + /* Literals are split between internal buffer & output buffer */ if (nbSeq) { seqState_t seqState; dctx->fseEntropy = 1; @@ -1386,8 +1436,7 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, BIT_DStream_completed < BIT_DStream_overflow); /* decompress without overrunning litPtr begins */ - { - seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset); + { seq_t sequence = {0,0,0}; /* some static analyzer believe that @sequence is not initialized (it necessarily is, since for(;;) loop as at least one iteration) */ /* Align the decompression loop to 32 + 16 bytes. 
* * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression @@ -1449,27 +1498,26 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, #endif /* Handle the initial state where litBuffer is currently split between dst and litExtraBuffer */ - for (; litPtr + sequence.litLength <= dctx->litBufferEnd; ) { - size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); + for ( ; nbSeq; nbSeq--) { + sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); + if (litPtr + sequence.litLength > dctx->litBufferEnd) break; + { size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) - assert(!ZSTD_isError(oneSeqSize)); - if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); + assert(!ZSTD_isError(oneSeqSize)); + ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); #endif - if (UNLIKELY(ZSTD_isError(oneSeqSize))) - return oneSeqSize; - DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); - op += oneSeqSize; - if (UNLIKELY(!--nbSeq)) - break; - BIT_reloadDStream(&(seqState.DStream)); - sequence = ZSTD_decodeSequence(&seqState, isLongOffset); - } + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); + op += oneSeqSize; + } } + DEBUGLOG(6, "reached: (litPtr + sequence.litLength > dctx->litBufferEnd)"); /* If there are more sequences, they will need to read literals from litExtraBuffer; copy over the remainder from dst and update litPtr and litEnd */ if (nbSeq > 0) { const size_t leftoverLit = dctx->litBufferEnd - litPtr; - if (leftoverLit) - { + DEBUGLOG(6, "There are %i sequences left, and %zu/%zu literals left in buffer", nbSeq, leftoverLit, sequence.litLength); + if (leftoverLit) { RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); sequence.litLength -= leftoverLit; @@ -1478,24 +1526,22 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, litPtr = dctx->litExtraBuffer; litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; dctx->litBufferLocation = ZSTD_not_in_dst; - { - size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); + { size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) assert(!ZSTD_isError(oneSeqSize)); - if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); + ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); #endif if (UNLIKELY(ZSTD_isError(oneSeqSize))) return oneSeqSize; DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); op += oneSeqSize; - if (--nbSeq) - BIT_reloadDStream(&(seqState.DStream)); } + nbSeq--; } } - if (nbSeq > 0) /* there is remaining lit from extra buffer */ - { + if (nbSeq > 0) { + /* there is remaining lit from extra buffer */ #if defined(__GNUC__) && defined(__x86_64__) __asm__(".p2align 6"); @@ -1514,35 +1560,34 @@ 
ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, # endif #endif - for (; ; ) { - seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset); + for ( ; nbSeq ; nbSeq--) { + seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) assert(!ZSTD_isError(oneSeqSize)); - if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); + ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); #endif if (UNLIKELY(ZSTD_isError(oneSeqSize))) return oneSeqSize; DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); op += oneSeqSize; - if (UNLIKELY(!--nbSeq)) - break; - BIT_reloadDStream(&(seqState.DStream)); } } /* check if reached exact end */ DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer: after decode loop, remaining nbSeq : %i", nbSeq); RETURN_ERROR_IF(nbSeq, corruption_detected, ""); - RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, ""); + DEBUGLOG(5, "bitStream : start=%p, ptr=%p, bitsConsumed=%u", seqState.DStream.start, seqState.DStream.ptr, seqState.DStream.bitsConsumed); + RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, ""); /* save reps for next block */ { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); } } /* last literal segment */ - if (dctx->litBufferLocation == ZSTD_split) /* split hasn't been reached yet, first get dst then copy litExtraBuffer */ - { - size_t const lastLLSize = litBufferEnd - litPtr; + if (dctx->litBufferLocation == ZSTD_split) { + /* split hasn't been reached yet, first get dst then copy litExtraBuffer */ + size_t const lastLLSize = (size_t)(litBufferEnd - litPtr); + DEBUGLOG(6, "copy last literals from segment : %u", (U32)lastLLSize); RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, ""); if (op != NULL) { ZSTD_memmove(op, litPtr, lastLLSize); @@ -1552,15 +1597,17 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; dctx->litBufferLocation = ZSTD_not_in_dst; } - { size_t const lastLLSize = litBufferEnd - litPtr; + /* copy last literals from internal buffer */ + { size_t const lastLLSize = (size_t)(litBufferEnd - litPtr); + DEBUGLOG(6, "copy last literals from internal buffer : %u", (U32)lastLLSize); RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); if (op != NULL) { ZSTD_memcpy(op, litPtr, lastLLSize); op += lastLLSize; - } - } + } } - return op-ostart; + DEBUGLOG(6, "decoded block of size %u bytes", (U32)(op - ostart)); + return (size_t)(op - ostart); } FORCE_INLINE_TEMPLATE size_t @@ -1568,13 +1615,12 @@ DONT_VECTORIZE ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, const void* seqStart, size_t seqSize, int nbSeq, - const ZSTD_longOffset_e isLongOffset, - const int frame) + const ZSTD_longOffset_e isLongOffset) { const BYTE* ip = (const BYTE*)seqStart; const BYTE* const iend = ip + seqSize; BYTE* const ostart = (BYTE*)dst; - BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? ostart + maxDstSize : dctx->litBuffer; + BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? 
ZSTD_maybeNullPtrAdd(ostart, maxDstSize) : dctx->litBuffer; BYTE* op = ostart; const BYTE* litPtr = dctx->litPtr; const BYTE* const litEnd = litPtr + dctx->litSize; @@ -1582,7 +1628,6 @@ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, const BYTE* const vBase = (const BYTE*)(dctx->virtualStart); const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd); DEBUGLOG(5, "ZSTD_decompressSequences_body: nbSeq = %d", nbSeq); - (void)frame; /* Regen sequences */ if (nbSeq) { @@ -1597,11 +1642,6 @@ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); assert(dst != NULL); - ZSTD_STATIC_ASSERT( - BIT_DStream_unfinished < BIT_DStream_completed && - BIT_DStream_endOfBuffer < BIT_DStream_completed && - BIT_DStream_completed < BIT_DStream_overflow); - #if defined(__GNUC__) && defined(__x86_64__) __asm__(".p2align 6"); __asm__("nop"); @@ -1616,73 +1656,70 @@ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, # endif #endif - for ( ; ; ) { - seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset); + for ( ; nbSeq ; nbSeq--) { + seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd); #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) assert(!ZSTD_isError(oneSeqSize)); - if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); + ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); #endif if (UNLIKELY(ZSTD_isError(oneSeqSize))) return oneSeqSize; DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); op += oneSeqSize; - if (UNLIKELY(!--nbSeq)) - break; - BIT_reloadDStream(&(seqState.DStream)); } /* check if reached exact end */ - DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, remaining nbSeq : %i", nbSeq); - RETURN_ERROR_IF(nbSeq, corruption_detected, ""); - RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, ""); + assert(nbSeq == 0); + RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, ""); /* save reps for next block */ { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); } } /* last literal segment */ - { size_t const lastLLSize = litEnd - litPtr; + { size_t const lastLLSize = (size_t)(litEnd - litPtr); + DEBUGLOG(6, "copy last literals : %u", (U32)lastLLSize); RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); if (op != NULL) { ZSTD_memcpy(op, litPtr, lastLLSize); op += lastLLSize; - } - } + } } - return op-ostart; + DEBUGLOG(6, "decoded block of size %u bytes", (U32)(op - ostart)); + return (size_t)(op - ostart); } static size_t ZSTD_decompressSequences_default(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, const void* seqStart, size_t seqSize, int nbSeq, - const ZSTD_longOffset_e isLongOffset, - const int frame) + const ZSTD_longOffset_e isLongOffset) { - return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); + return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); } static size_t ZSTD_decompressSequencesSplitLitBuffer_default(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, const void* seqStart, size_t seqSize, int nbSeq, - const ZSTD_longOffset_e isLongOffset, - const int frame) + const ZSTD_longOffset_e isLongOffset) { - return 
ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); + return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); } #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT -FORCE_INLINE_TEMPLATE size_t -ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence, +FORCE_INLINE_TEMPLATE + +size_t ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence, const BYTE* const prefixStart, const BYTE* const dictEnd) { prefetchPos += sequence.litLength; { const BYTE* const matchBase = (sequence.offset > prefetchPos) ? dictEnd : prefixStart; - const BYTE* const match = matchBase + prefetchPos - sequence.offset; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted. - * No consequence though : memory address is only used for prefetching, not for dereferencing */ + /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted. + * No consequence though : memory address is only used for prefetching, not for dereferencing */ + const BYTE* const match = ZSTD_wrappedPtrSub(ZSTD_wrappedPtrAdd(matchBase, prefetchPos), sequence.offset); PREFETCH_L1(match); PREFETCH_L1(match+CACHELINE_SIZE); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */ } return prefetchPos + sequence.matchLength; @@ -1697,20 +1734,18 @@ ZSTD_decompressSequencesLong_body( ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, const void* seqStart, size_t seqSize, int nbSeq, - const ZSTD_longOffset_e isLongOffset, - const int frame) + const ZSTD_longOffset_e isLongOffset) { const BYTE* ip = (const BYTE*)seqStart; const BYTE* const iend = ip + seqSize; BYTE* const ostart = (BYTE*)dst; - BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? dctx->litBuffer : ostart + maxDstSize; + BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? 
dctx->litBuffer : ZSTD_maybeNullPtrAdd(ostart, maxDstSize); BYTE* op = ostart; const BYTE* litPtr = dctx->litPtr; const BYTE* litBufferEnd = dctx->litBufferEnd; const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart); const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); - (void)frame; /* Regen sequences */ if (nbSeq) { @@ -1735,20 +1770,17 @@ ZSTD_decompressSequencesLong_body( ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); /* prepare in advance */ - for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && (seqNb<seqAdvance); seqNb++) { - seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset); + for (seqNb=0; seqNb<seqAdvance; seqNb++) { + seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, seqNb == nbSeq-1); prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); sequences[seqNb] = sequence; } - RETURN_ERROR_IF(seqNb<seqAdvance, corruption_detected, ""); /* decompress without stomping litBuffer */ - for (; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (seqNb < nbSeq); seqNb++) { - seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset); - size_t oneSeqSize; + for (; seqNb < nbSeq; seqNb++) { + seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset, seqNb == nbSeq-1); - if (dctx->litBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd) - { + if (dctx->litBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd) { /* lit buffer is reaching split point, empty out the first buffer and transition to litExtraBuffer */ const size_t leftoverLit = dctx->litBufferEnd - litPtr; if (leftoverLit) @@ -1761,26 +1793,26 @@ ZSTD_decompressSequencesLong_body( litPtr = dctx->litExtraBuffer; litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; dctx->litBufferLocation = ZSTD_not_in_dst; - oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); + { size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) - assert(!ZSTD_isError(oneSeqSize)); - if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); + assert(!ZSTD_isError(oneSeqSize)); + ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); #endif - if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; - prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); - sequences[seqNb & STORED_SEQS_MASK] = sequence; - op += oneSeqSize; - } + prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); + sequences[seqNb & STORED_SEQS_MASK] = sequence; + op += oneSeqSize; + } } else { /* lit buffer is either wholly contained in first or second split, or not split at all*/ - oneSeqSize = dctx->litBufferLocation == ZSTD_split ? + size_t const oneSeqSize = dctx->litBufferLocation == ZSTD_split ? 
ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength - WILDCOPY_OVERLENGTH, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) :
            ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
            assert(!ZSTD_isError(oneSeqSize));
-           if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
+           ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
#endif
            if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
@@ -1789,17 +1821,15 @@ ZSTD_decompressSequencesLong_body(
            op += oneSeqSize;
        }
    }
-   RETURN_ERROR_IF(seqNb<nbSeq, corruption_detected, "");
+   RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, "");
    /* finish queue */
    seqNb -= seqAdvance;
    for ( ; seqNb<nbSeq ; seqNb++) {
        seq_t *sequence = &(sequences[seqNb&STORED_SEQS_MASK]);
-       if (dctx->litBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd)
-       {
+       if (dctx->litBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd) {
            const size_t leftoverLit = dctx->litBufferEnd - litPtr;
-           if (leftoverLit)
-           {
+           if (leftoverLit) {
                RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer");
                ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit);
                sequence->litLength -= leftoverLit;
@@ -1808,11 +1838,10 @@ ZSTD_decompressSequencesLong_body(
            litPtr = dctx->litExtraBuffer;
            litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
            dctx->litBufferLocation = ZSTD_not_in_dst;
-           {
-               size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
+           {   size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
                assert(!ZSTD_isError(oneSeqSize));
-               if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
+               ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
#endif
                if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
                op += oneSeqSize;
@@ -1825,7 +1854,7 @@ ZSTD_decompressSequencesLong_body(
                ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
            assert(!ZSTD_isError(oneSeqSize));
-           if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
+           ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
#endif
            if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
            op += oneSeqSize;
@@ -1837,8 +1866,7 @@ ZSTD_decompressSequencesLong_body(
        }
    /* last literal segment */
-   if (dctx->litBufferLocation == ZSTD_split) /* first deplete literal buffer in dst, then copy litExtraBuffer */
-   {
+   if (dctx->litBufferLocation == ZSTD_split) { /* first deplete literal buffer in dst, then copy litExtraBuffer */
        size_t const lastLLSize = litBufferEnd - litPtr;
        RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, "");
        if (op != NULL) {
@@ -1856,17 +1884,16 @@ ZSTD_decompressSequencesLong_body(
        }
    }
-   return op-ostart;
+   return (size_t)(op - ostart);
}

static size_t
ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx,
    void* dst, size_t maxDstSize,
    const void* seqStart, size_t seqSize, int nbSeq,
-   const ZSTD_longOffset_e isLongOffset,
-   const int frame)
+   const ZSTD_longOffset_e isLongOffset)
{
-   return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+   return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
}
#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */

@@ -1880,20 +1907,18 @@ DONT_VECTORIZE
ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx,
    void* dst, size_t maxDstSize,
    const void* seqStart, size_t seqSize, int nbSeq,
-   const ZSTD_longOffset_e isLongOffset,
-   const int frame)
+   const ZSTD_longOffset_e isLongOffset)
{
-   return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+   return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
}
static BMI2_TARGET_ATTRIBUTE size_t
DONT_VECTORIZE
ZSTD_decompressSequencesSplitLitBuffer_bmi2(ZSTD_DCtx* dctx,
    void* dst, size_t maxDstSize,
    const void* seqStart, size_t seqSize, int nbSeq,
-   const ZSTD_longOffset_e isLongOffset,
-   const int frame)
+   const ZSTD_longOffset_e isLongOffset)
{
-   return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+   return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
}
#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */

@@ -1902,10 +1927,9 @@ static BMI2_TARGET_ATTRIBUTE size_t
ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx,
    void* dst, size_t maxDstSize,
    const void* seqStart, size_t seqSize, int nbSeq,
-   const ZSTD_longOffset_e isLongOffset,
-   const int frame)
+   const ZSTD_longOffset_e isLongOffset)
{
-   return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+   return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
}
#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */

@@ -1915,37 +1939,34 @@ typedef size_t (*ZSTD_decompressSequences_t)(
    ZSTD_DCtx* dctx,
    void* dst, size_t maxDstSize,
    const void* seqStart, size_t seqSize, int nbSeq,
-   const ZSTD_longOffset_e isLongOffset,
-   const int frame);
+   const ZSTD_longOffset_e isLongOffset);

#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
static size_t
ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
    const void* seqStart, size_t seqSize, int nbSeq,
-   const ZSTD_longOffset_e isLongOffset,
-   const int frame)
+   const ZSTD_longOffset_e isLongOffset)
{
    DEBUGLOG(5, "ZSTD_decompressSequences");
#if DYNAMIC_BMI2
    if (ZSTD_DCtx_get_bmi2(dctx)) {
-       return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+       return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
    }
#endif
-   return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+   return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
}
static size_t
ZSTD_decompressSequencesSplitLitBuffer(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
    const void* seqStart, size_t seqSize, int nbSeq,
-   const ZSTD_longOffset_e isLongOffset,
-   const int frame)
+   const ZSTD_longOffset_e isLongOffset)
{
    DEBUGLOG(5, "ZSTD_decompressSequencesSplitLitBuffer");
#if DYNAMIC_BMI2
    if (ZSTD_DCtx_get_bmi2(dctx)) {
-       return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+       return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
    }
#endif
-   return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+   return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
}
#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */

@@ -1960,16 +1981,15 @@ static size_t
ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx,
    void* dst, size_t maxDstSize,
    const void* seqStart, size_t seqSize, int nbSeq,
-   const ZSTD_longOffset_e isLongOffset,
-   const int frame)
+   const ZSTD_longOffset_e isLongOffset)
{
    DEBUGLOG(5, "ZSTD_decompressSequencesLong");
#if DYNAMIC_BMI2
    if (ZSTD_DCtx_get_bmi2(dctx)) {
-       return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+       return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
    }
#endif
-   return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+   return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
}
#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */

@@ -2051,20 +2071,20 @@ static size_t ZSTD_maxShortOffset(void)
size_t
ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
    void* dst, size_t dstCapacity,
-   const void* src, size_t srcSize, const int frame, const streaming_operation streaming)
+   const void* src, size_t srcSize, const streaming_operation streaming)
{   /* blockType == blockCompressed */
    const BYTE* ip = (const BYTE*)src;
-   DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize);
+   DEBUGLOG(5, "ZSTD_decompressBlock_internal (cSize : %u)", (unsigned)srcSize);
    /* Note : the wording of the specification
-    * allows compressed block to be sized exactly ZSTD_BLOCKSIZE_MAX.
+    * allows compressed block to be sized exactly ZSTD_blockSizeMax(dctx).
     * This generally does not happen, as it makes little sense,
     * since an uncompressed block would feature same size and have no decompression cost.
     * Also, note that decoder from reference libzstd before < v1.5.4
     * would consider this edge case as an error.
-    * As a consequence, avoid generating compressed blocks of size ZSTD_BLOCKSIZE_MAX
+    * As a consequence, avoid generating compressed blocks of size ZSTD_blockSizeMax(dctx)
     * for broader compatibility with the deployed ecosystem of zstd decoders */
-   RETURN_ERROR_IF(srcSize > ZSTD_BLOCKSIZE_MAX, srcSize_wrong, "");
+   RETURN_ERROR_IF(srcSize > ZSTD_blockSizeMax(dctx), srcSize_wrong, "");
    /* Decode literals section */
    { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming);
@@ -2079,8 +2099,8 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
        /* Compute the maximum block size, which must also work when !frame and fParams are unset.
         * Additionally, take the min with dstCapacity to ensure that the totalHistorySize fits in a size_t. */
-       size_t const blockSizeMax = MIN(dstCapacity, (frame ? dctx->fParams.blockSizeMax : ZSTD_BLOCKSIZE_MAX));
-       size_t const totalHistorySize = ZSTD_totalHistorySize((BYTE*)dst + blockSizeMax, (BYTE const*)dctx->virtualStart);
+       size_t const blockSizeMax = MIN(dstCapacity, ZSTD_blockSizeMax(dctx));
+       size_t const totalHistorySize = ZSTD_totalHistorySize(ZSTD_maybeNullPtrAdd((BYTE*)dst, blockSizeMax), (BYTE const*)dctx->virtualStart);
        /* isLongOffset must be true if there are long offsets.
         * Offsets are long if they are larger than ZSTD_maxShortOffset().
         * We don't expect that to be the case in 64-bit mode.
@@ -2145,21 +2165,22 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
        {
#endif
#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
-           return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
+           return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
#endif
        }
#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
        /* else */
        if (dctx->litBufferLocation == ZSTD_split)
-           return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
+           return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
        else
-           return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
+           return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
#endif
    }
}

+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize)
{
    if (dst != dctx->previousDstEnd && dstSize > 0) {   /* not contiguous */
@@ -2176,8 +2197,10 @@ size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx,
    const void* src, size_t srcSize)
{
    size_t dSize;
+   dctx->isFrameDecompression = 0;
    ZSTD_checkContinuity(dctx, dst, dstCapacity);
-   dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0, not_streaming);
+   dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, not_streaming);
+   FORWARD_IF_ERROR(dSize, "");
    dctx->previousDstEnd = (char*)dst + dSize;
    return dSize;
}
diff --git a/thirdparty/zstd/decompress/zstd_decompress_block.h b/thirdparty/zstd/decompress/zstd_decompress_block.h
index 9d1318882d..ab152404ba 100644
--- a/thirdparty/zstd/decompress/zstd_decompress_block.h
+++ b/thirdparty/zstd/decompress/zstd_decompress_block.h
@@ -47,7 +47,7 @@ typedef enum {
 */
size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
    void* dst, size_t dstCapacity,
-   const void* src, size_t srcSize, const int frame, const streaming_operation streaming);
+   const void* src, size_t srcSize, const streaming_operation streaming);

/* ZSTD_buildFSETable() :
 * generate FSE decoding table for one symbol (ll, ml or off)
diff --git a/thirdparty/zstd/decompress/zstd_decompress_internal.h b/thirdparty/zstd/decompress/zstd_decompress_internal.h
index c2ec5d9fbe..83a7a0115f 100644
--- a/thirdparty/zstd/decompress/zstd_decompress_internal.h
+++ b/thirdparty/zstd/decompress/zstd_decompress_internal.h
@@ -153,6 +153,7 @@ struct ZSTD_DCtx_s
    size_t litSize;
    size_t rleSize;
    size_t staticSize;
+   int isFrameDecompression;
#if DYNAMIC_BMI2 != 0
    int bmi2;   /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */
#endif
@@ -166,6 +167,7 @@ struct ZSTD_DCtx_s
    ZSTD_DDictHashSet* ddictSet;    /* Hash set for multiple ddicts */
    ZSTD_refMultipleDDicts_e refMultipleDDicts; /* User specified: if == 1, will allow references to multiple DDicts. Default == 0 (disabled) */
    int disableHufAsm;
+   int maxBlockSizeParam;

    /* streaming */
    ZSTD_dStreamStage streamStage;
diff --git a/thirdparty/zstd/zstd.h b/thirdparty/zstd/zstd.h
index e5c3f8b68b..5d1fef8a6b 100644
--- a/thirdparty/zstd/zstd.h
+++ b/thirdparty/zstd/zstd.h
@@ -106,7 +106,7 @@ extern "C" {
/*------ Version ------*/
#define ZSTD_VERSION_MAJOR    1
#define ZSTD_VERSION_MINOR    5
-#define ZSTD_VERSION_RELEASE  5
+#define ZSTD_VERSION_RELEASE  6
#define ZSTD_VERSION_NUMBER (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE)

/*! ZSTD_versionNumber() :
@@ -228,7 +228,7 @@ ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize)
 * for example to size a static array on stack.
 * Will produce constant value 0 if srcSize too large.
 */
-#define ZSTD_MAX_INPUT_SIZE ((sizeof(size_t)==8) ? 0xFF00FF00FF00FF00LLU : 0xFF00FF00U)
+#define ZSTD_MAX_INPUT_SIZE ((sizeof(size_t)==8) ? 0xFF00FF00FF00FF00ULL : 0xFF00FF00U)
#define ZSTD_COMPRESSBOUND(srcSize) (((size_t)(srcSize) >= ZSTD_MAX_INPUT_SIZE) ? 0 : (srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0))  /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */
ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */

/* ZSTD_isError() :
@@ -249,7 +249,7 @@ ZSTDLIB_API int ZSTD_defaultCLevel(void);           /*!< default compres
/*= Compression context
 *  When compressing many times,
 *  it is recommended to allocate a context just once,
- *  and re-use it for each successive compression operation.
+ *  and reuse it for each successive compression operation.
 *  This will make workload friendlier for system's memory.
 *  Note : re-using context is just a speed / resource optimization.
 *         It doesn't change the compression ratio, which remains identical.
@@ -262,9 +262,9 @@ ZSTDLIB_API size_t     ZSTD_freeCCtx(ZSTD_CCtx* cctx);  /* accept NULL pointer *
/*! ZSTD_compressCCtx() :
 *  Same as ZSTD_compress(), using an explicit ZSTD_CCtx.
- *  Important : in order to behave similarly to `ZSTD_compress()`,
- *  this function compresses at requested compression level,
- *  __ignoring any other parameter__ .
+ *  Important : in order to mirror `ZSTD_compress()` behavior,
+ *  this function compresses at the requested compression level,
+ *  __ignoring any other advanced parameter__ .
 *  If any advanced parameter was set using the advanced API,
 *  they will all be reset. Only `compressionLevel` remains.
 */
@@ -276,7 +276,7 @@ ZSTDLIB_API size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx,
/*= Decompression context
 *  When decompressing many times,
 *  it is recommended to allocate a context only once,
- *  and re-use it for each successive compression operation.
+ *  and reuse it for each successive compression operation.
 *  This will make workload friendlier for system's memory.
 *  Use one context per thread for parallel execution. */
typedef struct ZSTD_DCtx_s ZSTD_DCtx;
@@ -286,7 +286,7 @@ ZSTDLIB_API size_t     ZSTD_freeDCtx(ZSTD_DCtx* dctx);  /* accept NULL pointer *
/*! ZSTD_decompressDCtx() :
 *  Same as ZSTD_decompress(),
 *  requires an allocated ZSTD_DCtx.
- *  Compatible with sticky parameters.
+ *  Compatible with sticky parameters (see below).
 */
ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx,
    void* dst, size_t dstCapacity,
@@ -302,12 +302,12 @@ ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx,
 *   using ZSTD_CCtx_set*() functions.
 *   Pushed parameters are sticky : they are valid for next compressed frame, and any subsequent frame.
 *   "sticky" parameters are applicable to `ZSTD_compress2()` and `ZSTD_compressStream*()` !
- *   __They do not apply to "simple" one-shot variants such as ZSTD_compressCCtx()__ .
+ *   __They do not apply to one-shot variants such as ZSTD_compressCCtx()__ .
 *
 *   It's possible to reset all parameters to "default" using ZSTD_CCtx_reset().
 *
 *   This API supersedes all other "advanced" API entry points in the experimental section.
- *   In the future, we expect to remove from experimental API entry points which are redundant with this API.
+ *   In the future, we expect to remove API entry points from experimental which are redundant with this API.
 */

@@ -390,6 +390,19 @@ typedef enum {
                              * The higher the value of selected strategy, the more complex it is,
                              * resulting in stronger and slower compression.
                              * Special: value 0 means "use default strategy". */
+
+    ZSTD_c_targetCBlockSize=130, /* v1.5.6+
+                              * Attempts to fit compressed block size into approximatively targetCBlockSize.
+                              * Bound by ZSTD_TARGETCBLOCKSIZE_MIN and ZSTD_TARGETCBLOCKSIZE_MAX.
+                              * Note that it's not a guarantee, just a convergence target (default:0).
+                              * No target when targetCBlockSize == 0.
+                              * This is helpful in low bandwidth streaming environments to improve end-to-end latency,
+                              * when a client can make use of partial documents (a prominent example being Chrome).
+                              * Note: this parameter is stable since v1.5.6.
+                              * It was present as an experimental parameter in earlier versions,
+                              * but it's not recommended using it with earlier library versions
+                              * due to massive performance regressions.
+                              */
    /* LDM mode parameters */
    ZSTD_c_enableLongDistanceMatching=160, /* Enable long distance matching.
                                     * This parameter is designed to improve compression ratio
@@ -469,7 +482,6 @@ typedef enum {
     * ZSTD_c_forceMaxWindow
     * ZSTD_c_forceAttachDict
     * ZSTD_c_literalCompressionMode
-    * ZSTD_c_targetCBlockSize
     * ZSTD_c_srcSizeHint
     * ZSTD_c_enableDedicatedDictSearch
     * ZSTD_c_stableInBuffer
@@ -490,7 +502,7 @@ typedef enum {
    ZSTD_c_experimentalParam3=1000,
    ZSTD_c_experimentalParam4=1001,
    ZSTD_c_experimentalParam5=1002,
-   ZSTD_c_experimentalParam6=1003,
+   /* was ZSTD_c_experimentalParam6=1003; is now ZSTD_c_targetCBlockSize */
    ZSTD_c_experimentalParam7=1004,
    ZSTD_c_experimentalParam8=1005,
    ZSTD_c_experimentalParam9=1006,
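The hunks above promote ZSTD_c_targetCBlockSize from the experimental slot into the stable parameter set. A minimal usage sketch, not part of the upstream diff; request_small_blocks is a hypothetical helper and the 1340-byte target is only illustrative (it mirrors the new ZSTD_TARGETCBLOCKSIZE_MIN shown further below):

    #include <zstd.h>

    /* Illustrative helper: ask the encoder to aim for small compressed blocks,
     * which the parameter documentation above recommends for low-latency streaming.
     * A value of 0 (the default) disables the target; any other value is a
     * convergence target, not a guarantee. */
    size_t request_small_blocks(ZSTD_CCtx* cctx)
    {
        return ZSTD_CCtx_setParameter(cctx, ZSTD_c_targetCBlockSize, 1340);
    }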
@@ -575,6 +587,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset);
/*! ZSTD_compress2() :
 *  Behave the same as ZSTD_compressCCtx(), but compression parameters are set using the advanced API.
+ *  (note that this entry point doesn't even expose a compression level parameter).
 *  ZSTD_compress2() always starts a new frame.
 *  Should cctx hold data from a previously unfinished frame, everything about it is forgotten.
 *  - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*()
@@ -618,6 +631,7 @@ typedef enum {
     * ZSTD_d_forceIgnoreChecksum
     * ZSTD_d_refMultipleDDicts
     * ZSTD_d_disableHuffmanAssembly
+    * ZSTD_d_maxBlockSize
     * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them.
     * note : never ever use experimentalParam? names directly */
@@ -625,7 +639,8 @@
    ZSTD_d_experimentalParam2=1001,
    ZSTD_d_experimentalParam3=1002,
    ZSTD_d_experimentalParam4=1003,
-   ZSTD_d_experimentalParam5=1004
+   ZSTD_d_experimentalParam5=1004,
+   ZSTD_d_experimentalParam6=1005

} ZSTD_dParameter;

@@ -680,14 +695,14 @@ typedef struct ZSTD_outBuffer_s {
*  A ZSTD_CStream object is required to track streaming operation.
*  Use ZSTD_createCStream() and ZSTD_freeCStream() to create/release resources.
*  ZSTD_CStream objects can be reused multiple times on consecutive compression operations.
-*  It is recommended to re-use ZSTD_CStream since it will play nicer with system's memory, by re-using already allocated memory.
+*  It is recommended to reuse ZSTD_CStream since it will play nicer with system's memory, by re-using already allocated memory.
*
*  For parallel execution, use one separate ZSTD_CStream per thread.
*
*  note : since v1.3.0, ZSTD_CStream and ZSTD_CCtx are the same thing.
*
*  Parameters are sticky : when starting a new compression on the same context,
-*  it will re-use the same sticky parameters as previous compression session.
+*  it will reuse the same sticky parameters as previous compression session.
*  When in doubt, it's recommended to fully initialize the context before usage.
*  Use ZSTD_CCtx_reset() to reset the context and ZSTD_CCtx_setParameter(),
*  ZSTD_CCtx_setPledgedSrcSize(), or ZSTD_CCtx_loadDictionary() and friends to
@@ -776,6 +791,11 @@ typedef enum {
 *  only ZSTD_e_end or ZSTD_e_flush operations are allowed.
 *  Before starting a new compression job, or changing compression parameters,
 *  it is required to fully flush internal buffers.
+ *  - note: if an operation ends with an error, it may leave @cctx in an undefined state.
+ *          Therefore, it's UB to invoke ZSTD_compressStream2() of ZSTD_compressStream() on such a state.
+ *          In order to be re-employed after an error, a state must be reset,
+ *          which can be done explicitly (ZSTD_CCtx_reset()),
+ *          or is sometimes implied by methods starting a new compression job (ZSTD_initCStream(), ZSTD_compressCCtx())
 */
ZSTDLIB_API size_t ZSTD_compressStream2( ZSTD_CCtx* cctx,
    ZSTD_outBuffer* output,
@@ -835,7 +855,7 @@ ZSTDLIB_API size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output);
*
*  A ZSTD_DStream object is required to track streaming operations.
*  Use ZSTD_createDStream() and ZSTD_freeDStream() to create/release resources.
-*  ZSTD_DStream objects can be re-used multiple times.
+*  ZSTD_DStream objects can be reused multiple times.
*
*  Use ZSTD_initDStream() to start a new decompression operation.
* @return : recommended first input size
@@ -889,6 +909,12 @@ ZSTDLIB_API size_t ZSTD_initDStream(ZSTD_DStream* zds);
 * @return : 0 when a frame is completely decoded and fully flushed,
 *           or an error code, which can be tested using ZSTD_isError(),
 *           or any other value > 0, which means there is some decoding or flushing to do to complete current frame.
+ *
+ * Note: when an operation returns with an error code, the @zds state may be left in undefined state.
+ *       It's UB to invoke `ZSTD_decompressStream()` on such a state.
+ *       In order to re-use such a state, it must be first reset,
+ *       which can be done explicitly (`ZSTD_DCtx_reset()`),
+ *       or is implied for operations starting some new decompression job (`ZSTD_initDStream`, `ZSTD_decompressDCtx()`, `ZSTD_decompress_usingDict()`)
 */
ZSTDLIB_API size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input);
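The notes added above make the error behavior explicit: after a failed streaming call, the context is in an unspecified state and must be reset before reuse. A minimal sketch of that rule, not part of the diff; decompress_chunk is a hypothetical wrapper:

    #include <zstd.h>

    /* Hypothetical wrapper: run one streaming step and reset the context on error,
     * as the new documentation requires before the DStream may be used again. */
    size_t decompress_chunk(ZSTD_DStream* zds, ZSTD_outBuffer* out, ZSTD_inBuffer* in)
    {
        size_t const ret = ZSTD_decompressStream(zds, out, in);
        if (ZSTD_isError(ret)) {
            /* State is undefined after an error: reset before the next frame. */
            ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
        }
        return ret;
    }

The same pattern applies on the compression side with ZSTD_compressStream2() and ZSTD_CCtx_reset().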
@@ -1021,7 +1047,7 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize);
 *
 *  This API allows dictionaries to be used with ZSTD_compress2(),
 *  ZSTD_compressStream2(), and ZSTD_decompressDCtx().
- *  Dictionaries are sticky, they remain valid when same context is re-used,
+ *  Dictionaries are sticky, they remain valid when same context is reused,
 *  they only reset when the context is reset
 *  with ZSTD_reset_parameters or ZSTD_reset_session_and_parameters.
 *  In contrast, Prefixes are single-use.
@@ -1239,7 +1265,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict);
#define ZSTD_LDM_HASHRATELOG_MAX (ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN)

/* Advanced parameter bounds */
-#define ZSTD_TARGETCBLOCKSIZE_MIN   64
+#define ZSTD_TARGETCBLOCKSIZE_MIN   1340 /* suitable to fit into an ethernet / wifi / 4G transport frame */
#define ZSTD_TARGETCBLOCKSIZE_MAX   ZSTD_BLOCKSIZE_MAX
#define ZSTD_SRCSIZEHINT_MIN        0
#define ZSTD_SRCSIZEHINT_MAX        INT_MAX
@@ -1527,25 +1553,38 @@ typedef enum {
ZSTDLIB_STATIC_API size_t ZSTD_sequenceBound(size_t srcSize);

/*! ZSTD_generateSequences() :
+ * WARNING: This function is meant for debugging and informational purposes ONLY!
+ * Its implementation is flawed, and it will be deleted in a future version.
+ * It is not guaranteed to succeed, as there are several cases where it will give
+ * up and fail. You should NOT use this function in production code.
+ *
+ * This function is deprecated, and will be removed in a future version.
+ *
 * Generate sequences using ZSTD_compress2(), given a source buffer.
 *
+ * @param zc The compression context to be used for ZSTD_compress2(). Set any
+ *           compression parameters you need on this context.
+ * @param outSeqs The output sequences buffer of size @p outSeqsSize
+ * @param outSeqsSize The size of the output sequences buffer.
+ *                    ZSTD_sequenceBound(srcSize) is an upper bound on the number
+ *                    of sequences that can be generated.
+ * @param src The source buffer to generate sequences from of size @p srcSize.
+ * @param srcSize The size of the source buffer.
+ *
 * Each block will end with a dummy sequence
 * with offset == 0, matchLength == 0, and litLength == length of last literals.
 * litLength may be == 0, and if so, then the sequence of (of: 0 ml: 0 ll: 0)
 * simply acts as a block delimiter.
 *
- * @zc can be used to insert custom compression params.
- * This function invokes ZSTD_compress2().
- *
- * The output of this function can be fed into ZSTD_compressSequences() with CCtx
- * setting of ZSTD_c_blockDelimiters as ZSTD_sf_explicitBlockDelimiters
- * @return : number of sequences generated
+ * @returns The number of sequences generated, necessarily less than
+ *          ZSTD_sequenceBound(srcSize), or an error code that can be checked
+ *          with ZSTD_isError().
 */
-
+ZSTD_DEPRECATED("For debugging only, will be replaced by ZSTD_extractSequences()")
ZSTDLIB_STATIC_API size_t
-ZSTD_generateSequences( ZSTD_CCtx* zc,
-   ZSTD_Sequence* outSeqs, size_t outSeqsSize,
-   const void* src, size_t srcSize);
+ZSTD_generateSequences(ZSTD_CCtx* zc,
+   ZSTD_Sequence* outSeqs, size_t outSeqsSize,
+   const void* src, size_t srcSize);

/*! ZSTD_mergeBlockDelimiters() :
 * Given an array of ZSTD_Sequence, remove all sequences that represent block delimiters/last literals
@@ -1640,56 +1679,59 @@ ZSTDLIB_API unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size);
/*! ZSTD_estimate*() :
 *  These functions make it possible to estimate memory usage
 *  of a future {D,C}Ctx, before its creation.
+ *  This is useful in combination with ZSTD_initStatic(),
+ *  which makes it possible to employ a static buffer for ZSTD_CCtx* state.
 *
 *  ZSTD_estimateCCtxSize() will provide a memory budget large enough
- *  for any compression level up to selected one.
- *  Note : Unlike ZSTD_estimateCStreamSize*(), this estimate
- *         does not include space for a window buffer.
- *         Therefore, the estimation is only guaranteed for single-shot compressions, not streaming.
+ *  to compress data of any size using one-shot compression ZSTD_compressCCtx() or ZSTD_compress2()
+ *  associated with any compression level up to max specified one.
 *  The estimate will assume the input may be arbitrarily large,
 *  which is the worst case.
 *
+ *  Note that the size estimation is specific for one-shot compression,
+ *  it is not valid for streaming (see ZSTD_estimateCStreamSize*())
+ *  nor other potential ways of using a ZSTD_CCtx* state.
+ *
 *  When srcSize can be bound by a known and rather "small" value,
- *  this fact can be used to provide a tighter estimation
- *  because the CCtx compression context will need less memory.
- *  This tighter estimation can be provided by more advanced functions
+ *  this knowledge can be used to provide a tighter budget estimation
+ *  because the ZSTD_CCtx* state will need less memory for small inputs.
+ *  This tighter estimation can be provided by employing more advanced functions
 *  ZSTD_estimateCCtxSize_usingCParams(), which can be used in tandem with ZSTD_getCParams(),
 *  and ZSTD_estimateCCtxSize_usingCCtxParams(), which can be used in tandem with ZSTD_CCtxParams_setParameter().
 *  Both can be used to estimate memory using custom compression parameters and arbitrary srcSize limits.
 *
 *  Note : only single-threaded compression is supported.
 *  ZSTD_estimateCCtxSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1.
- *
- *  Note 2 : ZSTD_estimateCCtxSize* functions are not compatible with the Block-Level Sequence Producer API at this time.
- *  Size estimates assume that no external sequence producer is registered.
 */
-ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int compressionLevel);
+ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int maxCompressionLevel);
ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams);
ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params);
ZSTDLIB_STATIC_API size_t ZSTD_estimateDCtxSize(void);
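The reworded documentation above ties ZSTD_estimateCCtxSize() to one-shot compression and to the ZSTD_initStatic*() entry points. A minimal sketch of that pairing, not part of the diff; make_static_cctx is a hypothetical helper and assumes ZSTD_STATIC_LINKING_ONLY is defined before including the header:

    #define ZSTD_STATIC_LINKING_ONLY
    #include <zstd.h>
    #include <stdlib.h>

    /* Hypothetical helper: budget a static, one-shot compression context for any
     * level up to maxCompressionLevel. The workspace stays owned by the caller. */
    ZSTD_CCtx* make_static_cctx(int maxCompressionLevel)
    {
        size_t const budget = ZSTD_estimateCCtxSize(maxCompressionLevel); /* one-shot only, not streaming */
        void* const workspace = malloc(budget);
        if (workspace == NULL) return NULL;
        return ZSTD_initStaticCCtx(workspace, budget); /* NULL if the workspace is too small */
    }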
/*! ZSTD_estimateCStreamSize() :
- *  ZSTD_estimateCStreamSize() will provide a budget large enough for any compression level up to selected one.
- *  It will also consider src size to be arbitrarily "large", which is worst case.
+ *  ZSTD_estimateCStreamSize() will provide a memory budget large enough for streaming compression
+ *  using any compression level up to the max specified one.
+ *  It will also consider src size to be arbitrarily "large", which is a worst case scenario.
 *  If srcSize is known to always be small, ZSTD_estimateCStreamSize_usingCParams() can provide a tighter estimation.
 *  ZSTD_estimateCStreamSize_usingCParams() can be used in tandem with ZSTD_getCParams() to create cParams from compressionLevel.
 *  ZSTD_estimateCStreamSize_usingCCtxParams() can be used in tandem with ZSTD_CCtxParams_setParameter(). Only single-threaded compression is supported. This function will return an error code if ZSTD_c_nbWorkers is >= 1.
 *  Note : CStream size estimation is only correct for single-threaded compression.
- *  ZSTD_DStream memory budget depends on window Size.
+ *  ZSTD_estimateCStreamSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1.
+ *  Note 2 : ZSTD_estimateCStreamSize* functions are not compatible with the Block-Level Sequence Producer API at this time.
+ *  Size estimates assume that no external sequence producer is registered.
+ *
+ *  ZSTD_DStream memory budget depends on frame's window Size.
 *  This information can be passed manually, using ZSTD_estimateDStreamSize,
 *  or deducted from a valid frame Header, using ZSTD_estimateDStreamSize_fromFrame();
+ *  Any frame requesting a window size larger than max specified one will be rejected.
 *  Note : if streaming is init with function ZSTD_init?Stream_usingDict(),
 *         an internal ?Dict will be created, which additional size is not estimated here.
 *         In this case, get total size by adding ZSTD_estimate?DictSize
- *  Note 2 : only single-threaded compression is supported.
- *  ZSTD_estimateCStreamSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1.
- *  Note 3 : ZSTD_estimateCStreamSize* functions are not compatible with the Block-Level Sequence Producer API at this time.
- *  Size estimates assume that no external sequence producer is registered.
 */
-ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int compressionLevel);
+ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int maxCompressionLevel);
ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams);
ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params);
-ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize(size_t windowSize);
+ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize(size_t maxWindowSize);
ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize);

/*! ZSTD_estimate?DictSize() :
@@ -1946,11 +1988,6 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo
 */
#define ZSTD_c_literalCompressionMode ZSTD_c_experimentalParam5

-/* Tries to fit compressed block size to be around targetCBlockSize.
- * No target when targetCBlockSize == 0.
- * There is no guarantee on compressed block size (default:0) */
-#define ZSTD_c_targetCBlockSize ZSTD_c_experimentalParam6
-
/* User's best guess of source size.
 * Hint is not valid when srcSizeHint == 0.
 * There is no guarantee that hint is close to actual source size,
@@ -2430,6 +2467,22 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete
 */
#define ZSTD_d_disableHuffmanAssembly ZSTD_d_experimentalParam5

+/* ZSTD_d_maxBlockSize
+ * Allowed values are between 1KB and ZSTD_BLOCKSIZE_MAX (128KB).
+ * The default is ZSTD_BLOCKSIZE_MAX, and setting to 0 will set to the default.
+ *
+ * Forces the decompressor to reject blocks whose content size is
+ * larger than the configured maxBlockSize. When maxBlockSize is
+ * larger than the windowSize, the windowSize is used instead.
+ * This saves memory on the decoder when you know all blocks are small.
+ *
+ * This option is typically used in conjunction with ZSTD_c_maxBlockSize.
+ *
+ * WARNING: This causes the decoder to reject otherwise valid frames
+ * that have block sizes larger than the configured maxBlockSize.
+ */
+#define ZSTD_d_maxBlockSize ZSTD_d_experimentalParam6
+
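The new experimental ZSTD_d_maxBlockSize parameter documented above caps decoder memory by rejecting frames with oversized blocks. A minimal sketch, not part of the diff; cap_decoder_block_size is a hypothetical helper, the 16 KB limit is only illustrative, and ZSTD_STATIC_LINKING_ONLY is required:

    #define ZSTD_STATIC_LINKING_ONLY
    #include <zstd.h>

    /* Illustrative helper: make this dctx reject frames containing blocks
     * larger than 16 KB, trading strictness for a smaller memory footprint. */
    size_t cap_decoder_block_size(ZSTD_DCtx* dctx)
    {
        return ZSTD_DCtx_setParameter(dctx, ZSTD_d_maxBlockSize, 16 * 1024);
    }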
/*! ZSTD_DCtx_setFormat() :
 *  This function is REDUNDANT. Prefer ZSTD_DCtx_setParameter().
@@ -2557,7 +2610,7 @@ size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs,
 *  explicitly specified.
 *
 *  start a new frame, using same parameters from previous frame.
- *  This is typically useful to skip dictionary loading stage, since it will re-use it in-place.
+ *  This is typically useful to skip dictionary loading stage, since it will reuse it in-place.
 *  Note that zcs must be init at least once before using ZSTD_resetCStream().
 *  If pledgedSrcSize is not known at reset time, use macro ZSTD_CONTENTSIZE_UNKNOWN.
 *  If pledgedSrcSize > 0, its value must be correct, as it will be written in header, and controlled at the end.
@@ -2633,7 +2686,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const Z
 *
 *     ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
 *
- *     re-use decompression parameters from previous init; saves dictionary loading
+ *     reuse decompression parameters from previous init; saves dictionary loading
 */
ZSTD_DEPRECATED("use ZSTD_DCtx_reset, see zstd.h for detailed instructions")
ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds);
@@ -2765,7 +2818,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds);
#define ZSTD_SEQUENCE_PRODUCER_ERROR ((size_t)(-1))

-typedef size_t ZSTD_sequenceProducer_F (
+typedef size_t (*ZSTD_sequenceProducer_F) (
    void* sequenceProducerState,
    ZSTD_Sequence* outSeqs, size_t outSeqsCapacity,
    const void* src, size_t srcSize,
@@ -2797,7 +2850,23 @@ ZSTDLIB_STATIC_API void
ZSTD_registerSequenceProducer(
    ZSTD_CCtx* cctx,
    void* sequenceProducerState,
-   ZSTD_sequenceProducer_F* sequenceProducer
+   ZSTD_sequenceProducer_F sequenceProducer
);
+
+/*! ZSTD_CCtxParams_registerSequenceProducer() :
+ *  Same as ZSTD_registerSequenceProducer(), but operates on ZSTD_CCtx_params.
+ *  This is used for accurate size estimation with ZSTD_estimateCCtxSize_usingCCtxParams(),
+ *  which is needed when creating a ZSTD_CCtx with ZSTD_initStaticCCtx().
+ *
+ *  If you are using the external sequence producer API in a scenario where ZSTD_initStaticCCtx()
+ *  is required, then this function is for you. Otherwise, you probably don't need it.
+ *
+ *  See tests/zstreamtest.c for example usage. */
+ZSTDLIB_STATIC_API void
+ZSTD_CCtxParams_registerSequenceProducer(
+   ZSTD_CCtx_params* params,
+   void* sequenceProducerState,
+   ZSTD_sequenceProducer_F sequenceProducer
);
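The hunk above turns ZSTD_sequenceProducer_F into a function pointer type and adds ZSTD_CCtxParams_registerSequenceProducer() so an external producer can be accounted for during size estimation. A minimal sketch, not part of the diff; my_producer and my_state are hypothetical application symbols assumed to match the ZSTD_sequenceProducer_F typedef, and ZSTD_STATIC_LINKING_ONLY is required:

    #define ZSTD_STATIC_LINKING_ONLY
    #include <zstd.h>

    extern ZSTD_sequenceProducer_F my_producer; /* a function pointer since v1.5.6 */
    extern void* my_state;

    /* Hypothetical helper: estimate the CCtx budget with the external producer
     * registered on the parameter set, as the new documentation suggests for
     * ZSTD_initStaticCCtx() workflows. Returns 0 on allocation failure. */
    size_t estimate_with_producer(void)
    {
        ZSTD_CCtx_params* const params = ZSTD_createCCtxParams();
        size_t budget;
        if (params == NULL) return 0;
        ZSTD_CCtxParams_registerSequenceProducer(params, my_state, my_producer);
        budget = ZSTD_estimateCCtxSize_usingCCtxParams(params);
        ZSTD_freeCCtxParams(params);
        return budget;
    }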
@@ -2820,7 +2889,7 @@ ZSTD_registerSequenceProducer(
  A ZSTD_CCtx object is required to track streaming operations.
  Use ZSTD_createCCtx() / ZSTD_freeCCtx() to manage resource.
- ZSTD_CCtx object can be re-used multiple times within successive compression operations.
+ ZSTD_CCtx object can be reused multiple times within successive compression operations.
  Start by initializing a context.
  Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression.
@@ -2841,7 +2910,7 @@ ZSTD_registerSequenceProducer(
  It's possible to use srcSize==0, in which case, it will write a final empty block to end the frame.
  Without last block mark, frames are considered unfinished (hence corrupted) by compliant decoders.
- `ZSTD_CCtx` object can be re-used (ZSTD_compressBegin()) to compress again.
+ `ZSTD_CCtx` object can be reused (ZSTD_compressBegin()) to compress again.
*/

/*===== Buffer-less streaming compression functions =====*/
@@ -2873,7 +2942,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_
  A ZSTD_DCtx object is required to track streaming operations.
  Use ZSTD_createDCtx() / ZSTD_freeDCtx() to manage it.
- A ZSTD_DCtx object can be re-used multiple times.
+ A ZSTD_DCtx object can be reused multiple times.
  First typical operation is to retrieve frame parameters, using ZSTD_getFrameHeader().
  Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough.