diff options
Diffstat (limited to 'thirdparty/zstd/compress/zstd_lazy.c')
-rw-r--r-- | thirdparty/zstd/compress/zstd_lazy.c | 743 |
1 files changed, 398 insertions, 345 deletions
diff --git a/thirdparty/zstd/compress/zstd_lazy.c b/thirdparty/zstd/compress/zstd_lazy.c index 2e38dcb46d..5ba88e8678 100644 --- a/thirdparty/zstd/compress/zstd_lazy.c +++ b/thirdparty/zstd/compress/zstd_lazy.c @@ -1,5 +1,5 @@ /* - * Copyright (c) Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -10,6 +10,9 @@ #include "zstd_compress_internal.h" #include "zstd_lazy.h" +#include "../common/bits.h" /* ZSTD_countTrailingZeros64 */ + +#define kLazySkippingStep 8 /*-************************************* @@ -197,8 +200,8 @@ ZSTD_DUBT_findBetterDictMatch ( U32 matchIndex = dictMatchIndex + dictIndexDelta; if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) { DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)", - curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, STORE_OFFSET(curr - matchIndex), dictMatchIndex, matchIndex); - bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex); + curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, OFFSET_TO_OFFBASE(curr - matchIndex), dictMatchIndex, matchIndex); + bestLength = matchLength, *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); } if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */ break; /* drop, to guarantee consistency (miss a little bit of compression) */ @@ -218,7 +221,7 @@ ZSTD_DUBT_findBetterDictMatch ( } if (bestLength >= MINMATCH) { - U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex; + U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offsetPtr); (void)mIndex; DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)", curr, (U32)bestLength, (U32)*offsetPtr, mIndex); } @@ -230,7 +233,7 @@ ZSTD_DUBT_findBetterDictMatch ( static size_t ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, const BYTE* const ip, const BYTE* const iend, - size_t* offsetPtr, + size_t* offBasePtr, U32 const mls, const ZSTD_dictMode_e dictMode) { @@ -327,8 +330,8 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, if (matchLength > bestLength) { if (matchLength > matchEndIdx - matchIndex) matchEndIdx = matchIndex + (U32)matchLength; - if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) - bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex); + if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr - matchIndex + 1) - ZSTD_highbit32((U32)*offBasePtr)) ) + bestLength = matchLength, *offBasePtr = OFFSET_TO_OFFBASE(curr - matchIndex); if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */ if (dictMode == ZSTD_dictMatchState) { nbCompares = 0; /* in addition to avoiding checking any @@ -361,16 +364,16 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, if (dictMode == ZSTD_dictMatchState && nbCompares) { bestLength = ZSTD_DUBT_findBetterDictMatch( ms, ip, iend, - offsetPtr, bestLength, nbCompares, + offBasePtr, bestLength, nbCompares, mls, dictMode); } assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */ ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */ if (bestLength >= MINMATCH) { - U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex; + U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offBasePtr); (void)mIndex; DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)", - curr, (U32)bestLength, (U32)*offsetPtr, mIndex); + curr, (U32)bestLength, (U32)*offBasePtr, mIndex); } return bestLength; } @@ -381,14 +384,14 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, FORCE_INLINE_TEMPLATE size_t ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms, const BYTE* const ip, const BYTE* const iLimit, - size_t* offsetPtr, + size_t* offBasePtr, const U32 mls /* template */, const ZSTD_dictMode_e dictMode) { DEBUGLOG(7, "ZSTD_BtFindBestMatch"); if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */ ZSTD_updateDUBT(ms, ip, iLimit, mls); - return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMode); + return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offBasePtr, mls, dictMode); } /*********************************** @@ -561,7 +564,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb /* save best solution */ if (currentMl > ml) { ml = currentMl; - *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta)); + *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta)); if (ip+currentMl == iLimit) { /* best possible, avoids read overflow on next attempt */ return ml; @@ -598,7 +601,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb /* save best solution */ if (currentMl > ml) { ml = currentMl; - *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta)); + *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta)); if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ } } @@ -617,7 +620,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( ZSTD_matchState_t* ms, const ZSTD_compressionParameters* const cParams, - const BYTE* ip, U32 const mls) + const BYTE* ip, U32 const mls, U32 const lazySkipping) { U32* const hashTable = ms->hashTable; const U32 hashLog = cParams->hashLog; @@ -632,6 +635,9 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( NEXT_IN_CHAIN(idx, chainMask) = hashTable[h]; hashTable[h] = idx; idx++; + /* Stop inserting every position when in the lazy skipping mode. */ + if (lazySkipping) + break; } ms->nextToUpdate = target; @@ -640,7 +646,7 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) { const ZSTD_compressionParameters* const cParams = &ms->cParams; - return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch); + return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch, /* lazySkipping*/ 0); } /* inlining is important to hardwire a hot branch (template emulation) */ @@ -684,14 +690,15 @@ size_t ZSTD_HcFindBestMatch( } /* HC4 match finder */ - matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls); + matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls, ms->lazySkipping); for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) { size_t currentMl=0; if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { const BYTE* const match = base + matchIndex; assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ - if (match[ml] == ip[ml]) /* potentially better */ + /* read 4B starting from (match + ml + 1 - sizeof(U32)) */ + if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */ currentMl = ZSTD_count(ip, match, iLimit); } else { const BYTE* const match = dictBase + matchIndex; @@ -703,7 +710,7 @@ size_t ZSTD_HcFindBestMatch( /* save best solution */ if (currentMl > ml) { ml = currentMl; - *offsetPtr = STORE_OFFSET(curr - matchIndex); + *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ } @@ -739,7 +746,7 @@ size_t ZSTD_HcFindBestMatch( if (currentMl > ml) { ml = currentMl; assert(curr > matchIndex + dmsIndexDelta); - *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta)); + *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta)); if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ } @@ -756,8 +763,6 @@ size_t ZSTD_HcFindBestMatch( * (SIMD) Row-based matchfinder ***********************************/ /* Constants for row-based hash */ -#define ZSTD_ROW_HASH_TAG_OFFSET 16 /* byte offset of hashes in the match state's tagTable from the beginning of a row */ -#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */ #define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1) #define ZSTD_ROW_HASH_MAX_ENTRIES 64 /* absolute maximum number of entries per row, for all configurations */ @@ -769,73 +774,19 @@ typedef U64 ZSTD_VecMask; /* Clarifies when we are interacting with a U64 repr * Starting from the LSB, returns the idx of the next non-zero bit. * Basically counting the nb of trailing zeroes. */ -static U32 ZSTD_VecMask_next(ZSTD_VecMask val) { - assert(val != 0); -# if defined(_MSC_VER) && defined(_WIN64) - if (val != 0) { - unsigned long r; - _BitScanForward64(&r, val); - return (U32)(r); - } else { - /* Should not reach this code path */ - __assume(0); - } -# elif (defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4)))) - if (sizeof(size_t) == 4) { - U32 mostSignificantWord = (U32)(val >> 32); - U32 leastSignificantWord = (U32)val; - if (leastSignificantWord == 0) { - return 32 + (U32)__builtin_ctz(mostSignificantWord); - } else { - return (U32)__builtin_ctz(leastSignificantWord); - } - } else { - return (U32)__builtin_ctzll(val); - } -# else - /* Software ctz version: http://aggregate.org/MAGIC/#Trailing%20Zero%20Count - * and: https://stackoverflow.com/questions/2709430/count-number-of-bits-in-a-64-bit-long-big-integer - */ - val = ~val & (val - 1ULL); /* Lowest set bit mask */ - val = val - ((val >> 1) & 0x5555555555555555); - val = (val & 0x3333333333333333ULL) + ((val >> 2) & 0x3333333333333333ULL); - return (U32)((((val + (val >> 4)) & 0xF0F0F0F0F0F0F0FULL) * 0x101010101010101ULL) >> 56); -# endif -} - -/* ZSTD_rotateRight_*(): - * Rotates a bitfield to the right by "count" bits. - * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts - */ -FORCE_INLINE_TEMPLATE -U64 ZSTD_rotateRight_U64(U64 const value, U32 count) { - assert(count < 64); - count &= 0x3F; /* for fickle pattern recognition */ - return (value >> count) | (U64)(value << ((0U - count) & 0x3F)); -} - -FORCE_INLINE_TEMPLATE -U32 ZSTD_rotateRight_U32(U32 const value, U32 count) { - assert(count < 32); - count &= 0x1F; /* for fickle pattern recognition */ - return (value >> count) | (U32)(value << ((0U - count) & 0x1F)); -} - -FORCE_INLINE_TEMPLATE -U16 ZSTD_rotateRight_U16(U16 const value, U32 count) { - assert(count < 16); - count &= 0x0F; /* for fickle pattern recognition */ - return (value >> count) | (U16)(value << ((0U - count) & 0x0F)); +MEM_STATIC U32 ZSTD_VecMask_next(ZSTD_VecMask val) { + return ZSTD_countTrailingZeros64(val); } /* ZSTD_row_nextIndex(): * Returns the next index to insert at within a tagTable row, and updates the "head" - * value to reflect the update. Essentially cycles backwards from [0, {entries per row}) + * value to reflect the update. Essentially cycles backwards from [1, {entries per row}) */ FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) { - U32 const next = (*tagRow - 1) & rowMask; - *tagRow = (BYTE)next; - return next; + U32 next = (*tagRow-1) & rowMask; + next += (next == 0) ? rowMask : 0; /* skip first position */ + *tagRow = (BYTE)next; + return next; } /* ZSTD_isAligned(): @@ -849,7 +800,7 @@ MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) { /* ZSTD_row_prefetch(): * Performs prefetching for the hashTable and tagTable at a given row. */ -FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* tagTable, U32 const relRow, U32 const rowLog) { +FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, BYTE const* tagTable, U32 const relRow, U32 const rowLog) { PREFETCH_L1(hashTable + relRow); if (rowLog >= 5) { PREFETCH_L1(hashTable + relRow + 16); @@ -873,13 +824,13 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const B U32 idx, const BYTE* const iLimit) { U32 const* const hashTable = ms->hashTable; - U16 const* const tagTable = ms->tagTable; + BYTE const* const tagTable = ms->tagTable; U32 const hashLog = ms->rowHashLog; U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 0 : (U32)(iLimit - (base + idx) + 1); U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch); for (; idx < lim; ++idx) { - U32 const hash = (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); + U32 const hash = (U32)ZSTD_hashPtrSalted(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt); U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash; @@ -895,11 +846,12 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const B * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable. */ FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable, - U16 const* tagTable, BYTE const* base, + BYTE const* tagTable, BYTE const* base, U32 idx, U32 const hashLog, - U32 const rowLog, U32 const mls) + U32 const rowLog, U32 const mls, + U64 const hashSalt) { - U32 const newHash = (U32)ZSTD_hashPtr(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); + U32 const newHash = (U32)ZSTD_hashPtrSalted(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt); U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); { U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK]; @@ -917,22 +869,21 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms, U32 const rowMask, U32 const useCache) { U32* const hashTable = ms->hashTable; - U16* const tagTable = ms->tagTable; + BYTE* const tagTable = ms->tagTable; U32 const hashLog = ms->rowHashLog; const BYTE* const base = ms->window.base; DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx); for (; updateStartIdx < updateEndIdx; ++updateStartIdx) { - U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls) - : (U32)ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); + U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls, ms->hashSalt) + : (U32)ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt); U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; U32* const row = hashTable + relRow; - BYTE* tagRow = (BYTE*)(tagTable + relRow); /* Though tagTable is laid out as a table of U16, each tag is only 1 byte. - Explicit cast allows us to get exact desired position within each row */ + BYTE* tagRow = tagTable + relRow; U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); - assert(hash == ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls)); - ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK; + assert(hash == ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt)); + tagRow[pos] = hash & ZSTD_ROW_HASH_TAG_MASK; row[pos] = updateStartIdx; } } @@ -980,7 +931,35 @@ void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) { const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */); DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog); - ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use cache */); + ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* don't use cache */); +} + +/* Returns the mask width of bits group of which will be set to 1. Given not all + * architectures have easy movemask instruction, this helps to iterate over + * groups of bits easier and faster. + */ +FORCE_INLINE_TEMPLATE U32 +ZSTD_row_matchMaskGroupWidth(const U32 rowEntries) +{ + assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); + assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES); + (void)rowEntries; +#if defined(ZSTD_ARCH_ARM_NEON) + /* NEON path only works for little endian */ + if (!MEM_isLittleEndian()) { + return 1; + } + if (rowEntries == 16) { + return 4; + } + if (rowEntries == 32) { + return 2; + } + if (rowEntries == 64) { + return 1; + } +#endif + return 1; } #if defined(ZSTD_ARCH_X86_SSE2) @@ -1003,71 +982,82 @@ ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U } #endif -/* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches - * the hash at the nth position in a row of the tagTable. - * Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield - * to match up with the actual layout of the entries within the hashTable */ +#if defined(ZSTD_ARCH_ARM_NEON) +FORCE_INLINE_TEMPLATE ZSTD_VecMask +ZSTD_row_getNEONMask(const U32 rowEntries, const BYTE* const src, const BYTE tag, const U32 headGrouped) +{ + assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); + if (rowEntries == 16) { + /* vshrn_n_u16 shifts by 4 every u16 and narrows to 8 lower bits. + * After that groups of 4 bits represent the equalMask. We lower + * all bits except the highest in these groups by doing AND with + * 0x88 = 0b10001000. + */ + const uint8x16_t chunk = vld1q_u8(src); + const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag))); + const uint8x8_t res = vshrn_n_u16(equalMask, 4); + const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0); + return ZSTD_rotateRight_U64(matches, headGrouped) & 0x8888888888888888ull; + } else if (rowEntries == 32) { + /* Same idea as with rowEntries == 16 but doing AND with + * 0x55 = 0b01010101. + */ + const uint16x8x2_t chunk = vld2q_u16((const uint16_t*)(const void*)src); + const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]); + const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]); + const uint8x16_t dup = vdupq_n_u8(tag); + const uint8x8_t t0 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk0, dup)), 6); + const uint8x8_t t1 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk1, dup)), 6); + const uint8x8_t res = vsli_n_u8(t0, t1, 4); + const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0) ; + return ZSTD_rotateRight_U64(matches, headGrouped) & 0x5555555555555555ull; + } else { /* rowEntries == 64 */ + const uint8x16x4_t chunk = vld4q_u8(src); + const uint8x16_t dup = vdupq_n_u8(tag); + const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup); + const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup); + const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup); + const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup); + + const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1); + const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1); + const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2); + const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4); + const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4); + const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0); + return ZSTD_rotateRight_U64(matches, headGrouped); + } +} +#endif + +/* Returns a ZSTD_VecMask (U64) that has the nth group (determined by + * ZSTD_row_matchMaskGroupWidth) of bits set to 1 if the newly-computed "tag" + * matches the hash at the nth position in a row of the tagTable. + * Each row is a circular buffer beginning at the value of "headGrouped". So we + * must rotate the "matches" bitfield to match up with the actual layout of the + * entries within the hashTable */ FORCE_INLINE_TEMPLATE ZSTD_VecMask -ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries) +ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 headGrouped, const U32 rowEntries) { - const BYTE* const src = tagRow + ZSTD_ROW_HASH_TAG_OFFSET; + const BYTE* const src = tagRow; assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES); + assert(ZSTD_row_matchMaskGroupWidth(rowEntries) * rowEntries <= sizeof(ZSTD_VecMask) * 8); #if defined(ZSTD_ARCH_X86_SSE2) - return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, head); + return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, headGrouped); #else /* SW or NEON-LE */ # if defined(ZSTD_ARCH_ARM_NEON) /* This NEON path only works for little endian - otherwise use SWAR below */ if (MEM_isLittleEndian()) { - if (rowEntries == 16) { - const uint8x16_t chunk = vld1q_u8(src); - const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag))); - const uint16x8_t t0 = vshlq_n_u16(equalMask, 7); - const uint32x4_t t1 = vreinterpretq_u32_u16(vsriq_n_u16(t0, t0, 14)); - const uint64x2_t t2 = vreinterpretq_u64_u32(vshrq_n_u32(t1, 14)); - const uint8x16_t t3 = vreinterpretq_u8_u64(vsraq_n_u64(t2, t2, 28)); - const U16 hi = (U16)vgetq_lane_u8(t3, 8); - const U16 lo = (U16)vgetq_lane_u8(t3, 0); - return ZSTD_rotateRight_U16((hi << 8) | lo, head); - } else if (rowEntries == 32) { - const uint16x8x2_t chunk = vld2q_u16((const U16*)(const void*)src); - const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]); - const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]); - const uint8x16_t equalMask0 = vceqq_u8(chunk0, vdupq_n_u8(tag)); - const uint8x16_t equalMask1 = vceqq_u8(chunk1, vdupq_n_u8(tag)); - const int8x8_t pack0 = vqmovn_s16(vreinterpretq_s16_u8(equalMask0)); - const int8x8_t pack1 = vqmovn_s16(vreinterpretq_s16_u8(equalMask1)); - const uint8x8_t t0 = vreinterpret_u8_s8(pack0); - const uint8x8_t t1 = vreinterpret_u8_s8(pack1); - const uint8x8_t t2 = vsri_n_u8(t1, t0, 2); - const uint8x8x2_t t3 = vuzp_u8(t2, t0); - const uint8x8_t t4 = vsri_n_u8(t3.val[1], t3.val[0], 4); - const U32 matches = vget_lane_u32(vreinterpret_u32_u8(t4), 0); - return ZSTD_rotateRight_U32(matches, head); - } else { /* rowEntries == 64 */ - const uint8x16x4_t chunk = vld4q_u8(src); - const uint8x16_t dup = vdupq_n_u8(tag); - const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup); - const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup); - const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup); - const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup); - - const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1); - const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1); - const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2); - const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4); - const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4); - const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0); - return ZSTD_rotateRight_U64(matches, head); - } + return ZSTD_row_getNEONMask(rowEntries, src, tag, headGrouped); } # endif /* ZSTD_ARCH_ARM_NEON */ /* SWAR */ - { const size_t chunkSize = sizeof(size_t); + { const int chunkSize = sizeof(size_t); const size_t shiftAmount = ((chunkSize * 8) - chunkSize); const size_t xFF = ~((size_t)0); const size_t x01 = xFF / 0xFF; @@ -1100,11 +1090,11 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, } matches = ~matches; if (rowEntries == 16) { - return ZSTD_rotateRight_U16((U16)matches, head); + return ZSTD_rotateRight_U16((U16)matches, headGrouped); } else if (rowEntries == 32) { - return ZSTD_rotateRight_U32((U32)matches, head); + return ZSTD_rotateRight_U32((U32)matches, headGrouped); } else { - return ZSTD_rotateRight_U64((U64)matches, head); + return ZSTD_rotateRight_U64((U64)matches, headGrouped); } } #endif @@ -1134,7 +1124,7 @@ size_t ZSTD_RowFindBestMatch( const U32 rowLog) { U32* const hashTable = ms->hashTable; - U16* const tagTable = ms->tagTable; + BYTE* const tagTable = ms->tagTable; U32* const hashCache = ms->hashCache; const U32 hashLog = ms->rowHashLog; const ZSTD_compressionParameters* const cParams = &ms->cParams; @@ -1152,8 +1142,11 @@ size_t ZSTD_RowFindBestMatch( const U32 rowEntries = (1U << rowLog); const U32 rowMask = rowEntries - 1; const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */ + const U32 groupWidth = ZSTD_row_matchMaskGroupWidth(rowEntries); + const U64 hashSalt = ms->hashSalt; U32 nbAttempts = 1U << cappedSearchLog; size_t ml=4-1; + U32 hash; /* DMS/DDS variables that may be referenced laster */ const ZSTD_matchState_t* const dms = ms->dictMatchState; @@ -1177,7 +1170,7 @@ size_t ZSTD_RowFindBestMatch( if (dictMode == ZSTD_dictMatchState) { /* Prefetch DMS rows */ U32* const dmsHashTable = dms->hashTable; - U16* const dmsTagTable = dms->tagTable; + BYTE* const dmsTagTable = dms->tagTable; U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls); U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK; @@ -1187,23 +1180,34 @@ size_t ZSTD_RowFindBestMatch( } /* Update the hashTable and tagTable up to (but not including) ip */ - ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */); + if (!ms->lazySkipping) { + ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */); + hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls, hashSalt); + } else { + /* Stop inserting every position when in the lazy skipping mode. + * The hash cache is also not kept up to date in this mode. + */ + hash = (U32)ZSTD_hashPtrSalted(ip, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt); + ms->nextToUpdate = curr; + } + ms->hashSaltEntropy += hash; /* collect salt entropy */ + { /* Get the hash for ip, compute the appropriate row */ - U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls); U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK; U32* const row = hashTable + relRow; BYTE* tagRow = (BYTE*)(tagTable + relRow); - U32 const head = *tagRow & rowMask; + U32 const headGrouped = (*tagRow & rowMask) * groupWidth; U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; size_t numMatches = 0; size_t currMatch = 0; - ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, head, rowEntries); + ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, headGrouped, rowEntries); /* Cycle through the matches and prefetch */ - for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { - U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask; + for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) { + U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask; U32 const matchIndex = row[matchPos]; + if(matchPos == 0) continue; assert(numMatches < rowEntries); if (matchIndex < lowLimit) break; @@ -1213,13 +1217,14 @@ size_t ZSTD_RowFindBestMatch( PREFETCH_L1(dictBase + matchIndex); } matchBuffer[numMatches++] = matchIndex; + --nbAttempts; } /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop in ZSTD_row_update_internal() at the next search. */ { U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); - tagRow[pos + ZSTD_ROW_HASH_TAG_OFFSET] = (BYTE)tag; + tagRow[pos] = (BYTE)tag; row[pos] = ms->nextToUpdate++; } @@ -1233,7 +1238,8 @@ size_t ZSTD_RowFindBestMatch( if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { const BYTE* const match = base + matchIndex; assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ - if (match[ml] == ip[ml]) /* potentially better */ + /* read 4B starting from (match + ml + 1 - sizeof(U32)) */ + if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */ currentMl = ZSTD_count(ip, match, iLimit); } else { const BYTE* const match = dictBase + matchIndex; @@ -1245,7 +1251,7 @@ size_t ZSTD_RowFindBestMatch( /* Save best solution */ if (currentMl > ml) { ml = currentMl; - *offsetPtr = STORE_OFFSET(curr - matchIndex); + *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ } } @@ -1263,19 +1269,21 @@ size_t ZSTD_RowFindBestMatch( const U32 dmsSize = (U32)(dmsEnd - dmsBase); const U32 dmsIndexDelta = dictLimit - dmsSize; - { U32 const head = *dmsTagRow & rowMask; + { U32 const headGrouped = (*dmsTagRow & rowMask) * groupWidth; U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; size_t numMatches = 0; size_t currMatch = 0; - ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries); + ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, headGrouped, rowEntries); - for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { - U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask; + for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) { + U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask; U32 const matchIndex = dmsRow[matchPos]; + if(matchPos == 0) continue; if (matchIndex < dmsLowestIndex) break; PREFETCH_L1(dmsBase + matchIndex); matchBuffer[numMatches++] = matchIndex; + --nbAttempts; } /* Return the longest match */ @@ -1294,7 +1302,7 @@ size_t ZSTD_RowFindBestMatch( if (currentMl > ml) { ml = currentMl; assert(curr > matchIndex + dmsIndexDelta); - *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta)); + *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta)); if (ip+currentMl == iLimit) break; } } @@ -1304,14 +1312,10 @@ size_t ZSTD_RowFindBestMatch( } -typedef size_t (*searchMax_f)( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr); - /** - * This struct contains the functions necessary for lazy to search. - * Currently, that is only searchMax. However, it is still valuable to have the - * VTable because this makes it easier to add more functions to the VTable later. + * Generate search functions templated on (dictMode, mls, rowLog). + * These functions are outlined for code size & compilation time. + * ZSTD_searchMax() dispatches to the correct implementation function. * * TODO: The start of the search function involves loading and calculating a * bunch of constants from the ZSTD_matchState_t. These computations could be @@ -1329,25 +1333,25 @@ typedef size_t (*searchMax_f)( * the single segment loop. It should go in searchMax instead of its own * function to avoid having multiple virtual function calls per search. */ -typedef struct { - searchMax_f searchMax; -} ZSTD_LazyVTable; -#define GEN_ZSTD_BT_VTABLE(dictMode, mls) \ - static size_t ZSTD_BtFindBestMatch_##dictMode##_##mls( \ - ZSTD_matchState_t* ms, \ - const BYTE* ip, const BYTE* const iLimit, \ - size_t* offsetPtr) \ - { \ - assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \ - return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \ - } \ - static const ZSTD_LazyVTable ZSTD_BtVTable_##dictMode##_##mls = { \ - ZSTD_BtFindBestMatch_##dictMode##_##mls \ - }; +#define ZSTD_BT_SEARCH_FN(dictMode, mls) ZSTD_BtFindBestMatch_##dictMode##_##mls +#define ZSTD_HC_SEARCH_FN(dictMode, mls) ZSTD_HcFindBestMatch_##dictMode##_##mls +#define ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog -#define GEN_ZSTD_HC_VTABLE(dictMode, mls) \ - static size_t ZSTD_HcFindBestMatch_##dictMode##_##mls( \ +#define ZSTD_SEARCH_FN_ATTRS FORCE_NOINLINE + +#define GEN_ZSTD_BT_SEARCH_FN(dictMode, mls) \ + ZSTD_SEARCH_FN_ATTRS size_t ZSTD_BT_SEARCH_FN(dictMode, mls)( \ + ZSTD_matchState_t* ms, \ + const BYTE* ip, const BYTE* const iLimit, \ + size_t* offBasePtr) \ + { \ + assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \ + return ZSTD_BtFindBestMatch(ms, ip, iLimit, offBasePtr, mls, ZSTD_##dictMode); \ + } \ + +#define GEN_ZSTD_HC_SEARCH_FN(dictMode, mls) \ + ZSTD_SEARCH_FN_ATTRS size_t ZSTD_HC_SEARCH_FN(dictMode, mls)( \ ZSTD_matchState_t* ms, \ const BYTE* ip, const BYTE* const iLimit, \ size_t* offsetPtr) \ @@ -1355,12 +1359,9 @@ typedef struct { assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \ return ZSTD_HcFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \ } \ - static const ZSTD_LazyVTable ZSTD_HcVTable_##dictMode##_##mls = { \ - ZSTD_HcFindBestMatch_##dictMode##_##mls \ - }; -#define GEN_ZSTD_ROW_VTABLE(dictMode, mls, rowLog) \ - static size_t ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog( \ +#define GEN_ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) \ + ZSTD_SEARCH_FN_ATTRS size_t ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)( \ ZSTD_matchState_t* ms, \ const BYTE* ip, const BYTE* const iLimit, \ size_t* offsetPtr) \ @@ -1369,9 +1370,6 @@ typedef struct { assert(MAX(4, MIN(6, ms->cParams.searchLog)) == rowLog); \ return ZSTD_RowFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode, rowLog); \ } \ - static const ZSTD_LazyVTable ZSTD_RowVTable_##dictMode##_##mls##_##rowLog = { \ - ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog \ - }; #define ZSTD_FOR_EACH_ROWLOG(X, dictMode, mls) \ X(dictMode, mls, 4) \ @@ -1394,84 +1392,103 @@ typedef struct { X(__VA_ARGS__, dictMatchState) \ X(__VA_ARGS__, dedicatedDictSearch) -/* Generate Row VTables for each combination of (dictMode, mls, rowLog) */ -ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS_ROWLOG, GEN_ZSTD_ROW_VTABLE) -/* Generate Binary Tree VTables for each combination of (dictMode, mls) */ -ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_BT_VTABLE) -/* Generate Hash Chain VTables for each combination of (dictMode, mls) */ -ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_HC_VTABLE) - -#define GEN_ZSTD_BT_VTABLE_ARRAY(dictMode) \ - { \ - &ZSTD_BtVTable_##dictMode##_4, \ - &ZSTD_BtVTable_##dictMode##_5, \ - &ZSTD_BtVTable_##dictMode##_6 \ - } +/* Generate row search fns for each combination of (dictMode, mls, rowLog) */ +ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS_ROWLOG, GEN_ZSTD_ROW_SEARCH_FN) +/* Generate binary Tree search fns for each combination of (dictMode, mls) */ +ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_BT_SEARCH_FN) +/* Generate hash chain search fns for each combination of (dictMode, mls) */ +ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_HC_SEARCH_FN) -#define GEN_ZSTD_HC_VTABLE_ARRAY(dictMode) \ - { \ - &ZSTD_HcVTable_##dictMode##_4, \ - &ZSTD_HcVTable_##dictMode##_5, \ - &ZSTD_HcVTable_##dictMode##_6 \ - } - -#define GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, mls) \ - { \ - &ZSTD_RowVTable_##dictMode##_##mls##_4, \ - &ZSTD_RowVTable_##dictMode##_##mls##_5, \ - &ZSTD_RowVTable_##dictMode##_##mls##_6 \ - } +typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e; -#define GEN_ZSTD_ROW_VTABLE_ARRAY(dictMode) \ - { \ - GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 4), \ - GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 5), \ - GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 6) \ +#define GEN_ZSTD_CALL_BT_SEARCH_FN(dictMode, mls) \ + case mls: \ + return ZSTD_BT_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr); +#define GEN_ZSTD_CALL_HC_SEARCH_FN(dictMode, mls) \ + case mls: \ + return ZSTD_HC_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr); +#define GEN_ZSTD_CALL_ROW_SEARCH_FN(dictMode, mls, rowLog) \ + case rowLog: \ + return ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)(ms, ip, iend, offsetPtr); + +#define ZSTD_SWITCH_MLS(X, dictMode) \ + switch (mls) { \ + ZSTD_FOR_EACH_MLS(X, dictMode) \ } -#define GEN_ZSTD_VTABLE_ARRAY(X) \ - { \ - X(noDict), \ - X(extDict), \ - X(dictMatchState), \ - X(dedicatedDictSearch) \ - } - -/* ******************************* -* Common parser - lazy strategy -*********************************/ -typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e; +#define ZSTD_SWITCH_ROWLOG(dictMode, mls) \ + case mls: \ + switch (rowLog) { \ + ZSTD_FOR_EACH_ROWLOG(GEN_ZSTD_CALL_ROW_SEARCH_FN, dictMode, mls) \ + } \ + ZSTD_UNREACHABLE; \ + break; + +#define ZSTD_SWITCH_SEARCH_METHOD(dictMode) \ + switch (searchMethod) { \ + case search_hashChain: \ + ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_HC_SEARCH_FN, dictMode) \ + break; \ + case search_binaryTree: \ + ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_BT_SEARCH_FN, dictMode) \ + break; \ + case search_rowHash: \ + ZSTD_SWITCH_MLS(ZSTD_SWITCH_ROWLOG, dictMode) \ + break; \ + } \ + ZSTD_UNREACHABLE; /** - * This table is indexed first by the four ZSTD_dictMode_e values, and then - * by the two searchMethod_e values. NULLs are placed for configurations - * that should never occur (extDict modes go to the other implementation - * below and there is no DDSS for binary tree search yet). + * Searches for the longest match at @p ip. + * Dispatches to the correct implementation function based on the + * (searchMethod, dictMode, mls, rowLog). We use switch statements + * here instead of using an indirect function call through a function + * pointer because after Spectre and Meltdown mitigations, indirect + * function calls can be very costly, especially in the kernel. + * + * NOTE: dictMode and searchMethod should be templated, so those switch + * statements should be optimized out. Only the mls & rowLog switches + * should be left. + * + * @param ms The match state. + * @param ip The position to search at. + * @param iend The end of the input data. + * @param[out] offsetPtr Stores the match offset into this pointer. + * @param mls The minimum search length, in the range [4, 6]. + * @param rowLog The row log (if applicable), in the range [4, 6]. + * @param searchMethod The search method to use (templated). + * @param dictMode The dictMode (templated). + * + * @returns The length of the longest match found, or < mls if no match is found. + * If a match is found its offset is stored in @p offsetPtr. */ - -static ZSTD_LazyVTable const* -ZSTD_selectLazyVTable(ZSTD_matchState_t const* ms, searchMethod_e searchMethod, ZSTD_dictMode_e dictMode) +FORCE_INLINE_TEMPLATE size_t ZSTD_searchMax( + ZSTD_matchState_t* ms, + const BYTE* ip, + const BYTE* iend, + size_t* offsetPtr, + U32 const mls, + U32 const rowLog, + searchMethod_e const searchMethod, + ZSTD_dictMode_e const dictMode) { - /* Fill the Hc/Bt VTable arrays with the right functions for the (dictMode, mls) combination. */ - ZSTD_LazyVTable const* const hcVTables[4][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_HC_VTABLE_ARRAY); - ZSTD_LazyVTable const* const btVTables[4][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_BT_VTABLE_ARRAY); - /* Fill the Row VTable array with the right functions for the (dictMode, mls, rowLog) combination. */ - ZSTD_LazyVTable const* const rowVTables[4][3][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_ROW_VTABLE_ARRAY); - - U32 const mls = MAX(4, MIN(6, ms->cParams.minMatch)); - U32 const rowLog = MAX(4, MIN(6, ms->cParams.searchLog)); - switch (searchMethod) { - case search_hashChain: - return hcVTables[dictMode][mls - 4]; - case search_binaryTree: - return btVTables[dictMode][mls - 4]; - case search_rowHash: - return rowVTables[dictMode][mls - 4][rowLog - 4]; - default: - return NULL; + if (dictMode == ZSTD_noDict) { + ZSTD_SWITCH_SEARCH_METHOD(noDict) + } else if (dictMode == ZSTD_extDict) { + ZSTD_SWITCH_SEARCH_METHOD(extDict) + } else if (dictMode == ZSTD_dictMatchState) { + ZSTD_SWITCH_SEARCH_METHOD(dictMatchState) + } else if (dictMode == ZSTD_dedicatedDictSearch) { + ZSTD_SWITCH_SEARCH_METHOD(dedicatedDictSearch) } + ZSTD_UNREACHABLE; + return 0; } +/* ******************************* +* Common parser - lazy strategy +*********************************/ + FORCE_INLINE_TEMPLATE size_t ZSTD_compressBlock_lazy_generic( ZSTD_matchState_t* ms, seqStore_t* seqStore, @@ -1488,9 +1505,11 @@ ZSTD_compressBlock_lazy_generic( const BYTE* const base = ms->window.base; const U32 prefixLowestIndex = ms->window.dictLimit; const BYTE* const prefixLowest = base + prefixLowestIndex; + const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6); + const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6); - searchMax_f const searchMax = ZSTD_selectLazyVTable(ms, searchMethod, dictMode)->searchMax; - U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0; + U32 offset_1 = rep[0], offset_2 = rep[1]; + U32 offsetSaved1 = 0, offsetSaved2 = 0; const int isDMS = dictMode == ZSTD_dictMatchState; const int isDDS = dictMode == ZSTD_dedicatedDictSearch; @@ -1505,16 +1524,14 @@ ZSTD_compressBlock_lazy_generic( 0; const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictLowest)); - assert(searchMax != NULL); - DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u) (searchFunc=%u)", (U32)dictMode, (U32)searchMethod); ip += (dictAndPrefixLength == 0); if (dictMode == ZSTD_noDict) { U32 const curr = (U32)(ip - base); U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog); U32 const maxRep = curr - windowLow; - if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0; - if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0; + if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0; + if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0; } if (isDxS) { /* dictMatchState repCode checks don't currently handle repCode == 0 @@ -1523,11 +1540,11 @@ ZSTD_compressBlock_lazy_generic( assert(offset_2 <= dictAndPrefixLength); } + /* Reset the lazy skipping state */ + ms->lazySkipping = 0; + if (searchMethod == search_rowHash) { - const U32 rowLog = MAX(4, MIN(6, ms->cParams.searchLog)); - ZSTD_row_fillHashCache(ms, base, rowLog, - MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */), - ms->nextToUpdate, ilimit); + ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); } /* Match Loop */ @@ -1539,7 +1556,7 @@ ZSTD_compressBlock_lazy_generic( #endif while (ip < ilimit) { size_t matchLength=0; - size_t offcode=STORE_REPCODE_1; + size_t offBase = REPCODE1_TO_OFFBASE; const BYTE* start=ip+1; DEBUGLOG(7, "search baseline (depth 0)"); @@ -1564,14 +1581,23 @@ ZSTD_compressBlock_lazy_generic( } /* first search (depth 0) */ - { size_t offsetFound = 999999999; - size_t const ml2 = searchMax(ms, ip, iend, &offsetFound); + { size_t offbaseFound = 999999999; + size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offbaseFound, mls, rowLog, searchMethod, dictMode); if (ml2 > matchLength) - matchLength = ml2, start = ip, offcode=offsetFound; + matchLength = ml2, start = ip, offBase = offbaseFound; } if (matchLength < 4) { - ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */ + size_t const step = ((size_t)(ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */; + ip += step; + /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time. + * In this mode we stop inserting every position into our tables, and only insert + * positions that we search, which is one in step positions. + * The exact cutoff is flexible, I've just chosen a number that is reasonably high, + * so we minimize the compression ratio loss in "normal" scenarios. This mode gets + * triggered once we've gone 2KB without finding any matches. + */ + ms->lazySkipping = step > kLazySkippingStep; continue; } @@ -1581,12 +1607,12 @@ ZSTD_compressBlock_lazy_generic( DEBUGLOG(7, "search depth 1"); ip ++; if ( (dictMode == ZSTD_noDict) - && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { + && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; int const gain2 = (int)(mlRep * 3); - int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); + int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); if ((mlRep >= 4) && (gain2 > gain1)) - matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; + matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; } if (isDxS) { const U32 repIndex = (U32)(ip - base) - offset_1; @@ -1598,17 +1624,17 @@ ZSTD_compressBlock_lazy_generic( const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; int const gain2 = (int)(mlRep * 3); - int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); + int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); if ((mlRep >= 4) && (gain2 > gain1)) - matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; + matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; } } - { size_t offset2=999999999; - size_t const ml2 = searchMax(ms, ip, iend, &offset2); - int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ - int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4); + { size_t ofbCandidate=999999999; + size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode); + int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4); if ((ml2 >= 4) && (gain2 > gain1)) { - matchLength = ml2, offcode = offset2, start = ip; + matchLength = ml2, offBase = ofbCandidate, start = ip; continue; /* search a better one */ } } @@ -1617,12 +1643,12 @@ ZSTD_compressBlock_lazy_generic( DEBUGLOG(7, "search depth 2"); ip ++; if ( (dictMode == ZSTD_noDict) - && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { + && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; int const gain2 = (int)(mlRep * 4); - int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); if ((mlRep >= 4) && (gain2 > gain1)) - matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; + matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; } if (isDxS) { const U32 repIndex = (U32)(ip - base) - offset_1; @@ -1634,17 +1660,17 @@ ZSTD_compressBlock_lazy_generic( const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; int const gain2 = (int)(mlRep * 4); - int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); if ((mlRep >= 4) && (gain2 > gain1)) - matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; + matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; } } - { size_t offset2=999999999; - size_t const ml2 = searchMax(ms, ip, iend, &offset2); - int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ - int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7); + { size_t ofbCandidate=999999999; + size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode); + int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7); if ((ml2 >= 4) && (gain2 > gain1)) { - matchLength = ml2, offcode = offset2, start = ip; + matchLength = ml2, offBase = ofbCandidate, start = ip; continue; } } } break; /* nothing found : store previous solution */ @@ -1655,26 +1681,33 @@ ZSTD_compressBlock_lazy_generic( * notably if `value` is unsigned, resulting in a large positive `-value`. */ /* catch up */ - if (STORED_IS_OFFSET(offcode)) { + if (OFFBASE_IS_OFFSET(offBase)) { if (dictMode == ZSTD_noDict) { - while ( ((start > anchor) & (start - STORED_OFFSET(offcode) > prefixLowest)) - && (start[-1] == (start-STORED_OFFSET(offcode))[-1]) ) /* only search for offset within prefix */ + while ( ((start > anchor) & (start - OFFBASE_TO_OFFSET(offBase) > prefixLowest)) + && (start[-1] == (start-OFFBASE_TO_OFFSET(offBase))[-1]) ) /* only search for offset within prefix */ { start--; matchLength++; } } if (isDxS) { - U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode)); + U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase)); const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex; const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest; while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ } - offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode); + offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase); } /* store sequence */ _storeSequence: { size_t const litLength = (size_t)(start - anchor); - ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength); + ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength); anchor = ip = start + matchLength; } + if (ms->lazySkipping) { + /* We've found a match, disable lazy skipping mode, and refill the hash cache. */ + if (searchMethod == search_rowHash) { + ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); + } + ms->lazySkipping = 0; + } /* check immediate repcode */ if (isDxS) { @@ -1688,8 +1721,8 @@ _storeSequence: && (MEM_read32(repMatch) == MEM_read32(ip)) ) { const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend; matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4; - offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset_2 <=> offset_1 */ - ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); + offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset_2 <=> offset_1 */ + ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); ip += matchLength; anchor = ip; continue; @@ -1703,16 +1736,20 @@ _storeSequence: && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) { /* store sequence */ matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; - offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap repcodes */ - ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); + offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap repcodes */ + ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); ip += matchLength; anchor = ip; continue; /* faster when present ... (?) */ } } } - /* Save reps for next block */ - rep[0] = offset_1 ? offset_1 : savedOffset; - rep[1] = offset_2 ? offset_2 : savedOffset; + /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), + * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ + offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; + + /* save reps for next block */ + rep[0] = offset_1 ? offset_1 : offsetSaved1; + rep[1] = offset_2 ? offset_2 : offsetSaved2; /* Return the last literals size */ return (size_t)(iend - anchor); @@ -1881,19 +1918,20 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( const BYTE* const dictEnd = dictBase + dictLimit; const BYTE* const dictStart = dictBase + ms->window.lowLimit; const U32 windowLog = ms->cParams.windowLog; - const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5; + const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6); + const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6); - searchMax_f const searchMax = ZSTD_selectLazyVTable(ms, searchMethod, ZSTD_extDict)->searchMax; U32 offset_1 = rep[0], offset_2 = rep[1]; DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod); + /* Reset the lazy skipping state */ + ms->lazySkipping = 0; + /* init */ ip += (ip == prefixStart); if (searchMethod == search_rowHash) { - ZSTD_row_fillHashCache(ms, base, rowLog, - MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */), - ms->nextToUpdate, ilimit); + ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); } /* Match Loop */ @@ -1905,7 +1943,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( #endif while (ip < ilimit) { size_t matchLength=0; - size_t offcode=STORE_REPCODE_1; + size_t offBase = REPCODE1_TO_OFFBASE; const BYTE* start=ip+1; U32 curr = (U32)(ip-base); @@ -1924,14 +1962,23 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( } } /* first search (depth 0) */ - { size_t offsetFound = 999999999; - size_t const ml2 = searchMax(ms, ip, iend, &offsetFound); + { size_t ofbCandidate = 999999999; + size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); if (ml2 > matchLength) - matchLength = ml2, start = ip, offcode=offsetFound; + matchLength = ml2, start = ip, offBase = ofbCandidate; } if (matchLength < 4) { - ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */ + size_t const step = ((size_t)(ip-anchor) >> kSearchStrength); + ip += step + 1; /* jump faster over incompressible sections */ + /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time. + * In this mode we stop inserting every position into our tables, and only insert + * positions that we search, which is one in step positions. + * The exact cutoff is flexible, I've just chosen a number that is reasonably high, + * so we minimize the compression ratio loss in "normal" scenarios. This mode gets + * triggered once we've gone 2KB without finding any matches. + */ + ms->lazySkipping = step > kLazySkippingStep; continue; } @@ -1941,7 +1988,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( ip ++; curr++; /* check repCode */ - if (offcode) { + if (offBase) { const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); const U32 repIndex = (U32)(curr - offset_1); const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; @@ -1953,18 +2000,18 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; int const gain2 = (int)(repLength * 3); - int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); + int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); if ((repLength >= 4) && (gain2 > gain1)) - matchLength = repLength, offcode = STORE_REPCODE_1, start = ip; + matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip; } } /* search match, depth 1 */ - { size_t offset2=999999999; - size_t const ml2 = searchMax(ms, ip, iend, &offset2); - int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ - int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4); + { size_t ofbCandidate = 999999999; + size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); + int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4); if ((ml2 >= 4) && (gain2 > gain1)) { - matchLength = ml2, offcode = offset2, start = ip; + matchLength = ml2, offBase = ofbCandidate, start = ip; continue; /* search a better one */ } } @@ -1973,7 +2020,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( ip ++; curr++; /* check repCode */ - if (offcode) { + if (offBase) { const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); const U32 repIndex = (U32)(curr - offset_1); const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; @@ -1985,38 +2032,45 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; int const gain2 = (int)(repLength * 4); - int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); if ((repLength >= 4) && (gain2 > gain1)) - matchLength = repLength, offcode = STORE_REPCODE_1, start = ip; + matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip; } } /* search match, depth 2 */ - { size_t offset2=999999999; - size_t const ml2 = searchMax(ms, ip, iend, &offset2); - int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ - int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7); + { size_t ofbCandidate = 999999999; + size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); + int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7); if ((ml2 >= 4) && (gain2 > gain1)) { - matchLength = ml2, offcode = offset2, start = ip; + matchLength = ml2, offBase = ofbCandidate, start = ip; continue; } } } break; /* nothing found : store previous solution */ } /* catch up */ - if (STORED_IS_OFFSET(offcode)) { - U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode)); + if (OFFBASE_IS_OFFSET(offBase)) { + U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase)); const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex; const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart; while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ - offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode); + offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase); } /* store sequence */ _storeSequence: { size_t const litLength = (size_t)(start - anchor); - ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength); + ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength); anchor = ip = start + matchLength; } + if (ms->lazySkipping) { + /* We've found a match, disable lazy skipping mode, and refill the hash cache. */ + if (searchMethod == search_rowHash) { + ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); + } + ms->lazySkipping = 0; + } /* check immediate repcode */ while (ip <= ilimit) { @@ -2031,8 +2085,8 @@ _storeSequence: /* repcode detected we should take it */ const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; - offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset history */ - ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); + offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset history */ + ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); ip += matchLength; anchor = ip; continue; /* faster when present ... (?) */ @@ -2098,7 +2152,6 @@ size_t ZSTD_compressBlock_lazy_extDict_row( size_t ZSTD_compressBlock_lazy2_extDict_row( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) - { return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2); } |