"Short cache" optimization for level 1-4 DMS (+5-30% compression spee…
Browse files Browse the repository at this point in the history
…d) (#3152)

* first attempt at fast DMS short cache

* significant wins for some scenarios

* fix all clang regressions

* nits

* fix 1.5% gcc11 regression on hot 110Kdict scenario

* fix CI

* nit

* Add tags to doublefast hash table

* use tags in doublefast DMS

* Fix CI

* Clean up some hardcoded logic / constants

* Switch forCCtx to an enum

* nit

* add short cache to ip+1 long search

* Move tag size into hashLog

* Minor nits

* Truncate dictionaries greater than 16MB in short cache mode

* Helper function for tag comparison

* Cap short cache hashLog at 24 to prevent overflow

* size_t dictTagsMatch -> int dictTagsMatch

* nit

* Clean up and comment dictionary truncation

* Move ZSTD_tableFillPurpose_e next to ZSTD_dictTableLoadMethod_e

* Comment and expand helper functions

* Asserts and documentation

* nit
embg authored Jun 21, 2022
1 parent eb842a2 commit f6ef143
Showing 7 changed files with 273 additions and 69 deletions.
94 changes: 71 additions & 23 deletions lib/compress/zstd_compress.c
@@ -275,6 +275,12 @@ static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode,
return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 27) ? ZSTD_ps_enable : ZSTD_ps_disable;
}

/* Returns 1 if compression parameters are such that CDict hashtable and chaintable indices are tagged.
* If so, the tags need to be removed in ZSTD_resetCCtx_byCopyingCDict. */
static int ZSTD_CDictIndicesAreTagged(const ZSTD_compressionParameters* const cParams) {
return cParams->strategy == ZSTD_fast || cParams->strategy == ZSTD_dfast;
}

static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams(
ZSTD_compressionParameters cParams)
{
@@ -1367,6 +1373,13 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar,
if (cPar.windowLog < ZSTD_WINDOWLOG_ABSOLUTEMIN)
cPar.windowLog = ZSTD_WINDOWLOG_ABSOLUTEMIN; /* minimum wlog required for valid frame header */

if (mode == ZSTD_cpm_createCDict && ZSTD_CDictIndicesAreTagged(&cPar)) {
U32 const maxShortCacheHashLog = 32 - ZSTD_SHORT_CACHE_TAG_BITS;
if (cPar.hashLog > maxShortCacheHashLog) {
cPar.hashLog = maxShortCacheHashLog;
}
}

return cPar;
}
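Aside, illustrative only: the cap of 24 follows from the hash-bit budget. When CDict indices are tagged, the fill path asks the hash function for hashLog + ZSTD_SHORT_CACHE_TAG_BITS bits and splits the result into a table slot plus an 8-bit tag, while ZSTD_hashPtr only guarantees up to 32 bits. A minimal standalone sketch of that arithmetic (the tag-bit constant is copied locally so the snippet compiles on its own):

#include <assert.h>

#define ZSTD_SHORT_CACHE_TAG_BITS 8

int main(void)
{
    unsigned const maxShortCacheHashLog = 32 - ZSTD_SHORT_CACHE_TAG_BITS;  /* 24 */
    unsigned hashLog = 27;                       /* a hashLog that would exceed the budget */
    if (hashLog > maxShortCacheHashLog)
        hashLog = maxShortCacheHashLog;          /* clamp, mirroring ZSTD_adjustCParams_internal */
    /* The CDict fill path computes hashes with hBits = hashLog + tag bits;
     * the sum must stay within the 32 bits ZSTD_hashPtr can produce. */
    assert(hashLog + ZSTD_SHORT_CACHE_TAG_BITS <= 32);
    return 0;
}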

@@ -2096,6 +2109,22 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx,
return 0;
}

static void ZSTD_copyCDictTableIntoCCtx(U32* dst, U32 const* src, size_t tableSize,
ZSTD_compressionParameters const* cParams) {
if (ZSTD_CDictIndicesAreTagged(cParams)){
/* Remove tags from the CDict table if they are present.
* See docs on "short cache" in zstd_compress_internal.h for context. */
size_t i;
for (i = 0; i < tableSize; i++) {
U32 const taggedIndex = src[i];
U32 const index = taggedIndex >> ZSTD_SHORT_CACHE_TAG_BITS;
dst[i] = index;
}
} else {
ZSTD_memcpy(dst, src, tableSize * sizeof(U32));
}
}

static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx,
const ZSTD_CDict* cdict,
ZSTD_CCtx_params params,
@@ -2131,14 +2160,15 @@ static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx,
: 0;
size_t const hSize = (size_t)1 << cdict_cParams->hashLog;

ZSTD_memcpy(cctx->blockState.matchState.hashTable,
cdict->matchState.hashTable,
hSize * sizeof(U32));
ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.hashTable,
cdict->matchState.hashTable,
hSize, cdict_cParams);

/* Do not copy cdict's chainTable if cctx has parameters such that it would not use chainTable */
if (ZSTD_allocateChainTable(cctx->appliedParams.cParams.strategy, cctx->appliedParams.useRowMatchFinder, 0 /* forDDSDict */)) {
ZSTD_memcpy(cctx->blockState.matchState.chainTable,
cdict->matchState.chainTable,
chainSize * sizeof(U32));
ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.chainTable,
cdict->matchState.chainTable,
chainSize, cdict_cParams);
}
/* copy tag table */
if (ZSTD_rowMatchFinderUsed(cdict_cParams->strategy, cdict->useRowMatchFinder)) {
@@ -4205,7 +4235,8 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
ZSTD_cwksp* ws,
ZSTD_CCtx_params const* params,
const void* src, size_t srcSize,
ZSTD_dictTableLoadMethod_e dtlm)
ZSTD_dictTableLoadMethod_e dtlm,
ZSTD_tableFillPurpose_e tfp)
{
const BYTE* ip = (const BYTE*) src;
const BYTE* const iend = ip + srcSize;
@@ -4214,22 +4245,37 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
/* Assert that the ms params match the params we're being given */
ZSTD_assertEqualCParams(params->cParams, ms->cParams);

if (srcSize > ZSTD_CHUNKSIZE_MAX) {
{ /* Ensure large dictionaries can't cause index overflow */

/* Allow the dictionary to set indices up to exactly ZSTD_CURRENT_MAX.
* Dictionaries right at the edge will immediately trigger overflow
* correction, but I don't want to insert extra constraints here.
*/
U32 const maxDictSize = ZSTD_CURRENT_MAX - ZSTD_WINDOW_START_INDEX;
/* We must have cleared our windows when our source is this large. */
assert(ZSTD_window_isEmpty(ms->window));
if (loadLdmDict)
assert(ZSTD_window_isEmpty(ls->window));
U32 maxDictSize = ZSTD_CURRENT_MAX - ZSTD_WINDOW_START_INDEX;

int const CDictTaggedIndices = ZSTD_CDictIndicesAreTagged(&params->cParams);
if (CDictTaggedIndices && tfp == ZSTD_tfp_forCDict) {
/* Some dictionary matchfinders in zstd use "short cache",
* which treats the lower ZSTD_SHORT_CACHE_TAG_BITS of each
* CDict hashtable entry as a tag rather than as part of an index.
* When short cache is used, we need to truncate the dictionary
* so that its indices don't overlap with the tag. */
U32 const shortCacheMaxDictSize = (1u << (32 - ZSTD_SHORT_CACHE_TAG_BITS)) - ZSTD_WINDOW_START_INDEX;
maxDictSize = MIN(maxDictSize, shortCacheMaxDictSize);
assert(!loadLdmDict);
}

/* If the dictionary is too large, only load the suffix of the dictionary. */
if (srcSize > maxDictSize) {
ip = iend - maxDictSize;
src = ip;
srcSize = maxDictSize;
}
} }

if (srcSize > ZSTD_CHUNKSIZE_MAX) {
/* We must have cleared our windows when our source is this large. */
assert(ZSTD_window_isEmpty(ms->window));
if (loadLdmDict) assert(ZSTD_window_isEmpty(ls->window));
}

DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder);
@@ -4252,10 +4298,10 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
switch(params->cParams.strategy)
{
case ZSTD_fast:
ZSTD_fillHashTable(ms, iend, dtlm);
ZSTD_fillHashTable(ms, iend, dtlm, tfp);
break;
case ZSTD_dfast:
ZSTD_fillDoubleHashTable(ms, iend, dtlm);
ZSTD_fillDoubleHashTable(ms, iend, dtlm, tfp);
break;

case ZSTD_greedy:
@@ -4421,6 +4467,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs,
ZSTD_CCtx_params const* params,
const void* dict, size_t dictSize,
ZSTD_dictTableLoadMethod_e dtlm,
ZSTD_tableFillPurpose_e tfp,
void* workspace)
{
const BYTE* dictPtr = (const BYTE*)dict;
@@ -4439,7 +4486,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs,
{
size_t const dictContentSize = (size_t)(dictEnd - dictPtr);
FORWARD_IF_ERROR(ZSTD_loadDictionaryContent(
ms, NULL, ws, params, dictPtr, dictContentSize, dtlm), "");
ms, NULL, ws, params, dictPtr, dictContentSize, dtlm, tfp), "");
}
return dictID;
}
@@ -4455,6 +4502,7 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs,
const void* dict, size_t dictSize,
ZSTD_dictContentType_e dictContentType,
ZSTD_dictTableLoadMethod_e dtlm,
ZSTD_tableFillPurpose_e tfp,
void* workspace)
{
DEBUGLOG(4, "ZSTD_compress_insertDictionary (dictSize=%u)", (U32)dictSize);
@@ -4467,21 +4515,21 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs,

/* dict restricted modes */
if (dictContentType == ZSTD_dct_rawContent)
return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm);
return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm, tfp);

if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) {
if (dictContentType == ZSTD_dct_auto) {
DEBUGLOG(4, "raw content dictionary detected");
return ZSTD_loadDictionaryContent(
ms, ls, ws, params, dict, dictSize, dtlm);
ms, ls, ws, params, dict, dictSize, dtlm, tfp);
}
RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, dictionary_wrong, "");
assert(0); /* impossible */
}

/* dict as full zstd dictionary */
return ZSTD_loadZstdDictionary(
bs, ms, ws, params, dict, dictSize, dtlm, workspace);
bs, ms, ws, params, dict, dictSize, dtlm, tfp, workspace);
}

#define ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF (128 KB)
@@ -4524,11 +4572,11 @@ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx,
cctx->blockState.prevCBlock, &cctx->blockState.matchState,
&cctx->ldmState, &cctx->workspace, &cctx->appliedParams, cdict->dictContent,
cdict->dictContentSize, cdict->dictContentType, dtlm,
cctx->entropyWorkspace)
ZSTD_tfp_forCCtx, cctx->entropyWorkspace)
: ZSTD_compress_insertDictionary(
cctx->blockState.prevCBlock, &cctx->blockState.matchState,
&cctx->ldmState, &cctx->workspace, &cctx->appliedParams, dict, dictSize,
dictContentType, dtlm, cctx->entropyWorkspace);
dictContentType, dtlm, ZSTD_tfp_forCCtx, cctx->entropyWorkspace);
FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed");
assert(dictID <= UINT_MAX);
cctx->dictID = (U32)dictID;
@@ -4832,7 +4880,7 @@ static size_t ZSTD_initCDict_internal(
{ size_t const dictID = ZSTD_compress_insertDictionary(
&cdict->cBlockState, &cdict->matchState, NULL, &cdict->workspace,
&params, cdict->dictContent, cdict->dictContentSize,
dictContentType, ZSTD_dtlm_full, cdict->entropyWorkspace);
dictContentType, ZSTD_dtlm_full, ZSTD_tfp_forCDict, cdict->entropyWorkspace);
FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed");
assert(dictID <= (size_t)(U32)-1);
cdict->dictID = (U32)dictID;
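Aside, illustrative only: the matchfinder-side changes that consume ZSTD_tableFillPurpose_e live in the fast and double-fast matchfinder files, which are not shown on this page. The sketch below is a hypothetical wrapper, not code from this commit, but it only uses helpers declared in zstd_compress_internal.h and shows the intended split: a CCtx fill stores a plain index, a CDict fill stores a tagged index.

#include "zstd_compress_internal.h"

/* Illustrative only: writes one hash-table entry for position ip, assuming
 * base, hashLog and mls are set up as in the real fill loops. */
static void ZSTD_fillOneEntry_sketch(U32* hashTable, U32 hashLog,
                                     const BYTE* base, const BYTE* ip,
                                     U32 mls, ZSTD_tableFillPurpose_e tfp)
{
    U32 const curr = (U32)(ip - base);
    if (tfp == ZSTD_tfp_forCDict) {
        /* CDict fill: request hashLog + 8 hash bits; the low 8 bits become the tag. */
        size_t const hashAndTag = ZSTD_hashPtr(ip, hashLog + ZSTD_SHORT_CACHE_TAG_BITS, mls);
        ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr);
    } else {
        /* CCtx fill: behavior is unchanged, a plain index is stored. */
        size_t const hash = ZSTD_hashPtr(ip, hashLog, mls);
        hashTable[hash] = curr;
    }
}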
53 changes: 47 additions & 6 deletions lib/compress/zstd_compress_internal.h
@@ -434,6 +434,7 @@ struct ZSTD_CCtx_s {
};

typedef enum { ZSTD_dtlm_fast, ZSTD_dtlm_full } ZSTD_dictTableLoadMethod_e;
typedef enum { ZSTD_tfp_forCCtx, ZSTD_tfp_forCDict } ZSTD_tableFillPurpose_e;

typedef enum {
ZSTD_noDict = 0,
Expand Down Expand Up @@ -745,32 +746,36 @@ ZSTD_count_2segments(const BYTE* ip, const BYTE* match,
* Hashes
***************************************/
static const U32 prime3bytes = 506832829U;
static U32 ZSTD_hash3(U32 u, U32 h) { return ((u << (32-24)) * prime3bytes) >> (32-h) ; }
static U32 ZSTD_hash3(U32 u, U32 h) { assert(h <= 32); return ((u << (32-24)) * prime3bytes) >> (32-h) ; }
MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h); } /* only in zstd_opt.h */

static const U32 prime4bytes = 2654435761U;
static U32 ZSTD_hash4(U32 u, U32 h) { return (u * prime4bytes) >> (32-h) ; }
static U32 ZSTD_hash4(U32 u, U32 h) { assert(h <= 32); return (u * prime4bytes) >> (32-h) ; }
static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_read32(ptr), h); }

static const U64 prime5bytes = 889523592379ULL;
static size_t ZSTD_hash5(U64 u, U32 h) { return (size_t)(((u << (64-40)) * prime5bytes) >> (64-h)) ; }
static size_t ZSTD_hash5(U64 u, U32 h) { assert(h <= 64); return (size_t)(((u << (64-40)) * prime5bytes) >> (64-h)) ; }
static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h); }

static const U64 prime6bytes = 227718039650203ULL;
static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u << (64-48)) * prime6bytes) >> (64-h)) ; }
static size_t ZSTD_hash6(U64 u, U32 h) { assert(h <= 64); return (size_t)(((u << (64-48)) * prime6bytes) >> (64-h)) ; }
static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); }

static const U64 prime7bytes = 58295818150454627ULL;
static size_t ZSTD_hash7(U64 u, U32 h) { return (size_t)(((u << (64-56)) * prime7bytes) >> (64-h)) ; }
static size_t ZSTD_hash7(U64 u, U32 h) { assert(h <= 64); return (size_t)(((u << (64-56)) * prime7bytes) >> (64-h)) ; }
static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h); }

static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL;
static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; }
static size_t ZSTD_hash8(U64 u, U32 h) { assert(h <= 64); return (size_t)(((u) * prime8bytes) >> (64-h)) ; }
static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); }

MEM_STATIC FORCE_INLINE_ATTR
size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls)
{
/* Although some of these hashes do support hBits up to 64, some do not.
* To be on the safe side, always avoid hBits > 32. */
assert(hBits <= 32);

switch(mls)
{
default:
@@ -1264,6 +1269,42 @@ MEM_STATIC void ZSTD_debugTable(const U32* table, U32 max)

#endif

/* Short Cache */

/* Normally, zstd matchfinders follow this flow:
* 1. Compute hash at ip
* 2. Load index from hashTable[hash]
* 3. Check if *ip == *(base + index)
* In dictionary compression, loading *(base + index) is often an L2 or even L3 miss.
*
* Short cache is an optimization which allows us to avoid step 3 most of the time
* when the data doesn't actually match. With short cache, the flow becomes:
* 1. Compute (hash, currentTag) at ip. currentTag is an 8-bit independent hash at ip.
* 2. Load (index, matchTag) from hashTable[hash]. See ZSTD_writeTaggedIndex to understand how this works.
* 3. Only if currentTag == matchTag, check *ip == *(base + index). Otherwise, continue.
*
* Currently, short cache is only implemented in CDict hashtables. Thus, its use is limited to
* dictMatchState matchfinders.
*/
#define ZSTD_SHORT_CACHE_TAG_BITS 8
#define ZSTD_SHORT_CACHE_TAG_MASK ((1u << ZSTD_SHORT_CACHE_TAG_BITS) - 1)

/* Helper function for ZSTD_fillHashTable and ZSTD_fillDoubleHashTable.
* Unpacks hashAndTag into (hash, tag), then packs (index, tag) into hashTable[hash]. */
MEM_STATIC void ZSTD_writeTaggedIndex(U32* const hashTable, size_t hashAndTag, U32 index) {
size_t const hash = hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS;
U32 const tag = (U32)(hashAndTag & ZSTD_SHORT_CACHE_TAG_MASK);
assert(index >> (32 - ZSTD_SHORT_CACHE_TAG_BITS) == 0);
hashTable[hash] = (index << ZSTD_SHORT_CACHE_TAG_BITS) | tag;
}
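/* Editorial sketch, not part of the diff: a tiny sanity check of the packing.
 * Writing through ZSTD_writeTaggedIndex and shifting the stored entry back
 * recovers the index, while the low bits hold the tag. The helper name is
 * hypothetical; everything it uses is declared above. */
MEM_STATIC void ZSTD_shortCachePackingDemo_sketch(void) {
    U32 table[1] = { 0 };
    size_t const hashAndTag = ((size_t)0 << ZSTD_SHORT_CACHE_TAG_BITS) | 0xAB;  /* slot 0, tag 0xAB */
    U32 const index = 123456;   /* must fit in 32 - ZSTD_SHORT_CACHE_TAG_BITS = 24 bits */
    ZSTD_writeTaggedIndex(table, hashAndTag, index);
    assert((table[0] >> ZSTD_SHORT_CACHE_TAG_BITS) == index);
    assert((table[0] & ZSTD_SHORT_CACHE_TAG_MASK) == 0xAB);
}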

/* Helper function for short cache matchfinders.
* Unpacks tag1 and tag2 from lower bits of packedTag1 and packedTag2, then checks if the tags match. */
MEM_STATIC int ZSTD_comparePackedTags(size_t packedTag1, size_t packedTag2) {
U32 const tag1 = packedTag1 & ZSTD_SHORT_CACHE_TAG_MASK;
U32 const tag2 = packedTag2 & ZSTD_SHORT_CACHE_TAG_MASK;
return tag1 == tag2;
}
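/* Editorial sketch, not part of the diff: a rough illustration of the
 * search-side flow described above. The real uses are in the fast/dfast
 * dictMatchState matchfinders (files not shown on this page); the function
 * below is hypothetical and the "dms" names are illustrative. */
MEM_STATIC const BYTE* ZSTD_dmsShortCacheLookup_sketch(
        const U32* dmsHashTable, U32 dmsHashLog,
        const BYTE* dmsBase, const BYTE* ip, U32 mls)
{
    /* Step 1: one hash call yields both the table slot and the 8-bit tag at ip. */
    size_t const hashAndTag = ZSTD_hashPtr(ip, dmsHashLog + ZSTD_SHORT_CACHE_TAG_BITS, mls);
    /* Step 2: load the packed (index, tag) entry from the CDict table. */
    U32 const packedEntry = dmsHashTable[hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS];
    /* Step 3: only dereference the candidate when the tags agree; on a mismatch
     * the positions almost certainly differ, so the L2/L3 miss is skipped. */
    if (!ZSTD_comparePackedTags(packedEntry, hashAndTag))
        return NULL;
    return dmsBase + (packedEntry >> ZSTD_SHORT_CACHE_TAG_BITS);
}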

#if defined (__cplusplus)
}
(Diffs for the remaining 5 changed files are not shown.)
