zstd_lazy.c source code [linux/lib/zstd/compress/zstd_lazy.c]

1	/*
2	* Copyright (c) Yann Collet, Facebook, Inc.
3	* All rights reserved.
4	*
5	* This source code is licensed under both the BSD-style license (found in the
6	* LICENSE file in the root directory of this source tree) and the GPLv2 (found
7	* in the COPYING file in the root directory of this source tree).
8	* You may select, at your option, one of the above-listed licenses.
9	*/
10
11	#include "zstd_compress_internal.h"
12	#include "zstd_lazy.h"
13
14
15	/-************************************
16	* Binary Tree search
17	***************************************/
18
19	static void
20	ZSTD_updateDUBT(ZSTD_matchState_t* ms,
21	const BYTE* ip, const BYTE* iend,
22	U32 mls)
23	{
24	const ZSTD_compressionParameters* const cParams = &ms->cParams;
25	U32* const hashTable = ms->hashTable;
26	U32 const hashLog = cParams->hashLog;
27
28	U32* const bt = ms->chainTable;
29	U32 const btLog = cParams->chainLog - `1`;
30	U32 const btMask = (`1` << btLog) - `1`;
31
32	const BYTE* const base = ms->window.base;
33	U32 const target = (U32)(ip - base);
34	U32 idx = ms->nextToUpdate;
35
36	if (idx != target)
37	DEBUGLOG(`7`, "ZSTD_updateDUBT, from %u to %u (dictLimit:%u)",
38	idx, target, ms->window.dictLimit);
39	assert(ip + `8` <= iend); / condition for ZSTD_hashPtr /
40	(void)iend;
41
42	assert(idx >= ms->window.dictLimit); / condition for valid base+idx /
43	for ( ; idx < target ; idx++) {
44	size_t const h = ZSTD_hashPtr(p: base + idx, hBits: hashLog, mls); / assumption : ip + 8 <= iend /
45	U32 const matchIndex = hashTable[h];
46
47	U32* const nextCandidatePtr = bt + `2`*(idx&btMask);
48	U32* const sortMarkPtr = nextCandidatePtr + `1`;
49
50	DEBUGLOG(`8`, "ZSTD_updateDUBT: insert %u", idx);
51	hashTable[h] = idx; / Update Hash Table /
52	nextCandidatePtr = matchIndex; /* update BT like a chain /
53	*sortMarkPtr = ZSTD_DUBT_UNSORTED_MARK;
54	}
55	ms->nextToUpdate = target;
56	}
57
58
59	/ ZSTD_insertDUBT1() :*
60	* sort one already inserted but unsorted position
61	* assumption : curr >= btlow == (curr - btmask)
62	* doesn't fail */
63	static void
64	ZSTD_insertDUBT1(const ZSTD_matchState_t* ms,
65	U32 curr, const BYTE* inputEnd,
66	U32 nbCompares, U32 btLow,
67	const ZSTD_dictMode_e dictMode)
68	{
69	const ZSTD_compressionParameters* const cParams = &ms->cParams;
70	U32* const bt = ms->chainTable;
71	U32 const btLog = cParams->chainLog - `1`;
72	U32 const btMask = (`1` << btLog) - `1`;
73	size_t commonLengthSmaller=`0`, commonLengthLarger=`0`;
74	const BYTE* const base = ms->window.base;
75	const BYTE* const dictBase = ms->window.dictBase;
76	const U32 dictLimit = ms->window.dictLimit;
77	const BYTE* const ip = (curr>=dictLimit) ? base + curr : dictBase + curr;
78	const BYTE* const iend = (curr>=dictLimit) ? inputEnd : dictBase + dictLimit;
79	const BYTE* const dictEnd = dictBase + dictLimit;
80	const BYTE* const prefixStart = base + dictLimit;
81	const BYTE* match;
82	U32* smallerPtr = bt + `2`*(curr&btMask);
83	U32* largerPtr = smallerPtr + `1`;
84	U32 matchIndex = smallerPtr; /* this candidate is unsorted : next sorted candidate is reached through smallerPtr, while largerPtr contains previous unsorted candidate (which is already saved and can be overwritten) /
85	U32 dummy32; / to be nullified at the end /
86	U32 const windowValid = ms->window.lowLimit;
87	U32 const maxDistance = `1U` << cParams->windowLog;
88	U32 const windowLow = (curr - windowValid > maxDistance) ? curr - maxDistance : windowValid;
89
90
91	DEBUGLOG(`8`, "ZSTD_insertDUBT1(%u) (dictLimit=%u, lowLimit=%u)",
92	curr, dictLimit, windowLow);
93	assert(curr >= btLow);
94	assert(ip < iend); / condition for ZSTD_count /
95
96	for (; nbCompares && (matchIndex > windowLow); --nbCompares) {
97	U32* const nextPtr = bt + `2`*(matchIndex & btMask);
98	size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); / guaranteed minimum nb of common bytes /
99	assert(matchIndex < curr);
100	/ note : all candidates are now supposed sorted,*
101	* but it's still possible to have nextPtr[1] == ZSTD_DUBT_UNSORTED_MARK
102	* when a real index has the same value as ZSTD_DUBT_UNSORTED_MARK */
103
104	if ( (dictMode != ZSTD_extDict)
105	\|\| (matchIndex+matchLength >= dictLimit) / both in current segment/
106	\|\| (curr < dictLimit) / both in extDict /) {
107	const BYTE* const mBase = ( (dictMode != ZSTD_extDict)
108	\|\| (matchIndex+matchLength >= dictLimit)) ?
109	base : dictBase;
110	assert( (matchIndex+matchLength >= dictLimit) / might be wrong if extDict is incorrectly set to 0 /
111	\|\| (curr < dictLimit) );
112	match = mBase + matchIndex;
113	matchLength += ZSTD_count(pIn: ip+matchLength, pMatch: match+matchLength, pInLimit: iend);
114	} else {
115	match = dictBase + matchIndex;
116	matchLength += ZSTD_count_2segments(ip: ip+matchLength, match: match+matchLength, iEnd: iend, mEnd: dictEnd, iStart: prefixStart);
117	if (matchIndex+matchLength >= dictLimit)
118	match = base + matchIndex; / preparation for next read of match[matchLength] /
119	}
120
121	DEBUGLOG(`8`, "ZSTD_insertDUBT1: comparing %u with %u : found %u common bytes ",
122	curr, matchIndex, (U32)matchLength);
123
124	if (ip+matchLength == iend) { / equal : no way to know if inf or sup /
125	break; / drop , to guarantee consistency ; miss a bit of compression, but other solutions can corrupt tree /
126	}
127
128	if (match[matchLength] < ip[matchLength]) { / necessarily within buffer /
129	/ match is smaller than current /
130	smallerPtr = matchIndex; /* update smaller idx /
131	commonLengthSmaller = matchLength; / all smaller will now have at least this guaranteed common length /
132	if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } / beyond tree size, stop searching /
133	DEBUGLOG(`8`, "ZSTD_insertDUBT1: %u (>btLow=%u) is smaller : next => %u",
134	matchIndex, btLow, nextPtr[`1`]);
135	smallerPtr = nextPtr+`1`; / new "candidate" => larger than match, which was smaller than target /
136	matchIndex = nextPtr[`1`]; / new matchIndex, larger than previous and closer to current /
137	} else {
138	/ match is larger than current /
139	*largerPtr = matchIndex;
140	commonLengthLarger = matchLength;
141	if (matchIndex <= btLow) { largerPtr=&dummy32; break; } / beyond tree size, stop searching /
142	DEBUGLOG(`8`, "ZSTD_insertDUBT1: %u (>btLow=%u) is larger => %u",
143	matchIndex, btLow, nextPtr[`0`]);
144	largerPtr = nextPtr;
145	matchIndex = nextPtr[`0`];
146	} }
147
148	smallerPtr = largerPtr = `0`;
149	}
150
151
152	static size_t
153	ZSTD_DUBT_findBetterDictMatch (
154	const ZSTD_matchState_t* ms,
155	const BYTE* const ip, const BYTE* const iend,
156	size_t* offsetPtr,
157	size_t bestLength,
158	U32 nbCompares,
159	U32 const mls,
160	const ZSTD_dictMode_e dictMode)
161	{
162	const ZSTD_matchState_t * const dms = ms->dictMatchState;
163	const ZSTD_compressionParameters* const dmsCParams = &dms->cParams;
164	const U32 * const dictHashTable = dms->hashTable;
165	U32 const hashLog = dmsCParams->hashLog;
166	size_t const h = ZSTD_hashPtr(p: ip, hBits: hashLog, mls);
167	U32 dictMatchIndex = dictHashTable[h];
168
169	const BYTE* const base = ms->window.base;
170	const BYTE* const prefixStart = base + ms->window.dictLimit;
171	U32 const curr = (U32)(ip-base);
172	const BYTE* const dictBase = dms->window.base;
173	const BYTE* const dictEnd = dms->window.nextSrc;
174	U32 const dictHighLimit = (U32)(dms->window.nextSrc - dms->window.base);
175	U32 const dictLowLimit = dms->window.lowLimit;
176	U32 const dictIndexDelta = ms->window.lowLimit - dictHighLimit;
177
178	U32* const dictBt = dms->chainTable;
179	U32 const btLog = dmsCParams->chainLog - `1`;
180	U32 const btMask = (`1` << btLog) - `1`;
181	U32 const btLow = (btMask >= dictHighLimit - dictLowLimit) ? dictLowLimit : dictHighLimit - btMask;
182
183	size_t commonLengthSmaller=`0`, commonLengthLarger=`0`;
184
185	(void)dictMode;
186	assert(dictMode == ZSTD_dictMatchState);
187
188	for (; nbCompares && (dictMatchIndex > dictLowLimit); --nbCompares) {
189	U32* const nextPtr = dictBt + `2`*(dictMatchIndex & btMask);
190	size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); / guaranteed minimum nb of common bytes /
191	const BYTE* match = dictBase + dictMatchIndex;
192	matchLength += ZSTD_count_2segments(ip: ip+matchLength, match: match+matchLength, iEnd: iend, mEnd: dictEnd, iStart: prefixStart);
193	if (dictMatchIndex+matchLength >= dictHighLimit)
194	match = base + dictMatchIndex + dictIndexDelta; / to prepare for next usage of match[matchLength] /
195
196	if (matchLength > bestLength) {
197	U32 matchIndex = dictMatchIndex + dictIndexDelta;
198	if ( (`4`(int)(matchLength-bestLength)) > (int*)(ZSTD_highbit32(val: curr-matchIndex+`1`) - ZSTD_highbit32(val: (U32)offsetPtr[`0`]+`1`)) ) {
199	DEBUGLOG(`9`, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)",
200	curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, STORE_OFFSET(curr - matchIndex), dictMatchIndex, matchIndex);
201	bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex);
202	}
203	if (ip+matchLength == iend) { / reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match /
204	break; / drop, to guarantee consistency (miss a little bit of compression) /
205	}
206	}
207
208	if (match[matchLength] < ip[matchLength]) {
209	if (dictMatchIndex <= btLow) { break; } / beyond tree size, stop the search /
210	commonLengthSmaller = matchLength; / all smaller will now have at least this guaranteed common length /
211	dictMatchIndex = nextPtr[`1`]; / new matchIndex larger than previous (closer to current) /
212	} else {
213	/ match is larger than current /
214	if (dictMatchIndex <= btLow) { break; } / beyond tree size, stop the search /
215	commonLengthLarger = matchLength;
216	dictMatchIndex = nextPtr[`0`];
217	}
218	}
219
220	if (bestLength >= MINMATCH) {
221	U32 const mIndex = curr - (U32)STORED_OFFSET(offsetPtr); (void*)mIndex;
222	DEBUGLOG(`8`, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
223	curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
224	}
225	return bestLength;
226
227	}
228
229
230	static size_t
231	ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
232	const BYTE* const ip, const BYTE* const iend,
233	size_t* offsetPtr,
234	U32 const mls,
235	const ZSTD_dictMode_e dictMode)
236	{
237	const ZSTD_compressionParameters* const cParams = &ms->cParams;
238	U32* const hashTable = ms->hashTable;
239	U32 const hashLog = cParams->hashLog;
240	size_t const h = ZSTD_hashPtr(p: ip, hBits: hashLog, mls);
241	U32 matchIndex = hashTable[h];
242
243	const BYTE* const base = ms->window.base;
244	U32 const curr = (U32)(ip-base);
245	U32 const windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog: cParams->windowLog);
246
247	U32* const bt = ms->chainTable;
248	U32 const btLog = cParams->chainLog - `1`;
249	U32 const btMask = (`1` << btLog) - `1`;
250	U32 const btLow = (btMask >= curr) ? `0` : curr - btMask;
251	U32 const unsortLimit = MAX(btLow, windowLow);
252
253	U32* nextCandidate = bt + `2`*(matchIndex&btMask);
254	U32* unsortedMark = bt + `2`*(matchIndex&btMask) + `1`;
255	U32 nbCompares = `1U` << cParams->searchLog;
256	U32 nbCandidates = nbCompares;
257	U32 previousCandidate = `0`;
258
259	DEBUGLOG(`7`, "ZSTD_DUBT_findBestMatch (%u) ", curr);
260	assert(ip <= iend-`8`); / required for h calculation /
261	assert(dictMode != ZSTD_dedicatedDictSearch);
262
263	/ reach end of unsorted candidates list /
264	while ( (matchIndex > unsortLimit)
265	&& (*unsortedMark == ZSTD_DUBT_UNSORTED_MARK)
266	&& (nbCandidates > `1`) ) {
267	DEBUGLOG(`8`, "ZSTD_DUBT_findBestMatch: candidate %u is unsorted",
268	matchIndex);
269	unsortedMark = previousCandidate; /* the unsortedMark becomes a reversed chain, to move up back to original position /
270	previousCandidate = matchIndex;
271	matchIndex = *nextCandidate;
272	nextCandidate = bt + `2`*(matchIndex&btMask);
273	unsortedMark = bt + `2`*(matchIndex&btMask) + `1`;
274	nbCandidates --;
275	}
276
277	/ nullify last candidate if it's still unsorted*
278	* simplification, detrimental to compression ratio, beneficial for speed */
279	if ( (matchIndex > unsortLimit)
280	&& (*unsortedMark==ZSTD_DUBT_UNSORTED_MARK) ) {
281	DEBUGLOG(`7`, "ZSTD_DUBT_findBestMatch: nullify last unsorted candidate %u",
282	matchIndex);
283	nextCandidate = unsortedMark = `0`;
284	}
285
286	/ batch sort stacked candidates /
287	matchIndex = previousCandidate;
288	while (matchIndex) { / will end on matchIndex == 0 /
289	U32* const nextCandidateIdxPtr = bt + `2`*(matchIndex&btMask) + `1`;
290	U32 const nextCandidateIdx = *nextCandidateIdxPtr;
291	ZSTD_insertDUBT1(ms, curr: matchIndex, inputEnd: iend,
292	nbCompares: nbCandidates, btLow: unsortLimit, dictMode);
293	matchIndex = nextCandidateIdx;
294	nbCandidates++;
295	}
296
297	/ find longest match /
298	{ size_t commonLengthSmaller = `0`, commonLengthLarger = `0`;
299	const BYTE* const dictBase = ms->window.dictBase;
300	const U32 dictLimit = ms->window.dictLimit;
301	const BYTE* const dictEnd = dictBase + dictLimit;
302	const BYTE* const prefixStart = base + dictLimit;
303	U32* smallerPtr = bt + `2`*(curr&btMask);
304	U32* largerPtr = bt + `2`*(curr&btMask) + `1`;
305	U32 matchEndIdx = curr + `8` + `1`;
306	U32 dummy32; / to be nullified at the end /
307	size_t bestLength = `0`;
308
309	matchIndex = hashTable[h];
310	hashTable[h] = curr; / Update Hash Table /
311
312	for (; nbCompares && (matchIndex > windowLow); --nbCompares) {
313	U32* const nextPtr = bt + `2`*(matchIndex & btMask);
314	size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); / guaranteed minimum nb of common bytes /
315	const BYTE* match;
316
317	if ((dictMode != ZSTD_extDict) \|\| (matchIndex+matchLength >= dictLimit)) {
318	match = base + matchIndex;
319	matchLength += ZSTD_count(pIn: ip+matchLength, pMatch: match+matchLength, pInLimit: iend);
320	} else {
321	match = dictBase + matchIndex;
322	matchLength += ZSTD_count_2segments(ip: ip+matchLength, match: match+matchLength, iEnd: iend, mEnd: dictEnd, iStart: prefixStart);
323	if (matchIndex+matchLength >= dictLimit)
324	match = base + matchIndex; / to prepare for next usage of match[matchLength] /
325	}
326
327	if (matchLength > bestLength) {
328	if (matchLength > matchEndIdx - matchIndex)
329	matchEndIdx = matchIndex + (U32)matchLength;
330	if ( (`4`(int)(matchLength-bestLength)) > (int*)(ZSTD_highbit32(val: curr-matchIndex+`1`) - ZSTD_highbit32(val: (U32)offsetPtr[`0`]+`1`)) )
331	bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex);
332	if (ip+matchLength == iend) { / equal : no way to know if inf or sup /
333	if (dictMode == ZSTD_dictMatchState) {
334	nbCompares = `0`; / in addition to avoiding checking any*
335	* further in this loop, make sure we
336	* skip checking in the dictionary. */
337	}
338	break; / drop, to guarantee consistency (miss a little bit of compression) /
339	}
340	}
341
342	if (match[matchLength] < ip[matchLength]) {
343	/ match is smaller than current /
344	smallerPtr = matchIndex; /* update smaller idx /
345	commonLengthSmaller = matchLength; / all smaller will now have at least this guaranteed common length /
346	if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } / beyond tree size, stop the search /
347	smallerPtr = nextPtr+`1`; / new "smaller" => larger of match /
348	matchIndex = nextPtr[`1`]; / new matchIndex larger than previous (closer to current) /
349	} else {
350	/ match is larger than current /
351	*largerPtr = matchIndex;
352	commonLengthLarger = matchLength;
353	if (matchIndex <= btLow) { largerPtr=&dummy32; break; } / beyond tree size, stop the search /
354	largerPtr = nextPtr;
355	matchIndex = nextPtr[`0`];
356	} }
357
358	smallerPtr = largerPtr = `0`;
359
360	assert(nbCompares <= (`1U` << ZSTD_SEARCHLOG_MAX)); / Check we haven't underflowed. /
361	if (dictMode == ZSTD_dictMatchState && nbCompares) {
362	bestLength = ZSTD_DUBT_findBetterDictMatch(
363	ms, ip, iend,
364	offsetPtr, bestLength, nbCompares,
365	mls, dictMode);
366	}
367
368	assert(matchEndIdx > curr+`8`); / ensure nextToUpdate is increased /
369	ms->nextToUpdate = matchEndIdx - `8`; / skip repetitive patterns /
370	if (bestLength >= MINMATCH) {
371	U32 const mIndex = curr - (U32)STORED_OFFSET(offsetPtr); (void*)mIndex;
372	DEBUGLOG(`8`, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
373	curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
374	}
375	return bestLength;
376	}
377	}
378
379
380	/ ZSTD_BtFindBestMatch() : Tree updater, providing best match /
381	FORCE_INLINE_TEMPLATE size_t
382	ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms,
383	const BYTE* const ip, const BYTE* const iLimit,
384	size_t* offsetPtr,
385	const U32 mls / template /,
386	const ZSTD_dictMode_e dictMode)
387	{
388	DEBUGLOG(`7`, "ZSTD_BtFindBestMatch");
389	if (ip < ms->window.base + ms->nextToUpdate) return `0`; / skipped area /
390	ZSTD_updateDUBT(ms, ip, iend: iLimit, mls);
391	return ZSTD_DUBT_findBestMatch(ms, ip, iend: iLimit, offsetPtr, mls, dictMode);
392	}
393
394	/* *********************************
395	* Dedicated dict search
396	***********************************/
397
398	void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip)
399	{
400	const BYTE* const base = ms->window.base;
401	U32 const target = (U32)(ip - base);
402	U32* const hashTable = ms->hashTable;
403	U32* const chainTable = ms->chainTable;
404	U32 const chainSize = `1` << ms->cParams.chainLog;
405	U32 idx = ms->nextToUpdate;
406	U32 const minChain = chainSize < target - idx ? target - chainSize : idx;
407	U32 const bucketSize = `1` << ZSTD_LAZY_DDSS_BUCKET_LOG;
408	U32 const cacheSize = bucketSize - `1`;
409	U32 const chainAttempts = (`1` << ms->cParams.searchLog) - cacheSize;
410	U32 const chainLimit = chainAttempts > `255` ? `255` : chainAttempts;
411
412	/ We know the hashtable is oversized by a factor of `bucketSize`.*
413	* We are going to temporarily pretend `bucketSize == 1`, keeping only a
414	* single entry. We will use the rest of the space to construct a temporary
415	* chaintable.
416	*/
417	U32 const hashLog = ms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
418	U32* const tmpHashTable = hashTable;
419	U32* const tmpChainTable = hashTable + ((size_t)`1` << hashLog);
420	U32 const tmpChainSize = (U32)((`1` << ZSTD_LAZY_DDSS_BUCKET_LOG) - `1`) << hashLog;
421	U32 const tmpMinChain = tmpChainSize < target ? target - tmpChainSize : idx;
422	U32 hashIdx;
423
424	assert(ms->cParams.chainLog <= `24`);
425	assert(ms->cParams.hashLog > ms->cParams.chainLog);
426	assert(idx != `0`);
427	assert(tmpMinChain <= minChain);
428
429	/ fill conventional hash table and conventional chain table /
430	for ( ; idx < target; idx++) {
431	U32 const h = (U32)ZSTD_hashPtr(p: base + idx, hBits: hashLog, mls: ms->cParams.minMatch);
432	if (idx >= tmpMinChain) {
433	tmpChainTable[idx - tmpMinChain] = hashTable[h];
434	}
435	tmpHashTable[h] = idx;
436	}
437
438	/ sort chains into ddss chain table /
439	{
440	U32 chainPos = `0`;
441	for (hashIdx = `0`; hashIdx < (`1U` << hashLog); hashIdx++) {
442	U32 count;
443	U32 countBeyondMinChain = `0`;
444	U32 i = tmpHashTable[hashIdx];
445	for (count = `0`; i >= tmpMinChain && count < cacheSize; count++) {
446	/ skip through the chain to the first position that won't be*
447	* in the hash cache bucket */
448	if (i < minChain) {
449	countBeyondMinChain++;
450	}
451	i = tmpChainTable[i - tmpMinChain];
452	}
453	if (count == cacheSize) {
454	for (count = `0`; count < chainLimit;) {
455	if (i < minChain) {
456	if (!i \|\| ++countBeyondMinChain > cacheSize) {
457	/ only allow pulling `cacheSize` number of entries*
458	* into the cache or chainTable beyond `minChain`,
459	* to replace the entries pulled out of the
460	* chainTable into the cache. This lets us reach
461	* back further without increasing the total number
462	* of entries in the chainTable, guaranteeing the
463	* DDSS chain table will fit into the space
464	* allocated for the regular one. */
465	break;
466	}
467	}
468	chainTable[chainPos++] = i;
469	count++;
470	if (i < tmpMinChain) {
471	break;
472	}
473	i = tmpChainTable[i - tmpMinChain];
474	}
475	} else {
476	count = `0`;
477	}
478	if (count) {
479	tmpHashTable[hashIdx] = ((chainPos - count) << `8`) + count;
480	} else {
481	tmpHashTable[hashIdx] = `0`;
482	}
483	}
484	assert(chainPos <= chainSize); / I believe this is guaranteed... /
485	}
486
487	/ move chain pointers into the last entry of each hash bucket /
488	for (hashIdx = (`1` << hashLog); hashIdx; ) {
489	U32 const bucketIdx = --hashIdx << ZSTD_LAZY_DDSS_BUCKET_LOG;
490	U32 const chainPackedPointer = tmpHashTable[hashIdx];
491	U32 i;
492	for (i = `0`; i < cacheSize; i++) {
493	hashTable[bucketIdx + i] = `0`;
494	}
495	hashTable[bucketIdx + bucketSize - `1`] = chainPackedPointer;
496	}
497
498	/ fill the buckets of the hash table /
499	for (idx = ms->nextToUpdate; idx < target; idx++) {
500	U32 const h = (U32)ZSTD_hashPtr(p: base + idx, hBits: hashLog, mls: ms->cParams.minMatch)
501	<< ZSTD_LAZY_DDSS_BUCKET_LOG;
502	U32 i;
503	/ Shift hash cache down 1. /
504	for (i = cacheSize - `1`; i; i--)
505	hashTable[h + i] = hashTable[h + i - `1`];
506	hashTable[h] = idx;
507	}
508
509	ms->nextToUpdate = target;
510	}
511
512	/ Returns the longest match length found in the dedicated dict search structure.*
513	* If none are longer than the argument ml, then ml will be returned.
514	*/
515	FORCE_INLINE_TEMPLATE
516	size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nbAttempts,
517	const ZSTD_matchState_t* const dms,
518	const BYTE* const ip, const BYTE* const iLimit,
519	const BYTE* const prefixStart, const U32 curr,
520	const U32 dictLimit, const size_t ddsIdx) {
521	const U32 ddsLowestIndex = dms->window.dictLimit;
522	const BYTE* const ddsBase = dms->window.base;
523	const BYTE* const ddsEnd = dms->window.nextSrc;
524	const U32 ddsSize = (U32)(ddsEnd - ddsBase);
525	const U32 ddsIndexDelta = dictLimit - ddsSize;
526	const U32 bucketSize = (`1` << ZSTD_LAZY_DDSS_BUCKET_LOG);
527	const U32 bucketLimit = nbAttempts < bucketSize - `1` ? nbAttempts : bucketSize - `1`;
528	U32 ddsAttempt;
529	U32 matchIndex;
530
531	for (ddsAttempt = `0`; ddsAttempt < bucketSize - `1`; ddsAttempt++) {
532	PREFETCH_L1(ddsBase + dms->hashTable[ddsIdx + ddsAttempt]);
533	}
534
535	{
536	U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - `1`];
537	U32 const chainIndex = chainPackedPointer >> `8`;
538
539	PREFETCH_L1(&dms->chainTable[chainIndex]);
540	}
541
542	for (ddsAttempt = `0`; ddsAttempt < bucketLimit; ddsAttempt++) {
543	size_t currentMl=`0`;
544	const BYTE* match;
545	matchIndex = dms->hashTable[ddsIdx + ddsAttempt];
546	match = ddsBase + matchIndex;
547
548	if (!matchIndex) {
549	return ml;
550	}
551
552	/ guaranteed by table construction /
553	(void)ddsLowestIndex;
554	assert(matchIndex >= ddsLowestIndex);
555	assert(match+`4` <= ddsEnd);
556	if (MEM_read32(memPtr: match) == MEM_read32(memPtr: ip)) {
557	/ assumption : matchIndex <= dictLimit-4 (by table construction) /
558	currentMl = ZSTD_count_2segments(ip: ip+`4`, match: match+`4`, iEnd: iLimit, mEnd: ddsEnd, iStart: prefixStart) + `4`;
559	}
560
561	/ save best solution /
562	if (currentMl > ml) {
563	ml = currentMl;
564	*offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta));
565	if (ip+currentMl == iLimit) {
566	/ best possible, avoids read overflow on next attempt /
567	return ml;
568	}
569	}
570	}
571
572	{
573	U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - `1`];
574	U32 chainIndex = chainPackedPointer >> `8`;
575	U32 const chainLength = chainPackedPointer & `0xFF`;
576	U32 const chainAttempts = nbAttempts - ddsAttempt;
577	U32 const chainLimit = chainAttempts > chainLength ? chainLength : chainAttempts;
578	U32 chainAttempt;
579
580	for (chainAttempt = `0` ; chainAttempt < chainLimit; chainAttempt++) {
581	PREFETCH_L1(ddsBase + dms->chainTable[chainIndex + chainAttempt]);
582	}
583
584	for (chainAttempt = `0` ; chainAttempt < chainLimit; chainAttempt++, chainIndex++) {
585	size_t currentMl=`0`;
586	const BYTE* match;
587	matchIndex = dms->chainTable[chainIndex];
588	match = ddsBase + matchIndex;
589
590	/ guaranteed by table construction /
591	assert(matchIndex >= ddsLowestIndex);
592	assert(match+`4` <= ddsEnd);
593	if (MEM_read32(memPtr: match) == MEM_read32(memPtr: ip)) {
594	/ assumption : matchIndex <= dictLimit-4 (by table construction) /
595	currentMl = ZSTD_count_2segments(ip: ip+`4`, match: match+`4`, iEnd: iLimit, mEnd: ddsEnd, iStart: prefixStart) + `4`;
596	}
597
598	/ save best solution /
599	if (currentMl > ml) {
600	ml = currentMl;
601	*offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta));
602	if (ip+currentMl == iLimit) break; / best possible, avoids read overflow on next attempt /
603	}
604	}
605	}
606	return ml;
607	}
608
609
610	/* *********************************
611	* Hash Chain
612	***********************************/
613	#define NEXT_IN_CHAIN(d, mask) chainTable[(d) & (mask)]
614
615	/ Update chains up to ip (excluded)*
616	Assumption : always within prefix (i.e. not within extDict) /*
617	FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
618	ZSTD_matchState_t* ms,
619	const ZSTD_compressionParameters* const cParams,
620	const BYTE* ip, U32 const mls)
621	{
622	U32* const hashTable = ms->hashTable;
623	const U32 hashLog = cParams->hashLog;
624	U32* const chainTable = ms->chainTable;
625	const U32 chainMask = (`1` << cParams->chainLog) - `1`;
626	const BYTE* const base = ms->window.base;
627	const U32 target = (U32)(ip - base);
628	U32 idx = ms->nextToUpdate;
629
630	while(idx < target) { / catch up /
631	size_t const h = ZSTD_hashPtr(p: base+idx, hBits: hashLog, mls);
632	NEXT_IN_CHAIN(idx, chainMask) = hashTable[h];
633	hashTable[h] = idx;
634	idx++;
635	}
636
637	ms->nextToUpdate = target;
638	return hashTable[ZSTD_hashPtr(p: ip, hBits: hashLog, mls)];
639	}
640
641	U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) {
642	const ZSTD_compressionParameters* const cParams = &ms->cParams;
643	return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls: ms->cParams.minMatch);
644	}
645
646	/ inlining is important to hardwire a hot branch (template emulation) /
647	FORCE_INLINE_TEMPLATE
648	size_t ZSTD_HcFindBestMatch(
649	ZSTD_matchState_t* ms,
650	const BYTE* const ip, const BYTE* const iLimit,
651	size_t* offsetPtr,
652	const U32 mls, const ZSTD_dictMode_e dictMode)
653	{
654	const ZSTD_compressionParameters* const cParams = &ms->cParams;
655	U32* const chainTable = ms->chainTable;
656	const U32 chainSize = (`1` << cParams->chainLog);
657	const U32 chainMask = chainSize-`1`;
658	const BYTE* const base = ms->window.base;
659	const BYTE* const dictBase = ms->window.dictBase;
660	const U32 dictLimit = ms->window.dictLimit;
661	const BYTE* const prefixStart = base + dictLimit;
662	const BYTE* const dictEnd = dictBase + dictLimit;
663	const U32 curr = (U32)(ip-base);
664	const U32 maxDistance = `1U` << cParams->windowLog;
665	const U32 lowestValid = ms->window.lowLimit;
666	const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid;
667	const U32 isDictionary = (ms->loadedDictEnd != `0`);
668	const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance;
669	const U32 minChain = curr > chainSize ? curr - chainSize : `0`;
670	U32 nbAttempts = `1U` << cParams->searchLog;
671	size_t ml=`4`-`1`;
672
673	const ZSTD_matchState_t* const dms = ms->dictMatchState;
674	const U32 ddsHashLog = dictMode == ZSTD_dedicatedDictSearch
675	? dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG : `0`;
676	const size_t ddsIdx = dictMode == ZSTD_dedicatedDictSearch
677	? ZSTD_hashPtr(p: ip, hBits: ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG : `0`;
678
679	U32 matchIndex;
680
681	if (dictMode == ZSTD_dedicatedDictSearch) {
682	const U32* entry = &dms->hashTable[ddsIdx];
683	PREFETCH_L1(entry);
684	}
685
686	/ HC4 match finder /
687	matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls);
688
689	for ( ; (matchIndex>=lowLimit) & (nbAttempts>`0`) ; nbAttempts--) {
690	size_t currentMl=`0`;
691	if ((dictMode != ZSTD_extDict) \|\| matchIndex >= dictLimit) {
692	const BYTE* const match = base + matchIndex;
693	assert(matchIndex >= dictLimit); / ensures this is true if dictMode != ZSTD_extDict /
694	if (match[ml] == ip[ml]) / potentially better /
695	currentMl = ZSTD_count(pIn: ip, pMatch: match, pInLimit: iLimit);
696	} else {
697	const BYTE* const match = dictBase + matchIndex;
698	assert(match+`4` <= dictEnd);
699	if (MEM_read32(memPtr: match) == MEM_read32(memPtr: ip)) / assumption : matchIndex <= dictLimit-4 (by table construction) /
700	currentMl = ZSTD_count_2segments(ip: ip+`4`, match: match+`4`, iEnd: iLimit, mEnd: dictEnd, iStart: prefixStart) + `4`;
701	}
702
703	/ save best solution /
704	if (currentMl > ml) {
705	ml = currentMl;
706	*offsetPtr = STORE_OFFSET(curr - matchIndex);
707	if (ip+currentMl == iLimit) break; / best possible, avoids read overflow on next attempt /
708	}
709
710	if (matchIndex <= minChain) break;
711	matchIndex = NEXT_IN_CHAIN(matchIndex, chainMask);
712	}
713
714	assert(nbAttempts <= (`1U` << ZSTD_SEARCHLOG_MAX)); / Check we haven't underflowed. /
715	if (dictMode == ZSTD_dedicatedDictSearch) {
716	ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts, dms,
717	ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
718	} else if (dictMode == ZSTD_dictMatchState) {
719	const U32* const dmsChainTable = dms->chainTable;
720	const U32 dmsChainSize = (`1` << dms->cParams.chainLog);
721	const U32 dmsChainMask = dmsChainSize - `1`;
722	const U32 dmsLowestIndex = dms->window.dictLimit;
723	const BYTE* const dmsBase = dms->window.base;
724	const BYTE* const dmsEnd = dms->window.nextSrc;
725	const U32 dmsSize = (U32)(dmsEnd - dmsBase);
726	const U32 dmsIndexDelta = dictLimit - dmsSize;
727	const U32 dmsMinChain = dmsSize > dmsChainSize ? dmsSize - dmsChainSize : `0`;
728
729	matchIndex = dms->hashTable[ZSTD_hashPtr(p: ip, hBits: dms->cParams.hashLog, mls)];
730
731	for ( ; (matchIndex>=dmsLowestIndex) & (nbAttempts>`0`) ; nbAttempts--) {
732	size_t currentMl=`0`;
733	const BYTE* const match = dmsBase + matchIndex;
734	assert(match+`4` <= dmsEnd);
735	if (MEM_read32(memPtr: match) == MEM_read32(memPtr: ip)) / assumption : matchIndex <= dictLimit-4 (by table construction) /
736	currentMl = ZSTD_count_2segments(ip: ip+`4`, match: match+`4`, iEnd: iLimit, mEnd: dmsEnd, iStart: prefixStart) + `4`;
737
738	/ save best solution /
739	if (currentMl > ml) {
740	ml = currentMl;
741	assert(curr > matchIndex + dmsIndexDelta);
742	*offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta));
743	if (ip+currentMl == iLimit) break; / best possible, avoids read overflow on next attempt /
744	}
745
746	if (matchIndex <= dmsMinChain) break;
747
748	matchIndex = dmsChainTable[matchIndex & dmsChainMask];
749	}
750	}
751
752	return ml;
753	}
754
755	/* *********************************
756	* (SIMD) Row-based matchfinder
757	***********************************/
758	/ Constants for row-based hash /
759	#define ZSTD_ROW_HASH_TAG_OFFSET 16 /* byte offset of hashes in the match state's tagTable from the beginning of a row */
760	#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */
761	#define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1)
762	#define ZSTD_ROW_HASH_MAX_ENTRIES 64 /* absolute maximum number of entries per row, for all configurations */
763
764	#define ZSTD_ROW_HASH_CACHE_MASK (ZSTD_ROW_HASH_CACHE_SIZE - 1)
765
766	typedef U64 ZSTD_VecMask; / Clarifies when we are interacting with a U64 representing a mask of matches /
767
768	/ ZSTD_VecMask_next():*
769	* Starting from the LSB, returns the idx of the next non-zero bit.
770	* Basically counting the nb of trailing zeroes.
771	*/
772	static U32 ZSTD_VecMask_next(ZSTD_VecMask val) {
773	assert(val != `0`);
774	# if (defined(__GNUC__) && ((__GNUC__ > 3) \|\| ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))
775	if (sizeof(size_t) == `4`) {
776	U32 mostSignificantWord = (U32)(val >> `32`);
777	U32 leastSignificantWord = (U32)val;
778	if (leastSignificantWord == `0`) {
779	return `32` + (U32)__builtin_ctz(mostSignificantWord);
780	} else {
781	return (U32)__builtin_ctz(leastSignificantWord);
782	}
783	} else {
784	return (U32)__builtin_ctzll(val);
785	}
786	# else
787	/ Software ctz version: http://aggregate.org/MAGIC/#Trailing%20Zero%20Count*
788	* and: https://stackoverflow.com/questions/2709430/count-number-of-bits-in-a-64-bit-long-big-integer
789	*/
790	val = ~val & (val - `1ULL`); / Lowest set bit mask /
791	val = val - ((val >> `1`) & `0x5555555555555555`);
792	val = (val & `0x3333333333333333ULL`) + ((val >> `2`) & `0x3333333333333333ULL`);
793	return (U32)((((val + (val >> `4`)) & `0xF0F0F0F0F0F0F0FULL`) * `0x101010101010101ULL`) >> `56`);
794	# endif
795	}
796
797	/ ZSTD_rotateRight_():
798	* Rotates a bitfield to the right by "count" bits.
799	* https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts
800	*/
801	FORCE_INLINE_TEMPLATE
802	U64 ZSTD_rotateRight_U64(U64 const value, U32 count) {
803	assert(count < `64`);
804	count &= `0x3F`; / for fickle pattern recognition /
805	return (value >> count) \| (U64)(value << ((`0U` - count) & `0x3F`));
806	}
807
808	FORCE_INLINE_TEMPLATE
809	U32 ZSTD_rotateRight_U32(U32 const value, U32 count) {
810	assert(count < `32`);
811	count &= `0x1F`; / for fickle pattern recognition /
812	return (value >> count) \| (U32)(value << ((`0U` - count) & `0x1F`));
813	}
814
815	FORCE_INLINE_TEMPLATE
816	U16 ZSTD_rotateRight_U16(U16 const value, U32 count) {
817	assert(count < `16`);
818	count &= `0x0F`; / for fickle pattern recognition /
819	return (value >> count) \| (U16)(value << ((`0U` - count) & `0x0F`));
820	}
821
822	/ ZSTD_row_nextIndex():*
823	* Returns the next index to insert at within a tagTable row, and updates the "head"
824	* value to reflect the update. Essentially cycles backwards from [0, {entries per row})
825	*/
826	FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) {
827	U32 const next = (*tagRow - `1`) & rowMask;
828	*tagRow = (BYTE)next;
829	return next;
830	}
831
832	/ ZSTD_isAligned():*
833	* Checks that a pointer is aligned to "align" bytes which must be a power of 2.
834	*/
835	MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) {
836	assert((align & (align - `1`)) == `0`);
837	return (((size_t)ptr) & (align - `1`)) == `0`;
838	}
839
840	/ ZSTD_row_prefetch():*
841	* Performs prefetching for the hashTable and tagTable at a given row.
842	*/
843	FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* tagTable, U32 const relRow, U32 const rowLog) {
844	PREFETCH_L1(hashTable + relRow);
845	if (rowLog >= `5`) {
846	PREFETCH_L1(hashTable + relRow + `16`);
847	/ Note: prefetching more of the hash table does not appear to be beneficial for 128-entry rows /
848	}
849	PREFETCH_L1(tagTable + relRow);
850	if (rowLog == `6`) {
851	PREFETCH_L1(tagTable + relRow + `32`);
852	}
853	assert(rowLog == `4` \|\| rowLog == `5` \|\| rowLog == `6`);
854	assert(ZSTD_isAligned(hashTable + relRow, `64`)); / prefetched hash row always 64-byte aligned /
855	assert(ZSTD_isAligned(tagTable + relRow, (size_t)`1` << rowLog)); / prefetched tagRow sits on correct multiple of bytes (32,64,128) /
856	}
857
858	/ ZSTD_row_fillHashCache():*
859	* Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries,
860	* but not beyond iLimit.
861	*/
862	FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base,
863	U32 const rowLog, U32 const mls,
864	U32 idx, const BYTE* const iLimit)
865	{
866	U32 const* const hashTable = ms->hashTable;
867	U16 const* const tagTable = ms->tagTable;
868	U32 const hashLog = ms->rowHashLog;
869	U32 const maxElemsToPrefetch = (base + idx) > iLimit ? `0` : (U32)(iLimit - (base + idx) + `1`);
870	U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch);
871
872	for (; idx < lim; ++idx) {
873	U32 const hash = (U32)ZSTD_hashPtr(p: base + idx, hBits: hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
874	U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
875	ZSTD_row_prefetch(hashTable, tagTable, relRow: row, rowLog);
876	ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash;
877	}
878
879	DEBUGLOG(`6`, "ZSTD_row_fillHashCache(): [%u %u %u %u %u %u %u %u]", ms->hashCache[`0`], ms->hashCache[`1`],
880	ms->hashCache[`2`], ms->hashCache[`3`], ms->hashCache[`4`],
881	ms->hashCache[`5`], ms->hashCache[`6`], ms->hashCache[`7`]);
882	}
883
884	/ ZSTD_row_nextCachedHash():*
885	* Returns the hash of base + idx, and replaces the hash in the hash cache with the byte at
886	* base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable.
887	*/
888	FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable,
889	U16 const* tagTable, BYTE const* base,
890	U32 idx, U32 const hashLog,
891	U32 const rowLog, U32 const mls)
892	{
893	U32 const newHash = (U32)ZSTD_hashPtr(p: base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hBits: hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
894	U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
895	ZSTD_row_prefetch(hashTable, tagTable, relRow: row, rowLog);
896	{ U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK];
897	cache[idx & ZSTD_ROW_HASH_CACHE_MASK] = newHash;
898	return hash;
899	}
900	}
901
902	/ ZSTD_row_update_internalImpl():*
903	* Updates the hash table with positions starting from updateStartIdx until updateEndIdx.
904	*/
905	FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms,
906	U32 updateStartIdx, U32 const updateEndIdx,
907	U32 const mls, U32 const rowLog,
908	U32 const rowMask, U32 const useCache)
909	{
910	U32* const hashTable = ms->hashTable;
911	U16* const tagTable = ms->tagTable;
912	U32 const hashLog = ms->rowHashLog;
913	const BYTE* const base = ms->window.base;
914
915	DEBUGLOG(`6`, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx);
916	for (; updateStartIdx < updateEndIdx; ++updateStartIdx) {
917	U32 const hash = useCache ? ZSTD_row_nextCachedHash(cache: ms->hashCache, hashTable, tagTable, base, idx: updateStartIdx, hashLog, rowLog, mls)
918	: (U32)ZSTD_hashPtr(p: base + updateStartIdx, hBits: hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
919	U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
920	U32* const row = hashTable + relRow;
921	BYTE* tagRow = (BYTE)(tagTable + relRow); /* Though tagTable is laid out as a table of U16, each tag is only 1 byte.*
922	Explicit cast allows us to get exact desired position within each row /*
923	U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
924
925	assert(hash == ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls));
926	((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK;
927	row[pos] = updateStartIdx;
928	}
929	}
930
931	/ ZSTD_row_update_internal():*
932	* Inserts the byte at ip into the appropriate position in the hash table, and updates ms->nextToUpdate.
933	* Skips sections of long matches as is necessary.
934	*/
935	FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip,
936	U32 const mls, U32 const rowLog,
937	U32 const rowMask, U32 const useCache)
938	{
939	U32 idx = ms->nextToUpdate;
940	const BYTE* const base = ms->window.base;
941	const U32 target = (U32)(ip - base);
942	const U32 kSkipThreshold = `384`;
943	const U32 kMaxMatchStartPositionsToUpdate = `96`;
944	const U32 kMaxMatchEndPositionsToUpdate = `32`;
945
946	if (useCache) {
947	/ Only skip positions when using hash cache, i.e.*
948	* if we are loading a dict, don't skip anything.
949	* If we decide to skip, then we only update a set number
950	* of positions at the beginning and end of the match.
951	*/
952	if (UNLIKELY(target - idx > kSkipThreshold)) {
953	U32 const bound = idx + kMaxMatchStartPositionsToUpdate;
954	ZSTD_row_update_internalImpl(ms, updateStartIdx: idx, updateEndIdx: bound, mls, rowLog, rowMask, useCache);
955	idx = target - kMaxMatchEndPositionsToUpdate;
956	ZSTD_row_fillHashCache(ms, base, rowLog, mls, idx, iLimit: ip+`1`);
957	}
958	}
959	assert(target >= idx);
960	ZSTD_row_update_internalImpl(ms, updateStartIdx: idx, updateEndIdx: target, mls, rowLog, rowMask, useCache);
961	ms->nextToUpdate = target;
962	}
963
964	/ ZSTD_row_update():*
965	* External wrapper for ZSTD_row_update_internal(). Used for filling the hashtable during dictionary
966	* processing.
967	*/
968	void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) {
969	const U32 rowLog = BOUNDED(`4`, ms->cParams.searchLog, `6`);
970	const U32 rowMask = (`1u` << rowLog) - `1`;
971	const U32 mls = MIN(ms->cParams.minMatch, `6` / mls caps out at 6 /);
972
973	DEBUGLOG(`5`, "ZSTD_row_update(), rowLog=%u", rowLog);
974	ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, useCache: `0` / dont use cache /);
975	}
976
977	#if defined(ZSTD_ARCH_X86_SSE2)
978	FORCE_INLINE_TEMPLATE ZSTD_VecMask
979	ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U32 head)
980	{
981	const __m128i comparisonMask = _mm_set1_epi8((char)tag);
982	int matches[`4`] = {`0`};
983	int i;
984	assert(nbChunks == `1` \|\| nbChunks == `2` \|\| nbChunks == `4`);
985	for (i=`0`; i<nbChunks; i++) {
986	const __m128i chunk = _mm_loadu_si128((const __m128i)(const* void)(src + `16`i));
987	const __m128i equalMask = _mm_cmpeq_epi8(chunk, comparisonMask);
988	matches[i] = _mm_movemask_epi8(equalMask);
989	}
990	if (nbChunks == `1`) return ZSTD_rotateRight_U16((U16)matches[`0`], head);
991	if (nbChunks == `2`) return ZSTD_rotateRight_U32((U32)matches[`1`] << `16` \| (U32)matches[`0`], head);
992	assert(nbChunks == `4`);
993	return ZSTD_rotateRight_U64((U64)matches[`3`] << `48` \| (U64)matches[`2`] << `32` \| (U64)matches[`1`] << `16` \| (U64)matches[`0`], head);
994	}
995	#endif
996
997	/ Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches*
998	* the hash at the nth position in a row of the tagTable.
999	* Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield
1000	* to match up with the actual layout of the entries within the hashTable */
1001	FORCE_INLINE_TEMPLATE ZSTD_VecMask
1002	ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries)
1003	{
1004	const BYTE* const src = tagRow + ZSTD_ROW_HASH_TAG_OFFSET;
1005	assert((rowEntries == `16`) \|\| (rowEntries == `32`) \|\| rowEntries == `64`);
1006	assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES);
1007
1008	#if defined(ZSTD_ARCH_X86_SSE2)
1009
1010	return ZSTD_row_getSSEMask(rowEntries / `16`, src, tag, head);
1011
1012	#else /* SW or NEON-LE */
1013
1014	# if defined(ZSTD_ARCH_ARM_NEON)
1015	/ This NEON path only works for little endian - otherwise use SWAR below /
1016	if (MEM_isLittleEndian()) {
1017	if (rowEntries == `16`) {
1018	const uint8x16_t chunk = vld1q_u8(src);
1019	const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag)));
1020	const uint16x8_t t0 = vshlq_n_u16(equalMask, `7`);
1021	const uint32x4_t t1 = vreinterpretq_u32_u16(vsriq_n_u16(t0, t0, `14`));
1022	const uint64x2_t t2 = vreinterpretq_u64_u32(vshrq_n_u32(t1, `14`));
1023	const uint8x16_t t3 = vreinterpretq_u8_u64(vsraq_n_u64(t2, t2, `28`));
1024	const U16 hi = (U16)vgetq_lane_u8(t3, `8`);
1025	const U16 lo = (U16)vgetq_lane_u8(t3, `0`);
1026	return ZSTD_rotateRight_U16((hi << `8`) \| lo, head);
1027	} else if (rowEntries == `32`) {
1028	const uint16x8x2_t chunk = vld2q_u16((const U16)(const* void*)src);
1029	const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[`0`]);
1030	const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[`1`]);
1031	const uint8x16_t equalMask0 = vceqq_u8(chunk0, vdupq_n_u8(tag));
1032	const uint8x16_t equalMask1 = vceqq_u8(chunk1, vdupq_n_u8(tag));
1033	const int8x8_t pack0 = vqmovn_s16(vreinterpretq_s16_u8(equalMask0));
1034	const int8x8_t pack1 = vqmovn_s16(vreinterpretq_s16_u8(equalMask1));
1035	const uint8x8_t t0 = vreinterpret_u8_s8(pack0);
1036	const uint8x8_t t1 = vreinterpret_u8_s8(pack1);
1037	const uint8x8_t t2 = vsri_n_u8(t1, t0, `2`);
1038	const uint8x8x2_t t3 = vuzp_u8(t2, t0);
1039	const uint8x8_t t4 = vsri_n_u8(t3.val[`1`], t3.val[`0`], `4`);
1040	const U32 matches = vget_lane_u32(vreinterpret_u32_u8(t4), `0`);
1041	return ZSTD_rotateRight_U32(matches, head);
1042	} else { / rowEntries == 64 /
1043	const uint8x16x4_t chunk = vld4q_u8(src);
1044	const uint8x16_t dup = vdupq_n_u8(tag);
1045	const uint8x16_t cmp0 = vceqq_u8(chunk.val[`0`], dup);
1046	const uint8x16_t cmp1 = vceqq_u8(chunk.val[`1`], dup);
1047	const uint8x16_t cmp2 = vceqq_u8(chunk.val[`2`], dup);
1048	const uint8x16_t cmp3 = vceqq_u8(chunk.val[`3`], dup);
1049
1050	const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, `1`);
1051	const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, `1`);
1052	const uint8x16_t t2 = vsriq_n_u8(t1, t0, `2`);
1053	const uint8x16_t t3 = vsriq_n_u8(t2, t2, `4`);
1054	const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), `4`);
1055	const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), `0`);
1056	return ZSTD_rotateRight_U64(matches, head);
1057	}
1058	}
1059	# endif /* ZSTD_ARCH_ARM_NEON */
1060	/ SWAR /
1061	{ const size_t chunkSize = sizeof(size_t);
1062	const size_t shiftAmount = ((chunkSize * `8`) - chunkSize);
1063	const size_t xFF = ~((size_t)`0`);
1064	const size_t x01 = xFF / `0xFF`;
1065	const size_t x80 = x01 << `7`;
1066	const size_t splatChar = tag * x01;
1067	ZSTD_VecMask matches = `0`;
1068	int i = rowEntries - chunkSize;
1069	assert((sizeof(size_t) == `4`) \|\| (sizeof(size_t) == `8`));
1070	if (MEM_isLittleEndian()) { / runtime check so have two loops /
1071	const size_t extractMagic = (xFF / `0x7F`) >> chunkSize;
1072	do {
1073	size_t chunk = MEM_readST(memPtr: &src[i]);
1074	chunk ^= splatChar;
1075	chunk = (((chunk \| x80) - x01) \| chunk) & x80;
1076	matches <<= chunkSize;
1077	matches \|= (chunk * extractMagic) >> shiftAmount;
1078	i -= chunkSize;
1079	} while (i >= `0`);
1080	} else { / big endian: reverse bits during extraction /
1081	const size_t msb = xFF ^ (xFF >> `1`);
1082	const size_t extractMagic = (msb / `0x1FF`) \| msb;
1083	do {
1084	size_t chunk = MEM_readST(memPtr: &src[i]);
1085	chunk ^= splatChar;
1086	chunk = (((chunk \| x80) - x01) \| chunk) & x80;
1087	matches <<= chunkSize;
1088	matches \|= ((chunk >> `7`) * extractMagic) >> shiftAmount;
1089	i -= chunkSize;
1090	} while (i >= `0`);
1091	}
1092	matches = ~matches;
1093	if (rowEntries == `16`) {
1094	return ZSTD_rotateRight_U16(value: (U16)matches, count: head);
1095	} else if (rowEntries == `32`) {
1096	return ZSTD_rotateRight_U32(value: (U32)matches, count: head);
1097	} else {
1098	return ZSTD_rotateRight_U64(value: (U64)matches, count: head);
1099	}
1100	}
1101	#endif
1102	}
1103
1104	/ The high-level approach of the SIMD row based match finder is as follows:*
1105	* - Figure out where to insert the new entry:
1106	* - Generate a hash from a byte along with an additional 1-byte "short hash". The additional byte is our "tag"
1107	* - The hashTable is effectively split into groups or "rows" of 16 or 32 entries of U32, and the hash determines
1108	* which row to insert into.
1109	* - Determine the correct position within the row to insert the entry into. Each row of 16 or 32 can
1110	* be considered as a circular buffer with a "head" index that resides in the tagTable.
1111	* - Also insert the "tag" into the equivalent row and position in the tagTable.
1112	* - Note: The tagTable has 17 or 33 1-byte entries per row, due to 16 or 32 tags, and 1 "head" entry.
1113	* The 17 or 33 entry rows are spaced out to occur every 32 or 64 bytes, respectively,
1114	* for alignment/performance reasons, leaving some bytes unused.
1115	* - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte "short hash" and
1116	* generate a bitfield that we can cycle through to check the collisions in the hash table.
1117	* - Pick the longest match.
1118	*/
1119	FORCE_INLINE_TEMPLATE
1120	size_t ZSTD_RowFindBestMatch(
1121	ZSTD_matchState_t* ms,
1122	const BYTE* const ip, const BYTE* const iLimit,
1123	size_t* offsetPtr,
1124	const U32 mls, const ZSTD_dictMode_e dictMode,
1125	const U32 rowLog)
1126	{
1127	U32* const hashTable = ms->hashTable;
1128	U16* const tagTable = ms->tagTable;
1129	U32* const hashCache = ms->hashCache;
1130	const U32 hashLog = ms->rowHashLog;
1131	const ZSTD_compressionParameters* const cParams = &ms->cParams;
1132	const BYTE* const base = ms->window.base;
1133	const BYTE* const dictBase = ms->window.dictBase;
1134	const U32 dictLimit = ms->window.dictLimit;
1135	const BYTE* const prefixStart = base + dictLimit;
1136	const BYTE* const dictEnd = dictBase + dictLimit;
1137	const U32 curr = (U32)(ip-base);
1138	const U32 maxDistance = `1U` << cParams->windowLog;
1139	const U32 lowestValid = ms->window.lowLimit;
1140	const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid;
1141	const U32 isDictionary = (ms->loadedDictEnd != `0`);
1142	const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance;
1143	const U32 rowEntries = (`1U` << rowLog);
1144	const U32 rowMask = rowEntries - `1`;
1145	const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); / nb of searches is capped at nb entries per row /
1146	U32 nbAttempts = `1U` << cappedSearchLog;
1147	size_t ml=`4`-`1`;
1148
1149	/ DMS/DDS variables that may be referenced laster /
1150	const ZSTD_matchState_t* const dms = ms->dictMatchState;
1151
1152	/ Initialize the following variables to satisfy static analyzer /
1153	size_t ddsIdx = `0`;
1154	U32 ddsExtraAttempts = `0`; / cctx hash tables are limited in searches, but allow extra searches into DDS /
1155	U32 dmsTag = `0`;
1156	U32* dmsRow = NULL;
1157	BYTE* dmsTagRow = NULL;
1158
1159	if (dictMode == ZSTD_dedicatedDictSearch) {
1160	const U32 ddsHashLog = dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
1161	{ / Prefetch DDS hashtable entry /
1162	ddsIdx = ZSTD_hashPtr(p: ip, hBits: ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG;
1163	PREFETCH_L1(&dms->hashTable[ddsIdx]);
1164	}
1165	ddsExtraAttempts = cParams->searchLog > rowLog ? `1U` << (cParams->searchLog - rowLog) : `0`;
1166	}
1167
1168	if (dictMode == ZSTD_dictMatchState) {
1169	/ Prefetch DMS rows /
1170	U32* const dmsHashTable = dms->hashTable;
1171	U16* const dmsTagTable = dms->tagTable;
1172	U32 const dmsHash = (U32)ZSTD_hashPtr(p: ip, hBits: dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
1173	U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
1174	dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK;
1175	dmsTagRow = (BYTE*)(dmsTagTable + dmsRelRow);
1176	dmsRow = dmsHashTable + dmsRelRow;
1177	ZSTD_row_prefetch(hashTable: dmsHashTable, tagTable: dmsTagTable, relRow: dmsRelRow, rowLog);
1178	}
1179
1180	/ Update the hashTable and tagTable up to (but not including) ip /
1181	ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, useCache: `1` / useCache /);
1182	{ / Get the hash for ip, compute the appropriate row /
1183	U32 const hash = ZSTD_row_nextCachedHash(cache: hashCache, hashTable, tagTable, base, idx: curr, hashLog, rowLog, mls);
1184	U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
1185	U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK;
1186	U32* const row = hashTable + relRow;
1187	BYTE* tagRow = (BYTE*)(tagTable + relRow);
1188	U32 const head = *tagRow & rowMask;
1189	U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
1190	size_t numMatches = `0`;
1191	size_t currMatch = `0`;
1192	ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, tag: (BYTE)tag, head, rowEntries);
1193
1194	/ Cycle through the matches and prefetch /
1195	for (; (matches > `0`) && (nbAttempts > `0`); --nbAttempts, matches &= (matches - `1`)) {
1196	U32 const matchPos = (head + ZSTD_VecMask_next(val: matches)) & rowMask;
1197	U32 const matchIndex = row[matchPos];
1198	assert(numMatches < rowEntries);
1199	if (matchIndex < lowLimit)
1200	break;
1201	if ((dictMode != ZSTD_extDict) \|\| matchIndex >= dictLimit) {
1202	PREFETCH_L1(base + matchIndex);
1203	} else {
1204	PREFETCH_L1(dictBase + matchIndex);
1205	}
1206	matchBuffer[numMatches++] = matchIndex;
1207	}
1208
1209	/ Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop*
1210	in ZSTD_row_update_internal() at the next search. /*
1211	{
1212	U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
1213	tagRow[pos + ZSTD_ROW_HASH_TAG_OFFSET] = (BYTE)tag;
1214	row[pos] = ms->nextToUpdate++;
1215	}
1216
1217	/ Return the longest match /
1218	for (; currMatch < numMatches; ++currMatch) {
1219	U32 const matchIndex = matchBuffer[currMatch];
1220	size_t currentMl=`0`;
1221	assert(matchIndex < curr);
1222	assert(matchIndex >= lowLimit);
1223
1224	if ((dictMode != ZSTD_extDict) \|\| matchIndex >= dictLimit) {
1225	const BYTE* const match = base + matchIndex;
1226	assert(matchIndex >= dictLimit); / ensures this is true if dictMode != ZSTD_extDict /
1227	if (match[ml] == ip[ml]) / potentially better /
1228	currentMl = ZSTD_count(pIn: ip, pMatch: match, pInLimit: iLimit);
1229	} else {
1230	const BYTE* const match = dictBase + matchIndex;
1231	assert(match+`4` <= dictEnd);
1232	if (MEM_read32(memPtr: match) == MEM_read32(memPtr: ip)) / assumption : matchIndex <= dictLimit-4 (by table construction) /
1233	currentMl = ZSTD_count_2segments(ip: ip+`4`, match: match+`4`, iEnd: iLimit, mEnd: dictEnd, iStart: prefixStart) + `4`;
1234	}
1235
1236	/ Save best solution /
1237	if (currentMl > ml) {
1238	ml = currentMl;
1239	*offsetPtr = STORE_OFFSET(curr - matchIndex);
1240	if (ip+currentMl == iLimit) break; / best possible, avoids read overflow on next attempt /
1241	}
1242	}
1243	}
1244
1245	assert(nbAttempts <= (`1U` << ZSTD_SEARCHLOG_MAX)); / Check we haven't underflowed. /
1246	if (dictMode == ZSTD_dedicatedDictSearch) {
1247	ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts: nbAttempts + ddsExtraAttempts, dms,
1248	ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
1249	} else if (dictMode == ZSTD_dictMatchState) {
1250	/ TODO: Measure and potentially add prefetching to DMS /
1251	const U32 dmsLowestIndex = dms->window.dictLimit;
1252	const BYTE* const dmsBase = dms->window.base;
1253	const BYTE* const dmsEnd = dms->window.nextSrc;
1254	const U32 dmsSize = (U32)(dmsEnd - dmsBase);
1255	const U32 dmsIndexDelta = dictLimit - dmsSize;
1256
1257	{ U32 const head = *dmsTagRow & rowMask;
1258	U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
1259	size_t numMatches = `0`;
1260	size_t currMatch = `0`;
1261	ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow: dmsTagRow, tag: (BYTE)dmsTag, head, rowEntries);
1262
1263	for (; (matches > `0`) && (nbAttempts > `0`); --nbAttempts, matches &= (matches - `1`)) {
1264	U32 const matchPos = (head + ZSTD_VecMask_next(val: matches)) & rowMask;
1265	U32 const matchIndex = dmsRow[matchPos];
1266	if (matchIndex < dmsLowestIndex)
1267	break;
1268	PREFETCH_L1(dmsBase + matchIndex);
1269	matchBuffer[numMatches++] = matchIndex;
1270	}
1271
1272	/ Return the longest match /
1273	for (; currMatch < numMatches; ++currMatch) {
1274	U32 const matchIndex = matchBuffer[currMatch];
1275	size_t currentMl=`0`;
1276	assert(matchIndex >= dmsLowestIndex);
1277	assert(matchIndex < curr);
1278
1279	{ const BYTE* const match = dmsBase + matchIndex;
1280	assert(match+`4` <= dmsEnd);
1281	if (MEM_read32(memPtr: match) == MEM_read32(memPtr: ip))
1282	currentMl = ZSTD_count_2segments(ip: ip+`4`, match: match+`4`, iEnd: iLimit, mEnd: dmsEnd, iStart: prefixStart) + `4`;
1283	}
1284
1285	if (currentMl > ml) {
1286	ml = currentMl;
1287	assert(curr > matchIndex + dmsIndexDelta);
1288	*offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta));
1289	if (ip+currentMl == iLimit) break;
1290	}
1291	}
1292	}
1293	}
1294	return ml;
1295	}
1296
1297
1298	/*
1299	* Generate search functions templated on (dictMode, mls, rowLog).
1300	* These functions are outlined for code size & compilation time.
1301	* ZSTD_searchMax() dispatches to the correct implementation function.
1302	*
1303	* TODO: The start of the search function involves loading and calculating a
1304	* bunch of constants from the ZSTD_matchState_t. These computations could be
1305	* done in an initialization function, and saved somewhere in the match state.
1306	* Then we could pass a pointer to the saved state instead of the match state,
1307	* and avoid duplicate computations.
1308	*
1309	* TODO: Move the match re-winding into searchMax. This improves compression
1310	* ratio, and unlocks further simplifications with the next TODO.
1311	*
1312	* TODO: Try moving the repcode search into searchMax. After the re-winding
1313	* and repcode search are in searchMax, there is no more logic in the match
1314	* finder loop that requires knowledge about the dictMode. So we should be
1315	* able to avoid force inlining it, and we can join the extDict loop with
1316	* the single segment loop. It should go in searchMax instead of its own
1317	* function to avoid having multiple virtual function calls per search.
1318	*/
1319
1320	#define ZSTD_BT_SEARCH_FN(dictMode, mls) ZSTD_BtFindBestMatch_##dictMode##_##mls
1321	#define ZSTD_HC_SEARCH_FN(dictMode, mls) ZSTD_HcFindBestMatch_##dictMode##_##mls
1322	#define ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog
1323
1324	#define ZSTD_SEARCH_FN_ATTRS FORCE_NOINLINE
1325
1326	#define GEN_ZSTD_BT_SEARCH_FN(dictMode, mls) \
1327	ZSTD_SEARCH_FN_ATTRS size_t ZSTD_BT_SEARCH_FN(dictMode, mls)( \
1328	ZSTD_matchState_t* ms, \
1329	const BYTE* ip, const BYTE* const iLimit, \
1330	size_t* offBasePtr) \
1331	{ \
1332	assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
1333	return ZSTD_BtFindBestMatch(ms, ip, iLimit, offBasePtr, mls, ZSTD_##dictMode); \
1334	} \
1335
1336	#define GEN_ZSTD_HC_SEARCH_FN(dictMode, mls) \
1337	ZSTD_SEARCH_FN_ATTRS size_t ZSTD_HC_SEARCH_FN(dictMode, mls)( \
1338	ZSTD_matchState_t* ms, \
1339	const BYTE* ip, const BYTE* const iLimit, \
1340	size_t* offsetPtr) \
1341	{ \
1342	assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
1343	return ZSTD_HcFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \
1344	} \
1345
1346	#define GEN_ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) \
1347	ZSTD_SEARCH_FN_ATTRS size_t ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)( \
1348	ZSTD_matchState_t* ms, \
1349	const BYTE* ip, const BYTE* const iLimit, \
1350	size_t* offsetPtr) \
1351	{ \
1352	assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
1353	assert(MAX(4, MIN(6, ms->cParams.searchLog)) == rowLog); \
1354	return ZSTD_RowFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode, rowLog); \
1355	} \
1356
1357	#define ZSTD_FOR_EACH_ROWLOG(X, dictMode, mls) \
1358	X(dictMode, mls, 4) \
1359	X(dictMode, mls, 5) \
1360	X(dictMode, mls, 6)
1361
1362	#define ZSTD_FOR_EACH_MLS_ROWLOG(X, dictMode) \
1363	ZSTD_FOR_EACH_ROWLOG(X, dictMode, 4) \
1364	ZSTD_FOR_EACH_ROWLOG(X, dictMode, 5) \
1365	ZSTD_FOR_EACH_ROWLOG(X, dictMode, 6)
1366
1367	#define ZSTD_FOR_EACH_MLS(X, dictMode) \
1368	X(dictMode, 4) \
1369	X(dictMode, 5) \
1370	X(dictMode, 6)
1371
1372	#define ZSTD_FOR_EACH_DICT_MODE(X, ...) \
1373	X(__VA_ARGS__, noDict) \
1374	X(__VA_ARGS__, extDict) \
1375	X(__VA_ARGS__, dictMatchState) \
1376	X(__VA_ARGS__, dedicatedDictSearch)
1377
1378	/ Generate row search fns for each combination of (dictMode, mls, rowLog) /
1379	ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS_ROWLOG, GEN_ZSTD_ROW_SEARCH_FN)
1380	/ Generate binary Tree search fns for each combination of (dictMode, mls) /
1381	ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_BT_SEARCH_FN)
1382	/ Generate hash chain search fns for each combination of (dictMode, mls) /
1383	ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_HC_SEARCH_FN)
1384
1385	typedef enum { search_hashChain=`0`, search_binaryTree=`1`, search_rowHash=`2` } searchMethod_e;
1386
1387	#define GEN_ZSTD_CALL_BT_SEARCH_FN(dictMode, mls) \
1388	case mls: \
1389	return ZSTD_BT_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr);
1390	#define GEN_ZSTD_CALL_HC_SEARCH_FN(dictMode, mls) \
1391	case mls: \
1392	return ZSTD_HC_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr);
1393	#define GEN_ZSTD_CALL_ROW_SEARCH_FN(dictMode, mls, rowLog) \
1394	case rowLog: \
1395	return ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)(ms, ip, iend, offsetPtr);
1396
1397	#define ZSTD_SWITCH_MLS(X, dictMode) \
1398	switch (mls) { \
1399	ZSTD_FOR_EACH_MLS(X, dictMode) \
1400	}
1401
1402	#define ZSTD_SWITCH_ROWLOG(dictMode, mls) \
1403	case mls: \
1404	switch (rowLog) { \
1405	ZSTD_FOR_EACH_ROWLOG(GEN_ZSTD_CALL_ROW_SEARCH_FN, dictMode, mls) \
1406	} \
1407	ZSTD_UNREACHABLE; \
1408	break;
1409
1410	#define ZSTD_SWITCH_SEARCH_METHOD(dictMode) \
1411	switch (searchMethod) { \
1412	case search_hashChain: \
1413	ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_HC_SEARCH_FN, dictMode) \
1414	break; \
1415	case search_binaryTree: \
1416	ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_BT_SEARCH_FN, dictMode) \
1417	break; \
1418	case search_rowHash: \
1419	ZSTD_SWITCH_MLS(ZSTD_SWITCH_ROWLOG, dictMode) \
1420	break; \
1421	} \
1422	ZSTD_UNREACHABLE;
1423
1424	/*
1425	* Searches for the longest match at @p ip.
1426	* Dispatches to the correct implementation function based on the
1427	* (searchMethod, dictMode, mls, rowLog). We use switch statements
1428	* here instead of using an indirect function call through a function
1429	* pointer because after Spectre and Meltdown mitigations, indirect
1430	* function calls can be very costly, especially in the kernel.
1431	*
1432	* NOTE: dictMode and searchMethod should be templated, so those switch
1433	* statements should be optimized out. Only the mls & rowLog switches
1434	* should be left.
1435	*
1436	* @param ms The match state.
1437	* @param ip The position to search at.
1438	* @param iend The end of the input data.
1439	* @param[out] offsetPtr Stores the match offset into this pointer.
1440	* @param mls The minimum search length, in the range [4, 6].
1441	* @param rowLog The row log (if applicable), in the range [4, 6].
1442	* @param searchMethod The search method to use (templated).
1443	* @param dictMode The dictMode (templated).
1444	*
1445	* @returns The length of the longest match found, or < mls if no match is found.
1446	* If a match is found its offset is stored in @p offsetPtr.
1447	*/
1448	FORCE_INLINE_TEMPLATE size_t ZSTD_searchMax(
1449	ZSTD_matchState_t* ms,
1450	const BYTE* ip,
1451	const BYTE* iend,
1452	size_t* offsetPtr,
1453	U32 const mls,
1454	U32 const rowLog,
1455	searchMethod_e const searchMethod,
1456	ZSTD_dictMode_e const dictMode)
1457	{
1458	if (dictMode == ZSTD_noDict) {
1459	ZSTD_SWITCH_SEARCH_METHOD(noDict)
1460	} else if (dictMode == ZSTD_extDict) {
1461	ZSTD_SWITCH_SEARCH_METHOD(extDict)
1462	} else if (dictMode == ZSTD_dictMatchState) {
1463	ZSTD_SWITCH_SEARCH_METHOD(dictMatchState)
1464	} else if (dictMode == ZSTD_dedicatedDictSearch) {
1465	ZSTD_SWITCH_SEARCH_METHOD(dedicatedDictSearch)
1466	}
1467	ZSTD_UNREACHABLE;
1468	return `0`;
1469	}
1470
1471	/* *******************************
1472	* Common parser - lazy strategy
1473	*********************************/
1474
1475	FORCE_INLINE_TEMPLATE size_t
1476	ZSTD_compressBlock_lazy_generic(
1477	ZSTD_matchState_t* ms, seqStore_t* seqStore,
1478	U32 rep[ZSTD_REP_NUM],
1479	const void* src, size_t srcSize,
1480	const searchMethod_e searchMethod, const U32 depth,
1481	ZSTD_dictMode_e const dictMode)
1482	{
1483	const BYTE* const istart = (const BYTE*)src;
1484	const BYTE* ip = istart;
1485	const BYTE* anchor = istart;
1486	const BYTE* const iend = istart + srcSize;
1487	const BYTE* const ilimit = (searchMethod == search_rowHash) ? iend - `8` - ZSTD_ROW_HASH_CACHE_SIZE : iend - `8`;
1488	const BYTE* const base = ms->window.base;
1489	const U32 prefixLowestIndex = ms->window.dictLimit;
1490	const BYTE* const prefixLowest = base + prefixLowestIndex;
1491	const U32 mls = BOUNDED(`4`, ms->cParams.minMatch, `6`);
1492	const U32 rowLog = BOUNDED(`4`, ms->cParams.searchLog, `6`);
1493
1494	U32 offset_1 = rep[`0`], offset_2 = rep[`1`], savedOffset=`0`;
1495
1496	const int isDMS = dictMode == ZSTD_dictMatchState;
1497	const int isDDS = dictMode == ZSTD_dedicatedDictSearch;
1498	const int isDxS = isDMS \|\| isDDS;
1499	const ZSTD_matchState_t* const dms = ms->dictMatchState;
1500	const U32 dictLowestIndex = isDxS ? dms->window.dictLimit : `0`;
1501	const BYTE* const dictBase = isDxS ? dms->window.base : NULL;
1502	const BYTE* const dictLowest = isDxS ? dictBase + dictLowestIndex : NULL;
1503	const BYTE* const dictEnd = isDxS ? dms->window.nextSrc : NULL;
1504	const U32 dictIndexDelta = isDxS ?
1505	prefixLowestIndex - (U32)(dictEnd - dictBase) :
1506	`0`;
1507	const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictLowest));
1508
1509	DEBUGLOG(`5`, "ZSTD_compressBlock_lazy_generic (dictMode=%u) (searchFunc=%u)", (U32)dictMode, (U32)searchMethod);
1510	ip += (dictAndPrefixLength == `0`);
1511	if (dictMode == ZSTD_noDict) {
1512	U32 const curr = (U32)(ip - base);
1513	U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, windowLog: ms->cParams.windowLog);
1514	U32 const maxRep = curr - windowLow;
1515	if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = `0`;
1516	if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = `0`;
1517	}
1518	if (isDxS) {
1519	/ dictMatchState repCode checks don't currently handle repCode == 0*
1520	* disabling. */
1521	assert(offset_1 <= dictAndPrefixLength);
1522	assert(offset_2 <= dictAndPrefixLength);
1523	}
1524
1525	if (searchMethod == search_rowHash) {
1526	ZSTD_row_fillHashCache(ms, base, rowLog,
1527	MIN(ms->cParams.minMatch, `6` / mls caps out at 6 /),
1528	idx: ms->nextToUpdate, iLimit: ilimit);
1529	}
1530
1531	/ Match Loop /
1532	#if defined(__x86_64__)
1533	/ I've measured random a 5% speed loss on levels 5 & 6 (greedy) when the*
1534	* code alignment is perturbed. To fix the instability align the loop on 32-bytes.
1535	*/
1536	__asm__(".p2align 5");
1537	#endif
1538	while (ip < ilimit) {
1539	size_t matchLength=`0`;
1540	size_t offcode=STORE_REPCODE_1;
1541	const BYTE* start=ip+`1`;
1542	DEBUGLOG(`7`, "search baseline (depth 0)");
1543
1544	/ check repCode /
1545	if (isDxS) {
1546	const U32 repIndex = (U32)(ip - base) + `1` - offset_1;
1547	const BYTE* repMatch = ((dictMode == ZSTD_dictMatchState \|\| dictMode == ZSTD_dedicatedDictSearch)
1548	&& repIndex < prefixLowestIndex) ?
1549	dictBase + (repIndex - dictIndexDelta) :
1550	base + repIndex;
1551	if (((U32)((prefixLowestIndex-`1`) - repIndex) >= `3` / intentional underflow /)
1552	&& (MEM_read32(memPtr: repMatch) == MEM_read32(memPtr: ip+`1`)) ) {
1553	const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
1554	matchLength = ZSTD_count_2segments(ip: ip+`1`+`4`, match: repMatch+`4`, iEnd: iend, mEnd: repMatchEnd, iStart: prefixLowest) + `4`;
1555	if (depth==`0`) goto _storeSequence;
1556	}
1557	}
1558	if ( dictMode == ZSTD_noDict
1559	&& ((offset_1 > `0`) & (MEM_read32(memPtr: ip+`1`-offset_1) == MEM_read32(memPtr: ip+`1`)))) {
1560	matchLength = ZSTD_count(pIn: ip+`1`+`4`, pMatch: ip+`1`+`4`-offset_1, pInLimit: iend) + `4`;
1561	if (depth==`0`) goto _storeSequence;
1562	}
1563
1564	/ first search (depth 0) /
1565	{ size_t offsetFound = `999999999`;
1566	size_t const ml2 = ZSTD_searchMax(ms, ip, iend, offsetPtr: &offsetFound, mls, rowLog, searchMethod, dictMode);
1567	if (ml2 > matchLength)
1568	matchLength = ml2, start = ip, offcode=offsetFound;
1569	}
1570
1571	if (matchLength < `4`) {
1572	ip += ((ip-anchor) >> kSearchStrength) + `1`; / jump faster over incompressible sections /
1573	continue;
1574	}
1575
1576	/ let's try to find a better solution /
1577	if (depth>=`1`)
1578	while (ip<ilimit) {
1579	DEBUGLOG(`7`, "search depth 1");
1580	ip ++;
1581	if ( (dictMode == ZSTD_noDict)
1582	&& (offcode) && ((offset_1>`0`) & (MEM_read32(memPtr: ip) == MEM_read32(memPtr: ip - offset_1)))) {
1583	size_t const mlRep = ZSTD_count(pIn: ip+`4`, pMatch: ip+`4`-offset_1, pInLimit: iend) + `4`;
1584	int const gain2 = (int)(mlRep * `3`);
1585	int const gain1 = (int)(matchLength*`3` - ZSTD_highbit32(val: (U32)STORED_TO_OFFBASE(offcode)) + `1`);
1586	if ((mlRep >= `4`) && (gain2 > gain1))
1587	matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
1588	}
1589	if (isDxS) {
1590	const U32 repIndex = (U32)(ip - base) - offset_1;
1591	const BYTE* repMatch = repIndex < prefixLowestIndex ?
1592	dictBase + (repIndex - dictIndexDelta) :
1593	base + repIndex;
1594	if (((U32)((prefixLowestIndex-`1`) - repIndex) >= `3` / intentional underflow /)
1595	&& (MEM_read32(memPtr: repMatch) == MEM_read32(memPtr: ip)) ) {
1596	const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
1597	size_t const mlRep = ZSTD_count_2segments(ip: ip+`4`, match: repMatch+`4`, iEnd: iend, mEnd: repMatchEnd, iStart: prefixLowest) + `4`;
1598	int const gain2 = (int)(mlRep * `3`);
1599	int const gain1 = (int)(matchLength*`3` - ZSTD_highbit32(val: (U32)STORED_TO_OFFBASE(offcode)) + `1`);
1600	if ((mlRep >= `4`) && (gain2 > gain1))
1601	matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
1602	}
1603	}
1604	{ size_t offset2=`999999999`;
1605	size_t const ml2 = ZSTD_searchMax(ms, ip, iend, offsetPtr: &offset2, mls, rowLog, searchMethod, dictMode);
1606	int const gain2 = (int)(ml2`4` - ZSTD_highbit32(val: (U32)STORED_TO_OFFBASE(offset2))); /* raw approx /
1607	int const gain1 = (int)(matchLength*`4` - ZSTD_highbit32(val: (U32)STORED_TO_OFFBASE(offcode)) + `4`);
1608	if ((ml2 >= `4`) && (gain2 > gain1)) {
1609	matchLength = ml2, offcode = offset2, start = ip;
1610	continue; / search a better one /
1611	} }
1612
1613	/ let's find an even better one /
1614	if ((depth==`2`) && (ip<ilimit)) {
1615	DEBUGLOG(`7`, "search depth 2");
1616	ip ++;
1617	if ( (dictMode == ZSTD_noDict)
1618	&& (offcode) && ((offset_1>`0`) & (MEM_read32(memPtr: ip) == MEM_read32(memPtr: ip - offset_1)))) {
1619	size_t const mlRep = ZSTD_count(pIn: ip+`4`, pMatch: ip+`4`-offset_1, pInLimit: iend) + `4`;
1620	int const gain2 = (int)(mlRep * `4`);
1621	int const gain1 = (int)(matchLength*`4` - ZSTD_highbit32(val: (U32)STORED_TO_OFFBASE(offcode)) + `1`);
1622	if ((mlRep >= `4`) && (gain2 > gain1))
1623	matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
1624	}
1625	if (isDxS) {
1626	const U32 repIndex = (U32)(ip - base) - offset_1;
1627	const BYTE* repMatch = repIndex < prefixLowestIndex ?
1628	dictBase + (repIndex - dictIndexDelta) :
1629	base + repIndex;
1630	if (((U32)((prefixLowestIndex-`1`) - repIndex) >= `3` / intentional underflow /)
1631	&& (MEM_read32(memPtr: repMatch) == MEM_read32(memPtr: ip)) ) {
1632	const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
1633	size_t const mlRep = ZSTD_count_2segments(ip: ip+`4`, match: repMatch+`4`, iEnd: iend, mEnd: repMatchEnd, iStart: prefixLowest) + `4`;
1634	int const gain2 = (int)(mlRep * `4`);
1635	int const gain1 = (int)(matchLength*`4` - ZSTD_highbit32(val: (U32)STORED_TO_OFFBASE(offcode)) + `1`);
1636	if ((mlRep >= `4`) && (gain2 > gain1))
1637	matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
1638	}
1639	}
1640	{ size_t offset2=`999999999`;
1641	size_t const ml2 = ZSTD_searchMax(ms, ip, iend, offsetPtr: &offset2, mls, rowLog, searchMethod, dictMode);
1642	int const gain2 = (int)(ml2`4` - ZSTD_highbit32(val: (U32)STORED_TO_OFFBASE(offset2))); /* raw approx /
1643	int const gain1 = (int)(matchLength*`4` - ZSTD_highbit32(val: (U32)STORED_TO_OFFBASE(offcode)) + `7`);
1644	if ((ml2 >= `4`) && (gain2 > gain1)) {
1645	matchLength = ml2, offcode = offset2, start = ip;
1646	continue;
1647	} } }
1648	break; / nothing found : store previous solution /
1649	}
1650
1651	/ NOTE:*
1652	* Pay attention that `start[-value]` can lead to strange undefined behavior
1653	* notably if `value` is unsigned, resulting in a large positive `-value`.
1654	*/
1655	/ catch up /
1656	if (STORED_IS_OFFSET(offcode)) {
1657	if (dictMode == ZSTD_noDict) {
1658	while ( ((start > anchor) & (start - STORED_OFFSET(offcode) > prefixLowest))
1659	&& (start[-`1`] == (start-STORED_OFFSET(offcode))[-`1`]) ) / only search for offset within prefix /
1660	{ start--; matchLength++; }
1661	}
1662	if (isDxS) {
1663	U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode));
1664	const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex;
1665	const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest;
1666	while ((start>anchor) && (match>mStart) && (start[-`1`] == match[-`1`])) { start--; match--; matchLength++; } / catch up /
1667	}
1668	offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode);
1669	}
1670	/ store sequence /
1671	_storeSequence:
1672	{ size_t const litLength = (size_t)(start - anchor);
1673	ZSTD_storeSeq(seqStorePtr: seqStore, litLength, literals: anchor, litLimit: iend, offBase_minus1: (U32)offcode, matchLength);
1674	anchor = ip = start + matchLength;
1675	}
1676
1677	/ check immediate repcode /
1678	if (isDxS) {
1679	while (ip <= ilimit) {
1680	U32 const current2 = (U32)(ip-base);
1681	U32 const repIndex = current2 - offset_2;
1682	const BYTE* repMatch = repIndex < prefixLowestIndex ?
1683	dictBase - dictIndexDelta + repIndex :
1684	base + repIndex;
1685	if ( ((U32)((prefixLowestIndex-`1`) - (U32)repIndex) >= `3` / intentional overflow /)
1686	&& (MEM_read32(memPtr: repMatch) == MEM_read32(memPtr: ip)) ) {
1687	const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend;
1688	matchLength = ZSTD_count_2segments(ip: ip+`4`, match: repMatch+`4`, iEnd: iend, mEnd: repEnd2, iStart: prefixLowest) + `4`;
1689	offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; / swap offset_2 <=> offset_1 /
1690	ZSTD_storeSeq(seqStorePtr: seqStore, litLength: `0`, literals: anchor, litLimit: iend, STORE_REPCODE_1, matchLength);
1691	ip += matchLength;
1692	anchor = ip;
1693	continue;
1694	}
1695	break;
1696	}
1697	}
1698
1699	if (dictMode == ZSTD_noDict) {
1700	while ( ((ip <= ilimit) & (offset_2>`0`))
1701	&& (MEM_read32(memPtr: ip) == MEM_read32(memPtr: ip - offset_2)) ) {
1702	/ store sequence /
1703	matchLength = ZSTD_count(pIn: ip+`4`, pMatch: ip+`4`-offset_2, pInLimit: iend) + `4`;
1704	offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; / swap repcodes /
1705	ZSTD_storeSeq(seqStorePtr: seqStore, litLength: `0`, literals: anchor, litLimit: iend, STORE_REPCODE_1, matchLength);
1706	ip += matchLength;
1707	anchor = ip;
1708	continue; / faster when present ... (?) /
1709	} } }
1710
1711	/ Save reps for next block /
1712	rep[`0`] = offset_1 ? offset_1 : savedOffset;
1713	rep[`1`] = offset_2 ? offset_2 : savedOffset;
1714
1715	/ Return the last literals size /
1716	return (size_t)(iend - anchor);
1717	}
1718
1719
1720	size_t ZSTD_compressBlock_btlazy2(
1721	ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1722	void const* src, size_t srcSize)
1723	{
1724	return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, searchMethod: search_binaryTree, depth: `2`, dictMode: ZSTD_noDict);
1725	}
1726
1727	size_t ZSTD_compressBlock_lazy2(
1728	ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1729	void const* src, size_t srcSize)
1730	{
1731	return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, searchMethod: search_hashChain, depth: `2`, dictMode: ZSTD_noDict);
1732	}
1733
1734	size_t ZSTD_compressBlock_lazy(
1735	ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1736	void const* src, size_t srcSize)
1737	{
1738	return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, searchMethod: search_hashChain, depth: `1`, dictMode: ZSTD_noDict);
1739	}
1740
1741	size_t ZSTD_compressBlock_greedy(
1742	ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1743	void const* src, size_t srcSize)
1744	{
1745	return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, searchMethod: search_hashChain, depth: `0`, dictMode: ZSTD_noDict);
1746	}
1747
1748	size_t ZSTD_compressBlock_btlazy2_dictMatchState(
1749	ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1750	void const* src, size_t srcSize)
1751	{
1752	return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, searchMethod: search_binaryTree, depth: `2`, dictMode: ZSTD_dictMatchState);
1753	}
1754
1755	size_t ZSTD_compressBlock_lazy2_dictMatchState(
1756	ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1757	void const* src, size_t srcSize)
1758	{
1759	return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, searchMethod: search_hashChain, depth: `2`, dictMode: ZSTD_dictMatchState);
1760	}
1761
1762	size_t ZSTD_compressBlock_lazy_dictMatchState(
1763	ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1764	void const* src, size_t srcSize)
1765	{
1766	return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, searchMethod: search_hashChain, depth: `1`, dictMode: ZSTD_dictMatchState);
1767	}
1768
1769	size_t ZSTD_compressBlock_greedy_dictMatchState(
1770	ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1771	void const* src, size_t srcSize)
1772	{
1773	return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, searchMethod: search_hashChain, depth: `0`, dictMode: ZSTD_dictMatchState);
1774	}
1775
1776
1777	size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch(
1778	ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1779	void const* src, size_t srcSize)
1780	{
1781	return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, searchMethod: search_hashChain, depth: `2`, dictMode: ZSTD_dedicatedDictSearch);
1782	}
1783
1784	size_t ZSTD_compressBlock_lazy_dedicatedDictSearch(
1785	ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1786	void const* src, size_t srcSize)
1787	{
1788	return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, searchMethod: search_hashChain, depth: `1`, dictMode: ZSTD_dedicatedDictSearch);
1789	}
1790
1791	size_t ZSTD_compressBlock_greedy_dedicatedDictSearch(
1792	ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1793	void const* src, size_t srcSize)
1794	{
1795	return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, searchMethod: search_hashChain, depth: `0`, dictMode: ZSTD_dedicatedDictSearch);
1796	}
1797
1798	/ Row-based matchfinder /
1799	size_t ZSTD_compressBlock_lazy2_row(
1800	ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1801	void const* src, size_t srcSize)
1802	{
1803	return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, searchMethod: search_rowHash, depth: `2`, dictMode: ZSTD_noDict);
1804	}
1805
1806	size_t ZSTD_compressBlock_lazy_row(
1807	ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1808	void const* src, size_t srcSize)
1809	{
1810	return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, searchMethod: search_rowHash, depth: `1`, dictMode: ZSTD_noDict);
1811	}
1812
1813	size_t ZSTD_compressBlock_greedy_row(
1814	ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1815	void const* src, size_t srcSize)
1816	{
1817	return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, searchMethod: search_rowHash, depth: `0`, dictMode: ZSTD_noDict);
1818	}
1819
1820	size_t ZSTD_compressBlock_lazy2_dictMatchState_row(
1821	ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1822	void const* src, size_t srcSize)
1823	{
1824	return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, searchMethod: search_rowHash, depth: `2`, dictMode: ZSTD_dictMatchState);
1825	}
1826
1827	size_t ZSTD_compressBlock_lazy_dictMatchState_row(
1828	ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1829	void const* src, size_t srcSize)
1830	{
1831	return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, searchMethod: search_rowHash, depth: `1`, dictMode: ZSTD_dictMatchState);
1832	}
1833
1834	size_t ZSTD_compressBlock_greedy_dictMatchState_row(
1835	ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1836	void const* src, size_t srcSize)
1837	{
1838	return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, searchMethod: search_rowHash, depth: `0`, dictMode: ZSTD_dictMatchState);
1839	}
1840
1841
1842	size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row(
1843	ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1844	void const* src, size_t srcSize)
1845	{
1846	return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, searchMethod: search_rowHash, depth: `2`, dictMode: ZSTD_dedicatedDictSearch);
1847	}
1848
1849	size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row(
1850	ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1851	void const* src, size_t srcSize)
1852	{
1853	return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, searchMethod: search_rowHash, depth: `1`, dictMode: ZSTD_dedicatedDictSearch);
1854	}
1855
1856	size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row(
1857	ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1858	void const* src, size_t srcSize)
1859	{
1860	return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, searchMethod: search_rowHash, depth: `0`, dictMode: ZSTD_dedicatedDictSearch);
1861	}
1862
1863	FORCE_INLINE_TEMPLATE
1864	size_t ZSTD_compressBlock_lazy_extDict_generic(
1865	ZSTD_matchState_t* ms, seqStore_t* seqStore,
1866	U32 rep[ZSTD_REP_NUM],
1867	const void* src, size_t srcSize,
1868	const searchMethod_e searchMethod, const U32 depth)
1869	{
1870	const BYTE* const istart = (const BYTE*)src;
1871	const BYTE* ip = istart;
1872	const BYTE* anchor = istart;
1873	const BYTE* const iend = istart + srcSize;
1874	const BYTE* const ilimit = searchMethod == search_rowHash ? iend - `8` - ZSTD_ROW_HASH_CACHE_SIZE : iend - `8`;
1875	const BYTE* const base = ms->window.base;
1876	const U32 dictLimit = ms->window.dictLimit;
1877	const BYTE* const prefixStart = base + dictLimit;
1878	const BYTE* const dictBase = ms->window.dictBase;
1879	const BYTE* const dictEnd = dictBase + dictLimit;
1880	const BYTE* const dictStart = dictBase + ms->window.lowLimit;
1881	const U32 windowLog = ms->cParams.windowLog;
1882	const U32 mls = BOUNDED(`4`, ms->cParams.minMatch, `6`);
1883	const U32 rowLog = BOUNDED(`4`, ms->cParams.searchLog, `6`);
1884
1885	U32 offset_1 = rep[`0`], offset_2 = rep[`1`];
1886
1887	DEBUGLOG(`5`, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod);
1888
1889	/ init /
1890	ip += (ip == prefixStart);
1891	if (searchMethod == search_rowHash) {
1892	ZSTD_row_fillHashCache(ms, base, rowLog,
1893	MIN(ms->cParams.minMatch, `6` / mls caps out at 6 /),
1894	idx: ms->nextToUpdate, iLimit: ilimit);
1895	}
1896
1897	/ Match Loop /
1898	#if defined(__x86_64__)
1899	/ I've measured random a 5% speed loss on levels 5 & 6 (greedy) when the*
1900	* code alignment is perturbed. To fix the instability align the loop on 32-bytes.
1901	*/
1902	__asm__(".p2align 5");
1903	#endif
1904	while (ip < ilimit) {
1905	size_t matchLength=`0`;
1906	size_t offcode=STORE_REPCODE_1;
1907	const BYTE* start=ip+`1`;
1908	U32 curr = (U32)(ip-base);
1909
1910	/ check repCode /
1911	{ const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr: curr+`1`, windowLog);
1912	const U32 repIndex = (U32)(curr+`1` - offset_1);
1913	const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
1914	const BYTE* const repMatch = repBase + repIndex;
1915	if ( ((U32)((dictLimit-`1`) - repIndex) >= `3`) / intentional overflow /
1916	& (offset_1 <= curr+`1` - windowLow) ) / note: we are searching at curr+1 /
1917	if (MEM_read32(memPtr: ip+`1`) == MEM_read32(memPtr: repMatch)) {
1918	/ repcode detected we should take it /
1919	const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
1920	matchLength = ZSTD_count_2segments(ip: ip+`1`+`4`, match: repMatch+`4`, iEnd: iend, mEnd: repEnd, iStart: prefixStart) + `4`;
1921	if (depth==`0`) goto _storeSequence;
1922	} }
1923
1924	/ first search (depth 0) /
1925	{ size_t offsetFound = `999999999`;
1926	size_t const ml2 = ZSTD_searchMax(ms, ip, iend, offsetPtr: &offsetFound, mls, rowLog, searchMethod, dictMode: ZSTD_extDict);
1927	if (ml2 > matchLength)
1928	matchLength = ml2, start = ip, offcode=offsetFound;
1929	}
1930
1931	if (matchLength < `4`) {
1932	ip += ((ip-anchor) >> kSearchStrength) + `1`; / jump faster over incompressible sections /
1933	continue;
1934	}
1935
1936	/ let's try to find a better solution /
1937	if (depth>=`1`)
1938	while (ip<ilimit) {
1939	ip ++;
1940	curr++;
1941	/ check repCode /
1942	if (offcode) {
1943	const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
1944	const U32 repIndex = (U32)(curr - offset_1);
1945	const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
1946	const BYTE* const repMatch = repBase + repIndex;
1947	if ( ((U32)((dictLimit-`1`) - repIndex) >= `3`) / intentional overflow : do not test positions overlapping 2 memory segments /
1948	& (offset_1 <= curr - windowLow) ) / equivalent to `curr > repIndex >= windowLow` /
1949	if (MEM_read32(memPtr: ip) == MEM_read32(memPtr: repMatch)) {
1950	/ repcode detected /
1951	const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
1952	size_t const repLength = ZSTD_count_2segments(ip: ip+`4`, match: repMatch+`4`, iEnd: iend, mEnd: repEnd, iStart: prefixStart) + `4`;
1953	int const gain2 = (int)(repLength * `3`);
1954	int const gain1 = (int)(matchLength*`3` - ZSTD_highbit32(val: (U32)STORED_TO_OFFBASE(offcode)) + `1`);
1955	if ((repLength >= `4`) && (gain2 > gain1))
1956	matchLength = repLength, offcode = STORE_REPCODE_1, start = ip;
1957	} }
1958
1959	/ search match, depth 1 /
1960	{ size_t offset2=`999999999`;
1961	size_t const ml2 = ZSTD_searchMax(ms, ip, iend, offsetPtr: &offset2, mls, rowLog, searchMethod, dictMode: ZSTD_extDict);
1962	int const gain2 = (int)(ml2`4` - ZSTD_highbit32(val: (U32)STORED_TO_OFFBASE(offset2))); /* raw approx /
1963	int const gain1 = (int)(matchLength*`4` - ZSTD_highbit32(val: (U32)STORED_TO_OFFBASE(offcode)) + `4`);
1964	if ((ml2 >= `4`) && (gain2 > gain1)) {
1965	matchLength = ml2, offcode = offset2, start = ip;
1966	continue; / search a better one /
1967	} }
1968
1969	/ let's find an even better one /
1970	if ((depth==`2`) && (ip<ilimit)) {
1971	ip ++;
1972	curr++;
1973	/ check repCode /
1974	if (offcode) {
1975	const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
1976	const U32 repIndex = (U32)(curr - offset_1);
1977	const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
1978	const BYTE* const repMatch = repBase + repIndex;
1979	if ( ((U32)((dictLimit-`1`) - repIndex) >= `3`) / intentional overflow : do not test positions overlapping 2 memory segments /
1980	& (offset_1 <= curr - windowLow) ) / equivalent to `curr > repIndex >= windowLow` /
1981	if (MEM_read32(memPtr: ip) == MEM_read32(memPtr: repMatch)) {
1982	/ repcode detected /
1983	const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
1984	size_t const repLength = ZSTD_count_2segments(ip: ip+`4`, match: repMatch+`4`, iEnd: iend, mEnd: repEnd, iStart: prefixStart) + `4`;
1985	int const gain2 = (int)(repLength * `4`);
1986	int const gain1 = (int)(matchLength*`4` - ZSTD_highbit32(val: (U32)STORED_TO_OFFBASE(offcode)) + `1`);
1987	if ((repLength >= `4`) && (gain2 > gain1))
1988	matchLength = repLength, offcode = STORE_REPCODE_1, start = ip;
1989	} }
1990
1991	/ search match, depth 2 /
1992	{ size_t offset2=`999999999`;
1993	size_t const ml2 = ZSTD_searchMax(ms, ip, iend, offsetPtr: &offset2, mls, rowLog, searchMethod, dictMode: ZSTD_extDict);
1994	int const gain2 = (int)(ml2`4` - ZSTD_highbit32(val: (U32)STORED_TO_OFFBASE(offset2))); /* raw approx /
1995	int const gain1 = (int)(matchLength*`4` - ZSTD_highbit32(val: (U32)STORED_TO_OFFBASE(offcode)) + `7`);
1996	if ((ml2 >= `4`) && (gain2 > gain1)) {
1997	matchLength = ml2, offcode = offset2, start = ip;
1998	continue;
1999	} } }
2000	break; / nothing found : store previous solution /
2001	}
2002
2003	/ catch up /
2004	if (STORED_IS_OFFSET(offcode)) {
2005	U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode));
2006	const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex;
2007	const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart;
2008	while ((start>anchor) && (match>mStart) && (start[-`1`] == match[-`1`])) { start--; match--; matchLength++; } / catch up /
2009	offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode);
2010	}
2011
2012	/ store sequence /
2013	_storeSequence:
2014	{ size_t const litLength = (size_t)(start - anchor);
2015	ZSTD_storeSeq(seqStorePtr: seqStore, litLength, literals: anchor, litLimit: iend, offBase_minus1: (U32)offcode, matchLength);
2016	anchor = ip = start + matchLength;
2017	}
2018
2019	/ check immediate repcode /
2020	while (ip <= ilimit) {
2021	const U32 repCurrent = (U32)(ip-base);
2022	const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr: repCurrent, windowLog);
2023	const U32 repIndex = repCurrent - offset_2;
2024	const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
2025	const BYTE* const repMatch = repBase + repIndex;
2026	if ( ((U32)((dictLimit-`1`) - repIndex) >= `3`) / intentional overflow : do not test positions overlapping 2 memory segments /
2027	& (offset_2 <= repCurrent - windowLow) ) / equivalent to `curr > repIndex >= windowLow` /
2028	if (MEM_read32(memPtr: ip) == MEM_read32(memPtr: repMatch)) {
2029	/ repcode detected we should take it /
2030	const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
2031	matchLength = ZSTD_count_2segments(ip: ip+`4`, match: repMatch+`4`, iEnd: iend, mEnd: repEnd, iStart: prefixStart) + `4`;
2032	offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; / swap offset history /
2033	ZSTD_storeSeq(seqStorePtr: seqStore, litLength: `0`, literals: anchor, litLimit: iend, STORE_REPCODE_1, matchLength);
2034	ip += matchLength;
2035	anchor = ip;
2036	continue; / faster when present ... (?) /
2037	}
2038	break;
2039	} }
2040
2041	/ Save reps for next block /
2042	rep[`0`] = offset_1;
2043	rep[`1`] = offset_2;
2044
2045	/ Return the last literals size /
2046	return (size_t)(iend - anchor);
2047	}
2048
2049
2050	size_t ZSTD_compressBlock_greedy_extDict(
2051	ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
2052	void const* src, size_t srcSize)
2053	{
2054	return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, searchMethod: search_hashChain, depth: `0`);
2055	}
2056
2057	size_t ZSTD_compressBlock_lazy_extDict(
2058	ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
2059	void const* src, size_t srcSize)
2060
2061	{
2062	return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, searchMethod: search_hashChain, depth: `1`);
2063	}
2064
2065	size_t ZSTD_compressBlock_lazy2_extDict(
2066	ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
2067	void const* src, size_t srcSize)
2068
2069	{
2070	return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, searchMethod: search_hashChain, depth: `2`);
2071	}
2072
2073	size_t ZSTD_compressBlock_btlazy2_extDict(
2074	ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
2075	void const* src, size_t srcSize)
2076
2077	{
2078	return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, searchMethod: search_binaryTree, depth: `2`);
2079	}
2080
2081	size_t ZSTD_compressBlock_greedy_extDict_row(
2082	ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
2083	void const* src, size_t srcSize)
2084	{
2085	return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, searchMethod: search_rowHash, depth: `0`);
2086	}
2087
2088	size_t ZSTD_compressBlock_lazy_extDict_row(
2089	ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
2090	void const* src, size_t srcSize)
2091
2092	{
2093	return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, searchMethod: search_rowHash, depth: `1`);
2094	}
2095
2096	size_t ZSTD_compressBlock_lazy2_extDict_row(
2097	ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
2098	void const* src, size_t srcSize)
2099
2100	{
2101	return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, searchMethod: search_rowHash, depth: `2`);
2102	}
2103

source code of linux/lib/zstd/compress/zstd_lazy.c