1 | /* SPDX-License-Identifier: GPL-2.0-or-later */ |
2 | /* |
3 | * decompress_common.h - Code shared by the XPRESS and LZX decompressors |
4 | * |
5 | * Copyright (C) 2015 Eric Biggers |
6 | */ |
7 | |
8 | #ifndef _LINUX_NTFS3_LIB_DECOMPRESS_COMMON_H |
9 | #define _LINUX_NTFS3_LIB_DECOMPRESS_COMMON_H |
10 | |
11 | #include <linux/string.h> |
12 | #include <linux/compiler.h> |
13 | #include <linux/types.h> |
14 | #include <linux/slab.h> |
15 | #include <asm/unaligned.h> |
16 | |
17 | |
18 | /* "Force inline" macro (not required, but helpful for performance) */ |
19 | #define forceinline __always_inline |
20 | |
21 | /* Enable whole-word match copying on selected architectures */ |
22 | #if defined(__i386__) || defined(__x86_64__) || defined(__ARM_FEATURE_UNALIGNED) |
23 | # define FAST_UNALIGNED_ACCESS |
24 | #endif |
25 | |
26 | /* Size of a machine word */ |
27 | #define WORDBYTES (sizeof(size_t)) |
28 | |
29 | static forceinline void |
30 | copy_unaligned_word(const void *src, void *dst) |
31 | { |
32 | put_unaligned(get_unaligned((const size_t *)src), (size_t *)dst); |
33 | } |
34 | |
35 | |
36 | /* Generate a "word" with platform-dependent size whose bytes all contain the |
37 | * value 'b'. |
38 | */ |
39 | static forceinline size_t repeat_byte(u8 b) |
40 | { |
41 | size_t v; |
42 | |
43 | v = b; |
44 | v |= v << 8; |
45 | v |= v << 16; |
46 | v |= v << ((WORDBYTES == 8) ? 32 : 0); |
47 | return v; |
48 | } |
49 | |
/* Structure that encapsulates a block of in-memory data being interpreted as a
 * stream of bits, optionally with interwoven literal bytes.  Bits are assumed
 * to be stored in little endian 16-bit coding units, with the bits ordered high
 * to low.
 */
struct input_bitstream {

	/* Bits that have been read from the input buffer. The bits are
	 * left-justified; the next bit is always bit 31.
	 */
	u32 bitbuf;

	/* Number of bits currently held in @bitbuf.  Note: past the end of
	 * the input this count may exceed the number of real bits available;
	 * the missing bits are treated as zeroes (see bitstream_ensure_bits()).
	 */
	u32 bitsleft;

	/* Pointer to the next byte to be retrieved from the input buffer. */
	const u8 *next;

	/* Pointer to just past the end of the input buffer. */
	const u8 *end;
};
71 | |
72 | /* Initialize a bitstream to read from the specified input buffer. */ |
73 | static forceinline void init_input_bitstream(struct input_bitstream *is, |
74 | const void *buffer, u32 size) |
75 | { |
76 | is->bitbuf = 0; |
77 | is->bitsleft = 0; |
78 | is->next = buffer; |
79 | is->end = is->next + size; |
80 | } |
81 | |
82 | /* Ensure the bit buffer variable for the bitstream contains at least @num_bits |
83 | * bits. Following this, bitstream_peek_bits() and/or bitstream_remove_bits() |
84 | * may be called on the bitstream to peek or remove up to @num_bits bits. Note |
85 | * that @num_bits must be <= 16. |
86 | */ |
87 | static forceinline void bitstream_ensure_bits(struct input_bitstream *is, |
88 | u32 num_bits) |
89 | { |
90 | if (is->bitsleft < num_bits) { |
91 | if (is->end - is->next >= 2) { |
92 | is->bitbuf |= (u32)get_unaligned_le16(p: is->next) |
93 | << (16 - is->bitsleft); |
94 | is->next += 2; |
95 | } |
96 | is->bitsleft += 16; |
97 | } |
98 | } |
99 | |
100 | /* Return the next @num_bits bits from the bitstream, without removing them. |
101 | * There must be at least @num_bits remaining in the buffer variable, from a |
102 | * previous call to bitstream_ensure_bits(). |
103 | */ |
104 | static forceinline u32 |
105 | bitstream_peek_bits(const struct input_bitstream *is, const u32 num_bits) |
106 | { |
107 | return (is->bitbuf >> 1) >> (sizeof(is->bitbuf) * 8 - num_bits - 1); |
108 | } |
109 | |
110 | /* Remove @num_bits from the bitstream. There must be at least @num_bits |
111 | * remaining in the buffer variable, from a previous call to |
112 | * bitstream_ensure_bits(). |
113 | */ |
114 | static forceinline void |
115 | bitstream_remove_bits(struct input_bitstream *is, u32 num_bits) |
116 | { |
117 | is->bitbuf <<= num_bits; |
118 | is->bitsleft -= num_bits; |
119 | } |
120 | |
121 | /* Remove and return @num_bits bits from the bitstream. There must be at least |
122 | * @num_bits remaining in the buffer variable, from a previous call to |
123 | * bitstream_ensure_bits(). |
124 | */ |
125 | static forceinline u32 |
126 | bitstream_pop_bits(struct input_bitstream *is, u32 num_bits) |
127 | { |
128 | u32 bits = bitstream_peek_bits(is, num_bits); |
129 | |
130 | bitstream_remove_bits(is, num_bits); |
131 | return bits; |
132 | } |
133 | |
134 | /* Read and return the next @num_bits bits from the bitstream. */ |
135 | static forceinline u32 |
136 | bitstream_read_bits(struct input_bitstream *is, u32 num_bits) |
137 | { |
138 | bitstream_ensure_bits(is, num_bits); |
139 | return bitstream_pop_bits(is, num_bits); |
140 | } |
141 | |
142 | /* Read and return the next literal byte embedded in the bitstream. */ |
143 | static forceinline u8 |
144 | bitstream_read_byte(struct input_bitstream *is) |
145 | { |
146 | if (unlikely(is->end == is->next)) |
147 | return 0; |
148 | return *is->next++; |
149 | } |
150 | |
151 | /* Read and return the next 16-bit integer embedded in the bitstream. */ |
152 | static forceinline u16 |
153 | bitstream_read_u16(struct input_bitstream *is) |
154 | { |
155 | u16 v; |
156 | |
157 | if (unlikely(is->end - is->next < 2)) |
158 | return 0; |
159 | v = get_unaligned_le16(p: is->next); |
160 | is->next += 2; |
161 | return v; |
162 | } |
163 | |
164 | /* Read and return the next 32-bit integer embedded in the bitstream. */ |
165 | static forceinline u32 |
166 | bitstream_read_u32(struct input_bitstream *is) |
167 | { |
168 | u32 v; |
169 | |
170 | if (unlikely(is->end - is->next < 4)) |
171 | return 0; |
172 | v = get_unaligned_le32(p: is->next); |
173 | is->next += 4; |
174 | return v; |
175 | } |
176 | |
177 | /* Read into @dst_buffer an array of literal bytes embedded in the bitstream. |
178 | * Return either a pointer to the byte past the last written, or NULL if the |
179 | * read overflows the input buffer. |
180 | */ |
181 | static forceinline void *bitstream_read_bytes(struct input_bitstream *is, |
182 | void *dst_buffer, size_t count) |
183 | { |
184 | if ((size_t)(is->end - is->next) < count) |
185 | return NULL; |
186 | memcpy(dst_buffer, is->next, count); |
187 | is->next += count; |
188 | return (u8 *)dst_buffer + count; |
189 | } |
190 | |
191 | /* Align the input bitstream on a coding-unit boundary. */ |
192 | static forceinline void bitstream_align(struct input_bitstream *is) |
193 | { |
194 | is->bitsleft = 0; |
195 | is->bitbuf = 0; |
196 | } |
197 | |
198 | extern int make_huffman_decode_table(u16 decode_table[], const u32 num_syms, |
199 | const u32 num_bits, const u8 lens[], |
200 | const u32 max_codeword_len, |
201 | u16 working_space[]); |
202 | |
203 | |
204 | /* Reads and returns the next Huffman-encoded symbol from a bitstream. If the |
205 | * input data is exhausted, the Huffman symbol is decoded as if the missing bits |
206 | * are all zeroes. |
207 | */ |
208 | static forceinline u32 read_huffsym(struct input_bitstream *istream, |
209 | const u16 decode_table[], |
210 | u32 table_bits, |
211 | u32 max_codeword_len) |
212 | { |
213 | u32 entry; |
214 | u32 key_bits; |
215 | |
216 | bitstream_ensure_bits(is: istream, num_bits: max_codeword_len); |
217 | |
218 | /* Index the decode table by the next table_bits bits of the input. */ |
219 | key_bits = bitstream_peek_bits(is: istream, num_bits: table_bits); |
220 | entry = decode_table[key_bits]; |
221 | if (entry < 0xC000) { |
222 | /* Fast case: The decode table directly provided the |
223 | * symbol and codeword length. The low 11 bits are the |
224 | * symbol, and the high 5 bits are the codeword length. |
225 | */ |
226 | bitstream_remove_bits(is: istream, num_bits: entry >> 11); |
227 | return entry & 0x7FF; |
228 | } |
229 | /* Slow case: The codeword for the symbol is longer than |
230 | * table_bits, so the symbol does not have an entry |
231 | * directly in the first (1 << table_bits) entries of the |
232 | * decode table. Traverse the appropriate binary tree |
233 | * bit-by-bit to decode the symbol. |
234 | */ |
235 | bitstream_remove_bits(is: istream, num_bits: table_bits); |
236 | do { |
237 | key_bits = (entry & 0x3FFF) + bitstream_pop_bits(is: istream, num_bits: 1); |
238 | } while ((entry = decode_table[key_bits]) >= 0xC000); |
239 | return entry; |
240 | } |
241 | |
242 | /* |
243 | * Copy an LZ77 match at (dst - offset) to dst. |
244 | * |
245 | * The length and offset must be already validated --- that is, (dst - offset) |
246 | * can't underrun the output buffer, and (dst + length) can't overrun the output |
247 | * buffer. Also, the length cannot be 0. |
248 | * |
249 | * @bufend points to the byte past the end of the output buffer. This function |
250 | * won't write any data beyond this position. |
251 | * |
252 | * Returns dst + length. |
253 | */ |
254 | static forceinline u8 *lz_copy(u8 *dst, u32 length, u32 offset, const u8 *bufend, |
255 | u32 min_length) |
256 | { |
257 | const u8 *src = dst - offset; |
258 | |
259 | /* |
260 | * Try to copy one machine word at a time. On i386 and x86_64 this is |
261 | * faster than copying one byte at a time, unless the data is |
262 | * near-random and all the matches have very short lengths. Note that |
263 | * since this requires unaligned memory accesses, it won't necessarily |
264 | * be faster on every architecture. |
265 | * |
266 | * Also note that we might copy more than the length of the match. For |
267 | * example, if a word is 8 bytes and the match is of length 5, then |
268 | * we'll simply copy 8 bytes. This is okay as long as we don't write |
269 | * beyond the end of the output buffer, hence the check for (bufend - |
270 | * end >= WORDBYTES - 1). |
271 | */ |
272 | #ifdef FAST_UNALIGNED_ACCESS |
273 | u8 * const end = dst + length; |
274 | |
275 | if (bufend - end >= (ptrdiff_t)(WORDBYTES - 1)) { |
276 | |
277 | if (offset >= WORDBYTES) { |
278 | /* The source and destination words don't overlap. */ |
279 | |
280 | /* To improve branch prediction, one iteration of this |
281 | * loop is unrolled. Most matches are short and will |
282 | * fail the first check. But if that check passes, then |
283 | * it becomes increasing likely that the match is long |
284 | * and we'll need to continue copying. |
285 | */ |
286 | |
287 | copy_unaligned_word(src, dst); |
288 | src += WORDBYTES; |
289 | dst += WORDBYTES; |
290 | |
291 | if (dst < end) { |
292 | do { |
293 | copy_unaligned_word(src, dst); |
294 | src += WORDBYTES; |
295 | dst += WORDBYTES; |
296 | } while (dst < end); |
297 | } |
298 | return end; |
299 | } else if (offset == 1) { |
300 | |
301 | /* Offset 1 matches are equivalent to run-length |
302 | * encoding of the previous byte. This case is common |
303 | * if the data contains many repeated bytes. |
304 | */ |
305 | size_t v = repeat_byte(b: *(dst - 1)); |
306 | |
307 | do { |
308 | put_unaligned(v, (size_t *)dst); |
309 | src += WORDBYTES; |
310 | dst += WORDBYTES; |
311 | } while (dst < end); |
312 | return end; |
313 | } |
314 | /* |
315 | * We don't bother with special cases for other 'offset < |
316 | * WORDBYTES', which are usually rarer than 'offset == 1'. Extra |
317 | * checks will just slow things down. Actually, it's possible |
318 | * to handle all the 'offset < WORDBYTES' cases using the same |
319 | * code, but it still becomes more complicated doesn't seem any |
320 | * faster overall; it definitely slows down the more common |
321 | * 'offset == 1' case. |
322 | */ |
323 | } |
324 | #endif /* FAST_UNALIGNED_ACCESS */ |
325 | |
326 | /* Fall back to a bytewise copy. */ |
327 | |
328 | if (min_length >= 2) { |
329 | *dst++ = *src++; |
330 | length--; |
331 | } |
332 | if (min_length >= 3) { |
333 | *dst++ = *src++; |
334 | length--; |
335 | } |
336 | do { |
337 | *dst++ = *src++; |
338 | } while (--length); |
339 | |
340 | return dst; |
341 | } |
342 | |
343 | #endif /* _LINUX_NTFS3_LIB_DECOMPRESS_COMMON_H */ |
344 | |