1 | //===-- sanitizer_lzw.h -----------------------------------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // Lempel–Ziv–Welch encoding/decoding |
10 | // |
11 | //===----------------------------------------------------------------------===// |
12 | |
13 | #ifndef SANITIZER_LZW_H |
14 | #define SANITIZER_LZW_H |
15 | |
16 | #include "sanitizer_dense_map.h" |
17 | |
18 | namespace __sanitizer { |
19 | |
20 | using LzwCodeType = u32; |
21 | |
22 | template <class T, class ItIn, class ItOut> |
23 | ItOut LzwEncode(ItIn begin, ItIn end, ItOut out) { |
24 | using Substring = |
25 | detail::DenseMapPair<LzwCodeType /* Prefix */, T /* Next input */>; |
26 | |
27 | // Sentinel value for substrings of len 1. |
28 | static constexpr LzwCodeType kNoPrefix = |
29 | Min(DenseMapInfo<Substring>::getEmptyKey().first, |
30 | DenseMapInfo<Substring>::getTombstoneKey().first) - |
31 | 1; |
32 | DenseMap<Substring, LzwCodeType> prefix_to_code; |
33 | { |
34 | // Add all substring of len 1 as initial dictionary. |
35 | InternalMmapVector<T> dict_len1; |
36 | for (auto it = begin; it != end; ++it) |
37 | if (prefix_to_code.try_emplace({kNoPrefix, *it}, 0).second) |
38 | dict_len1.push_back(*it); |
39 | |
40 | // Slightly helps with later delta encoding. |
41 | Sort(dict_len1.data(), dict_len1.size()); |
42 | |
43 | // For large sizeof(T) we have to store dict_len1. Smaller types like u8 can |
44 | // just generate them. |
45 | *out = dict_len1.size(); |
46 | ++out; |
47 | |
48 | for (uptr i = 0; i != dict_len1.size(); ++i) { |
49 | // Remap after the Sort. |
50 | prefix_to_code[{kNoPrefix, dict_len1[i]}] = i; |
51 | *out = dict_len1[i]; |
52 | ++out; |
53 | } |
54 | CHECK_EQ(prefix_to_code.size(), dict_len1.size()); |
55 | } |
56 | |
57 | if (begin == end) |
58 | return out; |
59 | |
60 | // Main LZW encoding loop. |
61 | LzwCodeType match = prefix_to_code.find({kNoPrefix, *begin})->second; |
62 | ++begin; |
63 | for (auto it = begin; it != end; ++it) { |
64 | // Extend match with the new item. |
65 | auto ins = prefix_to_code.try_emplace({match, *it}, prefix_to_code.size()); |
66 | if (ins.second) { |
67 | // This is a new substring, but emit the code for the current match |
68 | // (before extend). This allows LZW decoder to recover the dictionary. |
69 | *out = match; |
70 | ++out; |
71 | // Reset the match to a single item, which must be already in the map. |
72 | match = prefix_to_code.find({kNoPrefix, *it})->second; |
73 | } else { |
74 | // Already known, use as the current match. |
75 | match = ins.first->second; |
76 | } |
77 | } |
78 | |
79 | *out = match; |
80 | ++out; |
81 | |
82 | return out; |
83 | } |
84 | |
85 | template <class T, class ItIn, class ItOut> |
86 | ItOut LzwDecode(ItIn begin, ItIn end, ItOut out) { |
87 | if (begin == end) |
88 | return out; |
89 | |
90 | // Load dictionary of len 1 substrings. Theses correspont to lowest codes. |
91 | InternalMmapVector<T> dict_len1(*begin); |
92 | ++begin; |
93 | |
94 | if (begin == end) |
95 | return out; |
96 | |
97 | for (auto& v : dict_len1) { |
98 | v = *begin; |
99 | ++begin; |
100 | } |
101 | |
102 | // Substrings of len 2 and up. Indexes are shifted because [0, |
103 | // dict_len1.size()) stored in dict_len1. Substings get here after being |
104 | // emitted to the output, so we can use output position. |
105 | InternalMmapVector<detail::DenseMapPair<ItOut /* begin. */, ItOut /* end */>> |
106 | code_to_substr; |
107 | |
108 | // Copies already emitted substrings into the output again. |
109 | auto copy = [&code_to_substr, &dict_len1](LzwCodeType code, ItOut out) { |
110 | if (code < dict_len1.size()) { |
111 | *out = dict_len1[code]; |
112 | ++out; |
113 | return out; |
114 | } |
115 | const auto& s = code_to_substr[code - dict_len1.size()]; |
116 | |
117 | for (ItOut it = s.first; it != s.second; ++it, ++out) *out = *it; |
118 | return out; |
119 | }; |
120 | |
121 | // Returns lens of the substring with the given code. |
122 | auto code_to_len = [&code_to_substr, &dict_len1](LzwCodeType code) -> uptr { |
123 | if (code < dict_len1.size()) |
124 | return 1; |
125 | const auto& s = code_to_substr[code - dict_len1.size()]; |
126 | return s.second - s.first; |
127 | }; |
128 | |
129 | // Main LZW decoding loop. |
130 | LzwCodeType prev_code = *begin; |
131 | ++begin; |
132 | out = copy(prev_code, out); |
133 | for (auto it = begin; it != end; ++it) { |
134 | LzwCodeType code = *it; |
135 | auto start = out; |
136 | if (code == dict_len1.size() + code_to_substr.size()) { |
137 | // Special LZW case. The code is not in the dictionary yet. This is |
138 | // possible only when the new substring is the same as previous one plus |
139 | // the first item of the previous substring. We can emit that in two |
140 | // steps. |
141 | out = copy(prev_code, out); |
142 | *out = *start; |
143 | ++out; |
144 | } else { |
145 | out = copy(code, out); |
146 | } |
147 | |
148 | // Every time encoded emits the code, it also creates substing of len + 1 |
149 | // including the first item of the just emmited substring. Do the same here. |
150 | uptr len = code_to_len(prev_code); |
151 | code_to_substr.push_back({start - len, start + 1}); |
152 | |
153 | prev_code = code; |
154 | } |
155 | return out; |
156 | } |
157 | |
158 | } // namespace __sanitizer |
159 | #endif |
160 | |