1 | //===-- Generate random but valid function descriptors -------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #include "automemcpy/RandomFunctionGenerator.h" |
10 | |
11 | #include <llvm/ADT/StringRef.h> |
12 | #include <llvm/Support/raw_ostream.h> |
13 | |
14 | #include <optional> |
15 | #include <set> |
16 | |
17 | namespace llvm { |
18 | namespace automemcpy { |
19 | |
20 | // Exploration parameters |
21 | // ---------------------- |
// Here we define a set of values that constrain the exploration and limit
// combinatorial explosion.
24 | |
// We limit the dedicated handling of individual sizes to sizes up to 4.
// Handling more individual sizes brings little benefit over the overlapping
// strategy.
27 | static constexpr int kMaxIndividualSize = 4; |
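// For illustration: with kMaxIndividualSize == 4, sizes 0, 1, 2, 3 and 4 each
// get their own dedicated code path.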
28 | |
// We limit the Overlapping Strategy to sizes up to 256.
// An overlap of 256B means accessing 128B at once, which is usually not
// feasible on current CPUs. We rely on the compiler to generate multiple
// loads/stores if needed, but higher sizes are unlikely to benefit from
// hardware acceleration.
34 | static constexpr int kMaxOverlapSize = 256; |
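// For illustration, the overlap strategy covers a range of sizes with two
// possibly-overlapping fixed-size accesses, one anchored at the start and one
// at the end of the buffer. A minimal sketch for Overlap<8> (hypothetical
// helper, not part of the generator):
//
//   static void CopyWithOverlap8(char *Dst, const char *Src, size_t Size) {
//     // Valid for Size in [4, 8]; the two 4-byte copies overlap when
//     // Size < 8.
//     __builtin_memcpy(Dst, Src, 4);
//     __builtin_memcpy(Dst + Size - 4, Src + Size - 4, 4);
//   }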
35 | |
36 | // For the loop strategies, we make sure that they iterate at least a certain |
37 | // number of times to amortize the cost of looping. |
38 | static constexpr int kLoopMinIter = 3; |
39 | static constexpr int kAlignedLoopMinIter = 2; |
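// For example, with kLoopMinIter == 3 and a 16-byte block, the loop region can
// only start above 3 * 16 = 48 bytes (see addLoopConstraints below).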
40 | |
41 | // We restrict the size of the block of data to handle in a loop. |
// Generally speaking, block sizes <= 16 perform poorly.
43 | static constexpr int kLoopBlockSize[] = {16, 32, 64}; |
44 | |
45 | // We restrict alignment to the following values. |
46 | static constexpr int kLoopAlignments[] = {16, 32, 64}; |
47 | |
// We make sure that each region bound is one of the following anchor values.
49 | static constexpr int kAnchors[] = {0, 1, 2, 4, 8, 16, 32, 48, |
50 | 64, 96, 128, 256, 512, 1024, kMaxSize}; |
51 | |
52 | // We also allow disabling loops, aligned loops and accelerators. |
53 | static constexpr bool kDisableLoop = false; |
54 | static constexpr bool kDisableAlignedLoop = false; |
55 | static constexpr bool kDisableAccelerator = false; |
56 | |
57 | // For memcpy, we can also explore whether aligning on source or destination has |
58 | // an effect. |
59 | static constexpr bool kExploreAlignmentArg = true; |
60 | |
// The functions we generate code for.
62 | // BCMP is specifically disabled for now. |
63 | static constexpr int kFunctionTypes[] = { |
64 | (int)FunctionType::MEMCPY, |
65 | (int)FunctionType::MEMCMP, |
66 | // (int)FunctionType::BCMP, |
67 | (int)FunctionType::MEMSET, |
68 | (int)FunctionType::BZERO, |
69 | }; |
70 | |
// The actual implementation of each function can be handled via primitive
// types (SCALAR), vector types where available (NATIVE), or by the compiler
// (BUILTIN). We want to move toward delegating the code generation entirely to
// the compiler, but for now we have to make use of per-microarchitecture
// custom implementations. Scalar is more portable but also less performant, so
// we disable it as well.
77 | static constexpr int kElementClasses[] = { |
78 | // (int)ElementTypeClass::SCALAR, |
79 | (int)ElementTypeClass::NATIVE, |
80 | // (int)ElementTypeClass::BUILTIN |
81 | }; |
82 | |
83 | RandomFunctionGenerator::RandomFunctionGenerator() |
    : Solver(Context), Type(Context.int_const("Type")),
      ContiguousBegin(Context.int_const("ContiguousBegin")),
      ContiguousEnd(Context.int_const("ContiguousEnd")),
      OverlapBegin(Context.int_const("OverlapBegin")),
      OverlapEnd(Context.int_const("OverlapEnd")),
      LoopBegin(Context.int_const("LoopBegin")),
      LoopEnd(Context.int_const("LoopEnd")),
      LoopBlockSize(Context.int_const("LoopBlockSize")),
      AlignedLoopBegin(Context.int_const("AlignedLoopBegin")),
      AlignedLoopEnd(Context.int_const("AlignedLoopEnd")),
      AlignedLoopBlockSize(Context.int_const("AlignedLoopBlockSize")),
      AlignedAlignment(Context.int_const("AlignedAlignment")),
      AlignedArg(Context.int_const("AlignedArg")),
      AcceleratorBegin(Context.int_const("AcceleratorBegin")),
      AcceleratorEnd(Context.int_const("AcceleratorEnd")),
      ElementClass(Context.int_const("ElementClass")) {
100 | // All possible functions. |
101 | Solver.add(inSetConstraint(Type, kFunctionTypes)); |
102 | |
103 | // Add constraints for region bounds. |
104 | addBoundsAndAnchors(ContiguousBegin, ContiguousEnd); |
105 | addBoundsAndAnchors(OverlapBegin, OverlapEnd); |
106 | addBoundsAndAnchors(LoopBegin, LoopEnd); |
107 | addBoundsAndAnchors(AlignedLoopBegin, AlignedLoopEnd); |
108 | addBoundsAndAnchors(AcceleratorBegin, AcceleratorEnd); |
109 | // We always consider strategies in this order, and we |
110 | // always end with the `Accelerator` strategy, as it's typically more |
111 | // efficient for large sizes. |
112 | // Contiguous <= Overlap <= Loop <= AlignedLoop <= Accelerator |
113 | Solver.add(ContiguousEnd == OverlapBegin); |
114 | Solver.add(OverlapEnd == LoopBegin); |
115 | Solver.add(LoopEnd == AlignedLoopBegin); |
116 | Solver.add(AlignedLoopEnd == AcceleratorBegin); |
117 | // Fix endpoints: The minimum size that we want to copy is 0, and we always |
118 | // start with the `Contiguous` strategy. The max size is `kMaxSize`. |
119 | Solver.add(ContiguousBegin == 0); |
120 | Solver.add(AcceleratorEnd == kMaxSize); |
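  // For illustration, one assignment satisfying the constraints in this
  // constructor (for memcpy) could partition [0, kMaxSize) as:
  //   Contiguous  [0, 4)
  //   Overlap     [4, 256)
  //   Loop        [256, 512)
  //   AlignedLoop [512, 1024)
  //   Accelerator [1024, kMaxSize)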
121 | // Contiguous |
122 | Solver.add(ContiguousEnd <= kMaxIndividualSize + 1); |
123 | // Overlap |
124 | Solver.add(OverlapEnd <= kMaxOverlapSize + 1); |
  // Overlap only ever makes sense when accessing multiple bytes at a time,
  // i.e. Overlap<1> is useless.
127 | Solver.add(OverlapBegin == OverlapEnd || OverlapBegin >= 2); |
128 | // Loop |
129 | addLoopConstraints(LoopBegin, LoopEnd, LoopBlockSize, kLoopMinIter); |
130 | // Aligned Loop |
131 | addLoopConstraints(AlignedLoopBegin, AlignedLoopEnd, AlignedLoopBlockSize, |
132 | kAlignedLoopMinIter); |
133 | Solver.add(inSetConstraint(AlignedAlignment, kLoopAlignments)); |
134 | Solver.add(AlignedLoopBegin == AlignedLoopEnd || AlignedLoopBegin >= 64); |
135 | Solver.add(AlignedLoopBlockSize >= AlignedAlignment); |
136 | Solver.add(AlignedLoopBlockSize >= LoopBlockSize); |
137 | z3::expr IsMemcpy = Type == (int)FunctionType::MEMCPY; |
138 | z3::expr ExploreAlignment = IsMemcpy && kExploreAlignmentArg; |
139 | Solver.add( |
140 | (ExploreAlignment && |
141 | inSetConstraint(AlignedArg, {(int)AlignArg::_1, (int)AlignArg::_2})) || |
142 | (!ExploreAlignment && AlignedArg == (int)AlignArg::_1)); |
  // Accelerator
  // Only memcpy has an accelerator for now.
  Solver.add(IsMemcpy || (AcceleratorBegin == AcceleratorEnd));
147 | // Element classes |
148 | Solver.add(inSetConstraint(ElementClass, kElementClasses)); |
149 | |
150 | if (kDisableLoop) |
151 | Solver.add(LoopBegin == LoopEnd); |
152 | if (kDisableAlignedLoop) |
153 | Solver.add(AlignedLoopBegin == AlignedLoopEnd); |
154 | if (kDisableAccelerator) |
155 | Solver.add(AcceleratorBegin == AcceleratorEnd); |
156 | } |
157 | |
// Creates a SizeSpan from Begin/End values.
// Returns std::nullopt if Begin == End.
160 | static std::optional<SizeSpan> AsSizeSpan(size_t Begin, size_t End) { |
161 | if (Begin == End) |
162 | return std::nullopt; |
163 | SizeSpan SS; |
164 | SS.Begin = Begin; |
165 | SS.End = End; |
166 | return SS; |
167 | } |
168 | |
// Generic helper to create a `Region` struct with a Span, or std::nullopt if
// the span is empty.
171 | template <typename Region> |
172 | static std::optional<Region> As(size_t Begin, size_t End) { |
173 | if (auto Span = AsSizeSpan(Begin, End)) { |
174 | Region Output; |
175 | Output.Span = *Span; |
176 | return Output; |
177 | } |
178 | return std::nullopt; |
179 | } |
180 | |
// Returns a Loop struct, or std::nullopt if the span is empty.
182 | static std::optional<Loop> AsLoop(size_t Begin, size_t End, size_t BlockSize) { |
183 | if (auto Span = AsSizeSpan(Begin, End)) { |
184 | Loop Output; |
185 | Output.Span = *Span; |
186 | Output.BlockSize = BlockSize; |
187 | return Output; |
188 | } |
189 | return std::nullopt; |
190 | } |
191 | |
// Returns an AlignedLoop struct, or std::nullopt if the span is empty.
193 | static std::optional<AlignedLoop> AsAlignedLoop(size_t Begin, size_t End, |
194 | size_t BlockSize, |
195 | size_t Alignment, |
196 | AlignArg AlignTo) { |
197 | if (auto Loop = AsLoop(Begin, End, BlockSize)) { |
198 | AlignedLoop Output; |
199 | Output.Loop = *Loop; |
200 | Output.Alignment = Alignment; |
201 | Output.AlignTo = AlignTo; |
202 | return Output; |
203 | } |
204 | return std::nullopt; |
205 | } |
206 | |
207 | std::optional<FunctionDescriptor> RandomFunctionGenerator::next() { |
208 | if (Solver.check() != z3::sat) |
209 | return {}; |
210 | |
211 | z3::model m = Solver.get_model(); |
212 | |
  // Helper lambda to get the current numerical value of a z3::expr.
214 | const auto E = [&m](z3::expr &V) -> int { |
215 | return m.eval(V).get_numeral_int(); |
216 | }; |
217 | |
  // Fill in the function descriptor to return.
219 | FunctionDescriptor R; |
220 | R.Type = FunctionType(E(Type)); |
221 | R.Contiguous = As<Contiguous>(E(ContiguousBegin), E(ContiguousEnd)); |
222 | R.Overlap = As<Overlap>(E(OverlapBegin), E(OverlapEnd)); |
223 | R.Loop = AsLoop(E(LoopBegin), E(LoopEnd), E(LoopBlockSize)); |
224 | R.AlignedLoop = AsAlignedLoop(E(AlignedLoopBegin), E(AlignedLoopEnd), |
225 | E(AlignedLoopBlockSize), E(AlignedAlignment), |
226 | AlignArg(E(AlignedArg))); |
227 | R.Accelerator = As<Accelerator>(E(AcceleratorBegin), E(AcceleratorEnd)); |
228 | R.ElementClass = ElementTypeClass(E(ElementClass)); |
229 | |
  // Express the current state as a set of constraints.
231 | z3::expr CurrentLayout = |
232 | (Type == E(Type)) && (ContiguousBegin == E(ContiguousBegin)) && |
233 | (ContiguousEnd == E(ContiguousEnd)) && |
234 | (OverlapBegin == E(OverlapBegin)) && (OverlapEnd == E(OverlapEnd)) && |
235 | (LoopBegin == E(LoopBegin)) && (LoopEnd == E(LoopEnd)) && |
236 | (LoopBlockSize == E(LoopBlockSize)) && |
237 | (AlignedLoopBegin == E(AlignedLoopBegin)) && |
238 | (AlignedLoopEnd == E(AlignedLoopEnd)) && |
239 | (AlignedLoopBlockSize == E(AlignedLoopBlockSize)) && |
240 | (AlignedAlignment == E(AlignedAlignment)) && |
241 | (AlignedArg == E(AlignedArg)) && |
242 | (AcceleratorBegin == E(AcceleratorBegin)) && |
243 | (AcceleratorEnd == E(AcceleratorEnd)) && |
244 | (ElementClass == E(ElementClass)); |
245 | |
  // Ask the solver to never produce this exact configuration again. Negating
  // the current model is the standard "blocking clause" technique for
  // enumerating all solutions.
247 | Solver.add(!CurrentLayout); |
248 | return R; |
249 | } |
250 | |
// Returns a constraint ensuring that `Variable` is one of the provided values.
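// For example, inSetConstraint(X, {16, 32, 64}) builds the z3 expression
// (X == 16 || X == 32 || X == 64).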
252 | z3::expr RandomFunctionGenerator::inSetConstraint(z3::expr &Variable, |
253 | ArrayRef<int> Values) const { |
254 | z3::expr_vector Args(Variable.ctx()); |
255 | for (int Value : Values) |
256 | Args.push_back(Variable == Value); |
257 | return z3::mk_or(Args); |
258 | } |
259 | |
260 | void RandomFunctionGenerator::addBoundsAndAnchors(z3::expr &Begin, |
261 | z3::expr &End) { |
262 | // Begin and End are picked amongst a set of predefined values. |
263 | Solver.add(inSetConstraint(Begin, kAnchors)); |
264 | Solver.add(inSetConstraint(End, kAnchors)); |
265 | Solver.add(Begin >= 0); |
266 | Solver.add(Begin <= End); |
267 | Solver.add(End <= kMaxSize); |
268 | } |
269 | |
270 | void RandomFunctionGenerator::addLoopConstraints(const z3::expr &LoopBegin, |
271 | const z3::expr &LoopEnd, |
272 | z3::expr &LoopBlockSize, |
273 | int LoopMinIter) { |
274 | Solver.add(inSetConstraint(LoopBlockSize, kLoopBlockSize)); |
275 | Solver.add(LoopBegin == LoopEnd || |
276 | (LoopBegin > (LoopMinIter * LoopBlockSize))); |
277 | } |
278 | |
279 | } // namespace automemcpy |
280 | } // namespace llvm |
281 | |