1 | //== ArrayBoundCheckerV2.cpp ------------------------------------*- C++ -*--==// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file defines ArrayBoundCheckerV2, which is a path-sensitive check |
10 | // which looks for an out-of-bound array element access. |
11 | // |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #include "clang/AST/CharUnits.h" |
15 | #include "clang/AST/ParentMapContext.h" |
16 | #include "clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h" |
17 | #include "clang/StaticAnalyzer/Checkers/Taint.h" |
18 | #include "clang/StaticAnalyzer/Core/BugReporter/BugType.h" |
19 | #include "clang/StaticAnalyzer/Core/Checker.h" |
20 | #include "clang/StaticAnalyzer/Core/CheckerManager.h" |
21 | #include "clang/StaticAnalyzer/Core/PathSensitive/APSIntType.h" |
22 | #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h" |
23 | #include "clang/StaticAnalyzer/Core/PathSensitive/DynamicExtent.h" |
24 | #include "clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h" |
25 | #include "llvm/ADT/SmallString.h" |
26 | #include "llvm/Support/FormatVariadic.h" |
27 | #include "llvm/Support/raw_ostream.h" |
28 | #include <optional> |
29 | |
30 | using namespace clang; |
31 | using namespace ento; |
32 | using namespace taint; |
33 | using llvm::formatv; |
34 | |
35 | namespace { |
36 | /// If `E` is a "clean" array subscript expression, return the type of the |
37 | /// accessed element. If the base of the subscript expression is modified by |
38 | /// pointer arithmetic (and not the beginning of a "full" memory region), this |
39 | /// always returns nullopt because that's the right (or the least bad) thing to |
40 | /// do for the diagnostic output that's relying on this. |
41 | static std::optional<QualType> determineElementType(const Expr *E, |
42 | const CheckerContext &C) { |
43 | const auto *ASE = dyn_cast<ArraySubscriptExpr>(Val: E); |
44 | if (!ASE) |
45 | return std::nullopt; |
46 | |
47 | const MemRegion *SubscriptBaseReg = C.getSVal(ASE->getBase()).getAsRegion(); |
48 | if (!SubscriptBaseReg) |
49 | return std::nullopt; |
50 | |
51 | // The base of the subscript expression is affected by pointer arithmetics, |
52 | // so we want to report byte offsets instead of indices. |
53 | if (isa<ElementRegion>(Val: SubscriptBaseReg->StripCasts())) |
54 | return std::nullopt; |
55 | |
56 | return ASE->getType(); |
57 | } |
58 | |
59 | static std::optional<int64_t> |
60 | determineElementSize(const std::optional<QualType> T, const CheckerContext &C) { |
61 | if (!T) |
62 | return std::nullopt; |
63 | return C.getASTContext().getTypeSizeInChars(T: *T).getQuantity(); |
64 | } |
65 | |
66 | class StateUpdateReporter { |
67 | const SubRegion *Reg; |
68 | const NonLoc ByteOffsetVal; |
69 | const std::optional<QualType> ElementType; |
70 | const std::optional<int64_t> ElementSize; |
71 | bool AssumedNonNegative = false; |
72 | std::optional<NonLoc> AssumedUpperBound = std::nullopt; |
73 | |
74 | public: |
75 | StateUpdateReporter(const SubRegion *R, NonLoc ByteOffsVal, const Expr *E, |
76 | CheckerContext &C) |
77 | : Reg(R), ByteOffsetVal(ByteOffsVal), |
78 | ElementType(determineElementType(E, C)), |
79 | ElementSize(determineElementSize(ElementType, C)) {} |
80 | |
81 | void recordNonNegativeAssumption() { AssumedNonNegative = true; } |
82 | void recordUpperBoundAssumption(NonLoc UpperBoundVal) { |
83 | AssumedUpperBound = UpperBoundVal; |
84 | } |
85 | |
86 | bool assumedNonNegative() { return AssumedNonNegative; } |
87 | |
88 | const NoteTag *createNoteTag(CheckerContext &C) const; |
89 | |
90 | private: |
91 | std::string getMessage(PathSensitiveBugReport &BR) const; |
92 | |
93 | /// Return true if information about the value of `Sym` can put constraints |
94 | /// on some symbol which is interesting within the bug report `BR`. |
95 | /// In particular, this returns true when `Sym` is interesting within `BR`; |
96 | /// but it also returns true if `Sym` is an expression that contains integer |
97 | /// constants and a single symbolic operand which is interesting (in `BR`). |
98 | /// We need to use this instead of plain `BR.isInteresting()` because if we |
99 | /// are analyzing code like |
100 | /// int array[10]; |
101 | /// int f(int arg) { |
102 | /// return array[arg] && array[arg + 10]; |
103 | /// } |
104 | /// then the byte offsets are `arg * 4` and `(arg + 10) * 4`, which are not |
105 | /// sub-expressions of each other (but `getSimplifiedOffsets` is smart enough |
106 | /// to detect this out of bounds access). |
107 | static bool providesInformationAboutInteresting(SymbolRef Sym, |
108 | PathSensitiveBugReport &BR); |
109 | static bool providesInformationAboutInteresting(SVal SV, |
110 | PathSensitiveBugReport &BR) { |
111 | return providesInformationAboutInteresting(Sym: SV.getAsSymbol(), BR); |
112 | } |
113 | }; |
114 | |
115 | struct Messages { |
116 | std::string Short, Full; |
117 | }; |
118 | |
119 | // NOTE: The `ArraySubscriptExpr` and `UnaryOperator` callbacks are `PostStmt` |
120 | // instead of `PreStmt` because the current implementation passes the whole |
121 | // expression to `CheckerContext::getSVal()` which only works after the |
122 | // symbolic evaluation of the expression. (To turn them into `PreStmt` |
123 | // callbacks, we'd need to duplicate the logic that evaluates these |
124 | // expressions.) The `MemberExpr` callback would work as `PreStmt` but it's |
125 | // defined as `PostStmt` for the sake of consistency with the other callbacks. |
126 | class ArrayBoundCheckerV2 : public Checker<check::PostStmt<ArraySubscriptExpr>, |
127 | check::PostStmt<UnaryOperator>, |
128 | check::PostStmt<MemberExpr>> { |
129 | BugType BT{this, "Out-of-bound access" }; |
130 | BugType TaintBT{this, "Out-of-bound access" , categories::TaintedData}; |
131 | |
132 | void performCheck(const Expr *E, CheckerContext &C) const; |
133 | |
134 | void reportOOB(CheckerContext &C, ProgramStateRef ErrorState, Messages Msgs, |
135 | NonLoc Offset, std::optional<NonLoc> Extent, |
136 | bool IsTaintBug = false) const; |
137 | |
138 | static void markPartsInteresting(PathSensitiveBugReport &BR, |
139 | ProgramStateRef ErrorState, NonLoc Val, |
140 | bool MarkTaint); |
141 | |
142 | static bool isFromCtypeMacro(const Stmt *S, ASTContext &AC); |
143 | |
144 | static bool isIdiomaticPastTheEndPtr(const Expr *E, ProgramStateRef State, |
145 | NonLoc Offset, NonLoc Limit, |
146 | CheckerContext &C); |
147 | static bool isInAddressOf(const Stmt *S, ASTContext &AC); |
148 | |
149 | public: |
150 | void checkPostStmt(const ArraySubscriptExpr *E, CheckerContext &C) const { |
151 | performCheck(E, C); |
152 | } |
153 | void checkPostStmt(const UnaryOperator *E, CheckerContext &C) const { |
154 | if (E->getOpcode() == UO_Deref) |
155 | performCheck(E, C); |
156 | } |
157 | void checkPostStmt(const MemberExpr *E, CheckerContext &C) const { |
158 | if (E->isArrow()) |
159 | performCheck(E: E->getBase(), C); |
160 | } |
161 | }; |
162 | |
163 | } // anonymous namespace |
164 | |
165 | /// For a given Location that can be represented as a symbolic expression |
166 | /// Arr[Idx] (or perhaps Arr[Idx1][Idx2] etc.), return the parent memory block |
167 | /// Arr and the distance of Location from the beginning of Arr (expressed in a |
168 | /// NonLoc that specifies the number of CharUnits). Returns nullopt when these |
169 | /// cannot be determined. |
170 | static std::optional<std::pair<const SubRegion *, NonLoc>> |
171 | computeOffset(ProgramStateRef State, SValBuilder &SVB, SVal Location) { |
172 | QualType T = SVB.getArrayIndexType(); |
173 | auto EvalBinOp = [&SVB, State, T](BinaryOperatorKind Op, NonLoc L, NonLoc R) { |
174 | // We will use this utility to add and multiply values. |
175 | return SVB.evalBinOpNN(state: State, op: Op, lhs: L, rhs: R, resultTy: T).getAs<NonLoc>(); |
176 | }; |
177 | |
178 | const SubRegion *OwnerRegion = nullptr; |
179 | std::optional<NonLoc> Offset = SVB.makeZeroArrayIndex(); |
180 | |
181 | const ElementRegion *CurRegion = |
182 | dyn_cast_or_null<ElementRegion>(Val: Location.getAsRegion()); |
183 | |
184 | while (CurRegion) { |
185 | const auto Index = CurRegion->getIndex().getAs<NonLoc>(); |
186 | if (!Index) |
187 | return std::nullopt; |
188 | |
189 | QualType ElemType = CurRegion->getElementType(); |
190 | |
191 | // FIXME: The following early return was presumably added to safeguard the |
192 | // getTypeSizeInChars() call (which doesn't accept an incomplete type), but |
193 | // it seems that `ElemType` cannot be incomplete at this point. |
194 | if (ElemType->isIncompleteType()) |
195 | return std::nullopt; |
196 | |
197 | // Calculate Delta = Index * sizeof(ElemType). |
198 | NonLoc Size = SVB.makeArrayIndex( |
199 | idx: SVB.getContext().getTypeSizeInChars(T: ElemType).getQuantity()); |
200 | auto Delta = EvalBinOp(BO_Mul, *Index, Size); |
201 | if (!Delta) |
202 | return std::nullopt; |
203 | |
204 | // Perform Offset += Delta. |
205 | Offset = EvalBinOp(BO_Add, *Offset, *Delta); |
206 | if (!Offset) |
207 | return std::nullopt; |
208 | |
209 | OwnerRegion = CurRegion->getSuperRegion()->getAs<SubRegion>(); |
210 | // When this is just another ElementRegion layer, we need to continue the |
211 | // offset calculations: |
212 | CurRegion = dyn_cast_or_null<ElementRegion>(Val: OwnerRegion); |
213 | } |
214 | |
215 | if (OwnerRegion) |
216 | return std::make_pair(x&: OwnerRegion, y&: *Offset); |
217 | |
218 | return std::nullopt; |
219 | } |
220 | |
221 | // NOTE: This function is the "heart" of this checker. It simplifies |
222 | // inequalities with transformations that are valid (and very elementary) in |
223 | // pure mathematics, but become invalid if we use them in C++ number model |
224 | // where the calculations may overflow. |
225 | // Due to the overflow issues I think it's impossible (or at least not |
226 | // practical) to integrate this kind of simplification into the resolution of |
227 | // arbitrary inequalities (i.e. the code of `evalBinOp`); but this function |
228 | // produces valid results when the calculations are handling memory offsets |
229 | // and every value is well below SIZE_MAX. |
230 | // TODO: This algorithm should be moved to a central location where it's |
231 | // available for other checkers that need to compare memory offsets. |
232 | // NOTE: the simplification preserves the order of the two operands in a |
233 | // mathematical sense, but it may change the result produced by a C++ |
234 | // comparison operator (and the automatic type conversions). |
235 | // For example, consider a comparison "X+1 < 0", where the LHS is stored as a |
236 | // size_t and the RHS is stored in an int. (As size_t is unsigned, this |
237 | // comparison is false for all values of "X".) However, the simplification may |
238 | // turn it into "X < -1", which is still always false in a mathematical sense, |
239 | // but can produce a true result when evaluated by `evalBinOp` (which follows |
240 | // the rules of C++ and casts -1 to SIZE_MAX). |
241 | static std::pair<NonLoc, nonloc::ConcreteInt> |
242 | getSimplifiedOffsets(NonLoc offset, nonloc::ConcreteInt extent, |
243 | SValBuilder &svalBuilder) { |
244 | std::optional<nonloc::SymbolVal> SymVal = offset.getAs<nonloc::SymbolVal>(); |
245 | if (SymVal && SymVal->isExpression()) { |
246 | if (const SymIntExpr *SIE = dyn_cast<SymIntExpr>(Val: SymVal->getSymbol())) { |
247 | llvm::APSInt constant = |
248 | APSIntType(extent.getValue()).convert(Value: SIE->getRHS()); |
249 | switch (SIE->getOpcode()) { |
250 | case BO_Mul: |
251 | // The constant should never be 0 here, becasue multiplication by zero |
252 | // is simplified by the engine. |
253 | if ((extent.getValue() % constant) != 0) |
254 | return std::pair<NonLoc, nonloc::ConcreteInt>(offset, extent); |
255 | else |
256 | return getSimplifiedOffsets( |
257 | offset: nonloc::SymbolVal(SIE->getLHS()), |
258 | extent: svalBuilder.makeIntVal(integer: extent.getValue() / constant), |
259 | svalBuilder); |
260 | case BO_Add: |
261 | return getSimplifiedOffsets( |
262 | offset: nonloc::SymbolVal(SIE->getLHS()), |
263 | extent: svalBuilder.makeIntVal(integer: extent.getValue() - constant), svalBuilder); |
264 | default: |
265 | break; |
266 | } |
267 | } |
268 | } |
269 | |
270 | return std::pair<NonLoc, nonloc::ConcreteInt>(offset, extent); |
271 | } |
272 | |
273 | static bool isNegative(SValBuilder &SVB, ProgramStateRef State, NonLoc Value) { |
274 | const llvm::APSInt *MaxV = SVB.getMaxValue(state: State, val: Value); |
275 | return MaxV && MaxV->isNegative(); |
276 | } |
277 | |
278 | static bool isUnsigned(SValBuilder &SVB, NonLoc Value) { |
279 | QualType T = Value.getType(SVB.getContext()); |
280 | return T->isUnsignedIntegerType(); |
281 | } |
282 | |
283 | // Evaluate the comparison Value < Threshold with the help of the custom |
284 | // simplification algorithm defined for this checker. Return a pair of states, |
285 | // where the first one corresponds to "value below threshold" and the second |
286 | // corresponds to "value at or above threshold". Returns {nullptr, nullptr} in |
287 | // the case when the evaluation fails. |
288 | // If the optional argument CheckEquality is true, then use BO_EQ instead of |
289 | // the default BO_LT after consistently applying the same simplification steps. |
290 | static std::pair<ProgramStateRef, ProgramStateRef> |
291 | compareValueToThreshold(ProgramStateRef State, NonLoc Value, NonLoc Threshold, |
292 | SValBuilder &SVB, bool CheckEquality = false) { |
293 | if (auto ConcreteThreshold = Threshold.getAs<nonloc::ConcreteInt>()) { |
294 | std::tie(args&: Value, args&: Threshold) = getSimplifiedOffsets(offset: Value, extent: *ConcreteThreshold, svalBuilder&: SVB); |
295 | } |
296 | |
297 | // We want to perform a _mathematical_ comparison between the numbers `Value` |
298 | // and `Threshold`; but `evalBinOpNN` evaluates a C/C++ operator that may |
299 | // perform automatic conversions. For example the number -1 is less than the |
300 | // number 1000, but -1 < `1000ull` will evaluate to `false` because the `int` |
301 | // -1 is converted to ULONGLONG_MAX. |
302 | // To avoid automatic conversions, we evaluate the "obvious" cases without |
303 | // calling `evalBinOpNN`: |
304 | if (isNegative(SVB, State, Value) && isUnsigned(SVB, Value: Threshold)) { |
305 | if (CheckEquality) { |
306 | // negative_value == unsigned_threshold is always false |
307 | return {nullptr, State}; |
308 | } |
309 | // negative_value < unsigned_threshold is always true |
310 | return {State, nullptr}; |
311 | } |
312 | if (isUnsigned(SVB, Value) && isNegative(SVB, State, Value: Threshold)) { |
313 | // unsigned_value == negative_threshold and |
314 | // unsigned_value < negative_threshold are both always false |
315 | return {nullptr, State}; |
316 | } |
317 | // FIXME: These special cases are sufficient for handling real-world |
318 | // comparisons, but in theory there could be contrived situations where |
319 | // automatic conversion of a symbolic value (which can be negative and can be |
320 | // positive) leads to incorrect results. |
321 | // NOTE: We NEED to use the `evalBinOpNN` call in the "common" case, because |
322 | // we want to ensure that assumptions coming from this precondition and |
323 | // assumptions coming from regular C/C++ operator calls are represented by |
324 | // constraints on the same symbolic expression. A solution that would |
325 | // evaluate these "mathematical" compariosns through a separate pathway would |
326 | // be a step backwards in this sense. |
327 | |
328 | const BinaryOperatorKind OpKind = CheckEquality ? BO_EQ : BO_LT; |
329 | auto BelowThreshold = |
330 | SVB.evalBinOpNN(state: State, op: OpKind, lhs: Value, rhs: Threshold, resultTy: SVB.getConditionType()) |
331 | .getAs<NonLoc>(); |
332 | |
333 | if (BelowThreshold) |
334 | return State->assume(Cond: *BelowThreshold); |
335 | |
336 | return {nullptr, nullptr}; |
337 | } |
338 | |
339 | static std::string getRegionName(const SubRegion *Region) { |
340 | if (std::string RegName = Region->getDescriptiveName(); !RegName.empty()) |
341 | return RegName; |
342 | |
343 | // Field regions only have descriptive names when their parent has a |
344 | // descriptive name; so we provide a fallback representation for them: |
345 | if (const auto *FR = Region->getAs<FieldRegion>()) { |
346 | if (StringRef Name = FR->getDecl()->getName(); !Name.empty()) |
347 | return formatv(Fmt: "the field '{0}'" , Vals&: Name); |
348 | return "the unnamed field" ; |
349 | } |
350 | |
351 | if (isa<AllocaRegion>(Val: Region)) |
352 | return "the memory returned by 'alloca'" ; |
353 | |
354 | if (isa<SymbolicRegion>(Val: Region) && |
355 | isa<HeapSpaceRegion>(Val: Region->getMemorySpace())) |
356 | return "the heap area" ; |
357 | |
358 | if (isa<StringRegion>(Val: Region)) |
359 | return "the string literal" ; |
360 | |
361 | return "the region" ; |
362 | } |
363 | |
364 | static std::optional<int64_t> getConcreteValue(NonLoc SV) { |
365 | if (auto ConcreteVal = SV.getAs<nonloc::ConcreteInt>()) { |
366 | return ConcreteVal->getValue().tryExtValue(); |
367 | } |
368 | return std::nullopt; |
369 | } |
370 | |
371 | static std::optional<int64_t> getConcreteValue(std::optional<NonLoc> SV) { |
372 | return SV ? getConcreteValue(SV: *SV) : std::nullopt; |
373 | } |
374 | |
375 | static Messages getPrecedesMsgs(const SubRegion *Region, NonLoc Offset) { |
376 | std::string RegName = getRegionName(Region); |
377 | SmallString<128> Buf; |
378 | llvm::raw_svector_ostream Out(Buf); |
379 | Out << "Access of " << RegName << " at negative byte offset" ; |
380 | if (auto ConcreteIdx = Offset.getAs<nonloc::ConcreteInt>()) |
381 | Out << ' ' << ConcreteIdx->getValue(); |
382 | return {.Short: formatv(Fmt: "Out of bound access to memory preceding {0}" , Vals&: RegName), |
383 | .Full: std::string(Buf)}; |
384 | } |
385 | |
386 | /// Try to divide `Val1` and `Val2` (in place) by `Divisor` and return true if |
387 | /// it can be performed (`Divisor` is nonzero and there is no remainder). The |
388 | /// values `Val1` and `Val2` may be nullopt and in that case the corresponding |
389 | /// division is considered to be successful. |
390 | static bool tryDividePair(std::optional<int64_t> &Val1, |
391 | std::optional<int64_t> &Val2, int64_t Divisor) { |
392 | if (!Divisor) |
393 | return false; |
394 | const bool Val1HasRemainder = Val1 && *Val1 % Divisor; |
395 | const bool Val2HasRemainder = Val2 && *Val2 % Divisor; |
396 | if (!Val1HasRemainder && !Val2HasRemainder) { |
397 | if (Val1) |
398 | *Val1 /= Divisor; |
399 | if (Val2) |
400 | *Val2 /= Divisor; |
401 | return true; |
402 | } |
403 | return false; |
404 | } |
405 | |
406 | static Messages getExceedsMsgs(ASTContext &ACtx, const SubRegion *Region, |
407 | NonLoc Offset, NonLoc Extent, SVal Location, |
408 | bool AlsoMentionUnderflow) { |
409 | std::string RegName = getRegionName(Region); |
410 | const auto *EReg = Location.getAsRegion()->getAs<ElementRegion>(); |
411 | assert(EReg && "this checker only handles element access" ); |
412 | QualType ElemType = EReg->getElementType(); |
413 | |
414 | std::optional<int64_t> OffsetN = getConcreteValue(SV: Offset); |
415 | std::optional<int64_t> ExtentN = getConcreteValue(SV: Extent); |
416 | |
417 | int64_t ElemSize = ACtx.getTypeSizeInChars(T: ElemType).getQuantity(); |
418 | |
419 | bool UseByteOffsets = !tryDividePair(Val1&: OffsetN, Val2&: ExtentN, Divisor: ElemSize); |
420 | const char *OffsetOrIndex = UseByteOffsets ? "byte offset" : "index" ; |
421 | |
422 | SmallString<256> Buf; |
423 | llvm::raw_svector_ostream Out(Buf); |
424 | Out << "Access of " ; |
425 | if (!ExtentN && !UseByteOffsets) |
426 | Out << "'" << ElemType.getAsString() << "' element in " ; |
427 | Out << RegName << " at " ; |
428 | if (AlsoMentionUnderflow) { |
429 | Out << "a negative or overflowing " << OffsetOrIndex; |
430 | } else if (OffsetN) { |
431 | Out << OffsetOrIndex << " " << *OffsetN; |
432 | } else { |
433 | Out << "an overflowing " << OffsetOrIndex; |
434 | } |
435 | if (ExtentN) { |
436 | Out << ", while it holds only " ; |
437 | if (*ExtentN != 1) |
438 | Out << *ExtentN; |
439 | else |
440 | Out << "a single" ; |
441 | if (UseByteOffsets) |
442 | Out << " byte" ; |
443 | else |
444 | Out << " '" << ElemType.getAsString() << "' element" ; |
445 | |
446 | if (*ExtentN > 1) |
447 | Out << "s" ; |
448 | } |
449 | |
450 | return {.Short: formatv(Fmt: "Out of bound access to memory {0} {1}" , |
451 | Vals: AlsoMentionUnderflow ? "around" : "after the end of" , |
452 | Vals&: RegName), |
453 | .Full: std::string(Buf)}; |
454 | } |
455 | |
456 | static Messages getTaintMsgs(const SubRegion *Region, const char *OffsetName, |
457 | bool AlsoMentionUnderflow) { |
458 | std::string RegName = getRegionName(Region); |
459 | return {.Short: formatv(Fmt: "Potential out of bound access to {0} with tainted {1}" , |
460 | Vals&: RegName, Vals&: OffsetName), |
461 | .Full: formatv(Fmt: "Access of {0} with a tainted {1} that may be {2}too large" , |
462 | Vals&: RegName, Vals&: OffsetName, |
463 | Vals: AlsoMentionUnderflow ? "negative or " : "" )}; |
464 | } |
465 | |
466 | const NoteTag *StateUpdateReporter::createNoteTag(CheckerContext &C) const { |
467 | // Don't create a note tag if we didn't assume anything: |
468 | if (!AssumedNonNegative && !AssumedUpperBound) |
469 | return nullptr; |
470 | |
471 | return C.getNoteTag([*this](PathSensitiveBugReport &BR) -> std::string { |
472 | return getMessage(BR); |
473 | }); |
474 | } |
475 | |
476 | std::string StateUpdateReporter::getMessage(PathSensitiveBugReport &BR) const { |
477 | bool ShouldReportNonNegative = AssumedNonNegative; |
478 | if (!providesInformationAboutInteresting(SV: ByteOffsetVal, BR)) { |
479 | if (AssumedUpperBound && |
480 | providesInformationAboutInteresting(SV: *AssumedUpperBound, BR)) { |
481 | // Even if the byte offset isn't interesting (e.g. it's a constant value), |
482 | // the assumption can still be interesting if it provides information |
483 | // about an interesting symbolic upper bound. |
484 | ShouldReportNonNegative = false; |
485 | } else { |
486 | // We don't have anything interesting, don't report the assumption. |
487 | return "" ; |
488 | } |
489 | } |
490 | |
491 | std::optional<int64_t> OffsetN = getConcreteValue(SV: ByteOffsetVal); |
492 | std::optional<int64_t> ExtentN = getConcreteValue(SV: AssumedUpperBound); |
493 | |
494 | const bool UseIndex = |
495 | ElementSize && tryDividePair(Val1&: OffsetN, Val2&: ExtentN, Divisor: *ElementSize); |
496 | |
497 | SmallString<256> Buf; |
498 | llvm::raw_svector_ostream Out(Buf); |
499 | Out << "Assuming " ; |
500 | if (UseIndex) { |
501 | Out << "index " ; |
502 | if (OffsetN) |
503 | Out << "'" << OffsetN << "' " ; |
504 | } else if (AssumedUpperBound) { |
505 | Out << "byte offset " ; |
506 | if (OffsetN) |
507 | Out << "'" << OffsetN << "' " ; |
508 | } else { |
509 | Out << "offset " ; |
510 | } |
511 | |
512 | Out << "is" ; |
513 | if (ShouldReportNonNegative) { |
514 | Out << " non-negative" ; |
515 | } |
516 | if (AssumedUpperBound) { |
517 | if (ShouldReportNonNegative) |
518 | Out << " and" ; |
519 | Out << " less than " ; |
520 | if (ExtentN) |
521 | Out << *ExtentN << ", " ; |
522 | if (UseIndex && ElementType) |
523 | Out << "the number of '" << ElementType->getAsString() |
524 | << "' elements in " ; |
525 | else |
526 | Out << "the extent of " ; |
527 | Out << getRegionName(Region: Reg); |
528 | } |
529 | return std::string(Out.str()); |
530 | } |
531 | |
532 | bool StateUpdateReporter::providesInformationAboutInteresting( |
533 | SymbolRef Sym, PathSensitiveBugReport &BR) { |
534 | if (!Sym) |
535 | return false; |
536 | for (SymbolRef PartSym : Sym->symbols()) { |
537 | // The interestingess mark may appear on any layer as we're stripping off |
538 | // the SymIntExpr, UnarySymExpr etc. layers... |
539 | if (BR.isInteresting(sym: PartSym)) |
540 | return true; |
541 | // ...but if both sides of the expression are symbolic, then there is no |
542 | // practical algorithm to produce separate constraints for the two |
543 | // operands (from the single combined result). |
544 | if (isa<SymSymExpr>(Val: PartSym)) |
545 | return false; |
546 | } |
547 | return false; |
548 | } |
549 | |
550 | void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const { |
551 | const SVal Location = C.getSVal(E); |
552 | |
553 | // The header ctype.h (from e.g. glibc) implements the isXXXXX() macros as |
554 | // #define isXXXXX(arg) (LOOKUP_TABLE[arg] & BITMASK_FOR_XXXXX) |
555 | // and incomplete analysis of these leads to false positives. As even |
556 | // accurate reports would be confusing for the users, just disable reports |
557 | // from these macros: |
558 | if (isFromCtypeMacro(E, C.getASTContext())) |
559 | return; |
560 | |
561 | ProgramStateRef State = C.getState(); |
562 | SValBuilder &SVB = C.getSValBuilder(); |
563 | |
564 | const std::optional<std::pair<const SubRegion *, NonLoc>> &RawOffset = |
565 | computeOffset(State, SVB, Location); |
566 | |
567 | if (!RawOffset) |
568 | return; |
569 | |
570 | auto [Reg, ByteOffset] = *RawOffset; |
571 | |
572 | // The state updates will be reported as a single note tag, which will be |
573 | // composed by this helper class. |
574 | StateUpdateReporter SUR(Reg, ByteOffset, E, C); |
575 | |
576 | // CHECK LOWER BOUND |
577 | const MemSpaceRegion *Space = Reg->getMemorySpace(); |
578 | if (!(isa<SymbolicRegion>(Reg) && isa<UnknownSpaceRegion>(Val: Space))) { |
579 | // A symbolic region in unknown space represents an unknown pointer that |
580 | // may point into the middle of an array, so we don't look for underflows. |
581 | // Both conditions are significant because we want to check underflows in |
582 | // symbolic regions on the heap (which may be introduced by checkers like |
583 | // MallocChecker that call SValBuilder::getConjuredHeapSymbolVal()) and |
584 | // non-symbolic regions (e.g. a field subregion of a symbolic region) in |
585 | // unknown space. |
586 | auto [PrecedesLowerBound, WithinLowerBound] = compareValueToThreshold( |
587 | State, ByteOffset, SVB.makeZeroArrayIndex(), SVB); |
588 | |
589 | if (PrecedesLowerBound) { |
590 | // The offset may be invalid (negative)... |
591 | if (!WithinLowerBound) { |
592 | // ...and it cannot be valid (>= 0), so report an error. |
593 | Messages Msgs = getPrecedesMsgs(Reg, ByteOffset); |
594 | reportOOB(C, ErrorState: PrecedesLowerBound, Msgs, Offset: ByteOffset, Extent: std::nullopt); |
595 | return; |
596 | } |
597 | // ...but it can be valid as well, so the checker will (optimistically) |
598 | // assume that it's valid and mention this in the note tag. |
599 | SUR.recordNonNegativeAssumption(); |
600 | } |
601 | |
602 | // Actually update the state. The "if" only fails in the extremely unlikely |
603 | // case when compareValueToThreshold returns {nullptr, nullptr} becasue |
604 | // evalBinOpNN fails to evaluate the less-than operator. |
605 | if (WithinLowerBound) |
606 | State = WithinLowerBound; |
607 | } |
608 | |
609 | // CHECK UPPER BOUND |
610 | DefinedOrUnknownSVal Size = getDynamicExtent(State, Reg, SVB); |
611 | if (auto KnownSize = Size.getAs<NonLoc>()) { |
612 | // In a situation where both overflow and overflow are possible (but the |
613 | // index is either tainted or known to be invalid), the logic of this |
614 | // checker will first assume that the offset is non-negative, and then |
615 | // (with this additional assumption) it will detect an overflow error. |
616 | // In this situation the warning message should mention both possibilities. |
617 | bool AlsoMentionUnderflow = SUR.assumedNonNegative(); |
618 | |
619 | auto [WithinUpperBound, ExceedsUpperBound] = |
620 | compareValueToThreshold(State, ByteOffset, *KnownSize, SVB); |
621 | |
622 | if (ExceedsUpperBound) { |
623 | // The offset may be invalid (>= Size)... |
624 | if (!WithinUpperBound) { |
625 | // ...and it cannot be within bounds, so report an error, unless we can |
626 | // definitely determine that this is an idiomatic `&array[size]` |
627 | // expression that calculates the past-the-end pointer. |
628 | if (isIdiomaticPastTheEndPtr(E, State: ExceedsUpperBound, Offset: ByteOffset, |
629 | Limit: *KnownSize, C)) { |
630 | C.addTransition(ExceedsUpperBound, SUR.createNoteTag(C)); |
631 | return; |
632 | } |
633 | |
634 | Messages Msgs = |
635 | getExceedsMsgs(C.getASTContext(), Reg, ByteOffset, *KnownSize, |
636 | Location, AlsoMentionUnderflow); |
637 | reportOOB(C, ErrorState: ExceedsUpperBound, Msgs, Offset: ByteOffset, Extent: KnownSize); |
638 | return; |
639 | } |
640 | // ...and it can be valid as well... |
641 | if (isTainted(State, ByteOffset)) { |
642 | // ...but it's tainted, so report an error. |
643 | |
644 | // Diagnostic detail: saying "tainted offset" is always correct, but |
645 | // the common case is that 'idx' is tainted in 'arr[idx]' and then it's |
646 | // nicer to say "tainted index". |
647 | const char *OffsetName = "offset" ; |
648 | if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(Val: E)) |
649 | if (isTainted(State, ASE->getIdx(), C.getLocationContext())) |
650 | OffsetName = "index" ; |
651 | |
652 | Messages Msgs = getTaintMsgs(Reg, OffsetName, AlsoMentionUnderflow); |
653 | reportOOB(C, ErrorState: ExceedsUpperBound, Msgs, Offset: ByteOffset, Extent: KnownSize, |
654 | /*IsTaintBug=*/true); |
655 | return; |
656 | } |
657 | // ...and it isn't tainted, so the checker will (optimistically) assume |
658 | // that the offset is in bounds and mention this in the note tag. |
659 | SUR.recordUpperBoundAssumption(UpperBoundVal: *KnownSize); |
660 | } |
661 | |
662 | // Actually update the state. The "if" only fails in the extremely unlikely |
663 | // case when compareValueToThreshold returns {nullptr, nullptr} becasue |
664 | // evalBinOpNN fails to evaluate the less-than operator. |
665 | if (WithinUpperBound) |
666 | State = WithinUpperBound; |
667 | } |
668 | |
669 | // Add a transition, reporting the state updates that we accumulated. |
670 | C.addTransition(State, Tag: SUR.createNoteTag(C)); |
671 | } |
672 | |
673 | void ArrayBoundCheckerV2::markPartsInteresting(PathSensitiveBugReport &BR, |
674 | ProgramStateRef ErrorState, |
675 | NonLoc Val, bool MarkTaint) { |
676 | if (SymbolRef Sym = Val.getAsSymbol()) { |
677 | // If the offset is a symbolic value, iterate over its "parts" with |
678 | // `SymExpr::symbols()` and mark each of them as interesting. |
679 | // For example, if the offset is `x*4 + y` then we put interestingness onto |
680 | // the SymSymExpr `x*4 + y`, the SymIntExpr `x*4` and the two data symbols |
681 | // `x` and `y`. |
682 | for (SymbolRef PartSym : Sym->symbols()) |
683 | BR.markInteresting(sym: PartSym); |
684 | } |
685 | |
686 | if (MarkTaint) { |
687 | // If the issue that we're reporting depends on the taintedness of the |
688 | // offset, then put interestingness onto symbols that could be the origin |
689 | // of the taint. Note that this may find symbols that did not appear in |
690 | // `Sym->symbols()` (because they're only loosely connected to `Val`). |
691 | for (SymbolRef Sym : getTaintedSymbols(State: ErrorState, V: Val)) |
692 | BR.markInteresting(sym: Sym); |
693 | } |
694 | } |
695 | |
696 | void ArrayBoundCheckerV2::reportOOB(CheckerContext &C, |
697 | ProgramStateRef ErrorState, Messages Msgs, |
698 | NonLoc Offset, std::optional<NonLoc> Extent, |
699 | bool IsTaintBug /*=false*/) const { |
700 | |
701 | ExplodedNode *ErrorNode = C.generateErrorNode(State: ErrorState); |
702 | if (!ErrorNode) |
703 | return; |
704 | |
705 | auto BR = std::make_unique<PathSensitiveBugReport>( |
706 | args: IsTaintBug ? TaintBT : BT, args&: Msgs.Short, args&: Msgs.Full, args&: ErrorNode); |
707 | |
708 | // FIXME: ideally we would just call trackExpressionValue() and that would |
709 | // "do the right thing": mark the relevant symbols as interesting, track the |
710 | // control dependencies and statements storing the relevant values and add |
711 | // helpful diagnostic pieces. However, right now trackExpressionValue() is |
712 | // a heap of unreliable heuristics, so it would cause several issues: |
713 | // - Interestingness is not applied consistently, e.g. if `array[x+10]` |
714 | // causes an overflow, then `x` is not marked as interesting. |
715 | // - We get irrelevant diagnostic pieces, e.g. in the code |
716 | // `int *p = (int*)malloc(2*sizeof(int)); p[3] = 0;` |
717 | // it places a "Storing uninitialized value" note on the `malloc` call |
718 | // (which is technically true, but irrelevant). |
719 | // If trackExpressionValue() becomes reliable, it should be applied instead |
720 | // of this custom markPartsInteresting(). |
721 | markPartsInteresting(BR&: *BR, ErrorState, Val: Offset, MarkTaint: IsTaintBug); |
722 | if (Extent) |
723 | markPartsInteresting(BR&: *BR, ErrorState, Val: *Extent, MarkTaint: IsTaintBug); |
724 | |
725 | C.emitReport(R: std::move(BR)); |
726 | } |
727 | |
728 | bool ArrayBoundCheckerV2::isFromCtypeMacro(const Stmt *S, ASTContext &ACtx) { |
729 | SourceLocation Loc = S->getBeginLoc(); |
730 | if (!Loc.isMacroID()) |
731 | return false; |
732 | |
733 | StringRef MacroName = Lexer::getImmediateMacroName( |
734 | Loc, SM: ACtx.getSourceManager(), LangOpts: ACtx.getLangOpts()); |
735 | |
736 | if (MacroName.size() < 7 || MacroName[0] != 'i' || MacroName[1] != 's') |
737 | return false; |
738 | |
739 | return ((MacroName == "isalnum" ) || (MacroName == "isalpha" ) || |
740 | (MacroName == "isblank" ) || (MacroName == "isdigit" ) || |
741 | (MacroName == "isgraph" ) || (MacroName == "islower" ) || |
742 | (MacroName == "isnctrl" ) || (MacroName == "isprint" ) || |
743 | (MacroName == "ispunct" ) || (MacroName == "isspace" ) || |
744 | (MacroName == "isupper" ) || (MacroName == "isxdigit" )); |
745 | } |
746 | |
747 | bool ArrayBoundCheckerV2::isInAddressOf(const Stmt *S, ASTContext &ACtx) { |
748 | ParentMapContext &ParentCtx = ACtx.getParentMapContext(); |
749 | do { |
750 | const DynTypedNodeList Parents = ParentCtx.getParents(Node: *S); |
751 | if (Parents.empty()) |
752 | return false; |
753 | S = Parents[0].get<Stmt>(); |
754 | } while (isa_and_nonnull<ParenExpr, ImplicitCastExpr>(Val: S)); |
755 | const auto *UnaryOp = dyn_cast_or_null<UnaryOperator>(Val: S); |
756 | return UnaryOp && UnaryOp->getOpcode() == UO_AddrOf; |
757 | } |
758 | |
759 | bool ArrayBoundCheckerV2::isIdiomaticPastTheEndPtr(const Expr *E, |
760 | ProgramStateRef State, |
761 | NonLoc Offset, NonLoc Limit, |
762 | CheckerContext &C) { |
763 | if (isa<ArraySubscriptExpr>(Val: E) && isInAddressOf(E, C.getASTContext())) { |
764 | auto [EqualsToThreshold, NotEqualToThreshold] = compareValueToThreshold( |
765 | State, Value: Offset, Threshold: Limit, SVB&: C.getSValBuilder(), /*CheckEquality=*/true); |
766 | return EqualsToThreshold && !NotEqualToThreshold; |
767 | } |
768 | return false; |
769 | } |
770 | |
771 | void ento::registerArrayBoundCheckerV2(CheckerManager &mgr) { |
772 | mgr.registerChecker<ArrayBoundCheckerV2>(); |
773 | } |
774 | |
775 | bool ento::shouldRegisterArrayBoundCheckerV2(const CheckerManager &mgr) { |
776 | return true; |
777 | } |
778 | |