//! Cosine function implemented using a parabolic approximation.
//!
//! Note that while this method is inspired by a [Taylor series] approximation,
//! it provides both better performance and accuracy.
//!
//! Most of the other trigonometric functions in this crate are implemented in
//! terms of this approximation.
//!
//! Method described: <https://stackoverflow.com/posts/28050328/revisions>
//!
//! Originally based on this approximation by Nick Capen:
//!
//! <https://web.archive.org/web/20180105200343if_/http://forum.devmaster.net/t/fast-and-accurate-sine-cosine/9648>
//!
//! Post quoted below for posterity as the original site is down. Rust sketches
//! of the resulting sine and cosine formulas follow the quote:
//!
//! > In some situations you need a good approximation of sine and cosine that
//! > runs at very high performance. One example is to implement dynamic
//! > subdivision of round surfaces, comparable to those in Quake 3. Or to
//! > implement wave motion, in case there are no vertex shaders 2.0 available.
//! >
//! > The standard C `sinf()` and `cosf()` functions are terribly slow and offer
//! > much more precision than what we actually need. What we really want is an
//! > approximation that offers the best compromise between precision and
//! > performance. The most well known approximation method is to use a
//! > [Taylor series] about 0 (also known as a Maclaurin series), which for
//! > sine becomes:
//! >
//! > ```text
//! > x - 1/6 x^3 + 1/120 x^5 - 1/5040 x^7 + ...
//! > ```
//! >
//! > When we plot this we get: taylor.gif. The green line is the real sine,
//! > the red line is the first four terms of the Taylor series. This seems
//! > like an acceptable approximation, but let's have a closer look:
//! > taylor_zoom.gif. It behaves very nicely up till `pi/2`, but after that it
//! > quickly deviates. At pi, it evaluates to -0.075 instead of 0. Using this
//! > for wave simulation will result in jerky motion which is unacceptable.
//! >
//! > We could add another term, which in fact reduces the error significantly,
//! > but this makes the formula pretty lengthy. We already need 7
//! > multiplications and 3 additions for the 4-term version. The Taylor series
//! > just can't give us the precision and performance we're looking for.
//! >
//! > We did however note that we need `sin(pi) = 0`. And there's another thing
//! > we can see from taylor_zoom.gif: this looks very much like a parabola!
//! > So let's try to find the formula of a parabola that matches it as closely
//! > as possible. The generic formula for a parabola is `A + B x + C x²`. So
//! > this gives us three degrees of freedom. Obvious choices are that we want
//! > `sin(0) = 0`, `sin(pi/2) = 1` and `sin(pi) = 0`. This gives us the following
//! > three equations:
//! >
//! > ```text
//! > A + B 0 + C 0² = 0
//! > A + B pi/2 + C (pi/2)² = 1
//! > A + B pi + C pi² = 0
//! > ```
//! >
//! > Which has as solution `A = 0, B = 4/pi, C = -4/pi²`. So our parabola
//! > approximation becomes `4/pi x - 4/pi² x²`. Plotting this we get:
//! > parabola.gif. This looks worse than the 4-term Taylor series, right?
//! > Wrong! The maximum absolute error is 0.056. Furthermore, this
//! > approximation will give us smooth wave motion, and can be calculated
//! > in only 3 multiplications and 1 addition!
//! >
//! > Unfortunately it's not very practical yet. This is what we get in the
//! > `[-pi, pi]` range: negative.gif. It's quite obvious we want at least a
//! > full period. But it's also clear that it's just another parabola,
//! > mirrored around the origin. The formula for it is `4/pi x + 4/pi² x²`.
//! > So the straightforward (pseudo-C) solution is:
//! >
//! > ```text
//! > if(x > 0)
//! > {
//! >     y = 4/pi x - 4/pi² x²;
//! > }
//! > else
//! > {
//! >     y = 4/pi x + 4/pi² x²;
//! > }
//! > ```
//! >
//! > Adding a branch is not a good idea though. It makes the code
//! > significantly slower. But look at how similar the two parts really are.
//! > A subtraction becomes an addition depending on the sign of x. In a naive
//! > first attempt to eliminate the branch we can 'extract' the sign of x
//! > using `x / abs(x)`. The division is very expensive but look at the resulting
//! > formula: `4/pi x - x / abs(x) 4/pi² x²`. By inverting the division we can
//! > simplify this to a very nice and clean `4/pi x - 4/pi² x abs(x)`. So for
//! > just one extra operation we get both halves of our sine approximation!
//! > Here's the graph of this formula confirming the result: abs.gif.
//! >
//! > Now let's look at cosine. Basic trigonometry tells us that
//! > `cos(x) = sin(pi/2 + x)`. Is that all, add pi/2 to x? No, we actually get
//! > the unwanted part of the parabola again: shift_sine.gif. What we need to
//! > do is 'wrap around' when x > pi/2. This can be done by subtracting 2 pi.
//! > So the code becomes:
//! >
//! > ```text
//! > x += pi/2;
//! >
//! > if(x > pi) // Original x > pi/2
//! > {
//! >     x -= 2 * pi; // Wrap: cos(x) = cos(x - 2 pi)
//! > }
//! >
//! > y = sin(x);
//! > ```
//! >
//! > Yet another branch. To eliminate it, we can use binary logic, like this:
//! >
//! > ```text
//! > x -= (x > pi) & (2 * pi);
//! > ```
//! >
//! > Note that this isn't valid C code at all. But it should clarify how this
//! > can work. When x > pi is false the & operation zeroes the right hand part
//! > so the subtraction does nothing, which is perfectly equivalent. I'll
//! > leave it as an exercise to the reader to create working C code for this
//! > (or just read on). Clearly the cosine requires a few more operations than
//! > the sine but there doesn't seem to be any other way and it's still
//! > extremely fast.
//! >
//! > Now, a maximum error of 0.056 is nice but clearly the 4-term Taylor series
//! > still has a smaller error on average. Recall what our sine looked like:
//! > abs.gif. So isn't there anything we can do to further increase precision
//! > at minimal cost? Of course the current version is already applicable for
//! > many situations where something that looks like a sine is just as good as
//! > the real sine. But for other situations that's just not good enough.
//! >
//! > Looking at the graphs you'll notice that our approximation always
//! > overestimates the real sine, except at 0, pi/2 and pi. So what we need is
//! > to 'scale it down' without touching these important points. The solution
//! > is to use the squared parabola, which looks like this: squared.gif. Note
//! > how it preserves those important points but it's always lower than the
//! > real sine. So we can use a weighted average of the two to get a better
//! > approximation:
//! >
//! > ```text
//! > Q (4/pi x - 4/pi² x²) + P (4/pi x - 4/pi² x²)²
//! > ```
//! >
//! > With `Q + P = 1`. You can use exact minimization of either the absolute
//! > or relative error but I'll save you the math. The optimal weights are
//! > `Q = 0.775`, `P = 0.225` for the absolute error and `Q = 0.782`,
//! > `P = 0.218` for the relative error. I will use the former. The resulting
//! > graph is: average.gif. Where did the red line go? It's almost entirely
//! > covered by the green line, which instantly shows how good this
//! > approximation really is. The maximum error is about 0.001, a 50x
//! > improvement! The formula looks lengthy but the part between parentheses
//! > is the same value from the parabola, which needs to be computed only
//! > once. In fact only 2 extra multiplications and 2 additions are required
//! > to achieve this precision boost.
//! >
//! > It shouldn't come as a big surprise that to make it work for negative x
//! > as well we need a second abs() operation. The final C code for the sine
//! > becomes:
//! >
//! > ```text
//! > float sin(float x)
//! > {
//! >     const float B = 4/pi;
//! >     const float C = -4/(pi*pi);
//! >
//! >     float y = B * x + C * x * abs(x);
//! >
//! >     #ifdef EXTRA_PRECISION
//! >     // const float Q = 0.775;
//! >     const float P = 0.225;
//! >
//! >     y = P * (y * abs(y) - y) + y; // Q * y + P * y * abs(y)
//! >     #endif
//! > }
//! > ```
//! >
//! > So we need merely 5 multiplications and 3 additions; still faster than
//! > the 4-term Taylor if we neglect the abs(), and much more precise! The
//! > cosine version just needs the extra shift and wrap operations on x.
//! >
//! > Last but not least, I wouldn't be Nick if I didn't include the SIMD
//! > optimized assembly version. It allows us to perform the wrap operation very
//! > efficiently so I'll give you the cosine:
//! >
//! > ```asm
//! > // cos(x) = sin(x + pi/2)
//! > addps xmm0, PI_2
//! > movaps xmm1, xmm0
//! > cmpnltps xmm1, PI
//! > andps xmm1, PIx2
//! > subps xmm0, xmm1
//! >
//! > // Parabola
//! > movaps xmm1, xmm0
//! > andps xmm1, abs
//! > mulps xmm1, xmm0
//! > mulps xmm0, B
//! > mulps xmm1, C
//! > addps xmm0, xmm1
//! >
//! > // Extra precision
//! > movaps xmm1, xmm0
//! > andps xmm1, abs
//! > mulps xmm1, xmm0
//! > subps xmm1, xmm0
//! > mulps xmm1, P
//! > addps xmm0, xmm1
//! > ```
//! >
//! > This code computes four cosines in parallel, resulting in a peak
//! > performance of about 9 clock cycles per cosine for most CPU architectures.
//! > Sines take only 6 clock cycles ideally. The lower precision version can
//! > even run at 3 clock cycles per sine... And let's not forget that all input
//! > between -pi and pi is valid and the formula is exact at -pi, -pi/2, 0,
//! > pi/2 and pi.
//! >
//! > So, the conclusion is don't ever again use a Taylor series to approximate
//! > a sine or cosine! To add a useful discussion to this article, I'd love to
//! > hear if anyone knows good approximations for other transcendental functions
//! > like the exponential, logarithm and power functions.
//!
//! [Taylor series]: https://en.wikipedia.org/wiki/Taylor_series
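//!
//! The post's final sine formula translates directly to Rust. Below is a
//! minimal sketch on plain `f32` (the `parabolic_sin` name and the test
//! tolerance are illustrative, not part of this crate's API):
//!
//! ```
//! use core::f32::consts::PI;
//!
//! // Parabolic sine approximation for x in [-PI, PI], including the
//! // P = 0.225 precision correction (maximum error around 0.001).
//! fn parabolic_sin(x: f32) -> f32 {
//!     const B: f32 = 4.0 / PI;
//!     const C: f32 = -4.0 / (PI * PI);
//!     const P: f32 = 0.225;
//!
//!     // Base parabola: exact at -PI, 0 and PI, reaching +/-1 at +/-PI/2.
//!     let y = B * x + C * x * x.abs();
//!
//!     // Weighted blend with the squared parabola: Q * y + P * y * |y|.
//!     P * (y * y.abs() - y) + y
//! }
//!
//! // Compare against the standard library over one period.
//! for i in 0..=100 {
//!     let x = -PI + 2.0 * PI * (i as f32) / 100.0;
//!     assert!((parabolic_sin(x) - x.sin()).abs() < 2e-3);
//! }
//! ```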
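//!
//! The cosine shift-and-wrap step can be sketched the same way (again on plain
//! `f32`, with an illustrative `parabolic_cos` helper; the branch could be
//! replaced by a mask or select as the post suggests):
//!
//! ```
//! use core::f32::consts::PI;
//!
//! // Cosine via cos(x) = sin(x + PI/2) with the wrap-around described in the
//! // post, valid for x in [-PI, PI].
//! fn parabolic_cos(x: f32) -> f32 {
//!     const B: f32 = 4.0 / PI;
//!     const C: f32 = -4.0 / (PI * PI);
//!     const P: f32 = 0.225;
//!
//!     // Shift by a quarter period, then wrap back into [-PI, PI].
//!     let mut x = x + PI / 2.0;
//!     if x > PI {
//!         x -= 2.0 * PI; // cos(x) = cos(x - 2 * PI)
//!     }
//!
//!     // Same parabola and correction as for the sine.
//!     let y = B * x + C * x * x.abs();
//!     P * (y * y.abs() - y) + y
//! }
//!
//! for i in 0..=100 {
//!     let x = -PI + 2.0 * PI * (i as f32) / 100.0;
//!     assert!((parabolic_cos(x) - x.cos()).abs() < 2e-3);
//! }
//! ```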
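//!
//! The implementation in this module uses the same parabola-plus-correction
//! idea, but rescales the argument to turns (one turn = `2 * pi` radians) and
//! wraps with `floor`, following the method linked at the top of this module,
//! so that any input angle is handled. A standalone `f32` sketch of that
//! formulation (the `cos_turns` name is illustrative):
//!
//! ```
//! use core::f32::consts::PI;
//!
//! fn cos_turns(x: f32) -> f32 {
//!     // Radians -> turns, so one full period maps onto 1.0.
//!     let mut x = x * (1.0 / (2.0 * PI));
//!     // Quarter-turn shift plus wrap into [-0.5, 0.5).
//!     x -= 0.25 + (x + 0.25).floor();
//!     // Parabola, exact at every multiple of a quarter turn (pi/2 radians).
//!     x *= 16.0 * (x.abs() - 0.5);
//!     // Precision correction with P = 0.225.
//!     x + 0.225 * x * (x.abs() - 1.0)
//! }
//!
//! // Any angle is wrapped, not just [-PI, PI].
//! for i in 0..=200 {
//!     let x = -10.0 + 20.0 * (i as f32) / 200.0;
//!     assert!((cos_turns(x) - x.cos()).abs() < 2e-3);
//! }
//! ```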

use super::F32;
use core::f32::consts::FRAC_1_PI;

impl F32 {
    /// Approximates `cos(x)` in radians with a maximum error of `0.002`.
    pub fn cos(self) -> Self {
        let mut x: F32 = self;
        // Scale the argument so one full period maps onto 1.0 ("turns")
        x *= FRAC_1_PI / 2.0;
        // Shift by a quarter turn and wrap into the range [-0.5, 0.5)
        x -= 0.25 + (x + 0.25).floor().0;
        // Parabolic approximation, exact at every multiple of pi/2
        x *= 16.0 * (x.abs() - 0.5);
        // Precision correction: Q * x + P * x * |x| with P = 0.225
        x += 0.225 * x * (x.abs() - 1.0);
        x
    }
}

#[cfg(test)]
pub(crate) mod tests {
    use super::F32;

    /// Maximum error in radians
    pub(crate) const MAX_ERROR: f32 = 0.002;

    /// Cosine test vectors - `(input, output)`
    const TEST_VECTORS: &[(f32, f32)] = &[
        (0.000, 1.000),
        (0.140, 0.990),
        (0.279, 0.961),
        (0.419, 0.914),
        (0.559, 0.848),
        (0.698, 0.766),
        (0.838, 0.669),
        (0.977, 0.559),
        (1.117, 0.438),
        (1.257, 0.309),
        (1.396, 0.174),
        (1.536, 0.035),
        (1.676, -0.105),
        (1.815, -0.242),
        (1.955, -0.375),
        (2.094, -0.500),
        (2.234, -0.616),
        (2.374, -0.719),
        (2.513, -0.809),
        (2.653, -0.883),
        (2.793, -0.940),
        (2.932, -0.978),
        (3.072, -0.998),
        (3.211, -0.998),
        (3.351, -0.978),
        (3.491, -0.940),
        (3.630, -0.883),
        (3.770, -0.809),
        (3.910, -0.719),
        (4.049, -0.616),
        (4.189, -0.500),
        (4.328, -0.375),
        (4.468, -0.242),
        (4.608, -0.105),
        (4.747, 0.035),
        (4.887, 0.174),
        (5.027, 0.309),
        (5.166, 0.438),
        (5.306, 0.559),
        (5.445, 0.669),
        (5.585, 0.766),
        (5.725, 0.848),
        (5.864, 0.914),
        (6.004, 0.961),
        (6.144, 0.990),
        (6.283, 1.000),
    ];

    #[test]
    fn sanity_check() {
        for &(x, expected) in TEST_VECTORS {
            let cos_x = F32(x).cos();
            let delta = (cos_x - expected).abs();

            assert!(
                delta <= MAX_ERROR,
                "delta {} too large: {} vs {}",
                delta,
                cos_x,
                expected
            );
        }
    }
}