1 | |
2 | /* |
3 | =============================================================================== |
4 | |
5 | This C source fragment is part of the SoftFloat IEC/IEEE Floating-point |
6 | Arithmetic Package, Release 2. |
7 | |
8 | Written by John R. Hauser. This work was made possible in part by the |
9 | International Computer Science Institute, located at Suite 600, 1947 Center |
10 | Street, Berkeley, California 94704. Funding was partially provided by the |
11 | National Science Foundation under grant MIP-9311980. The original version |
12 | of this code was written as part of a project to build a fixed-point vector |
13 | processor in collaboration with the University of California at Berkeley, |
14 | overseen by Profs. Nelson Morgan and John Wawrzynek. More information |
15 | is available through the web page |
16 | http://www.jhauser.us/arithmetic/SoftFloat-2b/SoftFloat-source.txt |
17 | |
18 | THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort |
19 | has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT |
20 | TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO |
21 | PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY |
22 | AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. |
23 | |
24 | Derivative works are acceptable, even for commercial purposes, so long as |
25 | (1) they include prominent notice that the work is derivative, and (2) they |
26 | include prominent notice akin to these three paragraphs for those parts of |
27 | this code that are retained. |
28 | |
29 | =============================================================================== |
30 | */ |
31 | |
32 | /* |
33 | ------------------------------------------------------------------------------- |
34 | Shifts `a' right by the number of bits given in `count'. If any nonzero |
35 | bits are shifted off, they are ``jammed'' into the least significant bit of |
36 | the result by setting the least significant bit to 1. The value of `count' |
37 | can be arbitrarily large; in particular, if `count' is greater than 32, the |
38 | result will be either 0 or 1, depending on whether `a' is zero or nonzero. |
39 | The result is stored in the location pointed to by `zPtr'. |
40 | ------------------------------------------------------------------------------- |
41 | */ |
42 | INLINE void shift32RightJamming( bits32 a, int16 count, bits32 *zPtr ) |
43 | { |
44 | bits32 z; |
45 | if ( count == 0 ) { |
46 | z = a; |
47 | } |
48 | else if ( count < 32 ) { |
49 | z = ( a>>count ) | ( ( a<<( ( - count ) & 31 ) ) != 0 ); |
50 | } |
51 | else { |
52 | z = ( a != 0 ); |
53 | } |
54 | *zPtr = z; |
55 | } |
56 | |
57 | /* |
58 | ------------------------------------------------------------------------------- |
59 | Shifts `a' right by the number of bits given in `count'. If any nonzero |
60 | bits are shifted off, they are ``jammed'' into the least significant bit of |
61 | the result by setting the least significant bit to 1. The value of `count' |
62 | can be arbitrarily large; in particular, if `count' is greater than 64, the |
63 | result will be either 0 or 1, depending on whether `a' is zero or nonzero. |
64 | The result is stored in the location pointed to by `zPtr'. |
65 | ------------------------------------------------------------------------------- |
66 | */ |
67 | INLINE void shift64RightJamming( bits64 a, int16 count, bits64 *zPtr ) |
68 | { |
69 | bits64 z; |
70 | |
71 | __asm__("@shift64RightJamming -- start" ); |
72 | if ( count == 0 ) { |
73 | z = a; |
74 | } |
75 | else if ( count < 64 ) { |
76 | z = ( a>>count ) | ( ( a<<( ( - count ) & 63 ) ) != 0 ); |
77 | } |
78 | else { |
79 | z = ( a != 0 ); |
80 | } |
81 | __asm__("@shift64RightJamming -- end" ); |
82 | *zPtr = z; |
83 | } |
84 | |
85 | /* |
86 | ------------------------------------------------------------------------------- |
87 | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64 |
88 | _plus_ the number of bits given in `count'. The shifted result is at most |
89 | 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'. The |
90 | bits shifted off form a second 64-bit result as follows: The _last_ bit |
91 | shifted off is the most-significant bit of the extra result, and the other |
92 | 63 bits of the extra result are all zero if and only if _all_but_the_last_ |
93 | bits shifted off were all zero. This extra result is stored in the location |
94 | pointed to by `z1Ptr'. The value of `count' can be arbitrarily large. |
95 | (This routine makes more sense if `a0' and `a1' are considered to form a |
96 | fixed-point value with binary point between `a0' and `a1'. This fixed-point |
97 | value is shifted right by the number of bits given in `count', and the |
98 | integer part of the result is returned at the location pointed to by |
99 | `z0Ptr'. The fractional part of the result may be slightly corrupted as |
100 | described above, and is returned at the location pointed to by `z1Ptr'.) |
101 | ------------------------------------------------------------------------------- |
102 | */ |
103 | INLINE void |
104 | ( |
105 | bits64 a0, bits64 a1, int16 count, bits64 *z0Ptr, bits64 *z1Ptr ) |
106 | { |
107 | bits64 z0, z1; |
108 | int8 negCount = ( - count ) & 63; |
109 | |
110 | if ( count == 0 ) { |
111 | z1 = a1; |
112 | z0 = a0; |
113 | } |
114 | else if ( count < 64 ) { |
115 | z1 = ( a0<<negCount ) | ( a1 != 0 ); |
116 | z0 = a0>>count; |
117 | } |
118 | else { |
119 | if ( count == 64 ) { |
120 | z1 = a0 | ( a1 != 0 ); |
121 | } |
122 | else { |
123 | z1 = ( ( a0 | a1 ) != 0 ); |
124 | } |
125 | z0 = 0; |
126 | } |
127 | *z1Ptr = z1; |
128 | *z0Ptr = z0; |
129 | |
130 | } |
131 | |
132 | /* |
133 | ------------------------------------------------------------------------------- |
134 | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the |
135 | number of bits given in `count'. Any bits shifted off are lost. The value |
136 | of `count' can be arbitrarily large; in particular, if `count' is greater |
137 | than 128, the result will be 0. The result is broken into two 64-bit pieces |
138 | which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'. |
139 | ------------------------------------------------------------------------------- |
140 | */ |
141 | INLINE void |
142 | shift128Right( |
143 | bits64 a0, bits64 a1, int16 count, bits64 *z0Ptr, bits64 *z1Ptr ) |
144 | { |
145 | bits64 z0, z1; |
146 | int8 negCount = ( - count ) & 63; |
147 | |
148 | if ( count == 0 ) { |
149 | z1 = a1; |
150 | z0 = a0; |
151 | } |
152 | else if ( count < 64 ) { |
153 | z1 = ( a0<<negCount ) | ( a1>>count ); |
154 | z0 = a0>>count; |
155 | } |
156 | else { |
157 | z1 = ( count < 64 ) ? ( a0>>( count & 63 ) ) : 0; |
158 | z0 = 0; |
159 | } |
160 | *z1Ptr = z1; |
161 | *z0Ptr = z0; |
162 | |
163 | } |
164 | |
165 | /* |
166 | ------------------------------------------------------------------------------- |
167 | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the |
168 | number of bits given in `count'. If any nonzero bits are shifted off, they |
169 | are ``jammed'' into the least significant bit of the result by setting the |
170 | least significant bit to 1. The value of `count' can be arbitrarily large; |
171 | in particular, if `count' is greater than 128, the result will be either 0 |
172 | or 1, depending on whether the concatenation of `a0' and `a1' is zero or |
173 | nonzero. The result is broken into two 64-bit pieces which are stored at |
174 | the locations pointed to by `z0Ptr' and `z1Ptr'. |
175 | ------------------------------------------------------------------------------- |
176 | */ |
177 | INLINE void |
178 | shift128RightJamming( |
179 | bits64 a0, bits64 a1, int16 count, bits64 *z0Ptr, bits64 *z1Ptr ) |
180 | { |
181 | bits64 z0, z1; |
182 | int8 negCount = ( - count ) & 63; |
183 | |
184 | if ( count == 0 ) { |
185 | z1 = a1; |
186 | z0 = a0; |
187 | } |
188 | else if ( count < 64 ) { |
189 | z1 = ( a0<<negCount ) | ( a1>>count ) | ( ( a1<<negCount ) != 0 ); |
190 | z0 = a0>>count; |
191 | } |
192 | else { |
193 | if ( count == 64 ) { |
194 | z1 = a0 | ( a1 != 0 ); |
195 | } |
196 | else if ( count < 128 ) { |
197 | z1 = ( a0>>( count & 63 ) ) | ( ( ( a0<<negCount ) | a1 ) != 0 ); |
198 | } |
199 | else { |
200 | z1 = ( ( a0 | a1 ) != 0 ); |
201 | } |
202 | z0 = 0; |
203 | } |
204 | *z1Ptr = z1; |
205 | *z0Ptr = z0; |
206 | |
207 | } |
208 | |
209 | /* |
210 | ------------------------------------------------------------------------------- |
211 | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right |
212 | by 64 _plus_ the number of bits given in `count'. The shifted result is |
213 | at most 128 nonzero bits; these are broken into two 64-bit pieces which are |
214 | stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted |
215 | off form a third 64-bit result as follows: The _last_ bit shifted off is |
216 | the most-significant bit of the extra result, and the other 63 bits of the |
217 | extra result are all zero if and only if _all_but_the_last_ bits shifted off |
218 | were all zero. This extra result is stored in the location pointed to by |
219 | `z2Ptr'. The value of `count' can be arbitrarily large. |
220 | (This routine makes more sense if `a0', `a1', and `a2' are considered |
221 | to form a fixed-point value with binary point between `a1' and `a2'. This |
222 | fixed-point value is shifted right by the number of bits given in `count', |
223 | and the integer part of the result is returned at the locations pointed to |
224 | by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly |
225 | corrupted as described above, and is returned at the location pointed to by |
226 | `z2Ptr'.) |
227 | ------------------------------------------------------------------------------- |
228 | */ |
229 | INLINE void |
230 | ( |
231 | bits64 a0, |
232 | bits64 a1, |
233 | bits64 a2, |
234 | int16 count, |
235 | bits64 *z0Ptr, |
236 | bits64 *z1Ptr, |
237 | bits64 *z2Ptr |
238 | ) |
239 | { |
240 | bits64 z0, z1, z2; |
241 | int8 negCount = ( - count ) & 63; |
242 | |
243 | if ( count == 0 ) { |
244 | z2 = a2; |
245 | z1 = a1; |
246 | z0 = a0; |
247 | } |
248 | else { |
249 | if ( count < 64 ) { |
250 | z2 = a1<<negCount; |
251 | z1 = ( a0<<negCount ) | ( a1>>count ); |
252 | z0 = a0>>count; |
253 | } |
254 | else { |
255 | if ( count == 64 ) { |
256 | z2 = a1; |
257 | z1 = a0; |
258 | } |
259 | else { |
260 | a2 |= a1; |
261 | if ( count < 128 ) { |
262 | z2 = a0<<negCount; |
263 | z1 = a0>>( count & 63 ); |
264 | } |
265 | else { |
266 | z2 = ( count == 128 ) ? a0 : ( a0 != 0 ); |
267 | z1 = 0; |
268 | } |
269 | } |
270 | z0 = 0; |
271 | } |
272 | z2 |= ( a2 != 0 ); |
273 | } |
274 | *z2Ptr = z2; |
275 | *z1Ptr = z1; |
276 | *z0Ptr = z0; |
277 | |
278 | } |
279 | |
280 | /* |
281 | ------------------------------------------------------------------------------- |
282 | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the |
283 | number of bits given in `count'. Any bits shifted off are lost. The value |
284 | of `count' must be less than 64. The result is broken into two 64-bit |
285 | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'. |
286 | ------------------------------------------------------------------------------- |
287 | */ |
288 | INLINE void |
289 | shortShift128Left( |
290 | bits64 a0, bits64 a1, int16 count, bits64 *z0Ptr, bits64 *z1Ptr ) |
291 | { |
292 | |
293 | *z1Ptr = a1<<count; |
294 | *z0Ptr = |
295 | ( count == 0 ) ? a0 : ( a0<<count ) | ( a1>>( ( - count ) & 63 ) ); |
296 | |
297 | } |
298 | |
299 | /* |
300 | ------------------------------------------------------------------------------- |
301 | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left |
302 | by the number of bits given in `count'. Any bits shifted off are lost. |
303 | The value of `count' must be less than 64. The result is broken into three |
304 | 64-bit pieces which are stored at the locations pointed to by `z0Ptr', |
305 | `z1Ptr', and `z2Ptr'. |
306 | ------------------------------------------------------------------------------- |
307 | */ |
308 | INLINE void |
309 | shortShift192Left( |
310 | bits64 a0, |
311 | bits64 a1, |
312 | bits64 a2, |
313 | int16 count, |
314 | bits64 *z0Ptr, |
315 | bits64 *z1Ptr, |
316 | bits64 *z2Ptr |
317 | ) |
318 | { |
319 | bits64 z0, z1, z2; |
320 | int8 negCount; |
321 | |
322 | z2 = a2<<count; |
323 | z1 = a1<<count; |
324 | z0 = a0<<count; |
325 | if ( 0 < count ) { |
326 | negCount = ( ( - count ) & 63 ); |
327 | z1 |= a2>>negCount; |
328 | z0 |= a1>>negCount; |
329 | } |
330 | *z2Ptr = z2; |
331 | *z1Ptr = z1; |
332 | *z0Ptr = z0; |
333 | |
334 | } |
335 | |
336 | /* |
337 | ------------------------------------------------------------------------------- |
338 | Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit |
339 | value formed by concatenating `b0' and `b1'. Addition is modulo 2^128, so |
340 | any carry out is lost. The result is broken into two 64-bit pieces which |
341 | are stored at the locations pointed to by `z0Ptr' and `z1Ptr'. |
342 | ------------------------------------------------------------------------------- |
343 | */ |
344 | INLINE void |
345 | add128( |
346 | bits64 a0, bits64 a1, bits64 b0, bits64 b1, bits64 *z0Ptr, bits64 *z1Ptr ) |
347 | { |
348 | bits64 z1; |
349 | |
350 | z1 = a1 + b1; |
351 | *z1Ptr = z1; |
352 | *z0Ptr = a0 + b0 + ( z1 < a1 ); |
353 | |
354 | } |
355 | |
356 | /* |
357 | ------------------------------------------------------------------------------- |
358 | Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the |
359 | 192-bit value formed by concatenating `b0', `b1', and `b2'. Addition is |
360 | modulo 2^192, so any carry out is lost. The result is broken into three |
361 | 64-bit pieces which are stored at the locations pointed to by `z0Ptr', |
362 | `z1Ptr', and `z2Ptr'. |
363 | ------------------------------------------------------------------------------- |
364 | */ |
365 | INLINE void |
366 | add192( |
367 | bits64 a0, |
368 | bits64 a1, |
369 | bits64 a2, |
370 | bits64 b0, |
371 | bits64 b1, |
372 | bits64 b2, |
373 | bits64 *z0Ptr, |
374 | bits64 *z1Ptr, |
375 | bits64 *z2Ptr |
376 | ) |
377 | { |
378 | bits64 z0, z1, z2; |
379 | int8 carry0, carry1; |
380 | |
381 | z2 = a2 + b2; |
382 | carry1 = ( z2 < a2 ); |
383 | z1 = a1 + b1; |
384 | carry0 = ( z1 < a1 ); |
385 | z0 = a0 + b0; |
386 | z1 += carry1; |
387 | z0 += ( z1 < carry1 ); |
388 | z0 += carry0; |
389 | *z2Ptr = z2; |
390 | *z1Ptr = z1; |
391 | *z0Ptr = z0; |
392 | |
393 | } |
394 | |
395 | /* |
396 | ------------------------------------------------------------------------------- |
397 | Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the |
398 | 128-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo |
399 | 2^128, so any borrow out (carry out) is lost. The result is broken into two |
400 | 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and |
401 | `z1Ptr'. |
402 | ------------------------------------------------------------------------------- |
403 | */ |
404 | INLINE void |
405 | sub128( |
406 | bits64 a0, bits64 a1, bits64 b0, bits64 b1, bits64 *z0Ptr, bits64 *z1Ptr ) |
407 | { |
408 | |
409 | *z1Ptr = a1 - b1; |
410 | *z0Ptr = a0 - b0 - ( a1 < b1 ); |
411 | |
412 | } |
413 | |
414 | /* |
415 | ------------------------------------------------------------------------------- |
416 | Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2' |
417 | from the 192-bit value formed by concatenating `a0', `a1', and `a2'. |
418 | Subtraction is modulo 2^192, so any borrow out (carry out) is lost. The |
419 | result is broken into three 64-bit pieces which are stored at the locations |
420 | pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'. |
421 | ------------------------------------------------------------------------------- |
422 | */ |
423 | INLINE void |
424 | sub192( |
425 | bits64 a0, |
426 | bits64 a1, |
427 | bits64 a2, |
428 | bits64 b0, |
429 | bits64 b1, |
430 | bits64 b2, |
431 | bits64 *z0Ptr, |
432 | bits64 *z1Ptr, |
433 | bits64 *z2Ptr |
434 | ) |
435 | { |
436 | bits64 z0, z1, z2; |
437 | int8 borrow0, borrow1; |
438 | |
439 | z2 = a2 - b2; |
440 | borrow1 = ( a2 < b2 ); |
441 | z1 = a1 - b1; |
442 | borrow0 = ( a1 < b1 ); |
443 | z0 = a0 - b0; |
444 | z0 -= ( z1 < borrow1 ); |
445 | z1 -= borrow1; |
446 | z0 -= borrow0; |
447 | *z2Ptr = z2; |
448 | *z1Ptr = z1; |
449 | *z0Ptr = z0; |
450 | |
451 | } |
452 | |
453 | /* |
454 | ------------------------------------------------------------------------------- |
455 | Multiplies `a' by `b' to obtain a 128-bit product. The product is broken |
456 | into two 64-bit pieces which are stored at the locations pointed to by |
457 | `z0Ptr' and `z1Ptr'. |
458 | ------------------------------------------------------------------------------- |
459 | */ |
460 | INLINE void mul64To128( bits64 a, bits64 b, bits64 *z0Ptr, bits64 *z1Ptr ) |
461 | { |
462 | bits32 aHigh, aLow, bHigh, bLow; |
463 | bits64 z0, zMiddleA, zMiddleB, z1; |
464 | |
465 | aLow = a; |
466 | aHigh = a>>32; |
467 | bLow = b; |
468 | bHigh = b>>32; |
469 | z1 = ( (bits64) aLow ) * bLow; |
470 | zMiddleA = ( (bits64) aLow ) * bHigh; |
471 | zMiddleB = ( (bits64) aHigh ) * bLow; |
472 | z0 = ( (bits64) aHigh ) * bHigh; |
473 | zMiddleA += zMiddleB; |
474 | z0 += ( ( (bits64) ( zMiddleA < zMiddleB ) )<<32 ) + ( zMiddleA>>32 ); |
475 | zMiddleA <<= 32; |
476 | z1 += zMiddleA; |
477 | z0 += ( z1 < zMiddleA ); |
478 | *z1Ptr = z1; |
479 | *z0Ptr = z0; |
480 | |
481 | } |
482 | |
483 | /* |
484 | ------------------------------------------------------------------------------- |
485 | Multiplies the 128-bit value formed by concatenating `a0' and `a1' by `b' to |
486 | obtain a 192-bit product. The product is broken into three 64-bit pieces |
487 | which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and |
488 | `z2Ptr'. |
489 | ------------------------------------------------------------------------------- |
490 | */ |
491 | INLINE void |
492 | mul128By64To192( |
493 | bits64 a0, |
494 | bits64 a1, |
495 | bits64 b, |
496 | bits64 *z0Ptr, |
497 | bits64 *z1Ptr, |
498 | bits64 *z2Ptr |
499 | ) |
500 | { |
501 | bits64 z0, z1, z2, more1; |
502 | |
503 | mul64To128( a: a1, b, z0Ptr: &z1, z1Ptr: &z2 ); |
504 | mul64To128( a: a0, b, z0Ptr: &z0, z1Ptr: &more1 ); |
505 | add128( a0: z0, a1: more1, b0: 0, b1: z1, z0Ptr: &z0, z1Ptr: &z1 ); |
506 | *z2Ptr = z2; |
507 | *z1Ptr = z1; |
508 | *z0Ptr = z0; |
509 | |
510 | } |
511 | |
512 | /* |
513 | ------------------------------------------------------------------------------- |
514 | Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the |
515 | 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit |
516 | product. The product is broken into four 64-bit pieces which are stored at |
517 | the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'. |
518 | ------------------------------------------------------------------------------- |
519 | */ |
520 | INLINE void |
521 | mul128To256( |
522 | bits64 a0, |
523 | bits64 a1, |
524 | bits64 b0, |
525 | bits64 b1, |
526 | bits64 *z0Ptr, |
527 | bits64 *z1Ptr, |
528 | bits64 *z2Ptr, |
529 | bits64 *z3Ptr |
530 | ) |
531 | { |
532 | bits64 z0, z1, z2, z3; |
533 | bits64 more1, more2; |
534 | |
535 | mul64To128( a: a1, b: b1, z0Ptr: &z2, z1Ptr: &z3 ); |
536 | mul64To128( a: a1, b: b0, z0Ptr: &z1, z1Ptr: &more2 ); |
537 | add128( a0: z1, a1: more2, b0: 0, b1: z2, z0Ptr: &z1, z1Ptr: &z2 ); |
538 | mul64To128( a: a0, b: b0, z0Ptr: &z0, z1Ptr: &more1 ); |
539 | add128( a0: z0, a1: more1, b0: 0, b1: z1, z0Ptr: &z0, z1Ptr: &z1 ); |
540 | mul64To128( a: a0, b: b1, z0Ptr: &more1, z1Ptr: &more2 ); |
541 | add128( a0: more1, a1: more2, b0: 0, b1: z2, z0Ptr: &more1, z1Ptr: &z2 ); |
542 | add128( a0: z0, a1: z1, b0: 0, b1: more1, z0Ptr: &z0, z1Ptr: &z1 ); |
543 | *z3Ptr = z3; |
544 | *z2Ptr = z2; |
545 | *z1Ptr = z1; |
546 | *z0Ptr = z0; |
547 | |
548 | } |
549 | |
550 | /* |
551 | ------------------------------------------------------------------------------- |
552 | Returns an approximation to the 64-bit integer quotient obtained by dividing |
553 | `b' into the 128-bit value formed by concatenating `a0' and `a1'. The |
554 | divisor `b' must be at least 2^63. If q is the exact quotient truncated |
555 | toward zero, the approximation returned lies between q and q + 2 inclusive. |
556 | If the exact quotient q is larger than 64 bits, the maximum positive 64-bit |
557 | unsigned integer is returned. |
558 | ------------------------------------------------------------------------------- |
559 | */ |
560 | static bits64 estimateDiv128To64( bits64 a0, bits64 a1, bits64 b ) |
561 | { |
562 | bits64 b0, b1; |
563 | bits64 rem0, rem1, term0, term1; |
564 | bits64 z; |
565 | if ( b <= a0 ) return LIT64( 0xFFFFFFFFFFFFFFFF ); |
566 | b0 = b>>32; /* hence b0 is 32 bits wide now */ |
567 | if ( b0<<32 <= a0 ) { |
568 | z = LIT64( 0xFFFFFFFF00000000 ); |
569 | } else { |
570 | z = a0; |
571 | do_div( z, b0 ); |
572 | z <<= 32; |
573 | } |
574 | mul64To128( a: b, b: z, z0Ptr: &term0, z1Ptr: &term1 ); |
575 | sub128( a0, a1, b0: term0, b1: term1, z0Ptr: &rem0, z1Ptr: &rem1 ); |
576 | while ( ( (sbits64) rem0 ) < 0 ) { |
577 | z -= LIT64( 0x100000000 ); |
578 | b1 = b<<32; |
579 | add128( a0: rem0, a1: rem1, b0, b1, z0Ptr: &rem0, z1Ptr: &rem1 ); |
580 | } |
581 | rem0 = ( rem0<<32 ) | ( rem1>>32 ); |
582 | if ( b0<<32 <= rem0 ) { |
583 | z |= 0xFFFFFFFF; |
584 | } else { |
585 | do_div( rem0, b0 ); |
586 | z |= rem0; |
587 | } |
588 | return z; |
589 | |
590 | } |
591 | |
592 | /* |
593 | ------------------------------------------------------------------------------- |
594 | Returns an approximation to the square root of the 32-bit significand given |
595 | by `a'. Considered as an integer, `a' must be at least 2^31. If bit 0 of |
596 | `aExp' (the least significant bit) is 1, the integer returned approximates |
597 | 2^31*sqrt(`a'/2^31), where `a' is considered an integer. If bit 0 of `aExp' |
598 | is 0, the integer returned approximates 2^31*sqrt(`a'/2^30). In either |
599 | case, the approximation returned lies strictly within +/-2 of the exact |
600 | value. |
601 | ------------------------------------------------------------------------------- |
602 | */ |
603 | static bits32 estimateSqrt32( int16 aExp, bits32 a ) |
604 | { |
605 | static const bits16 sqrtOddAdjustments[] = { |
606 | 0x0004, 0x0022, 0x005D, 0x00B1, 0x011D, 0x019F, 0x0236, 0x02E0, |
607 | 0x039C, 0x0468, 0x0545, 0x0631, 0x072B, 0x0832, 0x0946, 0x0A67 |
608 | }; |
609 | static const bits16 sqrtEvenAdjustments[] = { |
610 | 0x0A2D, 0x08AF, 0x075A, 0x0629, 0x051A, 0x0429, 0x0356, 0x029E, |
611 | 0x0200, 0x0179, 0x0109, 0x00AF, 0x0068, 0x0034, 0x0012, 0x0002 |
612 | }; |
613 | int8 index; |
614 | bits32 z; |
615 | bits64 A; |
616 | |
617 | index = ( a>>27 ) & 15; |
618 | if ( aExp & 1 ) { |
619 | z = 0x4000 + ( a>>17 ) - sqrtOddAdjustments[ index ]; |
620 | z = ( ( a / z )<<14 ) + ( z<<15 ); |
621 | a >>= 1; |
622 | } |
623 | else { |
624 | z = 0x8000 + ( a>>17 ) - sqrtEvenAdjustments[ index ]; |
625 | z = a / z + z; |
626 | z = ( 0x20000 <= z ) ? 0xFFFF8000 : ( z<<15 ); |
627 | if ( z <= a ) return (bits32) ( ( (sbits32) a )>>1 ); |
628 | } |
629 | A = ( (bits64) a )<<31; |
630 | do_div( A, z ); |
631 | return ( (bits32) A ) + ( z>>1 ); |
632 | |
633 | } |
634 | |
635 | /* |
636 | ------------------------------------------------------------------------------- |
637 | Returns the number of leading 0 bits before the most-significant 1 bit |
638 | of `a'. If `a' is zero, 32 is returned. |
639 | ------------------------------------------------------------------------------- |
640 | */ |
641 | static int8 countLeadingZeros32( bits32 a ) |
642 | { |
643 | static const int8 countLeadingZerosHigh[] = { |
644 | 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, |
645 | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, |
646 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
647 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
648 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
649 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
650 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
651 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
652 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
653 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
654 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
655 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
656 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
657 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
658 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
659 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
660 | }; |
661 | int8 shiftCount; |
662 | |
663 | shiftCount = 0; |
664 | if ( a < 0x10000 ) { |
665 | shiftCount += 16; |
666 | a <<= 16; |
667 | } |
668 | if ( a < 0x1000000 ) { |
669 | shiftCount += 8; |
670 | a <<= 8; |
671 | } |
672 | shiftCount += countLeadingZerosHigh[ a>>24 ]; |
673 | return shiftCount; |
674 | |
675 | } |
676 | |
677 | /* |
678 | ------------------------------------------------------------------------------- |
679 | Returns the number of leading 0 bits before the most-significant 1 bit |
680 | of `a'. If `a' is zero, 64 is returned. |
681 | ------------------------------------------------------------------------------- |
682 | */ |
683 | static int8 countLeadingZeros64( bits64 a ) |
684 | { |
685 | int8 shiftCount; |
686 | |
687 | shiftCount = 0; |
688 | if ( a < ( (bits64) 1 )<<32 ) { |
689 | shiftCount += 32; |
690 | } |
691 | else { |
692 | a >>= 32; |
693 | } |
694 | shiftCount += countLeadingZeros32( a ); |
695 | return shiftCount; |
696 | |
697 | } |
698 | |
699 | /* |
700 | ------------------------------------------------------------------------------- |
701 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' |
702 | is equal to the 128-bit value formed by concatenating `b0' and `b1'. |
703 | Otherwise, returns 0. |
704 | ------------------------------------------------------------------------------- |
705 | */ |
706 | INLINE flag eq128( bits64 a0, bits64 a1, bits64 b0, bits64 b1 ) |
707 | { |
708 | |
709 | return ( a0 == b0 ) && ( a1 == b1 ); |
710 | |
711 | } |
712 | |
713 | /* |
714 | ------------------------------------------------------------------------------- |
715 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less |
716 | than or equal to the 128-bit value formed by concatenating `b0' and `b1'. |
717 | Otherwise, returns 0. |
718 | ------------------------------------------------------------------------------- |
719 | */ |
720 | INLINE flag le128( bits64 a0, bits64 a1, bits64 b0, bits64 b1 ) |
721 | { |
722 | |
723 | return ( a0 < b0 ) || ( ( a0 == b0 ) && ( a1 <= b1 ) ); |
724 | |
725 | } |
726 | |
727 | /* |
728 | ------------------------------------------------------------------------------- |
729 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less |
730 | than the 128-bit value formed by concatenating `b0' and `b1'. Otherwise, |
731 | returns 0. |
732 | ------------------------------------------------------------------------------- |
733 | */ |
734 | INLINE flag lt128( bits64 a0, bits64 a1, bits64 b0, bits64 b1 ) |
735 | { |
736 | |
737 | return ( a0 < b0 ) || ( ( a0 == b0 ) && ( a1 < b1 ) ); |
738 | |
739 | } |
740 | |
741 | /* |
742 | ------------------------------------------------------------------------------- |
743 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is |
744 | not equal to the 128-bit value formed by concatenating `b0' and `b1'. |
745 | Otherwise, returns 0. |
746 | ------------------------------------------------------------------------------- |
747 | */ |
748 | INLINE flag ne128( bits64 a0, bits64 a1, bits64 b0, bits64 b1 ) |
749 | { |
750 | |
751 | return ( a0 != b0 ) || ( a1 != b1 ); |
752 | |
753 | } |
754 | |
755 | |