| 1 | /* This is JavaScriptCore's variant of the PCRE library. While this library | 
| 2 | started out as a copy of PCRE, many of the features of PCRE have been | 
| 3 | removed. This library now supports only the regular expression features | 
| 4 | required by the JavaScript language specification, and has only the functions | 
| 5 | needed by JavaScriptCore and the rest of WebKit. | 
| 6 |  | 
| 7 |                  Originally written by Philip Hazel | 
| 8 |            Copyright (c) 1997-2006 University of Cambridge | 
| 9 |     Copyright (C) 2002, 2004, 2006, 2007 Apple Inc. All rights reserved. | 
| 10 |  | 
| 11 | ----------------------------------------------------------------------------- | 
| 12 | Redistribution and use in source and binary forms, with or without | 
| 13 | modification, are permitted provided that the following conditions are met: | 
| 14 |  | 
| 15 |     * Redistributions of source code must retain the above copyright notice, | 
| 16 |       this list of conditions and the following disclaimer. | 
| 17 |  | 
| 18 |     * Redistributions in binary form must reproduce the above copyright | 
| 19 |       notice, this list of conditions and the following disclaimer in the | 
| 20 |       documentation and/or other materials provided with the distribution. | 
| 21 |  | 
| 22 |     * Neither the name of the University of Cambridge nor the names of its | 
| 23 |       contributors may be used to endorse or promote products derived from | 
| 24 |       this software without specific prior written permission. | 
| 25 |  | 
| 26 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | 
| 27 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | 
| 28 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | 
| 29 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | 
| 30 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | 
| 31 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | 
| 32 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | 
| 33 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | 
| 34 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | 
| 35 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | 
| 36 | POSSIBILITY OF SUCH DAMAGE. | 
| 37 | ----------------------------------------------------------------------------- | 
| 38 | */ | 
| 39 |  | 
| 40 | /************************************************* | 
| 41 | *           Unicode Property Table handler       * | 
| 42 | *************************************************/ | 
| 43 |  | 
| 44 | /* Internal header file defining the layout of the bits in each pair of 32-bit | 
| 45 | words that form a data item in the table. */ | 
| 46 |  | 
/* One entry of the Unicode property table: a pair of unsigned words whose bit
layout is described by the f0_ and f1_ macros in this header. */

struct cnode {
  unsigned int f0;  /* script number, range flag and code point */
  unsigned int f1;  /* character type and range / other-case offset */
};

typedef struct cnode cnode;
| 51 |  | 
/* Things for the f0 field.

The range flag is the single 0x00800000 bit. It must not overlap f0_charmask:
a wider flag mask would also cover the 0x00100000 code point bit, so a
single-character entry for a code point at or above 0x100000 would be taken
for a range. The 0x00600000 bits between the flag and the code point are
spare. */

#define f0_scriptmask   0xff000000  /* Mask for script field (top byte) */
#define f0_scriptshift          24  /* Shift for script value */
#define f0_rangeflag    0x00800000  /* Flag for a range item */
#define f0_charmask     0x001fffff  /* Mask for code point value */
| 58 |  | 
/* Things for the f1 field: the character type sits in the top six bits, and
the low sixteen bits hold either a range offset or an other-case offset. */

/* Character type field: mask, and the shift that brings it down to bit 0. */
#define f1_typemask     0xfc000000
#define f1_typeshift            26

/* Unsigned offset to the top of the range (range entries). */
#define f1_rangemask    0x0000ffff

/* Signed offset to the "other case" partner (single-character entries). */
#define f1_casemask     0x0000ffff

/* Bits for negation: bit 15 of the case offset and every bit above it.
NOTE(review): presumably ORed in to sign-extend a negative case offset;
confirm against the lookup code. */
#define f1_caseneg      0xffff8000
| 66 |  | 
| 67 | /* The data consists of a vector of structures of type cnode. The two unsigned | 
| 68 | 32-bit integers are used as follows: | 
| 69 |  | 
| 70 | (f0) (1) The most significant byte holds the script number. The numbers are | 
| 71 |          defined by the enum in ucp.h. | 
| 72 |  | 
| 73 |      (2) The 0x00800000 bit is set if this entry defines a range of characters. | 
| 74 |          It is not set if this entry defines a single character. | 
| 75 |  | 
| 76 |      (3) The 0x00600000 bits are spare. | 
| 77 |  | 
| 78 |      (4) The 0x001fffff bits contain the code point. No Unicode code point will | 
| 79 |          ever be greater than 0x0010ffff, so this should be OK for ever. | 
| 80 |  | 
| 81 | (f1) (1) The 0xfc000000 bits contain the character type number. The numbers are | 
| 82 |          defined by an enum in ucp.h. | 
| 83 |  | 
| 84 |      (2) The 0x03ff0000 bits are spare. | 
| 85 |  | 
| 86 |      (3) The 0x0000ffff bits contain EITHER the unsigned offset to the top of | 
| 87 |          range if this entry defines a range, OR the *signed* offset to the | 
| 88 |          character's "other case" partner if this entry defines a single | 
| 89 |          character. There is no partner if the value is zero. | 
| 90 |  | 
| 91 | ------------------------------------------------------------------------------- | 
| 92 | | script (8) |.|.|.| codepoint (21) || type (6) |.|.| spare (8) | offset (16) | | 
| 93 | ------------------------------------------------------------------------------- | 
| 94 |               | | |                              | | | 
| 95 |               | | |-> spare                      | |-> spare | 
| 96 |               | |                                | | 
| 97 |               | |-> spare                        |-> spare | 
| 98 |               | | 
| 99 |               |-> range flag | 
| 100 |  | 
| 101 | The upper/lower casing information is set only for characters that come in | 
| 102 | pairs. The non-one-to-one mappings in the Unicode data are ignored. | 
| 103 |  | 
| 104 | When searching the data, proceed as follows: | 
| 105 |  | 
| 106 | (1) Set up for a binary chop search. | 
| 107 |  | 
| 108 | (2) If the top is not greater than the bottom, the character is not in the | 
| 109 |     table. Its type must therefore be "Cn" ("Undefined"). | 
| 110 |  | 
| 111 | (3) Find the middle vector element. | 
| 112 |  | 
| 113 | (4) Extract the code point and compare. If equal, we are done. | 
| 114 |  | 
| 115 | (5) If the test character is smaller, set the top to the current point, and | 
| 116 |     goto (2). | 
| 117 |  | 
| 118 | (6) If the current entry defines a range, compute the last character by adding | 
| 119 |     the offset, and see if the test character is within the range. If it is, | 
| 120 |     we are done. | 
| 121 |  | 
| 122 | (7) Otherwise, set the bottom to one element past the current point and goto | 
| 123 |     (2). | 
| 124 | */ | 
| 125 |  | 
| 126 | /* End of ucpinternal.h */ | 
| 127 |  |