| 1 | // Copyright (C) 2016 The Qt Company Ltd. |
| 2 | // SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only |
| 3 | #include "qisciicodec_p.h" |
| 4 | #include "qtextcodec_p.h" |
| 5 | #include "qlist.h" |
| 6 | |
| 7 | QT_BEGIN_NAMESPACE |
| 8 | |
| 9 | /*! |
| 10 | \class QIsciiCodec |
| 11 | \inmodule QtCore5Compat |
| 12 | \brief The QIsciiCodec class provides conversion to and from the ISCII encoding. |
| 13 | |
| 14 | \internal |
| 15 | */ |
| 16 | |
| 17 | |
| 18 | struct Codecs { |
| 19 | const char name[10]; |
| 20 | ushort base; |
| 21 | }; |
| 22 | |
| 23 | static const Codecs codecs [] = { |
| 24 | { .name: "iscii-dev" , .base: 0x900 }, |
| 25 | { .name: "iscii-bng" , .base: 0x980 }, |
| 26 | { .name: "iscii-pnj" , .base: 0xa00 }, |
| 27 | { .name: "iscii-gjr" , .base: 0xa80 }, |
| 28 | { .name: "iscii-ori" , .base: 0xb00 }, |
| 29 | { .name: "iscii-tml" , .base: 0xb80 }, |
| 30 | { .name: "iscii-tlg" , .base: 0xc00 }, |
| 31 | { .name: "iscii-knd" , .base: 0xc80 }, |
| 32 | { .name: "iscii-mlm" , .base: 0xd00 } |
| 33 | }; |
| 34 | |
| 35 | QTextCodec *QIsciiCodec::create(const char *name) |
| 36 | { |
| 37 | QIsciiCodec *codec = nullptr; |
| 38 | for (int i = 0; i < 9; ++i) { |
| 39 | if (qTextCodecNameMatch(a: name, b: codecs[i].name)) { |
| 40 | codec = new QIsciiCodec(i); |
| 41 | break; |
| 42 | } |
| 43 | } |
| 44 | return codec; |
| 45 | } |
| 46 | |
| 47 | QIsciiCodec::~QIsciiCodec() |
| 48 | { |
| 49 | } |
| 50 | |
| 51 | QByteArray QIsciiCodec::name() const |
| 52 | { |
| 53 | return codecs[idx].name; |
| 54 | } |
| 55 | |
| 56 | int QIsciiCodec::mibEnum() const |
| 57 | { |
| 58 | /* There is no MIBEnum for Iscii */ |
| 59 | return -3000-idx; |
| 60 | } |
| 61 | |
| 62 | static const uchar inv = 0xFF; |
| 63 | |
| 64 | /* iscii range from 0xa0 - 0xff */ |
| 65 | static const uchar iscii_to_uni_table[0x60] = { |
| 66 | 0x00, 0x01, 0x02, 0x03, |
| 67 | 0x05, 0x06, 0x07, 0x08, |
| 68 | 0x09, 0x0a, 0x0b, 0x0e, |
| 69 | 0x0f, 0x20, 0x0d, 0x12, |
| 70 | |
| 71 | 0x13, 0x14, 0x11, 0x15, |
| 72 | 0x16, 0x17, 0x18, 0x19, |
| 73 | 0x1a, 0x1b, 0x1c, 0x1d, |
| 74 | 0x1e, 0x1f, 0x20, 0x21, |
| 75 | |
| 76 | 0x22, 0x23, 0x24, 0x25, |
| 77 | 0x26, 0x27, 0x28, 0x29, |
| 78 | 0x2a, 0x2b, 0x2c, 0x2d, |
| 79 | 0x2e, 0x2f, 0x5f, 0x30, |
| 80 | |
| 81 | 0x31, 0x32, 0x33, 0x34, |
| 82 | 0x35, 0x36, 0x37, 0x38, |
| 83 | 0x39, inv, 0x3e, 0x3f, |
| 84 | 0x40, 0x41, 0x42, 0x43, |
| 85 | |
| 86 | 0x46, 0x47, 0x48, 0x45, |
| 87 | 0x4a, 0x4b, 0x4c, 0x49, |
| 88 | 0x4d, 0x3c, 0x64, 0x00, |
| 89 | 0x00, 0x00, 0x00, 0x00, |
| 90 | |
| 91 | 0x00, 0x66, 0x67, 0x68, |
| 92 | 0x69, 0x6a, 0x6b, 0x6c, |
| 93 | 0x6d, 0x6e, 0x6f, 0x00, |
| 94 | 0x00, 0x00, 0x00, 0x00 |
| 95 | }; |
| 96 | |
| 97 | static const uchar uni_to_iscii_table[0x80] = { |
| 98 | 0x00, 0xa1, 0xa2, 0xa3, |
| 99 | 0x00, 0xa4, 0xa5, 0xa6, |
| 100 | 0xa7, 0xa8, 0xa9, 0xaa, |
| 101 | 0x00, 0xae, 0xab, 0xac, |
| 102 | |
| 103 | 0xad, 0xb2, 0xaf, 0xb0, |
| 104 | 0xb1, 0xb3, 0xb4, 0xb5, |
| 105 | 0xb6, 0xb7, 0xb8, 0xb9, |
| 106 | 0xba, 0xbb, 0xbc, 0xbd, |
| 107 | |
| 108 | 0xbe, 0xbf, 0xc0, 0xc1, |
| 109 | 0xc2, 0xc3, 0xc4, 0xc5, |
| 110 | 0xc6, 0xc7, 0xc8, 0xc9, |
| 111 | 0xca, 0xcb, 0xcc, 0xcd, |
| 112 | |
| 113 | 0xcf, 0xd0, 0xd1, 0xd2, |
| 114 | 0xd3, 0xd4, 0xd5, 0xd6, |
| 115 | 0xd7, 0xd8, 0x00, 0x00, |
| 116 | 0xe9, 0x00, 0xda, 0xdb, |
| 117 | |
| 118 | 0xdc, 0xdd, 0xde, 0xdf, |
| 119 | 0x00, 0xe3, 0xe0, 0xe1, |
| 120 | 0xe2, 0xe7, 0xe4, 0xe5, |
| 121 | 0xe6, 0xe8, 0x00, 0x00, |
| 122 | |
| 123 | 0x00, 0x00, 0x00, 0x00, |
| 124 | 0x00, 0x00, 0x00, 0x00, |
| 125 | 0x01, 0x02, 0x03, 0x04, // decomposable into the uc codes listed here + nukta |
| 126 | 0x05, 0x06, 0x07, 0xce, |
| 127 | |
| 128 | 0x00, 0x00, 0x00, 0x00, |
| 129 | 0xea, 0x08, 0xf1, 0xf2, |
| 130 | 0xf3, 0xf4, 0xf5, 0xf6, |
| 131 | 0xf7, 0xf8, 0xf9, 0xfa, |
| 132 | |
| 133 | 0x00, 0x00, 0x00, 0x00, |
| 134 | 0x00, 0x00, 0x00, 0x00, |
| 135 | 0x00, 0x00, 0x00, 0x00, |
| 136 | 0x00, 0x00, 0x00, 0x00 |
| 137 | }; |
| 138 | |
| 139 | static const uchar uni_to_iscii_pairs[] = { |
| 140 | 0x00, 0x00, |
| 141 | 0x15, 0x3c, // 0x958 |
| 142 | 0x16, 0x3c, // 0x959 |
| 143 | 0x17, 0x3c, // 0x95a |
| 144 | 0x1c, 0x3c, // 0x95b |
| 145 | 0x21, 0x3c, // 0x95c |
| 146 | 0x22, 0x3c, // 0x95d |
| 147 | 0x2b, 0x3c, // 0x95e |
| 148 | 0x64, 0x64 // 0x965 |
| 149 | }; |
| 150 | |
| 151 | |
| 152 | QByteArray QIsciiCodec::convertFromUnicode(const QChar *uc, int len, ConverterState *state) const |
| 153 | { |
| 154 | char replacement = '?'; |
| 155 | bool halant = false; |
| 156 | if (state) { |
| 157 | if (state->flags & ConvertInvalidToNull) |
| 158 | replacement = 0; |
| 159 | halant = state->state_data[0]; |
| 160 | } |
| 161 | int invalid = 0; |
| 162 | |
| 163 | QByteArray result(2 * len, QT_PREPEND_NAMESPACE(Qt::Uninitialized)); // worst case |
| 164 | |
| 165 | uchar *ch = reinterpret_cast<uchar *>(result.data()); |
| 166 | |
| 167 | const int base = codecs[idx].base; |
| 168 | |
| 169 | for (int i =0; i < len; ++i) { |
| 170 | const ushort codePoint = uc[i].unicode(); |
| 171 | |
| 172 | /* The low 7 bits of ISCII is plain ASCII. However, we go all the |
| 173 | * way up to 0xA0 such that we can roundtrip with convertToUnicode()'s |
| 174 | * behavior. */ |
| 175 | if(codePoint < 0xA0) { |
| 176 | *ch++ = static_cast<uchar>(codePoint); |
| 177 | continue; |
| 178 | } |
| 179 | |
| 180 | const int pos = codePoint - base; |
| 181 | if (pos > 0 && pos < 0x80) { |
| 182 | uchar iscii = uni_to_iscii_table[pos]; |
| 183 | if (iscii > 0x80) { |
| 184 | *ch++ = iscii; |
| 185 | } else if (iscii) { |
| 186 | Q_ASSERT((2 * iscii) < (sizeof(uni_to_iscii_pairs) / sizeof(uni_to_iscii_pairs[0]))); |
| 187 | const uchar *pair = uni_to_iscii_pairs + 2*iscii; |
| 188 | *ch++ = *pair++; |
| 189 | *ch++ = *pair++; |
| 190 | } else { |
| 191 | *ch++ = replacement; |
| 192 | ++invalid; |
| 193 | } |
| 194 | } else { |
| 195 | if (uc[i].unicode() == 0x200c) { // ZWNJ |
| 196 | if (halant) |
| 197 | // Consonant Halant ZWNJ -> Consonant Halant Halant |
| 198 | *ch++ = 0xe8; |
| 199 | } else if (uc[i].unicode() == 0x200d) { // ZWJ |
| 200 | if (halant) |
| 201 | // Consonant Halant ZWJ -> Consonant Halant Nukta |
| 202 | *ch++ = 0xe9; |
| 203 | } else { |
| 204 | *ch++ = replacement; |
| 205 | ++invalid; |
| 206 | } |
| 207 | } |
| 208 | halant = (pos == 0x4d); |
| 209 | } |
| 210 | result.truncate(pos: ch - (uchar *)result.data()); |
| 211 | |
| 212 | if (state) { |
| 213 | state->invalidChars += invalid; |
| 214 | state->state_data[0] = halant; |
| 215 | } |
| 216 | return result; |
| 217 | } |
| 218 | |
| 219 | QString QIsciiCodec::convertToUnicode(const char* chars, int len, ConverterState *state) const |
| 220 | { |
| 221 | bool halant = false; |
| 222 | if (state) { |
| 223 | halant = state->state_data[0]; |
| 224 | } |
| 225 | |
| 226 | QString result(len, QT_PREPEND_NAMESPACE(Qt::Uninitialized)); |
| 227 | QChar *uc = result.data(); |
| 228 | |
| 229 | const int base = codecs[idx].base; |
| 230 | |
| 231 | for (int i = 0; i < len; ++i) { |
| 232 | char16_t ch = (uchar) chars[i]; |
| 233 | if (ch < 0xa0) |
| 234 | *uc++ = ch; |
| 235 | else { |
| 236 | ushort c = iscii_to_uni_table[ch - 0xa0]; |
| 237 | if (halant && (ch == inv || ch == 0xe9)) { |
| 238 | // Consonant Halant inv -> Consonant Halant ZWJ |
| 239 | // Consonant Halant Nukta -> Consonant Halant ZWJ |
| 240 | *uc++ = QChar(0x200d); |
| 241 | } else if (halant && ch == 0xe8) { |
| 242 | // Consonant Halant Halant -> Consonant Halant ZWNJ |
| 243 | *uc++ = QChar(0x200c); |
| 244 | } else { |
| 245 | *uc++ = QChar(c+base); |
| 246 | } |
| 247 | } |
| 248 | halant = ((uchar)chars[i] == 0xe8); |
| 249 | } |
| 250 | result.resize(size: uc - result.unicode()); |
| 251 | |
| 252 | if (state) { |
| 253 | state->state_data[0] = halant; |
| 254 | } |
| 255 | return result; |
| 256 | } |
| 257 | |
| 258 | QT_END_NAMESPACE |
| 259 | |