| 1 | /* |
| 2 | SPDX-FileCopyrightText: 2002-2008 The Kopete developers <kopete-devel@kde.org> |
| 3 | SPDX-FileCopyrightText: 2008 Carlo Segato <brandon.ml@gmail.com> |
| 4 | SPDX-FileCopyrightText: 2002-2003 Stefan Gehn <metz@gehn.net> |
| 5 | SPDX-FileCopyrightText: 2005 Engin AYDOGAN <engin@bzzzt.biz> |
| 6 | |
| 7 | SPDX-License-Identifier: LGPL-2.1-or-later |
| 8 | */ |
| 9 | |
| 10 | #include "kemoticonsparser_p.h" |
| 11 | |
| 12 | #include <QDebug> |
| 13 | #include <QString> |
| 14 | |
| 15 | #include <cstring> |
| 16 | |
// ### Applies to emoticons_map below: keep its entries sorted by the first column (the match text), and keep the match text HTML entity-encoded!
/*
 * One row of the emoticon lookup table.
 *
 * Deliberately a plain aggregate of two C strings so that a table of these
 * can be brace-initialized as a constexpr array.
 */
struct Emoticon {
    // Textual emoticon as it is looked for at the start of the input.
    char const *match;
    // UTF-8 encoded text that is substituted for the matched emoticon.
    char const *replacement;
};
| 22 | // clang-format off |
| 23 | static constexpr const Emoticon emoticons_map[] = { |
| 24 | {.match: ">-(" , .replacement: "đ " }, |
| 25 | {.match: ">:(" , .replacement: "đ " }, |
| 26 | {.match: ">:)" , .replacement: "đ" }, |
| 27 | {.match: ">:-(" , .replacement: "đ " }, |
| 28 | {.match: ">w<" , .replacement: "đ" }, |
| 29 | {.match: "<-.->" , .replacement: "đ´" }, |
| 30 | {.match: "<3" , .replacement: "âĨī¸" }, |
| 31 | {.match: "<]:o){" , .replacement: "đ¤Ą" }, |
| 32 | {.match: "<|:^0|" , .replacement: "đ¤Ą" }, |
| 33 | {.match: "()-()" , .replacement: "đ¤" }, |
| 34 | {.match: "(-_o)zzZ" , .replacement: "đ´" }, |
| 35 | {.match: "(:|" , .replacement: "đĨą" }, |
| 36 | {.match: "(@_@)" , .replacement: "đ" }, |
| 37 | {.match: "(c:>*" , .replacement: "đ¤Ą" }, |
| 38 | {.match: "({)" , .replacement: "đ¤" }, |
| 39 | {.match: "(})" , .replacement: "đ¤" }, |
| 40 | {.match: "*<:^)" , .replacement: "đ¤Ą" }, |
| 41 | {.match: "*<:o)" , .replacement: "đ¤Ą" }, |
| 42 | {.match: "*:o)" , .replacement: "đ¤Ą" }, |
| 43 | {.match: "*:oB" , .replacement: "đ¤Ą" }, |
| 44 | {.match: "*:oP" , .replacement: "đ¤Ą" }, |
| 45 | {.match: "+o(" , .replacement: "đ¤ĸ" }, |
| 46 | {.match: ",':(" , .replacement: "đ" }, |
| 47 | {.match: "-_-" , .replacement: "đ´" }, |
| 48 | {.match: "-_-+" , .replacement: "đ " }, |
| 49 | {.match: "-o-o-" , .replacement: "đ¤" }, |
| 50 | {.match: "/00\\" , .replacement: "đ" }, |
| 51 | {.match: "0:)" , .replacement: "đ" }, |
| 52 | {.match: "0:-)" , .replacement: "đ" }, |
| 53 | {.match: "0;)" , .replacement: "đ" }, |
| 54 | {.match: "0=)" , .replacement: "đ" }, |
| 55 | {.match: "3:)" , .replacement: "đ" }, |
| 56 | {.match: "8)" , .replacement: "đ" }, |
| 57 | {.match: "8-)" , .replacement: "đ" }, |
| 58 | {.match: "8:::(" , .replacement: "đ" }, |
| 59 | {.match: ":\"-(" , .replacement: "đĸ" }, |
| 60 | {.match: ":'(" , .replacement: "đĸ" }, |
| 61 | {.match: ":'-(" , .replacement: "đĸ" }, |
| 62 | {.match: ":'D" , .replacement: "đ" }, |
| 63 | {.match: ":(" , .replacement: "đ" }, |
| 64 | {.match: ":((" , .replacement: "đĸ" }, |
| 65 | {.match: ":)" , .replacement: "đ" }, |
| 66 | {.match: ":))" , .replacement: "đ" }, |
| 67 | {.match: ":*" , .replacement: "đ" }, |
| 68 | {.match: ":*(" , .replacement: "đĸ" }, |
| 69 | {.match: ":*)" , .replacement: "đ" }, |
| 70 | {.match: ":-$" , .replacement: "đ¯" }, |
| 71 | {.match: ":-&" , .replacement: "đ¤ĸ" }, |
| 72 | {.match: ":->" , .replacement: "âēī¸" }, |
| 73 | {.match: ":->>" , .replacement: "âēī¸" }, |
| 74 | {.match: ":-(" , .replacement: "đ" }, |
| 75 | {.match: ":-)" , .replacement: "đ" }, |
| 76 | {.match: ":-))" , .replacement: "đ" }, |
| 77 | {.match: ":-)*" , .replacement: "đ" }, |
| 78 | {.match: ":-*" , .replacement: "đ" }, |
| 79 | {.match: ":-/" , .replacement: "đ" }, |
| 80 | {.match: ":-@" , .replacement: "đ " }, |
| 81 | {.match: ":-D" , .replacement: "đ" }, |
| 82 | {.match: ":-O" , .replacement: "đŽ" }, |
| 83 | {.match: ":-P" , .replacement: "đ" }, |
| 84 | {.match: ":-Q" , .replacement: "đ" }, |
| 85 | {.match: ":-S" , .replacement: "đ" }, |
| 86 | {.match: ":-X" , .replacement: "đ¤Ģ" }, |
| 87 | {.match: ":-[" , .replacement: "đ¯" }, |
| 88 | {.match: ":-o" , .replacement: "đŽ" }, |
| 89 | {.match: ":-p" , .replacement: "đ" }, |
| 90 | {.match: ":-s" , .replacement: "đ" }, |
| 91 | {.match: ":-t" , .replacement: "đ" }, |
| 92 | {.match: ":-x" , .replacement: "đ¤Ģ" }, |
| 93 | {.match: ":-|" , .replacement: "đ" }, |
| 94 | {.match: ":-||" , .replacement: "đ " }, |
| 95 | {.match: ":/" , .replacement: "đ̤" }, |
| 96 | {.match: ":@" , .replacement: "đ " }, |
| 97 | {.match: ":C" , .replacement: "âšī¸" }, |
| 98 | {.match: ":D" , .replacement: "đ" }, |
| 99 | {.match: ":O" , .replacement: "đŽ" }, |
| 100 | {.match: ":P" , .replacement: "đ" }, |
| 101 | {.match: ":S" , .replacement: "đ" }, |
| 102 | {.match: ":X" , .replacement: "đ¤Ģ" }, |
| 103 | {.match: ":\\" , .replacement: "đ̤" }, |
| 104 | {.match: ":_(" , .replacement: "đĸ" }, |
| 105 | {.match: ":c" , .replacement: "âšī¸" }, |
| 106 | {.match: ":o" , .replacement: "đŽ" }, |
| 107 | {.match: ":o)" , .replacement: "đ¤Ą" }, |
| 108 | {.match: ":p" , .replacement: "đ" }, |
| 109 | {.match: ":s" , .replacement: "đ" }, |
| 110 | {.match: ":x" , .replacement: "đ¤Ģ" }, |
| 111 | {.match: ":|))" , .replacement: "đ" }, |
| 112 | {.match: ";(" , .replacement: "đĸ" }, |
| 113 | {.match: ";)" , .replacement: "đ" }, |
| 114 | {.match: ";-(!)" , .replacement: "đ" }, |
| 115 | {.match: ";-(" , .replacement: "đĸ" }, |
| 116 | {.match: ";-)" , .replacement: "đ" }, |
| 117 | {.match: ";_;" , .replacement: "đĸ" }, |
| 118 | {.match: "= #" , .replacement: "đ" }, |
| 119 | {.match: "='(" , .replacement: "đĸ" }, |
| 120 | {.match: "=(" , .replacement: "đ" }, |
| 121 | {.match: "=[" , .replacement: "đ" }, |
| 122 | {.match: "=^D" , .replacement: "đ" }, |
| 123 | {.match: "B-)" , .replacement: "đ" }, |
| 124 | {.match: "D:" , .replacement: "đ" }, |
| 125 | {.match: "D=" , .replacement: "đ" }, |
| 126 | {.match: "O-)" , .replacement: "đ" }, |
| 127 | {.match: "O.o" , .replacement: "đ¤" }, |
| 128 | {.match: "O.o?" , .replacement: "đ¤" }, |
| 129 | {.match: "O:)" , .replacement: "đ" }, |
| 130 | {.match: "O:-)" , .replacement: "đ" }, |
| 131 | {.match: "O;" , .replacement: "đ" }, |
| 132 | {.match: "T.T" , .replacement: "đ" }, |
| 133 | {.match: "T_T" , .replacement: "đ" }, |
| 134 | {.match: "X-(" , .replacement: "đ " }, |
| 135 | {.match: "Y_Y" , .replacement: "đ" }, |
| 136 | {.match: "Z_Z" , .replacement: "đ´" }, |
| 137 | {.match: "\\o-o/" , .replacement: "đ¤" }, |
| 138 | {.match: "\\~/" , .replacement: "đ¤" }, |
| 139 | {.match: "]:->" , .replacement: "đ" }, |
| 140 | {.match: "^j^" , .replacement: "đ" }, |
| 141 | {.match: "i_i" , .replacement: "đ" }, |
| 142 | {.match: "t.t" , .replacement: "đ" }, |
| 143 | {.match: "y_y" , .replacement: "đ" }, |
| 144 | {.match: "|-O" , .replacement: "đĨą" }, |
| 145 | {.match: "}:-)" , .replacement: "đ" }, |
| 146 | }; |
| 147 | // clang-format on |
| 148 | |
| 149 | static const Emoticon *findEmoticon(QStringView s) |
| 150 | { |
| 151 | auto it = std::lower_bound(first: std::begin(arr: emoticons_map), last: std::end(arr: emoticons_map), val: s, comp: [](const auto &emoticon, auto s) { |
| 152 | return QLatin1String(emoticon.match) < s; |
| 153 | }); |
| 154 | if (it != std::end(arr: emoticons_map) && s.startsWith(s: QLatin1String((*it).match))) { |
| 155 | return it; |
| 156 | } |
| 157 | // if we don't have an exact match but a prefix, that will be in the item before the one returned by lower_bound |
| 158 | if (it != std::begin(arr: emoticons_map)) { |
| 159 | it = std::prev(x: it); |
| 160 | if (s.startsWith(s: QLatin1String((*it).match))) { |
| 161 | return it; |
| 162 | } |
| 163 | } |
| 164 | return nullptr; |
| 165 | } |
| 166 | |
| 167 | QString KEmoticonsParser::parseEmoticons(const QString &message) |
| 168 | { |
| 169 | QString result; |
| 170 | |
| 171 | /* previous char, in the firs iteration assume that it is space since we want |
| 172 | * to let emoticons at the beginning, the very first previous QChar must be a space. */ |
| 173 | QChar p = QLatin1Char(' '); |
| 174 | |
| 175 | int pos = 0; |
| 176 | int previousPos = 0; |
| 177 | |
| 178 | bool inHTMLTag = false; |
| 179 | bool inHTMLLink = false; |
| 180 | bool inHTMLEntity = false; |
| 181 | |
| 182 | for (; pos < message.length(); ++pos) { |
| 183 | const QChar c = message[pos]; |
| 184 | |
| 185 | if (!inHTMLTag) { // Are we already in an HTML tag ? |
| 186 | if (c == QLatin1Char('<')) { // If not check if are going into one |
| 187 | inHTMLTag = true; // If we are, change the state to inHTML |
| 188 | p = c; |
| 189 | continue; |
| 190 | } |
| 191 | } else { // We are already in a HTML tag |
| 192 | if (c == QLatin1Char('>')) { // Check if it ends |
| 193 | inHTMLTag = false; // If so, change the state |
| 194 | |
| 195 | if (p == QLatin1Char('a')) { |
| 196 | inHTMLLink = false; |
| 197 | } |
| 198 | } else if (c == QLatin1Char('a') && p == QLatin1Char('<')) { // check if we just entered an anchor tag |
| 199 | inHTMLLink = true; // don't put smileys in urls |
| 200 | } |
| 201 | p = c; |
| 202 | continue; |
| 203 | } |
| 204 | |
| 205 | if (!inHTMLEntity) { // are we |
| 206 | if (c == QLatin1Char('&')) { |
| 207 | inHTMLEntity = true; |
| 208 | } |
| 209 | } |
| 210 | |
| 211 | if (inHTMLLink) { // i can't think of any situation where a link address might need emoticons |
| 212 | p = c; |
| 213 | continue; |
| 214 | } |
| 215 | |
| 216 | if (!p.isSpace() && p != QLatin1Char('>')) { // '>' may mark the end of an html tag |
| 217 | p = c; |
| 218 | continue; |
| 219 | } /* strict requires space before the emoticon */ |
| 220 | |
| 221 | const auto emoticon = findEmoticon(s: QStringView(message).mid(pos)); |
| 222 | if (emoticon) { |
| 223 | bool found = true; |
| 224 | /* check if the character after this match is space or end of string*/ |
| 225 | const int matchLen = std::strlen(s: emoticon->match); |
| 226 | if (message.length() > pos + matchLen) { |
| 227 | const QChar n = message[pos + matchLen]; |
| 228 | //<br/> marks the end of a line |
| 229 | if (n != QLatin1Char('<') && !n.isSpace() && !n.isNull() && n != QLatin1Char('&')) { |
| 230 | found = false; |
| 231 | } |
| 232 | } |
| 233 | |
| 234 | if (found) { |
| 235 | result += QStringView(message).mid(pos: previousPos, n: pos - previousPos); |
| 236 | result += QString::fromUtf8(utf8: emoticon->replacement); |
| 237 | |
| 238 | /* Skip the matched emoticon's matchText */ |
| 239 | pos += matchLen - 1; |
| 240 | previousPos = pos + 1; |
| 241 | } else { |
| 242 | if (inHTMLEntity) { |
| 243 | // If we are in an HTML entity such as > |
| 244 | const int htmlEnd = message.indexOf(ch: QLatin1Char(';'), from: pos); |
| 245 | // Search for where it ends |
| 246 | if (htmlEnd == -1) { |
| 247 | // Apparently this HTML entity isn't ended, something is wrong, try skip the '&' |
| 248 | // and continue |
| 249 | // qCDebug(KEMOTICONS_CORE) << "Broken HTML entity, trying to recover."; |
| 250 | inHTMLEntity = false; |
| 251 | pos++; |
| 252 | } else { |
| 253 | pos = htmlEnd; |
| 254 | inHTMLEntity = false; |
| 255 | } |
| 256 | } |
| 257 | } |
| 258 | } /* else no emoticons begin with this character, so don't do anything */ |
| 259 | p = c; |
| 260 | } |
| 261 | |
| 262 | if (result.isEmpty()) { |
| 263 | return message; |
| 264 | } |
| 265 | if (previousPos < message.length()) { |
| 266 | result += QStringView(message).mid(pos: previousPos); |
| 267 | } |
| 268 | return result; |
| 269 | } |
| 270 | |