1 | /* -*- C++ -*- |
2 | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org> |
3 | |
4 | SPDX-License-Identifier: MIT |
5 | */ |
6 | |
7 | #include "nsCharSetProber.h" |
8 | |
9 | #include <stdlib.h> |
10 | |
11 | namespace kencodingprober |
12 | { |
13 | // This filter applies to all scripts which do not use English characters |
14 | bool nsCharSetProber::FilterWithoutEnglishLetters(const char *aBuf, unsigned int aLen, char **newBuf, unsigned int &newLen) |
15 | { |
16 | char *newptr; |
17 | char *prevPtr; |
18 | char *curPtr; |
19 | |
20 | bool meetMSB = false; |
21 | newptr = *newBuf = (char *)malloc(size: aLen); |
22 | if (!newptr) { |
23 | return false; |
24 | } |
25 | |
26 | for (curPtr = prevPtr = (char *)aBuf; curPtr < aBuf + aLen; ++curPtr) { |
27 | if (*curPtr & 0x80) { |
28 | meetMSB = true; |
29 | } else if (*curPtr < 'A' || (*curPtr > 'Z' && *curPtr < 'a') || *curPtr > 'z') { |
30 | // current char is a symbol, most likely a punctuation. we treat it as segment delimiter |
31 | if (meetMSB && curPtr > prevPtr) |
32 | // this segment contains more than single symbol, and it has upper ASCII, we need to keep it |
33 | { |
34 | while (prevPtr < curPtr) { |
35 | *newptr++ = *prevPtr++; |
36 | } |
37 | prevPtr++; |
38 | *newptr++ = ' '; |
39 | meetMSB = false; |
40 | } else { // ignore current segment. (either because it is just a symbol or just an English word) |
41 | prevPtr = curPtr + 1; |
42 | } |
43 | } |
44 | } |
45 | if (meetMSB && curPtr > prevPtr) { |
46 | while (prevPtr < curPtr) { |
47 | *newptr++ = *prevPtr++; |
48 | } |
49 | } |
50 | |
51 | newLen = newptr - *newBuf; |
52 | |
53 | return true; |
54 | } |
55 | |
56 | // This filter applies to all scripts which contain both English characters and upper ASCII characters. |
57 | bool nsCharSetProber::FilterWithEnglishLetters(const char *aBuf, unsigned int aLen, char **newBuf, unsigned int &newLen) |
58 | { |
59 | // do filtering to reduce load to probers |
60 | char *newptr; |
61 | char *prevPtr; |
62 | char *curPtr; |
63 | bool isInTag = false; |
64 | |
65 | newptr = *newBuf = (char *)malloc(size: aLen); |
66 | if (!newptr) { |
67 | return false; |
68 | } |
69 | |
70 | for (curPtr = prevPtr = (char *)aBuf; curPtr < aBuf + aLen; ++curPtr) { |
71 | if (*curPtr == '>') { |
72 | isInTag = false; |
73 | } else if (*curPtr == '<') { |
74 | isInTag = true; |
75 | } |
76 | |
77 | if (!(*curPtr & 0x80) // |
78 | && (*curPtr < 'A' || (*curPtr > 'Z' && *curPtr < 'a') || *curPtr > 'z')) { |
79 | if (curPtr > prevPtr && !isInTag) // Current segment contains more than just a symbol |
80 | // and it is not inside a tag, keep it. |
81 | { |
82 | while (prevPtr < curPtr) { |
83 | *newptr++ = *prevPtr++; |
84 | } |
85 | prevPtr++; |
86 | *newptr++ = ' '; |
87 | } else { |
88 | prevPtr = curPtr + 1; |
89 | } |
90 | } |
91 | } |
92 | |
93 | // If the current segment contains more than just a symbol |
94 | // and it is not inside a tag then keep it. |
95 | if (!isInTag) { |
96 | while (prevPtr < curPtr) { |
97 | *newptr++ = *prevPtr++; |
98 | } |
99 | } |
100 | |
101 | newLen = newptr - *newBuf; |
102 | |
103 | return true; |
104 | } |
105 | } |
106 | |