1/* -*- C++ -*-
2 SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
3
4 SPDX-License-Identifier: MIT
5*/
6
7#include "nsCharSetProber.h"
8
9#include <stdlib.h>
10
11namespace kencodingprober
12{
13// This filter applies to all scripts which do not use English characters
14bool nsCharSetProber::FilterWithoutEnglishLetters(const char *aBuf, unsigned int aLen, char **newBuf, unsigned int &newLen)
15{
16 char *newptr;
17 char *prevPtr;
18 char *curPtr;
19
20 bool meetMSB = false;
21 newptr = *newBuf = (char *)malloc(size: aLen);
22 if (!newptr) {
23 return false;
24 }
25
26 for (curPtr = prevPtr = (char *)aBuf; curPtr < aBuf + aLen; ++curPtr) {
27 if (*curPtr & 0x80) {
28 meetMSB = true;
29 } else if (*curPtr < 'A' || (*curPtr > 'Z' && *curPtr < 'a') || *curPtr > 'z') {
30 // current char is a symbol, most likely a punctuation. we treat it as segment delimiter
31 if (meetMSB && curPtr > prevPtr)
32 // this segment contains more than single symbol, and it has upper ASCII, we need to keep it
33 {
34 while (prevPtr < curPtr) {
35 *newptr++ = *prevPtr++;
36 }
37 prevPtr++;
38 *newptr++ = ' ';
39 meetMSB = false;
40 } else { // ignore current segment. (either because it is just a symbol or just an English word)
41 prevPtr = curPtr + 1;
42 }
43 }
44 }
45 if (meetMSB && curPtr > prevPtr) {
46 while (prevPtr < curPtr) {
47 *newptr++ = *prevPtr++;
48 }
49 }
50
51 newLen = newptr - *newBuf;
52
53 return true;
54}
55
56// This filter applies to all scripts which contain both English characters and upper ASCII characters.
57bool nsCharSetProber::FilterWithEnglishLetters(const char *aBuf, unsigned int aLen, char **newBuf, unsigned int &newLen)
58{
59 // do filtering to reduce load to probers
60 char *newptr;
61 char *prevPtr;
62 char *curPtr;
63 bool isInTag = false;
64
65 newptr = *newBuf = (char *)malloc(size: aLen);
66 if (!newptr) {
67 return false;
68 }
69
70 for (curPtr = prevPtr = (char *)aBuf; curPtr < aBuf + aLen; ++curPtr) {
71 if (*curPtr == '>') {
72 isInTag = false;
73 } else if (*curPtr == '<') {
74 isInTag = true;
75 }
76
77 if (!(*curPtr & 0x80) //
78 && (*curPtr < 'A' || (*curPtr > 'Z' && *curPtr < 'a') || *curPtr > 'z')) {
79 if (curPtr > prevPtr && !isInTag) // Current segment contains more than just a symbol
80 // and it is not inside a tag, keep it.
81 {
82 while (prevPtr < curPtr) {
83 *newptr++ = *prevPtr++;
84 }
85 prevPtr++;
86 *newptr++ = ' ';
87 } else {
88 prevPtr = curPtr + 1;
89 }
90 }
91 }
92
93 // If the current segment contains more than just a symbol
94 // and it is not inside a tag then keep it.
95 if (!isInTag) {
96 while (prevPtr < curPtr) {
97 *newptr++ = *prevPtr++;
98 }
99 }
100
101 newLen = newptr - *newBuf;
102
103 return true;
104}
105}
106

// Source: kcodecs/src/probers/nsCharSetProber.cpp