nsCharSetProber.cpp source code [kcodecs/src/probers/nsCharSetProber.cpp]

/*  -*- C++ -*-
2	SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
3
4	SPDX-License-Identifier: MIT
5	*/
6
7	#include "nsCharSetProber.h"
8
9	#include <stdlib.h>
10
11	namespace kencodingprober
12	{
13	// This filter applies to all scripts which do not use English characters
14	bool nsCharSetProber::FilterWithoutEnglishLetters(const char aBuf, unsigned* int aLen, char *newBuf, unsigned* int &newLen)
15	{
16	char *newptr;
17	char *prevPtr;
18	char *curPtr;
19
20	bool meetMSB = false;
21	newptr = newBuf = (char* *)malloc(size: aLen);
22	if (!newptr) {
23	return false;
24	}
25
26	for (curPtr = prevPtr = (char *)aBuf; curPtr < aBuf + aLen; ++curPtr) {
27	if (*curPtr & `0x80`) {
28	meetMSB = true;
29	} else if (curPtr < `'A'` \|\| (curPtr > `'Z'` && curPtr < `'a'`) \|\| curPtr > `'z'`) {
30	// current char is a symbol, most likely a punctuation. we treat it as segment delimiter
31	if (meetMSB && curPtr > prevPtr)
32	// this segment contains more than single symbol, and it has upper ASCII, we need to keep it
33	{
34	while (prevPtr < curPtr) {
35	newptr++ = prevPtr++;
36	}
37	prevPtr++;
38	*newptr++ = `' '`;
39	meetMSB = false;
40	} else { // ignore current segment. (either because it is just a symbol or just an English word)
41	prevPtr = curPtr + `1`;
42	}
43	}
44	}
45	if (meetMSB && curPtr > prevPtr) {
46	while (prevPtr < curPtr) {
47	newptr++ = prevPtr++;
48	}
49	}
50
51	newLen = newptr - *newBuf;
52
53	return true;
54	}
55
56	// This filter applies to all scripts which contain both English characters and upper ASCII characters.
57	bool nsCharSetProber::FilterWithEnglishLetters(const char aBuf, unsigned* int aLen, char *newBuf, unsigned* int &newLen)
58	{
59	// do filtering to reduce load to probers
60	char *newptr;
61	char *prevPtr;
62	char *curPtr;
63	bool isInTag = false;
64
65	newptr = newBuf = (char* *)malloc(size: aLen);
66	if (!newptr) {
67	return false;
68	}
69
70	for (curPtr = prevPtr = (char *)aBuf; curPtr < aBuf + aLen; ++curPtr) {
71	if (*curPtr == `'>'`) {
72	isInTag = false;
73	} else if (*curPtr == `'<'`) {
74	isInTag = true;
75	}
76
77	if (!(*curPtr & `0x80`) //
78	&& (curPtr < `'A'` \|\| (curPtr > `'Z'` && curPtr < `'a'`) \|\| curPtr > `'z'`)) {
79	if (curPtr > prevPtr && !isInTag) // Current segment contains more than just a symbol
80	// and it is not inside a tag, keep it.
81	{
82	while (prevPtr < curPtr) {
83	newptr++ = prevPtr++;
84	}
85	prevPtr++;
86	*newptr++ = `' '`;
87	} else {
88	prevPtr = curPtr + `1`;
89	}
90	}
91	}
92
93	// If the current segment contains more than just a symbol
94	// and it is not inside a tag then keep it.
95	if (!isInTag) {
96	while (prevPtr < curPtr) {
97	newptr++ = prevPtr++;
98	}
99	}
100
101	newLen = newptr - *newBuf;
102
103	return true;
104	}
105	}
106

source code of kcodecs/src/probers/nsCharSetProber.cpp