shift_jis.rs source code [crates/encoding_rs/src/shift_jis.rs]

1	// Copyright Mozilla Foundation. See the COPYRIGHT
2	// file at the top-level directory of this distribution.
3	//
4	// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5	// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6	// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7	// option. This file may not be copied, modified, or distributed
8	// except according to those terms.
9
10	use super::*;
11	use crate::data::*;
12	use crate::handles::*;
13	use crate::variant::*;
14	// Rust 1.14.0 requires the following despite the asterisk above.
15	use super::in_inclusive_range;
16	use super::in_inclusive_range16;
17
/// Streaming decoder state for Shift_JIS.
pub struct ShiftJisDecoder {
    // Byte held over between calls, if any. `None` is what
    // `in_neutral_state` reports as neutral. Presumably this is the
    // (offset-adjusted) lead of an unfinished two-byte sequence; the macro
    // that reads and writes it is not visible here. TODO confirm.
    lead: Option<u8>,
}
21
22	impl ShiftJisDecoder {
23	pub fn new() -> VariantDecoder {
24	VariantDecoder::ShiftJis(ShiftJisDecoder { lead: None })
25	}
26
/// Returns `true` when no lead byte is buffered, i.e. `self.lead` is `None`.
pub fn in_neutral_state(&self) -> bool {
    self.lead.is_none()
}
30
31	fn plus_one_if_lead(&self, byte_length: usize) -> Option<usize> {
32	byte_length.checked_add(match self.lead {
33	None => `0`,
34	Some(_) => `1`,
35	})
36	}
37
/// Upper bound, in UTF-16 code units, for decoding `byte_length` input bytes.
///
/// This is `byte_length` plus one if a lead byte is buffered, or `None` when
/// that sum overflows `usize`.
pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
    self.plus_one_if_lead(byte_length)
}
41
/// Upper bound, in UTF-8 bytes, for decoding `byte_length` input bytes
/// without replacement.
///
/// Uses the same bound as `max_utf8_buffer_length`; returns `None` on
/// overflow.
pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
    // worst case: 1 to 3 (half-width katakana)
    self.max_utf8_buffer_length(byte_length)
}
46
47	pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
48	checked_mul(`3`, self.plus_one_if_lead(byte_length))
49	}
50
51	ascii_compatible_two_byte_decoder_functions!(
52	{
53	// If lead is between 0x81 and 0x9F, inclusive,
54	// subtract offset 0x81. Else if lead is
55	// between 0xE0 and 0xFC, inclusive, subtract
56	// offset 0xC1. Else if lead is between
57	// 0xA1 and 0xDF, inclusive, map to half-width
58	// Katakana. Else if lead is 0x80, pass through.
59	let mut non_ascii_minus_offset =
60	non_ascii.wrapping_sub(`0x81`);
61	if non_ascii_minus_offset > (`0x9F` - `0x81`) {
62	let non_ascii_minus_range_start = non_ascii.wrapping_sub(`0xE0`);
63	if non_ascii_minus_range_start > (`0xFC` - `0xE0`) {
64	let non_ascii_minus_half_with_katakana_start = non_ascii.wrapping_sub(`0xA1`);
65	if non_ascii_minus_half_with_katakana_start > (`0xDF` - `0xA1`) {
66	if non_ascii == `0x80` {
67	handle.write_mid_bmp(`0x80`);
68	// Not caring about optimizing subsequent non-ASCII
69	continue 'outermost;
70	}
71	return (DecoderResult::Malformed(`1`, `0`),
72	source.consumed(),
73	handle.written());
74	}
75	handle.write_upper_bmp(`0xFF61` + u16::from(non_ascii_minus_half_with_katakana_start));
76	// Not caring about optimizing subsequent non-ASCII
77	continue 'outermost;
78	}
79	non_ascii_minus_offset = non_ascii - `0xC1`;
80	}
81	non_ascii_minus_offset
82	},
83	{
84	// If trail is between 0x40 and 0x7E, inclusive,
85	// subtract offset 0x40. Else if trail is
86	// between 0x80 and 0xFC, inclusive, subtract
87	// offset 0x41.
88	// Fast-track Hiragana (60% according to Lunde)
89	// and Katakana (10% acconding to Lunde).
90	// Hiragana doesn't cross 0x7F, but Katakana does.
91	// We can check for Hiragana before normalizing
92	// trail.
93	let trail_minus_hiragana = byte.wrapping_sub(`0x9F`);
94	if lead_minus_offset == `0x01` && trail_minus_hiragana < `0x53` {
95	// Hiragana
96	handle.write_upper_bmp(`0x3041` + u16::from(trail_minus_hiragana))
97	} else {
98	let mut trail_minus_offset =
99	byte.wrapping_sub(`0x40`);
100	if trail_minus_offset > (`0x7E` - `0x40`) {
101	let trail_minus_range_start =
102	byte.wrapping_sub(`0x80`);
103	if trail_minus_range_start > (`0xFC` - `0x80`) {
104	if byte < `0x80` {
105	return (DecoderResult::Malformed(`1`, `0`),
106	unread_handle_trail.unread(),
107	handle.written());
108	}
109	return (DecoderResult::Malformed(`2`, `0`),
110	unread_handle_trail.consumed(),
111	handle.written());
112	}
113	trail_minus_offset = byte - `0x41`;
114	}
115	if lead_minus_offset == `0x02` &&
116	trail_minus_offset < `0x56` {
117	// Katakana
118	handle.write_upper_bmp(`0x30A1` + u16::from(trail_minus_offset))
119	} else {
120	let pointer = lead_minus_offset as usize *
121	`188usize` +
122	trail_minus_offset as usize;
123	let level1_pointer = pointer.wrapping_sub(`1410`);
124	if level1_pointer < JIS0208_LEVEL1_KANJI.len() {
125	handle.write_upper_bmp(JIS0208_LEVEL1_KANJI[level1_pointer])
126	} else {
127	let level2_pointer = pointer.wrapping_sub(`4418`);
128	if level2_pointer <
129	JIS0208_LEVEL2_AND_ADDITIONAL_KANJI.len() {
130	handle.write_upper_bmp(JIS0208_LEVEL2_AND_ADDITIONAL_KANJI[level2_pointer])
131	} else {
132	let upper_ibm_pointer = pointer.wrapping_sub(`10744`);
133	if upper_ibm_pointer < IBM_KANJI.len() {
134	handle.write_upper_bmp(IBM_KANJI[upper_ibm_pointer])
135	} else {
136	let lower_ibm_pointer = pointer.wrapping_sub(`8272`);
137	if lower_ibm_pointer < IBM_KANJI.len() {
138	handle.write_upper_bmp(IBM_KANJI[lower_ibm_pointer])
139	} else if in_inclusive_range(pointer, `8836`, `10715`) {
140	handle.write_upper_bmp((`0xE000` - `8836` + pointer) as u16)
141	} else if let Some(bmp) = jis0208_symbol_decode(pointer) {
142	handle.write_bmp_excl_ascii(bmp)
143	} else if let Some(bmp) = jis0208_range_decode(pointer) {
144	handle.write_bmp_excl_ascii(bmp)
145	} else {
146	if byte < `0x80` {
147	return (DecoderResult::Malformed(`1`, `0`),
148	unread_handle_trail.unread(),
149	handle.written());
150	}
151	return (DecoderResult::Malformed(`2`, `0`),
152	unread_handle_trail.consumed(),
153	handle.written());
154	}
155	}
156	}
157	}
158	}
159	}
160	},
161	self,
162	non_ascii,
163	byte,
164	lead_minus_offset,
165	unread_handle_trail,
166	source,
167	handle,
168	'outermost,
169	copy_ascii_from_check_space_bmp,
170	check_space_bmp,
171	`false`);
172	}
173
/// Encodes a kanji code point to a Shift_JIS `(lead, trail)` byte pair.
///
/// Variant compiled when the `fast-kanji-encode` feature is enabled; it
/// delegates entirely to `jis0208_kanji_shift_jis_encode`.
#[cfg(feature = "fast-kanji-encode")]
#[inline(always)]
fn encode_kanji(bmp: u16) -> Option<(u8, u8)> {
    jis0208_kanji_shift_jis_encode(bmp)
}
179
180	#[cfg(not(feature = "fast-kanji-encode"))]
181	#[inline(always)]
182	fn encode_kanji(bmp: u16) -> Option<(u8, u8)> {
183	if let Some((lead: u8, trail: u8)) = jis0208_level1_kanji_shift_jis_encode(bmp) {
184	return Some((lead, trail));
185	}
186	let pointer: usize = if `0x4EDD` == bmp {
187	// Ideograph on the symbol row!
188	`23`
189	} else if let Some(pos: usize) = jis0208_level2_and_additional_kanji_encode(bmp) {
190	`4418` + pos
191	} else if let Some(pos: usize) = position(&IBM_KANJI[..], needle:bmp) {
192	`10744` + pos
193	} else {
194	return None;
195	};
196	let lead: usize = pointer / `188`;
197	let lead_offset: usize = if lead < `0x1F` { `0x81usize` } else { `0xC1usize` };
198	let trail: usize = pointer % `188`;
199	let trail_offset: usize = if trail < `0x3F` { `0x40usize` } else { `0x41usize` };
200	Some(((lead + lead_offset) as u8, (trail + trail_offset) as u8))
201	}
202
/// Shift_JIS encoder; a unit struct with no fields.
pub struct ShiftJisEncoder;
204
205	impl ShiftJisEncoder {
206	pub fn new(encoding: &'static Encoding) -> Encoder {
207	Encoder::new(encoding, VariantEncoder::ShiftJis(ShiftJisEncoder))
208	}
209
/// Upper bound, in bytes, for encoding `u16_length` UTF-16 code units
/// without replacement.
///
/// Twice the input length, or `None` if that overflows `usize`.
pub fn max_buffer_length_from_utf16_without_replacement(
    &self,
    u16_length: usize,
) -> Option<usize> {
    // Presumably because one code unit yields at most two output bytes.
    u16_length.checked_mul(2)
}
216
/// Upper bound, in bytes, for encoding `byte_length` UTF-8 input bytes
/// without replacement.
///
/// The input length plus one, or `None` if that overflows `usize`.
pub fn max_buffer_length_from_utf8_without_replacement(
    &self,
    byte_length: usize,
) -> Option<usize> {
    // NOTE(review): the reason for the one extra byte is not visible in
    // this file; confirm against the encoder macro.
    byte_length.checked_add(1)
}
223
224	ascii_compatible_bmp_encoder_functions!(
225	{
226	// Lunde says 60% Hiragana, 30% Kanji, 10% Katakana
227	let bmp_minus_hiragana = bmp.wrapping_sub(`0x3041`);
228	if bmp_minus_hiragana < `0x53` {
229	handle.write_two(`0x82`, `0x9F` + bmp_minus_hiragana as u8)
230	} else if in_inclusive_range16(bmp, `0x4E00`, `0x9FA0`) {
231	if let Some((lead, trail)) = encode_kanji(bmp) {
232	handle.write_two(lead, trail)
233	} else {
234	return (
235	EncoderResult::unmappable_from_bmp(bmp),
236	source.consumed(),
237	handle.written(),
238	);
239	}
240	} else {
241	let bmp_minus_katakana = bmp.wrapping_sub(`0x30A1`);
242	if bmp_minus_katakana < `0x56` {
243	let trail_offset = if bmp_minus_katakana < `0x3F` {
244	`0x40`
245	} else {
246	`0x41`
247	};
248	handle.write_two(`0x83`, (trail_offset + bmp_minus_katakana) as u8)
249	} else {
250	let bmp_minus_space = bmp.wrapping_sub(`0x3000`);
251	if bmp_minus_space < `3` {
252	// fast-track common punctuation
253	handle.write_two(`0x81`, `0x40` + bmp_minus_space as u8)
254	} else if bmp == `0xA5` {
255	handle.write_one(`0x5Cu8`)
256	} else if bmp == `0x80` {
257	handle.write_one(`0x80u8`)
258	} else if bmp == `0x203E` {
259	handle.write_one(`0x7Eu8`)
260	} else if in_inclusive_range16(bmp, `0xFF61`, `0xFF9F`) {
261	handle.write_one((bmp - (`0xFF61` - `0xA1`)) as u8)
262	} else if bmp == `0x2212` {
263	handle.write_two(`0x81u8`, `0x7Cu8`)
264	} else {
265	let bmp_minus_roman = bmp.wrapping_sub(`0x2170`);
266	let pointer = if bmp_minus_roman <= (`0x2179` - `0x2170`) {
267	`10716` + bmp_minus_roman as usize
268	} else if let Some(pointer) = jis0208_range_encode(bmp) {
269	pointer
270	} else if in_inclusive_range16(bmp, `0xFA0E`, `0xFA2D`)
271	\|\| bmp == `0xF929`
272	\|\| bmp == `0xF9DC`
273	{
274	// Guaranteed to be found in IBM_KANJI
275	let pos = position(&IBM_KANJI[..], bmp).unwrap();
276	`10744` + pos
277	} else if let Some(pointer) = jis0208_symbol_encode(bmp) {
278	pointer
279	} else {
280	return (
281	EncoderResult::unmappable_from_bmp(bmp),
282	source.consumed(),
283	handle.written(),
284	);
285	};
286	let lead = pointer / `188`;
287	let lead_offset = if lead < `0x1F` { `0x81usize` } else { `0xC1usize` };
288	let trail = pointer % `188`;
289	let trail_offset = if trail < `0x3F` { `0x40usize` } else { `0x41usize` };
290	handle.write_two((lead + lead_offset) as u8, (trail + trail_offset) as u8)
291	}
292	}
293	}
294	},
295	bmp,
296	self,
297	source,
298	handle,
299	copy_ascii_to_check_space_two,
300	check_space_two,
301	`false`
302	);
303	}
304
305	// Any copyright to the test code below this comment is dedicated to the
306	// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
307
308	#[cfg(all(test, feature = "alloc"))]
309	mod tests {
310	use super::super::testing::*;
311	use super::super::*;
312
/// Runs the shared `decode` test helper with the `SHIFT_JIS` encoding.
fn decode_shift_jis(bytes: &[u8], expect: &str) {
    decode(SHIFT_JIS, bytes, expect);
}
316
/// Runs the shared `encode` test helper with the `SHIFT_JIS` encoding.
fn encode_shift_jis(string: &str, expect: &[u8]) {
    encode(SHIFT_JIS, string, expect);
}
320
#[test]
fn test_shift_jis_decode() {
    // Each entry pairs Shift_JIS input bytes with the expected decoding;
    // U+FFFD marks a malformed sequence.
    let cases: &[(&[u8], &str)] = &[
        // Empty
        (b"", ""),
        // ASCII
        (b"\x61\x62", "\u{0061}\u{0062}"),
        // Half-width
        (b"\xA1", "\u{FF61}"),
        (b"\xDF", "\u{FF9F}"),
        (b"\xA0", "\u{FFFD}"),
        (b"\xE0", "\u{FFFD}"),
        (b"\xA0+", "\u{FFFD}+"),
        (b"\xE0+", "\u{FFFD}+"),
        // EUDC
        (b"\xF0\x40", "\u{E000}"),
        (b"\xF9\xFC", "\u{E757}"),
        (b"\xEF\xFC", "\u{FFFD}"),
        (b"\xFA\x40", "\u{2170}"),
        // JIS 0208
        (b"\x81\x40", "\u{3000}"),
        (b"\x81\x3F", "\u{FFFD}?"),
        (b"\xEE\xFC", "\u{FF02}"),
        (b"\xEE\xFD", "\u{FFFD}"),
        (b"\xFA\x40", "\u{2170}"),
        (b"\xFA\x3F", "\u{FFFD}?"),
        (b"\xFC\x4B", "\u{9ED1}"),
        (b"\xFC\x4C", "\u{FFFD}L"),
    ];
    for &(bytes, expect) in cases {
        decode_shift_jis(bytes, expect);
    }
}
354
#[test]
fn test_shift_jis_encode() {
    // Empty
    encode_shift_jis("", b"");

    // ASCII
    encode_shift_jis("\u{0061}\u{0062}", b"\x61\x62");

    // Exceptional code points
    encode_shift_jis("\u{0080}", b"\x80");
    encode_shift_jis("\u{00A5}", b"\x5C");
    encode_shift_jis("\u{203E}", b"\x7E");
    encode_shift_jis("\u{2212}", b"\x81\x7C");

    // Half-width
    encode_shift_jis("\u{FF61}", b"\xA1");
    encode_shift_jis("\u{FF9F}", b"\xDF");

    // EUDC
    // Not encodable. The expectation is the decimal numeric character
    // reference, assuming the shared `encode` helper encodes with
    // replacement (the helper is defined elsewhere).
    encode_shift_jis("\u{E000}", b"&#57344;");
    encode_shift_jis("\u{E757}", b"&#59223;");

    // JIS 0212
    // Not encodable either; a byte-string literal must also be ASCII-only.
    encode_shift_jis("\u{02D8}", b"&#728;");

    // JIS 0208
    encode_shift_jis("\u{3000}", b"\x81\x40");
    encode_shift_jis("\u{FF02}", b"\xFA\x57");
    encode_shift_jis("\u{2170}", b"\xFA\x40");
    encode_shift_jis("\u{9ED1}", b"\xFC\x4B");
}
386
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_shift_jis_decode_all() {
    // Decode the fixture and compare against the reference text; the
    // decode must report errors.
    let encoded = include_bytes!("test_data/shift_jis_in.txt");
    let reference = include_str!("test_data/shift_jis_in_ref.txt");
    let (decoded, had_errors) = SHIFT_JIS.decode_without_bom_handling(encoded);
    assert!(had_errors, "Should have had errors.");
    assert_eq!(&decoded[..], reference);
}
396
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_shift_jis_encode_all() {
    // Encode the fixture and compare against the reference bytes; the
    // encode must succeed without errors and keep SHIFT_JIS as the
    // output encoding.
    let text = include_str!("test_data/shift_jis_out.txt");
    let reference = include_bytes!("test_data/shift_jis_out_ref.txt");
    let (encoded, used_encoding, had_errors) = SHIFT_JIS.encode(text);
    assert!(!had_errors, "Should not have had errors.");
    assert_eq!(used_encoding, SHIFT_JIS);
    assert_eq!(&encoded[..], &reference[..]);
}
407
#[test]
fn test_shift_jis_half_width_katakana_length() {
    // One half-width Katakana byte expands to three UTF-8 bytes, so the
    // length reported for a single input byte must be enough to hold it.
    let mut buffer = [0u8; 20];
    let mut decoder = SHIFT_JIS.new_decoder();
    let capacity = decoder
        .max_utf8_buffer_length_without_replacement(1)
        .unwrap();
    let (result, read, written) =
        decoder.decode_to_utf8_without_replacement(b"\xA1", &mut buffer[..capacity], true);
    assert_eq!(result, DecoderResult::InputEmpty);
    assert_eq!(read, 1);
    assert_eq!(written, 3);
    // U+FF61 encoded as UTF-8
    assert_eq!(&buffer[..3], &[0xEFu8, 0xBD, 0xA1][..]);
}
426	}
427