1/*
2 * MD4C: Markdown parser for C
3 * (http://github.com/mity/md4c)
4 *
5 * Copyright (c) 2016-2020 Martin Mitas
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be included in
15 * all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
23 * IN THE SOFTWARE.
24 */
25
26#include "md4c.h"
27
28#include <limits.h>
29#include <stdio.h>
30#include <stdlib.h>
31#include <string.h>
32
33
34/*****************************
35 *** Miscellaneous Stuff ***
36 *****************************/
37
#if !defined(__STDC_VERSION__) || __STDC_VERSION__ < 199901L
    /* The "inline" keyword was only standardized in C99 (199901L). C89/C90,
     * C95 (199409L) and old compilers in general may not understand it, so
     * map it to a vendor spelling where one is known and drop it otherwise. */
    #if defined __GNUC__
        #define inline __inline__
    #elif defined _MSC_VER
        #define inline __inline
    #else
        #define inline
    #endif
#endif
48
/* Unless the build explicitly selects an encoding, assume UTF-8. */
#if !(defined MD4C_USE_ASCII || defined MD4C_USE_UTF8 || defined MD4C_USE_UTF16)
    #define MD4C_USE_UTF8
#endif

/* _T() wraps character and string literals: they become wide literals in the
 * MD4C_USE_UTF16 build and are left untouched otherwise. Any definition of
 * _T that may already exist is discarded first. */
#undef _T
#ifdef MD4C_USE_UTF16
    #define _T(x)           L##x
#else
    #define _T(x)           x
#endif
63
/* Misc. macros. */

/* Number of elements of a true array (must not be used on a pointer). The
 * argument is parenthesized so that any array-valued expression is safe. */
#define SIZEOF_ARRAY(a)     (sizeof(a) / sizeof((a)[0]))

/* Two-level stringification: STRINGIZE() expands its argument first (so
 * STRINGIZE(__LINE__) gives the line number), STRINGIZE_() does not. */
#define STRINGIZE_(x)       #x
#define STRINGIZE(x)        STRINGIZE_(x)

#ifndef TRUE
    #define TRUE            1
    #define FALSE           0
#endif
74
/* Pass a message to the parser's optional debug_log callback.
 * Requires a variable named "ctx" in scope; a no-op when no callback is set. */
#define MD_LOG(msg)                                                         \
    do {                                                                    \
        if(ctx->parser.debug_log)                                           \
            (*ctx->parser.debug_log)((msg), ctx->userdata);                 \
    } while(0)
80
/* Internal assertion helpers.
 *
 * With DEBUG defined, a failed MD_ASSERT() reports the source location and
 * the failed expression through MD_LOG() and then terminates the process via
 * exit(1). MD_LOG() dereferences a variable named "ctx", so in that build the
 * macros can only be used where such a variable is in scope.
 *
 * Without DEBUG, the condition is turned into an optimizer hint where the
 * compiler offers one (__builtin_unreachable() for __GNUC__, __assume() for
 * _MSC_VER); otherwise both macros expand to an empty statement and the
 * condition is not evaluated at all. */
#ifdef DEBUG
    #define MD_ASSERT(cond) \
        do { \
            if(!(cond)) { \
                MD_LOG(__FILE__ ":" STRINGIZE(__LINE__) ": " \
                       "Assertion '" STRINGIZE(cond) "' failed."); \
                exit(1); \
            } \
        } while(0)

    /* Marks code that must never execute; always fails in DEBUG builds. */
    #define MD_UNREACHABLE() MD_ASSERT(1 == 0)
#else
    #ifdef __GNUC__
        /* The condition is still evaluated here; only the failing path is
         * declared unreachable. */
        #define MD_ASSERT(cond) do { if(!(cond)) __builtin_unreachable(); } while(0)
        #define MD_UNREACHABLE() do { __builtin_unreachable(); } while(0)
    #elif defined _MSC_VER && _MSC_VER > 120
        #define MD_ASSERT(cond) do { __assume(cond); } while(0)
        #define MD_UNREACHABLE() do { __assume(0); } while(0)
    #else
        /* No known hint available: expand to nothing. */
        #define MD_ASSERT(cond) do {} while(0)
        #define MD_UNREACHABLE() do {} while(0)
    #endif
#endif
104
/* Annotates an intentional fall through into the next case label of a switch.
 * Expands to the "fallthrough" statement attribute on clang >= 12 and on
 * GCC-compatible compilers reporting __GNUC__ >= 7, to a no-op elsewhere. */
#if (defined __clang__ && __clang_major__ >= 12) || (defined __GNUC__ && __GNUC__ >= 7)
    #define MD_FALLTHROUGH()    __attribute__((fallthrough))
#else
    #define MD_FALLTHROUGH()    ((void)0)
#endif
113
/* Suppress "unused parameter" warnings. The argument is parenthesized so the
 * cast applies to the whole expression, whatever is passed in. */
#define MD_UNUSED(x)        ((void)(x))
116
117
118/************************
119 *** Internal Types ***
120 ************************/
121
/* These are omnipresent so let's save some typing.
 * They are plain aliases for MD_CHAR, MD_SIZE and MD_OFFSET. */
#define CHAR MD_CHAR
#define SZ MD_SIZE
#define OFF MD_OFFSET

/* Forward declarations of internal structures, so they can be referred to by
 * pointer (e.g. from MD_CTX) before their bodies are defined. */
typedef struct MD_MARK_tag MD_MARK;
typedef struct MD_BLOCK_tag MD_BLOCK;
typedef struct MD_CONTAINER_tag MD_CONTAINER;
typedef struct MD_REF_DEF_tag MD_REF_DEF;
131
132
/* While inline marks are being analyzed, the (yet unresolved) openers are
 * kept in "mark chains". This structure records just the two ends of one
 * chain; the links between the marks themselves are realized through
 * MD_MARK::prev and ::next.
 */
typedef struct MD_MARKCHAIN_tag {
    int head;   /* Index of first mark in the chain, or -1 if empty. */
    int tail;   /* Index of last mark in the chain, or -1 if empty. */
} MD_MARKCHAIN;
142
/* Context propagated through all the parsing.
 *
 * Many helper macros in this file (CH(), STR(), MD_LOG(), the *_OPENERS chain
 * accessors below, ...) expand to expressions using a variable named "ctx",
 * so functions using them need a pointer to this structure under that name. */
typedef struct MD_CTX_tag MD_CTX;
struct MD_CTX_tag {
    /* Immutable stuff (parameters of md_parse()). */
    const CHAR* text;
    SZ size;
    MD_PARSER parser;
    void* userdata;

    /* When this is true, it allows some optimizations. */
    int doc_ends_with_newline;

    /* Helper temporary growing buffer.
     * (Grown on demand by MD_TEMP_BUFFER(); alloc_buffer is its current size.) */
    CHAR* buffer;
    unsigned alloc_buffer;

    /* Reference definitions. */
    MD_REF_DEF* ref_defs;
    int n_ref_defs;
    int alloc_ref_defs;
    void** ref_def_hashtable;
    int ref_def_hashtable_size;

    /* Stack of inline/span markers.
     * This is only used for parsing a single block contents but by storing it
     * here we may reuse the stack for subsequent blocks; i.e. we have fewer
     * (re)allocations. */
    MD_MARK* marks;
    int n_marks;
    int alloc_marks;

    /* Per-character lookup table; it is smaller in the UTF-16 build. */
#if defined MD4C_USE_UTF16
    char mark_char_map[128];
#else
    char mark_char_map[256];
#endif

    /* For resolving of inline spans.
     * Each macro below names one slot of mark_chains[]; the slots with
     * indexes OPENERS_CHAIN_FIRST..OPENERS_CHAIN_LAST are the *_OPENERS ones. */
    MD_MARKCHAIN mark_chains[13];
#define PTR_CHAIN (ctx->mark_chains[0])
#define TABLECELLBOUNDARIES (ctx->mark_chains[1])
#define ASTERISK_OPENERS_extraword_mod3_0 (ctx->mark_chains[2])
#define ASTERISK_OPENERS_extraword_mod3_1 (ctx->mark_chains[3])
#define ASTERISK_OPENERS_extraword_mod3_2 (ctx->mark_chains[4])
#define ASTERISK_OPENERS_intraword_mod3_0 (ctx->mark_chains[5])
#define ASTERISK_OPENERS_intraword_mod3_1 (ctx->mark_chains[6])
#define ASTERISK_OPENERS_intraword_mod3_2 (ctx->mark_chains[7])
#define UNDERSCORE_OPENERS (ctx->mark_chains[8])
#define TILDE_OPENERS_1 (ctx->mark_chains[9])
#define TILDE_OPENERS_2 (ctx->mark_chains[10])
#define BRACKET_OPENERS (ctx->mark_chains[11])
#define DOLLAR_OPENERS (ctx->mark_chains[12])
#define OPENERS_CHAIN_FIRST 2
#define OPENERS_CHAIN_LAST 12

    int n_table_cell_boundaries;

    /* For resolving links. */
    int unresolved_link_head;
    int unresolved_link_tail;

    /* For resolving raw HTML. */
    OFF html_comment_horizon;
    OFF html_proc_instr_horizon;
    OFF html_decl_horizon;
    OFF html_cdata_horizon;

    /* For block analysis.
     * Notes:
     * -- It holds MD_BLOCK as well as MD_LINE structures. After each
     * MD_BLOCK, its (multiple) MD_LINE(s) follow.
     * -- For MD_BLOCK_HTML and MD_BLOCK_CODE, MD_VERBATIMLINE(s) are used
     * instead of MD_LINE(s).
     */
    void* block_bytes;
    MD_BLOCK* current_block;
    int n_block_bytes;
    int alloc_block_bytes;

    /* For container block analysis. */
    MD_CONTAINER* containers;
    int n_containers;
    int alloc_containers;

    /* Minimal indentation to call the block "indented code block". */
    unsigned code_indent_offset;

    /* Contextual info for line analysis. */
    SZ code_fence_length; /* For checking closing fence length. */
    int html_block_type; /* For checking closing raw HTML condition. */
    int last_line_has_list_loosening_effect;
    int last_list_item_starts_with_two_blank_lines;
};
236
/* Kinds of lines distinguished during line analysis. */
typedef enum MD_LINETYPE_tag {
    MD_LINE_BLANK = 0,
    MD_LINE_HR,
    MD_LINE_ATXHEADER,
    MD_LINE_SETEXTHEADER,
    MD_LINE_SETEXTUNDERLINE,
    MD_LINE_INDENTEDCODE,
    MD_LINE_FENCEDCODE,
    MD_LINE_HTML,
    MD_LINE_TEXT,
    MD_LINE_TABLE,
    MD_LINE_TABLEUNDERLINE
} MD_LINETYPE;
251
252typedef struct MD_LINE_ANALYSIS_tag MD_LINE_ANALYSIS;
253struct MD_LINE_ANALYSIS_tag {
254 MD_LINETYPE type : 16;
255 unsigned data : 16;
256 OFF beg;
257 OFF end;
258 unsigned indent; /* Indentation level. */
259};
260
261typedef struct MD_LINE_tag MD_LINE;
262struct MD_LINE_tag {
263 OFF beg;
264 OFF end;
265};
266
267typedef struct MD_VERBATIMLINE_tag MD_VERBATIMLINE;
268struct MD_VERBATIMLINE_tag {
269 OFF beg;
270 OFF end;
271 OFF indent;
272};
273
274
275/*****************
276 *** Helpers ***
277 *****************/
278
/* Character accessors. Both expect a variable named "ctx" in scope and take
 * an offset into ctx->text. */
#define STR(off)                    (ctx->text + (off))
#define CH(off)                     (ctx->text[(off)])

/* Character classification of a character value (trailing underscore).
 * Note we assume ASCII compatibility of code points < 128 here. */
#define ISIN_(c, lo, hi)            ((lo) <= (unsigned)(c)  &&  (unsigned)(c) <= (hi))
#define ISANYOF_(c, palette)        ((c) != _T('\0')  &&  md_strchr((palette), (c)) != NULL)
#define ISANYOF2_(c, c1, c2)        ((c) == (c1) || (c) == (c2))
#define ISANYOF3_(c, c1, c2, c3)    ((c) == (c1) || (c) == (c2) || (c) == (c3))
#define ISASCII_(c)                 ((unsigned)(c) < 128)
#define ISBLANK_(c)                 (ISANYOF2_((c), _T(' '), _T('\t')))
#define ISNEWLINE_(c)               (ISANYOF2_((c), _T('\r'), _T('\n')))
#define ISWHITESPACE_(c)            (ISBLANK_(c) || ISANYOF2_((c), _T('\v'), _T('\f')))
#define ISCNTRL_(c)                 ((unsigned)(c) < 32 || (unsigned)(c) == 127)
#define ISPUNCT_(c)                 (ISIN_(c, 33, 47) || ISIN_(c, 58, 64) || ISIN_(c, 91, 96) || ISIN_(c, 123, 126))
#define ISUPPER_(c)                 (ISIN_(c, _T('A'), _T('Z')))
#define ISLOWER_(c)                 (ISIN_(c, _T('a'), _T('z')))
#define ISALPHA_(c)                 (ISUPPER_(c) || ISLOWER_(c))
#define ISDIGIT_(c)                 (ISIN_(c, _T('0'), _T('9')))
#define ISXDIGIT_(c)                (ISDIGIT_(c) || ISIN_(c, _T('A'), _T('F')) || ISIN_(c, _T('a'), _T('f')))
#define ISALNUM_(c)                 (ISALPHA_(c) || ISDIGIT_(c))

/* The same tests applied to the character at the given offset of ctx->text. */
#define ISANYOF(off, palette)       ISANYOF_(CH(off), (palette))
#define ISANYOF2(off, c1, c2)       ISANYOF2_(CH(off), (c1), (c2))
#define ISANYOF3(off, c1, c2, c3)   ISANYOF3_(CH(off), (c1), (c2), (c3))
#define ISASCII(off)                ISASCII_(CH(off))
#define ISBLANK(off)                ISBLANK_(CH(off))
#define ISNEWLINE(off)              ISNEWLINE_(CH(off))
#define ISWHITESPACE(off)           ISWHITESPACE_(CH(off))
#define ISCNTRL(off)                ISCNTRL_(CH(off))
#define ISPUNCT(off)                ISPUNCT_(CH(off))
#define ISUPPER(off)                ISUPPER_(CH(off))
#define ISLOWER(off)                ISLOWER_(CH(off))
#define ISALPHA(off)                ISALPHA_(CH(off))
#define ISDIGIT(off)                ISDIGIT_(CH(off))
#define ISXDIGIT(off)               ISXDIGIT_(CH(off))
#define ISALNUM(off)                ISALNUM_(CH(off))


/* Character search matching the width of CHAR in the selected build. */
#ifdef MD4C_USE_UTF16
    #define md_strchr wcschr
#else
    #define md_strchr strchr
#endif
324
325
326/* Case insensitive check of string equality. */
327static inline int
328md_ascii_case_eq(const CHAR* s1, const CHAR* s2, SZ n)
329{
330 OFF i;
331 for(i = 0; i < n; i++) {
332 CHAR ch1 = s1[i];
333 CHAR ch2 = s2[i];
334
335 if(ISLOWER_(ch1))
336 ch1 += ('A'-'a');
337 if(ISLOWER_(ch2))
338 ch2 += ('A'-'a');
339 if(ch1 != ch2)
340 return FALSE;
341 }
342 return TRUE;
343}
344
345static inline int
346md_ascii_eq(const CHAR* s1, const CHAR* s2, SZ n)
347{
348 return memcmp(s1: s1, s2: s2, n: n * sizeof(CHAR)) == 0;
349}
350
351static int
352md_text_with_null_replacement(MD_CTX* ctx, MD_TEXTTYPE type, const CHAR* str, SZ size)
353{
354 OFF off = 0;
355 int ret = 0;
356
357 while(1) {
358 while(off < size && str[off] != _T('\0'))
359 off++;
360
361 if(off > 0) {
362 ret = ctx->parser.text(type, str, off, ctx->userdata);
363 if(ret != 0)
364 return ret;
365
366 str += off;
367 size -= off;
368 off = 0;
369 }
370
371 if(off >= size)
372 return 0;
373
374 ret = ctx->parser.text(MD_TEXT_NULLCHAR, _T(""), 1, ctx->userdata);
375 if(ret != 0)
376 return ret;
377 off++;
378 }
379}
380
381
/* Evaluate an expression into the local variable "ret" and jump to the local
 * label "abort" when the result is negative. */
#define MD_CHECK(func)                                                      \
    do {                                                                    \
        ret = (func);                                                       \
        if(ret < 0)                                                         \
            goto abort;                                                     \
    } while(0)


/* Make sure ctx->buffer is at least (sz) large, growing it when needed.
 * The new size is one and a half times the request plus some slack, rounded
 * to a multiple of 128. On allocation failure the old buffer is kept, "ret"
 * is set to -1 and control jumps to the local label "abort".
 * Expects "ctx" and "ret" in scope. */
#define MD_TEMP_BUFFER(sz)                                                  \
    do {                                                                    \
        if((sz) > ctx->alloc_buffer) {                                      \
            CHAR* new_buffer;                                               \
            SZ new_size = ((sz) + (sz) / 2 + 128) & ~127;                   \
                                                                            \
            new_buffer = realloc(ctx->buffer, new_size);                    \
            if(new_buffer == NULL) {                                        \
                MD_LOG("realloc() failed.");                                \
                ret = -1;                                                   \
                goto abort;                                                 \
            }                                                               \
                                                                            \
            ctx->buffer = new_buffer;                                       \
            ctx->alloc_buffer = new_size;                                   \
        }                                                                   \
    } while(0)
407
408
/* Wrappers around the block/span callbacks of the parser. Each stores the
 * callback's return value in the local variable "ret"; when it is non-zero,
 * the abort is logged and control jumps to the local label "abort".
 * All of them expect "ctx" and "ret" in scope. */
#define MD_ENTER_BLOCK(type, arg)                                               \
    do {                                                                        \
        if((ret = ctx->parser.enter_block((type), (arg), ctx->userdata)) != 0) { \
            MD_LOG("Aborted from enter_block() callback.");                     \
            goto abort;                                                         \
        }                                                                       \
    } while(0)

#define MD_LEAVE_BLOCK(type, arg)                                               \
    do {                                                                        \
        if((ret = ctx->parser.leave_block((type), (arg), ctx->userdata)) != 0) { \
            MD_LOG("Aborted from leave_block() callback.");                     \
            goto abort;                                                         \
        }                                                                       \
    } while(0)

#define MD_ENTER_SPAN(type, arg)                                                \
    do {                                                                        \
        if((ret = ctx->parser.enter_span((type), (arg), ctx->userdata)) != 0) { \
            MD_LOG("Aborted from enter_span() callback.");                      \
            goto abort;                                                         \
        }                                                                       \
    } while(0)

#define MD_LEAVE_SPAN(type, arg)                                                \
    do {                                                                        \
        if((ret = ctx->parser.leave_span((type), (arg), ctx->userdata)) != 0) { \
            MD_LOG("Aborted from leave_span() callback.");                      \
            goto abort;                                                         \
        }                                                                       \
    } while(0)
444
/* Report text through the text() callback, skipping empty text. A non-zero
 * callback result is stored in "ret", logged, and control jumps to the local
 * label "abort". Expects "ctx" and "ret" in scope. */
#define MD_TEXT(type, str, size)                                            \
    do {                                                                    \
        if((size) > 0) {                                                    \
            ret = ctx->parser.text((type), (str), (size), ctx->userdata);   \
            if(ret != 0) {                                                  \
                MD_LOG("Aborted from text() callback.");                    \
                goto abort;                                                 \
            }                                                               \
        }                                                                   \
    } while(0)

/* Same as MD_TEXT(), but for text which may contain NUL characters: it goes
 * through md_text_with_null_replacement() instead of straight to text(). */
#define MD_TEXT_INSECURE(type, str, size)                                   \
    do {                                                                    \
        if((size) > 0) {                                                    \
            ret = md_text_with_null_replacement(ctx, (type), (str), (size)); \
            if(ret != 0) {                                                  \
                MD_LOG("Aborted from text() callback.");                    \
                goto abort;                                                 \
            }                                                               \
        }                                                                   \
    } while(0)
466
467
468
469/*************************
470 *** Unicode Support ***
471 *************************/
472
/* Result of folding one codepoint: up to three codepoints, of which the
 * first n_codepoints are valid. */
typedef struct MD_UNICODE_FOLD_INFO_tag {
    unsigned codepoints[3];
    unsigned n_codepoints;
} MD_UNICODE_FOLD_INFO;
478
479
480#if defined MD4C_USE_UTF16 || defined MD4C_USE_UTF8
/* Binary search in a sorted "map" of codepoints.
 *
 * A run of consecutive codepoints may be stored in the map as two adjacent
 * records, (MIN_CODEPOINT | 0x40000000) followed by
 * (MAX_CODEPOINT | 0x80000000), instead of listing each value.
 *
 * Returns the index of the matching record (for a range, the index of its
 * MIN record), or -1 when the codepoint is not covered by the map. */
static int
md_unicode_bsearch__(unsigned codepoint, const unsigned* map, size_t map_size)
{
    int lo = 0;
    int hi = (int) map_size - 1;

    while(lo <= hi) {
        int mid = (lo + hi) / 2;
        int first = mid;
        int last = mid;

        /* The middle record may be either end of a range; widen the pivot
         * so that it covers the whole range then. */
        if(map[mid] & 0x40000000)
            last = mid + 1;
        if(map[mid] & 0x80000000)
            first = mid - 1;

        if(codepoint < (map[first] & 0x00ffffff)) {
            hi = first - 1;
            continue;
        }
        if(codepoint > (map[last] & 0x00ffffff)) {
            lo = last + 1;
            continue;
        }

        return first;
    }

    return -1;
}
513
/* Check whether the codepoint is a whitespace.
 *
 * Codepoints up to 0x7f are classified with ISWHITESPACE_(); anything above
 * is looked up in the map of the Unicode "Zs" category.
 * Returns non-zero for whitespace, zero otherwise. */
static int
md_is_unicode_whitespace__(unsigned codepoint)
{
#define R(cp_min, cp_max)   ((cp_min) | 0x40000000), ((cp_max) | 0x80000000)
#define S(cp)               (cp)
    /* Unicode "Zs" category.
     * (generated by scripts/build_whitespace_map.py) */
    static const unsigned WHITESPACE_MAP[] = {
        S(0x0020), S(0x00a0), S(0x1680), R(0x2000,0x200a), S(0x202f), S(0x205f), S(0x3000)
    };
#undef R
#undef S

    /* The ASCII ones are the most frequently used ones, also CommonMark
     * specification requests few more in this range. */
    if(codepoint <= 0x7f)
        return ISWHITESPACE_(codepoint);

    return (md_unicode_bsearch__(codepoint, WHITESPACE_MAP, SIZEOF_ARRAY(WHITESPACE_MAP)) >= 0);
}
534
/* Check whether the codepoint is a punctuation character.
 *
 * Codepoints up to 0x7f are classified with ISPUNCT_(); anything above is
 * looked up in the map of the Unicode punctuation categories.
 * Returns non-zero for punctuation, zero otherwise. */
static int
md_is_unicode_punct__(unsigned codepoint)
{
#define R(cp_min, cp_max)   ((cp_min) | 0x40000000), ((cp_max) | 0x80000000)
#define S(cp)               (cp)
    /* Unicode "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps" categories.
     * (generated by scripts/build_punct_map.py) */
    static const unsigned PUNCT_MAP[] = {
        R(0x0021,0x0023), R(0x0025,0x002a), R(0x002c,0x002f), R(0x003a,0x003b), R(0x003f,0x0040),
        R(0x005b,0x005d), S(0x005f), S(0x007b), S(0x007d), S(0x00a1), S(0x00a7), S(0x00ab), R(0x00b6,0x00b7),
        S(0x00bb), S(0x00bf), S(0x037e), S(0x0387), R(0x055a,0x055f), R(0x0589,0x058a), S(0x05be), S(0x05c0),
        S(0x05c3), S(0x05c6), R(0x05f3,0x05f4), R(0x0609,0x060a), R(0x060c,0x060d), S(0x061b), R(0x061e,0x061f),
        R(0x066a,0x066d), S(0x06d4), R(0x0700,0x070d), R(0x07f7,0x07f9), R(0x0830,0x083e), S(0x085e),
        R(0x0964,0x0965), S(0x0970), S(0x09fd), S(0x0a76), S(0x0af0), S(0x0c77), S(0x0c84), S(0x0df4), S(0x0e4f),
        R(0x0e5a,0x0e5b), R(0x0f04,0x0f12), S(0x0f14), R(0x0f3a,0x0f3d), S(0x0f85), R(0x0fd0,0x0fd4),
        R(0x0fd9,0x0fda), R(0x104a,0x104f), S(0x10fb), R(0x1360,0x1368), S(0x1400), S(0x166e), R(0x169b,0x169c),
        R(0x16eb,0x16ed), R(0x1735,0x1736), R(0x17d4,0x17d6), R(0x17d8,0x17da), R(0x1800,0x180a),
        R(0x1944,0x1945), R(0x1a1e,0x1a1f), R(0x1aa0,0x1aa6), R(0x1aa8,0x1aad), R(0x1b5a,0x1b60),
        R(0x1bfc,0x1bff), R(0x1c3b,0x1c3f), R(0x1c7e,0x1c7f), R(0x1cc0,0x1cc7), S(0x1cd3), R(0x2010,0x2027),
        R(0x2030,0x2043), R(0x2045,0x2051), R(0x2053,0x205e), R(0x207d,0x207e), R(0x208d,0x208e),
        R(0x2308,0x230b), R(0x2329,0x232a), R(0x2768,0x2775), R(0x27c5,0x27c6), R(0x27e6,0x27ef),
        R(0x2983,0x2998), R(0x29d8,0x29db), R(0x29fc,0x29fd), R(0x2cf9,0x2cfc), R(0x2cfe,0x2cff), S(0x2d70),
        R(0x2e00,0x2e2e), R(0x2e30,0x2e4f), S(0x2e52), R(0x3001,0x3003), R(0x3008,0x3011), R(0x3014,0x301f),
        S(0x3030), S(0x303d), S(0x30a0), S(0x30fb), R(0xa4fe,0xa4ff), R(0xa60d,0xa60f), S(0xa673), S(0xa67e),
        R(0xa6f2,0xa6f7), R(0xa874,0xa877), R(0xa8ce,0xa8cf), R(0xa8f8,0xa8fa), S(0xa8fc), R(0xa92e,0xa92f),
        S(0xa95f), R(0xa9c1,0xa9cd), R(0xa9de,0xa9df), R(0xaa5c,0xaa5f), R(0xaade,0xaadf), R(0xaaf0,0xaaf1),
        S(0xabeb), R(0xfd3e,0xfd3f), R(0xfe10,0xfe19), R(0xfe30,0xfe52), R(0xfe54,0xfe61), S(0xfe63), S(0xfe68),
        R(0xfe6a,0xfe6b), R(0xff01,0xff03), R(0xff05,0xff0a), R(0xff0c,0xff0f), R(0xff1a,0xff1b),
        R(0xff1f,0xff20), R(0xff3b,0xff3d), S(0xff3f), S(0xff5b), S(0xff5d), R(0xff5f,0xff65), R(0x10100,0x10102),
        S(0x1039f), S(0x103d0), S(0x1056f), S(0x10857), S(0x1091f), S(0x1093f), R(0x10a50,0x10a58), S(0x10a7f),
        R(0x10af0,0x10af6), R(0x10b39,0x10b3f), R(0x10b99,0x10b9c), S(0x10ead), R(0x10f55,0x10f59),
        R(0x11047,0x1104d), R(0x110bb,0x110bc), R(0x110be,0x110c1), R(0x11140,0x11143), R(0x11174,0x11175),
        R(0x111c5,0x111c8), S(0x111cd), S(0x111db), R(0x111dd,0x111df), R(0x11238,0x1123d), S(0x112a9),
        R(0x1144b,0x1144f), R(0x1145a,0x1145b), S(0x1145d), S(0x114c6), R(0x115c1,0x115d7), R(0x11641,0x11643),
        R(0x11660,0x1166c), R(0x1173c,0x1173e), S(0x1183b), R(0x11944,0x11946), S(0x119e2), R(0x11a3f,0x11a46),
        R(0x11a9a,0x11a9c), R(0x11a9e,0x11aa2), R(0x11c41,0x11c45), R(0x11c70,0x11c71), R(0x11ef7,0x11ef8),
        S(0x11fff), R(0x12470,0x12474), R(0x16a6e,0x16a6f), S(0x16af5), R(0x16b37,0x16b3b), S(0x16b44),
        R(0x16e97,0x16e9a), S(0x16fe2), S(0x1bc9f), R(0x1da87,0x1da8b), R(0x1e95e,0x1e95f)
    };
#undef R
#undef S

    /* The ASCII ones are the most frequently used ones, also CommonMark
     * specification requests few more in this range. */
    if(codepoint <= 0x7f)
        return ISPUNCT_(codepoint);

    return (md_unicode_bsearch__(codepoint, PUNCT_MAP, SIZEOF_ARRAY(PUNCT_MAP)) >= 0);
}
584
585 static void
586 md_get_unicode_fold_info(unsigned codepoint, MD_UNICODE_FOLD_INFO* info)
587 {
588#define R(cp_min, cp_max) ((cp_min) | 0x40000000), ((cp_max) | 0x80000000)
589#define S(cp) (cp)
590 /* Unicode "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps" categories.
591 * (generated by scripts/build_folding_map.py) */
592 static const unsigned FOLD_MAP_1[] = {
593 R(0x0041,0x005a), S(0x00b5), R(0x00c0,0x00d6), R(0x00d8,0x00de), R(0x0100,0x012e), R(0x0132,0x0136),
594 R(0x0139,0x0147), R(0x014a,0x0176), S(0x0178), R(0x0179,0x017d), S(0x017f), S(0x0181), S(0x0182),
595 S(0x0184), S(0x0186), S(0x0187), S(0x0189), S(0x018a), S(0x018b), S(0x018e), S(0x018f), S(0x0190),
596 S(0x0191), S(0x0193), S(0x0194), S(0x0196), S(0x0197), S(0x0198), S(0x019c), S(0x019d), S(0x019f),
597 R(0x01a0,0x01a4), S(0x01a6), S(0x01a7), S(0x01a9), S(0x01ac), S(0x01ae), S(0x01af), S(0x01b1), S(0x01b2),
598 S(0x01b3), S(0x01b5), S(0x01b7), S(0x01b8), S(0x01bc), S(0x01c4), S(0x01c5), S(0x01c7), S(0x01c8),
599 S(0x01ca), R(0x01cb,0x01db), R(0x01de,0x01ee), S(0x01f1), S(0x01f2), S(0x01f4), S(0x01f6), S(0x01f7),
600 R(0x01f8,0x021e), S(0x0220), R(0x0222,0x0232), S(0x023a), S(0x023b), S(0x023d), S(0x023e), S(0x0241),
601 S(0x0243), S(0x0244), S(0x0245), R(0x0246,0x024e), S(0x0345), S(0x0370), S(0x0372), S(0x0376), S(0x037f),
602 S(0x0386), R(0x0388,0x038a), S(0x038c), S(0x038e), S(0x038f), R(0x0391,0x03a1), R(0x03a3,0x03ab),
603 S(0x03c2), S(0x03cf), S(0x03d0), S(0x03d1), S(0x03d5), S(0x03d6), R(0x03d8,0x03ee), S(0x03f0), S(0x03f1),
604 S(0x03f4), S(0x03f5), S(0x03f7), S(0x03f9), S(0x03fa), R(0x03fd,0x03ff), R(0x0400,0x040f),
605 R(0x0410,0x042f), R(0x0460,0x0480), R(0x048a,0x04be), S(0x04c0), R(0x04c1,0x04cd), R(0x04d0,0x052e),
606 R(0x0531,0x0556), R(0x10a0,0x10c5), S(0x10c7), S(0x10cd), R(0x13f8,0x13fd), S(0x1c80), S(0x1c81),
607 S(0x1c82), S(0x1c83), S(0x1c84), S(0x1c85), S(0x1c86), S(0x1c87), S(0x1c88), R(0x1c90,0x1cba),
608 R(0x1cbd,0x1cbf), R(0x1e00,0x1e94), S(0x1e9b), R(0x1ea0,0x1efe), R(0x1f08,0x1f0f), R(0x1f18,0x1f1d),
609 R(0x1f28,0x1f2f), R(0x1f38,0x1f3f), R(0x1f48,0x1f4d), S(0x1f59), S(0x1f5b), S(0x1f5d), S(0x1f5f),
610 R(0x1f68,0x1f6f), S(0x1fb8), S(0x1fb9), S(0x1fba), S(0x1fbb), S(0x1fbe), R(0x1fc8,0x1fcb), S(0x1fd8),
611 S(0x1fd9), S(0x1fda), S(0x1fdb), S(0x1fe8), S(0x1fe9), S(0x1fea), S(0x1feb), S(0x1fec), S(0x1ff8),
612 S(0x1ff9), S(0x1ffa), S(0x1ffb), S(0x2126), S(0x212a), S(0x212b), S(0x2132), R(0x2160,0x216f), S(0x2183),
613 R(0x24b6,0x24cf), R(0x2c00,0x2c2e), S(0x2c60), S(0x2c62), S(0x2c63), S(0x2c64), R(0x2c67,0x2c6b),
614 S(0x2c6d), S(0x2c6e), S(0x2c6f), S(0x2c70), S(0x2c72), S(0x2c75), S(0x2c7e), S(0x2c7f), R(0x2c80,0x2ce2),
615 S(0x2ceb), S(0x2ced), S(0x2cf2), R(0xa640,0xa66c), R(0xa680,0xa69a), R(0xa722,0xa72e), R(0xa732,0xa76e),
616 S(0xa779), S(0xa77b), S(0xa77d), R(0xa77e,0xa786), S(0xa78b), S(0xa78d), S(0xa790), S(0xa792),
617 R(0xa796,0xa7a8), S(0xa7aa), S(0xa7ab), S(0xa7ac), S(0xa7ad), S(0xa7ae), S(0xa7b0), S(0xa7b1), S(0xa7b2),
618 S(0xa7b3), R(0xa7b4,0xa7be), S(0xa7c2), S(0xa7c4), S(0xa7c5), S(0xa7c6), S(0xa7c7), S(0xa7c9), S(0xa7f5),
619 R(0xab70,0xabbf), R(0xff21,0xff3a), R(0x10400,0x10427), R(0x104b0,0x104d3), R(0x10c80,0x10cb2),
620 R(0x118a0,0x118bf), R(0x16e40,0x16e5f), R(0x1e900,0x1e921)
621 };
622 static const unsigned FOLD_MAP_1_DATA[] = {
623 0x0061, 0x007a, 0x03bc, 0x00e0, 0x00f6, 0x00f8, 0x00fe, 0x0101, 0x012f, 0x0133, 0x0137, 0x013a, 0x0148,
624 0x014b, 0x0177, 0x00ff, 0x017a, 0x017e, 0x0073, 0x0253, 0x0183, 0x0185, 0x0254, 0x0188, 0x0256, 0x0257,
625 0x018c, 0x01dd, 0x0259, 0x025b, 0x0192, 0x0260, 0x0263, 0x0269, 0x0268, 0x0199, 0x026f, 0x0272, 0x0275,
626 0x01a1, 0x01a5, 0x0280, 0x01a8, 0x0283, 0x01ad, 0x0288, 0x01b0, 0x028a, 0x028b, 0x01b4, 0x01b6, 0x0292,
627 0x01b9, 0x01bd, 0x01c6, 0x01c6, 0x01c9, 0x01c9, 0x01cc, 0x01cc, 0x01dc, 0x01df, 0x01ef, 0x01f3, 0x01f3,
628 0x01f5, 0x0195, 0x01bf, 0x01f9, 0x021f, 0x019e, 0x0223, 0x0233, 0x2c65, 0x023c, 0x019a, 0x2c66, 0x0242,
629 0x0180, 0x0289, 0x028c, 0x0247, 0x024f, 0x03b9, 0x0371, 0x0373, 0x0377, 0x03f3, 0x03ac, 0x03ad, 0x03af,
630 0x03cc, 0x03cd, 0x03ce, 0x03b1, 0x03c1, 0x03c3, 0x03cb, 0x03c3, 0x03d7, 0x03b2, 0x03b8, 0x03c6, 0x03c0,
631 0x03d9, 0x03ef, 0x03ba, 0x03c1, 0x03b8, 0x03b5, 0x03f8, 0x03f2, 0x03fb, 0x037b, 0x037d, 0x0450, 0x045f,
632 0x0430, 0x044f, 0x0461, 0x0481, 0x048b, 0x04bf, 0x04cf, 0x04c2, 0x04ce, 0x04d1, 0x052f, 0x0561, 0x0586,
633 0x2d00, 0x2d25, 0x2d27, 0x2d2d, 0x13f0, 0x13f5, 0x0432, 0x0434, 0x043e, 0x0441, 0x0442, 0x0442, 0x044a,
634 0x0463, 0xa64b, 0x10d0, 0x10fa, 0x10fd, 0x10ff, 0x1e01, 0x1e95, 0x1e61, 0x1ea1, 0x1eff, 0x1f00, 0x1f07,
635 0x1f10, 0x1f15, 0x1f20, 0x1f27, 0x1f30, 0x1f37, 0x1f40, 0x1f45, 0x1f51, 0x1f53, 0x1f55, 0x1f57, 0x1f60,
636 0x1f67, 0x1fb0, 0x1fb1, 0x1f70, 0x1f71, 0x03b9, 0x1f72, 0x1f75, 0x1fd0, 0x1fd1, 0x1f76, 0x1f77, 0x1fe0,
637 0x1fe1, 0x1f7a, 0x1f7b, 0x1fe5, 0x1f78, 0x1f79, 0x1f7c, 0x1f7d, 0x03c9, 0x006b, 0x00e5, 0x214e, 0x2170,
638 0x217f, 0x2184, 0x24d0, 0x24e9, 0x2c30, 0x2c5e, 0x2c61, 0x026b, 0x1d7d, 0x027d, 0x2c68, 0x2c6c, 0x0251,
639 0x0271, 0x0250, 0x0252, 0x2c73, 0x2c76, 0x023f, 0x0240, 0x2c81, 0x2ce3, 0x2cec, 0x2cee, 0x2cf3, 0xa641,
640 0xa66d, 0xa681, 0xa69b, 0xa723, 0xa72f, 0xa733, 0xa76f, 0xa77a, 0xa77c, 0x1d79, 0xa77f, 0xa787, 0xa78c,
641 0x0265, 0xa791, 0xa793, 0xa797, 0xa7a9, 0x0266, 0x025c, 0x0261, 0x026c, 0x026a, 0x029e, 0x0287, 0x029d,
642 0xab53, 0xa7b5, 0xa7bf, 0xa7c3, 0xa794, 0x0282, 0x1d8e, 0xa7c8, 0xa7ca, 0xa7f6, 0x13a0, 0x13ef, 0xff41,
643 0xff5a, 0x10428, 0x1044f, 0x104d8, 0x104fb, 0x10cc0, 0x10cf2, 0x118c0, 0x118df, 0x16e60, 0x16e7f, 0x1e922,
644 0x1e943
645 };
646 static const unsigned FOLD_MAP_2[] = {
647 S(0x00df), S(0x0130), S(0x0149), S(0x01f0), S(0x0587), S(0x1e96), S(0x1e97), S(0x1e98), S(0x1e99),
648 S(0x1e9a), S(0x1e9e), S(0x1f50), R(0x1f80,0x1f87), R(0x1f88,0x1f8f), R(0x1f90,0x1f97), R(0x1f98,0x1f9f),
649 R(0x1fa0,0x1fa7), R(0x1fa8,0x1faf), S(0x1fb2), S(0x1fb3), S(0x1fb4), S(0x1fb6), S(0x1fbc), S(0x1fc2),
650 S(0x1fc3), S(0x1fc4), S(0x1fc6), S(0x1fcc), S(0x1fd6), S(0x1fe4), S(0x1fe6), S(0x1ff2), S(0x1ff3),
651 S(0x1ff4), S(0x1ff6), S(0x1ffc), S(0xfb00), S(0xfb01), S(0xfb02), S(0xfb05), S(0xfb06), S(0xfb13),
652 S(0xfb14), S(0xfb15), S(0xfb16), S(0xfb17)
653 };
654 static const unsigned FOLD_MAP_2_DATA[] = {
655 0x0073,0x0073, 0x0069,0x0307, 0x02bc,0x006e, 0x006a,0x030c, 0x0565,0x0582, 0x0068,0x0331, 0x0074,0x0308,
656 0x0077,0x030a, 0x0079,0x030a, 0x0061,0x02be, 0x0073,0x0073, 0x03c5,0x0313, 0x1f00,0x03b9, 0x1f07,0x03b9,
657 0x1f00,0x03b9, 0x1f07,0x03b9, 0x1f20,0x03b9, 0x1f27,0x03b9, 0x1f20,0x03b9, 0x1f27,0x03b9, 0x1f60,0x03b9,
658 0x1f67,0x03b9, 0x1f60,0x03b9, 0x1f67,0x03b9, 0x1f70,0x03b9, 0x03b1,0x03b9, 0x03ac,0x03b9, 0x03b1,0x0342,
659 0x03b1,0x03b9, 0x1f74,0x03b9, 0x03b7,0x03b9, 0x03ae,0x03b9, 0x03b7,0x0342, 0x03b7,0x03b9, 0x03b9,0x0342,
660 0x03c1,0x0313, 0x03c5,0x0342, 0x1f7c,0x03b9, 0x03c9,0x03b9, 0x03ce,0x03b9, 0x03c9,0x0342, 0x03c9,0x03b9,
661 0x0066,0x0066, 0x0066,0x0069, 0x0066,0x006c, 0x0073,0x0074, 0x0073,0x0074, 0x0574,0x0576, 0x0574,0x0565,
662 0x0574,0x056b, 0x057e,0x0576, 0x0574,0x056d
663 };
664 static const unsigned FOLD_MAP_3[] = {
665 S(0x0390), S(0x03b0), S(0x1f52), S(0x1f54), S(0x1f56), S(0x1fb7), S(0x1fc7), S(0x1fd2), S(0x1fd3),
666 S(0x1fd7), S(0x1fe2), S(0x1fe3), S(0x1fe7), S(0x1ff7), S(0xfb03), S(0xfb04)
667 };
668 static const unsigned FOLD_MAP_3_DATA[] = {
669 0x03b9,0x0308,0x0301, 0x03c5,0x0308,0x0301, 0x03c5,0x0313,0x0300, 0x03c5,0x0313,0x0301,
670 0x03c5,0x0313,0x0342, 0x03b1,0x0342,0x03b9, 0x03b7,0x0342,0x03b9, 0x03b9,0x0308,0x0300,
671 0x03b9,0x0308,0x0301, 0x03b9,0x0308,0x0342, 0x03c5,0x0308,0x0300, 0x03c5,0x0308,0x0301,
672 0x03c5,0x0308,0x0342, 0x03c9,0x0342,0x03b9, 0x0066,0x0066,0x0069, 0x0066,0x0066,0x006c
673 };
674#undef R
675#undef S
676 static const struct {
677 const unsigned* map;
678 const unsigned* data;
679 size_t map_size;
680 unsigned n_codepoints;
681 } FOLD_MAP_LIST[] = {
682 { FOLD_MAP_1, FOLD_MAP_1_DATA, SIZEOF_ARRAY(FOLD_MAP_1), 1 },
683 { FOLD_MAP_2, FOLD_MAP_2_DATA, SIZEOF_ARRAY(FOLD_MAP_2), 2 },
684 { FOLD_MAP_3, FOLD_MAP_3_DATA, SIZEOF_ARRAY(FOLD_MAP_3), 3 }
685 };
686
687 int i;
688
689 /* Fast path for ASCII characters. */
690 if(codepoint <= 0x7f) {
691 info->codepoints[0] = codepoint;
692 if(ISUPPER_(codepoint))
693 info->codepoints[0] += 'a' - 'A';
694 info->n_codepoints = 1;
695 return;
696 }
697
698 /* Try to locate the codepoint in any of the maps. */
699 for(i = 0; i < (int) SIZEOF_ARRAY(FOLD_MAP_LIST); i++) {
700 int index;
701
702 index = md_unicode_bsearch__(codepoint, map: FOLD_MAP_LIST[i].map, map_size: FOLD_MAP_LIST[i].map_size);
703 if(index >= 0) {
704 /* Found the mapping. */
705 unsigned n_codepoints = FOLD_MAP_LIST[i].n_codepoints;
706 const unsigned* map = FOLD_MAP_LIST[i].map;
707 const unsigned* codepoints = FOLD_MAP_LIST[i].data + (index * n_codepoints);
708
709 memcpy(dest: info->codepoints, src: codepoints, n: sizeof(unsigned) * n_codepoints);
710 info->n_codepoints = n_codepoints;
711
712 if(FOLD_MAP_LIST[i].map[index] != codepoint) {
713 /* The found mapping maps whole range of codepoints,
714 * i.e. we have to offset info->codepoints[0] accordingly. */
715 if((map[index] & 0x00ffffff)+1 == codepoints[0]) {
716 /* Alternating type of the range. */
717 info->codepoints[0] = codepoint + ((codepoint & 0x1) == (map[index] & 0x1) ? 1 : 0);
718 } else {
719 /* Range to range kind of mapping. */
720 info->codepoints[0] += (codepoint - (map[index] & 0x00ffffff));
721 }
722 }
723
724 return;
725 }
726 }
727
728 /* No mapping found. Map the codepoint to itself. */
729 info->codepoints[0] = codepoint;
730 info->n_codepoints = 1;
731 }
732#endif
733
734
735#if defined MD4C_USE_UTF16
/* UTF-16 surrogates are recognized by their six topmost bits: 110110 for the
 * high (leading) one, 110111 for the low (trailing) one. Each surrogate
 * contributes ten bits to a codepoint starting at 0x10000. */
#define IS_UTF16_SURROGATE_HI(word)     ((((WORD)(word) >> 10) & 0x3f) == 0x36)
#define IS_UTF16_SURROGATE_LO(word)     ((((WORD)(word) >> 10) & 0x3f) == 0x37)
#define UTF16_DECODE_SURROGATE(hi, lo)  (((((unsigned)(hi) & 0x3ff) << 10) | ((unsigned)(lo) & 0x3ff)) + 0x10000)
739
740 static unsigned
741 md_decode_utf16le__(const CHAR* str, SZ str_size, SZ* p_size)
742 {
743 if(IS_UTF16_SURROGATE_HI(str[0])) {
744 if(1 < str_size && IS_UTF16_SURROGATE_LO(str[1])) {
745 if(p_size != NULL)
746 *p_size = 2;
747 return UTF16_DECODE_SURROGATE(str[0], str[1]);
748 }
749 }
750
751 if(p_size != NULL)
752 *p_size = 1;
753 return str[0];
754 }
755
756 static unsigned
757 md_decode_utf16le_before__(MD_CTX* ctx, OFF off)
758 {
759 if(off > 2 && IS_UTF16_SURROGATE_HI(CH(off-2)) && IS_UTF16_SURROGATE_LO(CH(off-1)))
760 return UTF16_DECODE_SURROGATE(CH(off-2), CH(off-1));
761
762 return CH(off);
763 }
764
/* Unicode-aware classification for the UTF-16 build. The variants taking an
 * offset expect a variable named "ctx" in scope. */

/* No whitespace uses surrogates, so no decoding needed here. */
#define ISUNICODEWHITESPACE_(codepoint) md_is_unicode_whitespace__(codepoint)
#define ISUNICODEWHITESPACE(off) md_is_unicode_whitespace__(CH(off))
#define ISUNICODEWHITESPACEBEFORE(off) md_is_unicode_whitespace__(CH((off)-1))

/* Punctuation may be encoded with a surrogate pair, hence the character at
 * (or before) the offset is decoded first. */
#define ISUNICODEPUNCT(off) md_is_unicode_punct__(md_decode_utf16le__(STR(off), ctx->size - (off), NULL))
#define ISUNICODEPUNCTBEFORE(off) md_is_unicode_punct__(md_decode_utf16le_before__(ctx, off))
772
773 static inline int
774 md_decode_unicode(const CHAR* str, OFF off, SZ str_size, SZ* p_char_size)
775 {
776 return md_decode_utf16le__(str+off, str_size-off, p_char_size);
777 }
778#elif defined MD4C_USE_UTF8
/* Classification of a UTF-8 byte by its topmost bits:
 *   0xxxxxxx  single byte (lead of a one byte sequence)
 *   110xxxxx  lead of a two byte sequence
 *   1110xxxx  lead of a three byte sequence
 *   11110xxx  lead of a four byte sequence
 *   10xxxxxx  tail (continuation) byte */
#define IS_UTF8_LEAD1(byte)     (((unsigned char)(byte) & 0x80) == 0)
#define IS_UTF8_LEAD2(byte)     (((unsigned char)(byte) >> 5) == 0x06)
#define IS_UTF8_LEAD3(byte)     (((unsigned char)(byte) >> 4) == 0x0e)
#define IS_UTF8_LEAD4(byte)     (((unsigned char)(byte) >> 3) == 0x1e)
#define IS_UTF8_TAIL(byte)      (((unsigned char)(byte) >> 6) == 0x02)
784
785 static unsigned
786 md_decode_utf8__(const CHAR* str, SZ str_size, SZ* p_size)
787 {
788 if(!IS_UTF8_LEAD1(str[0])) {
789 if(IS_UTF8_LEAD2(str[0])) {
790 if(1 < str_size && IS_UTF8_TAIL(str[1])) {
791 if(p_size != NULL)
792 *p_size = 2;
793
794 return (((unsigned int)str[0] & 0x1f) << 6) |
795 (((unsigned int)str[1] & 0x3f) << 0);
796 }
797 } else if(IS_UTF8_LEAD3(str[0])) {
798 if(2 < str_size && IS_UTF8_TAIL(str[1]) && IS_UTF8_TAIL(str[2])) {
799 if(p_size != NULL)
800 *p_size = 3;
801
802 return (((unsigned int)str[0] & 0x0f) << 12) |
803 (((unsigned int)str[1] & 0x3f) << 6) |
804 (((unsigned int)str[2] & 0x3f) << 0);
805 }
806 } else if(IS_UTF8_LEAD4(str[0])) {
807 if(3 < str_size && IS_UTF8_TAIL(str[1]) && IS_UTF8_TAIL(str[2]) && IS_UTF8_TAIL(str[3])) {
808 if(p_size != NULL)
809 *p_size = 4;
810
811 return (((unsigned int)str[0] & 0x07) << 18) |
812 (((unsigned int)str[1] & 0x3f) << 12) |
813 (((unsigned int)str[2] & 0x3f) << 6) |
814 (((unsigned int)str[3] & 0x3f) << 0);
815 }
816 }
817 }
818
819 if(p_size != NULL)
820 *p_size = 1;
821 return (unsigned) str[0];
822 }
823
824 static unsigned
825 md_decode_utf8_before__(MD_CTX* ctx, OFF off)
826 {
827 if(!IS_UTF8_LEAD1(CH(off-1))) {
828 if(off > 1 && IS_UTF8_LEAD2(CH(off-2)) && IS_UTF8_TAIL(CH(off-1)))
829 return (((unsigned int)CH(off-2) & 0x1f) << 6) |
830 (((unsigned int)CH(off-1) & 0x3f) << 0);
831
832 if(off > 2 && IS_UTF8_LEAD3(CH(off-3)) && IS_UTF8_TAIL(CH(off-2)) && IS_UTF8_TAIL(CH(off-1)))
833 return (((unsigned int)CH(off-3) & 0x0f) << 12) |
834 (((unsigned int)CH(off-2) & 0x3f) << 6) |
835 (((unsigned int)CH(off-1) & 0x3f) << 0);
836
837 if(off > 3 && IS_UTF8_LEAD4(CH(off-4)) && IS_UTF8_TAIL(CH(off-3)) && IS_UTF8_TAIL(CH(off-2)) && IS_UTF8_TAIL(CH(off-1)))
838 return (((unsigned int)CH(off-4) & 0x07) << 18) |
839 (((unsigned int)CH(off-3) & 0x3f) << 12) |
840 (((unsigned int)CH(off-2) & 0x3f) << 6) |
841 (((unsigned int)CH(off-1) & 0x3f) << 0);
842 }
843
844 return (unsigned) CH(off-1);
845 }
846
    /* Unicode classification helpers for the UTF-8 build.
     *
     * Variants with a trailing underscore take an already decoded codepoint;
     * the others take an offset into the document, decode the character
     * there (or the one preceding it for the *BEFORE variants) and
     * implicitly use the local variable 'ctx'. */
    #define ISUNICODEWHITESPACE_(codepoint) md_is_unicode_whitespace__(codepoint)
    #define ISUNICODEWHITESPACE(off) md_is_unicode_whitespace__(md_decode_utf8__(STR(off), ctx->size - (off), NULL))
    #define ISUNICODEWHITESPACEBEFORE(off) md_is_unicode_whitespace__(md_decode_utf8_before__(ctx, off))

    #define ISUNICODEPUNCT(off) md_is_unicode_punct__(md_decode_utf8__(STR(off), ctx->size - (off), NULL))
    #define ISUNICODEPUNCTBEFORE(off) md_is_unicode_punct__(md_decode_utf8_before__(ctx, off))
853
854 static inline unsigned
855 md_decode_unicode(const CHAR* str, OFF off, SZ str_size, SZ* p_char_size)
856 {
857 return md_decode_utf8__(str: str+off, str_size: str_size-off, p_size: p_char_size);
858 }
859#else
    /* ASCII-only build: the Unicode-aware predicates simply map to their
     * ASCII counterparts. The *BEFORE variants inspect the character at the
     * offset (off)-1. */
    #define ISUNICODEWHITESPACE_(codepoint) ISWHITESPACE_(codepoint)
    #define ISUNICODEWHITESPACE(off) ISWHITESPACE(off)
    #define ISUNICODEWHITESPACEBEFORE(off) ISWHITESPACE((off)-1)

    #define ISUNICODEPUNCT(off) ISPUNCT(off)
    #define ISUNICODEPUNCTBEFORE(off) ISPUNCT((off)-1)
866
867 static inline void
868 md_get_unicode_fold_info(unsigned codepoint, MD_UNICODE_FOLD_INFO* info)
869 {
870 info->codepoints[0] = codepoint;
871 if(ISUPPER_(codepoint))
872 info->codepoints[0] += 'a' - 'A';
873 info->n_codepoints = 1;
874 }
875
876 static inline unsigned
877 md_decode_unicode(const CHAR* str, OFF off, SZ str_size, SZ* p_size)
878 {
879 *p_size = 1;
880 return (unsigned) str[off];
881 }
882#endif
883
884
885/*************************************
886 *** Helper string manipulations ***
887 *************************************/
888
889/* Fill buffer with copy of the string between 'beg' and 'end' but replace any
890 * line breaks with given replacement character.
891 *
892 * NOTE: Caller is responsible to make sure the buffer is large enough.
 * (Given the output is never longer than the input, (end - beg) is a good
 * estimate of what the caller should allocate.)
895 */
896static void
897md_merge_lines(MD_CTX* ctx, OFF beg, OFF end, const MD_LINE* lines, int n_lines,
898 CHAR line_break_replacement_char, CHAR* buffer, SZ* p_size)
899{
900 CHAR* ptr = buffer;
901 int line_index = 0;
902 OFF off = beg;
903
904 MD_UNUSED(n_lines);
905
906 while(1) {
907 const MD_LINE* line = &lines[line_index];
908 OFF line_end = line->end;
909 if(end < line_end)
910 line_end = end;
911
912 while(off < line_end) {
913 *ptr = CH(off);
914 ptr++;
915 off++;
916 }
917
918 if(off >= end) {
919 *p_size = (MD_SIZE)(ptr - buffer);
920 return;
921 }
922
923 *ptr = line_break_replacement_char;
924 ptr++;
925
926 line_index++;
927 off = lines[line_index].beg;
928 }
929}
930
931/* Wrapper of md_merge_lines() which allocates new buffer for the output string.
932 */
933static int
934md_merge_lines_alloc(MD_CTX* ctx, OFF beg, OFF end, const MD_LINE* lines, int n_lines,
935 CHAR line_break_replacement_char, CHAR** p_str, SZ* p_size)
936{
937 CHAR* buffer;
938
939 buffer = (CHAR*) malloc(size: sizeof(CHAR) * (end - beg));
940 if(buffer == NULL) {
941 MD_LOG("malloc() failed.");
942 return -1;
943 }
944
945 md_merge_lines(ctx, beg, end, lines, n_lines,
946 line_break_replacement_char, buffer, p_size);
947
948 *p_str = buffer;
949 return 0;
950}
951
952static OFF
953md_skip_unicode_whitespace(const CHAR* label, OFF off, SZ size)
954{
955 SZ char_size;
956 unsigned codepoint;
957
958 while(off < size) {
959 codepoint = md_decode_unicode(str: label, off, str_size: size, p_char_size: &char_size);
960 if(!ISUNICODEWHITESPACE_(codepoint) && !ISNEWLINE_(label[off]))
961 break;
962 off += char_size;
963 }
964
965 return off;
966}
967
968
969/******************************
970 *** Recognizing raw HTML ***
971 ******************************/
972
/* md_is_html_tag() may be called when processing inlines (inline raw HTML)
 * or when breaking document to blocks (checking for start of HTML block type 7).
 *
 * When breaking document to blocks, we do not yet know line boundaries, but
 * in that case the whole tag has to live on a single line. We distinguish this
 * by n_lines == 0.
 *
 * The caller guarantees there is '<' at the offset 'beg'. The recognized tag
 * has to end before 'max_end'. On success, TRUE is returned and *p_end is
 * set to the offset just after the closing '>'; otherwise FALSE is returned.
 */
static int
md_is_html_tag(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
{
    int attr_state;
    OFF off = beg;
    /* With no line info (n_lines == 0), the scan is limited only by the end
     * of the document and by the first new line character encountered. */
    OFF line_end = (n_lines > 0) ? lines[0].end : ctx->size;
    int i = 0;      /* Index of the line being scanned. */

    MD_ASSERT(CH(beg) == _T('<'));

    /* There has to be at least one more character after the '<'. */
    if(off + 1 >= line_end)
        return FALSE;
    off++;

    /* For parsing attributes, we need a little state automaton below.
     * State -1: no attributes are allowed.
     * State 0: attribute could follow after some whitespace.
     * State 1: after a whitespace (attribute name may follow).
     * State 2: after attribute name ('=' MAY follow).
     * State 3: after '=' (value specification MUST follow).
     * State 41: in middle of unquoted attribute value.
     * State 42: in middle of single-quoted attribute value.
     * State 43: in middle of double-quoted attribute value.
     */
    attr_state = 0;

    if(CH(off) == _T('/')) {
        /* Closer tag "</ ... >". No attributes may be present. */
        attr_state = -1;
        off++;
    }

    /* Tag name */
    if(off >= line_end || !ISALPHA(off))
        return FALSE;
    off++;
    while(off < line_end && (ISALNUM(off) || CH(off) == _T('-')))
        off++;

    /* (Optional) attributes (if not closer), (optional) '/' (if not closer)
     * and final '>'. */
    while(1) {
        /* Consume the current line. */
        while(off < line_end && !ISNEWLINE(off)) {
            if(attr_state > 40) {
                /* Inside of an attribute value. */
                if(attr_state == 41 && (ISBLANK(off) || ISANYOF(off, _T("\"'=<>`")))) {
                    attr_state = 0;
                    off--; /* Put the char back for re-inspection in the new state. */
                } else if(attr_state == 42 && CH(off) == _T('\'')) {
                    attr_state = 0;
                } else if(attr_state == 43 && CH(off) == _T('"')) {
                    attr_state = 0;
                }
                off++;
            } else if(ISWHITESPACE(off)) {
                if(attr_state == 0)
                    attr_state = 1;
                off++;
            } else if(attr_state <= 2 && CH(off) == _T('>')) {
                /* End. */
                goto done;
            } else if(attr_state <= 2 && CH(off) == _T('/') && off+1 < line_end && CH(off+1) == _T('>')) {
                /* End with digraph '/>' */
                off++;
                goto done;
            } else if((attr_state == 1 || attr_state == 2) && (ISALPHA(off) || CH(off) == _T('_') || CH(off) == _T(':'))) {
                off++;
                /* Attribute name */
                while(off < line_end && (ISALNUM(off) || ISANYOF(off, _T("_.:-"))))
                    off++;
                attr_state = 2;
            } else if(attr_state == 2 && CH(off) == _T('=')) {
                /* Attribute assignment sign */
                off++;
                attr_state = 3;
            } else if(attr_state == 3) {
                /* Expecting start of attribute value. */
                if(CH(off) == _T('"'))
                    attr_state = 43;
                else if(CH(off) == _T('\''))
                    attr_state = 42;
                else if(!ISANYOF(off, _T("\"'=<>`")) && !ISNEWLINE(off))
                    attr_state = 41;
                else
                    return FALSE;
                off++;
            } else {
                /* Anything unexpected. */
                return FALSE;
            }
        }

        /* We have to be on a single line. See definition of start condition
         * of HTML block, type 7. */
        if(n_lines == 0)
            return FALSE;

        /* The tag continues on the next line, if there is any. */
        i++;
        if(i >= n_lines)
            return FALSE;

        off = lines[i].beg;
        line_end = lines[i].end;

        /* The line break counts as a whitespace: it terminates an unquoted
         * value and allows an attribute name to follow. */
        if(attr_state == 0 || attr_state == 41)
            attr_state = 1;

        if(off >= max_end)
            return FALSE;
    }

done:
    /* Here 'off' points to the final '>'. */
    if(off >= max_end)
        return FALSE;

    *p_end = off+1;
    return TRUE;
}
1097
1098static int
1099md_scan_for_html_closer(MD_CTX* ctx, const MD_CHAR* str, MD_SIZE len,
1100 const MD_LINE* lines, int n_lines,
1101 OFF beg, OFF max_end, OFF* p_end,
1102 OFF* p_scan_horizon)
1103{
1104 OFF off = beg;
1105 int i = 0;
1106
1107 if(off < *p_scan_horizon && *p_scan_horizon >= max_end - len) {
1108 /* We have already scanned the range up to the max_end so we know
1109 * there is nothing to see. */
1110 return FALSE;
1111 }
1112
1113 while(TRUE) {
1114 while(off + len <= lines[i].end && off + len <= max_end) {
1115 if(md_ascii_eq(STR(off), s2: str, n: len)) {
1116 /* Success. */
1117 *p_end = off + len;
1118 return TRUE;
1119 }
1120 off++;
1121 }
1122
1123 i++;
1124 if(off >= max_end || i >= n_lines) {
1125 /* Failure. */
1126 *p_scan_horizon = off;
1127 return FALSE;
1128 }
1129
1130 off = lines[i].beg;
1131 }
1132}
1133
1134static int
1135md_is_html_comment(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
1136{
1137 OFF off = beg;
1138
1139 MD_ASSERT(CH(beg) == _T('<'));
1140
1141 if(off + 4 >= lines[0].end)
1142 return FALSE;
1143 if(CH(off+1) != _T('!') || CH(off+2) != _T('-') || CH(off+3) != _T('-'))
1144 return FALSE;
1145 off += 4;
1146
1147 /* ">" and "->" must not follow the opening. */
1148 if(off < lines[0].end && CH(off) == _T('>'))
1149 return FALSE;
1150 if(off+1 < lines[0].end && CH(off) == _T('-') && CH(off+1) == _T('>'))
1151 return FALSE;
1152
1153 /* HTML comment must not contain "--", so we scan just for "--" instead
1154 * of "-->" and verify manually that '>' follows. */
1155 if(md_scan_for_html_closer(ctx, _T("--"), len: 2,
1156 lines, n_lines, beg: off, max_end, p_end, p_scan_horizon: &ctx->html_comment_horizon))
1157 {
1158 if(*p_end < max_end && CH(*p_end) == _T('>')) {
1159 *p_end = *p_end + 1;
1160 return TRUE;
1161 }
1162 }
1163
1164 return FALSE;
1165}
1166
1167static int
1168md_is_html_processing_instruction(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
1169{
1170 OFF off = beg;
1171
1172 if(off + 2 >= lines[0].end)
1173 return FALSE;
1174 if(CH(off+1) != _T('?'))
1175 return FALSE;
1176 off += 2;
1177
1178 return md_scan_for_html_closer(ctx, _T("?>"), len: 2,
1179 lines, n_lines, beg: off, max_end, p_end, p_scan_horizon: &ctx->html_proc_instr_horizon);
1180}
1181
1182static int
1183md_is_html_declaration(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
1184{
1185 OFF off = beg;
1186
1187 if(off + 2 >= lines[0].end)
1188 return FALSE;
1189 if(CH(off+1) != _T('!'))
1190 return FALSE;
1191 off += 2;
1192
1193 /* Declaration name. */
1194 if(off >= lines[0].end || !ISALPHA(off))
1195 return FALSE;
1196 off++;
1197 while(off < lines[0].end && ISALPHA(off))
1198 off++;
1199 if(off < lines[0].end && !ISWHITESPACE(off))
1200 return FALSE;
1201
1202 return md_scan_for_html_closer(ctx, _T(">"), len: 1,
1203 lines, n_lines, beg: off, max_end, p_end, p_scan_horizon: &ctx->html_decl_horizon);
1204}
1205
1206static int
1207md_is_html_cdata(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
1208{
1209 static const CHAR open_str[] = _T("<![CDATA[");
1210 static const SZ open_size = SIZEOF_ARRAY(open_str) - 1;
1211
1212 OFF off = beg;
1213
1214 if(off + open_size >= lines[0].end)
1215 return FALSE;
1216 if(memcmp(STR(off), s2: open_str, n: open_size) != 0)
1217 return FALSE;
1218 off += open_size;
1219
1220 if(lines[n_lines-1].end < max_end)
1221 max_end = lines[n_lines-1].end - 2;
1222
1223 return md_scan_for_html_closer(ctx, _T("]]>"), len: 3,
1224 lines, n_lines, beg: off, max_end, p_end, p_scan_horizon: &ctx->html_cdata_horizon);
1225}
1226
1227static int
1228md_is_html_any(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
1229{
1230 MD_ASSERT(CH(beg) == _T('<'));
1231 return (md_is_html_tag(ctx, lines, n_lines, beg, max_end, p_end) ||
1232 md_is_html_comment(ctx, lines, n_lines, beg, max_end, p_end) ||
1233 md_is_html_processing_instruction(ctx, lines, n_lines, beg, max_end, p_end) ||
1234 md_is_html_declaration(ctx, lines, n_lines, beg, max_end, p_end) ||
1235 md_is_html_cdata(ctx, lines, n_lines, beg, max_end, p_end));
1236}
1237
1238
1239/****************************
1240 *** Recognizing Entity ***
1241 ****************************/
1242
1243static int
1244md_is_hex_entity_contents(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end)
1245{
1246 OFF off = beg;
1247 MD_UNUSED(ctx);
1248
1249 while(off < max_end && ISXDIGIT_(text[off]) && off - beg <= 8)
1250 off++;
1251
1252 if(1 <= off - beg && off - beg <= 6) {
1253 *p_end = off;
1254 return TRUE;
1255 } else {
1256 return FALSE;
1257 }
1258}
1259
1260static int
1261md_is_dec_entity_contents(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end)
1262{
1263 OFF off = beg;
1264 MD_UNUSED(ctx);
1265
1266 while(off < max_end && ISDIGIT_(text[off]) && off - beg <= 8)
1267 off++;
1268
1269 if(1 <= off - beg && off - beg <= 7) {
1270 *p_end = off;
1271 return TRUE;
1272 } else {
1273 return FALSE;
1274 }
1275}
1276
1277static int
1278md_is_named_entity_contents(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end)
1279{
1280 OFF off = beg;
1281 MD_UNUSED(ctx);
1282
1283 if(off < max_end && ISALPHA_(text[off]))
1284 off++;
1285 else
1286 return FALSE;
1287
1288 while(off < max_end && ISALNUM_(text[off]) && off - beg <= 48)
1289 off++;
1290
1291 if(2 <= off - beg && off - beg <= 48) {
1292 *p_end = off;
1293 return TRUE;
1294 } else {
1295 return FALSE;
1296 }
1297}
1298
1299static int
1300md_is_entity_str(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end)
1301{
1302 int is_contents;
1303 OFF off = beg;
1304
1305 MD_ASSERT(text[off] == _T('&'));
1306 off++;
1307
1308 if(off+2 < max_end && text[off] == _T('#') && (text[off+1] == _T('x') || text[off+1] == _T('X')))
1309 is_contents = md_is_hex_entity_contents(ctx, text, beg: off+2, max_end, p_end: &off);
1310 else if(off+1 < max_end && text[off] == _T('#'))
1311 is_contents = md_is_dec_entity_contents(ctx, text, beg: off+1, max_end, p_end: &off);
1312 else
1313 is_contents = md_is_named_entity_contents(ctx, text, beg: off, max_end, p_end: &off);
1314
1315 if(is_contents && off < max_end && text[off] == _T(';')) {
1316 *p_end = off+1;
1317 return TRUE;
1318 } else {
1319 return FALSE;
1320 }
1321}
1322
1323static inline int
1324md_is_entity(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end)
1325{
1326 return md_is_entity_str(ctx, text: ctx->text, beg, max_end, p_end);
1327}
1328
1329
1330/******************************
1331 *** Attribute Management ***
1332 ******************************/
1333
/* Working storage used by md_build_attribute() to back the MD_ATTRIBUTE it
 * fills in. It has to be released with md_free_attribute(). */
typedef struct MD_ATTRIBUTE_BUILD_tag MD_ATTRIBUTE_BUILD;
struct MD_ATTRIBUTE_BUILD_tag {
    CHAR* text;                     /* The attribute text (heap-allocated unless the attribute is trivial). */
    MD_TEXTTYPE* substr_types;      /* Type of each substring. */
    OFF* substr_offsets;            /* Start of each substring, plus one extra final offset. */
    int substr_count;               /* Count of substrings stored so far. */
    int substr_alloc;               /* Capacity of the two arrays; zero if nothing is heap-allocated. */
    MD_TEXTTYPE trivial_types[1];   /* In-place storage for the trivial (single substring) case. */
    OFF trivial_offsets[2];         /* Ditto, for the offsets. */
};


/* Flag for md_build_attribute(): do not resolve backslash escapes. */
#define MD_BUILD_ATTR_NO_ESCAPES 0x0001
1347
1348static int
1349md_build_attr_append_substr(MD_CTX* ctx, MD_ATTRIBUTE_BUILD* build,
1350 MD_TEXTTYPE type, OFF off)
1351{
1352 if(build->substr_count >= build->substr_alloc) {
1353 MD_TEXTTYPE* new_substr_types;
1354 OFF* new_substr_offsets;
1355
1356 build->substr_alloc = (build->substr_alloc > 0
1357 ? build->substr_alloc + build->substr_alloc / 2
1358 : 8);
1359 new_substr_types = (MD_TEXTTYPE*) realloc(ptr: build->substr_types,
1360 size: build->substr_alloc * sizeof(MD_TEXTTYPE));
1361 if(new_substr_types == NULL) {
1362 MD_LOG("realloc() failed.");
1363 return -1;
1364 }
1365 /* Note +1 to reserve space for final offset (== raw_size). */
1366 new_substr_offsets = (OFF*) realloc(ptr: build->substr_offsets,
1367 size: (build->substr_alloc+1) * sizeof(OFF));
1368 if(new_substr_offsets == NULL) {
1369 MD_LOG("realloc() failed.");
1370 free(ptr: new_substr_types);
1371 return -1;
1372 }
1373
1374 build->substr_types = new_substr_types;
1375 build->substr_offsets = new_substr_offsets;
1376 }
1377
1378 build->substr_types[build->substr_count] = type;
1379 build->substr_offsets[build->substr_count] = off;
1380 build->substr_count++;
1381 return 0;
1382}
1383
1384static void
1385md_free_attribute(MD_CTX* ctx, MD_ATTRIBUTE_BUILD* build)
1386{
1387 MD_UNUSED(ctx);
1388
1389 if(build->substr_alloc > 0) {
1390 free(ptr: build->text);
1391 free(ptr: build->substr_types);
1392 free(ptr: build->substr_offsets);
1393 }
1394}
1395
1396static int
1397md_build_attribute(MD_CTX* ctx, const CHAR* raw_text, SZ raw_size,
1398 unsigned flags, MD_ATTRIBUTE* attr, MD_ATTRIBUTE_BUILD* build)
1399{
1400 OFF raw_off, off;
1401 int is_trivial;
1402 int ret = 0;
1403
1404 memset(s: build, c: 0, n: sizeof(MD_ATTRIBUTE_BUILD));
1405
1406 /* If there is no backslash and no ampersand, build trivial attribute
1407 * without any malloc(). */
1408 is_trivial = TRUE;
1409 for(raw_off = 0; raw_off < raw_size; raw_off++) {
1410 if(ISANYOF3_(raw_text[raw_off], _T('\\'), _T('&'), _T('\0'))) {
1411 is_trivial = FALSE;
1412 break;
1413 }
1414 }
1415
1416 if(is_trivial) {
1417 build->text = (CHAR*) (raw_size ? raw_text : NULL);
1418 build->substr_types = build->trivial_types;
1419 build->substr_offsets = build->trivial_offsets;
1420 build->substr_count = 1;
1421 build->substr_alloc = 0;
1422 build->trivial_types[0] = MD_TEXT_NORMAL;
1423 build->trivial_offsets[0] = 0;
1424 build->trivial_offsets[1] = raw_size;
1425 off = raw_size;
1426 } else {
1427 build->text = (CHAR*) malloc(size: raw_size * sizeof(CHAR));
1428 if(build->text == NULL) {
1429 MD_LOG("malloc() failed.");
1430 goto abort;
1431 }
1432
1433 raw_off = 0;
1434 off = 0;
1435
1436 while(raw_off < raw_size) {
1437 if(raw_text[raw_off] == _T('\0')) {
1438 MD_CHECK(md_build_attr_append_substr(ctx, build, MD_TEXT_NULLCHAR, off));
1439 memcpy(dest: build->text + off, src: raw_text + raw_off, n: 1);
1440 off++;
1441 raw_off++;
1442 continue;
1443 }
1444
1445 if(raw_text[raw_off] == _T('&')) {
1446 OFF ent_end;
1447
1448 if(md_is_entity_str(ctx, text: raw_text, beg: raw_off, max_end: raw_size, p_end: &ent_end)) {
1449 MD_CHECK(md_build_attr_append_substr(ctx, build, MD_TEXT_ENTITY, off));
1450 memcpy(dest: build->text + off, src: raw_text + raw_off, n: ent_end - raw_off);
1451 off += ent_end - raw_off;
1452 raw_off = ent_end;
1453 continue;
1454 }
1455 }
1456
1457 if(build->substr_count == 0 || build->substr_types[build->substr_count-1] != MD_TEXT_NORMAL)
1458 MD_CHECK(md_build_attr_append_substr(ctx, build, MD_TEXT_NORMAL, off));
1459
1460 if(!(flags & MD_BUILD_ATTR_NO_ESCAPES) &&
1461 raw_text[raw_off] == _T('\\') && raw_off+1 < raw_size &&
1462 (ISPUNCT_(raw_text[raw_off+1]) || ISNEWLINE_(raw_text[raw_off+1])))
1463 raw_off++;
1464
1465 build->text[off++] = raw_text[raw_off++];
1466 }
1467 build->substr_offsets[build->substr_count] = off;
1468 }
1469
1470 attr->text = build->text;
1471 attr->size = off;
1472 attr->substr_offsets = build->substr_offsets;
1473 attr->substr_types = build->substr_types;
1474 return 0;
1475
1476abort:
1477 md_free_attribute(ctx, build);
1478 return -1;
1479}
1480
1481
1482/*********************************************
1483 *** Dictionary of Reference Definitions ***
1484 *********************************************/
1485
#define MD_FNV1A_BASE       2166136261U
#define MD_FNV1A_PRIME      16777619U

/* Feed 'n' bytes of 'data' into the FNV-1a hash whose current value is 'base'
 * and return the new value. Use MD_FNV1A_BASE to start a fresh hash, or the
 * result of a previous call to continue hashing more data. */
static inline unsigned
md_fnv1a(unsigned base, const void* data, size_t n)
{
    const unsigned char* cursor = (const unsigned char*) data;
    unsigned hash = base;

    while(n-- > 0) {
        hash ^= *cursor++;
        hash *= MD_FNV1A_PRIME;
    }

    return hash;
}
1503
1504
/* One reference definition.
 * NOTE(review): the meaning of the fields other than label, label_size and
 * hash is inferred from their names only -- confirm against the code which
 * fills the structure in. */
struct MD_REF_DEF_tag {
    CHAR* label;        /* The link label (not normalized; see md_link_label_cmp()). */
    CHAR* title;        /* Presumably the link title. */
    unsigned hash;      /* Hash of the label, set by md_build_ref_def_hashtable(). */
    SZ label_size;      /* Size of the label. */
    SZ title_size;      /* Presumably the size of the title. */
    OFF dest_beg;       /* Presumably the range of the link destination. */
    OFF dest_end;
    unsigned char label_needs_free : 1;     /* Presumably set if the label has to be free()d. */
    unsigned char title_needs_free : 1;     /* Presumably set if the title has to be free()d. */
};
1516
1517/* Label equivalence is quite complicated with regards to whitespace and case
1518 * folding. This complicates computing a hash of it as well as direct comparison
1519 * of two labels. */
1520
1521static unsigned
1522md_link_label_hash(const CHAR* label, SZ size)
1523{
1524 unsigned hash = MD_FNV1A_BASE;
1525 OFF off;
1526 unsigned codepoint;
1527 int is_whitespace = FALSE;
1528
1529 off = md_skip_unicode_whitespace(label, off: 0, size);
1530 while(off < size) {
1531 SZ char_size;
1532
1533 codepoint = md_decode_unicode(str: label, off, str_size: size, p_char_size: &char_size);
1534 is_whitespace = ISUNICODEWHITESPACE_(codepoint) || ISNEWLINE_(label[off]);
1535
1536 if(is_whitespace) {
1537 codepoint = ' ';
1538 hash = md_fnv1a(base: hash, data: &codepoint, n: sizeof(unsigned));
1539 off = md_skip_unicode_whitespace(label, off, size);
1540 } else {
1541 MD_UNICODE_FOLD_INFO fold_info;
1542
1543 md_get_unicode_fold_info(codepoint, info: &fold_info);
1544 hash = md_fnv1a(base: hash, data: fold_info.codepoints, n: fold_info.n_codepoints * sizeof(unsigned));
1545 off += char_size;
1546 }
1547 }
1548
1549 return hash;
1550}
1551
1552static OFF
1553md_link_label_cmp_load_fold_info(const CHAR* label, OFF off, SZ size,
1554 MD_UNICODE_FOLD_INFO* fold_info)
1555{
1556 unsigned codepoint;
1557 SZ char_size;
1558
1559 if(off >= size) {
1560 /* Treat end of a link label as a whitespace. */
1561 goto whitespace;
1562 }
1563
1564 codepoint = md_decode_unicode(str: label, off, str_size: size, p_char_size: &char_size);
1565 off += char_size;
1566 if(ISUNICODEWHITESPACE_(codepoint)) {
1567 /* Treat all whitespace as equivalent */
1568 goto whitespace;
1569 }
1570
1571 /* Get real folding info. */
1572 md_get_unicode_fold_info(codepoint, info: fold_info);
1573 return off;
1574
1575whitespace:
1576 fold_info->codepoints[0] = _T(' ');
1577 fold_info->n_codepoints = 1;
1578 return md_skip_unicode_whitespace(label, off, size);
1579}
1580
1581static int
1582md_link_label_cmp(const CHAR* a_label, SZ a_size, const CHAR* b_label, SZ b_size)
1583{
1584 OFF a_off;
1585 OFF b_off;
1586 MD_UNICODE_FOLD_INFO a_fi = { { 0 }, 0 };
1587 MD_UNICODE_FOLD_INFO b_fi = { { 0 }, 0 };
1588 OFF a_fi_off = 0;
1589 OFF b_fi_off = 0;
1590 int cmp;
1591
1592 a_off = md_skip_unicode_whitespace(label: a_label, off: 0, size: a_size);
1593 b_off = md_skip_unicode_whitespace(label: b_label, off: 0, size: b_size);
1594 while(a_off < a_size || a_fi_off < a_fi.n_codepoints ||
1595 b_off < b_size || b_fi_off < b_fi.n_codepoints)
1596 {
1597 /* If needed, load fold info for next char. */
1598 if(a_fi_off >= a_fi.n_codepoints) {
1599 a_fi_off = 0;
1600 a_off = md_link_label_cmp_load_fold_info(label: a_label, off: a_off, size: a_size, fold_info: &a_fi);
1601 }
1602 if(b_fi_off >= b_fi.n_codepoints) {
1603 b_fi_off = 0;
1604 b_off = md_link_label_cmp_load_fold_info(label: b_label, off: b_off, size: b_size, fold_info: &b_fi);
1605 }
1606
1607 cmp = b_fi.codepoints[b_fi_off] - a_fi.codepoints[a_fi_off];
1608 if(cmp != 0)
1609 return cmp;
1610
1611 a_fi_off++;
1612 b_fi_off++;
1613 }
1614
1615 return 0;
1616}
1617
/* Bucket of the ref. def. hashtable which holds more than one ref. def.
 * It is allocated together with its array of pointers. */
typedef struct MD_REF_DEF_LIST_tag MD_REF_DEF_LIST;
struct MD_REF_DEF_LIST_tag {
    int n_ref_defs;             /* Count of used items of ref_defs[]. */
    int alloc_ref_defs;         /* Count of allocated items of ref_defs[]. */
    MD_REF_DEF* ref_defs[];     /* Valid items always point into ctx->ref_defs[] */
};
1624
1625static int
1626md_ref_def_cmp(const void* a, const void* b)
1627{
1628 const MD_REF_DEF* a_ref = *(const MD_REF_DEF**)a;
1629 const MD_REF_DEF* b_ref = *(const MD_REF_DEF**)b;
1630
1631 if(a_ref->hash < b_ref->hash)
1632 return -1;
1633 else if(a_ref->hash > b_ref->hash)
1634 return +1;
1635 else
1636 return md_link_label_cmp(a_label: a_ref->label, a_size: a_ref->label_size, b_label: b_ref->label, b_size: b_ref->label_size);
1637}
1638
1639static int
1640md_ref_def_cmp_for_sort(const void* a, const void* b)
1641{
1642 int cmp;
1643
1644 cmp = md_ref_def_cmp(a, b);
1645
1646 /* Ensure stability of the sorting. */
1647 if(cmp == 0) {
1648 const MD_REF_DEF* a_ref = *(const MD_REF_DEF**)a;
1649 const MD_REF_DEF* b_ref = *(const MD_REF_DEF**)b;
1650
1651 if(a_ref < b_ref)
1652 cmp = -1;
1653 else if(a_ref > b_ref)
1654 cmp = +1;
1655 else
1656 cmp = 0;
1657 }
1658
1659 return cmp;
1660}
1661
1662static int
1663md_build_ref_def_hashtable(MD_CTX* ctx)
1664{
1665 int i, j;
1666
1667 if(ctx->n_ref_defs == 0)
1668 return 0;
1669
1670 ctx->ref_def_hashtable_size = (ctx->n_ref_defs * 5) / 4;
1671 ctx->ref_def_hashtable = malloc(size: ctx->ref_def_hashtable_size * sizeof(void*));
1672 if(ctx->ref_def_hashtable == NULL) {
1673 MD_LOG("malloc() failed.");
1674 goto abort;
1675 }
1676 memset(s: ctx->ref_def_hashtable, c: 0, n: ctx->ref_def_hashtable_size * sizeof(void*));
1677
1678 /* Each member of ctx->ref_def_hashtable[] can be:
1679 * -- NULL,
1680 * -- pointer to the MD_REF_DEF in ctx->ref_defs[], or
1681 * -- pointer to a MD_REF_DEF_LIST, which holds multiple pointers to
1682 * such MD_REF_DEFs.
1683 */
1684 for(i = 0; i < ctx->n_ref_defs; i++) {
1685 MD_REF_DEF* def = &ctx->ref_defs[i];
1686 void* bucket;
1687 MD_REF_DEF_LIST* list;
1688
1689 def->hash = md_link_label_hash(label: def->label, size: def->label_size);
1690 bucket = ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size];
1691
1692 if(bucket == NULL) {
1693 /* The bucket is empty. Make it just point to the def. */
1694 ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size] = def;
1695 continue;
1696 }
1697
1698 if(ctx->ref_defs <= (MD_REF_DEF*) bucket && (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs) {
1699 /* The bucket already contains one ref. def. Lets see whether it
1700 * is the same label (ref. def. duplicate) or different one
1701 * (hash conflict). */
1702 MD_REF_DEF* old_def = (MD_REF_DEF*) bucket;
1703
1704 if(md_link_label_cmp(a_label: def->label, a_size: def->label_size, b_label: old_def->label, b_size: old_def->label_size) == 0) {
1705 /* Duplicate label: Ignore this ref. def. */
1706 continue;
1707 }
1708
1709 /* Make the bucket complex, i.e. able to hold more ref. defs. */
1710 list = (MD_REF_DEF_LIST*) malloc(size: sizeof(MD_REF_DEF_LIST) + 2 * sizeof(MD_REF_DEF*));
1711 if(list == NULL) {
1712 MD_LOG("malloc() failed.");
1713 goto abort;
1714 }
1715 list->ref_defs[0] = old_def;
1716 list->ref_defs[1] = def;
1717 list->n_ref_defs = 2;
1718 list->alloc_ref_defs = 2;
1719 ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size] = list;
1720 continue;
1721 }
1722
1723 /* Append the def to the complex bucket list.
1724 *
1725 * Note in this case we ignore potential duplicates to avoid expensive
1726 * iterating over the complex bucket. Below, we revisit all the complex
1727 * buckets and handle it more cheaply after the complex bucket contents
1728 * is sorted. */
1729 list = (MD_REF_DEF_LIST*) bucket;
1730 if(list->n_ref_defs >= list->alloc_ref_defs) {
1731 int alloc_ref_defs = list->alloc_ref_defs + list->alloc_ref_defs / 2;
1732 MD_REF_DEF_LIST* list_tmp = (MD_REF_DEF_LIST*) realloc(ptr: list,
1733 size: sizeof(MD_REF_DEF_LIST) + alloc_ref_defs * sizeof(MD_REF_DEF*));
1734 if(list_tmp == NULL) {
1735 MD_LOG("realloc() failed.");
1736 goto abort;
1737 }
1738 list = list_tmp;
1739 list->alloc_ref_defs = alloc_ref_defs;
1740 ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size] = list;
1741 }
1742
1743 list->ref_defs[list->n_ref_defs] = def;
1744 list->n_ref_defs++;
1745 }
1746
1747 /* Sort the complex buckets so we can use bsearch() with them. */
1748 for(i = 0; i < ctx->ref_def_hashtable_size; i++) {
1749 void* bucket = ctx->ref_def_hashtable[i];
1750 MD_REF_DEF_LIST* list;
1751
1752 if(bucket == NULL)
1753 continue;
1754 if(ctx->ref_defs <= (MD_REF_DEF*) bucket && (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs)
1755 continue;
1756
1757 list = (MD_REF_DEF_LIST*) bucket;
1758 qsort(base: list->ref_defs, nmemb: list->n_ref_defs, size: sizeof(MD_REF_DEF*), compar: md_ref_def_cmp_for_sort);
1759
1760 /* Disable all duplicates in the complex bucket by forcing all such
1761 * records to point to the 1st such ref. def. I.e. no matter which
1762 * record is found during the lookup, it will always point to the right
1763 * ref. def. in ctx->ref_defs[]. */
1764 for(j = 1; j < list->n_ref_defs; j++) {
1765 if(md_ref_def_cmp(a: &list->ref_defs[j-1], b: &list->ref_defs[j]) == 0)
1766 list->ref_defs[j] = list->ref_defs[j-1];
1767 }
1768 }
1769
1770 return 0;
1771
1772abort:
1773 return -1;
1774}
1775
1776static void
1777md_free_ref_def_hashtable(MD_CTX* ctx)
1778{
1779 if(ctx->ref_def_hashtable != NULL) {
1780 int i;
1781
1782 for(i = 0; i < ctx->ref_def_hashtable_size; i++) {
1783 void* bucket = ctx->ref_def_hashtable[i];
1784 if(bucket == NULL)
1785 continue;
1786 if(ctx->ref_defs <= (MD_REF_DEF*) bucket && (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs)
1787 continue;
1788 free(ptr: bucket);
1789 }
1790
1791 free(ptr: ctx->ref_def_hashtable);
1792 }
1793}
1794
1795static const MD_REF_DEF*
1796md_lookup_ref_def(MD_CTX* ctx, const CHAR* label, SZ label_size)
1797{
1798 unsigned hash;
1799 void* bucket;
1800
1801 if(ctx->ref_def_hashtable_size == 0)
1802 return NULL;
1803
1804 hash = md_link_label_hash(label, size: label_size);
1805 bucket = ctx->ref_def_hashtable[hash % ctx->ref_def_hashtable_size];
1806
1807 if(bucket == NULL) {
1808 return NULL;
1809 } else if(ctx->ref_defs <= (MD_REF_DEF*) bucket && (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs) {
1810 const MD_REF_DEF* def = (MD_REF_DEF*) bucket;
1811
1812 if(md_link_label_cmp(a_label: def->label, a_size: def->label_size, b_label: label, b_size: label_size) == 0)
1813 return def;
1814 else
1815 return NULL;
1816 } else {
1817 MD_REF_DEF_LIST* list = (MD_REF_DEF_LIST*) bucket;
1818 MD_REF_DEF key_buf;
1819 const MD_REF_DEF* key = &key_buf;
1820 const MD_REF_DEF** ret;
1821
1822 key_buf.label = (CHAR*) label;
1823 key_buf.label_size = label_size;
1824 key_buf.hash = md_link_label_hash(label: key_buf.label, size: key_buf.label_size);
1825
1826 ret = (const MD_REF_DEF**) bsearch(key: &key, base: list->ref_defs,
1827 nmemb: list->n_ref_defs, size: sizeof(MD_REF_DEF*), compar: md_ref_def_cmp);
1828 if(ret != NULL)
1829 return *ret;
1830 else
1831 return NULL;
1832 }
1833}
1834
1835
1836/***************************
1837 *** Recognizing Links ***
1838 ***************************/
1839
1840/* Note this code is partially shared between processing inlines and blocks
1841 * as reference definitions and links share some helper parser functions.
1842 */
1843
/* Attributes of a link.
 * NOTE(review): the code using this structure is not in the nearby part of
 * the file; the field descriptions below are inferred from the names --
 * confirm against the users. */
typedef struct MD_LINK_ATTR_tag MD_LINK_ATTR;
struct MD_LINK_ATTR_tag {
    OFF dest_beg;           /* Presumably the range of the link destination. */
    OFF dest_end;

    CHAR* title;            /* Presumably the link title... */
    SZ title_size;          /* ...its size... */
    int title_needs_free;   /* ...and whether it has to be free()d. */
};
1853
1854
1855static int
1856md_is_link_label(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg,
1857 OFF* p_end, int* p_beg_line_index, int* p_end_line_index,
1858 OFF* p_contents_beg, OFF* p_contents_end)
1859{
1860 OFF off = beg;
1861 OFF contents_beg = 0;
1862 OFF contents_end = 0;
1863 int line_index = 0;
1864 int len = 0;
1865
1866 if(CH(off) != _T('['))
1867 return FALSE;
1868 off++;
1869
1870 while(1) {
1871 OFF line_end = lines[line_index].end;
1872
1873 while(off < line_end) {
1874 if(CH(off) == _T('\\') && off+1 < ctx->size && (ISPUNCT(off+1) || ISNEWLINE(off+1))) {
1875 if(contents_end == 0) {
1876 contents_beg = off;
1877 *p_beg_line_index = line_index;
1878 }
1879 contents_end = off + 2;
1880 off += 2;
1881 } else if(CH(off) == _T('[')) {
1882 return FALSE;
1883 } else if(CH(off) == _T(']')) {
1884 if(contents_beg < contents_end) {
1885 /* Success. */
1886 *p_contents_beg = contents_beg;
1887 *p_contents_end = contents_end;
1888 *p_end = off+1;
1889 *p_end_line_index = line_index;
1890 return TRUE;
1891 } else {
1892 /* Link label must have some non-whitespace contents. */
1893 return FALSE;
1894 }
1895 } else {
1896 unsigned codepoint;
1897 SZ char_size;
1898
1899 codepoint = md_decode_unicode(str: ctx->text, off, str_size: ctx->size, p_char_size: &char_size);
1900 if(!ISUNICODEWHITESPACE_(codepoint)) {
1901 if(contents_end == 0) {
1902 contents_beg = off;
1903 *p_beg_line_index = line_index;
1904 }
1905 contents_end = off + char_size;
1906 }
1907
1908 off += char_size;
1909 }
1910
1911 len++;
1912 if(len > 999)
1913 return FALSE;
1914 }
1915
1916 line_index++;
1917 len++;
1918 if(line_index < n_lines)
1919 off = lines[line_index].beg;
1920 else
1921 break;
1922 }
1923
1924 return FALSE;
1925}
1926
1927static int
1928md_is_link_destination_A(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end,
1929 OFF* p_contents_beg, OFF* p_contents_end)
1930{
1931 OFF off = beg;
1932
1933 if(off >= max_end || CH(off) != _T('<'))
1934 return FALSE;
1935 off++;
1936
1937 while(off < max_end) {
1938 if(CH(off) == _T('\\') && off+1 < max_end && ISPUNCT(off+1)) {
1939 off += 2;
1940 continue;
1941 }
1942
1943 if(ISNEWLINE(off) || CH(off) == _T('<'))
1944 return FALSE;
1945
1946 if(CH(off) == _T('>')) {
1947 /* Success. */
1948 *p_contents_beg = beg+1;
1949 *p_contents_end = off;
1950 *p_end = off+1;
1951 return TRUE;
1952 }
1953
1954 off++;
1955 }
1956
1957 return FALSE;
1958}
1959
1960static int
1961md_is_link_destination_B(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end,
1962 OFF* p_contents_beg, OFF* p_contents_end)
1963{
1964 OFF off = beg;
1965 int parenthesis_level = 0;
1966
1967 while(off < max_end) {
1968 if(CH(off) == _T('\\') && off+1 < max_end && ISPUNCT(off+1)) {
1969 off += 2;
1970 continue;
1971 }
1972
1973 if(ISWHITESPACE(off) || ISCNTRL(off))
1974 break;
1975
1976 /* Link destination may include balanced pairs of unescaped '(' ')'.
1977 * Note we limit the maximal nesting level by 32 to protect us from
1978 * https://github.com/jgm/cmark/issues/214 */
1979 if(CH(off) == _T('(')) {
1980 parenthesis_level++;
1981 if(parenthesis_level > 32)
1982 return FALSE;
1983 } else if(CH(off) == _T(')')) {
1984 if(parenthesis_level == 0)
1985 break;
1986 parenthesis_level--;
1987 }
1988
1989 off++;
1990 }
1991
1992 if(parenthesis_level != 0 || off == beg)
1993 return FALSE;
1994
1995 /* Success. */
1996 *p_contents_beg = beg;
1997 *p_contents_end = off;
1998 *p_end = off;
1999 return TRUE;
2000}
2001
2002static inline int
2003md_is_link_destination(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end,
2004 OFF* p_contents_beg, OFF* p_contents_end)
2005{
2006 if(CH(beg) == _T('<'))
2007 return md_is_link_destination_A(ctx, beg, max_end, p_end, p_contents_beg, p_contents_end);
2008 else
2009 return md_is_link_destination_B(ctx, beg, max_end, p_end, p_contents_beg, p_contents_end);
2010}
2011
2012static int
2013md_is_link_title(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg,
2014 OFF* p_end, int* p_beg_line_index, int* p_end_line_index,
2015 OFF* p_contents_beg, OFF* p_contents_end)
2016{
2017 OFF off = beg;
2018 CHAR closer_char;
2019 int line_index = 0;
2020
2021 /* White space with up to one line break. */
2022 while(off < lines[line_index].end && ISWHITESPACE(off))
2023 off++;
2024 if(off >= lines[line_index].end) {
2025 line_index++;
2026 if(line_index >= n_lines)
2027 return FALSE;
2028 off = lines[line_index].beg;
2029 }
2030 if(off == beg)
2031 return FALSE;
2032
2033 *p_beg_line_index = line_index;
2034
2035 /* First char determines how to detect end of it. */
2036 switch(CH(off)) {
2037 case _T('"'): closer_char = _T('"'); break;
2038 case _T('\''): closer_char = _T('\''); break;
2039 case _T('('): closer_char = _T(')'); break;
2040 default: return FALSE;
2041 }
2042 off++;
2043
2044 *p_contents_beg = off;
2045
2046 while(line_index < n_lines) {
2047 OFF line_end = lines[line_index].end;
2048
2049 while(off < line_end) {
2050 if(CH(off) == _T('\\') && off+1 < ctx->size && (ISPUNCT(off+1) || ISNEWLINE(off+1))) {
2051 off++;
2052 } else if(CH(off) == closer_char) {
2053 /* Success. */
2054 *p_contents_end = off;
2055 *p_end = off+1;
2056 *p_end_line_index = line_index;
2057 return TRUE;
2058 } else if(closer_char == _T(')') && CH(off) == _T('(')) {
2059 /* ()-style title cannot contain (unescaped '(')) */
2060 return FALSE;
2061 }
2062
2063 off++;
2064 }
2065
2066 line_index++;
2067 }
2068
2069 return FALSE;
2070}
2071
2072/* Returns 0 if it is not a reference definition.
2073 *
2074 * Returns N > 0 if it is a reference definition. N then corresponds to the
2075 * number of lines forming it). In this case the definition is stored for
2076 * resolving any links referring to it.
2077 *
2078 * Returns -1 in case of an error (out of memory).
2079 */
2080static int
2081md_is_link_reference_definition(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
2082{
2083 OFF label_contents_beg;
2084 OFF label_contents_end;
2085 int label_contents_line_index = -1;
2086 int label_is_multiline = FALSE;
2087 OFF dest_contents_beg;
2088 OFF dest_contents_end;
2089 OFF title_contents_beg;
2090 OFF title_contents_end;
2091 int title_contents_line_index;
2092 int title_is_multiline = FALSE;
2093 OFF off;
2094 int line_index = 0;
2095 int tmp_line_index;
2096 MD_REF_DEF* def = NULL;
2097 int ret = 0;
2098
2099 /* Link label. */
2100 if(!md_is_link_label(ctx, lines, n_lines, beg: lines[0].beg,
2101 p_end: &off, p_beg_line_index: &label_contents_line_index, p_end_line_index: &line_index,
2102 p_contents_beg: &label_contents_beg, p_contents_end: &label_contents_end))
2103 return FALSE;
2104 label_is_multiline = (label_contents_line_index != line_index);
2105
2106 /* Colon. */
2107 if(off >= lines[line_index].end || CH(off) != _T(':'))
2108 return FALSE;
2109 off++;
2110
2111 /* Optional white space with up to one line break. */
2112 while(off < lines[line_index].end && ISWHITESPACE(off))
2113 off++;
2114 if(off >= lines[line_index].end) {
2115 line_index++;
2116 if(line_index >= n_lines)
2117 return FALSE;
2118 off = lines[line_index].beg;
2119 }
2120
2121 /* Link destination. */
2122 if(!md_is_link_destination(ctx, beg: off, max_end: lines[line_index].end,
2123 p_end: &off, p_contents_beg: &dest_contents_beg, p_contents_end: &dest_contents_end))
2124 return FALSE;
2125
2126 /* (Optional) title. Note we interpret it as an title only if nothing
2127 * more follows on its last line. */
2128 if(md_is_link_title(ctx, lines: lines + line_index, n_lines: n_lines - line_index, beg: off,
2129 p_end: &off, p_beg_line_index: &title_contents_line_index, p_end_line_index: &tmp_line_index,
2130 p_contents_beg: &title_contents_beg, p_contents_end: &title_contents_end)
2131 && off >= lines[line_index + tmp_line_index].end)
2132 {
2133 title_is_multiline = (tmp_line_index != title_contents_line_index);
2134 title_contents_line_index += line_index;
2135 line_index += tmp_line_index;
2136 } else {
2137 /* Not a title. */
2138 title_is_multiline = FALSE;
2139 title_contents_beg = off;
2140 title_contents_end = off;
2141 title_contents_line_index = 0;
2142 }
2143
2144 /* Nothing more can follow on the last line. */
2145 if(off < lines[line_index].end)
2146 return FALSE;
2147
2148 /* So, it _is_ a reference definition. Remember it. */
2149 if(ctx->n_ref_defs >= ctx->alloc_ref_defs) {
2150 MD_REF_DEF* new_defs;
2151
2152 ctx->alloc_ref_defs = (ctx->alloc_ref_defs > 0
2153 ? ctx->alloc_ref_defs + ctx->alloc_ref_defs / 2
2154 : 16);
2155 new_defs = (MD_REF_DEF*) realloc(ptr: ctx->ref_defs, size: ctx->alloc_ref_defs * sizeof(MD_REF_DEF));
2156 if(new_defs == NULL) {
2157 MD_LOG("realloc() failed.");
2158 goto abort;
2159 }
2160
2161 ctx->ref_defs = new_defs;
2162 }
2163 def = &ctx->ref_defs[ctx->n_ref_defs];
2164 memset(s: def, c: 0, n: sizeof(MD_REF_DEF));
2165
2166 if(label_is_multiline) {
2167 MD_CHECK(md_merge_lines_alloc(ctx, label_contents_beg, label_contents_end,
2168 lines + label_contents_line_index, n_lines - label_contents_line_index,
2169 _T(' '), &def->label, &def->label_size));
2170 def->label_needs_free = TRUE;
2171 } else {
2172 def->label = (CHAR*) STR(label_contents_beg);
2173 def->label_size = label_contents_end - label_contents_beg;
2174 }
2175
2176 if(title_is_multiline) {
2177 MD_CHECK(md_merge_lines_alloc(ctx, title_contents_beg, title_contents_end,
2178 lines + title_contents_line_index, n_lines - title_contents_line_index,
2179 _T('\n'), &def->title, &def->title_size));
2180 def->title_needs_free = TRUE;
2181 } else {
2182 def->title = (CHAR*) STR(title_contents_beg);
2183 def->title_size = title_contents_end - title_contents_beg;
2184 }
2185
2186 def->dest_beg = dest_contents_beg;
2187 def->dest_end = dest_contents_end;
2188
2189 /* Success. */
2190 ctx->n_ref_defs++;
2191 return line_index + 1;
2192
2193abort:
2194 /* Failure. */
2195 if(def != NULL && def->label_needs_free)
2196 free(ptr: def->label);
2197 if(def != NULL && def->title_needs_free)
2198 free(ptr: def->title);
2199 return ret;
2200}
2201
2202static int
2203md_is_link_reference(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
2204 OFF beg, OFF end, MD_LINK_ATTR* attr)
2205{
2206 const MD_REF_DEF* def;
2207 const MD_LINE* beg_line;
2208 const MD_LINE* end_line;
2209 CHAR* label;
2210 SZ label_size;
2211 int ret;
2212
2213 MD_ASSERT(CH(beg) == _T('[') || CH(beg) == _T('!'));
2214 MD_ASSERT(CH(end-1) == _T(']'));
2215
2216 beg += (CH(beg) == _T('!') ? 2 : 1);
2217 end--;
2218
2219 /* Find lines corresponding to the beg and end positions. */
2220 MD_ASSERT(lines[0].beg <= beg);
2221 beg_line = lines;
2222 while(beg >= beg_line->end)
2223 beg_line++;
2224
2225 MD_ASSERT(end <= lines[n_lines-1].end);
2226 end_line = beg_line;
2227 while(end >= end_line->end)
2228 end_line++;
2229
2230 if(beg_line != end_line) {
2231 MD_CHECK(md_merge_lines_alloc(ctx, beg, end, beg_line,
2232 (int)(n_lines - (beg_line - lines)), _T(' '), &label, &label_size));
2233 } else {
2234 label = (CHAR*) STR(beg);
2235 label_size = end - beg;
2236 }
2237
2238 def = md_lookup_ref_def(ctx, label, label_size);
2239 if(def != NULL) {
2240 attr->dest_beg = def->dest_beg;
2241 attr->dest_end = def->dest_end;
2242 attr->title = def->title;
2243 attr->title_size = def->title_size;
2244 attr->title_needs_free = FALSE;
2245 }
2246
2247 if(beg_line != end_line)
2248 free(ptr: label);
2249
2250 ret = (def != NULL);
2251
2252abort:
2253 return ret;
2254}
2255
2256static int
2257md_is_inline_link_spec(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
2258 OFF beg, OFF* p_end, MD_LINK_ATTR* attr)
2259{
2260 int line_index = 0;
2261 int tmp_line_index;
2262 OFF title_contents_beg;
2263 OFF title_contents_end;
2264 int title_contents_line_index;
2265 int title_is_multiline;
2266 OFF off = beg;
2267 int ret = FALSE;
2268
2269 while(off >= lines[line_index].end)
2270 line_index++;
2271
2272 MD_ASSERT(CH(off) == _T('('));
2273 off++;
2274
2275 /* Optional white space with up to one line break. */
2276 while(off < lines[line_index].end && ISWHITESPACE(off))
2277 off++;
2278 if(off >= lines[line_index].end && ISNEWLINE(off)) {
2279 line_index++;
2280 if(line_index >= n_lines)
2281 return FALSE;
2282 off = lines[line_index].beg;
2283 }
2284
2285 /* Link destination may be omitted, but only when not also having a title. */
2286 if(off < ctx->size && CH(off) == _T(')')) {
2287 attr->dest_beg = off;
2288 attr->dest_end = off;
2289 attr->title = NULL;
2290 attr->title_size = 0;
2291 attr->title_needs_free = FALSE;
2292 off++;
2293 *p_end = off;
2294 return TRUE;
2295 }
2296
2297 /* Link destination. */
2298 if(!md_is_link_destination(ctx, beg: off, max_end: lines[line_index].end,
2299 p_end: &off, p_contents_beg: &attr->dest_beg, p_contents_end: &attr->dest_end))
2300 return FALSE;
2301
2302 /* (Optional) title. */
2303 if(md_is_link_title(ctx, lines: lines + line_index, n_lines: n_lines - line_index, beg: off,
2304 p_end: &off, p_beg_line_index: &title_contents_line_index, p_end_line_index: &tmp_line_index,
2305 p_contents_beg: &title_contents_beg, p_contents_end: &title_contents_end))
2306 {
2307 title_is_multiline = (tmp_line_index != title_contents_line_index);
2308 title_contents_line_index += line_index;
2309 line_index += tmp_line_index;
2310 } else {
2311 /* Not a title. */
2312 title_is_multiline = FALSE;
2313 title_contents_beg = off;
2314 title_contents_end = off;
2315 title_contents_line_index = 0;
2316 }
2317
2318 /* Optional whitespace followed with final ')'. */
2319 while(off < lines[line_index].end && ISWHITESPACE(off))
2320 off++;
2321 if(off >= lines[line_index].end && ISNEWLINE(off)) {
2322 line_index++;
2323 if(line_index >= n_lines)
2324 return FALSE;
2325 off = lines[line_index].beg;
2326 }
2327 if(CH(off) != _T(')'))
2328 goto abort;
2329 off++;
2330
2331 if(title_contents_beg >= title_contents_end) {
2332 attr->title = NULL;
2333 attr->title_size = 0;
2334 attr->title_needs_free = FALSE;
2335 } else if(!title_is_multiline) {
2336 attr->title = (CHAR*) STR(title_contents_beg);
2337 attr->title_size = title_contents_end - title_contents_beg;
2338 attr->title_needs_free = FALSE;
2339 } else {
2340 MD_CHECK(md_merge_lines_alloc(ctx, title_contents_beg, title_contents_end,
2341 lines + title_contents_line_index, n_lines - title_contents_line_index,
2342 _T('\n'), &attr->title, &attr->title_size));
2343 attr->title_needs_free = TRUE;
2344 }
2345
2346 *p_end = off;
2347 ret = TRUE;
2348
2349abort:
2350 return ret;
2351}
2352
2353static void
2354md_free_ref_defs(MD_CTX* ctx)
2355{
2356 int i;
2357
2358 for(i = 0; i < ctx->n_ref_defs; i++) {
2359 MD_REF_DEF* def = &ctx->ref_defs[i];
2360
2361 if(def->label_needs_free)
2362 free(ptr: def->label);
2363 if(def->title_needs_free)
2364 free(ptr: def->title);
2365 }
2366
2367 free(ptr: ctx->ref_defs);
2368}
2369
2370
2371/******************************************
2372 *** Processing Inlines (a.k.a Spans) ***
2373 ******************************************/
2374
/* We process inlines in a few phases:
2376 *
2377 * (1) We go through the block text and collect all significant characters
2378 * which may start/end a span or some other significant position into
2379 * ctx->marks[]. Core of this is what md_collect_marks() does.
2380 *
2381 * We also do some very brief preliminary context-less analysis, whether
2382 * it might be opener or closer (e.g. of an emphasis span).
2383 *
2384 * This speeds the other steps as we do not need to re-iterate over all
2385 * characters anymore.
2386 *
 * (2) We analyze each potential mark type, in order of their precedence.
2388 *
2389 * In each md_analyze_XXX() function, we re-iterate list of the marks,
2390 * skipping already resolved regions (in preceding precedences) and try to
2391 * resolve them.
2392 *
2393 * (2.1) For trivial marks, which are single (e.g. HTML entity), we just mark
2394 * them as resolved.
2395 *
2396 * (2.2) For range-type marks, we analyze whether the mark could be closer
2397 * and, if yes, whether there is some preceding opener it could satisfy.
2398 *
2399 * If not we check whether it could be really an opener and if yes, we
2400 * remember it so subsequent closers may resolve it.
2401 *
2402 * (3) Finally, when all marks were analyzed, we render the block contents
2403 * by calling MD_RENDERER::text() callback, interrupting by ::enter_span()
2404 * or ::close_span() whenever we reach a resolved mark.
2405 */
2406
2407
/* The mark structure.
 *
 * '\\': Maybe escape sequence.
 * '\0': NULL char.
 *  '*': Maybe (strong) emphasis start/end.
 *  '_': Maybe (strong) emphasis start/end.
 *  '~': Maybe strikethrough start/end (needs MD_FLAG_STRIKETHROUGH).
 *  '`': Maybe code span start/end.
 *  '&': Maybe start of entity.
 *  ';': Maybe end of entity.
 *  '<': Maybe start of raw HTML or autolink.
 *  '>': Maybe end of raw HTML or autolink.
 *  '[': Maybe start of link label or link text.
 *  '!': Equivalent of '[' for image.
 *  ']': Maybe end of link label or link text.
 *  '@': Maybe permissive e-mail auto-link (needs MD_FLAG_PERMISSIVEEMAILAUTOLINKS).
 *  ':': Maybe permissive URL auto-link (needs MD_FLAG_PERMISSIVEURLAUTOLINKS).
 *  '.': Maybe permissive WWW auto-link (needs MD_FLAG_PERMISSIVEWWWAUTOLINKS).
 *  '$': Mark char enabled by MD_FLAG_LATEXMATHSPANS.
 *  '|': Maybe table cell boundary (mark char enabled by MD_FLAG_TABLES or
 *       MD_FLAG_WIKILINKS).
 *  'D': Dummy mark, it reserves a space for splitting a previous mark
 *       (e.g. emphasis) or to make more space for storing some special data
 *       related to the preceding mark (e.g. link).
 *
 * Note that not all instances of these chars in the text imply creation of the
 * structure. Only those which have (or may have, after we see more context)
 * the special meaning.
 *
 * (Keep this struct as small as possible to fit as many of them as possible
 * into a CPU cache line.)
 */
struct MD_MARK_tag {
    /* Offsets delimiting the mark in the text. (For dummy marks, the storage
     * of these two members may be reused for a pointer, see
     * md_mark_store_ptr().) */
    OFF beg;
    OFF end;

    /* For unresolved openers, 'prev' and 'next' form the chain of open openers
     * of given type 'ch'.
     *
     * During resolving, we disconnect from the chain and point to the
     * corresponding counterpart so opener points to its closer and vice versa.
     *
     * The value -1 means "none".
     */
    int prev;
    int next;
    /* The mark character (see the list above). */
    CHAR ch;
    /* Combination of the MD_MARK_xxxx flags. */
    unsigned char flags;
};
2452
/* Mark flags (these apply to ALL mark types). */
#define MD_MARK_POTENTIAL_OPENER            0x01  /* Maybe opener. */
#define MD_MARK_POTENTIAL_CLOSER            0x02  /* Maybe closer. */
#define MD_MARK_OPENER                      0x04  /* Definitely opener. */
#define MD_MARK_CLOSER                      0x08  /* Definitely closer. */
#define MD_MARK_RESOLVED                    0x10  /* Resolved in any definite way. */

/* Mark flags specific for various mark types (so they can share bits).
 *
 * Note the value 0x20 is intentionally used three times below: each of those
 * flags is meaningful only for marks of a different type.
 *
 * MD_MARK_EMPH_MOD3_x are the three possible values of a two-bit field which
 * can be extracted with MD_MARK_EMPH_MOD3_MASK. (Presumably it holds the
 * length of the emphasis mark modulo 3, for the purposes of the "rule of 3";
 * confirm against the emphasis analysis code.)
 */
#define MD_MARK_EMPH_INTRAWORD              0x20  /* Helper for the "rule of 3". */
#define MD_MARK_EMPH_MOD3_0                 0x40
#define MD_MARK_EMPH_MOD3_1                 0x80
#define MD_MARK_EMPH_MOD3_2                 (0x40 | 0x80)
#define MD_MARK_EMPH_MOD3_MASK              (0x40 | 0x80)
#define MD_MARK_AUTOLINK                    0x20  /* Distinguisher for '<', '>'. */
#define MD_MARK_VALIDPERMISSIVEAUTOLINK     0x20  /* For permissive autolinks. */
2468
2469static MD_MARKCHAIN*
2470md_asterisk_chain(MD_CTX* ctx, unsigned flags)
2471{
2472 switch(flags & (MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_MASK)) {
2473 case MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_0: return &ASTERISK_OPENERS_intraword_mod3_0;
2474 case MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_1: return &ASTERISK_OPENERS_intraword_mod3_1;
2475 case MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_2: return &ASTERISK_OPENERS_intraword_mod3_2;
2476 case MD_MARK_EMPH_MOD3_0: return &ASTERISK_OPENERS_extraword_mod3_0;
2477 case MD_MARK_EMPH_MOD3_1: return &ASTERISK_OPENERS_extraword_mod3_1;
2478 case MD_MARK_EMPH_MOD3_2: return &ASTERISK_OPENERS_extraword_mod3_2;
2479 default: MD_UNREACHABLE();
2480 }
2481 return NULL;
2482}
2483
2484static MD_MARKCHAIN*
2485md_mark_chain(MD_CTX* ctx, int mark_index)
2486{
2487 MD_MARK* mark = &ctx->marks[mark_index];
2488
2489 switch(mark->ch) {
2490 case _T('*'): return md_asterisk_chain(ctx, flags: mark->flags);
2491 case _T('_'): return &UNDERSCORE_OPENERS;
2492 case _T('~'): return (mark->end - mark->beg == 1) ? &TILDE_OPENERS_1 : &TILDE_OPENERS_2;
2493 case _T('['): return &BRACKET_OPENERS;
2494 case _T('|'): return &TABLECELLBOUNDARIES;
2495 default: return NULL;
2496 }
2497}
2498
2499static MD_MARK*
2500md_push_mark(MD_CTX* ctx)
2501{
2502 if(ctx->n_marks >= ctx->alloc_marks) {
2503 MD_MARK* new_marks;
2504
2505 ctx->alloc_marks = (ctx->alloc_marks > 0
2506 ? ctx->alloc_marks + ctx->alloc_marks / 2
2507 : 64);
2508 new_marks = realloc(ptr: ctx->marks, size: ctx->alloc_marks * sizeof(MD_MARK));
2509 if(new_marks == NULL) {
2510 MD_LOG("realloc() failed.");
2511 return NULL;
2512 }
2513
2514 ctx->marks = new_marks;
2515 }
2516
2517 return &ctx->marks[ctx->n_marks++];
2518}
2519
/* Reserve a new mark via md_push_mark() and store the pointer to it into
 * the variable 'mark'.
 *
 * The macro expects the variables 'ctx', 'mark' and 'ret' and the label
 * 'abort' to exist in the scope where it is used: on failure it sets 'ret'
 * to -1 and jumps to 'abort'.
 */
#define PUSH_MARK_()                                                    \
        do {                                                            \
            mark = md_push_mark(ctx);                                   \
            if(mark == NULL) {                                          \
                ret = -1;                                               \
                goto abort;                                             \
            }                                                           \
        } while(0)

/* As PUSH_MARK_(), but also initialize the new mark: it gets the given
 * character, offsets and flags and it is not linked to any other mark
 * (both 'prev' and 'next' are -1).
 */
#define PUSH_MARK(ch_, beg_, end_, flags_)                              \
        do {                                                            \
            PUSH_MARK_();                                               \
            mark->beg = (beg_);                                         \
            mark->end = (end_);                                         \
            mark->prev = -1;                                            \
            mark->next = -1;                                            \
            mark->ch = (char)(ch_);                                     \
            mark->flags = (flags_);                                     \
        } while(0)
2539
2540
2541static void
2542md_mark_chain_append(MD_CTX* ctx, MD_MARKCHAIN* chain, int mark_index)
2543{
2544 if(chain->tail >= 0)
2545 ctx->marks[chain->tail].next = mark_index;
2546 else
2547 chain->head = mark_index;
2548
2549 ctx->marks[mark_index].prev = chain->tail;
2550 ctx->marks[mark_index].next = -1;
2551 chain->tail = mark_index;
2552}
2553
2554/* Sometimes, we need to store a pointer into the mark. It is quite rare
2555 * so we do not bother to make MD_MARK use union, and it can only happen
2556 * for dummy marks. */
2557static inline void
2558md_mark_store_ptr(MD_CTX* ctx, int mark_index, void* ptr)
2559{
2560 MD_MARK* mark = &ctx->marks[mark_index];
2561 MD_ASSERT(mark->ch == 'D');
2562
2563 /* Check only members beg and end are misused for this. */
2564 MD_ASSERT(sizeof(void*) <= 2 * sizeof(OFF));
2565 memcpy(dest: mark, src: &ptr, n: sizeof(void*));
2566}
2567
2568static inline void*
2569md_mark_get_ptr(MD_CTX* ctx, int mark_index)
2570{
2571 void* ptr;
2572 MD_MARK* mark = &ctx->marks[mark_index];
2573 MD_ASSERT(mark->ch == 'D');
2574 memcpy(dest: &ptr, src: mark, n: sizeof(void*));
2575 return ptr;
2576}
2577
2578static void
2579md_resolve_range(MD_CTX* ctx, MD_MARKCHAIN* chain, int opener_index, int closer_index)
2580{
2581 MD_MARK* opener = &ctx->marks[opener_index];
2582 MD_MARK* closer = &ctx->marks[closer_index];
2583
2584 /* Remove opener from the list of openers. */
2585 if(chain != NULL) {
2586 if(opener->prev >= 0)
2587 ctx->marks[opener->prev].next = opener->next;
2588 else
2589 chain->head = opener->next;
2590
2591 if(opener->next >= 0)
2592 ctx->marks[opener->next].prev = opener->prev;
2593 else
2594 chain->tail = opener->prev;
2595 }
2596
2597 /* Interconnect opener and closer and mark both as resolved. */
2598 opener->next = closer_index;
2599 opener->flags |= MD_MARK_OPENER | MD_MARK_RESOLVED;
2600 closer->prev = opener_index;
2601 closer->flags |= MD_MARK_CLOSER | MD_MARK_RESOLVED;
2602}
2603
2604
2605#define MD_ROLLBACK_ALL 0
2606#define MD_ROLLBACK_CROSSING 1
2607
2608/* In the range ctx->marks[opener_index] ... [closer_index], undo some or all
2609 * resolvings accordingly to these rules:
2610 *
2611 * (1) All openers BEFORE the range corresponding to any closer inside the
2612 * range are un-resolved and they are re-added to their respective chains
2613 * of unresolved openers. This ensures we can reuse the opener for closers
2614 * AFTER the range.
2615 *
2616 * (2) If 'how' is MD_ROLLBACK_ALL, then ALL resolved marks inside the range
2617 * are discarded.
2618 *
2619 * (3) If 'how' is MD_ROLLBACK_CROSSING, only closers with openers handled
2620 * in (1) are discarded. I.e. pairs of openers and closers which are both
2621 * inside the range are retained as well as any unpaired marks.
2622 */
2623static void
2624md_rollback(MD_CTX* ctx, int opener_index, int closer_index, int how)
2625{
2626 int i;
2627 int mark_index;
2628
2629 /* Cut all unresolved openers at the mark index. */
2630 for(i = OPENERS_CHAIN_FIRST; i < OPENERS_CHAIN_LAST+1; i++) {
2631 MD_MARKCHAIN* chain = &ctx->mark_chains[i];
2632
2633 while(chain->tail >= opener_index)
2634 chain->tail = ctx->marks[chain->tail].prev;
2635
2636 if(chain->tail >= 0)
2637 ctx->marks[chain->tail].next = -1;
2638 else
2639 chain->head = -1;
2640 }
2641
2642 /* Go backwards so that unresolved openers are re-added into their
2643 * respective chains, in the right order. */
2644 mark_index = closer_index - 1;
2645 while(mark_index > opener_index) {
2646 MD_MARK* mark = &ctx->marks[mark_index];
2647 int mark_flags = mark->flags;
2648 int discard_flag = (how == MD_ROLLBACK_ALL);
2649
2650 if(mark->flags & MD_MARK_CLOSER) {
2651 int mark_opener_index = mark->prev;
2652
2653 /* Undo opener BEFORE the range. */
2654 if(mark_opener_index < opener_index) {
2655 MD_MARK* mark_opener = &ctx->marks[mark_opener_index];
2656 MD_MARKCHAIN* chain;
2657
2658 mark_opener->flags &= ~(MD_MARK_OPENER | MD_MARK_CLOSER | MD_MARK_RESOLVED);
2659 chain = md_mark_chain(ctx, mark_index: opener_index);
2660 if(chain != NULL) {
2661 md_mark_chain_append(ctx, chain, mark_index: mark_opener_index);
2662 discard_flag = 1;
2663 }
2664 }
2665 }
2666
2667 /* And reset our flags. */
2668 if(discard_flag)
2669 mark->flags &= ~(MD_MARK_OPENER | MD_MARK_CLOSER | MD_MARK_RESOLVED);
2670
2671 /* Jump as far as we can over unresolved or non-interesting marks. */
2672 switch(how) {
2673 case MD_ROLLBACK_CROSSING:
2674 if((mark_flags & MD_MARK_CLOSER) && mark->prev > opener_index) {
2675 /* If we are closer with opener INSIDE the range, there may
2676 * not be any other crosser inside the subrange. */
2677 mark_index = mark->prev;
2678 break;
2679 }
2680 MD_FALLTHROUGH();
2681 default:
2682 mark_index--;
2683 break;
2684 }
2685 }
2686}
2687
2688static void
2689md_build_mark_char_map(MD_CTX* ctx)
2690{
2691 memset(s: ctx->mark_char_map, c: 0, n: sizeof(ctx->mark_char_map));
2692
2693 ctx->mark_char_map['\\'] = 1;
2694 ctx->mark_char_map['*'] = 1;
2695 ctx->mark_char_map['_'] = 1;
2696 ctx->mark_char_map['`'] = 1;
2697 ctx->mark_char_map['&'] = 1;
2698 ctx->mark_char_map[';'] = 1;
2699 ctx->mark_char_map['<'] = 1;
2700 ctx->mark_char_map['>'] = 1;
2701 ctx->mark_char_map['['] = 1;
2702 ctx->mark_char_map['!'] = 1;
2703 ctx->mark_char_map[']'] = 1;
2704 ctx->mark_char_map['\0'] = 1;
2705
2706 if(ctx->parser.flags & MD_FLAG_STRIKETHROUGH)
2707 ctx->mark_char_map['~'] = 1;
2708
2709 if(ctx->parser.flags & MD_FLAG_LATEXMATHSPANS)
2710 ctx->mark_char_map['$'] = 1;
2711
2712 if(ctx->parser.flags & MD_FLAG_PERMISSIVEEMAILAUTOLINKS)
2713 ctx->mark_char_map['@'] = 1;
2714
2715 if(ctx->parser.flags & MD_FLAG_PERMISSIVEURLAUTOLINKS)
2716 ctx->mark_char_map[':'] = 1;
2717
2718 if(ctx->parser.flags & MD_FLAG_PERMISSIVEWWWAUTOLINKS)
2719 ctx->mark_char_map['.'] = 1;
2720
2721 if((ctx->parser.flags & MD_FLAG_TABLES) || (ctx->parser.flags & MD_FLAG_WIKILINKS))
2722 ctx->mark_char_map['|'] = 1;
2723
2724 if(ctx->parser.flags & MD_FLAG_COLLAPSEWHITESPACE) {
2725 int i;
2726
2727 for(i = 0; i < (int) sizeof(ctx->mark_char_map); i++) {
2728 if(ISWHITESPACE_(i))
2729 ctx->mark_char_map[i] = 1;
2730 }
2731 }
2732}
2733
2734/* We limit code span marks to lower than 32 backticks. This solves the
2735 * pathologic case of too many openers, each of different length: Their
2736 * resolving would be then O(n^2). */
2737#define CODESPAN_MARK_MAXLEN 32
2738
2739static int
2740md_is_code_span(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg,
2741 OFF* p_opener_beg, OFF* p_opener_end,
2742 OFF* p_closer_beg, OFF* p_closer_end,
2743 OFF last_potential_closers[CODESPAN_MARK_MAXLEN],
2744 int* p_reached_paragraph_end)
2745{
2746 OFF opener_beg = beg;
2747 OFF opener_end;
2748 OFF closer_beg;
2749 OFF closer_end;
2750 SZ mark_len;
2751 OFF line_end;
2752 int has_space_after_opener = FALSE;
2753 int has_eol_after_opener = FALSE;
2754 int has_space_before_closer = FALSE;
2755 int has_eol_before_closer = FALSE;
2756 int has_only_space = TRUE;
2757 int line_index = 0;
2758
2759 line_end = lines[0].end;
2760 opener_end = opener_beg;
2761 while(opener_end < line_end && CH(opener_end) == _T('`'))
2762 opener_end++;
2763 has_space_after_opener = (opener_end < line_end && CH(opener_end) == _T(' '));
2764 has_eol_after_opener = (opener_end == line_end);
2765
2766 /* The caller needs to know end of the opening mark even if we fail. */
2767 *p_opener_end = opener_end;
2768
2769 mark_len = opener_end - opener_beg;
2770 if(mark_len > CODESPAN_MARK_MAXLEN)
2771 return FALSE;
2772
2773 /* Check whether we already know there is no closer of this length.
2774 * If so, re-scan does no sense. This fixes issue #59. */
2775 if(last_potential_closers[mark_len-1] >= lines[n_lines-1].end ||
2776 (*p_reached_paragraph_end && last_potential_closers[mark_len-1] < opener_end))
2777 return FALSE;
2778
2779 closer_beg = opener_end;
2780 closer_end = opener_end;
2781
2782 /* Find closer mark. */
2783 while(TRUE) {
2784 while(closer_beg < line_end && CH(closer_beg) != _T('`')) {
2785 if(CH(closer_beg) != _T(' '))
2786 has_only_space = FALSE;
2787 closer_beg++;
2788 }
2789 closer_end = closer_beg;
2790 while(closer_end < line_end && CH(closer_end) == _T('`'))
2791 closer_end++;
2792
2793 if(closer_end - closer_beg == mark_len) {
2794 /* Success. */
2795 has_space_before_closer = (closer_beg > lines[line_index].beg && CH(closer_beg-1) == _T(' '));
2796 has_eol_before_closer = (closer_beg == lines[line_index].beg);
2797 break;
2798 }
2799
2800 if(closer_end - closer_beg > 0) {
2801 /* We have found a back-tick which is not part of the closer. */
2802 has_only_space = FALSE;
2803
2804 /* But if we eventually fail, remember it as a potential closer
2805 * of its own length for future attempts. This mitigates needs for
2806 * rescans. */
2807 if(closer_end - closer_beg < CODESPAN_MARK_MAXLEN) {
2808 if(closer_beg > last_potential_closers[closer_end - closer_beg - 1])
2809 last_potential_closers[closer_end - closer_beg - 1] = closer_beg;
2810 }
2811 }
2812
2813 if(closer_end >= line_end) {
2814 line_index++;
2815 if(line_index >= n_lines) {
2816 /* Reached end of the paragraph and still nothing. */
2817 *p_reached_paragraph_end = TRUE;
2818 return FALSE;
2819 }
2820 /* Try on the next line. */
2821 line_end = lines[line_index].end;
2822 closer_beg = lines[line_index].beg;
2823 } else {
2824 closer_beg = closer_end;
2825 }
2826 }
2827
2828 /* If there is a space or a new line both after and before the opener
2829 * (and if the code span is not made of spaces only), consume one initial
2830 * and one trailing space as part of the marks. */
2831 if(!has_only_space &&
2832 (has_space_after_opener || has_eol_after_opener) &&
2833 (has_space_before_closer || has_eol_before_closer))
2834 {
2835 if(has_space_after_opener)
2836 opener_end++;
2837 else
2838 opener_end = lines[1].beg;
2839
2840 if(has_space_before_closer)
2841 closer_beg--;
2842 else {
2843 closer_beg = lines[line_index-1].end;
2844 /* We need to eat the preceding "\r\n" but not any line trailing
2845 * spaces. */
2846 while(closer_beg < ctx->size && ISBLANK(closer_beg))
2847 closer_beg++;
2848 }
2849 }
2850
2851 *p_opener_beg = opener_beg;
2852 *p_opener_end = opener_end;
2853 *p_closer_beg = closer_beg;
2854 *p_closer_end = closer_end;
2855 return TRUE;
2856}
2857
2858static int
2859md_is_autolink_uri(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end)
2860{
2861 OFF off = beg+1;
2862
2863 MD_ASSERT(CH(beg) == _T('<'));
2864
2865 /* Check for scheme. */
2866 if(off >= max_end || !ISASCII(off))
2867 return FALSE;
2868 off++;
2869 while(1) {
2870 if(off >= max_end)
2871 return FALSE;
2872 if(off - beg > 32)
2873 return FALSE;
2874 if(CH(off) == _T(':') && off - beg >= 3)
2875 break;
2876 if(!ISALNUM(off) && CH(off) != _T('+') && CH(off) != _T('-') && CH(off) != _T('.'))
2877 return FALSE;
2878 off++;
2879 }
2880
2881 /* Check the path after the scheme. */
2882 while(off < max_end && CH(off) != _T('>')) {
2883 if(ISWHITESPACE(off) || ISCNTRL(off) || CH(off) == _T('<'))
2884 return FALSE;
2885 off++;
2886 }
2887
2888 if(off >= max_end)
2889 return FALSE;
2890
2891 MD_ASSERT(CH(off) == _T('>'));
2892 *p_end = off+1;
2893 return TRUE;
2894}
2895
2896static int
2897md_is_autolink_email(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end)
2898{
2899 OFF off = beg + 1;
2900 int label_len;
2901
2902 MD_ASSERT(CH(beg) == _T('<'));
2903
2904 /* The code should correspond to this regexp:
2905 /^[a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+
2906 @[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?
2907 (?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$/
2908 */
2909
2910 /* Username (before '@'). */
2911 while(off < max_end && (ISALNUM(off) || ISANYOF(off, _T(".!#$%&'*+/=?^_`{|}~-"))))
2912 off++;
2913 if(off <= beg+1)
2914 return FALSE;
2915
2916 /* '@' */
2917 if(off >= max_end || CH(off) != _T('@'))
2918 return FALSE;
2919 off++;
2920
2921 /* Labels delimited with '.'; each label is sequence of 1 - 63 alnum
2922 * characters or '-', but '-' is not allowed as first or last char. */
2923 label_len = 0;
2924 while(off < max_end) {
2925 if(ISALNUM(off))
2926 label_len++;
2927 else if(CH(off) == _T('-') && label_len > 0)
2928 label_len++;
2929 else if(CH(off) == _T('.') && label_len > 0 && CH(off-1) != _T('-'))
2930 label_len = 0;
2931 else
2932 break;
2933
2934 if(label_len > 63)
2935 return FALSE;
2936
2937 off++;
2938 }
2939
2940 if(label_len <= 0 || off >= max_end || CH(off) != _T('>') || CH(off-1) == _T('-'))
2941 return FALSE;
2942
2943 *p_end = off+1;
2944 return TRUE;
2945}
2946
2947static int
2948md_is_autolink(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end, int* p_missing_mailto)
2949{
2950 if(md_is_autolink_uri(ctx, beg, max_end, p_end)) {
2951 *p_missing_mailto = FALSE;
2952 return TRUE;
2953 }
2954
2955 if(md_is_autolink_email(ctx, beg, max_end, p_end)) {
2956 *p_missing_mailto = TRUE;
2957 return TRUE;
2958 }
2959
2960 return FALSE;
2961}
2962
2963static int
2964md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int table_mode)
2965{
2966 int i;
2967 int ret = 0;
2968 MD_MARK* mark;
2969 OFF codespan_last_potential_closers[CODESPAN_MARK_MAXLEN] = { 0 };
2970 int codespan_scanned_till_paragraph_end = FALSE;
2971
2972 for(i = 0; i < n_lines; i++) {
2973 const MD_LINE* line = &lines[i];
2974 OFF off = line->beg;
2975 OFF line_end = line->end;
2976
2977 while(TRUE) {
2978 CHAR ch;
2979
2980#ifdef MD4C_USE_UTF16
2981 /* For UTF-16, mark_char_map[] covers only ASCII. */
2982 #define IS_MARK_CHAR(off) ((CH(off) < SIZEOF_ARRAY(ctx->mark_char_map)) && \
2983 (ctx->mark_char_map[(unsigned char) CH(off)]))
2984#else
2985 /* For 8-bit encodings, mark_char_map[] covers all 256 elements. */
2986 #define IS_MARK_CHAR(off) (ctx->mark_char_map[(unsigned char) CH(off)])
2987#endif
2988
2989 /* Optimization: Use some loop unrolling. */
2990 while(off + 3 < line_end && !IS_MARK_CHAR(off+0) && !IS_MARK_CHAR(off+1)
2991 && !IS_MARK_CHAR(off+2) && !IS_MARK_CHAR(off+3))
2992 off += 4;
2993 while(off < line_end && !IS_MARK_CHAR(off+0))
2994 off++;
2995
2996 if(off >= line_end)
2997 break;
2998
2999 ch = CH(off);
3000
3001 /* A backslash escape.
3002 * It can go beyond line->end as it may involve escaped new
3003 * line to form a hard break. */
3004 if(ch == _T('\\') && off+1 < ctx->size && (ISPUNCT(off+1) || ISNEWLINE(off+1))) {
3005 /* Hard-break cannot be on the last line of the block. */
3006 if(!ISNEWLINE(off+1) || i+1 < n_lines)
3007 PUSH_MARK(ch, off, off+2, MD_MARK_RESOLVED);
3008 off += 2;
3009 continue;
3010 }
3011
3012 /* A potential (string) emphasis start/end. */
3013 if(ch == _T('*') || ch == _T('_')) {
3014 OFF tmp = off+1;
3015 int left_level; /* What precedes: 0 = whitespace; 1 = punctuation; 2 = other char. */
3016 int right_level; /* What follows: 0 = whitespace; 1 = punctuation; 2 = other char. */
3017
3018 while(tmp < line_end && CH(tmp) == ch)
3019 tmp++;
3020
3021 if(off == line->beg || ISUNICODEWHITESPACEBEFORE(off))
3022 left_level = 0;
3023 else if(ISUNICODEPUNCTBEFORE(off))
3024 left_level = 1;
3025 else
3026 left_level = 2;
3027
3028 if(tmp == line_end || ISUNICODEWHITESPACE(tmp))
3029 right_level = 0;
3030 else if(ISUNICODEPUNCT(tmp))
3031 right_level = 1;
3032 else
3033 right_level = 2;
3034
3035 /* Intra-word underscore doesn't have special meaning. */
3036 if(ch == _T('_') && left_level == 2 && right_level == 2) {
3037 left_level = 0;
3038 right_level = 0;
3039 }
3040
3041 if(left_level != 0 || right_level != 0) {
3042 unsigned flags = 0;
3043
3044 if(left_level > 0 && left_level >= right_level)
3045 flags |= MD_MARK_POTENTIAL_CLOSER;
3046 if(right_level > 0 && right_level >= left_level)
3047 flags |= MD_MARK_POTENTIAL_OPENER;
3048 if(left_level == 2 && right_level == 2)
3049 flags |= MD_MARK_EMPH_INTRAWORD;
3050
3051 /* For "the rule of three" we need to remember the original
3052 * size of the mark (modulo three), before we potentially
3053 * split the mark when being later resolved partially by some
3054 * shorter closer. */
3055 switch((tmp - off) % 3) {
3056 case 0: flags |= MD_MARK_EMPH_MOD3_0; break;
3057 case 1: flags |= MD_MARK_EMPH_MOD3_1; break;
3058 case 2: flags |= MD_MARK_EMPH_MOD3_2; break;
3059 }
3060
3061 PUSH_MARK(ch, off, tmp, flags);
3062
3063 /* During resolving, multiple asterisks may have to be
3064 * split into independent span start/ends. Consider e.g.
3065 * "**foo* bar*". Therefore we push also some empty dummy
3066 * marks to have enough space for that. */
3067 off++;
3068 while(off < tmp) {
3069 PUSH_MARK('D', off, off, 0);
3070 off++;
3071 }
3072 continue;
3073 }
3074
3075 off = tmp;
3076 continue;
3077 }
3078
3079 /* A potential code span start/end. */
3080 if(ch == _T('`')) {
3081 OFF opener_beg, opener_end;
3082 OFF closer_beg, closer_end;
3083 int is_code_span;
3084
3085 is_code_span = md_is_code_span(ctx, lines: lines + i, n_lines: n_lines - i, beg: off,
3086 p_opener_beg: &opener_beg, p_opener_end: &opener_end, p_closer_beg: &closer_beg, p_closer_end: &closer_end,
3087 last_potential_closers: codespan_last_potential_closers,
3088 p_reached_paragraph_end: &codespan_scanned_till_paragraph_end);
3089 if(is_code_span) {
3090 PUSH_MARK(_T('`'), opener_beg, opener_end, MD_MARK_OPENER | MD_MARK_RESOLVED);
3091 PUSH_MARK(_T('`'), closer_beg, closer_end, MD_MARK_CLOSER | MD_MARK_RESOLVED);
3092 ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1;
3093 ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2;
3094
3095 off = closer_end;
3096
3097 /* Advance the current line accordingly. */
3098 while(off > line_end) {
3099 i++;
3100 line++;
3101 line_end = line->end;
3102 }
3103 continue;
3104 }
3105
3106 off = opener_end;
3107 continue;
3108 }
3109
3110 /* A potential entity start. */
3111 if(ch == _T('&')) {
3112 PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_OPENER);
3113 off++;
3114 continue;
3115 }
3116
3117 /* A potential entity end. */
3118 if(ch == _T(';')) {
3119 /* We surely cannot be entity unless the previous mark is '&'. */
3120 if(ctx->n_marks > 0 && ctx->marks[ctx->n_marks-1].ch == _T('&'))
3121 PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_CLOSER);
3122
3123 off++;
3124 continue;
3125 }
3126
3127 /* A potential autolink or raw HTML start/end. */
3128 if(ch == _T('<')) {
3129 int is_autolink;
3130 OFF autolink_end;
3131 int missing_mailto;
3132
3133 if(!(ctx->parser.flags & MD_FLAG_NOHTMLSPANS)) {
3134 int is_html;
3135 OFF html_end;
3136
3137 /* Given the nature of the raw HTML, we have to recognize
3138 * it here. Doing so later in md_analyze_lt_gt() could
3139 * open can of worms of quadratic complexity. */
3140 is_html = md_is_html_any(ctx, lines: lines + i, n_lines: n_lines - i, beg: off,
3141 max_end: lines[n_lines-1].end, p_end: &html_end);
3142 if(is_html) {
3143 PUSH_MARK(_T('<'), off, off, MD_MARK_OPENER | MD_MARK_RESOLVED);
3144 PUSH_MARK(_T('>'), html_end, html_end, MD_MARK_CLOSER | MD_MARK_RESOLVED);
3145 ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1;
3146 ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2;
3147 off = html_end;
3148
3149 /* Advance the current line accordingly. */
3150 while(off > line_end) {
3151 i++;
3152 line++;
3153 line_end = line->end;
3154 }
3155 continue;
3156 }
3157 }
3158
3159 is_autolink = md_is_autolink(ctx, beg: off, max_end: lines[n_lines-1].end,
3160 p_end: &autolink_end, p_missing_mailto: &missing_mailto);
3161 if(is_autolink) {
3162 PUSH_MARK((missing_mailto ? _T('@') : _T('<')), off, off+1,
3163 MD_MARK_OPENER | MD_MARK_RESOLVED | MD_MARK_AUTOLINK);
3164 PUSH_MARK(_T('>'), autolink_end-1, autolink_end,
3165 MD_MARK_CLOSER | MD_MARK_RESOLVED | MD_MARK_AUTOLINK);
3166 ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1;
3167 ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2;
3168 off = autolink_end;
3169 continue;
3170 }
3171
3172 off++;
3173 continue;
3174 }
3175
3176 /* A potential link or its part. */
3177 if(ch == _T('[') || (ch == _T('!') && off+1 < line_end && CH(off+1) == _T('['))) {
3178 OFF tmp = (ch == _T('[') ? off+1 : off+2);
3179 PUSH_MARK(ch, off, tmp, MD_MARK_POTENTIAL_OPENER);
3180 off = tmp;
3181 /* Two dummies to make enough place for data we need if it is
3182 * a link. */
3183 PUSH_MARK('D', off, off, 0);
3184 PUSH_MARK('D', off, off, 0);
3185 continue;
3186 }
3187 if(ch == _T(']')) {
3188 PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_CLOSER);
3189 off++;
3190 continue;
3191 }
3192
3193 /* A potential permissive e-mail autolink. */
3194 if(ch == _T('@')) {
3195 if(line->beg + 1 <= off && ISALNUM(off-1) &&
3196 off + 3 < line->end && ISALNUM(off+1))
3197 {
3198 PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_OPENER);
3199 /* Push a dummy as a reserve for a closer. */
3200 PUSH_MARK('D', off, off, 0);
3201 }
3202
3203 off++;
3204 continue;
3205 }
3206
3207 /* A potential permissive URL autolink. */
3208 if(ch == _T(':')) {
3209 static struct {
3210 const CHAR* scheme;
3211 SZ scheme_size;
3212 const CHAR* suffix;
3213 SZ suffix_size;
3214 } scheme_map[] = {
3215 /* In the order from the most frequently used, arguably. */
3216 { _T("http"), 4, _T("//"), 2 },
3217 { _T("https"), 5, _T("//"), 2 },
3218 { _T("ftp"), 3, _T("//"), 2 }
3219 };
3220 int scheme_index;
3221
3222 for(scheme_index = 0; scheme_index < (int) SIZEOF_ARRAY(scheme_map); scheme_index++) {
3223 const CHAR* scheme = scheme_map[scheme_index].scheme;
3224 const SZ scheme_size = scheme_map[scheme_index].scheme_size;
3225 const CHAR* suffix = scheme_map[scheme_index].suffix;
3226 const SZ suffix_size = scheme_map[scheme_index].suffix_size;
3227
3228 if(line->beg + scheme_size <= off && md_ascii_eq(STR(off-scheme_size), s2: scheme, n: scheme_size) &&
3229 (line->beg + scheme_size == off || ISWHITESPACE(off-scheme_size-1) || ISANYOF(off-scheme_size-1, _T("*_~(["))) &&
3230 off + 1 + suffix_size < line->end && md_ascii_eq(STR(off+1), s2: suffix, n: suffix_size))
3231 {
3232 PUSH_MARK(ch, off-scheme_size, off+1+suffix_size, MD_MARK_POTENTIAL_OPENER);
3233 /* Push a dummy as a reserve for a closer. */
3234 PUSH_MARK('D', off, off, 0);
3235 off += 1 + suffix_size;
3236 break;
3237 }
3238 }
3239
3240 off++;
3241 continue;
3242 }
3243
3244 /* A potential permissive WWW autolink. */
3245 if(ch == _T('.')) {
3246 if(line->beg + 3 <= off && md_ascii_eq(STR(off-3), _T("www"), n: 3) &&
3247 (line->beg + 3 == off || ISWHITESPACE(off-4) || ISANYOF(off-4, _T("*_~(["))) &&
3248 off + 1 < line_end)
3249 {
3250 PUSH_MARK(ch, off-3, off+1, MD_MARK_POTENTIAL_OPENER);
3251 /* Push a dummy as a reserve for a closer. */
3252 PUSH_MARK('D', off, off, 0);
3253 off++;
3254 continue;
3255 }
3256
3257 off++;
3258 continue;
3259 }
3260
3261 /* A potential table cell boundary or wiki link label delimiter. */
3262 if((table_mode || ctx->parser.flags & MD_FLAG_WIKILINKS) && ch == _T('|')) {
3263 PUSH_MARK(ch, off, off+1, 0);
3264 off++;
3265 continue;
3266 }
3267
3268 /* A potential strikethrough start/end. */
3269 if(ch == _T('~')) {
3270 OFF tmp = off+1;
3271
3272 while(tmp < line_end && CH(tmp) == _T('~'))
3273 tmp++;
3274
3275 if(tmp - off < 3) {
3276 unsigned flags = 0;
3277
3278 if(tmp < line_end && !ISUNICODEWHITESPACE(tmp))
3279 flags |= MD_MARK_POTENTIAL_OPENER;
3280 if(off > line->beg && !ISUNICODEWHITESPACEBEFORE(off))
3281 flags |= MD_MARK_POTENTIAL_CLOSER;
3282 if(flags != 0)
3283 PUSH_MARK(ch, off, tmp, flags);
3284 }
3285
3286 off = tmp;
3287 continue;
3288 }
3289
3290 /* A potential equation start/end */
3291 if(ch == _T('$')) {
3292 /* We can have at most two consecutive $ signs,
3293 * where two dollar signs signify a display equation. */
3294 OFF tmp = off+1;
3295
3296 while(tmp < line_end && CH(tmp) == _T('$'))
3297 tmp++;
3298
3299 if (tmp - off <= 2)
3300 PUSH_MARK(ch, off, tmp, MD_MARK_POTENTIAL_OPENER | MD_MARK_POTENTIAL_CLOSER);
3301 off = tmp;
3302 continue;
3303 }
3304
3305 /* Turn non-trivial whitespace into single space. */
3306 if(ISWHITESPACE_(ch)) {
3307 OFF tmp = off+1;
3308
3309 while(tmp < line_end && ISWHITESPACE(tmp))
3310 tmp++;
3311
3312 if(tmp - off > 1 || ch != _T(' '))
3313 PUSH_MARK(ch, off, tmp, MD_MARK_RESOLVED);
3314
3315 off = tmp;
3316 continue;
3317 }
3318
3319 /* NULL character. */
3320 if(ch == _T('\0')) {
3321 PUSH_MARK(ch, off, off+1, MD_MARK_RESOLVED);
3322 off++;
3323 continue;
3324 }
3325
3326 off++;
3327 }
3328 }
3329
3330 /* Add a dummy mark at the end of the mark vector to simplify
3331 * process_inlines(). */
3332 PUSH_MARK(127, ctx->size, ctx->size, MD_MARK_RESOLVED);
3333
3334abort:
3335 return ret;
3336}
3337
3338static void
3339md_analyze_bracket(MD_CTX* ctx, int mark_index)
3340{
3341 /* We cannot really resolve links here as for that we would need
3342 * more context. E.g. a following pair of brackets (reference link),
3343 * or enclosing pair of brackets (if the inner is the link, the outer
3344 * one cannot be.)
3345 *
3346 * Therefore we here only construct a list of resolved '[' ']' pairs
3347 * ordered by position of the closer. This allows ur to analyze what is
3348 * or is not link in the right order, from inside to outside in case
3349 * of nested brackets.
3350 *
3351 * The resolving itself is deferred into md_resolve_links().
3352 */
3353
3354 MD_MARK* mark = &ctx->marks[mark_index];
3355
3356 if(mark->flags & MD_MARK_POTENTIAL_OPENER) {
3357 md_mark_chain_append(ctx, chain: &BRACKET_OPENERS, mark_index);
3358 return;
3359 }
3360
3361 if(BRACKET_OPENERS.tail >= 0) {
3362 /* Pop the opener from the chain. */
3363 int opener_index = BRACKET_OPENERS.tail;
3364 MD_MARK* opener = &ctx->marks[opener_index];
3365 if(opener->prev >= 0)
3366 ctx->marks[opener->prev].next = -1;
3367 else
3368 BRACKET_OPENERS.head = -1;
3369 BRACKET_OPENERS.tail = opener->prev;
3370
3371 /* Interconnect the opener and closer. */
3372 opener->next = mark_index;
3373 mark->prev = opener_index;
3374
3375 /* Add the pair into chain of potential links for md_resolve_links().
3376 * Note we misuse opener->prev for this as opener->next points to its
3377 * closer. */
3378 if(ctx->unresolved_link_tail >= 0)
3379 ctx->marks[ctx->unresolved_link_tail].prev = opener_index;
3380 else
3381 ctx->unresolved_link_head = opener_index;
3382 ctx->unresolved_link_tail = opener_index;
3383 opener->prev = -1;
3384 }
3385}
3386
3387/* Forward declaration. */
3388static void md_analyze_link_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
3389 int mark_beg, int mark_end);
3390
3391static int
3392md_resolve_links(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
3393{
3394 int opener_index = ctx->unresolved_link_head;
3395 OFF last_link_beg = 0;
3396 OFF last_link_end = 0;
3397 OFF last_img_beg = 0;
3398 OFF last_img_end = 0;
3399
3400 while(opener_index >= 0) {
3401 MD_MARK* opener = &ctx->marks[opener_index];
3402 int closer_index = opener->next;
3403 MD_MARK* closer = &ctx->marks[closer_index];
3404 int next_index = opener->prev;
3405 MD_MARK* next_opener;
3406 MD_MARK* next_closer;
3407 MD_LINK_ATTR attr;
3408 int is_link = FALSE;
3409
3410 if(next_index >= 0) {
3411 next_opener = &ctx->marks[next_index];
3412 next_closer = &ctx->marks[next_opener->next];
3413 } else {
3414 next_opener = NULL;
3415 next_closer = NULL;
3416 }
3417
3418 /* If nested ("[ [ ] ]"), we need to make sure that:
3419 * - The outer does not end inside of (...) belonging to the inner.
3420 * - The outer cannot be link if the inner is link (i.e. not image).
3421 *
3422 * (Note we here analyze from inner to outer as the marks are ordered
3423 * by closer->beg.)
3424 */
3425 if((opener->beg < last_link_beg && closer->end < last_link_end) ||
3426 (opener->beg < last_img_beg && closer->end < last_img_end) ||
3427 (opener->beg < last_link_end && opener->ch == '['))
3428 {
3429 opener_index = next_index;
3430 continue;
3431 }
3432
3433 /* Recognize and resolve wiki links.
3434 * Wiki-links maybe '[[destination]]' or '[[destination|label]]'.
3435 */
3436 if ((ctx->parser.flags & MD_FLAG_WIKILINKS) &&
3437 (opener->end - opener->beg == 1) && /* not image */
3438 next_opener != NULL && /* double '[' opener */
3439 next_opener->ch == '[' &&
3440 (next_opener->beg == opener->beg - 1) &&
3441 (next_opener->end - next_opener->beg == 1) &&
3442 next_closer != NULL && /* double ']' closer */
3443 next_closer->ch == ']' &&
3444 (next_closer->beg == closer->beg + 1) &&
3445 (next_closer->end - next_closer->beg == 1))
3446 {
3447 MD_MARK* delim = NULL;
3448 int delim_index;
3449 OFF dest_beg, dest_end;
3450
3451 is_link = TRUE;
3452
3453 /* We don't allow destination to be longer than 100 characters.
3454 * Lets scan to see whether there is '|'. (If not then the whole
3455 * wiki-link has to be below the 100 characters.) */
3456 delim_index = opener_index + 1;
3457 while(delim_index < closer_index) {
3458 MD_MARK* m = &ctx->marks[delim_index];
3459 if(m->ch == '|') {
3460 delim = m;
3461 break;
3462 }
3463 if(m->ch != 'D' && m->beg - opener->end > 100)
3464 break;
3465 delim_index++;
3466 }
3467 dest_beg = opener->end;
3468 dest_end = (delim != NULL) ? delim->beg : closer->beg;
3469 if(dest_end - dest_beg == 0 || dest_end - dest_beg > 100)
3470 is_link = FALSE;
3471
3472 /* There may not be any new line in the destination. */
3473 if(is_link) {
3474 OFF off;
3475 for(off = dest_beg; off < dest_end; off++) {
3476 if(ISNEWLINE(off)) {
3477 is_link = FALSE;
3478 break;
3479 }
3480 }
3481 }
3482
3483 if(is_link) {
3484 if(delim != NULL) {
3485 if(delim->end < closer->beg) {
3486 opener->end = delim->beg;
3487 } else {
3488 /* The pipe is just before the closer: [[foo|]] */
3489 closer->beg = delim->beg;
3490 delim = NULL;
3491 }
3492 }
3493
3494 opener->beg = next_opener->beg;
3495 opener->next = closer_index;
3496 opener->flags |= MD_MARK_OPENER | MD_MARK_RESOLVED;
3497
3498 closer->end = next_closer->end;
3499 closer->prev = opener_index;
3500 closer->flags |= MD_MARK_CLOSER | MD_MARK_RESOLVED;
3501
3502 last_link_beg = opener->beg;
3503 last_link_end = closer->end;
3504
3505 if(delim != NULL) {
3506 delim->flags |= MD_MARK_RESOLVED;
3507 md_rollback(ctx, opener_index, closer_index: delim_index, MD_ROLLBACK_ALL);
3508 md_analyze_link_contents(ctx, lines, n_lines, mark_beg: opener_index+1, mark_end: closer_index);
3509 } else {
3510 md_rollback(ctx, opener_index, closer_index, MD_ROLLBACK_ALL);
3511 }
3512
3513 opener_index = next_opener->prev;
3514 continue;
3515 }
3516 }
3517
3518 if(next_opener != NULL && next_opener->beg == closer->end) {
3519 if(next_closer->beg > closer->end + 1) {
3520 /* Might be full reference link. */
3521 is_link = md_is_link_reference(ctx, lines, n_lines, beg: next_opener->beg, end: next_closer->end, attr: &attr);
3522 } else {
3523 /* Might be shortcut reference link. */
3524 is_link = md_is_link_reference(ctx, lines, n_lines, beg: opener->beg, end: closer->end, attr: &attr);
3525 }
3526
3527 if(is_link < 0)
3528 return -1;
3529
3530 if(is_link) {
3531 /* Eat the 2nd "[...]". */
3532 closer->end = next_closer->end;
3533
3534 /* Do not analyze the label as a standalone link in the next
3535 * iteration. */
3536 next_index = ctx->marks[next_index].prev;
3537 }
3538 } else {
3539 if(closer->end < ctx->size && CH(closer->end) == _T('(')) {
3540 /* Might be inline link. */
3541 OFF inline_link_end = UINT_MAX;
3542
3543 is_link = md_is_inline_link_spec(ctx, lines, n_lines, beg: closer->end, p_end: &inline_link_end, attr: &attr);
3544 if(is_link < 0)
3545 return -1;
3546
3547 /* Check the closing ')' is not inside an already resolved range
3548 * (i.e. a range with a higher priority), e.g. a code span. */
3549 if(is_link) {
3550 int i = closer_index + 1;
3551
3552 while(i < ctx->n_marks) {
3553 MD_MARK* mark = &ctx->marks[i];
3554
3555 if(mark->beg >= inline_link_end)
3556 break;
3557 if((mark->flags & (MD_MARK_OPENER | MD_MARK_RESOLVED)) == (MD_MARK_OPENER | MD_MARK_RESOLVED)) {
3558 if(ctx->marks[mark->next].beg >= inline_link_end) {
3559 /* Cancel the link status. */
3560 if(attr.title_needs_free)
3561 free(ptr: attr.title);
3562 is_link = FALSE;
3563 break;
3564 }
3565
3566 i = mark->next + 1;
3567 } else {
3568 i++;
3569 }
3570 }
3571 }
3572
3573 if(is_link) {
3574 /* Eat the "(...)" */
3575 closer->end = inline_link_end;
3576 }
3577 }
3578
3579 if(!is_link) {
3580 /* Might be collapsed reference link. */
3581 is_link = md_is_link_reference(ctx, lines, n_lines, beg: opener->beg, end: closer->end, attr: &attr);
3582 if(is_link < 0)
3583 return -1;
3584 }
3585 }
3586
3587 if(is_link) {
3588 /* Resolve the brackets as a link. */
3589 opener->flags |= MD_MARK_OPENER | MD_MARK_RESOLVED;
3590 closer->flags |= MD_MARK_CLOSER | MD_MARK_RESOLVED;
3591
3592 /* If it is a link, we store the destination and title in the two
3593 * dummy marks after the opener. */
3594 MD_ASSERT(ctx->marks[opener_index+1].ch == 'D');
3595 ctx->marks[opener_index+1].beg = attr.dest_beg;
3596 ctx->marks[opener_index+1].end = attr.dest_end;
3597
3598 MD_ASSERT(ctx->marks[opener_index+2].ch == 'D');
3599 md_mark_store_ptr(ctx, mark_index: opener_index+2, ptr: attr.title);
3600 /* The title might or might not have been allocated for us. */
3601 if(attr.title_needs_free)
3602 md_mark_chain_append(ctx, chain: &PTR_CHAIN, mark_index: opener_index+2);
3603 ctx->marks[opener_index+2].prev = attr.title_size;
3604
3605 if(opener->ch == '[') {
3606 last_link_beg = opener->beg;
3607 last_link_end = closer->end;
3608 } else {
3609 last_img_beg = opener->beg;
3610 last_img_end = closer->end;
3611 }
3612
3613 md_analyze_link_contents(ctx, lines, n_lines, mark_beg: opener_index+1, mark_end: closer_index);
3614
3615 /* If the link text is formed by nothing but permissive autolink,
3616 * suppress the autolink.
3617 * See https://github.com/mity/md4c/issues/152 for more info. */
3618 if(ctx->parser.flags & MD_FLAG_PERMISSIVEAUTOLINKS) {
3619 MD_MARK* first_nested;
3620 MD_MARK* last_nested;
3621
3622 first_nested = opener + 1;
3623 while(first_nested->ch == _T('D') && first_nested < closer)
3624 first_nested++;
3625
3626 last_nested = closer - 1;
3627 while(first_nested->ch == _T('D') && last_nested > opener)
3628 last_nested--;
3629
3630 if((first_nested->flags & MD_MARK_RESOLVED) &&
3631 first_nested->beg == opener->end &&
3632 ISANYOF_(first_nested->ch, _T("@:.")) &&
3633 first_nested->next == (last_nested - ctx->marks) &&
3634 last_nested->end == closer->beg)
3635 {
3636 first_nested->ch = _T('D');
3637 first_nested->flags &= ~MD_MARK_RESOLVED;
3638 last_nested->ch = _T('D');
3639 last_nested->flags &= ~MD_MARK_RESOLVED;
3640 }
3641 }
3642 }
3643
3644 opener_index = next_index;
3645 }
3646
3647 return 0;
3648}
3649
3650/* Analyze whether the mark '&' starts a HTML entity.
3651 * If so, update its flags as well as flags of corresponding closer ';'. */
3652static void
3653md_analyze_entity(MD_CTX* ctx, int mark_index)
3654{
3655 MD_MARK* opener = &ctx->marks[mark_index];
3656 MD_MARK* closer;
3657 OFF off;
3658
3659 /* Cannot be entity if there is no closer as the next mark.
3660 * (Any other mark between would mean strange character which cannot be
3661 * part of the entity.
3662 *
3663 * So we can do all the work on '&' and do not call this later for the
3664 * closing mark ';'.
3665 */
3666 if(mark_index + 1 >= ctx->n_marks)
3667 return;
3668 closer = &ctx->marks[mark_index+1];
3669 if(closer->ch != ';')
3670 return;
3671
3672 if(md_is_entity(ctx, beg: opener->beg, max_end: closer->end, p_end: &off)) {
3673 MD_ASSERT(off == closer->end);
3674
3675 md_resolve_range(ctx, NULL, opener_index: mark_index, closer_index: mark_index+1);
3676 opener->end = closer->end;
3677 }
3678}
3679
3680static void
3681md_analyze_table_cell_boundary(MD_CTX* ctx, int mark_index)
3682{
3683 MD_MARK* mark = &ctx->marks[mark_index];
3684 mark->flags |= MD_MARK_RESOLVED;
3685
3686 md_mark_chain_append(ctx, chain: &TABLECELLBOUNDARIES, mark_index);
3687 ctx->n_table_cell_boundaries++;
3688}
3689
3690/* Split a longer mark into two. The new mark takes the given count of
3691 * characters. May only be called if an adequate number of dummy 'D' marks
3692 * follows.
3693 */
3694static int
3695md_split_emph_mark(MD_CTX* ctx, int mark_index, SZ n)
3696{
3697 MD_MARK* mark = &ctx->marks[mark_index];
3698 int new_mark_index = mark_index + (mark->end - mark->beg - n);
3699 MD_MARK* dummy = &ctx->marks[new_mark_index];
3700
3701 MD_ASSERT(mark->end - mark->beg > n);
3702 MD_ASSERT(dummy->ch == 'D');
3703
3704 memcpy(dest: dummy, src: mark, n: sizeof(MD_MARK));
3705 mark->end -= n;
3706 dummy->beg = mark->end;
3707
3708 return new_mark_index;
3709}
3710
3711static void
3712md_analyze_emph(MD_CTX* ctx, int mark_index)
3713{
3714 MD_MARK* mark = &ctx->marks[mark_index];
3715 MD_MARKCHAIN* chain = md_mark_chain(ctx, mark_index);
3716
3717 /* If we can be a closer, try to resolve with the preceding opener. */
3718 if(mark->flags & MD_MARK_POTENTIAL_CLOSER) {
3719 MD_MARK* opener = NULL;
3720 int opener_index = 0;
3721
3722 if(mark->ch == _T('*')) {
3723 MD_MARKCHAIN* opener_chains[6];
3724 int i, n_opener_chains;
3725 unsigned flags = mark->flags;
3726
3727 /* Apply the "rule of three". */
3728 n_opener_chains = 0;
3729 opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_intraword_mod3_0;
3730 if((flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_2)
3731 opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_intraword_mod3_1;
3732 if((flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_1)
3733 opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_intraword_mod3_2;
3734 opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_extraword_mod3_0;
3735 if(!(flags & MD_MARK_EMPH_INTRAWORD) || (flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_2)
3736 opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_extraword_mod3_1;
3737 if(!(flags & MD_MARK_EMPH_INTRAWORD) || (flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_1)
3738 opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_extraword_mod3_2;
3739
3740 /* Opener is the most recent mark from the allowed chains. */
3741 for(i = 0; i < n_opener_chains; i++) {
3742 if(opener_chains[i]->tail >= 0) {
3743 int tmp_index = opener_chains[i]->tail;
3744 MD_MARK* tmp_mark = &ctx->marks[tmp_index];
3745 if(opener == NULL || tmp_mark->end > opener->end) {
3746 opener_index = tmp_index;
3747 opener = tmp_mark;
3748 }
3749 }
3750 }
3751 } else {
3752 /* Simple emph. mark */
3753 if(chain->tail >= 0) {
3754 opener_index = chain->tail;
3755 opener = &ctx->marks[opener_index];
3756 }
3757 }
3758
3759 /* Resolve, if we have found matching opener. */
3760 if(opener != NULL) {
3761 SZ opener_size = opener->end - opener->beg;
3762 SZ closer_size = mark->end - mark->beg;
3763 MD_MARKCHAIN* opener_chain = md_mark_chain(ctx, mark_index: opener_index);
3764
3765 if(opener_size > closer_size) {
3766 opener_index = md_split_emph_mark(ctx, mark_index: opener_index, n: closer_size);
3767 md_mark_chain_append(ctx, chain: opener_chain, mark_index: opener_index);
3768 } else if(opener_size < closer_size) {
3769 md_split_emph_mark(ctx, mark_index, n: closer_size - opener_size);
3770 }
3771
3772 md_rollback(ctx, opener_index, closer_index: mark_index, MD_ROLLBACK_CROSSING);
3773 md_resolve_range(ctx, chain: opener_chain, opener_index, closer_index: mark_index);
3774 return;
3775 }
3776 }
3777
3778 /* If we could not resolve as closer, we may be yet be an opener. */
3779 if(mark->flags & MD_MARK_POTENTIAL_OPENER)
3780 md_mark_chain_append(ctx, chain, mark_index);
3781}
3782
3783static void
3784md_analyze_tilde(MD_CTX* ctx, int mark_index)
3785{
3786 MD_MARK* mark = &ctx->marks[mark_index];
3787 MD_MARKCHAIN* chain = md_mark_chain(ctx, mark_index);
3788
3789 /* We attempt to be Github Flavored Markdown compatible here. GFM accepts
3790 * only tildes sequences of length 1 and 2, and the length of the opener
3791 * and closer has to match. */
3792
3793 if((mark->flags & MD_MARK_POTENTIAL_CLOSER) && chain->head >= 0) {
3794 int opener_index = chain->head;
3795
3796 md_rollback(ctx, opener_index, closer_index: mark_index, MD_ROLLBACK_CROSSING);
3797 md_resolve_range(ctx, chain, opener_index, closer_index: mark_index);
3798 return;
3799 }
3800
3801 if(mark->flags & MD_MARK_POTENTIAL_OPENER)
3802 md_mark_chain_append(ctx, chain, mark_index);
3803}
3804
3805static void
3806md_analyze_dollar(MD_CTX* ctx, int mark_index)
3807{
3808 /* This should mimic the way inline equations work in LaTeX, so there
3809 * can only ever be one item in the chain (i.e. the dollars can't be
3810 * nested). This is basically the same as the md_analyze_tilde function,
3811 * except that we require matching openers and closers to be of the same
3812 * length.
3813 *
3814 * E.g.: $abc$$def$$ => abc (display equation) def (end equation) */
3815 if(DOLLAR_OPENERS.head >= 0) {
3816 /* If the potential closer has a non-matching number of $, discard */
3817 MD_MARK* open = &ctx->marks[DOLLAR_OPENERS.head];
3818 MD_MARK* close = &ctx->marks[mark_index];
3819
3820 int opener_index = DOLLAR_OPENERS.head;
3821 md_rollback(ctx, opener_index, closer_index: mark_index, MD_ROLLBACK_ALL);
3822 if (open->end - open->beg == close->end - close->beg) {
3823 /* We are the matching closer */
3824 md_resolve_range(ctx, chain: &DOLLAR_OPENERS, opener_index, closer_index: mark_index);
3825 } else {
3826 /* We don't match the opener, so discard old opener and insert as opener */
3827 md_mark_chain_append(ctx, chain: &DOLLAR_OPENERS, mark_index);
3828 }
3829 } else {
3830 /* No unmatched openers, so we are opener */
3831 md_mark_chain_append(ctx, chain: &DOLLAR_OPENERS, mark_index);
3832 }
3833}
3834
3835static void
3836md_analyze_permissive_url_autolink(MD_CTX* ctx, int mark_index)
3837{
3838 MD_MARK* opener = &ctx->marks[mark_index];
3839 int closer_index = mark_index + 1;
3840 MD_MARK* closer = &ctx->marks[closer_index];
3841 MD_MARK* next_resolved_mark;
3842 OFF off = opener->end;
3843 int n_dots = FALSE;
3844 int has_underscore_in_last_seg = FALSE;
3845 int has_underscore_in_next_to_last_seg = FALSE;
3846 int n_opened_parenthesis = 0;
3847 int n_excess_parenthesis = 0;
3848
3849 /* Check for domain. */
3850 while(off < ctx->size) {
3851 if(ISALNUM(off) || CH(off) == _T('-')) {
3852 off++;
3853 } else if(CH(off) == _T('.')) {
3854 /* We must see at least one period. */
3855 n_dots++;
3856 has_underscore_in_next_to_last_seg = has_underscore_in_last_seg;
3857 has_underscore_in_last_seg = FALSE;
3858 off++;
3859 } else if(CH(off) == _T('_')) {
3860 /* No underscore may be present in the last two domain segments. */
3861 has_underscore_in_last_seg = TRUE;
3862 off++;
3863 } else {
3864 break;
3865 }
3866 }
3867 if(off > opener->end && CH(off-1) == _T('.')) {
3868 off--;
3869 n_dots--;
3870 }
3871 if(off <= opener->end || n_dots == 0 || has_underscore_in_next_to_last_seg || has_underscore_in_last_seg)
3872 return;
3873
3874 /* Check for path. */
3875 next_resolved_mark = closer + 1;
3876 while(next_resolved_mark->ch == 'D' || !(next_resolved_mark->flags & MD_MARK_RESOLVED))
3877 next_resolved_mark++;
3878 while(off < next_resolved_mark->beg && CH(off) != _T('<') && !ISWHITESPACE(off) && !ISNEWLINE(off)) {
3879 /* Parenthesis must be balanced. */
3880 if(CH(off) == _T('(')) {
3881 n_opened_parenthesis++;
3882 } else if(CH(off) == _T(')')) {
3883 if(n_opened_parenthesis > 0)
3884 n_opened_parenthesis--;
3885 else
3886 n_excess_parenthesis++;
3887 }
3888
3889 off++;
3890 }
3891
3892 /* Trim a trailing punctuation from the end. */
3893 while(TRUE) {
3894 if(ISANYOF(off-1, _T("?!.,:*_~"))) {
3895 off--;
3896 } else if(CH(off-1) == ')' && n_excess_parenthesis > 0) {
3897 /* Unmatched ')' can be in an interior of the path but not at the
3898 * of it, so the auto-link may be safely nested in a parenthesis
3899 * pair. */
3900 off--;
3901 n_excess_parenthesis--;
3902 } else {
3903 break;
3904 }
3905 }
3906
3907 /* Ok. Lets call it an auto-link. Adapt opener and create closer to zero
3908 * length so all the contents becomes the link text. */
3909 MD_ASSERT(closer->ch == 'D');
3910 opener->end = opener->beg;
3911 closer->ch = opener->ch;
3912 closer->beg = off;
3913 closer->end = off;
3914 md_resolve_range(ctx, NULL, opener_index: mark_index, closer_index);
3915}
3916
3917/* The permissive autolinks do not have to be enclosed in '<' '>' but we
3918 * instead impose stricter rules what is understood as an e-mail address
3919 * here. Actually any non-alphanumeric characters with exception of '.'
3920 * are prohibited both in username and after '@'. */
3921static void
3922md_analyze_permissive_email_autolink(MD_CTX* ctx, int mark_index)
3923{
3924 MD_MARK* opener = &ctx->marks[mark_index];
3925 int closer_index;
3926 MD_MARK* closer;
3927 OFF beg = opener->beg;
3928 OFF end = opener->end;
3929 int dot_count = 0;
3930
3931 MD_ASSERT(CH(beg) == _T('@'));
3932
3933 /* Scan for name before '@'. */
3934 while(beg > 0 && (ISALNUM(beg-1) || ISANYOF(beg-1, _T(".-_+"))))
3935 beg--;
3936
3937 /* Scan for domain after '@'. */
3938 while(end < ctx->size && (ISALNUM(end) || ISANYOF(end, _T(".-_")))) {
3939 if(CH(end) == _T('.'))
3940 dot_count++;
3941 end++;
3942 }
3943 if(CH(end-1) == _T('.')) { /* Final '.' not part of it. */
3944 dot_count--;
3945 end--;
3946 }
3947 else if(ISANYOF2(end-1, _T('-'), _T('_'))) /* These are forbidden at the end. */
3948 return;
3949 if(CH(end-1) == _T('@') || dot_count == 0)
3950 return;
3951
3952 /* Ok. Lets call it auto-link. Adapt opener and create closer to zero
3953 * length so all the contents becomes the link text. */
3954 closer_index = mark_index + 1;
3955 closer = &ctx->marks[closer_index];
3956 MD_ASSERT(closer->ch == 'D');
3957
3958 opener->beg = beg;
3959 opener->end = beg;
3960 closer->ch = opener->ch;
3961 closer->beg = end;
3962 closer->end = end;
3963 md_resolve_range(ctx, NULL, opener_index: mark_index, closer_index);
3964}
3965
3966static inline void
3967md_analyze_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
3968 int mark_beg, int mark_end, const CHAR* mark_chars)
3969{
3970 int i = mark_beg;
3971 MD_UNUSED(lines);
3972 MD_UNUSED(n_lines);
3973
3974 while(i < mark_end) {
3975 MD_MARK* mark = &ctx->marks[i];
3976
3977 /* Skip resolved spans. */
3978 if(mark->flags & MD_MARK_RESOLVED) {
3979 if(mark->flags & MD_MARK_OPENER) {
3980 MD_ASSERT(i < mark->next);
3981 i = mark->next + 1;
3982 } else {
3983 i++;
3984 }
3985 continue;
3986 }
3987
3988 /* Skip marks we do not want to deal with. */
3989 if(!ISANYOF_(mark->ch, mark_chars)) {
3990 i++;
3991 continue;
3992 }
3993
3994 /* Analyze the mark. */
3995 switch(mark->ch) {
3996 case '[': /* Pass through. */
3997 case '!': /* Pass through. */
3998 case ']': md_analyze_bracket(ctx, mark_index: i); break;
3999 case '&': md_analyze_entity(ctx, mark_index: i); break;
4000 case '|': md_analyze_table_cell_boundary(ctx, mark_index: i); break;
4001 case '_': /* Pass through. */
4002 case '*': md_analyze_emph(ctx, mark_index: i); break;
4003 case '~': md_analyze_tilde(ctx, mark_index: i); break;
4004 case '$': md_analyze_dollar(ctx, mark_index: i); break;
4005 case '.': /* Pass through. */
4006 case ':': md_analyze_permissive_url_autolink(ctx, mark_index: i); break;
4007 case '@': md_analyze_permissive_email_autolink(ctx, mark_index: i); break;
4008 }
4009
4010 i++;
4011 }
4012}
4013
4014/* Analyze marks (build ctx->marks). */
4015static int
4016md_analyze_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int table_mode)
4017{
4018 int ret;
4019
4020 /* Reset the previously collected stack of marks. */
4021 ctx->n_marks = 0;
4022
4023 /* Collect all marks. */
4024 MD_CHECK(md_collect_marks(ctx, lines, n_lines, table_mode));
4025
4026 /* We analyze marks in few groups to handle their precedence. */
4027 /* (1) Entities; code spans; autolinks; raw HTML. */
4028 md_analyze_marks(ctx, lines, n_lines, mark_beg: 0, mark_end: ctx->n_marks, _T("&"));
4029
4030 /* (2) Links. */
4031 md_analyze_marks(ctx, lines, n_lines, mark_beg: 0, mark_end: ctx->n_marks, _T("[]!"));
4032 MD_CHECK(md_resolve_links(ctx, lines, n_lines));
4033 BRACKET_OPENERS.head = -1;
4034 BRACKET_OPENERS.tail = -1;
4035 ctx->unresolved_link_head = -1;
4036 ctx->unresolved_link_tail = -1;
4037
4038 if(table_mode) {
4039 /* (3) Analyze table cell boundaries.
4040 * Note we reset TABLECELLBOUNDARIES chain prior to the call md_analyze_marks(),
4041 * not after, because caller may need it. */
4042 MD_ASSERT(n_lines == 1);
4043 TABLECELLBOUNDARIES.head = -1;
4044 TABLECELLBOUNDARIES.tail = -1;
4045 ctx->n_table_cell_boundaries = 0;
4046 md_analyze_marks(ctx, lines, n_lines, mark_beg: 0, mark_end: ctx->n_marks, _T("|"));
4047 return ret;
4048 }
4049
4050 /* (4) Emphasis and strong emphasis; permissive autolinks. */
4051 md_analyze_link_contents(ctx, lines, n_lines, mark_beg: 0, mark_end: ctx->n_marks);
4052
4053abort:
4054 return ret;
4055}
4056
4057static void
4058md_analyze_link_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
4059 int mark_beg, int mark_end)
4060{
4061 int i;
4062
4063 md_analyze_marks(ctx, lines, n_lines, mark_beg, mark_end, _T("*_~$@:."));
4064
4065 for(i = OPENERS_CHAIN_FIRST; i <= OPENERS_CHAIN_LAST; i++) {
4066 ctx->mark_chains[i].head = -1;
4067 ctx->mark_chains[i].tail = -1;
4068 }
4069}
4070
4071static int
4072md_enter_leave_span_a(MD_CTX* ctx, int enter, MD_SPANTYPE type,
4073 const CHAR* dest, SZ dest_size, int prohibit_escapes_in_dest,
4074 const CHAR* title, SZ title_size)
4075{
4076 MD_ATTRIBUTE_BUILD href_build = { 0 };
4077 MD_ATTRIBUTE_BUILD title_build = { 0 };
4078 MD_SPAN_A_DETAIL det;
4079 int ret = 0;
4080
4081 /* Note we here rely on fact that MD_SPAN_A_DETAIL and
4082 * MD_SPAN_IMG_DETAIL are binary-compatible. */
4083 memset(s: &det, c: 0, n: sizeof(MD_SPAN_A_DETAIL));
4084 MD_CHECK(md_build_attribute(ctx, dest, dest_size,
4085 (prohibit_escapes_in_dest ? MD_BUILD_ATTR_NO_ESCAPES : 0),
4086 &det.href, &href_build));
4087 MD_CHECK(md_build_attribute(ctx, title, title_size, 0, &det.title, &title_build));
4088
4089 if(enter)
4090 MD_ENTER_SPAN(type, &det);
4091 else
4092 MD_LEAVE_SPAN(type, &det);
4093
4094abort:
4095 md_free_attribute(ctx, build: &href_build);
4096 md_free_attribute(ctx, build: &title_build);
4097 return ret;
4098}
4099
4100static int
4101md_enter_leave_span_wikilink(MD_CTX* ctx, int enter, const CHAR* target, SZ target_size)
4102{
4103 MD_ATTRIBUTE_BUILD target_build = { 0 };
4104 MD_SPAN_WIKILINK_DETAIL det;
4105 int ret = 0;
4106
4107 memset(s: &det, c: 0, n: sizeof(MD_SPAN_WIKILINK_DETAIL));
4108 MD_CHECK(md_build_attribute(ctx, target, target_size, 0, &det.target, &target_build));
4109
4110 if (enter)
4111 MD_ENTER_SPAN(MD_SPAN_WIKILINK, &det);
4112 else
4113 MD_LEAVE_SPAN(MD_SPAN_WIKILINK, &det);
4114
4115abort:
4116 md_free_attribute(ctx, build: &target_build);
4117 return ret;
4118}
4119
4120
4121/* Render the output, accordingly to the analyzed ctx->marks. */
4122static int
4123md_process_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
4124{
4125 MD_TEXTTYPE text_type;
4126 const MD_LINE* line = lines;
4127 MD_MARK* prev_mark = NULL;
4128 MD_MARK* mark;
4129 OFF off = lines[0].beg;
4130 OFF end = lines[n_lines-1].end;
4131 int enforce_hardbreak = 0;
4132 int ret = 0;
4133
4134 /* Find first resolved mark. Note there is always at least one resolved
4135 * mark, the dummy last one after the end of the latest line we actually
4136 * never really reach. This saves us of a lot of special checks and cases
4137 * in this function. */
4138 mark = ctx->marks;
4139 while(!(mark->flags & MD_MARK_RESOLVED))
4140 mark++;
4141
4142 text_type = MD_TEXT_NORMAL;
4143
4144 while(1) {
4145 /* Process the text up to the next mark or end-of-line. */
4146 OFF tmp = (line->end < mark->beg ? line->end : mark->beg);
4147 if(tmp > off) {
4148 MD_TEXT(text_type, STR(off), tmp - off);
4149 off = tmp;
4150 }
4151
4152 /* If reached the mark, process it and move to next one. */
4153 if(off >= mark->beg) {
4154 switch(mark->ch) {
4155 case '\\': /* Backslash escape. */
4156 if(ISNEWLINE(mark->beg+1))
4157 enforce_hardbreak = 1;
4158 else
4159 MD_TEXT(text_type, STR(mark->beg+1), 1);
4160 break;
4161
4162 case ' ': /* Non-trivial space. */
4163 MD_TEXT(text_type, _T(" "), 1);
4164 break;
4165
4166 case '`': /* Code span. */
4167 if(mark->flags & MD_MARK_OPENER) {
4168 MD_ENTER_SPAN(MD_SPAN_CODE, NULL);
4169 text_type = MD_TEXT_CODE;
4170 } else {
4171 MD_LEAVE_SPAN(MD_SPAN_CODE, NULL);
4172 text_type = MD_TEXT_NORMAL;
4173 }
4174 break;
4175
4176 case '_': /* Underline (or emphasis if we fall through). */
4177 if(ctx->parser.flags & MD_FLAG_UNDERLINE) {
4178 if(mark->flags & MD_MARK_OPENER) {
4179 while(off < mark->end) {
4180 MD_ENTER_SPAN(MD_SPAN_U, NULL);
4181 off++;
4182 }
4183 } else {
4184 while(off < mark->end) {
4185 MD_LEAVE_SPAN(MD_SPAN_U, NULL);
4186 off++;
4187 }
4188 }
4189 break;
4190 }
4191 MD_FALLTHROUGH();
4192
4193 case '*': /* Emphasis, strong emphasis. */
4194 if(mark->flags & MD_MARK_OPENER) {
4195 if((mark->end - off) % 2) {
4196 MD_ENTER_SPAN(MD_SPAN_EM, NULL);
4197 off++;
4198 }
4199 while(off + 1 < mark->end) {
4200 MD_ENTER_SPAN(MD_SPAN_STRONG, NULL);
4201 off += 2;
4202 }
4203 } else {
4204 while(off + 1 < mark->end) {
4205 MD_LEAVE_SPAN(MD_SPAN_STRONG, NULL);
4206 off += 2;
4207 }
4208 if((mark->end - off) % 2) {
4209 MD_LEAVE_SPAN(MD_SPAN_EM, NULL);
4210 off++;
4211 }
4212 }
4213 break;
4214
4215 case '~':
4216 if(mark->flags & MD_MARK_OPENER)
4217 MD_ENTER_SPAN(MD_SPAN_DEL, NULL);
4218 else
4219 MD_LEAVE_SPAN(MD_SPAN_DEL, NULL);
4220 break;
4221
4222 case '$':
4223 if(mark->flags & MD_MARK_OPENER) {
4224 MD_ENTER_SPAN((mark->end - off) % 2 ? MD_SPAN_LATEXMATH : MD_SPAN_LATEXMATH_DISPLAY, NULL);
4225 text_type = MD_TEXT_LATEXMATH;
4226 } else {
4227 MD_LEAVE_SPAN((mark->end - off) % 2 ? MD_SPAN_LATEXMATH : MD_SPAN_LATEXMATH_DISPLAY, NULL);
4228 text_type = MD_TEXT_NORMAL;
4229 }
4230 break;
4231
4232 case '[': /* Link, wiki link, image. */
4233 case '!':
4234 case ']':
4235 {
4236 const MD_MARK* opener = (mark->ch != ']' ? mark : &ctx->marks[mark->prev]);
4237 const MD_MARK* closer = &ctx->marks[opener->next];
4238 const MD_MARK* dest_mark;
4239 const MD_MARK* title_mark;
4240
4241 if ((opener->ch == '[' && closer->ch == ']') &&
4242 opener->end - opener->beg >= 2 &&
4243 closer->end - closer->beg >= 2)
4244 {
4245 int has_label = (opener->end - opener->beg > 2);
4246 SZ target_sz;
4247
4248 if(has_label)
4249 target_sz = opener->end - (opener->beg+2);
4250 else
4251 target_sz = closer->beg - opener->end;
4252
4253 MD_CHECK(md_enter_leave_span_wikilink(ctx, (mark->ch != ']'),
4254 has_label ? STR(opener->beg+2) : STR(opener->end),
4255 target_sz));
4256
4257 break;
4258 }
4259
4260 dest_mark = opener+1;
4261 MD_ASSERT(dest_mark->ch == 'D');
4262 title_mark = opener+2;
4263 MD_ASSERT(title_mark->ch == 'D');
4264
4265 MD_CHECK(md_enter_leave_span_a(ctx, (mark->ch != ']'),
4266 (opener->ch == '!' ? MD_SPAN_IMG : MD_SPAN_A),
4267 STR(dest_mark->beg), dest_mark->end - dest_mark->beg, FALSE,
4268 md_mark_get_ptr(ctx, (int)(title_mark - ctx->marks)), title_mark->prev));
4269
4270 /* link/image closer may span multiple lines. */
4271 if(mark->ch == ']') {
4272 while(mark->end > line->end)
4273 line++;
4274 }
4275
4276 break;
4277 }
4278
4279 case '<':
4280 case '>': /* Autolink or raw HTML. */
4281 if(!(mark->flags & MD_MARK_AUTOLINK)) {
4282 /* Raw HTML. */
4283 if(mark->flags & MD_MARK_OPENER)
4284 text_type = MD_TEXT_HTML;
4285 else
4286 text_type = MD_TEXT_NORMAL;
4287 break;
4288 }
4289 /* Pass through, if auto-link. */
4290 MD_FALLTHROUGH();
4291
4292 case '@': /* Permissive e-mail autolink. */
4293 case ':': /* Permissive URL autolink. */
4294 case '.': /* Permissive WWW autolink. */
4295 {
4296 MD_MARK* opener = ((mark->flags & MD_MARK_OPENER) ? mark : &ctx->marks[mark->prev]);
4297 MD_MARK* closer = &ctx->marks[opener->next];
4298 const CHAR* dest = STR(opener->end);
4299 SZ dest_size = closer->beg - opener->end;
4300
4301 /* For permissive auto-links we do not know closer mark
4302 * position at the time of md_collect_marks(), therefore
4303 * it can be out-of-order in ctx->marks[].
4304 *
4305 * With this flag, we make sure that we output the closer
4306 * only if we processed the opener. */
4307 if(mark->flags & MD_MARK_OPENER)
4308 closer->flags |= MD_MARK_VALIDPERMISSIVEAUTOLINK;
4309
4310 if(opener->ch == '@' || opener->ch == '.') {
4311 dest_size += 7;
4312 MD_TEMP_BUFFER(dest_size * sizeof(CHAR));
4313 memcpy(dest: ctx->buffer,
4314 src: (opener->ch == '@' ? _T("mailto:") : _T("http://")),
4315 n: 7 * sizeof(CHAR));
4316 memcpy(dest: ctx->buffer + 7, src: dest, n: (dest_size-7) * sizeof(CHAR));
4317 dest = ctx->buffer;
4318 }
4319
4320 if(closer->flags & MD_MARK_VALIDPERMISSIVEAUTOLINK)
4321 MD_CHECK(md_enter_leave_span_a(ctx, (mark->flags & MD_MARK_OPENER),
4322 MD_SPAN_A, dest, dest_size, TRUE, NULL, 0));
4323 break;
4324 }
4325
4326 case '&': /* Entity. */
4327 MD_TEXT(MD_TEXT_ENTITY, STR(mark->beg), mark->end - mark->beg);
4328 break;
4329
4330 case '\0':
4331 MD_TEXT(MD_TEXT_NULLCHAR, _T(""), 1);
4332 break;
4333
4334 case 127:
4335 goto abort;
4336 }
4337
4338 off = mark->end;
4339
4340 /* Move to next resolved mark. */
4341 prev_mark = mark;
4342 mark++;
4343 while(!(mark->flags & MD_MARK_RESOLVED) || mark->beg < off)
4344 mark++;
4345 }
4346
4347 /* If reached end of line, move to next one. */
4348 if(off >= line->end) {
4349 /* If it is the last line, we are done. */
4350 if(off >= end)
4351 break;
4352
4353 if(text_type == MD_TEXT_CODE || text_type == MD_TEXT_LATEXMATH) {
4354 OFF tmp;
4355
4356 MD_ASSERT(prev_mark != NULL);
4357 MD_ASSERT(ISANYOF2_(prev_mark->ch, '`', '$') && (prev_mark->flags & MD_MARK_OPENER));
4358 MD_ASSERT(ISANYOF2_(mark->ch, '`', '$') && (mark->flags & MD_MARK_CLOSER));
4359
4360 /* Inside a code span, trailing line whitespace has to be
4361 * outputted. */
4362 tmp = off;
4363 while(off < ctx->size && ISBLANK(off))
4364 off++;
4365 if(off > tmp)
4366 MD_TEXT(text_type, STR(tmp), off-tmp);
4367
4368 /* and new lines are transformed into single spaces. */
4369 if(prev_mark->end < off && off < mark->beg)
4370 MD_TEXT(text_type, _T(" "), 1);
4371 } else if(text_type == MD_TEXT_HTML) {
4372 /* Inside raw HTML, we output the new line verbatim, including
4373 * any trailing spaces. */
4374 OFF tmp = off;
4375
4376 while(tmp < end && ISBLANK(tmp))
4377 tmp++;
4378 if(tmp > off)
4379 MD_TEXT(MD_TEXT_HTML, STR(off), tmp - off);
4380 MD_TEXT(MD_TEXT_HTML, _T("\n"), 1);
4381 } else {
4382 /* Output soft or hard line break. */
4383 MD_TEXTTYPE break_type = MD_TEXT_SOFTBR;
4384
4385 if(text_type == MD_TEXT_NORMAL) {
4386 if(enforce_hardbreak)
4387 break_type = MD_TEXT_BR;
4388 else if((CH(line->end) == _T(' ') && CH(line->end+1) == _T(' ')))
4389 break_type = MD_TEXT_BR;
4390 }
4391
4392 MD_TEXT(break_type, _T("\n"), 1);
4393 }
4394
4395 /* Move to the next line. */
4396 line++;
4397 off = line->beg;
4398
4399 enforce_hardbreak = 0;
4400 }
4401 }
4402
4403abort:
4404 return ret;
4405}
4406
4407
4408/***************************
4409 *** Processing Tables ***
4410 ***************************/
4411
4412static void
4413md_analyze_table_alignment(MD_CTX* ctx, OFF beg, OFF end, MD_ALIGN* align, int n_align)
4414{
4415 static const MD_ALIGN align_map[] = { MD_ALIGN_DEFAULT, MD_ALIGN_LEFT, MD_ALIGN_RIGHT, MD_ALIGN_CENTER };
4416 OFF off = beg;
4417
4418 while(n_align > 0) {
4419 int index = 0; /* index into align_map[] */
4420
4421 while(CH(off) != _T('-'))
4422 off++;
4423 if(off > beg && CH(off-1) == _T(':'))
4424 index |= 1;
4425 while(off < end && CH(off) == _T('-'))
4426 off++;
4427 if(off < end && CH(off) == _T(':'))
4428 index |= 2;
4429
4430 *align = align_map[index];
4431 align++;
4432 n_align--;
4433 }
4434
4435}
4436
4437/* Forward declaration. */
4438static int md_process_normal_block_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines);
4439
4440static int
4441md_process_table_cell(MD_CTX* ctx, MD_BLOCKTYPE cell_type, MD_ALIGN align, OFF beg, OFF end)
4442{
4443 MD_LINE line;
4444 MD_BLOCK_TD_DETAIL det;
4445 int ret = 0;
4446
4447 while(beg < end && ISWHITESPACE(beg))
4448 beg++;
4449 while(end > beg && ISWHITESPACE(end-1))
4450 end--;
4451
4452 det.align = align;
4453 line.beg = beg;
4454 line.end = end;
4455
4456 MD_ENTER_BLOCK(cell_type, &det);
4457 MD_CHECK(md_process_normal_block_contents(ctx, &line, 1));
4458 MD_LEAVE_BLOCK(cell_type, &det);
4459
4460abort:
4461 return ret;
4462}
4463
4464static int
4465md_process_table_row(MD_CTX* ctx, MD_BLOCKTYPE cell_type, OFF beg, OFF end,
4466 const MD_ALIGN* align, int col_count)
4467{
4468 MD_LINE line;
4469 OFF* pipe_offs = NULL;
4470 int i, j, k, n;
4471 int ret = 0;
4472
4473 line.beg = beg;
4474 line.end = end;
4475
4476 /* Break the line into table cells by identifying pipe characters who
4477 * form the cell boundary. */
4478 MD_CHECK(md_analyze_inlines(ctx, &line, 1, TRUE));
4479
4480 /* We have to remember the cell boundaries in local buffer because
4481 * ctx->marks[] shall be reused during cell contents processing. */
4482 n = ctx->n_table_cell_boundaries + 2;
4483 pipe_offs = (OFF*) malloc(size: n * sizeof(OFF));
4484 if(pipe_offs == NULL) {
4485 MD_LOG("malloc() failed.");
4486 ret = -1;
4487 goto abort;
4488 }
4489 j = 0;
4490 pipe_offs[j++] = beg;
4491 for(i = TABLECELLBOUNDARIES.head; i >= 0; i = ctx->marks[i].next) {
4492 MD_MARK* mark = &ctx->marks[i];
4493 pipe_offs[j++] = mark->end;
4494 }
4495 pipe_offs[j++] = end+1;
4496
4497 /* Process cells. */
4498 MD_ENTER_BLOCK(MD_BLOCK_TR, NULL);
4499 k = 0;
4500 for(i = 0; i < j-1 && k < col_count; i++) {
4501 if(pipe_offs[i] < pipe_offs[i+1]-1)
4502 MD_CHECK(md_process_table_cell(ctx, cell_type, align[k++], pipe_offs[i], pipe_offs[i+1]-1));
4503 }
4504 /* Make sure we call enough table cells even if the current table contains
4505 * too few of them. */
4506 while(k < col_count)
4507 MD_CHECK(md_process_table_cell(ctx, cell_type, align[k++], 0, 0));
4508 MD_LEAVE_BLOCK(MD_BLOCK_TR, NULL);
4509
4510abort:
4511 free(ptr: pipe_offs);
4512
4513 /* Free any temporary memory blocks stored within some dummy marks. */
4514 for(i = PTR_CHAIN.head; i >= 0; i = ctx->marks[i].next)
4515 free(ptr: md_mark_get_ptr(ctx, mark_index: i));
4516 PTR_CHAIN.head = -1;
4517 PTR_CHAIN.tail = -1;
4518
4519 return ret;
4520}
4521
4522static int
4523md_process_table_block_contents(MD_CTX* ctx, int col_count, const MD_LINE* lines, int n_lines)
4524{
4525 MD_ALIGN* align;
4526 int i;
4527 int ret = 0;
4528
4529 /* At least two lines have to be present: The column headers and the line
4530 * with the underlines. */
4531 MD_ASSERT(n_lines >= 2);
4532
4533 align = malloc(size: col_count * sizeof(MD_ALIGN));
4534 if(align == NULL) {
4535 MD_LOG("malloc() failed.");
4536 ret = -1;
4537 goto abort;
4538 }
4539
4540 md_analyze_table_alignment(ctx, beg: lines[1].beg, end: lines[1].end, align, n_align: col_count);
4541
4542 MD_ENTER_BLOCK(MD_BLOCK_THEAD, NULL);
4543 MD_CHECK(md_process_table_row(ctx, MD_BLOCK_TH,
4544 lines[0].beg, lines[0].end, align, col_count));
4545 MD_LEAVE_BLOCK(MD_BLOCK_THEAD, NULL);
4546
4547 if(n_lines > 2) {
4548 MD_ENTER_BLOCK(MD_BLOCK_TBODY, NULL);
4549 for(i = 2; i < n_lines; i++) {
4550 MD_CHECK(md_process_table_row(ctx, MD_BLOCK_TD,
4551 lines[i].beg, lines[i].end, align, col_count));
4552 }
4553 MD_LEAVE_BLOCK(MD_BLOCK_TBODY, NULL);
4554 }
4555
4556abort:
4557 free(ptr: align);
4558 return ret;
4559}
4560
4561
4562/**************************
4563 *** Processing Block ***
4564 **************************/
4565
/* Bit flags stored in MD_BLOCK::flags. */

/* The record marks the start of a container block. */
#define MD_BLOCK_CONTAINER_OPENER   0x01
/* The record marks the end of a container block. */
#define MD_BLOCK_CONTAINER_CLOSER   0x02
/* Mask for testing whether the record is a container opener and/or closer
 * (as opposed to a leaf block). */
#define MD_BLOCK_CONTAINER          (MD_BLOCK_CONTAINER_OPENER | MD_BLOCK_CONTAINER_CLOSER)
/* The list (MD_BLOCK_UL or MD_BLOCK_OL) is loose rather than tight. */
#define MD_BLOCK_LOOSE_LIST         0x04
/* The MD_BLOCK_H block is a Setext header; its last line is the underline. */
#define MD_BLOCK_SETEXT_HEADER      0x08
4571
/* Header of one block record stored in ctx->block_bytes. A leaf block is
 * immediately followed by n_lines line records (MD_VERBATIMLINE for
 * MD_BLOCK_CODE and MD_BLOCK_HTML, MD_LINE otherwise); a container record
 * (opener or closer) is followed directly by the next block record. */
struct MD_BLOCK_tag {
    MD_BLOCKTYPE type  :  8;
    /* Combination of the MD_BLOCK_xxx flags defined above. */
    unsigned flags     :  8;

    /* MD_BLOCK_H:      Header level (1 - 6)
     * MD_BLOCK_CODE:   Non-zero if fenced, zero if indented.
     * MD_BLOCK_LI:     Task mark character (0 if not task list item, 'x', 'X' or ' ').
     * MD_BLOCK_UL:     List item mark character.
     * MD_BLOCK_OL:     List item mark delimiter character.
     * MD_BLOCK_TABLE:  Column count (as determined by the table underline).
     */
    unsigned data      : 16;

    /* Leaf blocks:     Count of lines (MD_LINE or MD_VERBATIMLINE) on the block.
     * MD_BLOCK_LI:     Task mark offset in the input doc.
     * MD_BLOCK_OL:     Start item number.
     */
    unsigned n_lines;
};
4589
/* State of one container held in ctx->containers[].
 *
 * While blocks are being processed (md_process_all_blocks()), the vector is
 * reused as a stack of the enclosing lists and block quotes, and only
 * is_loose is consulted there. The remaining members are used by code
 * outside this part of the file. */
struct MD_CONTAINER_tag {
    CHAR ch;
    /* Non-zero if paragraphs directly inside the container are to be wrapped
     * in their own MD_BLOCK_P block (loose list or block quote). */
    unsigned is_loose    : 8;
    unsigned is_task     : 8;
    unsigned start;
    unsigned mark_indent;
    unsigned contents_indent;
    OFF block_byte_off;
    OFF task_mark_off;
};
4600
4601
4602static int
4603md_process_normal_block_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
4604{
4605 int i;
4606 int ret;
4607
4608 MD_CHECK(md_analyze_inlines(ctx, lines, n_lines, FALSE));
4609 MD_CHECK(md_process_inlines(ctx, lines, n_lines));
4610
4611abort:
4612 /* Free any temporary memory blocks stored within some dummy marks. */
4613 for(i = PTR_CHAIN.head; i >= 0; i = ctx->marks[i].next)
4614 free(ptr: md_mark_get_ptr(ctx, mark_index: i));
4615 PTR_CHAIN.head = -1;
4616 PTR_CHAIN.tail = -1;
4617
4618 return ret;
4619}
4620
4621static int
4622md_process_verbatim_block_contents(MD_CTX* ctx, MD_TEXTTYPE text_type, const MD_VERBATIMLINE* lines, int n_lines)
4623{
4624 static const CHAR indent_chunk_str[] = _T(" ");
4625 static const SZ indent_chunk_size = SIZEOF_ARRAY(indent_chunk_str) - 1;
4626
4627 int i;
4628 int ret = 0;
4629
4630 for(i = 0; i < n_lines; i++) {
4631 const MD_VERBATIMLINE* line = &lines[i];
4632 int indent = line->indent;
4633
4634 MD_ASSERT(indent >= 0);
4635
4636 /* Output code indentation. */
4637 while(indent > (int) indent_chunk_size) {
4638 MD_TEXT(text_type, indent_chunk_str, indent_chunk_size);
4639 indent -= indent_chunk_size;
4640 }
4641 if(indent > 0)
4642 MD_TEXT(text_type, indent_chunk_str, indent);
4643
4644 /* Output the code line itself. */
4645 MD_TEXT_INSECURE(text_type, STR(line->beg), line->end - line->beg);
4646
4647 /* Enforce end-of-line. */
4648 MD_TEXT(text_type, _T("\n"), 1);
4649 }
4650
4651abort:
4652 return ret;
4653}
4654
4655static int
4656md_process_code_block_contents(MD_CTX* ctx, int is_fenced, const MD_VERBATIMLINE* lines, int n_lines)
4657{
4658 if(is_fenced) {
4659 /* Skip the first line in case of fenced code: It is the fence.
4660 * (Only the starting fence is present due to logic in md_analyze_line().) */
4661 lines++;
4662 n_lines--;
4663 } else {
4664 /* Ignore blank lines at start/end of indented code block. */
4665 while(n_lines > 0 && lines[0].beg == lines[0].end) {
4666 lines++;
4667 n_lines--;
4668 }
4669 while(n_lines > 0 && lines[n_lines-1].beg == lines[n_lines-1].end) {
4670 n_lines--;
4671 }
4672 }
4673
4674 if(n_lines == 0)
4675 return 0;
4676
4677 return md_process_verbatim_block_contents(ctx, text_type: MD_TEXT_CODE, lines, n_lines);
4678}
4679
4680static int
4681md_setup_fenced_code_detail(MD_CTX* ctx, const MD_BLOCK* block, MD_BLOCK_CODE_DETAIL* det,
4682 MD_ATTRIBUTE_BUILD* info_build, MD_ATTRIBUTE_BUILD* lang_build)
4683{
4684 const MD_VERBATIMLINE* fence_line = (const MD_VERBATIMLINE*)(block + 1);
4685 OFF beg = fence_line->beg;
4686 OFF end = fence_line->end;
4687 OFF lang_end;
4688 CHAR fence_ch = CH(fence_line->beg);
4689 int ret = 0;
4690
4691 /* Skip the fence itself. */
4692 while(beg < ctx->size && CH(beg) == fence_ch)
4693 beg++;
4694 /* Trim initial spaces. */
4695 while(beg < ctx->size && CH(beg) == _T(' '))
4696 beg++;
4697
4698 /* Trim trailing spaces. */
4699 while(end > beg && CH(end-1) == _T(' '))
4700 end--;
4701
4702 /* Build info string attribute. */
4703 MD_CHECK(md_build_attribute(ctx, STR(beg), end - beg, 0, &det->info, info_build));
4704
4705 /* Build info string attribute. */
4706 lang_end = beg;
4707 while(lang_end < end && !ISWHITESPACE(lang_end))
4708 lang_end++;
4709 MD_CHECK(md_build_attribute(ctx, STR(beg), lang_end - beg, 0, &det->lang, lang_build));
4710
4711 det->fence_char = fence_ch;
4712
4713abort:
4714 return ret;
4715}
4716
4717static int
4718md_process_leaf_block(MD_CTX* ctx, const MD_BLOCK* block)
4719{
4720 union {
4721 MD_BLOCK_H_DETAIL header;
4722 MD_BLOCK_CODE_DETAIL code;
4723 MD_BLOCK_TABLE_DETAIL table;
4724 } det;
4725 MD_ATTRIBUTE_BUILD info_build;
4726 MD_ATTRIBUTE_BUILD lang_build;
4727 int is_in_tight_list;
4728 int clean_fence_code_detail = FALSE;
4729 int ret = 0;
4730
4731 memset(s: &det, c: 0, n: sizeof(det));
4732
4733 if(ctx->n_containers == 0)
4734 is_in_tight_list = FALSE;
4735 else
4736 is_in_tight_list = !ctx->containers[ctx->n_containers-1].is_loose;
4737
4738 switch(block->type) {
4739 case MD_BLOCK_H:
4740 det.header.level = block->data;
4741 break;
4742
4743 case MD_BLOCK_CODE:
4744 /* For fenced code block, we may need to set the info string. */
4745 if(block->data != 0) {
4746 memset(s: &det.code, c: 0, n: sizeof(MD_BLOCK_CODE_DETAIL));
4747 clean_fence_code_detail = TRUE;
4748 MD_CHECK(md_setup_fenced_code_detail(ctx, block, &det.code, &info_build, &lang_build));
4749 }
4750 break;
4751
4752 case MD_BLOCK_TABLE:
4753 det.table.col_count = block->data;
4754 det.table.head_row_count = 1;
4755 det.table.body_row_count = block->n_lines - 2;
4756 break;
4757
4758 default:
4759 /* Noop. */
4760 break;
4761 }
4762
4763 if(!is_in_tight_list || block->type != MD_BLOCK_P)
4764 MD_ENTER_BLOCK(block->type, (void*) &det);
4765
4766 /* Process the block contents accordingly to is type. */
4767 switch(block->type) {
4768 case MD_BLOCK_HR:
4769 /* noop */
4770 break;
4771
4772 case MD_BLOCK_CODE:
4773 MD_CHECK(md_process_code_block_contents(ctx, (block->data != 0),
4774 (const MD_VERBATIMLINE*)(block + 1), block->n_lines));
4775 break;
4776
4777 case MD_BLOCK_HTML:
4778 MD_CHECK(md_process_verbatim_block_contents(ctx, MD_TEXT_HTML,
4779 (const MD_VERBATIMLINE*)(block + 1), block->n_lines));
4780 break;
4781
4782 case MD_BLOCK_TABLE:
4783 MD_CHECK(md_process_table_block_contents(ctx, block->data,
4784 (const MD_LINE*)(block + 1), block->n_lines));
4785 break;
4786
4787 default:
4788 MD_CHECK(md_process_normal_block_contents(ctx,
4789 (const MD_LINE*)(block + 1), block->n_lines));
4790 break;
4791 }
4792
4793 if(!is_in_tight_list || block->type != MD_BLOCK_P)
4794 MD_LEAVE_BLOCK(block->type, (void*) &det);
4795
4796abort:
4797 if(clean_fence_code_detail) {
4798 md_free_attribute(ctx, build: &info_build);
4799 md_free_attribute(ctx, build: &lang_build);
4800 }
4801 return ret;
4802}
4803
4804static int
4805md_process_all_blocks(MD_CTX* ctx)
4806{
4807 int byte_off = 0;
4808 int ret = 0;
4809
4810 /* ctx->containers now is not needed for detection of lists and list items
4811 * so we reuse it for tracking what lists are loose or tight. We rely
4812 * on the fact the vector is large enough to hold the deepest nesting
4813 * level of lists. */
4814 ctx->n_containers = 0;
4815
4816 while(byte_off < ctx->n_block_bytes) {
4817 MD_BLOCK* block = (MD_BLOCK*)((char*)ctx->block_bytes + byte_off);
4818 union {
4819 MD_BLOCK_UL_DETAIL ul;
4820 MD_BLOCK_OL_DETAIL ol;
4821 MD_BLOCK_LI_DETAIL li;
4822 } det;
4823
4824 switch(block->type) {
4825 case MD_BLOCK_UL:
4826 det.ul.is_tight = (block->flags & MD_BLOCK_LOOSE_LIST) ? FALSE : TRUE;
4827 det.ul.mark = (CHAR) block->data;
4828 break;
4829
4830 case MD_BLOCK_OL:
4831 det.ol.start = block->n_lines;
4832 det.ol.is_tight = (block->flags & MD_BLOCK_LOOSE_LIST) ? FALSE : TRUE;
4833 det.ol.mark_delimiter = (CHAR) block->data;
4834 break;
4835
4836 case MD_BLOCK_LI:
4837 det.li.is_task = (block->data != 0);
4838 det.li.task_mark = (CHAR) block->data;
4839 det.li.task_mark_offset = (OFF) block->n_lines;
4840 break;
4841
4842 default:
4843 /* noop */
4844 break;
4845 }
4846
4847 if(block->flags & MD_BLOCK_CONTAINER) {
4848 if(block->flags & MD_BLOCK_CONTAINER_CLOSER) {
4849 MD_LEAVE_BLOCK(block->type, &det);
4850
4851 if(block->type == MD_BLOCK_UL || block->type == MD_BLOCK_OL || block->type == MD_BLOCK_QUOTE)
4852 ctx->n_containers--;
4853 }
4854
4855 if(block->flags & MD_BLOCK_CONTAINER_OPENER) {
4856 MD_ENTER_BLOCK(block->type, &det);
4857
4858 if(block->type == MD_BLOCK_UL || block->type == MD_BLOCK_OL) {
4859 ctx->containers[ctx->n_containers].is_loose = (block->flags & MD_BLOCK_LOOSE_LIST);
4860 ctx->n_containers++;
4861 } else if(block->type == MD_BLOCK_QUOTE) {
4862 /* This causes that any text in a block quote, even if
4863 * nested inside a tight list item, is wrapped with
4864 * <p>...</p>. */
4865 ctx->containers[ctx->n_containers].is_loose = TRUE;
4866 ctx->n_containers++;
4867 }
4868 }
4869 } else {
4870 MD_CHECK(md_process_leaf_block(ctx, block));
4871
4872 if(block->type == MD_BLOCK_CODE || block->type == MD_BLOCK_HTML)
4873 byte_off += block->n_lines * sizeof(MD_VERBATIMLINE);
4874 else
4875 byte_off += block->n_lines * sizeof(MD_LINE);
4876 }
4877
4878 byte_off += sizeof(MD_BLOCK);
4879 }
4880
4881 ctx->n_block_bytes = 0;
4882
4883abort:
4884 return ret;
4885}
4886
4887
4888/************************************
4889 *** Grouping Lines into Blocks ***
4890 ************************************/
4891
4892static void*
4893md_push_block_bytes(MD_CTX* ctx, int n_bytes)
4894{
4895 void* ptr;
4896
4897 if(ctx->n_block_bytes + n_bytes > ctx->alloc_block_bytes) {
4898 void* new_block_bytes;
4899
4900 ctx->alloc_block_bytes = (ctx->alloc_block_bytes > 0
4901 ? ctx->alloc_block_bytes + ctx->alloc_block_bytes / 2
4902 : 512);
4903 new_block_bytes = realloc(ptr: ctx->block_bytes, size: ctx->alloc_block_bytes);
4904 if(new_block_bytes == NULL) {
4905 MD_LOG("realloc() failed.");
4906 return NULL;
4907 }
4908
4909 /* Fix the ->current_block after the reallocation. */
4910 if(ctx->current_block != NULL) {
4911 OFF off_current_block = (OFF)((char*) ctx->current_block - (char*) ctx->block_bytes);
4912 ctx->current_block = (MD_BLOCK*) ((char*) new_block_bytes + off_current_block);
4913 }
4914
4915 ctx->block_bytes = new_block_bytes;
4916 }
4917
4918 ptr = (char*)ctx->block_bytes + ctx->n_block_bytes;
4919 ctx->n_block_bytes += n_bytes;
4920 return ptr;
4921}
4922
4923static int
4924md_start_new_block(MD_CTX* ctx, const MD_LINE_ANALYSIS* line)
4925{
4926 MD_BLOCK* block;
4927
4928 MD_ASSERT(ctx->current_block == NULL);
4929
4930 block = (MD_BLOCK*) md_push_block_bytes(ctx, n_bytes: sizeof(MD_BLOCK));
4931 if(block == NULL)
4932 return -1;
4933
4934 switch(line->type) {
4935 case MD_LINE_HR:
4936 block->type = MD_BLOCK_HR;
4937 break;
4938
4939 case MD_LINE_ATXHEADER:
4940 case MD_LINE_SETEXTHEADER:
4941 block->type = MD_BLOCK_H;
4942 break;
4943
4944 case MD_LINE_FENCEDCODE:
4945 case MD_LINE_INDENTEDCODE:
4946 block->type = MD_BLOCK_CODE;
4947 break;
4948
4949 case MD_LINE_TEXT:
4950 block->type = MD_BLOCK_P;
4951 break;
4952
4953 case MD_LINE_HTML:
4954 block->type = MD_BLOCK_HTML;
4955 break;
4956
4957 case MD_LINE_BLANK:
4958 case MD_LINE_SETEXTUNDERLINE:
4959 case MD_LINE_TABLEUNDERLINE:
4960 default:
4961 MD_UNREACHABLE();
4962 break;
4963 }
4964
4965 block->flags = 0;
4966 block->data = line->data;
4967 block->n_lines = 0;
4968
4969 ctx->current_block = block;
4970 return 0;
4971}
4972
4973/* Eat from start of current (textual) block any reference definitions and
4974 * remember them so we can resolve any links referring to them.
4975 *
4976 * (Reference definitions can only be at start of it as they cannot break
4977 * a paragraph.)
4978 */
4979static int
4980md_consume_link_reference_definitions(MD_CTX* ctx)
4981{
4982 MD_LINE* lines = (MD_LINE*) (ctx->current_block + 1);
4983 int n_lines = ctx->current_block->n_lines;
4984 int n = 0;
4985
4986 /* Compute how many lines at the start of the block form one or more
4987 * reference definitions. */
4988 while(n < n_lines) {
4989 int n_link_ref_lines;
4990
4991 n_link_ref_lines = md_is_link_reference_definition(ctx,
4992 lines: lines + n, n_lines: n_lines - n);
4993 /* Not a reference definition? */
4994 if(n_link_ref_lines == 0)
4995 break;
4996
4997 /* We fail if it is the ref. def. but it could not be stored due
4998 * a memory allocation error. */
4999 if(n_link_ref_lines < 0)
5000 return -1;
5001
5002 n += n_link_ref_lines;
5003 }
5004
5005 /* If there was at least one reference definition, we need to remove
5006 * its lines from the block, or perhaps even the whole block. */
5007 if(n > 0) {
5008 if(n == n_lines) {
5009 /* Remove complete block. */
5010 ctx->n_block_bytes -= n * sizeof(MD_LINE);
5011 ctx->n_block_bytes -= sizeof(MD_BLOCK);
5012 ctx->current_block = NULL;
5013 } else {
5014 /* Remove just some initial lines from the block. */
5015 memmove(dest: lines, src: lines + n, n: (n_lines - n) * sizeof(MD_LINE));
5016 ctx->current_block->n_lines -= n;
5017 ctx->n_block_bytes -= n * sizeof(MD_LINE);
5018 }
5019 }
5020
5021 return 0;
5022}
5023
5024static int
5025md_end_current_block(MD_CTX* ctx)
5026{
5027 int ret = 0;
5028
5029 if(ctx->current_block == NULL)
5030 return ret;
5031
5032 /* Check whether there is a reference definition. (We do this here instead
5033 * of in md_analyze_line() because reference definition can take multiple
5034 * lines.) */
5035 if(ctx->current_block->type == MD_BLOCK_P ||
5036 (ctx->current_block->type == MD_BLOCK_H && (ctx->current_block->flags & MD_BLOCK_SETEXT_HEADER)))
5037 {
5038 MD_LINE* lines = (MD_LINE*) (ctx->current_block + 1);
5039 if(CH(lines[0].beg) == _T('[')) {
5040 MD_CHECK(md_consume_link_reference_definitions(ctx));
5041 if(ctx->current_block == NULL)
5042 return ret;
5043 }
5044 }
5045
5046 if(ctx->current_block->type == MD_BLOCK_H && (ctx->current_block->flags & MD_BLOCK_SETEXT_HEADER)) {
5047 int n_lines = ctx->current_block->n_lines;
5048
5049 if(n_lines > 1) {
5050 /* Get rid of the underline. */
5051 ctx->current_block->n_lines--;
5052 ctx->n_block_bytes -= sizeof(MD_LINE);
5053 } else {
5054 /* Only the underline has left after eating the ref. defs.
5055 * Keep the line as beginning of a new ordinary paragraph. */
5056 ctx->current_block->type = MD_BLOCK_P;
5057 return 0;
5058 }
5059 }
5060
5061 /* Mark we are not building any block anymore. */
5062 ctx->current_block = NULL;
5063
5064abort:
5065 return ret;
5066}
5067
5068static int
5069md_add_line_into_current_block(MD_CTX* ctx, const MD_LINE_ANALYSIS* analysis)
5070{
5071 MD_ASSERT(ctx->current_block != NULL);
5072
5073 if(ctx->current_block->type == MD_BLOCK_CODE || ctx->current_block->type == MD_BLOCK_HTML) {
5074 MD_VERBATIMLINE* line;
5075
5076 line = (MD_VERBATIMLINE*) md_push_block_bytes(ctx, n_bytes: sizeof(MD_VERBATIMLINE));
5077 if(line == NULL)
5078 return -1;
5079
5080 line->indent = analysis->indent;
5081 line->beg = analysis->beg;
5082 line->end = analysis->end;
5083 } else {
5084 MD_LINE* line;
5085
5086 line = (MD_LINE*) md_push_block_bytes(ctx, n_bytes: sizeof(MD_LINE));
5087 if(line == NULL)
5088 return -1;
5089
5090 line->beg = analysis->beg;
5091 line->end = analysis->end;
5092 }
5093 ctx->current_block->n_lines++;
5094
5095 return 0;
5096}
5097
5098static int
5099md_push_container_bytes(MD_CTX* ctx, MD_BLOCKTYPE type, unsigned start,
5100 unsigned data, unsigned flags)
5101{
5102 MD_BLOCK* block;
5103 int ret = 0;
5104
5105 MD_CHECK(md_end_current_block(ctx));
5106
5107 block = (MD_BLOCK*) md_push_block_bytes(ctx, n_bytes: sizeof(MD_BLOCK));
5108 if(block == NULL)
5109 return -1;
5110
5111 block->type = type;
5112 block->flags = flags;
5113 block->data = data;
5114 block->n_lines = start;
5115
5116abort:
5117 return ret;
5118}
5119
5120
5121
5122/***********************
5123 *** Line Analysis ***
5124 ***********************/
5125
5126static int
5127md_is_hr_line(MD_CTX* ctx, OFF beg, OFF* p_end, OFF* p_killer)
5128{
5129 OFF off = beg + 1;
5130 int n = 1;
5131
5132 while(off < ctx->size && (CH(off) == CH(beg) || CH(off) == _T(' ') || CH(off) == _T('\t'))) {
5133 if(CH(off) == CH(beg))
5134 n++;
5135 off++;
5136 }
5137
5138 if(n < 3) {
5139 *p_killer = off;
5140 return FALSE;
5141 }
5142
5143 /* Nothing else can be present on the line. */
5144 if(off < ctx->size && !ISNEWLINE(off)) {
5145 *p_killer = off;
5146 return FALSE;
5147 }
5148
5149 *p_end = off;
5150 return TRUE;
5151}
5152
5153static int
5154md_is_atxheader_line(MD_CTX* ctx, OFF beg, OFF* p_beg, OFF* p_end, unsigned* p_level)
5155{
5156 int n;
5157 OFF off = beg + 1;
5158
5159 while(off < ctx->size && CH(off) == _T('#') && off - beg < 7)
5160 off++;
5161 n = off - beg;
5162
5163 if(n > 6)
5164 return FALSE;
5165 *p_level = n;
5166
5167 if(!(ctx->parser.flags & MD_FLAG_PERMISSIVEATXHEADERS) && off < ctx->size &&
5168 CH(off) != _T(' ') && CH(off) != _T('\t') && !ISNEWLINE(off))
5169 return FALSE;
5170
5171 while(off < ctx->size && CH(off) == _T(' '))
5172 off++;
5173 *p_beg = off;
5174 *p_end = off;
5175 return TRUE;
5176}
5177
5178static int
5179md_is_setext_underline(MD_CTX* ctx, OFF beg, OFF* p_end, unsigned* p_level)
5180{
5181 OFF off = beg + 1;
5182
5183 while(off < ctx->size && CH(off) == CH(beg))
5184 off++;
5185
5186 /* Optionally, space(s) can follow. */
5187 while(off < ctx->size && CH(off) == _T(' '))
5188 off++;
5189
5190 /* But nothing more is allowed on the line. */
5191 if(off < ctx->size && !ISNEWLINE(off))
5192 return FALSE;
5193
5194 *p_level = (CH(beg) == _T('=') ? 1 : 2);
5195 *p_end = off;
5196 return TRUE;
5197}
5198
5199static int
5200md_is_table_underline(MD_CTX* ctx, OFF beg, OFF* p_end, unsigned* p_col_count)
5201{
5202 OFF off = beg;
5203 int found_pipe = FALSE;
5204 unsigned col_count = 0;
5205
5206 if(off < ctx->size && CH(off) == _T('|')) {
5207 found_pipe = TRUE;
5208 off++;
5209 while(off < ctx->size && ISWHITESPACE(off))
5210 off++;
5211 }
5212
5213 while(1) {
5214 OFF cell_beg;
5215 int delimited = FALSE;
5216
5217 /* Cell underline ("-----", ":----", "----:" or ":----:") */
5218 cell_beg = off;
5219 if(off < ctx->size && CH(off) == _T(':'))
5220 off++;
5221 while(off < ctx->size && CH(off) == _T('-'))
5222 off++;
5223 if(off < ctx->size && CH(off) == _T(':'))
5224 off++;
5225 if(off - cell_beg < 3)
5226 return FALSE;
5227
5228 col_count++;
5229
5230 /* Pipe delimiter (optional at the end of line). */
5231 while(off < ctx->size && ISWHITESPACE(off))
5232 off++;
5233 if(off < ctx->size && CH(off) == _T('|')) {
5234 delimited = TRUE;
5235 found_pipe = TRUE;
5236 off++;
5237 while(off < ctx->size && ISWHITESPACE(off))
5238 off++;
5239 }
5240
5241 /* Success, if we reach end of line. */
5242 if(off >= ctx->size || ISNEWLINE(off))
5243 break;
5244
5245 if(!delimited)
5246 return FALSE;
5247 }
5248
5249 if(!found_pipe)
5250 return FALSE;
5251
5252 *p_end = off;
5253 *p_col_count = col_count;
5254 return TRUE;
5255}
5256
5257static int
5258md_is_opening_code_fence(MD_CTX* ctx, OFF beg, OFF* p_end)
5259{
5260 OFF off = beg;
5261
5262 while(off < ctx->size && CH(off) == CH(beg))
5263 off++;
5264
5265 /* Fence must have at least three characters. */
5266 if(off - beg < 3)
5267 return FALSE;
5268
5269 ctx->code_fence_length = off - beg;
5270
5271 /* Optionally, space(s) can follow. */
5272 while(off < ctx->size && CH(off) == _T(' '))
5273 off++;
5274
5275 /* Optionally, an info string can follow. */
5276 while(off < ctx->size && !ISNEWLINE(off)) {
5277 /* Backtick-based fence must not contain '`' in the info string. */
5278 if(CH(beg) == _T('`') && CH(off) == _T('`'))
5279 return FALSE;
5280 off++;
5281 }
5282
5283 *p_end = off;
5284 return TRUE;
5285}
5286
5287static int
5288md_is_closing_code_fence(MD_CTX* ctx, CHAR ch, OFF beg, OFF* p_end)
5289{
5290 OFF off = beg;
5291 int ret = FALSE;
5292
5293 /* Closing fence must have at least the same length and use same char as
5294 * opening one. */
5295 while(off < ctx->size && CH(off) == ch)
5296 off++;
5297 if(off - beg < ctx->code_fence_length)
5298 goto out;
5299
5300 /* Optionally, space(s) can follow */
5301 while(off < ctx->size && CH(off) == _T(' '))
5302 off++;
5303
5304 /* But nothing more is allowed on the line. */
5305 if(off < ctx->size && !ISNEWLINE(off))
5306 goto out;
5307
5308 ret = TRUE;
5309
5310out:
5311 /* Note we set *p_end even on failure: If we are not closing fence, caller
5312 * would eat the line anyway without any parsing. */
5313 *p_end = off;
5314 return ret;
5315}
5316
5317/* Returns type of the raw HTML block, or FALSE if it is not HTML block.
5318 * (Refer to CommonMark specification for details about the types.)
5319 */
5320static int
5321md_is_html_block_start_condition(MD_CTX* ctx, OFF beg)
5322{
5323 typedef struct TAG_tag TAG;
5324 struct TAG_tag {
5325 const CHAR* name;
5326 unsigned len : 8;
5327 };
5328
5329 /* Type 6 is started by a long list of allowed tags. We use two-level
5330 * tree to speed-up the search. */
5331#ifdef X
5332 #undef X
5333#endif
5334#define X(name) { _T(name), (sizeof(name)-1) / sizeof(CHAR) }
5335#define Xend { NULL, 0 }
5336 static const TAG t1[] = { X("script"), X("pre"), X("style"), Xend };
5337
5338 static const TAG a6[] = { X("address"), X("article"), X("aside"), Xend };
5339 static const TAG b6[] = { X("base"), X("basefont"), X("blockquote"), X("body"), Xend };
5340 static const TAG c6[] = { X("caption"), X("center"), X("col"), X("colgroup"), Xend };
5341 static const TAG d6[] = { X("dd"), X("details"), X("dialog"), X("dir"),
5342 X("div"), X("dl"), X("dt"), Xend };
5343 static const TAG f6[] = { X("fieldset"), X("figcaption"), X("figure"), X("footer"),
5344 X("form"), X("frame"), X("frameset"), Xend };
5345 static const TAG h6[] = { X("h1"), X("head"), X("header"), X("hr"), X("html"), Xend };
5346 static const TAG i6[] = { X("iframe"), Xend };
5347 static const TAG l6[] = { X("legend"), X("li"), X("link"), Xend };
5348 static const TAG m6[] = { X("main"), X("menu"), X("menuitem"), Xend };
5349 static const TAG n6[] = { X("nav"), X("noframes"), Xend };
5350 static const TAG o6[] = { X("ol"), X("optgroup"), X("option"), Xend };
5351 static const TAG p6[] = { X("p"), X("param"), Xend };
5352 static const TAG s6[] = { X("section"), X("source"), X("summary"), Xend };
5353 static const TAG t6[] = { X("table"), X("tbody"), X("td"), X("tfoot"), X("th"),
5354 X("thead"), X("title"), X("tr"), X("track"), Xend };
5355 static const TAG u6[] = { X("ul"), Xend };
5356 static const TAG xx[] = { Xend };
5357#undef X
5358
5359 static const TAG* map6[26] = {
5360 a6, b6, c6, d6, xx, f6, xx, h6, i6, xx, xx, l6, m6,
5361 n6, o6, p6, xx, xx, s6, t6, u6, xx, xx, xx, xx, xx
5362 };
5363 OFF off = beg + 1;
5364 int i;
5365
5366 /* Check for type 1: <script, <pre, or <style */
5367 for(i = 0; t1[i].name != NULL; i++) {
5368 if(off + t1[i].len <= ctx->size) {
5369 if(md_ascii_case_eq(STR(off), s2: t1[i].name, n: t1[i].len))
5370 return 1;
5371 }
5372 }
5373
5374 /* Check for type 2: <!-- */
5375 if(off + 3 < ctx->size && CH(off) == _T('!') && CH(off+1) == _T('-') && CH(off+2) == _T('-'))
5376 return 2;
5377
5378 /* Check for type 3: <? */
5379 if(off < ctx->size && CH(off) == _T('?'))
5380 return 3;
5381
5382 /* Check for type 4 or 5: <! */
5383 if(off < ctx->size && CH(off) == _T('!')) {
5384 /* Check for type 4: <! followed by uppercase letter. */
5385 if(off + 1 < ctx->size && ISUPPER(off+1))
5386 return 4;
5387
5388 /* Check for type 5: <![CDATA[ */
5389 if(off + 8 < ctx->size) {
5390 if(md_ascii_eq(STR(off), _T("![CDATA["), n: 8))
5391 return 5;
5392 }
5393 }
5394
5395 /* Check for type 6: Many possible starting tags listed above. */
5396 if(off + 1 < ctx->size && (ISALPHA(off) || (CH(off) == _T('/') && ISALPHA(off+1)))) {
5397 int slot;
5398 const TAG* tags;
5399
5400 if(CH(off) == _T('/'))
5401 off++;
5402
5403 slot = (ISUPPER(off) ? CH(off) - 'A' : CH(off) - 'a');
5404 tags = map6[slot];
5405
5406 for(i = 0; tags[i].name != NULL; i++) {
5407 if(off + tags[i].len <= ctx->size) {
5408 if(md_ascii_case_eq(STR(off), s2: tags[i].name, n: tags[i].len)) {
5409 OFF tmp = off + tags[i].len;
5410 if(tmp >= ctx->size)
5411 return 6;
5412 if(ISBLANK(tmp) || ISNEWLINE(tmp) || CH(tmp) == _T('>'))
5413 return 6;
5414 if(tmp+1 < ctx->size && CH(tmp) == _T('/') && CH(tmp+1) == _T('>'))
5415 return 6;
5416 break;
5417 }
5418 }
5419 }
5420 }
5421
5422 /* Check for type 7: any COMPLETE other opening or closing tag. */
5423 if(off + 1 < ctx->size) {
5424 OFF end;
5425
5426 if(md_is_html_tag(ctx, NULL, n_lines: 0, beg, max_end: ctx->size, p_end: &end)) {
5427 /* Only optional whitespace and new line may follow. */
5428 while(end < ctx->size && ISWHITESPACE(end))
5429 end++;
5430 if(end >= ctx->size || ISNEWLINE(end))
5431 return 7;
5432 }
5433 }
5434
5435 return FALSE;
5436}
5437
5438/* Case sensitive check whether there is a substring 'what' between 'beg'
5439 * and end of line. */
5440static int
5441md_line_contains(MD_CTX* ctx, OFF beg, const CHAR* what, SZ what_len, OFF* p_end)
5442{
5443 OFF i;
5444 for(i = beg; i + what_len < ctx->size; i++) {
5445 if(ISNEWLINE(i))
5446 break;
5447 if(memcmp(STR(i), s2: what, n: what_len * sizeof(CHAR)) == 0) {
5448 *p_end = i + what_len;
5449 return TRUE;
5450 }
5451 }
5452
5453 *p_end = i;
5454 return FALSE;
5455}
5456
5457/* Returns type of HTML block end condition or FALSE if not an end condition.
5458 *
5459 * Note it fills p_end even when it is not end condition as the caller
5460 * does not need to analyze contents of a raw HTML block.
5461 */
5462static int
5463md_is_html_block_end_condition(MD_CTX* ctx, OFF beg, OFF* p_end)
5464{
5465 switch(ctx->html_block_type) {
5466 case 1:
5467 {
5468 OFF off = beg;
5469
5470 while(off < ctx->size && !ISNEWLINE(off)) {
5471 if(CH(off) == _T('<')) {
5472 if(md_ascii_case_eq(STR(off), _T("</script>"), n: 9)) {
5473 *p_end = off + 9;
5474 return TRUE;
5475 }
5476
5477 if(md_ascii_case_eq(STR(off), _T("</style>"), n: 8)) {
5478 *p_end = off + 8;
5479 return TRUE;
5480 }
5481
5482 if(md_ascii_case_eq(STR(off), _T("</pre>"), n: 6)) {
5483 *p_end = off + 6;
5484 return TRUE;
5485 }
5486 }
5487
5488 off++;
5489 }
5490 *p_end = off;
5491 return FALSE;
5492 }
5493
5494 case 2:
5495 return (md_line_contains(ctx, beg, _T("-->"), what_len: 3, p_end) ? 2 : FALSE);
5496
5497 case 3:
5498 return (md_line_contains(ctx, beg, _T("?>"), what_len: 2, p_end) ? 3 : FALSE);
5499
5500 case 4:
5501 return (md_line_contains(ctx, beg, _T(">"), what_len: 1, p_end) ? 4 : FALSE);
5502
5503 case 5:
5504 return (md_line_contains(ctx, beg, _T("]]>"), what_len: 3, p_end) ? 5 : FALSE);
5505
5506 case 6: /* Pass through */
5507 case 7:
5508 *p_end = beg;
5509 return (ISNEWLINE(beg) ? ctx->html_block_type : FALSE);
5510
5511 default:
5512 MD_UNREACHABLE();
5513 }
5514 return FALSE;
5515}
5516
5517
5518static int
5519md_is_container_compatible(const MD_CONTAINER* pivot, const MD_CONTAINER* container)
5520{
5521 /* Block quote has no "items" like lists. */
5522 if(container->ch == _T('>'))
5523 return FALSE;
5524
5525 if(container->ch != pivot->ch)
5526 return FALSE;
5527 if(container->mark_indent > pivot->contents_indent)
5528 return FALSE;
5529
5530 return TRUE;
5531}
5532
5533static int
5534md_push_container(MD_CTX* ctx, const MD_CONTAINER* container)
5535{
5536 if(ctx->n_containers >= ctx->alloc_containers) {
5537 MD_CONTAINER* new_containers;
5538
5539 ctx->alloc_containers = (ctx->alloc_containers > 0
5540 ? ctx->alloc_containers + ctx->alloc_containers / 2
5541 : 16);
5542 new_containers = realloc(ptr: ctx->containers, size: ctx->alloc_containers * sizeof(MD_CONTAINER));
5543 if(new_containers == NULL) {
5544 MD_LOG("realloc() failed.");
5545 return -1;
5546 }
5547
5548 ctx->containers = new_containers;
5549 }
5550
5551 memcpy(dest: &ctx->containers[ctx->n_containers++], src: container, n: sizeof(MD_CONTAINER));
5552 return 0;
5553}
5554
5555static int
5556md_enter_child_containers(MD_CTX* ctx, int n_children)
5557{
5558 int i;
5559 int ret = 0;
5560
5561 for(i = ctx->n_containers - n_children; i < ctx->n_containers; i++) {
5562 MD_CONTAINER* c = &ctx->containers[i];
5563 int is_ordered_list = FALSE;
5564
5565 switch(c->ch) {
5566 case _T(')'):
5567 case _T('.'):
5568 is_ordered_list = TRUE;
5569 MD_FALLTHROUGH();
5570
5571 case _T('-'):
5572 case _T('+'):
5573 case _T('*'):
5574 /* Remember offset in ctx->block_bytes so we can revisit the
5575 * block if we detect it is a loose list. */
5576 md_end_current_block(ctx);
5577 c->block_byte_off = ctx->n_block_bytes;
5578
5579 MD_CHECK(md_push_container_bytes(ctx,
5580 (is_ordered_list ? MD_BLOCK_OL : MD_BLOCK_UL),
5581 c->start, c->ch, MD_BLOCK_CONTAINER_OPENER));
5582 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_LI,
5583 c->task_mark_off,
5584 (c->is_task ? CH(c->task_mark_off) : 0),
5585 MD_BLOCK_CONTAINER_OPENER));
5586 break;
5587
5588 case _T('>'):
5589 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_QUOTE, 0, 0, MD_BLOCK_CONTAINER_OPENER));
5590 break;
5591
5592 default:
5593 MD_UNREACHABLE();
5594 break;
5595 }
5596 }
5597
5598abort:
5599 return ret;
5600}
5601
5602static int
5603md_leave_child_containers(MD_CTX* ctx, int n_keep)
5604{
5605 int ret = 0;
5606
5607 while(ctx->n_containers > n_keep) {
5608 MD_CONTAINER* c = &ctx->containers[ctx->n_containers-1];
5609 int is_ordered_list = FALSE;
5610
5611 switch(c->ch) {
5612 case _T(')'):
5613 case _T('.'):
5614 is_ordered_list = TRUE;
5615 MD_FALLTHROUGH();
5616
5617 case _T('-'):
5618 case _T('+'):
5619 case _T('*'):
5620 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_LI,
5621 c->task_mark_off, (c->is_task ? CH(c->task_mark_off) : 0),
5622 MD_BLOCK_CONTAINER_CLOSER));
5623 MD_CHECK(md_push_container_bytes(ctx,
5624 (is_ordered_list ? MD_BLOCK_OL : MD_BLOCK_UL), 0,
5625 c->ch, MD_BLOCK_CONTAINER_CLOSER));
5626 break;
5627
5628 case _T('>'):
5629 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_QUOTE, 0,
5630 0, MD_BLOCK_CONTAINER_CLOSER));
5631 break;
5632
5633 default:
5634 MD_UNREACHABLE();
5635 break;
5636 }
5637
5638 ctx->n_containers--;
5639 }
5640
5641abort:
5642 return ret;
5643}
5644
5645static int
5646md_is_container_mark(MD_CTX* ctx, unsigned indent, OFF beg, OFF* p_end, MD_CONTAINER* p_container)
5647{
5648 OFF off = beg;
5649 OFF max_end;
5650
5651 if(off >= ctx->size || indent >= ctx->code_indent_offset)
5652 return FALSE;
5653
5654 /* Check for block quote mark. */
5655 if(CH(off) == _T('>')) {
5656 off++;
5657 p_container->ch = _T('>');
5658 p_container->is_loose = FALSE;
5659 p_container->is_task = FALSE;
5660 p_container->mark_indent = indent;
5661 p_container->contents_indent = indent + 1;
5662 *p_end = off;
5663 return TRUE;
5664 }
5665
5666 /* Check for list item bullet mark. */
5667 if(ISANYOF(off, _T("-+*")) && (off+1 >= ctx->size || ISBLANK(off+1) || ISNEWLINE(off+1))) {
5668 p_container->ch = CH(off);
5669 p_container->is_loose = FALSE;
5670 p_container->is_task = FALSE;
5671 p_container->mark_indent = indent;
5672 p_container->contents_indent = indent + 1;
5673 *p_end = off+1;
5674 return TRUE;
5675 }
5676
5677 /* Check for ordered list item marks. */
5678 max_end = off + 9;
5679 if(max_end > ctx->size)
5680 max_end = ctx->size;
5681 p_container->start = 0;
5682 while(off < max_end && ISDIGIT(off)) {
5683 p_container->start = p_container->start * 10 + CH(off) - _T('0');
5684 off++;
5685 }
5686 if(off > beg &&
5687 (CH(off) == _T('.') || CH(off) == _T(')')) &&
5688 (off+1 >= ctx->size || ISBLANK(off+1) || ISNEWLINE(off+1)))
5689 {
5690 p_container->ch = CH(off);
5691 p_container->is_loose = FALSE;
5692 p_container->is_task = FALSE;
5693 p_container->mark_indent = indent;
5694 p_container->contents_indent = indent + off - beg + 1;
5695 *p_end = off+1;
5696 return TRUE;
5697 }
5698
5699 return FALSE;
5700}
5701
5702static unsigned
5703md_line_indentation(MD_CTX* ctx, unsigned total_indent, OFF beg, OFF* p_end)
5704{
5705 OFF off = beg;
5706 unsigned indent = total_indent;
5707
5708 while(off < ctx->size && ISBLANK(off)) {
5709 if(CH(off) == _T('\t'))
5710 indent = (indent + 4) & ~3;
5711 else
5712 indent++;
5713 off++;
5714 }
5715
5716 *p_end = off;
5717 return indent - total_indent;
5718}
5719
5720static const MD_LINE_ANALYSIS md_dummy_blank_line = { MD_LINE_BLANK, 0, 0, 0, 0 };
5721
/* Analyze the type of the line and determine some of its properties. This
 * serves as the main input for determining the type and boundaries of a block. */
5724static int
5725md_analyze_line(MD_CTX* ctx, OFF beg, OFF* p_end,
5726 const MD_LINE_ANALYSIS* pivot_line, MD_LINE_ANALYSIS* line)
5727{
5728 unsigned total_indent = 0;
5729 int n_parents = 0;
5730 int n_brothers = 0;
5731 int n_children = 0;
5732 MD_CONTAINER container = { 0 };
5733 int prev_line_has_list_loosening_effect = ctx->last_line_has_list_loosening_effect;
5734 OFF off = beg;
5735 OFF hr_killer = 0;
5736 int ret = 0;
5737
5738 line->indent = md_line_indentation(ctx, total_indent, beg: off, p_end: &off);
5739 total_indent += line->indent;
5740 line->beg = off;
5741
5742 /* Given the indentation and block quote marks '>', determine how many of
5743 * the current containers are our parents. */
5744 while(n_parents < ctx->n_containers) {
5745 MD_CONTAINER* c = &ctx->containers[n_parents];
5746
5747 if(c->ch == _T('>') && line->indent < ctx->code_indent_offset &&
5748 off < ctx->size && CH(off) == _T('>'))
5749 {
5750 /* Block quote mark. */
5751 off++;
5752 total_indent++;
5753 line->indent = md_line_indentation(ctx, total_indent, beg: off, p_end: &off);
5754 total_indent += line->indent;
5755
5756 /* The optional 1st space after '>' is part of the block quote mark. */
5757 if(line->indent > 0)
5758 line->indent--;
5759
5760 line->beg = off;
5761
5762 } else if(c->ch != _T('>') && line->indent >= c->contents_indent) {
5763 /* List. */
5764 line->indent -= c->contents_indent;
5765 } else {
5766 break;
5767 }
5768
5769 n_parents++;
5770 }
5771
5772 if(off >= ctx->size || ISNEWLINE(off)) {
5773 /* Blank line does not need any real indentation to be nested inside
5774 * a list. */
5775 if(n_brothers + n_children == 0) {
5776 while(n_parents < ctx->n_containers && ctx->containers[n_parents].ch != _T('>'))
5777 n_parents++;
5778 }
5779 }
5780
5781 while(TRUE) {
5782 /* Check whether we are fenced code continuation. */
5783 if(pivot_line->type == MD_LINE_FENCEDCODE) {
5784 line->beg = off;
5785
5786 /* We are another MD_LINE_FENCEDCODE unless we are closing fence
5787 * which we transform into MD_LINE_BLANK. */
5788 if(line->indent < ctx->code_indent_offset) {
5789 if(md_is_closing_code_fence(ctx, CH(pivot_line->beg), beg: off, p_end: &off)) {
5790 line->type = MD_LINE_BLANK;
5791 ctx->last_line_has_list_loosening_effect = FALSE;
5792 break;
5793 }
5794 }
5795
5796 /* Change indentation accordingly to the initial code fence. */
5797 if(n_parents == ctx->n_containers) {
5798 if(line->indent > pivot_line->indent)
5799 line->indent -= pivot_line->indent;
5800 else
5801 line->indent = 0;
5802
5803 line->type = MD_LINE_FENCEDCODE;
5804 break;
5805 }
5806 }
5807
5808 /* Check whether we are HTML block continuation. */
5809 if(pivot_line->type == MD_LINE_HTML && ctx->html_block_type > 0) {
5810 if(n_parents < ctx->n_containers) {
5811 /* HTML block is implicitly ended if the enclosing container
5812 * block ends. */
5813 ctx->html_block_type = 0;
5814 } else {
5815 int html_block_type;
5816
5817 html_block_type = md_is_html_block_end_condition(ctx, beg: off, p_end: &off);
5818 if(html_block_type > 0) {
5819 MD_ASSERT(html_block_type == ctx->html_block_type);
5820
5821 /* Make sure this is the last line of the block. */
5822 ctx->html_block_type = 0;
5823
5824 /* Some end conditions serve as blank lines at the same time. */
5825 if(html_block_type == 6 || html_block_type == 7) {
5826 line->type = MD_LINE_BLANK;
5827 line->indent = 0;
5828 break;
5829 }
5830 }
5831
5832 line->type = MD_LINE_HTML;
5833 n_parents = ctx->n_containers;
5834 break;
5835 }
5836 }
5837
5838 /* Check for blank line. */
5839 if(off >= ctx->size || ISNEWLINE(off)) {
5840 if(pivot_line->type == MD_LINE_INDENTEDCODE && n_parents == ctx->n_containers) {
5841 line->type = MD_LINE_INDENTEDCODE;
5842 if(line->indent > ctx->code_indent_offset)
5843 line->indent -= ctx->code_indent_offset;
5844 else
5845 line->indent = 0;
5846 ctx->last_line_has_list_loosening_effect = FALSE;
5847 } else {
5848 line->type = MD_LINE_BLANK;
5849 ctx->last_line_has_list_loosening_effect = (n_parents > 0 &&
5850 n_brothers + n_children == 0 &&
5851 ctx->containers[n_parents-1].ch != _T('>'));
5852
5853 #if 1
5854 /* See https://github.com/mity/md4c/issues/6
5855 *
5856 * This ugly checking tests we are in (yet empty) list item but
5857 * not its very first line (i.e. not the line with the list
5858 * item mark).
5859 *
5860 * If we are such a blank line, then any following non-blank
5861 * line which would be part of the list item actually has to
5862 * end the list because according to the specification, "a list
5863 * item can begin with at most one blank line."
5864 */
5865 if(n_parents > 0 && ctx->containers[n_parents-1].ch != _T('>') &&
5866 n_brothers + n_children == 0 && ctx->current_block == NULL &&
5867 ctx->n_block_bytes > (int) sizeof(MD_BLOCK))
5868 {
5869 MD_BLOCK* top_block = (MD_BLOCK*) ((char*)ctx->block_bytes + ctx->n_block_bytes - sizeof(MD_BLOCK));
5870 if(top_block->type == MD_BLOCK_LI)
5871 ctx->last_list_item_starts_with_two_blank_lines = TRUE;
5872 }
5873 #endif
5874 }
5875 break;
5876 } else {
5877 #if 1
5878 /* This is the 2nd half of the hack. If the flag is set (i.e. there
5879 * was a 2nd blank line at the beginning of the list item) and if
5880 * we would otherwise still belong to the list item, we enforce
5881 * the end of the list. */
5882 ctx->last_line_has_list_loosening_effect = FALSE;
5883 if(ctx->last_list_item_starts_with_two_blank_lines) {
5884 if(n_parents > 0 && ctx->containers[n_parents-1].ch != _T('>') &&
5885 n_brothers + n_children == 0 && ctx->current_block == NULL &&
5886 ctx->n_block_bytes > (int) sizeof(MD_BLOCK))
5887 {
5888 MD_BLOCK* top_block = (MD_BLOCK*) ((char*)ctx->block_bytes + ctx->n_block_bytes - sizeof(MD_BLOCK));
5889 if(top_block->type == MD_BLOCK_LI)
5890 n_parents--;
5891 }
5892
5893 ctx->last_list_item_starts_with_two_blank_lines = FALSE;
5894 }
5895 #endif
5896 }
5897
5898 /* Check whether we are Setext underline. */
5899 if(line->indent < ctx->code_indent_offset && pivot_line->type == MD_LINE_TEXT
5900 && off < ctx->size && ISANYOF2(off, _T('='), _T('-'))
5901 && (n_parents == ctx->n_containers))
5902 {
5903 unsigned level;
5904
5905 if(md_is_setext_underline(ctx, beg: off, p_end: &off, p_level: &level)) {
5906 line->type = MD_LINE_SETEXTUNDERLINE;
5907 line->data = level;
5908 break;
5909 }
5910 }
5911
5912 /* Check for thematic break line. */
5913 if(line->indent < ctx->code_indent_offset
5914 && off < ctx->size && off >= hr_killer
5915 && ISANYOF(off, _T("-_*")))
5916 {
5917 if(md_is_hr_line(ctx, beg: off, p_end: &off, p_killer: &hr_killer)) {
5918 line->type = MD_LINE_HR;
5919 break;
5920 }
5921 }
5922
5923 /* Check for "brother" container. I.e. whether we are another list item
5924 * in already started list. */
5925 if(n_parents < ctx->n_containers && n_brothers + n_children == 0) {
5926 OFF tmp;
5927
5928 if(md_is_container_mark(ctx, indent: line->indent, beg: off, p_end: &tmp, p_container: &container) &&
5929 md_is_container_compatible(pivot: &ctx->containers[n_parents], container: &container))
5930 {
5931 pivot_line = &md_dummy_blank_line;
5932
5933 off = tmp;
5934
5935 total_indent += container.contents_indent - container.mark_indent;
5936 line->indent = md_line_indentation(ctx, total_indent, beg: off, p_end: &off);
5937 total_indent += line->indent;
5938 line->beg = off;
5939
5940 /* Some of the following whitespace actually still belongs to the mark. */
5941 if(off >= ctx->size || ISNEWLINE(off)) {
5942 container.contents_indent++;
5943 } else if(line->indent <= ctx->code_indent_offset) {
5944 container.contents_indent += line->indent;
5945 line->indent = 0;
5946 } else {
5947 container.contents_indent += 1;
5948 line->indent--;
5949 }
5950
5951 ctx->containers[n_parents].mark_indent = container.mark_indent;
5952 ctx->containers[n_parents].contents_indent = container.contents_indent;
5953
5954 n_brothers++;
5955 continue;
5956 }
5957 }
5958
5959 /* Check for indented code.
5960 * Note indented code block cannot interrupt a paragraph. */
5961 if(line->indent >= ctx->code_indent_offset &&
5962 (pivot_line->type == MD_LINE_BLANK || pivot_line->type == MD_LINE_INDENTEDCODE))
5963 {
5964 line->type = MD_LINE_INDENTEDCODE;
5965 MD_ASSERT(line->indent >= ctx->code_indent_offset);
5966 line->indent -= ctx->code_indent_offset;
5967 line->data = 0;
5968 break;
5969 }
5970
5971 /* Check for start of a new container block. */
5972 if(line->indent < ctx->code_indent_offset &&
5973 md_is_container_mark(ctx, indent: line->indent, beg: off, p_end: &off, p_container: &container))
5974 {
5975 if(pivot_line->type == MD_LINE_TEXT && n_parents == ctx->n_containers &&
5976 (off >= ctx->size || ISNEWLINE(off)) && container.ch != _T('>'))
5977 {
5978 /* Noop. List mark followed by a blank line cannot interrupt a paragraph. */
5979 } else if(pivot_line->type == MD_LINE_TEXT && n_parents == ctx->n_containers &&
5980 ISANYOF2_(container.ch, _T('.'), _T(')')) && container.start != 1)
5981 {
5982 /* Noop. Ordered list cannot interrupt a paragraph unless the start index is 1. */
5983 } else {
5984 total_indent += container.contents_indent - container.mark_indent;
5985 line->indent = md_line_indentation(ctx, total_indent, beg: off, p_end: &off);
5986 total_indent += line->indent;
5987
5988 line->beg = off;
5989 line->data = container.ch;
5990
5991 /* Some of the following whitespace actually still belongs to the mark. */
5992 if(off >= ctx->size || ISNEWLINE(off)) {
5993 container.contents_indent++;
5994 } else if(line->indent <= ctx->code_indent_offset) {
5995 container.contents_indent += line->indent;
5996 line->indent = 0;
5997 } else {
5998 container.contents_indent += 1;
5999 line->indent--;
6000 }
6001
6002 if(n_brothers + n_children == 0)
6003 pivot_line = &md_dummy_blank_line;
6004
6005 if(n_children == 0)
6006 MD_CHECK(md_leave_child_containers(ctx, n_parents + n_brothers));
6007
6008 n_children++;
6009 MD_CHECK(md_push_container(ctx, &container));
6010 continue;
6011 }
6012 }
6013
6014 /* Check whether we are table continuation. */
6015 if(pivot_line->type == MD_LINE_TABLE && n_parents == ctx->n_containers) {
6016 line->type = MD_LINE_TABLE;
6017 break;
6018 }
6019
6020 /* Check for ATX header. */
6021 if(line->indent < ctx->code_indent_offset &&
6022 off < ctx->size && CH(off) == _T('#'))
6023 {
6024 unsigned level;
6025
6026 if(md_is_atxheader_line(ctx, beg: off, p_beg: &line->beg, p_end: &off, p_level: &level)) {
6027 line->type = MD_LINE_ATXHEADER;
6028 line->data = level;
6029 break;
6030 }
6031 }
6032
6033 /* Check whether we are starting code fence. */
6034 if(off < ctx->size && ISANYOF2(off, _T('`'), _T('~'))) {
6035 if(md_is_opening_code_fence(ctx, beg: off, p_end: &off)) {
6036 line->type = MD_LINE_FENCEDCODE;
6037 line->data = 1;
6038 break;
6039 }
6040 }
6041
6042 /* Check for start of raw HTML block. */
6043 if(off < ctx->size && CH(off) == _T('<')
6044 && !(ctx->parser.flags & MD_FLAG_NOHTMLBLOCKS))
6045 {
6046 ctx->html_block_type = md_is_html_block_start_condition(ctx, beg: off);
6047
6048 /* HTML block type 7 cannot interrupt paragraph. */
6049 if(ctx->html_block_type == 7 && pivot_line->type == MD_LINE_TEXT)
6050 ctx->html_block_type = 0;
6051
6052 if(ctx->html_block_type > 0) {
6053 /* The line itself also may immediately close the block. */
6054 if(md_is_html_block_end_condition(ctx, beg: off, p_end: &off) == ctx->html_block_type) {
6055 /* Make sure this is the last line of the block. */
6056 ctx->html_block_type = 0;
6057 }
6058
6059 line->type = MD_LINE_HTML;
6060 break;
6061 }
6062 }
6063
6064 /* Check for table underline. */
6065 if((ctx->parser.flags & MD_FLAG_TABLES) && pivot_line->type == MD_LINE_TEXT
6066 && off < ctx->size && ISANYOF3(off, _T('|'), _T('-'), _T(':'))
6067 && n_parents == ctx->n_containers)
6068 {
6069 unsigned col_count;
6070
6071 if(ctx->current_block != NULL && ctx->current_block->n_lines == 1 &&
6072 md_is_table_underline(ctx, beg: off, p_end: &off, p_col_count: &col_count))
6073 {
6074 line->data = col_count;
6075 line->type = MD_LINE_TABLEUNDERLINE;
6076 break;
6077 }
6078 }
6079
6080 /* By default, we are normal text line. */
6081 line->type = MD_LINE_TEXT;
6082 if(pivot_line->type == MD_LINE_TEXT && n_brothers + n_children == 0) {
6083 /* Lazy continuation. */
6084 n_parents = ctx->n_containers;
6085 }
6086
6087 /* Check for task mark. */
6088 if((ctx->parser.flags & MD_FLAG_TASKLISTS) && n_brothers + n_children > 0 &&
6089 ISANYOF_(ctx->containers[ctx->n_containers-1].ch, _T("-+*.)")))
6090 {
6091 OFF tmp = off;
6092
6093 while(tmp < ctx->size && tmp < off + 3 && ISBLANK(tmp))
6094 tmp++;
6095 if(tmp + 2 < ctx->size && CH(tmp) == _T('[') &&
6096 ISANYOF(tmp+1, _T("xX ")) && CH(tmp+2) == _T(']') &&
6097 (tmp + 3 == ctx->size || ISBLANK(tmp+3) || ISNEWLINE(tmp+3)))
6098 {
6099 MD_CONTAINER* task_container = (n_children > 0 ? &ctx->containers[ctx->n_containers-1] : &container);
6100 task_container->is_task = TRUE;
6101 task_container->task_mark_off = tmp + 1;
6102 off = tmp + 3;
6103 while(ISWHITESPACE(off))
6104 off++;
6105 line->beg = off;
6106 }
6107 }
6108
6109 break;
6110 }
6111
6112 /* Scan for end of the line.
6113 *
6114 * Note this is quite a bottleneck of the parsing as we here iterate almost
6115 * over complete document.
6116 */
6117#if defined __linux__ && !defined MD4C_USE_UTF16
6118 /* Recent glibc versions have superbly optimized strcspn(), even using
6119 * vectorization if available. */
6120 if(ctx->doc_ends_with_newline && off < ctx->size) {
6121 while(TRUE) {
6122 off += (OFF) strcspn(STR(off), reject: "\r\n");
6123
6124 /* strcspn() can stop on zero terminator; but that can appear
6125 * anywhere in the Markdown input... */
6126 if(CH(off) == _T('\0'))
6127 off++;
6128 else
6129 break;
6130 }
6131 } else
6132#endif
6133 {
6134 /* Optimization: Use some loop unrolling. */
6135 while(off + 3 < ctx->size && !ISNEWLINE(off+0) && !ISNEWLINE(off+1)
6136 && !ISNEWLINE(off+2) && !ISNEWLINE(off+3))
6137 off += 4;
6138 while(off < ctx->size && !ISNEWLINE(off))
6139 off++;
6140 }
6141
6142 /* Set end of the line. */
6143 line->end = off;
6144
6145 /* But for ATX header, we should exclude the optional trailing mark. */
6146 if(line->type == MD_LINE_ATXHEADER) {
6147 OFF tmp = line->end;
6148 while(tmp > line->beg && CH(tmp-1) == _T(' '))
6149 tmp--;
6150 while(tmp > line->beg && CH(tmp-1) == _T('#'))
6151 tmp--;
6152 if(tmp == line->beg || CH(tmp-1) == _T(' ') || (ctx->parser.flags & MD_FLAG_PERMISSIVEATXHEADERS))
6153 line->end = tmp;
6154 }
6155
6156 /* Trim trailing spaces. */
6157 if(line->type != MD_LINE_INDENTEDCODE && line->type != MD_LINE_FENCEDCODE) {
6158 while(line->end > line->beg && CH(line->end-1) == _T(' '))
6159 line->end--;
6160 }
6161
6162 /* Eat also the new line. */
6163 if(off < ctx->size && CH(off) == _T('\r'))
6164 off++;
6165 if(off < ctx->size && CH(off) == _T('\n'))
6166 off++;
6167
6168 *p_end = off;
6169
6170 /* If we belong to a list after seeing a blank line, the list is loose. */
6171 if(prev_line_has_list_loosening_effect && line->type != MD_LINE_BLANK && n_parents + n_brothers > 0) {
6172 MD_CONTAINER* c = &ctx->containers[n_parents + n_brothers - 1];
6173 if(c->ch != _T('>')) {
6174 MD_BLOCK* block = (MD_BLOCK*) (((char*)ctx->block_bytes) + c->block_byte_off);
6175 block->flags |= MD_BLOCK_LOOSE_LIST;
6176 }
6177 }
6178
6179 /* Leave any containers we are not part of anymore. */
6180 if(n_children == 0 && n_parents + n_brothers < ctx->n_containers)
6181 MD_CHECK(md_leave_child_containers(ctx, n_parents + n_brothers));
6182
6183 /* Enter any container we found a mark for. */
6184 if(n_brothers > 0) {
6185 MD_ASSERT(n_brothers == 1);
6186 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_LI,
6187 ctx->containers[n_parents].task_mark_off,
6188 (ctx->containers[n_parents].is_task ? CH(ctx->containers[n_parents].task_mark_off) : 0),
6189 MD_BLOCK_CONTAINER_CLOSER));
6190 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_LI,
6191 container.task_mark_off,
6192 (container.is_task ? CH(container.task_mark_off) : 0),
6193 MD_BLOCK_CONTAINER_OPENER));
6194 ctx->containers[n_parents].is_task = container.is_task;
6195 ctx->containers[n_parents].task_mark_off = container.task_mark_off;
6196 }
6197
6198 if(n_children > 0)
6199 MD_CHECK(md_enter_child_containers(ctx, n_children));
6200
6201abort:
6202 return ret;
6203}
6204
6205static int
6206md_process_line(MD_CTX* ctx, const MD_LINE_ANALYSIS** p_pivot_line, MD_LINE_ANALYSIS* line)
6207{
6208 const MD_LINE_ANALYSIS* pivot_line = *p_pivot_line;
6209 int ret = 0;
6210
6211 /* Blank line ends current leaf block. */
6212 if(line->type == MD_LINE_BLANK) {
6213 MD_CHECK(md_end_current_block(ctx));
6214 *p_pivot_line = &md_dummy_blank_line;
6215 return 0;
6216 }
6217
6218 /* Some line types form block on their own. */
6219 if(line->type == MD_LINE_HR || line->type == MD_LINE_ATXHEADER) {
6220 MD_CHECK(md_end_current_block(ctx));
6221
6222 /* Add our single-line block. */
6223 MD_CHECK(md_start_new_block(ctx, line));
6224 MD_CHECK(md_add_line_into_current_block(ctx, line));
6225 MD_CHECK(md_end_current_block(ctx));
6226 *p_pivot_line = &md_dummy_blank_line;
6227 return 0;
6228 }
6229
6230 /* MD_LINE_SETEXTUNDERLINE changes meaning of the current block and ends it. */
6231 if(line->type == MD_LINE_SETEXTUNDERLINE) {
6232 MD_ASSERT(ctx->current_block != NULL);
6233 ctx->current_block->type = MD_BLOCK_H;
6234 ctx->current_block->data = line->data;
6235 ctx->current_block->flags |= MD_BLOCK_SETEXT_HEADER;
6236 MD_CHECK(md_add_line_into_current_block(ctx, line));
6237 MD_CHECK(md_end_current_block(ctx));
6238 if(ctx->current_block == NULL) {
6239 *p_pivot_line = &md_dummy_blank_line;
6240 } else {
6241 /* This happens if we have consumed all the body as link ref. defs.
6242 * and downgraded the underline into start of a new paragraph block. */
6243 line->type = MD_LINE_TEXT;
6244 *p_pivot_line = line;
6245 }
6246 return 0;
6247 }
6248
6249 /* MD_LINE_TABLEUNDERLINE changes meaning of the current block. */
6250 if(line->type == MD_LINE_TABLEUNDERLINE) {
6251 MD_ASSERT(ctx->current_block != NULL);
6252 MD_ASSERT(ctx->current_block->n_lines == 1);
6253 ctx->current_block->type = MD_BLOCK_TABLE;
6254 ctx->current_block->data = line->data;
6255 MD_ASSERT(pivot_line != &md_dummy_blank_line);
6256 ((MD_LINE_ANALYSIS*)pivot_line)->type = MD_LINE_TABLE;
6257 MD_CHECK(md_add_line_into_current_block(ctx, line));
6258 return 0;
6259 }
6260
6261 /* The current block also ends if the line has different type. */
6262 if(line->type != pivot_line->type)
6263 MD_CHECK(md_end_current_block(ctx));
6264
6265 /* The current line may start a new block. */
6266 if(ctx->current_block == NULL) {
6267 MD_CHECK(md_start_new_block(ctx, line));
6268 *p_pivot_line = line;
6269 }
6270
6271 /* In all other cases the line is just a continuation of the current block. */
6272 MD_CHECK(md_add_line_into_current_block(ctx, line));
6273
6274abort:
6275 return ret;
6276}
6277
6278static int
6279md_process_doc(MD_CTX *ctx)
6280{
6281 const MD_LINE_ANALYSIS* pivot_line = &md_dummy_blank_line;
6282 MD_LINE_ANALYSIS line_buf[2];
6283 MD_LINE_ANALYSIS* line = &line_buf[0];
6284 OFF off = 0;
6285 int ret = 0;
6286
6287 MD_ENTER_BLOCK(MD_BLOCK_DOC, NULL);
6288
6289 while(off < ctx->size) {
6290 if(line == pivot_line)
6291 line = (line == &line_buf[0] ? &line_buf[1] : &line_buf[0]);
6292
6293 MD_CHECK(md_analyze_line(ctx, off, &off, pivot_line, line));
6294 MD_CHECK(md_process_line(ctx, &pivot_line, line));
6295 }
6296
6297 md_end_current_block(ctx);
6298
6299 MD_CHECK(md_build_ref_def_hashtable(ctx));
6300
6301 /* Process all blocks. */
6302 MD_CHECK(md_leave_child_containers(ctx, 0));
6303 MD_CHECK(md_process_all_blocks(ctx));
6304
6305 MD_LEAVE_BLOCK(MD_BLOCK_DOC, NULL);
6306
6307abort:
6308
6309#if 0
6310 /* Output some memory consumption statistics. */
6311 {
6312 char buffer[256];
6313 sprintf(buffer, "Alloced %u bytes for block buffer.",
6314 (unsigned)(ctx->alloc_block_bytes));
6315 MD_LOG(buffer);
6316
6317 sprintf(buffer, "Alloced %u bytes for containers buffer.",
6318 (unsigned)(ctx->alloc_containers * sizeof(MD_CONTAINER)));
6319 MD_LOG(buffer);
6320
6321 sprintf(buffer, "Alloced %u bytes for marks buffer.",
6322 (unsigned)(ctx->alloc_marks * sizeof(MD_MARK)));
6323 MD_LOG(buffer);
6324
6325 sprintf(buffer, "Alloced %u bytes for aux. buffer.",
6326 (unsigned)(ctx->alloc_buffer * sizeof(MD_CHAR)));
6327 MD_LOG(buffer);
6328 }
6329#endif
6330
6331 return ret;
6332}
6333
6334
6335/********************
6336 *** Public API ***
6337 ********************/
6338
6339int
6340md_parse(const MD_CHAR* text, MD_SIZE size, const MD_PARSER* parser, void* userdata)
6341{
6342 MD_CTX ctx;
6343 int i;
6344 int ret;
6345
6346 if(parser->abi_version != 0) {
6347 if(parser->debug_log != NULL)
6348 parser->debug_log("Unsupported abi_version.", userdata);
6349 return -1;
6350 }
6351
6352 /* Setup context structure. */
6353 memset(s: &ctx, c: 0, n: sizeof(MD_CTX));
6354 ctx.text = text;
6355 ctx.size = size;
6356 memcpy(dest: &ctx.parser, src: parser, n: sizeof(MD_PARSER));
6357 ctx.userdata = userdata;
6358 ctx.code_indent_offset = (ctx.parser.flags & MD_FLAG_NOINDENTEDCODEBLOCKS) ? (OFF)(-1) : 4;
6359 md_build_mark_char_map(ctx: &ctx);
6360 ctx.doc_ends_with_newline = (size > 0 && ISNEWLINE_(text[size-1]));
6361
6362 /* Reset all unresolved opener mark chains. */
6363 for(i = 0; i < (int) SIZEOF_ARRAY(ctx.mark_chains); i++) {
6364 ctx.mark_chains[i].head = -1;
6365 ctx.mark_chains[i].tail = -1;
6366 }
6367 ctx.unresolved_link_head = -1;
6368 ctx.unresolved_link_tail = -1;
6369
6370 /* All the work. */
6371 ret = md_process_doc(ctx: &ctx);
6372
6373 /* Clean-up. */
6374 md_free_ref_defs(ctx: &ctx);
6375 md_free_ref_def_hashtable(ctx: &ctx);
6376 free(ptr: ctx.buffer);
6377 free(ptr: ctx.marks);
6378 free(ptr: ctx.block_bytes);
6379 free(ptr: ctx.containers);
6380
6381 return ret;
6382}
6383

source code of qtbase/src/3rdparty/md4c/md4c.c