1/*
2 * MD4C: Markdown parser for C
3 * (http://github.com/mity/md4c)
4 *
5 * Copyright (c) 2016-2024 Martin Mitáš
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be included in
15 * all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
23 * IN THE SOFTWARE.
24 */
25
26#include "md4c.h"
27
28#include <limits.h>
29#include <stdio.h>
30#include <stdlib.h>
31#include <string.h>
32
33
34/*****************************
35 *** Miscellaneous Stuff ***
36 *****************************/
37
/* The "inline" keyword was introduced by C99 (__STDC_VERSION__ == 199901L).
 * Any older dialect (C89/C90, and also C95 which reports 199409L) needs
 * a compiler-specific spelling or no keyword at all. */
#if !defined(__STDC_VERSION__) || __STDC_VERSION__ < 199901L
    #if defined __GNUC__
        #define inline __inline__
    #elif defined _MSC_VER
        #define inline __inline
    #else
        /* Unknown pre-C99 compiler: drop the keyword altogether. */
        #define inline
    #endif
#endif
48
/* Make the UTF-8 support the default.
 * Exactly one of MD4C_USE_ASCII, MD4C_USE_UTF8 or MD4C_USE_UTF16 selects the
 * text encoding; when the build defines none of them, UTF-8 is chosen. */
#if !defined MD4C_USE_ASCII && !defined MD4C_USE_UTF8 && !defined MD4C_USE_UTF16
    #define MD4C_USE_UTF8
#endif

/* Magic for making wide literals with MD4C_USE_UTF16.
 * _T(x) turns a character or string literal into a wide literal (L"..." or
 * L'...') for the UTF-16 build and leaves it untouched otherwise. Any _T
 * inherited from previously included headers is dropped first. */
#ifdef _T
    #undef _T
#endif
#if defined MD4C_USE_UTF16
    #define _T(x) L##x
#else
    #define _T(x) x
#endif
63
/* Misc. macros. */

/* Count of elements of a true array (must not be used with a pointer).
 * The argument is parenthesized so that any array-typed expression works. */
#define SIZEOF_ARRAY(a) (sizeof(a) / sizeof((a)[0]))

/* Two-level stringification: STRINGIZE() expands its argument first
 * (e.g. __LINE__), then STRINGIZE_() turns the result into a string. */
#define STRINGIZE_(x) #x
#define STRINGIZE(x) STRINGIZE_(x)

/* Note both macros evaluate their arguments more than once. */
#define MAX(a,b) ((a) > (b) ? (a) : (b))
#define MIN(a,b) ((a) < (b) ? (a) : (b))

/* Guard each constant separately so that an environment providing only
 * one of them still ends up with both defined. */
#ifndef TRUE
    #define TRUE 1
#endif
#ifndef FALSE
    #define FALSE 0
#endif
77
/* Pass the message to the debug_log callback of the parser, if the
 * application provided one. Expects a variable named ctx to be in scope at
 * the place of use. */
#define MD_LOG(msg) \
    do { \
        if(ctx->parser.debug_log != NULL) \
            ctx->parser.debug_log((msg), ctx->userdata); \
    } while(0)
83
/* Assertions.
 *
 * With DEBUG defined, a failed MD_ASSERT() logs the file, line and the
 * failed expression through MD_LOG() (hence it needs ctx in scope) and
 * terminates the process with exit(1).
 *
 * Without DEBUG, the condition serves only as an optimizer hint where
 * the compiler provides one (__builtin_unreachable() or __assume());
 * on other compilers both macros expand to an empty statement. */
#ifdef DEBUG
    #define MD_ASSERT(cond) \
        do { \
            if(!(cond)) { \
                MD_LOG(__FILE__ ":" STRINGIZE(__LINE__) ": " \
                       "Assertion '" STRINGIZE(cond) "' failed."); \
                exit(1); \
            } \
        } while(0)

    #define MD_UNREACHABLE() MD_ASSERT(1 == 0)
#else
    #ifdef __GNUC__
        #define MD_ASSERT(cond) do { if(!(cond)) __builtin_unreachable(); } while(0)
        #define MD_UNREACHABLE() do { __builtin_unreachable(); } while(0)
    /* NOTE(review): "_MSC_VER > 120" looks like it may have been meant as a
     * real version threshold (e.g. 1200) -- confirm. */
    #elif defined _MSC_VER && _MSC_VER > 120
        #define MD_ASSERT(cond) do { __assume(cond); } while(0)
        #define MD_UNREACHABLE() do { __assume(0); } while(0)
    #else
        #define MD_ASSERT(cond) do {} while(0)
        #define MD_UNREACHABLE() do {} while(0)
    #endif
#endif
107
/* For falling through case labels in switch statements.
 * Expands to the fallthrough attribute on clang >= 12 and GCC >= 7 and to
 * a no-op expression elsewhere. The caller supplies the trailing semicolon. */
#if defined __clang__ && __clang_major__ >= 12
    #define MD_FALLTHROUGH() __attribute__((fallthrough))
#elif defined __GNUC__ && __GNUC__ >= 7
    #define MD_FALLTHROUGH() __attribute__((fallthrough))
#else
    #define MD_FALLTHROUGH() ((void)0)
#endif
116
/* Suppress "unused parameter" warnings.
 * The argument is parenthesized so the cast applies to the whole expression. */
#define MD_UNUSED(x) ((void)(x))
119
120
121/******************************
122 *** Some internal limits ***
123 ******************************/
124
/* We limit code span marks to lower than 32 backticks. This solves the
 * pathological case of too many openers, each of a different length:
 * Resolving them would then be O(n^2). */
#define CODESPAN_MARK_MAXLEN 32

/* We limit the column count of tables to prevent a quadratic explosion of
 * output from pathological input: a table with thousands of columns and
 * thousands of rows, where each row is requested with as little as a single
 * character per line, relying on us to "helpfully" fill in all the missing
 * "<td></td>". */
#define TABLE_MAXCOLCOUNT 128
135
136
137/************************
138 *** Internal Types ***
139 ************************/
140
/* These are omnipresent so lets save some typing.
 * They are plain aliases of the corresponding public MD_xxx types. */
#define CHAR MD_CHAR
#define SZ MD_SIZE
#define OFF MD_OFFSET

/* Forward declarations of the internal structures, so they can be referred
 * to through pointers before their definitions are seen. */
typedef struct MD_MARK_tag MD_MARK;
typedef struct MD_BLOCK_tag MD_BLOCK;
typedef struct MD_CONTAINER_tag MD_CONTAINER;
typedef struct MD_REF_DEF_tag MD_REF_DEF;
150
151
/* During analysis of inline marks, we need to manage stacks of unresolved
 * openers of the given type.
 * The stack connects the marks via MD_MARK::next; the structure itself only
 * remembers the top-most mark.
 */
typedef struct MD_MARKSTACK_tag MD_MARKSTACK;
struct MD_MARKSTACK_tag {
    int top; /* -1 if empty. */
};
160
/* Context propagated through all the parsing.
 *
 * Many helper macros in this file (CH(), STR(), MD_LOG(), the xxx_OPENERS
 * shortcuts below, ...) refer to a variable literally named ctx, so functions
 * using them need a pointer to this structure under that name. */
typedef struct MD_CTX_tag MD_CTX;
struct MD_CTX_tag {
    /* Immutable stuff (parameters of md_parse()). */
    const CHAR* text;
    SZ size;
    MD_PARSER parser;
    void* userdata;

    /* When this is true, it allows some optimizations. */
    int doc_ends_with_newline;

    /* Helper temporary growing buffer (grown by MD_TEMP_BUFFER()). */
    CHAR* buffer;
    unsigned alloc_buffer;

    /* Reference definitions. */
    MD_REF_DEF* ref_defs;
    int n_ref_defs;
    int alloc_ref_defs;
    void** ref_def_hashtable;
    int ref_def_hashtable_size;

    /* Stack of inline/span markers.
     * This is only used for parsing a single block contents but by storing it
     * here we may reuse the stack for subsequent blocks; i.e. we have fewer
     * (re)allocations. */
    MD_MARK* marks;
    int n_marks;
    int alloc_marks;

    /* Per-character lookup table; the UTF-16 build keeps only 128 entries,
     * the other builds one entry per possible byte value. */
#if defined MD4C_USE_UTF16
    char mark_char_map[128];
#else
    char mark_char_map[256];
#endif

    /* For resolving of inline spans.
     * The macros below give symbolic names to the individual slots of
     * opener_stacks[]; they all require ctx to be in scope. */
    MD_MARKSTACK opener_stacks[16];
#define ASTERISK_OPENERS_oo_mod3_0 (ctx->opener_stacks[0]) /* Opener-only */
#define ASTERISK_OPENERS_oo_mod3_1 (ctx->opener_stacks[1])
#define ASTERISK_OPENERS_oo_mod3_2 (ctx->opener_stacks[2])
#define ASTERISK_OPENERS_oc_mod3_0 (ctx->opener_stacks[3]) /* Both opener and closer candidate */
#define ASTERISK_OPENERS_oc_mod3_1 (ctx->opener_stacks[4])
#define ASTERISK_OPENERS_oc_mod3_2 (ctx->opener_stacks[5])
#define UNDERSCORE_OPENERS_oo_mod3_0 (ctx->opener_stacks[6]) /* Opener-only */
#define UNDERSCORE_OPENERS_oo_mod3_1 (ctx->opener_stacks[7])
#define UNDERSCORE_OPENERS_oo_mod3_2 (ctx->opener_stacks[8])
#define UNDERSCORE_OPENERS_oc_mod3_0 (ctx->opener_stacks[9]) /* Both opener and closer candidate */
#define UNDERSCORE_OPENERS_oc_mod3_1 (ctx->opener_stacks[10])
#define UNDERSCORE_OPENERS_oc_mod3_2 (ctx->opener_stacks[11])
#define TILDE_OPENERS_1 (ctx->opener_stacks[12])
#define TILDE_OPENERS_2 (ctx->opener_stacks[13])
#define BRACKET_OPENERS (ctx->opener_stacks[14])
#define DOLLAR_OPENERS (ctx->opener_stacks[15])

    /* Stack of dummies which need to call free() for pointers stored in them.
     * These are constructed during inline parsing and freed after all the block
     * is processed (i.e. all callbacks referring those strings are called). */
    MD_MARKSTACK ptr_stack;

    /* For resolving table rows. */
    int n_table_cell_boundaries;
    int table_cell_boundaries_head;
    int table_cell_boundaries_tail;

    /* For resolving links. */
    int unresolved_link_head;
    int unresolved_link_tail;

    /* For resolving raw HTML. */
    OFF html_comment_horizon;
    OFF html_proc_instr_horizon;
    OFF html_decl_horizon;
    OFF html_cdata_horizon;

    /* For block analysis.
     * Notes:
     * -- It holds MD_BLOCK as well as MD_LINE structures. After each
     * MD_BLOCK, its (multiple) MD_LINE(s) follow.
     * -- For MD_BLOCK_HTML and MD_BLOCK_CODE, MD_VERBATIMLINE(s) are used
     * instead of MD_LINE(s).
     */
    void* block_bytes;
    MD_BLOCK* current_block;
    int n_block_bytes;
    int alloc_block_bytes;

    /* For container block analysis. */
    MD_CONTAINER* containers;
    int n_containers;
    int alloc_containers;

    /* Minimal indentation to call the block "indented code block". */
    unsigned code_indent_offset;

    /* Contextual info for line analysis. */
    SZ code_fence_length; /* For checking closing fence length. */
    int html_block_type; /* For checking closing raw HTML condition. */
    int last_line_has_list_loosening_effect;
    int last_list_item_starts_with_two_blank_lines;
};
263
/* Kinds of lines recognized during the line analysis; stored in
 * MD_LINE_ANALYSIS::type. */
enum MD_LINETYPE_tag {
    MD_LINE_BLANK,
    MD_LINE_HR,
    MD_LINE_ATXHEADER,
    MD_LINE_SETEXTHEADER,
    MD_LINE_SETEXTUNDERLINE,
    MD_LINE_INDENTEDCODE,
    MD_LINE_FENCEDCODE,
    MD_LINE_HTML,
    MD_LINE_TEXT,
    MD_LINE_TABLE,
    MD_LINE_TABLEUNDERLINE
};
typedef enum MD_LINETYPE_tag MD_LINETYPE;
278
/* Description of a single analyzed line: its type and its extent in the
 * input text. */
typedef struct MD_LINE_ANALYSIS_tag MD_LINE_ANALYSIS;
struct MD_LINE_ANALYSIS_tag {
    MD_LINETYPE type;
    unsigned data;          /* NOTE(review): presumably type-specific payload -- confirm. */
    int enforce_new_block;
    OFF beg;                /* Offsets delimiting the line... */
    OFF end;                /* ...(presumably end is exclusive -- confirm). */
    unsigned indent; /* Indentation level. */
};
288
/* Extent of one line of a block, as a pair of offsets into the text.
 * (See the notes at MD_CTX::block_bytes for where these are stored.) */
typedef struct MD_LINE_tag MD_LINE;
struct MD_LINE_tag {
    OFF beg;
    OFF end;
};

/* Variant of MD_LINE which additionally carries an indentation value;
 * used for the blocks MD_BLOCK_HTML and MD_BLOCK_CODE. */
typedef struct MD_VERBATIMLINE_tag MD_VERBATIMLINE;
struct MD_VERBATIMLINE_tag {
    OFF beg;
    OFF end;
    OFF indent;
};
301
302
303/*****************
304 *** Helpers ***
305 *****************/
306
/* Character accessors.
 * Both need ctx in scope: CH() yields the character at the given offset of
 * the input text, STR() a pointer to it. */
#define CH(off) (ctx->text[(off)])
#define STR(off) (ctx->text + (off))

/* Character classification.
 * Note we assume ASCII compatibility of code points < 128 here.
 *
 * The macros with the trailing underscore take a character value; the
 * variants without it (below) take an offset into the input text instead.
 * Most of them evaluate their argument more than once. */
/* Inclusive range check, done on the value converted to unsigned. */
#define ISIN_(ch, ch_min, ch_max) ((ch_min) <= (unsigned)(ch) && (unsigned)(ch) <= (ch_max))
/* The palette is a zero-terminated string; the explicit test for '\0' keeps
 * the string terminator from being reported as a member of the palette. */
#define ISANYOF_(ch, palette) ((ch) != _T('\0') && md_strchr((palette), (ch)) != NULL)
#define ISANYOF2_(ch, ch1, ch2) ((ch) == (ch1) || (ch) == (ch2))
#define ISANYOF3_(ch, ch1, ch2, ch3) ((ch) == (ch1) || (ch) == (ch2) || (ch) == (ch3))
#define ISASCII_(ch) ((unsigned)(ch) <= 127)
#define ISBLANK_(ch) (ISANYOF2_((ch), _T(' '), _T('\t')))
#define ISNEWLINE_(ch) (ISANYOF2_((ch), _T('\r'), _T('\n')))
/* Note the newline characters are not treated as whitespace here. */
#define ISWHITESPACE_(ch) (ISBLANK_(ch) || ISANYOF2_((ch), _T('\v'), _T('\f')))
#define ISCNTRL_(ch) ((unsigned)(ch) <= 31 || (unsigned)(ch) == 127)
/* The four ranges of ASCII punctuation: !../  :..@  [..`  {..~ */
#define ISPUNCT_(ch) (ISIN_(ch, 33, 47) || ISIN_(ch, 58, 64) || ISIN_(ch, 91, 96) || ISIN_(ch, 123, 126))
#define ISUPPER_(ch) (ISIN_(ch, _T('A'), _T('Z')))
#define ISLOWER_(ch) (ISIN_(ch, _T('a'), _T('z')))
#define ISALPHA_(ch) (ISUPPER_(ch) || ISLOWER_(ch))
#define ISDIGIT_(ch) (ISIN_(ch, _T('0'), _T('9')))
#define ISXDIGIT_(ch) (ISDIGIT_(ch) || ISIN_(ch, _T('A'), _T('F')) || ISIN_(ch, _T('a'), _T('f')))
#define ISALNUM_(ch) (ISALPHA_(ch) || ISDIGIT_(ch))

/* The same checks, applied to the character at the given offset of the
 * input text (requires ctx in scope). */
#define ISANYOF(off, palette) ISANYOF_(CH(off), (palette))
#define ISANYOF2(off, ch1, ch2) ISANYOF2_(CH(off), (ch1), (ch2))
#define ISANYOF3(off, ch1, ch2, ch3) ISANYOF3_(CH(off), (ch1), (ch2), (ch3))
#define ISASCII(off) ISASCII_(CH(off))
#define ISBLANK(off) ISBLANK_(CH(off))
#define ISNEWLINE(off) ISNEWLINE_(CH(off))
#define ISWHITESPACE(off) ISWHITESPACE_(CH(off))
#define ISCNTRL(off) ISCNTRL_(CH(off))
#define ISPUNCT(off) ISPUNCT_(CH(off))
#define ISUPPER(off) ISUPPER_(CH(off))
#define ISLOWER(off) ISLOWER_(CH(off))
#define ISALPHA(off) ISALPHA_(CH(off))
#define ISDIGIT(off) ISDIGIT_(CH(off))
#define ISXDIGIT(off) ISXDIGIT_(CH(off))
#define ISALNUM(off) ISALNUM_(CH(off))
345
346
/* Character search matching the configured character type: the wide
 * variant for the UTF-16 build, the plain one otherwise. */
#if defined MD4C_USE_UTF16
    #define md_strchr wcschr
#else
    #define md_strchr strchr
#endif
352
353
354/* Case insensitive check of string equality. */
355static inline int
356md_ascii_case_eq(const CHAR* s1, const CHAR* s2, SZ n)
357{
358 OFF i;
359 for(i = 0; i < n; i++) {
360 CHAR ch1 = s1[i];
361 CHAR ch2 = s2[i];
362
363 if(ISLOWER_(ch1))
364 ch1 += ('A'-'a');
365 if(ISLOWER_(ch2))
366 ch2 += ('A'-'a');
367 if(ch1 != ch2)
368 return FALSE;
369 }
370 return TRUE;
371}
372
373static inline int
374md_ascii_eq(const CHAR* s1, const CHAR* s2, SZ n)
375{
376 return memcmp(s1: s1, s2: s2, n: n * sizeof(CHAR)) == 0;
377}
378
379static int
380md_text_with_null_replacement(MD_CTX* ctx, MD_TEXTTYPE type, const CHAR* str, SZ size)
381{
382 OFF off = 0;
383 int ret = 0;
384
385 while(1) {
386 while(off < size && str[off] != _T('\0'))
387 off++;
388
389 if(off > 0) {
390 ret = ctx->parser.text(type, str, off, ctx->userdata);
391 if(ret != 0)
392 return ret;
393
394 str += off;
395 size -= off;
396 off = 0;
397 }
398
399 if(off >= size)
400 return 0;
401
402 ret = ctx->parser.text(MD_TEXT_NULLCHAR, _T(""), 1, ctx->userdata);
403 if(ret != 0)
404 return ret;
405 off++;
406 }
407}
408
409
/* Evaluate the expression, store its result in the local variable ret and
 * jump to the local label abort if the result is negative. The function
 * using the macro has to provide both ret and the label. */
#define MD_CHECK(func) \
    do { \
        ret = (func); \
        if(ret < 0) \
            goto abort; \
    } while(0)
416
417
/* Make sure ctx->buffer can hold at least sz; grow it if it cannot.
 *
 * The new capacity is sz plus 50 % plus 128, rounded down to a multiple
 * of 128, and the value is passed to realloc() as is. On allocation failure
 * the old buffer stays intact, ret is set to -1 and control is transferred
 * to the local label abort. Requires ctx, ret and the label in scope.
 * Note sz is evaluated more than once. */
#define MD_TEMP_BUFFER(sz) \
    do { \
        if((sz) > ctx->alloc_buffer) { \
            CHAR* new_buffer; \
            SZ new_size = ((sz) + (sz) / 2 + 128) & ~127; \
            \
            new_buffer = realloc(ctx->buffer, new_size); \
            if(new_buffer == NULL) { \
                MD_LOG("realloc() failed."); \
                ret = -1; \
                goto abort; \
            } \
            \
            ctx->buffer = new_buffer; \
            ctx->alloc_buffer = new_size; \
        } \
    } while(0)
435
436
/* Wrappers of the enter/leave callbacks of the parser.
 *
 * Each of them calls the respective callback and stores its result in the
 * local variable ret. If the callback returns non-zero, a message is logged
 * and control is transferred to the local label abort. All of them require
 * ctx, ret and the label to be available at the place of use. */
#define MD_ENTER_BLOCK(type, arg) \
    do { \
        ret = ctx->parser.enter_block((type), (arg), ctx->userdata); \
        if(ret != 0) { \
            MD_LOG("Aborted from enter_block() callback."); \
            goto abort; \
        } \
    } while(0)

#define MD_LEAVE_BLOCK(type, arg) \
    do { \
        ret = ctx->parser.leave_block((type), (arg), ctx->userdata); \
        if(ret != 0) { \
            MD_LOG("Aborted from leave_block() callback."); \
            goto abort; \
        } \
    } while(0)

#define MD_ENTER_SPAN(type, arg) \
    do { \
        ret = ctx->parser.enter_span((type), (arg), ctx->userdata); \
        if(ret != 0) { \
            MD_LOG("Aborted from enter_span() callback."); \
            goto abort; \
        } \
    } while(0)

#define MD_LEAVE_SPAN(type, arg) \
    do { \
        ret = ctx->parser.leave_span((type), (arg), ctx->userdata); \
        if(ret != 0) { \
            MD_LOG("Aborted from leave_span() callback."); \
            goto abort; \
        } \
    } while(0)
472
/* Pass a non-empty piece of text to the text callback of the parser.
 * Nothing is called for size of zero. If the callback returns non-zero,
 * a message is logged and control is transferred to the local label abort
 * with the value kept in ret. Requires ctx, ret and the label in scope.
 * Note size is evaluated twice. */
#define MD_TEXT(type, str, size) \
    do { \
        if((size) > 0) { \
            ret = ctx->parser.text((type), (str), (size), ctx->userdata); \
            if(ret != 0) { \
                MD_LOG("Aborted from text() callback."); \
                goto abort; \
            } \
        } \
    } while(0)

/* The same as MD_TEXT(), but for text which may contain NUL characters:
 * the text is routed through md_text_with_null_replacement(). */
#define MD_TEXT_INSECURE(type, str, size) \
    do { \
        if((size) > 0) { \
            ret = md_text_with_null_replacement(ctx, (type), (str), (size)); \
            if(ret != 0) { \
                MD_LOG("Aborted from text() callback."); \
                goto abort; \
            } \
        } \
    } while(0)
494
495
496/* If the offset falls into a gap between line, we return the following
497 * line. */
498static const MD_LINE*
499md_lookup_line(OFF off, const MD_LINE* lines, MD_SIZE n_lines, MD_SIZE* p_line_index)
500{
501 MD_SIZE lo, hi;
502 MD_SIZE pivot;
503 const MD_LINE* line;
504
505 lo = 0;
506 hi = n_lines - 1;
507 while(lo <= hi) {
508 pivot = (lo + hi) / 2;
509 line = &lines[pivot];
510
511 if(off < line->beg) {
512 if(hi == 0 || lines[hi-1].end < off) {
513 if(p_line_index != NULL)
514 *p_line_index = pivot;
515 return line;
516 }
517 hi = pivot - 1;
518 } else if(off > line->end) {
519 lo = pivot + 1;
520 } else {
521 if(p_line_index != NULL)
522 *p_line_index = pivot;
523 return line;
524 }
525 }
526
527 return NULL;
528}
529
530
531/*************************
532 *** Unicode Support ***
533 *************************/
534
/* Result of case folding of a single codepoint, as filled in by
 * md_get_unicode_fold_info(): a sequence of up to three codepoints. */
typedef struct MD_UNICODE_FOLD_INFO_tag MD_UNICODE_FOLD_INFO;
struct MD_UNICODE_FOLD_INFO_tag {
    unsigned codepoints[3];
    unsigned n_codepoints;  /* Count of valid items in codepoints[]. */
};
540
541
542#if defined MD4C_USE_UTF16 || defined MD4C_USE_UTF8
/* Binary search in a sorted "map" of codepoints.
 *
 * An item of the map is either a single codepoint, or one half of a pair
 * describing a whole range of consecutive codepoints: the first item of
 * the pair is (MIN_CODEPOINT | 0x40000000), the second one is
 * (MAX_CODEPOINT | 0x80000000).
 *
 * Returns the index of the matching record in the map (for a range, the
 * index of its first item), or -1 if the codepoint is not covered. */
static int
md_unicode_bsearch__(unsigned codepoint, const unsigned* map, size_t map_size)
{
    int lo = 0;
    int hi = (int) map_size - 1;

    while(lo <= hi) {
        const int mid = (lo + hi) / 2;
        int first = mid;
        int last = mid;

        /* When the middle item is a half of a range, extend the record to
         * its other half. */
        if(map[mid] & 0x80000000)
            first = mid - 1;
        if(map[mid] & 0x40000000)
            last = mid + 1;

        if(codepoint < (map[first] & 0x00ffffff)) {
            hi = first - 1;
            continue;
        }
        if(codepoint > (map[last] & 0x00ffffff)) {
            lo = last + 1;
            continue;
        }

        return first;
    }

    return -1;
}
575
/* Check whether the codepoint is a whitespace.
 *
 * Codepoints up to 0x7f are classified with ISWHITESPACE_(); the others
 * are looked up in the table of the Unicode "Zs" category.
 * Returns non-zero for a whitespace, zero otherwise. */
static int
md_is_unicode_whitespace__(unsigned codepoint)
{
#define R(cp_min, cp_max) ((cp_min) | 0x40000000), ((cp_max) | 0x80000000)
#define S(cp) (cp)
    /* Unicode "Zs" category.
     * (generated by scripts/build_whitespace_map.py) */
    static const unsigned WHITESPACE_MAP[] = {
        S(0x0020), S(0x00a0), S(0x1680), R(0x2000,0x200a), S(0x202f), S(0x205f), S(0x3000)
    };
#undef R
#undef S

    /* The ASCII ones are the most frequently used ones, also CommonMark
     * specification requests few more in this range. */
    if(codepoint <= 0x7f)
        return ISWHITESPACE_(codepoint);

    return (md_unicode_bsearch__(codepoint, WHITESPACE_MAP, SIZEOF_ARRAY(WHITESPACE_MAP)) >= 0);
}
596
/* Check whether the codepoint is a punctuation character.
 *
 * Codepoints up to 0x7f are classified with ISPUNCT_(); the others are
 * looked up in the table of the Unicode general categories "P" and "S".
 * Returns non-zero for a punctuation character, zero otherwise. */
static int
md_is_unicode_punct__(unsigned codepoint)
{
#define R(cp_min, cp_max) ((cp_min) | 0x40000000), ((cp_max) | 0x80000000)
#define S(cp) (cp)
    /* Unicode general "P" and "S" categories.
     * (generated by scripts/build_punct_map.py) */
    static const unsigned PUNCT_MAP[] = {
        R(0x0021,0x002f), R(0x003a,0x0040), R(0x005b,0x0060), R(0x007b,0x007e), R(0x00a1,0x00a9),
        R(0x00ab,0x00ac), R(0x00ae,0x00b1), S(0x00b4), R(0x00b6,0x00b8), S(0x00bb), S(0x00bf), S(0x00d7),
        S(0x00f7), R(0x02c2,0x02c5), R(0x02d2,0x02df), R(0x02e5,0x02eb), S(0x02ed), R(0x02ef,0x02ff), S(0x0375),
        S(0x037e), R(0x0384,0x0385), S(0x0387), S(0x03f6), S(0x0482), R(0x055a,0x055f), R(0x0589,0x058a),
        R(0x058d,0x058f), S(0x05be), S(0x05c0), S(0x05c3), S(0x05c6), R(0x05f3,0x05f4), R(0x0606,0x060f),
        S(0x061b), R(0x061d,0x061f), R(0x066a,0x066d), S(0x06d4), S(0x06de), S(0x06e9), R(0x06fd,0x06fe),
        R(0x0700,0x070d), R(0x07f6,0x07f9), R(0x07fe,0x07ff), R(0x0830,0x083e), S(0x085e), S(0x0888),
        R(0x0964,0x0965), S(0x0970), R(0x09f2,0x09f3), R(0x09fa,0x09fb), S(0x09fd), S(0x0a76), R(0x0af0,0x0af1),
        S(0x0b70), R(0x0bf3,0x0bfa), S(0x0c77), S(0x0c7f), S(0x0c84), S(0x0d4f), S(0x0d79), S(0x0df4), S(0x0e3f),
        S(0x0e4f), R(0x0e5a,0x0e5b), R(0x0f01,0x0f17), R(0x0f1a,0x0f1f), S(0x0f34), S(0x0f36), S(0x0f38),
        R(0x0f3a,0x0f3d), S(0x0f85), R(0x0fbe,0x0fc5), R(0x0fc7,0x0fcc), R(0x0fce,0x0fda), R(0x104a,0x104f),
        R(0x109e,0x109f), S(0x10fb), R(0x1360,0x1368), R(0x1390,0x1399), S(0x1400), R(0x166d,0x166e),
        R(0x169b,0x169c), R(0x16eb,0x16ed), R(0x1735,0x1736), R(0x17d4,0x17d6), R(0x17d8,0x17db),
        R(0x1800,0x180a), S(0x1940), R(0x1944,0x1945), R(0x19de,0x19ff), R(0x1a1e,0x1a1f), R(0x1aa0,0x1aa6),
        R(0x1aa8,0x1aad), R(0x1b5a,0x1b6a), R(0x1b74,0x1b7e), R(0x1bfc,0x1bff), R(0x1c3b,0x1c3f),
        R(0x1c7e,0x1c7f), R(0x1cc0,0x1cc7), S(0x1cd3), S(0x1fbd), R(0x1fbf,0x1fc1), R(0x1fcd,0x1fcf),
        R(0x1fdd,0x1fdf), R(0x1fed,0x1fef), R(0x1ffd,0x1ffe), R(0x2010,0x2027), R(0x2030,0x205e),
        R(0x207a,0x207e), R(0x208a,0x208e), R(0x20a0,0x20c0), R(0x2100,0x2101), R(0x2103,0x2106),
        R(0x2108,0x2109), S(0x2114), R(0x2116,0x2118), R(0x211e,0x2123), S(0x2125), S(0x2127), S(0x2129),
        S(0x212e), R(0x213a,0x213b), R(0x2140,0x2144), R(0x214a,0x214d), S(0x214f), R(0x218a,0x218b),
        R(0x2190,0x2426), R(0x2440,0x244a), R(0x249c,0x24e9), R(0x2500,0x2775), R(0x2794,0x2b73),
        R(0x2b76,0x2b95), R(0x2b97,0x2bff), R(0x2ce5,0x2cea), R(0x2cf9,0x2cfc), R(0x2cfe,0x2cff), S(0x2d70),
        R(0x2e00,0x2e2e), R(0x2e30,0x2e5d), R(0x2e80,0x2e99), R(0x2e9b,0x2ef3), R(0x2f00,0x2fd5),
        R(0x2ff0,0x2fff), R(0x3001,0x3004), R(0x3008,0x3020), S(0x3030), R(0x3036,0x3037), R(0x303d,0x303f),
        R(0x309b,0x309c), S(0x30a0), S(0x30fb), R(0x3190,0x3191), R(0x3196,0x319f), R(0x31c0,0x31e3), S(0x31ef),
        R(0x3200,0x321e), R(0x322a,0x3247), S(0x3250), R(0x3260,0x327f), R(0x328a,0x32b0), R(0x32c0,0x33ff),
        R(0x4dc0,0x4dff), R(0xa490,0xa4c6), R(0xa4fe,0xa4ff), R(0xa60d,0xa60f), S(0xa673), S(0xa67e),
        R(0xa6f2,0xa6f7), R(0xa700,0xa716), R(0xa720,0xa721), R(0xa789,0xa78a), R(0xa828,0xa82b),
        R(0xa836,0xa839), R(0xa874,0xa877), R(0xa8ce,0xa8cf), R(0xa8f8,0xa8fa), S(0xa8fc), R(0xa92e,0xa92f),
        S(0xa95f), R(0xa9c1,0xa9cd), R(0xa9de,0xa9df), R(0xaa5c,0xaa5f), R(0xaa77,0xaa79), R(0xaade,0xaadf),
        R(0xaaf0,0xaaf1), S(0xab5b), R(0xab6a,0xab6b), S(0xabeb), S(0xfb29), R(0xfbb2,0xfbc2), R(0xfd3e,0xfd4f),
        S(0xfdcf), R(0xfdfc,0xfdff), R(0xfe10,0xfe19), R(0xfe30,0xfe52), R(0xfe54,0xfe66), R(0xfe68,0xfe6b),
        R(0xff01,0xff0f), R(0xff1a,0xff20), R(0xff3b,0xff40), R(0xff5b,0xff65), R(0xffe0,0xffe6),
        R(0xffe8,0xffee), R(0xfffc,0xfffd), R(0x10100,0x10102), R(0x10137,0x1013f), R(0x10179,0x10189),
        R(0x1018c,0x1018e), R(0x10190,0x1019c), S(0x101a0), R(0x101d0,0x101fc), S(0x1039f), S(0x103d0),
        S(0x1056f), S(0x10857), R(0x10877,0x10878), S(0x1091f), S(0x1093f), R(0x10a50,0x10a58), S(0x10a7f),
        S(0x10ac8), R(0x10af0,0x10af6), R(0x10b39,0x10b3f), R(0x10b99,0x10b9c), S(0x10ead), R(0x10f55,0x10f59),
        R(0x10f86,0x10f89), R(0x11047,0x1104d), R(0x110bb,0x110bc), R(0x110be,0x110c1), R(0x11140,0x11143),
        R(0x11174,0x11175), R(0x111c5,0x111c8), S(0x111cd), S(0x111db), R(0x111dd,0x111df), R(0x11238,0x1123d),
        S(0x112a9), R(0x1144b,0x1144f), R(0x1145a,0x1145b), S(0x1145d), S(0x114c6), R(0x115c1,0x115d7),
        R(0x11641,0x11643), R(0x11660,0x1166c), S(0x116b9), R(0x1173c,0x1173f), S(0x1183b), R(0x11944,0x11946),
        S(0x119e2), R(0x11a3f,0x11a46), R(0x11a9a,0x11a9c), R(0x11a9e,0x11aa2), R(0x11b00,0x11b09),
        R(0x11c41,0x11c45), R(0x11c70,0x11c71), R(0x11ef7,0x11ef8), R(0x11f43,0x11f4f), R(0x11fd5,0x11ff1),
        S(0x11fff), R(0x12470,0x12474), R(0x12ff1,0x12ff2), R(0x16a6e,0x16a6f), S(0x16af5), R(0x16b37,0x16b3f),
        R(0x16b44,0x16b45), R(0x16e97,0x16e9a), S(0x16fe2), S(0x1bc9c), S(0x1bc9f), R(0x1cf50,0x1cfc3),
        R(0x1d000,0x1d0f5), R(0x1d100,0x1d126), R(0x1d129,0x1d164), R(0x1d16a,0x1d16c), R(0x1d183,0x1d184),
        R(0x1d18c,0x1d1a9), R(0x1d1ae,0x1d1ea), R(0x1d200,0x1d241), S(0x1d245), R(0x1d300,0x1d356), S(0x1d6c1),
        S(0x1d6db), S(0x1d6fb), S(0x1d715), S(0x1d735), S(0x1d74f), S(0x1d76f), S(0x1d789), S(0x1d7a9),
        S(0x1d7c3), R(0x1d800,0x1d9ff), R(0x1da37,0x1da3a), R(0x1da6d,0x1da74), R(0x1da76,0x1da83),
        R(0x1da85,0x1da8b), S(0x1e14f), S(0x1e2ff), R(0x1e95e,0x1e95f), S(0x1ecac), S(0x1ecb0), S(0x1ed2e),
        R(0x1eef0,0x1eef1), R(0x1f000,0x1f02b), R(0x1f030,0x1f093), R(0x1f0a0,0x1f0ae), R(0x1f0b1,0x1f0bf),
        R(0x1f0c1,0x1f0cf), R(0x1f0d1,0x1f0f5), R(0x1f10d,0x1f1ad), R(0x1f1e6,0x1f202), R(0x1f210,0x1f23b),
        R(0x1f240,0x1f248), R(0x1f250,0x1f251), R(0x1f260,0x1f265), R(0x1f300,0x1f6d7), R(0x1f6dc,0x1f6ec),
        R(0x1f6f0,0x1f6fc), R(0x1f700,0x1f776), R(0x1f77b,0x1f7d9), R(0x1f7e0,0x1f7eb), S(0x1f7f0),
        R(0x1f800,0x1f80b), R(0x1f810,0x1f847), R(0x1f850,0x1f859), R(0x1f860,0x1f887), R(0x1f890,0x1f8ad),
        R(0x1f8b0,0x1f8b1), R(0x1f900,0x1fa53), R(0x1fa60,0x1fa6d), R(0x1fa70,0x1fa7c), R(0x1fa80,0x1fa88),
        R(0x1fa90,0x1fabd), R(0x1fabf,0x1fac5), R(0x1face,0x1fadb), R(0x1fae0,0x1fae8), R(0x1faf0,0x1faf8),
        R(0x1fb00,0x1fb92), R(0x1fb94,0x1fbca)
    };
#undef R
#undef S

    /* The ASCII ones are the most frequently used ones, also CommonMark
     * specification requests few more in this range. */
    if(codepoint <= 0x7f)
        return ISPUNCT_(codepoint);

    return (md_unicode_bsearch__(codepoint, PUNCT_MAP, SIZEOF_ARRAY(PUNCT_MAP)) >= 0);
}
674
675 static void
676 md_get_unicode_fold_info(unsigned codepoint, MD_UNICODE_FOLD_INFO* info)
677 {
678#define R(cp_min, cp_max) ((cp_min) | 0x40000000), ((cp_max) | 0x80000000)
679#define S(cp) (cp)
    /* Case folding map: codepoints whose folding is a single codepoint
     * (the folded values are stored in FOLD_MAP_1_DATA).
     * (generated by scripts/build_folding_map.py) */
682 static const unsigned FOLD_MAP_1[] = {
683 R(0x0041,0x005a), S(0x00b5), R(0x00c0,0x00d6), R(0x00d8,0x00de), R(0x0100,0x012e), R(0x0132,0x0136),
684 R(0x0139,0x0147), R(0x014a,0x0176), S(0x0178), R(0x0179,0x017d), S(0x017f), S(0x0181), S(0x0182),
685 S(0x0184), S(0x0186), S(0x0187), S(0x0189), S(0x018a), S(0x018b), S(0x018e), S(0x018f), S(0x0190),
686 S(0x0191), S(0x0193), S(0x0194), S(0x0196), S(0x0197), S(0x0198), S(0x019c), S(0x019d), S(0x019f),
687 R(0x01a0,0x01a4), S(0x01a6), S(0x01a7), S(0x01a9), S(0x01ac), S(0x01ae), S(0x01af), S(0x01b1), S(0x01b2),
688 S(0x01b3), S(0x01b5), S(0x01b7), S(0x01b8), S(0x01bc), S(0x01c4), S(0x01c5), S(0x01c7), S(0x01c8),
689 S(0x01ca), R(0x01cb,0x01db), R(0x01de,0x01ee), S(0x01f1), S(0x01f2), S(0x01f4), S(0x01f6), S(0x01f7),
690 R(0x01f8,0x021e), S(0x0220), R(0x0222,0x0232), S(0x023a), S(0x023b), S(0x023d), S(0x023e), S(0x0241),
691 S(0x0243), S(0x0244), S(0x0245), R(0x0246,0x024e), S(0x0345), S(0x0370), S(0x0372), S(0x0376), S(0x037f),
692 S(0x0386), R(0x0388,0x038a), S(0x038c), S(0x038e), S(0x038f), R(0x0391,0x03a1), R(0x03a3,0x03ab),
693 S(0x03c2), S(0x03cf), S(0x03d0), S(0x03d1), S(0x03d5), S(0x03d6), R(0x03d8,0x03ee), S(0x03f0), S(0x03f1),
694 S(0x03f4), S(0x03f5), S(0x03f7), S(0x03f9), S(0x03fa), R(0x03fd,0x03ff), R(0x0400,0x040f),
695 R(0x0410,0x042f), R(0x0460,0x0480), R(0x048a,0x04be), S(0x04c0), R(0x04c1,0x04cd), R(0x04d0,0x052e),
696 R(0x0531,0x0556), R(0x10a0,0x10c5), S(0x10c7), S(0x10cd), R(0x13f8,0x13fd), S(0x1c80), S(0x1c81),
697 S(0x1c82), S(0x1c83), S(0x1c84), S(0x1c85), S(0x1c86), S(0x1c87), S(0x1c88), R(0x1c90,0x1cba),
698 R(0x1cbd,0x1cbf), R(0x1e00,0x1e94), S(0x1e9b), R(0x1ea0,0x1efe), R(0x1f08,0x1f0f), R(0x1f18,0x1f1d),
699 R(0x1f28,0x1f2f), R(0x1f38,0x1f3f), R(0x1f48,0x1f4d), S(0x1f59), S(0x1f5b), S(0x1f5d), S(0x1f5f),
700 R(0x1f68,0x1f6f), S(0x1fb8), S(0x1fb9), S(0x1fba), S(0x1fbb), S(0x1fbe), R(0x1fc8,0x1fcb), S(0x1fd8),
701 S(0x1fd9), S(0x1fda), S(0x1fdb), S(0x1fe8), S(0x1fe9), S(0x1fea), S(0x1feb), S(0x1fec), S(0x1ff8),
702 S(0x1ff9), S(0x1ffa), S(0x1ffb), S(0x2126), S(0x212a), S(0x212b), S(0x2132), R(0x2160,0x216f), S(0x2183),
703 R(0x24b6,0x24cf), R(0x2c00,0x2c2f), S(0x2c60), S(0x2c62), S(0x2c63), S(0x2c64), R(0x2c67,0x2c6b),
704 S(0x2c6d), S(0x2c6e), S(0x2c6f), S(0x2c70), S(0x2c72), S(0x2c75), S(0x2c7e), S(0x2c7f), R(0x2c80,0x2ce2),
705 S(0x2ceb), S(0x2ced), S(0x2cf2), R(0xa640,0xa66c), R(0xa680,0xa69a), R(0xa722,0xa72e), R(0xa732,0xa76e),
706 S(0xa779), S(0xa77b), S(0xa77d), R(0xa77e,0xa786), S(0xa78b), S(0xa78d), S(0xa790), S(0xa792),
707 R(0xa796,0xa7a8), S(0xa7aa), S(0xa7ab), S(0xa7ac), S(0xa7ad), S(0xa7ae), S(0xa7b0), S(0xa7b1), S(0xa7b2),
708 S(0xa7b3), R(0xa7b4,0xa7c2), S(0xa7c4), S(0xa7c5), S(0xa7c6), S(0xa7c7), S(0xa7c9), S(0xa7d0), S(0xa7d6),
709 S(0xa7d8), S(0xa7f5), R(0xab70,0xabbf), R(0xff21,0xff3a), R(0x10400,0x10427), R(0x104b0,0x104d3),
710 R(0x10570,0x1057a), R(0x1057c,0x1058a), R(0x1058c,0x10592), S(0x10594), S(0x10595), R(0x10c80,0x10cb2),
711 R(0x118a0,0x118bf), R(0x16e40,0x16e5f), R(0x1e900,0x1e921)
712 };
713 static const unsigned FOLD_MAP_1_DATA[] = {
714 0x0061, 0x007a, 0x03bc, 0x00e0, 0x00f6, 0x00f8, 0x00fe, 0x0101, 0x012f, 0x0133, 0x0137, 0x013a, 0x0148,
715 0x014b, 0x0177, 0x00ff, 0x017a, 0x017e, 0x0073, 0x0253, 0x0183, 0x0185, 0x0254, 0x0188, 0x0256, 0x0257,
716 0x018c, 0x01dd, 0x0259, 0x025b, 0x0192, 0x0260, 0x0263, 0x0269, 0x0268, 0x0199, 0x026f, 0x0272, 0x0275,
717 0x01a1, 0x01a5, 0x0280, 0x01a8, 0x0283, 0x01ad, 0x0288, 0x01b0, 0x028a, 0x028b, 0x01b4, 0x01b6, 0x0292,
718 0x01b9, 0x01bd, 0x01c6, 0x01c6, 0x01c9, 0x01c9, 0x01cc, 0x01cc, 0x01dc, 0x01df, 0x01ef, 0x01f3, 0x01f3,
719 0x01f5, 0x0195, 0x01bf, 0x01f9, 0x021f, 0x019e, 0x0223, 0x0233, 0x2c65, 0x023c, 0x019a, 0x2c66, 0x0242,
720 0x0180, 0x0289, 0x028c, 0x0247, 0x024f, 0x03b9, 0x0371, 0x0373, 0x0377, 0x03f3, 0x03ac, 0x03ad, 0x03af,
721 0x03cc, 0x03cd, 0x03ce, 0x03b1, 0x03c1, 0x03c3, 0x03cb, 0x03c3, 0x03d7, 0x03b2, 0x03b8, 0x03c6, 0x03c0,
722 0x03d9, 0x03ef, 0x03ba, 0x03c1, 0x03b8, 0x03b5, 0x03f8, 0x03f2, 0x03fb, 0x037b, 0x037d, 0x0450, 0x045f,
723 0x0430, 0x044f, 0x0461, 0x0481, 0x048b, 0x04bf, 0x04cf, 0x04c2, 0x04ce, 0x04d1, 0x052f, 0x0561, 0x0586,
724 0x2d00, 0x2d25, 0x2d27, 0x2d2d, 0x13f0, 0x13f5, 0x0432, 0x0434, 0x043e, 0x0441, 0x0442, 0x0442, 0x044a,
725 0x0463, 0xa64b, 0x10d0, 0x10fa, 0x10fd, 0x10ff, 0x1e01, 0x1e95, 0x1e61, 0x1ea1, 0x1eff, 0x1f00, 0x1f07,
726 0x1f10, 0x1f15, 0x1f20, 0x1f27, 0x1f30, 0x1f37, 0x1f40, 0x1f45, 0x1f51, 0x1f53, 0x1f55, 0x1f57, 0x1f60,
727 0x1f67, 0x1fb0, 0x1fb1, 0x1f70, 0x1f71, 0x03b9, 0x1f72, 0x1f75, 0x1fd0, 0x1fd1, 0x1f76, 0x1f77, 0x1fe0,
728 0x1fe1, 0x1f7a, 0x1f7b, 0x1fe5, 0x1f78, 0x1f79, 0x1f7c, 0x1f7d, 0x03c9, 0x006b, 0x00e5, 0x214e, 0x2170,
729 0x217f, 0x2184, 0x24d0, 0x24e9, 0x2c30, 0x2c5f, 0x2c61, 0x026b, 0x1d7d, 0x027d, 0x2c68, 0x2c6c, 0x0251,
730 0x0271, 0x0250, 0x0252, 0x2c73, 0x2c76, 0x023f, 0x0240, 0x2c81, 0x2ce3, 0x2cec, 0x2cee, 0x2cf3, 0xa641,
731 0xa66d, 0xa681, 0xa69b, 0xa723, 0xa72f, 0xa733, 0xa76f, 0xa77a, 0xa77c, 0x1d79, 0xa77f, 0xa787, 0xa78c,
732 0x0265, 0xa791, 0xa793, 0xa797, 0xa7a9, 0x0266, 0x025c, 0x0261, 0x026c, 0x026a, 0x029e, 0x0287, 0x029d,
733 0xab53, 0xa7b5, 0xa7c3, 0xa794, 0x0282, 0x1d8e, 0xa7c8, 0xa7ca, 0xa7d1, 0xa7d7, 0xa7d9, 0xa7f6, 0x13a0,
734 0x13ef, 0xff41, 0xff5a, 0x10428, 0x1044f, 0x104d8, 0x104fb, 0x10597, 0x105a1, 0x105a3, 0x105b1, 0x105b3,
735 0x105b9, 0x105bb, 0x105bc, 0x10cc0, 0x10cf2, 0x118c0, 0x118df, 0x16e60, 0x16e7f, 0x1e922, 0x1e943
736 };
737 static const unsigned FOLD_MAP_2[] = {
738 S(0x00df), S(0x0130), S(0x0149), S(0x01f0), S(0x0587), S(0x1e96), S(0x1e97), S(0x1e98), S(0x1e99),
739 S(0x1e9a), S(0x1e9e), S(0x1f50), R(0x1f80,0x1f87), R(0x1f88,0x1f8f), R(0x1f90,0x1f97), R(0x1f98,0x1f9f),
740 R(0x1fa0,0x1fa7), R(0x1fa8,0x1faf), S(0x1fb2), S(0x1fb3), S(0x1fb4), S(0x1fb6), S(0x1fbc), S(0x1fc2),
741 S(0x1fc3), S(0x1fc4), S(0x1fc6), S(0x1fcc), S(0x1fd6), S(0x1fe4), S(0x1fe6), S(0x1ff2), S(0x1ff3),
742 S(0x1ff4), S(0x1ff6), S(0x1ffc), S(0xfb00), S(0xfb01), S(0xfb02), S(0xfb05), S(0xfb06), S(0xfb13),
743 S(0xfb14), S(0xfb15), S(0xfb16), S(0xfb17)
744 };
745 static const unsigned FOLD_MAP_2_DATA[] = {
746 0x0073,0x0073, 0x0069,0x0307, 0x02bc,0x006e, 0x006a,0x030c, 0x0565,0x0582, 0x0068,0x0331, 0x0074,0x0308,
747 0x0077,0x030a, 0x0079,0x030a, 0x0061,0x02be, 0x0073,0x0073, 0x03c5,0x0313, 0x1f00,0x03b9, 0x1f07,0x03b9,
748 0x1f00,0x03b9, 0x1f07,0x03b9, 0x1f20,0x03b9, 0x1f27,0x03b9, 0x1f20,0x03b9, 0x1f27,0x03b9, 0x1f60,0x03b9,
749 0x1f67,0x03b9, 0x1f60,0x03b9, 0x1f67,0x03b9, 0x1f70,0x03b9, 0x03b1,0x03b9, 0x03ac,0x03b9, 0x03b1,0x0342,
750 0x03b1,0x03b9, 0x1f74,0x03b9, 0x03b7,0x03b9, 0x03ae,0x03b9, 0x03b7,0x0342, 0x03b7,0x03b9, 0x03b9,0x0342,
751 0x03c1,0x0313, 0x03c5,0x0342, 0x1f7c,0x03b9, 0x03c9,0x03b9, 0x03ce,0x03b9, 0x03c9,0x0342, 0x03c9,0x03b9,
752 0x0066,0x0066, 0x0066,0x0069, 0x0066,0x006c, 0x0073,0x0074, 0x0073,0x0074, 0x0574,0x0576, 0x0574,0x0565,
753 0x0574,0x056b, 0x057e,0x0576, 0x0574,0x056d
754 };
755 static const unsigned FOLD_MAP_3[] = {
756 S(0x0390), S(0x03b0), S(0x1f52), S(0x1f54), S(0x1f56), S(0x1fb7), S(0x1fc7), S(0x1fd2), S(0x1fd3),
757 S(0x1fd7), S(0x1fe2), S(0x1fe3), S(0x1fe7), S(0x1ff7), S(0xfb03), S(0xfb04)
758 };
759 static const unsigned FOLD_MAP_3_DATA[] = {
760 0x03b9,0x0308,0x0301, 0x03c5,0x0308,0x0301, 0x03c5,0x0313,0x0300, 0x03c5,0x0313,0x0301,
761 0x03c5,0x0313,0x0342, 0x03b1,0x0342,0x03b9, 0x03b7,0x0342,0x03b9, 0x03b9,0x0308,0x0300,
762 0x03b9,0x0308,0x0301, 0x03b9,0x0308,0x0342, 0x03c5,0x0308,0x0300, 0x03c5,0x0308,0x0301,
763 0x03c5,0x0308,0x0342, 0x03c9,0x0342,0x03b9, 0x0066,0x0066,0x0069, 0x0066,0x0066,0x006c
764 };
765#undef R
766#undef S
767 static const struct {
768 const unsigned* map;
769 const unsigned* data;
770 size_t map_size;
771 unsigned n_codepoints;
772 } FOLD_MAP_LIST[] = {
773 { FOLD_MAP_1, FOLD_MAP_1_DATA, SIZEOF_ARRAY(FOLD_MAP_1), 1 },
774 { FOLD_MAP_2, FOLD_MAP_2_DATA, SIZEOF_ARRAY(FOLD_MAP_2), 2 },
775 { FOLD_MAP_3, FOLD_MAP_3_DATA, SIZEOF_ARRAY(FOLD_MAP_3), 3 }
776 };
777
778 int i;
779
780 /* Fast path for ASCII characters. */
781 if(codepoint <= 0x7f) {
782 info->codepoints[0] = codepoint;
783 if(ISUPPER_(codepoint))
784 info->codepoints[0] += 'a' - 'A';
785 info->n_codepoints = 1;
786 return;
787 }
788
789 /* Try to locate the codepoint in any of the maps. */
790 for(i = 0; i < (int) SIZEOF_ARRAY(FOLD_MAP_LIST); i++) {
791 int index;
792
793 index = md_unicode_bsearch__(codepoint, map: FOLD_MAP_LIST[i].map, map_size: FOLD_MAP_LIST[i].map_size);
794 if(index >= 0) {
795 /* Found the mapping. */
796 unsigned n_codepoints = FOLD_MAP_LIST[i].n_codepoints;
797 const unsigned* map = FOLD_MAP_LIST[i].map;
798 const unsigned* codepoints = FOLD_MAP_LIST[i].data + (index * n_codepoints);
799
800 memcpy(dest: info->codepoints, src: codepoints, n: sizeof(unsigned) * n_codepoints);
801 info->n_codepoints = n_codepoints;
802
803 if(FOLD_MAP_LIST[i].map[index] != codepoint) {
804 /* The found mapping maps whole range of codepoints,
805 * i.e. we have to offset info->codepoints[0] accordingly. */
806 if((map[index] & 0x00ffffff)+1 == codepoints[0]) {
807 /* Alternating type of the range. */
808 info->codepoints[0] = codepoint + ((codepoint & 0x1) == (map[index] & 0x1) ? 1 : 0);
809 } else {
810 /* Range to range kind of mapping. */
811 info->codepoints[0] += (codepoint - (map[index] & 0x00ffffff));
812 }
813 }
814
815 return;
816 }
817 }
818
819 /* No mapping found. Map the codepoint to itself. */
820 info->codepoints[0] = codepoint;
821 info->n_codepoints = 1;
822 }
823#endif
824
825
826#if defined MD4C_USE_UTF16
/* A 16-bit unit is a high (leading) surrogate when its top six bits are
 * 110110, i.e. it lies in 0xd800..0xdbff. */
#define IS_UTF16_SURROGATE_HI(word) (((WORD)(word) & 0xfc00) == 0xd800)
/* A 16-bit unit is a low (trailing) surrogate when its top six bits are
 * 110111, i.e. it lies in 0xdc00..0xdfff. */
#define IS_UTF16_SURROGATE_LO(word) (((WORD)(word) & 0xfc00) == 0xdc00)
/* Combine the low 10 bits of both surrogates (high part shifted up by 10)
 * and add the 0x10000 offset to get the codepoint. */
#define UTF16_DECODE_SURROGATE(hi, lo) (0x10000 + ((((unsigned)(hi) & 0x3ff) << 10) | (((unsigned)(lo) & 0x3ff) << 0)))
830
831 static unsigned
832 md_decode_utf16le__(const CHAR* str, SZ str_size, SZ* p_size)
833 {
834 if(IS_UTF16_SURROGATE_HI(str[0])) {
835 if(1 < str_size && IS_UTF16_SURROGATE_LO(str[1])) {
836 if(p_size != NULL)
837 *p_size = 2;
838 return UTF16_DECODE_SURROGATE(str[0], str[1]);
839 }
840 }
841
842 if(p_size != NULL)
843 *p_size = 1;
844 return str[0];
845 }
846
847 static unsigned
848 md_decode_utf16le_before__(MD_CTX* ctx, OFF off)
849 {
850 if(off > 2 && IS_UTF16_SURROGATE_HI(CH(off-2)) && IS_UTF16_SURROGATE_LO(CH(off-1)))
851 return UTF16_DECODE_SURROGATE(CH(off-2), CH(off-1));
852
853 return CH(off);
854 }
855
/* No whitespace uses surrogates, so no decoding needed here. */
#define ISUNICODEWHITESPACE_(codepoint) md_is_unicode_whitespace__(codepoint)
#define ISUNICODEWHITESPACE(off) md_is_unicode_whitespace__(CH(off))
#define ISUNICODEWHITESPACEBEFORE(off) md_is_unicode_whitespace__(CH((off)-1))

/* Punctuation tests decode a potential surrogate pair at (or before) the
 * given offset first. Both macros refer to a variable named 'ctx', so one
 * has to be in scope wherever they are used. */
#define ISUNICODEPUNCT(off) md_is_unicode_punct__(md_decode_utf16le__(STR(off), ctx->size - (off), NULL))
#define ISUNICODEPUNCTBEFORE(off) md_is_unicode_punct__(md_decode_utf16le_before__(ctx, off))
863
864 static inline int
865 md_decode_unicode(const CHAR* str, OFF off, SZ str_size, SZ* p_char_size)
866 {
867 return md_decode_utf16le__(str+off, str_size-off, p_char_size);
868 }
869#elif defined MD4C_USE_UTF8
/* Classification of a single UTF-8 byte by its leading bits:
 *   LEAD1: 0xxxxxxx  (byte <= 0x7f, a complete one-byte sequence)
 *   LEAD2: 110xxxxx  (first byte of a two-byte sequence)
 *   LEAD3: 1110xxxx  (first byte of a three-byte sequence)
 *   LEAD4: 11110xxx  (first byte of a four-byte sequence)
 *   TAIL:  10xxxxxx  (continuation byte)
 * The cast to unsigned char makes the tests independent of the signedness
 * of the byte's type. */
#define IS_UTF8_LEAD1(byte) ((unsigned char)(byte) <= 0x7f)
#define IS_UTF8_LEAD2(byte) (((unsigned char)(byte) & 0xe0) == 0xc0)
#define IS_UTF8_LEAD3(byte) (((unsigned char)(byte) & 0xf0) == 0xe0)
#define IS_UTF8_LEAD4(byte) (((unsigned char)(byte) & 0xf8) == 0xf0)
#define IS_UTF8_TAIL(byte) (((unsigned char)(byte) & 0xc0) == 0x80)
875
876 static unsigned
877 md_decode_utf8__(const CHAR* str, SZ str_size, SZ* p_size)
878 {
879 if(!IS_UTF8_LEAD1(str[0])) {
880 if(IS_UTF8_LEAD2(str[0])) {
881 if(1 < str_size && IS_UTF8_TAIL(str[1])) {
882 if(p_size != NULL)
883 *p_size = 2;
884
885 return (((unsigned int)str[0] & 0x1f) << 6) |
886 (((unsigned int)str[1] & 0x3f) << 0);
887 }
888 } else if(IS_UTF8_LEAD3(str[0])) {
889 if(2 < str_size && IS_UTF8_TAIL(str[1]) && IS_UTF8_TAIL(str[2])) {
890 if(p_size != NULL)
891 *p_size = 3;
892
893 return (((unsigned int)str[0] & 0x0f) << 12) |
894 (((unsigned int)str[1] & 0x3f) << 6) |
895 (((unsigned int)str[2] & 0x3f) << 0);
896 }
897 } else if(IS_UTF8_LEAD4(str[0])) {
898 if(3 < str_size && IS_UTF8_TAIL(str[1]) && IS_UTF8_TAIL(str[2]) && IS_UTF8_TAIL(str[3])) {
899 if(p_size != NULL)
900 *p_size = 4;
901
902 return (((unsigned int)str[0] & 0x07) << 18) |
903 (((unsigned int)str[1] & 0x3f) << 12) |
904 (((unsigned int)str[2] & 0x3f) << 6) |
905 (((unsigned int)str[3] & 0x3f) << 0);
906 }
907 }
908 }
909
910 if(p_size != NULL)
911 *p_size = 1;
912 return (unsigned) str[0];
913 }
914
915 static unsigned
916 md_decode_utf8_before__(MD_CTX* ctx, OFF off)
917 {
918 if(!IS_UTF8_LEAD1(CH(off-1))) {
919 if(off > 1 && IS_UTF8_LEAD2(CH(off-2)) && IS_UTF8_TAIL(CH(off-1)))
920 return (((unsigned int)CH(off-2) & 0x1f) << 6) |
921 (((unsigned int)CH(off-1) & 0x3f) << 0);
922
923 if(off > 2 && IS_UTF8_LEAD3(CH(off-3)) && IS_UTF8_TAIL(CH(off-2)) && IS_UTF8_TAIL(CH(off-1)))
924 return (((unsigned int)CH(off-3) & 0x0f) << 12) |
925 (((unsigned int)CH(off-2) & 0x3f) << 6) |
926 (((unsigned int)CH(off-1) & 0x3f) << 0);
927
928 if(off > 3 && IS_UTF8_LEAD4(CH(off-4)) && IS_UTF8_TAIL(CH(off-3)) && IS_UTF8_TAIL(CH(off-2)) && IS_UTF8_TAIL(CH(off-1)))
929 return (((unsigned int)CH(off-4) & 0x07) << 18) |
930 (((unsigned int)CH(off-3) & 0x3f) << 12) |
931 (((unsigned int)CH(off-2) & 0x3f) << 6) |
932 (((unsigned int)CH(off-1) & 0x3f) << 0);
933 }
934
935 return (unsigned) CH(off-1);
936 }
937
/* Unicode-aware classification for the UTF-8 build. The offset-based
 * variants decode the codepoint at (or ending right before) the offset
 * first; they refer to a variable named 'ctx', so one has to be in scope
 * wherever they are used. */
#define ISUNICODEWHITESPACE_(codepoint) md_is_unicode_whitespace__(codepoint)
#define ISUNICODEWHITESPACE(off) md_is_unicode_whitespace__(md_decode_utf8__(STR(off), ctx->size - (off), NULL))
#define ISUNICODEWHITESPACEBEFORE(off) md_is_unicode_whitespace__(md_decode_utf8_before__(ctx, off))

#define ISUNICODEPUNCT(off) md_is_unicode_punct__(md_decode_utf8__(STR(off), ctx->size - (off), NULL))
#define ISUNICODEPUNCTBEFORE(off) md_is_unicode_punct__(md_decode_utf8_before__(ctx, off))
944
945 static inline unsigned
946 md_decode_unicode(const CHAR* str, OFF off, SZ str_size, SZ* p_char_size)
947 {
948 return md_decode_utf8__(str: str+off, str_size: str_size-off, p_size: p_char_size);
949 }
950#else
/* Neither UTF-8 nor UTF-16 is enabled: the "Unicode" classification macros
 * simply map to their ASCII counterparts. */
#define ISUNICODEWHITESPACE_(codepoint) ISWHITESPACE_(codepoint)
#define ISUNICODEWHITESPACE(off) ISWHITESPACE(off)
#define ISUNICODEWHITESPACEBEFORE(off) ISWHITESPACE((off)-1)

#define ISUNICODEPUNCT(off) ISPUNCT(off)
#define ISUNICODEPUNCTBEFORE(off) ISPUNCT((off)-1)
957
958 static inline void
959 md_get_unicode_fold_info(unsigned codepoint, MD_UNICODE_FOLD_INFO* info)
960 {
961 info->codepoints[0] = codepoint;
962 if(ISUPPER_(codepoint))
963 info->codepoints[0] += 'a' - 'A';
964 info->n_codepoints = 1;
965 }
966
967 static inline unsigned
968 md_decode_unicode(const CHAR* str, OFF off, SZ str_size, SZ* p_size)
969 {
970 *p_size = 1;
971 return (unsigned) str[off];
972 }
973#endif
974
975
976/*************************************
977 *** Helper string manipulations ***
978 *************************************/
979
980/* Fill buffer with copy of the string between 'beg' and 'end' but replace any
981 * line breaks with given replacement character.
982 *
983 * NOTE: Caller is responsible to make sure the buffer is large enough.
984 * (Given the output is always shorter then input, (end - beg) is good idea
985 * what the caller should allocate.)
986 */
987static void
988md_merge_lines(MD_CTX* ctx, OFF beg, OFF end, const MD_LINE* lines, MD_SIZE n_lines,
989 CHAR line_break_replacement_char, CHAR* buffer, SZ* p_size)
990{
991 CHAR* ptr = buffer;
992 int line_index = 0;
993 OFF off = beg;
994
995 MD_UNUSED(n_lines);
996
997 while(1) {
998 const MD_LINE* line = &lines[line_index];
999 OFF line_end = line->end;
1000 if(end < line_end)
1001 line_end = end;
1002
1003 while(off < line_end) {
1004 *ptr = CH(off);
1005 ptr++;
1006 off++;
1007 }
1008
1009 if(off >= end) {
1010 *p_size = (MD_SIZE)(ptr - buffer);
1011 return;
1012 }
1013
1014 *ptr = line_break_replacement_char;
1015 ptr++;
1016
1017 line_index++;
1018 off = lines[line_index].beg;
1019 }
1020}
1021
1022/* Wrapper of md_merge_lines() which allocates new buffer for the output string.
1023 */
1024static int
1025md_merge_lines_alloc(MD_CTX* ctx, OFF beg, OFF end, const MD_LINE* lines, MD_SIZE n_lines,
1026 CHAR line_break_replacement_char, CHAR** p_str, SZ* p_size)
1027{
1028 CHAR* buffer;
1029
1030 buffer = (CHAR*) malloc(size: sizeof(CHAR) * (end - beg));
1031 if(buffer == NULL) {
1032 MD_LOG("malloc() failed.");
1033 return -1;
1034 }
1035
1036 md_merge_lines(ctx, beg, end, lines, n_lines,
1037 line_break_replacement_char, buffer, p_size);
1038
1039 *p_str = buffer;
1040 return 0;
1041}
1042
1043static OFF
1044md_skip_unicode_whitespace(const CHAR* label, OFF off, SZ size)
1045{
1046 SZ char_size;
1047 unsigned codepoint;
1048
1049 while(off < size) {
1050 codepoint = md_decode_unicode(str: label, off, str_size: size, p_char_size: &char_size);
1051 if(!ISUNICODEWHITESPACE_(codepoint) && !ISNEWLINE_(label[off]))
1052 break;
1053 off += char_size;
1054 }
1055
1056 return off;
1057}
1058
1059
1060/******************************
1061 *** Recognizing raw HTML ***
1062 ******************************/
1063
/* md_is_html_tag() may be called when processing inlines (inline raw HTML)
 * or when breaking document to blocks (checking for start of HTML block type 7).
 *
 * When breaking document to blocks, we do not yet know line boundaries, but
 * in that case the whole tag has to live on a single line. We distinguish this
 * by n_lines == 0.
 *
 * 'beg' has to point to the opening '<'. The tag has to be terminated by
 * '>' located before 'max_end'. On success, TRUE is returned and *p_end is
 * set to the offset just after the closing '>'. Otherwise FALSE is returned
 * and *p_end is left untouched.
 */
static int
md_is_html_tag(MD_CTX* ctx, const MD_LINE* lines, MD_SIZE n_lines, OFF beg, OFF max_end, OFF* p_end)
{
    int attr_state;
    OFF off = beg;
    /* Without line info (n_lines == 0), only the document size limits us. */
    OFF line_end = (n_lines > 0) ? lines[0].end : ctx->size;
    MD_SIZE line_index = 0;

    MD_ASSERT(CH(beg) == _T('<'));

    /* At least one more character has to follow the '<' on this line. */
    if(off + 1 >= line_end)
        return FALSE;
    off++;

    /* For parsing attributes, we need a little state automaton below.
     * State -1: no attributes are allowed.
     * State 0: attribute could follow after some whitespace.
     * State 1: after a whitespace (attribute name may follow).
     * State 2: after attribute name ('=' MAY follow).
     * State 3: after '=' (value specification MUST follow).
     * State 41: in middle of unquoted attribute value.
     * State 42: in middle of single-quoted attribute value.
     * State 43: in middle of double-quoted attribute value.
     */
    attr_state = 0;

    if(CH(off) == _T('/')) {
        /* Closer tag "</ ... >". No attributes may be present. */
        attr_state = -1;
        off++;
    }

    /* Tag name */
    if(off >= line_end || !ISALPHA(off))
        return FALSE;
    off++;
    while(off < line_end && (ISALNUM(off) || CH(off) == _T('-')))
        off++;

    /* (Optional) attributes (if not closer), (optional) '/' (if not closer)
     * and final '>'. */
    while(1) {
        while(off < line_end && !ISNEWLINE(off)) {
            if(attr_state > 40) {
                /* Inside of an attribute value: look only for its end. */
                if(attr_state == 41 && (ISBLANK(off) || ISANYOF(off, _T("\"'=<>`")))) {
                    attr_state = 0;
                    off--; /* Put the char back for re-inspection in the new state. */
                } else if(attr_state == 42 && CH(off) == _T('\'')) {
                    attr_state = 0;
                } else if(attr_state == 43 && CH(off) == _T('"')) {
                    attr_state = 0;
                }
                off++;
            } else if(ISWHITESPACE(off)) {
                if(attr_state == 0)
                    attr_state = 1;
                off++;
            } else if(attr_state <= 2 && CH(off) == _T('>')) {
                /* End. */
                goto done;
            } else if(attr_state <= 2 && CH(off) == _T('/') && off+1 < line_end && CH(off+1) == _T('>')) {
                /* End with digraph '/>' */
                off++;
                goto done;
            } else if((attr_state == 1 || attr_state == 2) && (ISALPHA(off) || CH(off) == _T('_') || CH(off) == _T(':'))) {
                off++;
                /* Attribute name */
                while(off < line_end && (ISALNUM(off) || ISANYOF(off, _T("_.:-"))))
                    off++;
                attr_state = 2;
            } else if(attr_state == 2 && CH(off) == _T('=')) {
                /* Attribute assignment sign */
                off++;
                attr_state = 3;
            } else if(attr_state == 3) {
                /* Expecting start of attribute value. */
                if(CH(off) == _T('"'))
                    attr_state = 43;
                else if(CH(off) == _T('\''))
                    attr_state = 42;
                else if(!ISANYOF(off, _T("\"'=<>`")) && !ISNEWLINE(off))
                    attr_state = 41;
                else
                    return FALSE;
                off++;
            } else {
                /* Anything unexpected. */
                return FALSE;
            }
        }

        /* We have to be on a single line. See definition of start condition
         * of HTML block, type 7. */
        if(n_lines == 0)
            return FALSE;

        /* The tag is not finished yet; continue on the next line, if any. */
        line_index++;
        if(line_index >= n_lines)
            return FALSE;

        off = lines[line_index].beg;
        line_end = lines[line_index].end;

        /* The line break works as a whitespace: it allows an attribute name
         * to follow and it also ends any unquoted attribute value. */
        if(attr_state == 0 || attr_state == 41)
            attr_state = 1;

        if(off >= max_end)
            return FALSE;
    }

done:
    /* Here 'off' points to the closing '>'; it has to lie before max_end. */
    if(off >= max_end)
        return FALSE;

    *p_end = off+1;
    return TRUE;
}
1188
1189static int
1190md_scan_for_html_closer(MD_CTX* ctx, const MD_CHAR* str, MD_SIZE len,
1191 const MD_LINE* lines, MD_SIZE n_lines,
1192 OFF beg, OFF max_end, OFF* p_end,
1193 OFF* p_scan_horizon)
1194{
1195 OFF off = beg;
1196 MD_SIZE line_index = 0;
1197
1198 if(off < *p_scan_horizon && *p_scan_horizon >= max_end - len) {
1199 /* We have already scanned the range up to the max_end so we know
1200 * there is nothing to see. */
1201 return FALSE;
1202 }
1203
1204 while(TRUE) {
1205 while(off + len <= lines[line_index].end && off + len <= max_end) {
1206 if(md_ascii_eq(STR(off), s2: str, n: len)) {
1207 /* Success. */
1208 *p_end = off + len;
1209 return TRUE;
1210 }
1211 off++;
1212 }
1213
1214 line_index++;
1215 if(off >= max_end || line_index >= n_lines) {
1216 /* Failure. */
1217 *p_scan_horizon = off;
1218 return FALSE;
1219 }
1220
1221 off = lines[line_index].beg;
1222 }
1223}
1224
1225static int
1226md_is_html_comment(MD_CTX* ctx, const MD_LINE* lines, MD_SIZE n_lines, OFF beg, OFF max_end, OFF* p_end)
1227{
1228 OFF off = beg;
1229
1230 MD_ASSERT(CH(beg) == _T('<'));
1231
1232 if(off + 4 >= lines[0].end)
1233 return FALSE;
1234 if(CH(off+1) != _T('!') || CH(off+2) != _T('-') || CH(off+3) != _T('-'))
1235 return FALSE;
1236
1237 /* Skip only "<!" so that we accept also "<!-->" or "<!--->" */
1238 off += 2;
1239
1240 /* Scan for ordinary comment closer "-->". */
1241 return md_scan_for_html_closer(ctx, _T("-->"), len: 3,
1242 lines, n_lines, beg: off, max_end, p_end, p_scan_horizon: &ctx->html_comment_horizon);
1243}
1244
1245static int
1246md_is_html_processing_instruction(MD_CTX* ctx, const MD_LINE* lines, MD_SIZE n_lines, OFF beg, OFF max_end, OFF* p_end)
1247{
1248 OFF off = beg;
1249
1250 if(off + 2 >= lines[0].end)
1251 return FALSE;
1252 if(CH(off+1) != _T('?'))
1253 return FALSE;
1254 off += 2;
1255
1256 return md_scan_for_html_closer(ctx, _T("?>"), len: 2,
1257 lines, n_lines, beg: off, max_end, p_end, p_scan_horizon: &ctx->html_proc_instr_horizon);
1258}
1259
1260static int
1261md_is_html_declaration(MD_CTX* ctx, const MD_LINE* lines, MD_SIZE n_lines, OFF beg, OFF max_end, OFF* p_end)
1262{
1263 OFF off = beg;
1264
1265 if(off + 2 >= lines[0].end)
1266 return FALSE;
1267 if(CH(off+1) != _T('!'))
1268 return FALSE;
1269 off += 2;
1270
1271 /* Declaration name. */
1272 if(off >= lines[0].end || !ISALPHA(off))
1273 return FALSE;
1274 off++;
1275 while(off < lines[0].end && ISALPHA(off))
1276 off++;
1277
1278 return md_scan_for_html_closer(ctx, _T(">"), len: 1,
1279 lines, n_lines, beg: off, max_end, p_end, p_scan_horizon: &ctx->html_decl_horizon);
1280}
1281
1282static int
1283md_is_html_cdata(MD_CTX* ctx, const MD_LINE* lines, MD_SIZE n_lines, OFF beg, OFF max_end, OFF* p_end)
1284{
1285 static const CHAR open_str[] = _T("<![CDATA[");
1286 static const SZ open_size = SIZEOF_ARRAY(open_str) - 1;
1287
1288 OFF off = beg;
1289
1290 if(off + open_size >= lines[0].end)
1291 return FALSE;
1292 if(memcmp(STR(off), s2: open_str, n: open_size) != 0)
1293 return FALSE;
1294 off += open_size;
1295
1296 return md_scan_for_html_closer(ctx, _T("]]>"), len: 3,
1297 lines, n_lines, beg: off, max_end, p_end, p_scan_horizon: &ctx->html_cdata_horizon);
1298}
1299
1300static int
1301md_is_html_any(MD_CTX* ctx, const MD_LINE* lines, MD_SIZE n_lines, OFF beg, OFF max_end, OFF* p_end)
1302{
1303 MD_ASSERT(CH(beg) == _T('<'));
1304 return (md_is_html_tag(ctx, lines, n_lines, beg, max_end, p_end) ||
1305 md_is_html_comment(ctx, lines, n_lines, beg, max_end, p_end) ||
1306 md_is_html_processing_instruction(ctx, lines, n_lines, beg, max_end, p_end) ||
1307 md_is_html_declaration(ctx, lines, n_lines, beg, max_end, p_end) ||
1308 md_is_html_cdata(ctx, lines, n_lines, beg, max_end, p_end));
1309}
1310
1311
1312/****************************
1313 *** Recognizing Entity ***
1314 ****************************/
1315
1316static int
1317md_is_hex_entity_contents(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end)
1318{
1319 OFF off = beg;
1320 MD_UNUSED(ctx);
1321
1322 while(off < max_end && ISXDIGIT_(text[off]) && off - beg <= 8)
1323 off++;
1324
1325 if(1 <= off - beg && off - beg <= 6) {
1326 *p_end = off;
1327 return TRUE;
1328 } else {
1329 return FALSE;
1330 }
1331}
1332
1333static int
1334md_is_dec_entity_contents(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end)
1335{
1336 OFF off = beg;
1337 MD_UNUSED(ctx);
1338
1339 while(off < max_end && ISDIGIT_(text[off]) && off - beg <= 8)
1340 off++;
1341
1342 if(1 <= off - beg && off - beg <= 7) {
1343 *p_end = off;
1344 return TRUE;
1345 } else {
1346 return FALSE;
1347 }
1348}
1349
1350static int
1351md_is_named_entity_contents(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end)
1352{
1353 OFF off = beg;
1354 MD_UNUSED(ctx);
1355
1356 if(off < max_end && ISALPHA_(text[off]))
1357 off++;
1358 else
1359 return FALSE;
1360
1361 while(off < max_end && ISALNUM_(text[off]) && off - beg <= 48)
1362 off++;
1363
1364 if(2 <= off - beg && off - beg <= 48) {
1365 *p_end = off;
1366 return TRUE;
1367 } else {
1368 return FALSE;
1369 }
1370}
1371
1372static int
1373md_is_entity_str(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end)
1374{
1375 int is_contents;
1376 OFF off = beg;
1377
1378 MD_ASSERT(text[off] == _T('&'));
1379 off++;
1380
1381 if(off+2 < max_end && text[off] == _T('#') && (text[off+1] == _T('x') || text[off+1] == _T('X')))
1382 is_contents = md_is_hex_entity_contents(ctx, text, beg: off+2, max_end, p_end: &off);
1383 else if(off+1 < max_end && text[off] == _T('#'))
1384 is_contents = md_is_dec_entity_contents(ctx, text, beg: off+1, max_end, p_end: &off);
1385 else
1386 is_contents = md_is_named_entity_contents(ctx, text, beg: off, max_end, p_end: &off);
1387
1388 if(is_contents && off < max_end && text[off] == _T(';')) {
1389 *p_end = off+1;
1390 return TRUE;
1391 } else {
1392 return FALSE;
1393 }
1394}
1395
1396static inline int
1397md_is_entity(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end)
1398{
1399 return md_is_entity_str(ctx, text: ctx->text, beg, max_end, p_end);
1400}
1401
1402
1403/******************************
1404 *** Attribute Management ***
1405 ******************************/
1406
/* Working storage behind a MD_ATTRIBUTE. It is filled by
 * md_build_attribute() and released by md_free_attribute(). */
typedef struct MD_ATTRIBUTE_BUILD_tag MD_ATTRIBUTE_BUILD;
struct MD_ATTRIBUTE_BUILD_tag {
    /* Attribute text: either the raw input itself (trivial case) or
     * a malloc()ed processed copy of it. */
    CHAR* text;
    /* Type of each substring. */
    MD_TEXTTYPE* substr_types;
    /* Start offset of each substring, followed by one extra item holding
     * the total text size. */
    OFF* substr_offsets;
    /* Number of substrings stored in the two arrays above. */
    int substr_count;
    /* Capacity of the heap-allocated arrays. Zero means nothing is
     * heap-allocated (trivial case), so md_free_attribute() frees nothing. */
    int substr_alloc;
    /* In-place storage the two array pointers refer to in the trivial case
     * (exactly one substring), which avoids any allocation. */
    MD_TEXTTYPE trivial_types[1];
    OFF trivial_offsets[2];
};
1417
1418
/* Flag for md_build_attribute(): when set, backslash escapes in the raw
 * text are not resolved, i.e. the backslashes are copied verbatim. */
#define MD_BUILD_ATTR_NO_ESCAPES 0x0001
1420
1421static int
1422md_build_attr_append_substr(MD_CTX* ctx, MD_ATTRIBUTE_BUILD* build,
1423 MD_TEXTTYPE type, OFF off)
1424{
1425 if(build->substr_count >= build->substr_alloc) {
1426 MD_TEXTTYPE* new_substr_types;
1427 OFF* new_substr_offsets;
1428
1429 build->substr_alloc = (build->substr_alloc > 0
1430 ? build->substr_alloc + build->substr_alloc / 2
1431 : 8);
1432 new_substr_types = (MD_TEXTTYPE*) realloc(ptr: build->substr_types,
1433 size: build->substr_alloc * sizeof(MD_TEXTTYPE));
1434 if(new_substr_types == NULL) {
1435 MD_LOG("realloc() failed.");
1436 return -1;
1437 }
1438 /* Note +1 to reserve space for final offset (== raw_size). */
1439 new_substr_offsets = (OFF*) realloc(ptr: build->substr_offsets,
1440 size: (build->substr_alloc+1) * sizeof(OFF));
1441 if(new_substr_offsets == NULL) {
1442 MD_LOG("realloc() failed.");
1443 free(ptr: new_substr_types);
1444 return -1;
1445 }
1446
1447 build->substr_types = new_substr_types;
1448 build->substr_offsets = new_substr_offsets;
1449 }
1450
1451 build->substr_types[build->substr_count] = type;
1452 build->substr_offsets[build->substr_count] = off;
1453 build->substr_count++;
1454 return 0;
1455}
1456
1457static void
1458md_free_attribute(MD_CTX* ctx, MD_ATTRIBUTE_BUILD* build)
1459{
1460 MD_UNUSED(ctx);
1461
1462 if(build->substr_alloc > 0) {
1463 free(ptr: build->text);
1464 free(ptr: build->substr_types);
1465 free(ptr: build->substr_offsets);
1466 }
1467}
1468
1469static int
1470md_build_attribute(MD_CTX* ctx, const CHAR* raw_text, SZ raw_size,
1471 unsigned flags, MD_ATTRIBUTE* attr, MD_ATTRIBUTE_BUILD* build)
1472{
1473 OFF raw_off, off;
1474 int is_trivial;
1475 int ret = 0;
1476
1477 memset(s: build, c: 0, n: sizeof(MD_ATTRIBUTE_BUILD));
1478
1479 /* If there is no backslash and no ampersand, build trivial attribute
1480 * without any malloc(). */
1481 is_trivial = TRUE;
1482 for(raw_off = 0; raw_off < raw_size; raw_off++) {
1483 if(ISANYOF3_(raw_text[raw_off], _T('\\'), _T('&'), _T('\0'))) {
1484 is_trivial = FALSE;
1485 break;
1486 }
1487 }
1488
1489 if(is_trivial) {
1490 build->text = (CHAR*) (raw_size ? raw_text : NULL);
1491 build->substr_types = build->trivial_types;
1492 build->substr_offsets = build->trivial_offsets;
1493 build->substr_count = 1;
1494 build->substr_alloc = 0;
1495 build->trivial_types[0] = MD_TEXT_NORMAL;
1496 build->trivial_offsets[0] = 0;
1497 build->trivial_offsets[1] = raw_size;
1498 off = raw_size;
1499 } else {
1500 build->text = (CHAR*) malloc(size: raw_size * sizeof(CHAR));
1501 if(build->text == NULL) {
1502 MD_LOG("malloc() failed.");
1503 goto abort;
1504 }
1505
1506 raw_off = 0;
1507 off = 0;
1508
1509 while(raw_off < raw_size) {
1510 if(raw_text[raw_off] == _T('\0')) {
1511 MD_CHECK(md_build_attr_append_substr(ctx, build, MD_TEXT_NULLCHAR, off));
1512 memcpy(dest: build->text + off, src: raw_text + raw_off, n: 1);
1513 off++;
1514 raw_off++;
1515 continue;
1516 }
1517
1518 if(raw_text[raw_off] == _T('&')) {
1519 OFF ent_end;
1520
1521 if(md_is_entity_str(ctx, text: raw_text, beg: raw_off, max_end: raw_size, p_end: &ent_end)) {
1522 MD_CHECK(md_build_attr_append_substr(ctx, build, MD_TEXT_ENTITY, off));
1523 memcpy(dest: build->text + off, src: raw_text + raw_off, n: ent_end - raw_off);
1524 off += ent_end - raw_off;
1525 raw_off = ent_end;
1526 continue;
1527 }
1528 }
1529
1530 if(build->substr_count == 0 || build->substr_types[build->substr_count-1] != MD_TEXT_NORMAL)
1531 MD_CHECK(md_build_attr_append_substr(ctx, build, MD_TEXT_NORMAL, off));
1532
1533 if(!(flags & MD_BUILD_ATTR_NO_ESCAPES) &&
1534 raw_text[raw_off] == _T('\\') && raw_off+1 < raw_size &&
1535 (ISPUNCT_(raw_text[raw_off+1]) || ISNEWLINE_(raw_text[raw_off+1])))
1536 raw_off++;
1537
1538 build->text[off++] = raw_text[raw_off++];
1539 }
1540 build->substr_offsets[build->substr_count] = off;
1541 }
1542
1543 attr->text = build->text;
1544 attr->size = off;
1545 attr->substr_offsets = build->substr_offsets;
1546 attr->substr_types = build->substr_types;
1547 return 0;
1548
1549abort:
1550 md_free_attribute(ctx, build);
1551 return -1;
1552}
1553
1554
1555/*********************************************
1556 *** Dictionary of Reference Definitions ***
1557 *********************************************/
1558
/* 32-bit FNV-1a hash: the initial value and the multiplier. */
#define MD_FNV1A_BASE       2166136261U
#define MD_FNV1A_PRIME      16777619U

/* Update the FNV-1a hash 'base' with 'n' bytes at 'data' and return the
 * new hash value. Pass MD_FNV1A_BASE as 'base' to start a new hash, or the
 * result of a previous call to continue hashing more data. */
static inline unsigned
md_fnv1a(unsigned base, const void* data, size_t n)
{
    const unsigned char* byte = (const unsigned char*) data;
    unsigned hash = base;

    while(n-- > 0)
        hash = (hash ^ *byte++) * MD_FNV1A_PRIME;

    return hash;
}
1576
1577
/* One link reference definition. */
struct MD_REF_DEF_tag {
    /* The label (label_size characters); definitions are looked up by it. */
    CHAR* label;
    /* NOTE(review): presumably the optional link title (title_size
     * characters) -- confirm against the code which fills it. */
    CHAR* title;
    /* Hash of the label; computed by md_build_ref_def_hashtable(). */
    unsigned hash;
    SZ label_size;
    SZ title_size;
    /* NOTE(review): presumably the range of the link destination within
     * the document text -- confirm against the code which fills it. */
    OFF dest_beg;
    OFF dest_end;
    /* NOTE(review): the names suggest these tell whether label/title are
     * heap-allocated and have to be freed -- confirm against the cleanup
     * code. */
    unsigned char label_needs_free : 1;
    unsigned char title_needs_free : 1;
};
1589
1590/* Label equivalence is quite complicated with regards to whitespace and case
1591 * folding. This complicates computing a hash of it as well as direct comparison
1592 * of two labels. */
1593
1594static unsigned
1595md_link_label_hash(const CHAR* label, SZ size)
1596{
1597 unsigned hash = MD_FNV1A_BASE;
1598 OFF off;
1599 unsigned codepoint;
1600 int is_whitespace = FALSE;
1601
1602 off = md_skip_unicode_whitespace(label, off: 0, size);
1603 while(off < size) {
1604 SZ char_size;
1605
1606 codepoint = md_decode_unicode(str: label, off, str_size: size, p_char_size: &char_size);
1607 is_whitespace = ISUNICODEWHITESPACE_(codepoint) || ISNEWLINE_(label[off]);
1608
1609 if(is_whitespace) {
1610 codepoint = ' ';
1611 hash = md_fnv1a(base: hash, data: &codepoint, n: sizeof(unsigned));
1612 off = md_skip_unicode_whitespace(label, off, size);
1613 } else {
1614 MD_UNICODE_FOLD_INFO fold_info;
1615
1616 md_get_unicode_fold_info(codepoint, info: &fold_info);
1617 hash = md_fnv1a(base: hash, data: fold_info.codepoints, n: fold_info.n_codepoints * sizeof(unsigned));
1618 off += char_size;
1619 }
1620 }
1621
1622 return hash;
1623}
1624
1625static OFF
1626md_link_label_cmp_load_fold_info(const CHAR* label, OFF off, SZ size,
1627 MD_UNICODE_FOLD_INFO* fold_info)
1628{
1629 unsigned codepoint;
1630 SZ char_size;
1631
1632 if(off >= size) {
1633 /* Treat end of a link label as a whitespace. */
1634 goto whitespace;
1635 }
1636
1637 codepoint = md_decode_unicode(str: label, off, str_size: size, p_char_size: &char_size);
1638 off += char_size;
1639 if(ISUNICODEWHITESPACE_(codepoint)) {
1640 /* Treat all whitespace as equivalent */
1641 goto whitespace;
1642 }
1643
1644 /* Get real folding info. */
1645 md_get_unicode_fold_info(codepoint, info: fold_info);
1646 return off;
1647
1648whitespace:
1649 fold_info->codepoints[0] = _T(' ');
1650 fold_info->n_codepoints = 1;
1651 return md_skip_unicode_whitespace(label, off, size);
1652}
1653
1654static int
1655md_link_label_cmp(const CHAR* a_label, SZ a_size, const CHAR* b_label, SZ b_size)
1656{
1657 OFF a_off;
1658 OFF b_off;
1659 MD_UNICODE_FOLD_INFO a_fi = { { 0 }, 0 };
1660 MD_UNICODE_FOLD_INFO b_fi = { { 0 }, 0 };
1661 OFF a_fi_off = 0;
1662 OFF b_fi_off = 0;
1663 int cmp;
1664
1665 a_off = md_skip_unicode_whitespace(label: a_label, off: 0, size: a_size);
1666 b_off = md_skip_unicode_whitespace(label: b_label, off: 0, size: b_size);
1667 while(a_off < a_size || a_fi_off < a_fi.n_codepoints ||
1668 b_off < b_size || b_fi_off < b_fi.n_codepoints)
1669 {
1670 /* If needed, load fold info for next char. */
1671 if(a_fi_off >= a_fi.n_codepoints) {
1672 a_fi_off = 0;
1673 a_off = md_link_label_cmp_load_fold_info(label: a_label, off: a_off, size: a_size, fold_info: &a_fi);
1674 }
1675 if(b_fi_off >= b_fi.n_codepoints) {
1676 b_fi_off = 0;
1677 b_off = md_link_label_cmp_load_fold_info(label: b_label, off: b_off, size: b_size, fold_info: &b_fi);
1678 }
1679
1680 cmp = b_fi.codepoints[b_fi_off] - a_fi.codepoints[a_fi_off];
1681 if(cmp != 0)
1682 return cmp;
1683
1684 a_fi_off++;
1685 b_fi_off++;
1686 }
1687
1688 return 0;
1689}
1690
/* Hash table bucket which holds more than one reference definition. */
typedef struct MD_REF_DEF_LIST_tag MD_REF_DEF_LIST;
struct MD_REF_DEF_LIST_tag {
    int n_ref_defs;         /* Number of used items in ref_defs[]. */
    int alloc_ref_defs;     /* Number of allocated items in ref_defs[]. */
    MD_REF_DEF* ref_defs[]; /* Valid items always point into ctx->ref_defs[] */
};
1697
1698static int
1699md_ref_def_cmp(const void* a, const void* b)
1700{
1701 const MD_REF_DEF* a_ref = *(const MD_REF_DEF**)a;
1702 const MD_REF_DEF* b_ref = *(const MD_REF_DEF**)b;
1703
1704 if(a_ref->hash < b_ref->hash)
1705 return -1;
1706 else if(a_ref->hash > b_ref->hash)
1707 return +1;
1708 else
1709 return md_link_label_cmp(a_label: a_ref->label, a_size: a_ref->label_size, b_label: b_ref->label, b_size: b_ref->label_size);
1710}
1711
1712static int
1713md_ref_def_cmp_for_sort(const void* a, const void* b)
1714{
1715 int cmp;
1716
1717 cmp = md_ref_def_cmp(a, b);
1718
1719 /* Ensure stability of the sorting. */
1720 if(cmp == 0) {
1721 const MD_REF_DEF* a_ref = *(const MD_REF_DEF**)a;
1722 const MD_REF_DEF* b_ref = *(const MD_REF_DEF**)b;
1723
1724 if(a_ref < b_ref)
1725 cmp = -1;
1726 else if(a_ref > b_ref)
1727 cmp = +1;
1728 else
1729 cmp = 0;
1730 }
1731
1732 return cmp;
1733}
1734
1735static int
1736md_build_ref_def_hashtable(MD_CTX* ctx)
1737{
1738 int i, j;
1739
1740 if(ctx->n_ref_defs == 0)
1741 return 0;
1742
1743 ctx->ref_def_hashtable_size = (ctx->n_ref_defs * 5) / 4;
1744 ctx->ref_def_hashtable = malloc(size: ctx->ref_def_hashtable_size * sizeof(void*));
1745 if(ctx->ref_def_hashtable == NULL) {
1746 MD_LOG("malloc() failed.");
1747 goto abort;
1748 }
1749 memset(s: ctx->ref_def_hashtable, c: 0, n: ctx->ref_def_hashtable_size * sizeof(void*));
1750
1751 /* Each member of ctx->ref_def_hashtable[] can be:
1752 * -- NULL,
1753 * -- pointer to the MD_REF_DEF in ctx->ref_defs[], or
1754 * -- pointer to a MD_REF_DEF_LIST, which holds multiple pointers to
1755 * such MD_REF_DEFs.
1756 */
1757 for(i = 0; i < ctx->n_ref_defs; i++) {
1758 MD_REF_DEF* def = &ctx->ref_defs[i];
1759 void* bucket;
1760 MD_REF_DEF_LIST* list;
1761
1762 def->hash = md_link_label_hash(label: def->label, size: def->label_size);
1763 bucket = ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size];
1764
1765 if(bucket == NULL) {
1766 /* The bucket is empty. Make it just point to the def. */
1767 ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size] = def;
1768 continue;
1769 }
1770
1771 if(ctx->ref_defs <= (MD_REF_DEF*) bucket && (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs) {
1772 /* The bucket already contains one ref. def. Lets see whether it
1773 * is the same label (ref. def. duplicate) or different one
1774 * (hash conflict). */
1775 MD_REF_DEF* old_def = (MD_REF_DEF*) bucket;
1776
1777 if(md_link_label_cmp(a_label: def->label, a_size: def->label_size, b_label: old_def->label, b_size: old_def->label_size) == 0) {
1778 /* Duplicate label: Ignore this ref. def. */
1779 continue;
1780 }
1781
1782 /* Make the bucket complex, i.e. able to hold more ref. defs. */
1783 list = (MD_REF_DEF_LIST*) malloc(size: sizeof(MD_REF_DEF_LIST) + 2 * sizeof(MD_REF_DEF*));
1784 if(list == NULL) {
1785 MD_LOG("malloc() failed.");
1786 goto abort;
1787 }
1788 list->ref_defs[0] = old_def;
1789 list->ref_defs[1] = def;
1790 list->n_ref_defs = 2;
1791 list->alloc_ref_defs = 2;
1792 ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size] = list;
1793 continue;
1794 }
1795
1796 /* Append the def to the complex bucket list.
1797 *
1798 * Note in this case we ignore potential duplicates to avoid expensive
1799 * iterating over the complex bucket. Below, we revisit all the complex
1800 * buckets and handle it more cheaply after the complex bucket contents
1801 * is sorted. */
1802 list = (MD_REF_DEF_LIST*) bucket;
1803 if(list->n_ref_defs >= list->alloc_ref_defs) {
1804 int alloc_ref_defs = list->alloc_ref_defs + list->alloc_ref_defs / 2;
1805 MD_REF_DEF_LIST* list_tmp = (MD_REF_DEF_LIST*) realloc(ptr: list,
1806 size: sizeof(MD_REF_DEF_LIST) + alloc_ref_defs * sizeof(MD_REF_DEF*));
1807 if(list_tmp == NULL) {
1808 MD_LOG("realloc() failed.");
1809 goto abort;
1810 }
1811 list = list_tmp;
1812 list->alloc_ref_defs = alloc_ref_defs;
1813 ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size] = list;
1814 }
1815
1816 list->ref_defs[list->n_ref_defs] = def;
1817 list->n_ref_defs++;
1818 }
1819
1820 /* Sort the complex buckets so we can use bsearch() with them. */
1821 for(i = 0; i < ctx->ref_def_hashtable_size; i++) {
1822 void* bucket = ctx->ref_def_hashtable[i];
1823 MD_REF_DEF_LIST* list;
1824
1825 if(bucket == NULL)
1826 continue;
1827 if(ctx->ref_defs <= (MD_REF_DEF*) bucket && (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs)
1828 continue;
1829
1830 list = (MD_REF_DEF_LIST*) bucket;
1831 qsort(base: list->ref_defs, nmemb: list->n_ref_defs, size: sizeof(MD_REF_DEF*), compar: md_ref_def_cmp_for_sort);
1832
1833 /* Disable all duplicates in the complex bucket by forcing all such
1834 * records to point to the 1st such ref. def. I.e. no matter which
1835 * record is found during the lookup, it will always point to the right
1836 * ref. def. in ctx->ref_defs[]. */
1837 for(j = 1; j < list->n_ref_defs; j++) {
1838 if(md_ref_def_cmp(a: &list->ref_defs[j-1], b: &list->ref_defs[j]) == 0)
1839 list->ref_defs[j] = list->ref_defs[j-1];
1840 }
1841 }
1842
1843 return 0;
1844
1845abort:
1846 return -1;
1847}
1848
1849static void
1850md_free_ref_def_hashtable(MD_CTX* ctx)
1851{
1852 if(ctx->ref_def_hashtable != NULL) {
1853 int i;
1854
1855 for(i = 0; i < ctx->ref_def_hashtable_size; i++) {
1856 void* bucket = ctx->ref_def_hashtable[i];
1857 if(bucket == NULL)
1858 continue;
1859 if(ctx->ref_defs <= (MD_REF_DEF*) bucket && (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs)
1860 continue;
1861 free(ptr: bucket);
1862 }
1863
1864 free(ptr: ctx->ref_def_hashtable);
1865 }
1866}
1867
1868static const MD_REF_DEF*
1869md_lookup_ref_def(MD_CTX* ctx, const CHAR* label, SZ label_size)
1870{
1871 unsigned hash;
1872 void* bucket;
1873
1874 if(ctx->ref_def_hashtable_size == 0)
1875 return NULL;
1876
1877 hash = md_link_label_hash(label, size: label_size);
1878 bucket = ctx->ref_def_hashtable[hash % ctx->ref_def_hashtable_size];
1879
1880 if(bucket == NULL) {
1881 return NULL;
1882 } else if(ctx->ref_defs <= (MD_REF_DEF*) bucket && (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs) {
1883 const MD_REF_DEF* def = (MD_REF_DEF*) bucket;
1884
1885 if(md_link_label_cmp(a_label: def->label, a_size: def->label_size, b_label: label, b_size: label_size) == 0)
1886 return def;
1887 else
1888 return NULL;
1889 } else {
1890 MD_REF_DEF_LIST* list = (MD_REF_DEF_LIST*) bucket;
1891 MD_REF_DEF key_buf;
1892 const MD_REF_DEF* key = &key_buf;
1893 const MD_REF_DEF** ret;
1894
1895 key_buf.label = (CHAR*) label;
1896 key_buf.label_size = label_size;
1897 key_buf.hash = md_link_label_hash(label: key_buf.label, size: key_buf.label_size);
1898
1899 ret = (const MD_REF_DEF**) bsearch(key: &key, base: list->ref_defs,
1900 nmemb: list->n_ref_defs, size: sizeof(MD_REF_DEF*), compar: md_ref_def_cmp);
1901 if(ret != NULL)
1902 return *ret;
1903 else
1904 return NULL;
1905 }
1906}
1907
1908
1909/***************************
1910 *** Recognizing Links ***
1911 ***************************/
1912
1913/* Note this code is partially shared between processing inlines and blocks
1914 * as reference definitions and links share some helper parser functions.
1915 */
1916
/* Attributes of a recognized link: where its destination lies in the
 * document and what its title (if any) is. */
typedef struct MD_LINK_ATTR_tag MD_LINK_ATTR;
struct MD_LINK_ATTR_tag {
    /* Offsets delimiting the link destination. */
    OFF dest_beg;
    OFF dest_end;

    /* Link title and its size. The title is NULL (and the size zero) when
     * the link has no title. */
    CHAR* title;
    SZ title_size;
    /* TRUE if the title lives in a buffer allocated for it (a title
     * merged from multiple lines). */
    int title_needs_free;
};
1926
1927
1928static int
1929md_is_link_label(MD_CTX* ctx, const MD_LINE* lines, MD_SIZE n_lines, OFF beg,
1930 OFF* p_end, MD_SIZE* p_beg_line_index, MD_SIZE* p_end_line_index,
1931 OFF* p_contents_beg, OFF* p_contents_end)
1932{
1933 OFF off = beg;
1934 OFF contents_beg = 0;
1935 OFF contents_end = 0;
1936 MD_SIZE line_index = 0;
1937 int len = 0;
1938
1939 if(CH(off) != _T('['))
1940 return FALSE;
1941 off++;
1942
1943 while(1) {
1944 OFF line_end = lines[line_index].end;
1945
1946 while(off < line_end) {
1947 if(CH(off) == _T('\\') && off+1 < ctx->size && (ISPUNCT(off+1) || ISNEWLINE(off+1))) {
1948 if(contents_end == 0) {
1949 contents_beg = off;
1950 *p_beg_line_index = line_index;
1951 }
1952 contents_end = off + 2;
1953 off += 2;
1954 } else if(CH(off) == _T('[')) {
1955 return FALSE;
1956 } else if(CH(off) == _T(']')) {
1957 if(contents_beg < contents_end) {
1958 /* Success. */
1959 *p_contents_beg = contents_beg;
1960 *p_contents_end = contents_end;
1961 *p_end = off+1;
1962 *p_end_line_index = line_index;
1963 return TRUE;
1964 } else {
1965 /* Link label must have some non-whitespace contents. */
1966 return FALSE;
1967 }
1968 } else {
1969 unsigned codepoint;
1970 SZ char_size;
1971
1972 codepoint = md_decode_unicode(str: ctx->text, off, str_size: ctx->size, p_char_size: &char_size);
1973 if(!ISUNICODEWHITESPACE_(codepoint)) {
1974 if(contents_end == 0) {
1975 contents_beg = off;
1976 *p_beg_line_index = line_index;
1977 }
1978 contents_end = off + char_size;
1979 }
1980
1981 off += char_size;
1982 }
1983
1984 len++;
1985 if(len > 999)
1986 return FALSE;
1987 }
1988
1989 line_index++;
1990 len++;
1991 if(line_index < n_lines)
1992 off = lines[line_index].beg;
1993 else
1994 break;
1995 }
1996
1997 return FALSE;
1998}
1999
2000static int
2001md_is_link_destination_A(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end,
2002 OFF* p_contents_beg, OFF* p_contents_end)
2003{
2004 OFF off = beg;
2005
2006 if(off >= max_end || CH(off) != _T('<'))
2007 return FALSE;
2008 off++;
2009
2010 while(off < max_end) {
2011 if(CH(off) == _T('\\') && off+1 < max_end && ISPUNCT(off+1)) {
2012 off += 2;
2013 continue;
2014 }
2015
2016 if(ISNEWLINE(off) || CH(off) == _T('<'))
2017 return FALSE;
2018
2019 if(CH(off) == _T('>')) {
2020 /* Success. */
2021 *p_contents_beg = beg+1;
2022 *p_contents_end = off;
2023 *p_end = off+1;
2024 return TRUE;
2025 }
2026
2027 off++;
2028 }
2029
2030 return FALSE;
2031}
2032
2033static int
2034md_is_link_destination_B(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end,
2035 OFF* p_contents_beg, OFF* p_contents_end)
2036{
2037 OFF off = beg;
2038 int parenthesis_level = 0;
2039
2040 while(off < max_end) {
2041 if(CH(off) == _T('\\') && off+1 < max_end && ISPUNCT(off+1)) {
2042 off += 2;
2043 continue;
2044 }
2045
2046 if(ISWHITESPACE(off) || ISCNTRL(off))
2047 break;
2048
2049 /* Link destination may include balanced pairs of unescaped '(' ')'.
2050 * Note we limit the maximal nesting level by 32 to protect us from
2051 * https://github.com/jgm/cmark/issues/214 */
2052 if(CH(off) == _T('(')) {
2053 parenthesis_level++;
2054 if(parenthesis_level > 32)
2055 return FALSE;
2056 } else if(CH(off) == _T(')')) {
2057 if(parenthesis_level == 0)
2058 break;
2059 parenthesis_level--;
2060 }
2061
2062 off++;
2063 }
2064
2065 if(parenthesis_level != 0 || off == beg)
2066 return FALSE;
2067
2068 /* Success. */
2069 *p_contents_beg = beg;
2070 *p_contents_end = off;
2071 *p_end = off;
2072 return TRUE;
2073}
2074
2075static inline int
2076md_is_link_destination(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end,
2077 OFF* p_contents_beg, OFF* p_contents_end)
2078{
2079 if(CH(beg) == _T('<'))
2080 return md_is_link_destination_A(ctx, beg, max_end, p_end, p_contents_beg, p_contents_end);
2081 else
2082 return md_is_link_destination_B(ctx, beg, max_end, p_end, p_contents_beg, p_contents_end);
2083}
2084
2085static int
2086md_is_link_title(MD_CTX* ctx, const MD_LINE* lines, MD_SIZE n_lines, OFF beg,
2087 OFF* p_end, MD_SIZE* p_beg_line_index, MD_SIZE* p_end_line_index,
2088 OFF* p_contents_beg, OFF* p_contents_end)
2089{
2090 OFF off = beg;
2091 CHAR closer_char;
2092 MD_SIZE line_index = 0;
2093
2094 /* White space with up to one line break. */
2095 while(off < lines[line_index].end && ISWHITESPACE(off))
2096 off++;
2097 if(off >= lines[line_index].end) {
2098 line_index++;
2099 if(line_index >= n_lines)
2100 return FALSE;
2101 off = lines[line_index].beg;
2102 }
2103 if(off == beg)
2104 return FALSE;
2105
2106 *p_beg_line_index = line_index;
2107
2108 /* First char determines how to detect end of it. */
2109 switch(CH(off)) {
2110 case _T('"'): closer_char = _T('"'); break;
2111 case _T('\''): closer_char = _T('\''); break;
2112 case _T('('): closer_char = _T(')'); break;
2113 default: return FALSE;
2114 }
2115 off++;
2116
2117 *p_contents_beg = off;
2118
2119 while(line_index < n_lines) {
2120 OFF line_end = lines[line_index].end;
2121
2122 while(off < line_end) {
2123 if(CH(off) == _T('\\') && off+1 < ctx->size && (ISPUNCT(off+1) || ISNEWLINE(off+1))) {
2124 off++;
2125 } else if(CH(off) == closer_char) {
2126 /* Success. */
2127 *p_contents_end = off;
2128 *p_end = off+1;
2129 *p_end_line_index = line_index;
2130 return TRUE;
2131 } else if(closer_char == _T(')') && CH(off) == _T('(')) {
2132 /* ()-style title cannot contain (unescaped '(')) */
2133 return FALSE;
2134 }
2135
2136 off++;
2137 }
2138
2139 line_index++;
2140 }
2141
2142 return FALSE;
2143}
2144
2145/* Returns 0 if it is not a reference definition.
2146 *
2147 * Returns N > 0 if it is a reference definition. N then corresponds to the
2148 * number of lines forming it). In this case the definition is stored for
2149 * resolving any links referring to it.
2150 *
2151 * Returns -1 in case of an error (out of memory).
2152 */
2153static int
2154md_is_link_reference_definition(MD_CTX* ctx, const MD_LINE* lines, MD_SIZE n_lines)
2155{
2156 OFF label_contents_beg;
2157 OFF label_contents_end;
2158 MD_SIZE label_contents_line_index;
2159 int label_is_multiline = FALSE;
2160 OFF dest_contents_beg;
2161 OFF dest_contents_end;
2162 OFF title_contents_beg;
2163 OFF title_contents_end;
2164 MD_SIZE title_contents_line_index;
2165 int title_is_multiline = FALSE;
2166 OFF off;
2167 MD_SIZE line_index = 0;
2168 MD_SIZE tmp_line_index;
2169 MD_REF_DEF* def = NULL;
2170 int ret = 0;
2171
2172 /* Link label. */
2173 if(!md_is_link_label(ctx, lines, n_lines, beg: lines[0].beg,
2174 p_end: &off, p_beg_line_index: &label_contents_line_index, p_end_line_index: &line_index,
2175 p_contents_beg: &label_contents_beg, p_contents_end: &label_contents_end))
2176 return FALSE;
2177 label_is_multiline = (label_contents_line_index != line_index);
2178
2179 /* Colon. */
2180 if(off >= lines[line_index].end || CH(off) != _T(':'))
2181 return FALSE;
2182 off++;
2183
2184 /* Optional white space with up to one line break. */
2185 while(off < lines[line_index].end && ISWHITESPACE(off))
2186 off++;
2187 if(off >= lines[line_index].end) {
2188 line_index++;
2189 if(line_index >= n_lines)
2190 return FALSE;
2191 off = lines[line_index].beg;
2192 }
2193
2194 /* Link destination. */
2195 if(!md_is_link_destination(ctx, beg: off, max_end: lines[line_index].end,
2196 p_end: &off, p_contents_beg: &dest_contents_beg, p_contents_end: &dest_contents_end))
2197 return FALSE;
2198
2199 /* (Optional) title. Note we interpret it as an title only if nothing
2200 * more follows on its last line. */
2201 if(md_is_link_title(ctx, lines: lines + line_index, n_lines: n_lines - line_index, beg: off,
2202 p_end: &off, p_beg_line_index: &title_contents_line_index, p_end_line_index: &tmp_line_index,
2203 p_contents_beg: &title_contents_beg, p_contents_end: &title_contents_end)
2204 && off >= lines[line_index + tmp_line_index].end)
2205 {
2206 title_is_multiline = (tmp_line_index != title_contents_line_index);
2207 title_contents_line_index += line_index;
2208 line_index += tmp_line_index;
2209 } else {
2210 /* Not a title. */
2211 title_is_multiline = FALSE;
2212 title_contents_beg = off;
2213 title_contents_end = off;
2214 title_contents_line_index = 0;
2215 }
2216
2217 /* Nothing more can follow on the last line. */
2218 if(off < lines[line_index].end)
2219 return FALSE;
2220
2221 /* So, it _is_ a reference definition. Remember it. */
2222 if(ctx->n_ref_defs >= ctx->alloc_ref_defs) {
2223 MD_REF_DEF* new_defs;
2224
2225 ctx->alloc_ref_defs = (ctx->alloc_ref_defs > 0
2226 ? ctx->alloc_ref_defs + ctx->alloc_ref_defs / 2
2227 : 16);
2228 new_defs = (MD_REF_DEF*) realloc(ptr: ctx->ref_defs, size: ctx->alloc_ref_defs * sizeof(MD_REF_DEF));
2229 if(new_defs == NULL) {
2230 MD_LOG("realloc() failed.");
2231 goto abort;
2232 }
2233
2234 ctx->ref_defs = new_defs;
2235 }
2236 def = &ctx->ref_defs[ctx->n_ref_defs];
2237 memset(s: def, c: 0, n: sizeof(MD_REF_DEF));
2238
2239 if(label_is_multiline) {
2240 MD_CHECK(md_merge_lines_alloc(ctx, label_contents_beg, label_contents_end,
2241 lines + label_contents_line_index, n_lines - label_contents_line_index,
2242 _T(' '), &def->label, &def->label_size));
2243 def->label_needs_free = TRUE;
2244 } else {
2245 def->label = (CHAR*) STR(label_contents_beg);
2246 def->label_size = label_contents_end - label_contents_beg;
2247 }
2248
2249 if(title_is_multiline) {
2250 MD_CHECK(md_merge_lines_alloc(ctx, title_contents_beg, title_contents_end,
2251 lines + title_contents_line_index, n_lines - title_contents_line_index,
2252 _T('\n'), &def->title, &def->title_size));
2253 def->title_needs_free = TRUE;
2254 } else {
2255 def->title = (CHAR*) STR(title_contents_beg);
2256 def->title_size = title_contents_end - title_contents_beg;
2257 }
2258
2259 def->dest_beg = dest_contents_beg;
2260 def->dest_end = dest_contents_end;
2261
2262 /* Success. */
2263 ctx->n_ref_defs++;
2264 return line_index + 1;
2265
2266abort:
2267 /* Failure. */
2268 if(def != NULL && def->label_needs_free)
2269 free(ptr: def->label);
2270 if(def != NULL && def->title_needs_free)
2271 free(ptr: def->title);
2272 return ret;
2273}
2274
2275static int
2276md_is_link_reference(MD_CTX* ctx, const MD_LINE* lines, MD_SIZE n_lines,
2277 OFF beg, OFF end, MD_LINK_ATTR* attr)
2278{
2279 const MD_REF_DEF* def;
2280 const MD_LINE* beg_line;
2281 int is_multiline;
2282 CHAR* label;
2283 SZ label_size;
2284 int ret;
2285
2286 MD_ASSERT(CH(beg) == _T('[') || CH(beg) == _T('!'));
2287 MD_ASSERT(CH(end-1) == _T(']'));
2288
2289 beg += (CH(beg) == _T('!') ? 2 : 1);
2290 end--;
2291
2292 /* Find lines corresponding to the beg and end positions. */
2293 beg_line = md_lookup_line(off: beg, lines, n_lines, NULL);
2294 is_multiline = (end > beg_line->end);
2295
2296 if(is_multiline) {
2297 MD_CHECK(md_merge_lines_alloc(ctx, beg, end, beg_line,
2298 (int)(n_lines - (beg_line - lines)), _T(' '), &label, &label_size));
2299 } else {
2300 label = (CHAR*) STR(beg);
2301 label_size = end - beg;
2302 }
2303
2304 def = md_lookup_ref_def(ctx, label, label_size);
2305 if(def != NULL) {
2306 attr->dest_beg = def->dest_beg;
2307 attr->dest_end = def->dest_end;
2308 attr->title = def->title;
2309 attr->title_size = def->title_size;
2310 attr->title_needs_free = FALSE;
2311 }
2312
2313 if(is_multiline)
2314 free(ptr: label);
2315
2316 ret = (def != NULL);
2317
2318abort:
2319 return ret;
2320}
2321
2322static int
2323md_is_inline_link_spec(MD_CTX* ctx, const MD_LINE* lines, MD_SIZE n_lines,
2324 OFF beg, OFF* p_end, MD_LINK_ATTR* attr)
2325{
2326 MD_SIZE line_index = 0;
2327 MD_SIZE tmp_line_index;
2328 OFF title_contents_beg;
2329 OFF title_contents_end;
2330 MD_SIZE title_contents_line_index;
2331 int title_is_multiline;
2332 OFF off = beg;
2333 int ret = FALSE;
2334
2335 while(off >= lines[line_index].end)
2336 line_index++;
2337
2338 MD_ASSERT(CH(off) == _T('('));
2339 off++;
2340
2341 /* Optional white space with up to one line break. */
2342 while(off < lines[line_index].end && ISWHITESPACE(off))
2343 off++;
2344 if(off >= lines[line_index].end && (off >= ctx->size || ISNEWLINE(off))) {
2345 line_index++;
2346 if(line_index >= n_lines)
2347 return FALSE;
2348 off = lines[line_index].beg;
2349 }
2350
2351 /* Link destination may be omitted, but only when not also having a title. */
2352 if(off < ctx->size && CH(off) == _T(')')) {
2353 attr->dest_beg = off;
2354 attr->dest_end = off;
2355 attr->title = NULL;
2356 attr->title_size = 0;
2357 attr->title_needs_free = FALSE;
2358 off++;
2359 *p_end = off;
2360 return TRUE;
2361 }
2362
2363 /* Link destination. */
2364 if(!md_is_link_destination(ctx, beg: off, max_end: lines[line_index].end,
2365 p_end: &off, p_contents_beg: &attr->dest_beg, p_contents_end: &attr->dest_end))
2366 return FALSE;
2367
2368 /* (Optional) title. */
2369 if(md_is_link_title(ctx, lines: lines + line_index, n_lines: n_lines - line_index, beg: off,
2370 p_end: &off, p_beg_line_index: &title_contents_line_index, p_end_line_index: &tmp_line_index,
2371 p_contents_beg: &title_contents_beg, p_contents_end: &title_contents_end))
2372 {
2373 title_is_multiline = (tmp_line_index != title_contents_line_index);
2374 title_contents_line_index += line_index;
2375 line_index += tmp_line_index;
2376 } else {
2377 /* Not a title. */
2378 title_is_multiline = FALSE;
2379 title_contents_beg = off;
2380 title_contents_end = off;
2381 title_contents_line_index = 0;
2382 }
2383
2384 /* Optional whitespace followed with final ')'. */
2385 while(off < lines[line_index].end && ISWHITESPACE(off))
2386 off++;
2387 if(off >= lines[line_index].end) {
2388 line_index++;
2389 if(line_index >= n_lines)
2390 return FALSE;
2391 off = lines[line_index].beg;
2392 }
2393 if(CH(off) != _T(')'))
2394 goto abort;
2395 off++;
2396
2397 if(title_contents_beg >= title_contents_end) {
2398 attr->title = NULL;
2399 attr->title_size = 0;
2400 attr->title_needs_free = FALSE;
2401 } else if(!title_is_multiline) {
2402 attr->title = (CHAR*) STR(title_contents_beg);
2403 attr->title_size = title_contents_end - title_contents_beg;
2404 attr->title_needs_free = FALSE;
2405 } else {
2406 MD_CHECK(md_merge_lines_alloc(ctx, title_contents_beg, title_contents_end,
2407 lines + title_contents_line_index, n_lines - title_contents_line_index,
2408 _T('\n'), &attr->title, &attr->title_size));
2409 attr->title_needs_free = TRUE;
2410 }
2411
2412 *p_end = off;
2413 ret = TRUE;
2414
2415abort:
2416 return ret;
2417}
2418
2419static void
2420md_free_ref_defs(MD_CTX* ctx)
2421{
2422 int i;
2423
2424 for(i = 0; i < ctx->n_ref_defs; i++) {
2425 MD_REF_DEF* def = &ctx->ref_defs[i];
2426
2427 if(def->label_needs_free)
2428 free(ptr: def->label);
2429 if(def->title_needs_free)
2430 free(ptr: def->title);
2431 }
2432
2433 free(ptr: ctx->ref_defs);
2434}
2435
2436
2437/******************************************
2438 *** Processing Inlines (a.k.a Spans) ***
2439 ******************************************/
2440
/* We process inlines in a few phases:
2442 *
2443 * (1) We go through the block text and collect all significant characters
2444 * which may start/end a span or some other significant position into
2445 * ctx->marks[]. Core of this is what md_collect_marks() does.
2446 *
2447 * We also do some very brief preliminary context-less analysis, whether
2448 * it might be opener or closer (e.g. of an emphasis span).
2449 *
2450 * This speeds the other steps as we do not need to re-iterate over all
2451 * characters anymore.
2452 *
 * (2) We analyze each potential mark type, in order of precedence.
2454 *
2455 * In each md_analyze_XXX() function, we re-iterate list of the marks,
2456 * skipping already resolved regions (in preceding precedences) and try to
2457 * resolve them.
2458 *
2459 * (2.1) For trivial marks, which are single (e.g. HTML entity), we just mark
2460 * them as resolved.
2461 *
2462 * (2.2) For range-type marks, we analyze whether the mark could be closer
2463 * and, if yes, whether there is some preceding opener it could satisfy.
2464 *
2465 * If not we check whether it could be really an opener and if yes, we
2466 * remember it so subsequent closers may resolve it.
2467 *
2468 * (3) Finally, when all marks were analyzed, we render the block contents
2469 * by calling MD_RENDERER::text() callback, interrupting by ::enter_span()
2470 * or ::close_span() whenever we reach a resolved mark.
2471 */
2472
2473
/* The mark structure.
 *
 * The member 'ch' tells what kind of mark it is:
 *
 * '\\': Maybe escape sequence.
 * '\0': NULL char.
 *  '*': Maybe (strong) emphasis start/end.
 *  '_': Maybe (strong) emphasis start/end.
 *  '~': Maybe strikethrough start/end (needs MD_FLAG_STRIKETHROUGH).
 *  '`': Maybe code span start/end.
 *  '&': Maybe start of entity.
 *  ';': Maybe end of entity.
 *  '<': Maybe start of raw HTML or autolink.
 *  '>': Maybe end of raw HTML or autolink.
 *  '[': Maybe start of link label or link text.
 *  '!': Equivalent of '[' for image.
 *  ']': Maybe end of link label or link text.
 *  '@': Maybe permissive e-mail auto-link (needs MD_FLAG_PERMISSIVEEMAILAUTOLINKS).
 *  ':': Maybe permissive URL auto-link (needs MD_FLAG_PERMISSIVEURLAUTOLINKS).
 *  '.': Maybe permissive WWW auto-link (needs MD_FLAG_PERMISSIVEWWWAUTOLINKS).
 *  'D': Dummy mark, it reserves a space for splitting a previous mark
 *       (e.g. emphasis) or to make more space for storing some special data
 *       related to the preceding mark (e.g. link).
 *
 * Note that not all instances of these chars in the text imply creation of the
 * structure. Only those which have (or may have, after we see more context)
 * the special meaning.
 *
 * (Keep this struct as small as possible to fit as many of them as possible
 * into a CPU cache line.)
 */
struct MD_MARK_tag {
    /* Offsets delimiting the mark in the text.
     *
     * Note md_mark_store_ptr() reuses the beginning of the structure of
     * a dummy mark ('D') as a storage for a pointer. */
    OFF beg;
    OFF end;

    /* For unresolved openers, 'next' may be used to form a stack of
     * unresolved open openers.
     *
     * When resolved with MD_MARK_OPENER/CLOSER flag, next/prev is index of the
     * respective closer/opener.
     */
    int prev;
    int next;
    /* The mark type, see the list above. */
    CHAR ch;
    /* Bit mask of the MD_MARK_xxxx flags. */
    unsigned char flags;
};
2518
/* Mark flags (these apply to ALL mark types). */
#define MD_MARK_POTENTIAL_OPENER            0x01  /* Maybe opener. */
#define MD_MARK_POTENTIAL_CLOSER            0x02  /* Maybe closer. */
#define MD_MARK_OPENER                      0x04  /* Definitely opener. */
#define MD_MARK_CLOSER                      0x08  /* Definitely closer. */
#define MD_MARK_RESOLVED                    0x10  /* Resolved in any definite way. */

/* Mark flags specific for various mark types (so they can share bits).
 *
 * Note the values below deliberately overlap each other: The meaning of
 * the bits 0x20, 0x40 and 0x80 depends on the type of the mark. */
#define MD_MARK_EMPH_OC                     0x20  /* Opener/closer mixed candidate. Helper for the "rule of 3". */
/* Exactly one of the three MOD3 values is expected to be set for an emphasis
 * mark; md_emph_stack() uses it to choose one of three opener stacks.
 * (Presumably it is the length of the delimiter run modulo 3 -- TODO confirm.) */
#define MD_MARK_EMPH_MOD3_0                 0x40
#define MD_MARK_EMPH_MOD3_1                 0x80
#define MD_MARK_EMPH_MOD3_2                 (0x40 | 0x80)
#define MD_MARK_EMPH_MOD3_MASK              (0x40 | 0x80)
#define MD_MARK_AUTOLINK                    0x20  /* Distinguisher for '<', '>'. */
#define MD_MARK_AUTOLINK_MISSING_MAILTO     0x40
#define MD_MARK_VALIDPERMISSIVEAUTOLINK     0x20  /* For permissive autolinks. */
#define MD_MARK_HASNESTEDBRACKETS           0x20  /* For '[' to rule out invalid link labels early */
2536
2537static MD_MARKSTACK*
2538md_emph_stack(MD_CTX* ctx, MD_CHAR ch, unsigned flags)
2539{
2540 MD_MARKSTACK* stack;
2541
2542 switch(ch) {
2543 case '*': stack = &ASTERISK_OPENERS_oo_mod3_0; break;
2544 case '_': stack = &UNDERSCORE_OPENERS_oo_mod3_0; break;
2545 default: MD_UNREACHABLE();
2546 }
2547
2548 if(flags & MD_MARK_EMPH_OC)
2549 stack += 3;
2550
2551 switch(flags & MD_MARK_EMPH_MOD3_MASK) {
2552 case MD_MARK_EMPH_MOD3_0: stack += 0; break;
2553 case MD_MARK_EMPH_MOD3_1: stack += 1; break;
2554 case MD_MARK_EMPH_MOD3_2: stack += 2; break;
2555 default: MD_UNREACHABLE();
2556 }
2557
2558 return stack;
2559}
2560
2561static MD_MARKSTACK*
2562md_opener_stack(MD_CTX* ctx, int mark_index)
2563{
2564 MD_MARK* mark = &ctx->marks[mark_index];
2565
2566 switch(mark->ch) {
2567 case _T('*'):
2568 case _T('_'): return md_emph_stack(ctx, ch: mark->ch, flags: mark->flags);
2569
2570 case _T('~'): return (mark->end - mark->beg == 1) ? &TILDE_OPENERS_1 : &TILDE_OPENERS_2;
2571
2572 case _T('!'):
2573 case _T('['): return &BRACKET_OPENERS;
2574
2575 default: MD_UNREACHABLE();
2576 }
2577}
2578
2579static MD_MARK*
2580md_add_mark(MD_CTX* ctx)
2581{
2582 if(ctx->n_marks >= ctx->alloc_marks) {
2583 MD_MARK* new_marks;
2584
2585 ctx->alloc_marks = (ctx->alloc_marks > 0
2586 ? ctx->alloc_marks + ctx->alloc_marks / 2
2587 : 64);
2588 new_marks = realloc(ptr: ctx->marks, size: ctx->alloc_marks * sizeof(MD_MARK));
2589 if(new_marks == NULL) {
2590 MD_LOG("realloc() failed.");
2591 return NULL;
2592 }
2593
2594 ctx->marks = new_marks;
2595 }
2596
2597 return &ctx->marks[ctx->n_marks++];
2598}
2599
/* Helper macros for adding a new mark.
 *
 * They may only be used in a function which has the variables 'ctx',
 * 'mark' and 'ret' and the label 'abort': If md_add_mark() fails, 'ret' is
 * set to -1 and the control jumps to the label. Otherwise 'mark' points to
 * the new mark.
 */

/* Add a new mark but leave all its members uninitialized. */
#define ADD_MARK_()                                                     \
        do {                                                            \
            mark = md_add_mark(ctx);                                    \
            if(mark == NULL) {                                          \
                ret = -1;                                               \
                goto abort;                                             \
            }                                                           \
        } while(0)

/* Add a new mark and initialize it as an unlinked one (prev and next are
 * set to -1) of the given char, offsets and flags. */
#define ADD_MARK(ch_, beg_, end_, flags_)                               \
        do {                                                            \
            ADD_MARK_();                                                \
            mark->beg = (beg_);                                         \
            mark->end = (end_);                                         \
            mark->prev = -1;                                            \
            mark->next = -1;                                            \
            mark->ch = (char)(ch_);                                     \
            mark->flags = (flags_);                                     \
        } while(0)
2619
2620
2621static inline void
2622md_mark_stack_push(MD_CTX* ctx, MD_MARKSTACK* stack, int mark_index)
2623{
2624 ctx->marks[mark_index].next = stack->top;
2625 stack->top = mark_index;
2626}
2627
2628static inline int
2629md_mark_stack_pop(MD_CTX* ctx, MD_MARKSTACK* stack)
2630{
2631 int top = stack->top;
2632 if(top >= 0)
2633 stack->top = ctx->marks[top].next;
2634 return top;
2635}
2636
2637/* Sometimes, we need to store a pointer into the mark. It is quite rare
2638 * so we do not bother to make MD_MARK use union, and it can only happen
2639 * for dummy marks. */
2640static inline void
2641md_mark_store_ptr(MD_CTX* ctx, int mark_index, void* ptr)
2642{
2643 MD_MARK* mark = &ctx->marks[mark_index];
2644 MD_ASSERT(mark->ch == 'D');
2645
2646 /* Check only members beg and end are misused for this. */
2647 MD_ASSERT(sizeof(void*) <= 2 * sizeof(OFF));
2648 memcpy(dest: mark, src: &ptr, n: sizeof(void*));
2649}
2650
2651static inline void*
2652md_mark_get_ptr(MD_CTX* ctx, int mark_index)
2653{
2654 void* ptr;
2655 MD_MARK* mark = &ctx->marks[mark_index];
2656 MD_ASSERT(mark->ch == 'D');
2657 memcpy(dest: &ptr, src: mark, n: sizeof(void*));
2658 return ptr;
2659}
2660
2661static inline void
2662md_resolve_range(MD_CTX* ctx, int opener_index, int closer_index)
2663{
2664 MD_MARK* opener = &ctx->marks[opener_index];
2665 MD_MARK* closer = &ctx->marks[closer_index];
2666
2667 /* Interconnect opener and closer and mark both as resolved. */
2668 opener->next = closer_index;
2669 closer->prev = opener_index;
2670
2671 opener->flags |= MD_MARK_OPENER | MD_MARK_RESOLVED;
2672 closer->flags |= MD_MARK_CLOSER | MD_MARK_RESOLVED;
2673}
2674
2675
2676#define MD_ROLLBACK_CROSSING 0
2677#define MD_ROLLBACK_ALL 1
2678
2679/* In the range ctx->marks[opener_index] ... [closer_index], undo some or all
2680 * resolvings accordingly to these rules:
2681 *
2682 * (1) All stacks of openers are cut so that any pending potential openers
2683 * are discarded from future consideration.
2684 *
2685 * (2) If 'how' is MD_ROLLBACK_ALL, then ALL resolved marks inside the range
2686 * are thrown away and turned into dummy marks ('D').
2687 *
2688 * WARNING: Do not call for arbitrary range of opener and closer.
2689 * This must form (potentially) valid range not crossing nesting boundaries
2690 * of already resolved ranges.
2691 */
2692static void
2693md_rollback(MD_CTX* ctx, int opener_index, int closer_index, int how)
2694{
2695 int i;
2696
2697 for(i = 0; i < (int) SIZEOF_ARRAY(ctx->opener_stacks); i++) {
2698 MD_MARKSTACK* stack = &ctx->opener_stacks[i];
2699 while(stack->top >= opener_index)
2700 md_mark_stack_pop(ctx, stack);
2701 }
2702
2703 if(how == MD_ROLLBACK_ALL) {
2704 for(i = opener_index + 1; i < closer_index; i++) {
2705 ctx->marks[i].ch = 'D';
2706 ctx->marks[i].flags = 0;
2707 }
2708 }
2709}
2710
2711static void
2712md_build_mark_char_map(MD_CTX* ctx)
2713{
2714 memset(s: ctx->mark_char_map, c: 0, n: sizeof(ctx->mark_char_map));
2715
2716 ctx->mark_char_map['\\'] = 1;
2717 ctx->mark_char_map['*'] = 1;
2718 ctx->mark_char_map['_'] = 1;
2719 ctx->mark_char_map['`'] = 1;
2720 ctx->mark_char_map['&'] = 1;
2721 ctx->mark_char_map[';'] = 1;
2722 ctx->mark_char_map['<'] = 1;
2723 ctx->mark_char_map['>'] = 1;
2724 ctx->mark_char_map['['] = 1;
2725 ctx->mark_char_map['!'] = 1;
2726 ctx->mark_char_map[']'] = 1;
2727 ctx->mark_char_map['\0'] = 1;
2728
2729 if(ctx->parser.flags & MD_FLAG_STRIKETHROUGH)
2730 ctx->mark_char_map['~'] = 1;
2731
2732 if(ctx->parser.flags & MD_FLAG_LATEXMATHSPANS)
2733 ctx->mark_char_map['$'] = 1;
2734
2735 if(ctx->parser.flags & MD_FLAG_PERMISSIVEEMAILAUTOLINKS)
2736 ctx->mark_char_map['@'] = 1;
2737
2738 if(ctx->parser.flags & MD_FLAG_PERMISSIVEURLAUTOLINKS)
2739 ctx->mark_char_map[':'] = 1;
2740
2741 if(ctx->parser.flags & MD_FLAG_PERMISSIVEWWWAUTOLINKS)
2742 ctx->mark_char_map['.'] = 1;
2743
2744 if((ctx->parser.flags & MD_FLAG_TABLES) || (ctx->parser.flags & MD_FLAG_WIKILINKS))
2745 ctx->mark_char_map['|'] = 1;
2746
2747 if(ctx->parser.flags & MD_FLAG_COLLAPSEWHITESPACE) {
2748 int i;
2749
2750 for(i = 0; i < (int) sizeof(ctx->mark_char_map); i++) {
2751 if(ISWHITESPACE_(i))
2752 ctx->mark_char_map[i] = 1;
2753 }
2754 }
2755}
2756
/* Recognize a code span whose opening run of back-ticks starts at the
 * offset beg (on the line lines[0]) and find the closing run of the very
 * same length, possibly on any later line.
 *
 * On success returns TRUE and fills *opener and *closer (their ch, beg, end
 * and flags). On failure returns FALSE; opener->end is set to the end of the
 * opening back-tick run even then.
 *
 * last_potential_closers[] is indexed by (length of a back-tick run - 1).
 * While scanning, offsets of the back-tick runs which do not match the
 * opener are recorded there so that future calls can tell cheaply there is
 * no closer of the given length.
 *
 * *p_reached_paragraph_end is set to TRUE when the scan gets to the end of
 * the last line without finding the closer.
 */
static int
md_is_code_span(MD_CTX* ctx, const MD_LINE* lines, MD_SIZE n_lines, OFF beg,
                MD_MARK* opener, MD_MARK* closer,
                OFF last_potential_closers[CODESPAN_MARK_MAXLEN],
                int* p_reached_paragraph_end)
{
    OFF opener_beg = beg;
    OFF opener_end;
    OFF closer_beg;
    OFF closer_end;
    SZ mark_len;
    OFF line_end;
    int has_space_after_opener = FALSE;
    int has_eol_after_opener = FALSE;
    int has_space_before_closer = FALSE;
    int has_eol_before_closer = FALSE;
    int has_only_space = TRUE;
    MD_SIZE line_index = 0;

    /* Measure the opening run of back-ticks. */
    line_end = lines[0].end;
    opener_end = opener_beg;
    while(opener_end < line_end  &&  CH(opener_end) == _T('`'))
        opener_end++;
    has_space_after_opener = (opener_end < line_end  &&  CH(opener_end) == _T(' '));
    has_eol_after_opener = (opener_end == line_end);

    /* The caller needs to know end of the opening mark even if we fail. */
    opener->end = opener_end;

    mark_len = opener_end - opener_beg;
    if(mark_len > CODESPAN_MARK_MAXLEN)
        return FALSE;

    /* Check whether we already know there is no closer of this length.
     * If so, re-scan makes no sense. This fixes issue #59. */
    if(last_potential_closers[mark_len-1] >= lines[n_lines-1].end  ||
       (*p_reached_paragraph_end  &&  last_potential_closers[mark_len-1] < opener_end))
        return FALSE;

    closer_beg = opener_end;
    closer_end = opener_end;

    /* Find closer mark. */
    while(TRUE) {
        /* Skip everything up to the next back-tick on the current line. */
        while(closer_beg < line_end  &&  CH(closer_beg) != _T('`')) {
            if(CH(closer_beg) != _T(' '))
                has_only_space = FALSE;
            closer_beg++;
        }
        /* Measure the run of back-ticks found (it may be empty). */
        closer_end = closer_beg;
        while(closer_end < line_end  &&  CH(closer_end) == _T('`'))
            closer_end++;

        if(closer_end - closer_beg == mark_len) {
            /* Success. */
            has_space_before_closer = (closer_beg > lines[line_index].beg && CH(closer_beg-1) == _T(' '));
            has_eol_before_closer = (closer_beg == lines[line_index].beg);
            break;
        }

        if(closer_end - closer_beg > 0) {
            /* We have found a back-tick which is not part of the closer. */
            has_only_space = FALSE;

            /* But if we eventually fail, remember it as a potential closer
             * of its own length for future attempts. This mitigates needs for
             * rescans. */
            if(closer_end - closer_beg < CODESPAN_MARK_MAXLEN) {
                if(closer_beg > last_potential_closers[closer_end - closer_beg - 1])
                    last_potential_closers[closer_end - closer_beg - 1] = closer_beg;
            }
        }

        if(closer_end >= line_end) {
            line_index++;
            if(line_index >= n_lines) {
                /* Reached end of the paragraph and still nothing. */
                *p_reached_paragraph_end = TRUE;
                return FALSE;
            }
            /* Try on the next line. */
            line_end = lines[line_index].end;
            closer_beg = lines[line_index].beg;
        } else {
            /* Continue the scan just after the mismatching back-tick run. */
            closer_beg = closer_end;
        }
    }

    /* If there is a space or a new line both after the opener and before
     * the closer (and if the code span is not made of spaces only), consume
     * one initial and one trailing space as part of the marks. */
    if(!has_only_space  &&
       (has_space_after_opener || has_eol_after_opener)  &&
       (has_space_before_closer || has_eol_before_closer))
    {
        if(has_space_after_opener)
            opener_end++;
        else
            /* The opener is followed by the end of line: Extend the mark
             * up to the beginning of the next line. */
            opener_end = lines[1].beg;

        if(has_space_before_closer)
            closer_beg--;
        else {
            /* Go back to the end of prev line */
            closer_beg = lines[line_index-1].end;
            /* But restore any trailing whitespace */
            while(closer_beg < ctx->size  &&  ISBLANK(closer_beg))
                closer_beg++;
        }
    }

    opener->ch = _T('`');
    opener->beg = opener_beg;
    opener->end = opener_end;
    opener->flags = MD_MARK_POTENTIAL_OPENER;
    closer->ch = _T('`');
    closer->beg = closer_beg;
    closer->end = closer_end;
    closer->flags = MD_MARK_POTENTIAL_CLOSER;
    return TRUE;
}
2878
2879static int
2880md_is_autolink_uri(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end)
2881{
2882 OFF off = beg+1;
2883
2884 MD_ASSERT(CH(beg) == _T('<'));
2885
2886 /* Check for scheme. */
2887 if(off >= max_end || !ISASCII(off))
2888 return FALSE;
2889 off++;
2890 while(1) {
2891 if(off >= max_end)
2892 return FALSE;
2893 if(off - beg > 32)
2894 return FALSE;
2895 if(CH(off) == _T(':') && off - beg >= 3)
2896 break;
2897 if(!ISALNUM(off) && CH(off) != _T('+') && CH(off) != _T('-') && CH(off) != _T('.'))
2898 return FALSE;
2899 off++;
2900 }
2901
2902 /* Check the path after the scheme. */
2903 while(off < max_end && CH(off) != _T('>')) {
2904 if(ISWHITESPACE(off) || ISCNTRL(off) || CH(off) == _T('<'))
2905 return FALSE;
2906 off++;
2907 }
2908
2909 if(off >= max_end)
2910 return FALSE;
2911
2912 MD_ASSERT(CH(off) == _T('>'));
2913 *p_end = off+1;
2914 return TRUE;
2915}
2916
2917static int
2918md_is_autolink_email(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end)
2919{
2920 OFF off = beg + 1;
2921 int label_len;
2922
2923 MD_ASSERT(CH(beg) == _T('<'));
2924
2925 /* The code should correspond to this regexp:
2926 /^[a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+
2927 @[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?
2928 (?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$/
2929 */
2930
2931 /* Username (before '@'). */
2932 while(off < max_end && (ISALNUM(off) || ISANYOF(off, _T(".!#$%&'*+/=?^_`{|}~-"))))
2933 off++;
2934 if(off <= beg+1)
2935 return FALSE;
2936
2937 /* '@' */
2938 if(off >= max_end || CH(off) != _T('@'))
2939 return FALSE;
2940 off++;
2941
2942 /* Labels delimited with '.'; each label is sequence of 1 - 63 alnum
2943 * characters or '-', but '-' is not allowed as first or last char. */
2944 label_len = 0;
2945 while(off < max_end) {
2946 if(ISALNUM(off))
2947 label_len++;
2948 else if(CH(off) == _T('-') && label_len > 0)
2949 label_len++;
2950 else if(CH(off) == _T('.') && label_len > 0 && CH(off-1) != _T('-'))
2951 label_len = 0;
2952 else
2953 break;
2954
2955 if(label_len > 63)
2956 return FALSE;
2957
2958 off++;
2959 }
2960
2961 if(label_len <= 0 || off >= max_end || CH(off) != _T('>') || CH(off-1) == _T('-'))
2962 return FALSE;
2963
2964 *p_end = off+1;
2965 return TRUE;
2966}
2967
2968static int
2969md_is_autolink(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end, int* p_missing_mailto)
2970{
2971 if(md_is_autolink_uri(ctx, beg, max_end, p_end)) {
2972 *p_missing_mailto = FALSE;
2973 return TRUE;
2974 }
2975
2976 if(md_is_autolink_email(ctx, beg, max_end, p_end)) {
2977 *p_missing_mailto = TRUE;
2978 return TRUE;
2979 }
2980
2981 return FALSE;
2982}
2983
2984static int
2985md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, MD_SIZE n_lines, int table_mode)
2986{
2987 MD_SIZE line_index;
2988 int ret = 0;
2989 MD_MARK* mark;
2990 OFF codespan_last_potential_closers[CODESPAN_MARK_MAXLEN] = { 0 };
2991 int codespan_scanned_till_paragraph_end = FALSE;
2992
2993 for(line_index = 0; line_index < n_lines; line_index++) {
2994 const MD_LINE* line = &lines[line_index];
2995 OFF off = line->beg;
2996
2997 while(TRUE) {
2998 CHAR ch;
2999
3000#ifdef MD4C_USE_UTF16
3001 /* For UTF-16, mark_char_map[] covers only ASCII. */
3002 #define IS_MARK_CHAR(off) ((CH(off) < SIZEOF_ARRAY(ctx->mark_char_map)) && \
3003 (ctx->mark_char_map[(unsigned char) CH(off)]))
3004#else
3005 /* For 8-bit encodings, mark_char_map[] covers all 256 elements. */
3006 #define IS_MARK_CHAR(off) (ctx->mark_char_map[(unsigned char) CH(off)])
3007#endif
3008
3009 /* Optimization: Use some loop unrolling. */
3010 while(off + 3 < line->end && !IS_MARK_CHAR(off+0) && !IS_MARK_CHAR(off+1)
3011 && !IS_MARK_CHAR(off+2) && !IS_MARK_CHAR(off+3))
3012 off += 4;
3013 while(off < line->end && !IS_MARK_CHAR(off+0))
3014 off++;
3015
3016 if(off >= line->end)
3017 break;
3018
3019 ch = CH(off);
3020
3021 /* A backslash escape.
3022 * It can go beyond line->end as it may involve escaped new
3023 * line to form a hard break. */
3024 if(ch == _T('\\') && off+1 < ctx->size && (ISPUNCT(off+1) || ISNEWLINE(off+1))) {
3025 /* Hard-break cannot be on the last line of the block. */
3026 if(!ISNEWLINE(off+1) || line_index+1 < n_lines)
3027 ADD_MARK(ch, off, off+2, MD_MARK_RESOLVED);
3028 off += 2;
3029 continue;
3030 }
3031
3032 /* A potential (string) emphasis start/end. */
3033 if(ch == _T('*') || ch == _T('_')) {
3034 OFF tmp = off+1;
3035 int left_level; /* What precedes: 0 = whitespace; 1 = punctuation; 2 = other char. */
3036 int right_level; /* What follows: 0 = whitespace; 1 = punctuation; 2 = other char. */
3037
3038 while(tmp < line->end && CH(tmp) == ch)
3039 tmp++;
3040
3041 if(off == line->beg || ISUNICODEWHITESPACEBEFORE(off))
3042 left_level = 0;
3043 else if(ISUNICODEPUNCTBEFORE(off))
3044 left_level = 1;
3045 else
3046 left_level = 2;
3047
3048 if(tmp == line->end || ISUNICODEWHITESPACE(tmp))
3049 right_level = 0;
3050 else if(ISUNICODEPUNCT(tmp))
3051 right_level = 1;
3052 else
3053 right_level = 2;
3054
3055 /* Intra-word underscore doesn't have special meaning. */
3056 if(ch == _T('_') && left_level == 2 && right_level == 2) {
3057 left_level = 0;
3058 right_level = 0;
3059 }
3060
3061 if(left_level != 0 || right_level != 0) {
3062 unsigned flags = 0;
3063
3064 if(left_level > 0 && left_level >= right_level)
3065 flags |= MD_MARK_POTENTIAL_CLOSER;
3066 if(right_level > 0 && right_level >= left_level)
3067 flags |= MD_MARK_POTENTIAL_OPENER;
3068 if(flags == (MD_MARK_POTENTIAL_OPENER | MD_MARK_POTENTIAL_CLOSER))
3069 flags |= MD_MARK_EMPH_OC;
3070
3071 /* For "the rule of three" we need to remember the original
3072 * size of the mark (modulo three), before we potentially
3073 * split the mark when being later resolved partially by some
3074 * shorter closer. */
3075 switch((tmp - off) % 3) {
3076 case 0: flags |= MD_MARK_EMPH_MOD3_0; break;
3077 case 1: flags |= MD_MARK_EMPH_MOD3_1; break;
3078 case 2: flags |= MD_MARK_EMPH_MOD3_2; break;
3079 }
3080
3081 ADD_MARK(ch, off, tmp, flags);
3082
3083 /* During resolving, multiple asterisks may have to be
3084 * split into independent span start/ends. Consider e.g.
3085 * "**foo* bar*". Therefore we push also some empty dummy
3086 * marks to have enough space for that. */
3087 off++;
3088 while(off < tmp) {
3089 ADD_MARK('D', off, off, 0);
3090 off++;
3091 }
3092 continue;
3093 }
3094
3095 off = tmp;
3096 continue;
3097 }
3098
3099 /* A potential code span start/end. */
3100 if(ch == _T('`')) {
3101 MD_MARK opener;
3102 MD_MARK closer;
3103 int is_code_span;
3104
3105 is_code_span = md_is_code_span(ctx, lines: line, n_lines: n_lines - line_index, beg: off,
3106 opener: &opener, closer: &closer, last_potential_closers: codespan_last_potential_closers,
3107 p_reached_paragraph_end: &codespan_scanned_till_paragraph_end);
3108 if(is_code_span) {
3109 ADD_MARK(opener.ch, opener.beg, opener.end, opener.flags);
3110 ADD_MARK(closer.ch, closer.beg, closer.end, closer.flags);
3111 md_resolve_range(ctx, opener_index: ctx->n_marks-2, closer_index: ctx->n_marks-1);
3112 off = closer.end;
3113
3114 /* Advance the current line accordingly. */
3115 if(off > line->end)
3116 line = md_lookup_line(off, lines, n_lines, p_line_index: &line_index);
3117 continue;
3118 }
3119
3120 off = opener.end;
3121 continue;
3122 }
3123
3124 /* A potential entity start. */
3125 if(ch == _T('&')) {
3126 ADD_MARK(ch, off, off+1, MD_MARK_POTENTIAL_OPENER);
3127 off++;
3128 continue;
3129 }
3130
3131 /* A potential entity end. */
3132 if(ch == _T(';')) {
3133 /* We surely cannot be entity unless the previous mark is '&'. */
3134 if(ctx->n_marks > 0 && ctx->marks[ctx->n_marks-1].ch == _T('&'))
3135 ADD_MARK(ch, off, off+1, MD_MARK_POTENTIAL_CLOSER);
3136
3137 off++;
3138 continue;
3139 }
3140
3141 /* A potential autolink or raw HTML start/end. */
3142 if(ch == _T('<')) {
3143 int is_autolink;
3144 OFF autolink_end;
3145 int missing_mailto;
3146
3147 if(!(ctx->parser.flags & MD_FLAG_NOHTMLSPANS)) {
3148 int is_html;
3149 OFF html_end;
3150
3151 /* Given the nature of the raw HTML, we have to recognize
3152 * it here. Doing so later in md_analyze_lt_gt() could
3153 * open can of worms of quadratic complexity. */
3154 is_html = md_is_html_any(ctx, lines: line, n_lines: n_lines - line_index, beg: off,
3155 max_end: lines[n_lines-1].end, p_end: &html_end);
3156 if(is_html) {
3157 ADD_MARK(_T('<'), off, off, MD_MARK_OPENER | MD_MARK_RESOLVED);
3158 ADD_MARK(_T('>'), html_end, html_end, MD_MARK_CLOSER | MD_MARK_RESOLVED);
3159 ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1;
3160 ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2;
3161 off = html_end;
3162
3163 /* Advance the current line accordingly. */
3164 if(off > line->end)
3165 line = md_lookup_line(off, lines, n_lines, p_line_index: &line_index);
3166 continue;
3167 }
3168 }
3169
3170 is_autolink = md_is_autolink(ctx, beg: off, max_end: lines[n_lines-1].end,
3171 p_end: &autolink_end, p_missing_mailto: &missing_mailto);
3172 if(is_autolink) {
3173 unsigned flags = MD_MARK_RESOLVED | MD_MARK_AUTOLINK;
3174 if(missing_mailto)
3175 flags |= MD_MARK_AUTOLINK_MISSING_MAILTO;
3176
3177 ADD_MARK(_T('<'), off, off+1, MD_MARK_OPENER | flags);
3178 ADD_MARK(_T('>'), autolink_end-1, autolink_end, MD_MARK_CLOSER | flags);
3179 ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1;
3180 ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2;
3181 off = autolink_end;
3182 continue;
3183 }
3184
3185 off++;
3186 continue;
3187 }
3188
3189 /* A potential link or its part. */
3190 if(ch == _T('[') || (ch == _T('!') && off+1 < line->end && CH(off+1) == _T('['))) {
3191 OFF tmp = (ch == _T('[') ? off+1 : off+2);
3192 ADD_MARK(ch, off, tmp, MD_MARK_POTENTIAL_OPENER);
3193 off = tmp;
3194 /* Two dummies to make enough place for data we need if it is
3195 * a link. */
3196 ADD_MARK('D', off, off, 0);
3197 ADD_MARK('D', off, off, 0);
3198 continue;
3199 }
3200 if(ch == _T(']')) {
3201 ADD_MARK(ch, off, off+1, MD_MARK_POTENTIAL_CLOSER);
3202 off++;
3203 continue;
3204 }
3205
3206 /* A potential permissive e-mail autolink. */
3207 if(ch == _T('@')) {
3208 if(line->beg + 1 <= off && ISALNUM(off-1) &&
3209 off + 3 < line->end && ISALNUM(off+1))
3210 {
3211 ADD_MARK(ch, off, off+1, MD_MARK_POTENTIAL_OPENER);
3212 /* Push a dummy as a reserve for a closer. */
3213 ADD_MARK('D', line->beg, line->end, 0);
3214 }
3215
3216 off++;
3217 continue;
3218 }
3219
3220 /* A potential permissive URL autolink. */
3221 if(ch == _T(':')) {
3222 static struct {
3223 const CHAR* scheme;
3224 SZ scheme_size;
3225 const CHAR* suffix;
3226 SZ suffix_size;
3227 } scheme_map[] = {
3228 /* In the order from the most frequently used, arguably. */
3229 { _T("http"), 4, _T("//"), 2 },
3230 { _T("https"), 5, _T("//"), 2 },
3231 { _T("ftp"), 3, _T("//"), 2 }
3232 };
3233 int scheme_index;
3234
3235 for(scheme_index = 0; scheme_index < (int) SIZEOF_ARRAY(scheme_map); scheme_index++) {
3236 const CHAR* scheme = scheme_map[scheme_index].scheme;
3237 const SZ scheme_size = scheme_map[scheme_index].scheme_size;
3238 const CHAR* suffix = scheme_map[scheme_index].suffix;
3239 const SZ suffix_size = scheme_map[scheme_index].suffix_size;
3240
3241 if(line->beg + scheme_size <= off && md_ascii_eq(STR(off-scheme_size), s2: scheme, n: scheme_size) &&
3242 off + 1 + suffix_size < line->end && md_ascii_eq(STR(off+1), s2: suffix, n: suffix_size))
3243 {
3244 ADD_MARK(ch, off-scheme_size, off+1+suffix_size, MD_MARK_POTENTIAL_OPENER);
3245 /* Push a dummy as a reserve for a closer. */
3246 ADD_MARK('D', line->beg, line->end, 0);
3247 off += 1 + suffix_size;
3248 break;
3249 }
3250 }
3251
3252 off++;
3253 continue;
3254 }
3255
3256 /* A potential permissive WWW autolink. */
3257 if(ch == _T('.')) {
3258 if(line->beg + 3 <= off && md_ascii_eq(STR(off-3), _T("www"), n: 3) &&
3259 (off-3 == line->beg || ISUNICODEWHITESPACEBEFORE(off-3) || ISUNICODEPUNCTBEFORE(off-3)))
3260 {
3261 ADD_MARK(ch, off-3, off+1, MD_MARK_POTENTIAL_OPENER);
3262 /* Push a dummy as a reserve for a closer. */
3263 ADD_MARK('D', line->beg, line->end, 0);
3264 off++;
3265 continue;
3266 }
3267
3268 off++;
3269 continue;
3270 }
3271
3272 /* A potential table cell boundary or wiki link label delimiter. */
3273 if((table_mode || ctx->parser.flags & MD_FLAG_WIKILINKS) && ch == _T('|')) {
3274 ADD_MARK(ch, off, off+1, 0);
3275 off++;
3276 continue;
3277 }
3278
3279 /* A potential strikethrough start/end. */
3280 if(ch == _T('~')) {
3281 OFF tmp = off+1;
3282
3283 while(tmp < line->end && CH(tmp) == _T('~'))
3284 tmp++;
3285
3286 if(tmp - off < 3) {
3287 unsigned flags = 0;
3288
3289 if(tmp < line->end && !ISUNICODEWHITESPACE(tmp))
3290 flags |= MD_MARK_POTENTIAL_OPENER;
3291 if(off > line->beg && !ISUNICODEWHITESPACEBEFORE(off))
3292 flags |= MD_MARK_POTENTIAL_CLOSER;
3293 if(flags != 0)
3294 ADD_MARK(ch, off, tmp, flags);
3295 }
3296
3297 off = tmp;
3298 continue;
3299 }
3300
3301 /* A potential equation start/end */
3302 if(ch == _T('$')) {
3303 /* We can have at most two consecutive $ signs,
3304 * where two dollar signs signify a display equation. */
3305 OFF tmp = off+1;
3306
3307 while(tmp < line->end && CH(tmp) == _T('$'))
3308 tmp++;
3309
3310 if(tmp - off <= 2) {
3311 unsigned flags = MD_MARK_POTENTIAL_OPENER | MD_MARK_POTENTIAL_CLOSER;
3312
3313 if(off > line->beg && !ISUNICODEWHITESPACEBEFORE(off) && !ISUNICODEPUNCTBEFORE(off))
3314 flags &= ~MD_MARK_POTENTIAL_OPENER;
3315 if(tmp < line->end && !ISUNICODEWHITESPACE(tmp) && !ISUNICODEPUNCT(tmp))
3316 flags &= ~MD_MARK_POTENTIAL_CLOSER;
3317 if(flags != 0)
3318 ADD_MARK(ch, off, tmp, flags);
3319 }
3320
3321 off = tmp;
3322 continue;
3323 }
3324
3325 /* Turn non-trivial whitespace into single space. */
3326 if(ISWHITESPACE_(ch)) {
3327 OFF tmp = off+1;
3328
3329 while(tmp < line->end && ISWHITESPACE(tmp))
3330 tmp++;
3331
3332 if(tmp - off > 1 || ch != _T(' '))
3333 ADD_MARK(ch, off, tmp, MD_MARK_RESOLVED);
3334
3335 off = tmp;
3336 continue;
3337 }
3338
3339 /* NULL character. */
3340 if(ch == _T('\0')) {
3341 ADD_MARK(ch, off, off+1, MD_MARK_RESOLVED);
3342 off++;
3343 continue;
3344 }
3345
3346 off++;
3347 }
3348 }
3349
3350 /* Add a dummy mark at the end of the mark vector to simplify
3351 * process_inlines(). */
3352 ADD_MARK(127, ctx->size, ctx->size, MD_MARK_RESOLVED);
3353
3354abort:
3355 return ret;
3356}
3357
3358static void
3359md_analyze_bracket(MD_CTX* ctx, int mark_index)
3360{
3361 /* We cannot really resolve links here as for that we would need
3362 * more context. E.g. a following pair of brackets (reference link),
3363 * or enclosing pair of brackets (if the inner is the link, the outer
3364 * one cannot be.)
3365 *
3366 * Therefore we here only construct a list of '[' ']' pairs ordered by
3367 * position of the closer. This allows us to analyze what is or is not
3368 * link in the right order, from inside to outside in case of nested
3369 * brackets.
3370 *
3371 * The resolving itself is deferred to md_resolve_links().
3372 */
3373
3374 MD_MARK* mark = &ctx->marks[mark_index];
3375
3376 if(mark->flags & MD_MARK_POTENTIAL_OPENER) {
3377 if(BRACKET_OPENERS.top >= 0)
3378 ctx->marks[BRACKET_OPENERS.top].flags |= MD_MARK_HASNESTEDBRACKETS;
3379
3380 md_mark_stack_push(ctx, stack: &BRACKET_OPENERS, mark_index);
3381 return;
3382 }
3383
3384 if(BRACKET_OPENERS.top >= 0) {
3385 int opener_index = md_mark_stack_pop(ctx, stack: &BRACKET_OPENERS);
3386 MD_MARK* opener = &ctx->marks[opener_index];
3387
3388 /* Interconnect the opener and closer. */
3389 opener->next = mark_index;
3390 mark->prev = opener_index;
3391
3392 /* Add the pair into a list of potential links for md_resolve_links().
3393 * Note we misuse opener->prev for this as opener->next points to its
3394 * closer. */
3395 if(ctx->unresolved_link_tail >= 0)
3396 ctx->marks[ctx->unresolved_link_tail].prev = opener_index;
3397 else
3398 ctx->unresolved_link_head = opener_index;
3399 ctx->unresolved_link_tail = opener_index;
3400 opener->prev = -1;
3401 }
3402}
3403
/* Forward declaration. The function is defined elsewhere in this file, but
 * md_resolve_links() below already needs to call it. */
static void md_analyze_link_contents(MD_CTX* ctx, const MD_LINE* lines, MD_SIZE n_lines,
                                     int mark_beg, int mark_end);
3407
3408static int
3409md_resolve_links(MD_CTX* ctx, const MD_LINE* lines, MD_SIZE n_lines)
3410{
3411 int opener_index = ctx->unresolved_link_head;
3412 OFF last_link_beg = 0;
3413 OFF last_link_end = 0;
3414 OFF last_img_beg = 0;
3415 OFF last_img_end = 0;
3416
3417 while(opener_index >= 0) {
3418 MD_MARK* opener = &ctx->marks[opener_index];
3419 int closer_index = opener->next;
3420 MD_MARK* closer = &ctx->marks[closer_index];
3421 int next_index = opener->prev;
3422 MD_MARK* next_opener;
3423 MD_MARK* next_closer;
3424 MD_LINK_ATTR attr;
3425 int is_link = FALSE;
3426
3427 if(next_index >= 0) {
3428 next_opener = &ctx->marks[next_index];
3429 next_closer = &ctx->marks[next_opener->next];
3430 } else {
3431 next_opener = NULL;
3432 next_closer = NULL;
3433 }
3434
3435 /* If nested ("[ [ ] ]"), we need to make sure that:
3436 * - The outer does not end inside of (...) belonging to the inner.
3437 * - The outer cannot be link if the inner is link (i.e. not image).
3438 *
3439 * (Note we here analyze from inner to outer as the marks are ordered
3440 * by closer->beg.)
3441 */
3442 if((opener->beg < last_link_beg && closer->end < last_link_end) ||
3443 (opener->beg < last_img_beg && closer->end < last_img_end) ||
3444 (opener->beg < last_link_end && opener->ch == '['))
3445 {
3446 opener_index = next_index;
3447 continue;
3448 }
3449
3450 /* Recognize and resolve wiki links.
3451 * Wiki-links maybe '[[destination]]' or '[[destination|label]]'.
3452 */
3453 if ((ctx->parser.flags & MD_FLAG_WIKILINKS) &&
3454 (opener->end - opener->beg == 1) && /* not image */
3455 next_opener != NULL && /* double '[' opener */
3456 next_opener->ch == '[' &&
3457 (next_opener->beg == opener->beg - 1) &&
3458 (next_opener->end - next_opener->beg == 1) &&
3459 next_closer != NULL && /* double ']' closer */
3460 next_closer->ch == ']' &&
3461 (next_closer->beg == closer->beg + 1) &&
3462 (next_closer->end - next_closer->beg == 1))
3463 {
3464 MD_MARK* delim = NULL;
3465 int delim_index;
3466 OFF dest_beg, dest_end;
3467
3468 is_link = TRUE;
3469
3470 /* We don't allow destination to be longer than 100 characters.
3471 * Lets scan to see whether there is '|'. (If not then the whole
3472 * wiki-link has to be below the 100 characters.) */
3473 delim_index = opener_index + 1;
3474 while(delim_index < closer_index) {
3475 MD_MARK* m = &ctx->marks[delim_index];
3476 if(m->ch == '|') {
3477 delim = m;
3478 break;
3479 }
3480 if(m->ch != 'D') {
3481 if(m->beg - opener->end > 100)
3482 break;
3483 if(m->ch != 'D' && (m->flags & MD_MARK_OPENER))
3484 delim_index = m->next;
3485 }
3486 delim_index++;
3487 }
3488
3489 dest_beg = opener->end;
3490 dest_end = (delim != NULL) ? delim->beg : closer->beg;
3491 if(dest_end - dest_beg == 0 || dest_end - dest_beg > 100)
3492 is_link = FALSE;
3493
3494 /* There may not be any new line in the destination. */
3495 if(is_link) {
3496 OFF off;
3497 for(off = dest_beg; off < dest_end; off++) {
3498 if(ISNEWLINE(off)) {
3499 is_link = FALSE;
3500 break;
3501 }
3502 }
3503 }
3504
3505 if(is_link) {
3506 if(delim != NULL) {
3507 if(delim->end < closer->beg) {
3508 md_rollback(ctx, opener_index, closer_index: delim_index, MD_ROLLBACK_ALL);
3509 md_rollback(ctx, opener_index: delim_index, closer_index, MD_ROLLBACK_CROSSING);
3510 delim->flags |= MD_MARK_RESOLVED;
3511 opener->end = delim->beg;
3512 } else {
3513 /* The pipe is just before the closer: [[foo|]] */
3514 md_rollback(ctx, opener_index, closer_index, MD_ROLLBACK_ALL);
3515 closer->beg = delim->beg;
3516 delim = NULL;
3517 }
3518 }
3519
3520 opener->beg = next_opener->beg;
3521 opener->next = closer_index;
3522 opener->flags |= MD_MARK_OPENER | MD_MARK_RESOLVED;
3523
3524 closer->end = next_closer->end;
3525 closer->prev = opener_index;
3526 closer->flags |= MD_MARK_CLOSER | MD_MARK_RESOLVED;
3527
3528 last_link_beg = opener->beg;
3529 last_link_end = closer->end;
3530
3531 if(delim != NULL)
3532 md_analyze_link_contents(ctx, lines, n_lines, mark_beg: delim_index+1, mark_end: closer_index);
3533
3534 opener_index = next_opener->prev;
3535 continue;
3536 }
3537 }
3538
3539 if(next_opener != NULL && next_opener->beg == closer->end) {
3540 if(next_closer->beg > closer->end + 1) {
3541 /* Might be full reference link. */
3542 if(!(next_opener->flags & MD_MARK_HASNESTEDBRACKETS))
3543 is_link = md_is_link_reference(ctx, lines, n_lines, beg: next_opener->beg, end: next_closer->end, attr: &attr);
3544 } else {
3545 /* Might be shortcut reference link. */
3546 if(!(opener->flags & MD_MARK_HASNESTEDBRACKETS))
3547 is_link = md_is_link_reference(ctx, lines, n_lines, beg: opener->beg, end: closer->end, attr: &attr);
3548 }
3549
3550 if(is_link < 0)
3551 return -1;
3552
3553 if(is_link) {
3554 /* Eat the 2nd "[...]". */
3555 closer->end = next_closer->end;
3556
3557 /* Do not analyze the label as a standalone link in the next
3558 * iteration. */
3559 next_index = ctx->marks[next_index].prev;
3560 }
3561 } else {
3562 if(closer->end < ctx->size && CH(closer->end) == _T('(')) {
3563 /* Might be inline link. */
3564 OFF inline_link_end = UINT_MAX;
3565
3566 is_link = md_is_inline_link_spec(ctx, lines, n_lines, beg: closer->end, p_end: &inline_link_end, attr: &attr);
3567 if(is_link < 0)
3568 return -1;
3569
3570 /* Check the closing ')' is not inside an already resolved range
3571 * (i.e. a range with a higher priority), e.g. a code span. */
3572 if(is_link) {
3573 int i = closer_index + 1;
3574
3575 while(i < ctx->n_marks) {
3576 MD_MARK* mark = &ctx->marks[i];
3577
3578 if(mark->beg >= inline_link_end)
3579 break;
3580 if((mark->flags & (MD_MARK_OPENER | MD_MARK_RESOLVED)) == (MD_MARK_OPENER | MD_MARK_RESOLVED)) {
3581 if(ctx->marks[mark->next].beg >= inline_link_end) {
3582 /* Cancel the link status. */
3583 if(attr.title_needs_free)
3584 free(ptr: attr.title);
3585 is_link = FALSE;
3586 break;
3587 }
3588
3589 i = mark->next + 1;
3590 } else {
3591 i++;
3592 }
3593 }
3594 }
3595
3596 if(is_link) {
3597 /* Eat the "(...)" */
3598 closer->end = inline_link_end;
3599 }
3600 }
3601
3602 if(!is_link) {
3603 /* Might be collapsed reference link. */
3604 if(!(opener->flags & MD_MARK_HASNESTEDBRACKETS))
3605 is_link = md_is_link_reference(ctx, lines, n_lines, beg: opener->beg, end: closer->end, attr: &attr);
3606 if(is_link < 0)
3607 return -1;
3608 }
3609 }
3610
3611 if(is_link) {
3612 /* Resolve the brackets as a link. */
3613 opener->flags |= MD_MARK_OPENER | MD_MARK_RESOLVED;
3614 closer->flags |= MD_MARK_CLOSER | MD_MARK_RESOLVED;
3615
3616 /* If it is a link, we store the destination and title in the two
3617 * dummy marks after the opener. */
3618 MD_ASSERT(ctx->marks[opener_index+1].ch == 'D');
3619 ctx->marks[opener_index+1].beg = attr.dest_beg;
3620 ctx->marks[opener_index+1].end = attr.dest_end;
3621
3622 MD_ASSERT(ctx->marks[opener_index+2].ch == 'D');
3623 md_mark_store_ptr(ctx, mark_index: opener_index+2, ptr: attr.title);
3624 /* The title might or might not have been allocated for us. */
3625 if(attr.title_needs_free)
3626 md_mark_stack_push(ctx, stack: &ctx->ptr_stack, mark_index: opener_index+2);
3627 ctx->marks[opener_index+2].prev = attr.title_size;
3628
3629 if(opener->ch == '[') {
3630 last_link_beg = opener->beg;
3631 last_link_end = closer->end;
3632 } else {
3633 last_img_beg = opener->beg;
3634 last_img_end = closer->end;
3635 }
3636
3637 md_analyze_link_contents(ctx, lines, n_lines, mark_beg: opener_index+1, mark_end: closer_index);
3638
3639 /* If the link text is formed by nothing but permissive autolink,
3640 * suppress the autolink.
3641 * See https://github.com/mity/md4c/issues/152 for more info. */
3642 if(ctx->parser.flags & MD_FLAG_PERMISSIVEAUTOLINKS) {
3643 MD_MARK* first_nested;
3644 MD_MARK* last_nested;
3645
3646 first_nested = opener + 1;
3647 while(first_nested->ch == _T('D') && first_nested < closer)
3648 first_nested++;
3649
3650 last_nested = closer - 1;
3651 while(first_nested->ch == _T('D') && last_nested > opener)
3652 last_nested--;
3653
3654 if((first_nested->flags & MD_MARK_RESOLVED) &&
3655 first_nested->beg == opener->end &&
3656 ISANYOF_(first_nested->ch, _T("@:.")) &&
3657 first_nested->next == (last_nested - ctx->marks) &&
3658 last_nested->end == closer->beg)
3659 {
3660 first_nested->ch = _T('D');
3661 first_nested->flags &= ~MD_MARK_RESOLVED;
3662 last_nested->ch = _T('D');
3663 last_nested->flags &= ~MD_MARK_RESOLVED;
3664 }
3665 }
3666 }
3667
3668 opener_index = next_index;
3669 }
3670
3671 return 0;
3672}
3673
3674/* Analyze whether the mark '&' starts a HTML entity.
3675 * If so, update its flags as well as flags of corresponding closer ';'. */
3676static void
3677md_analyze_entity(MD_CTX* ctx, int mark_index)
3678{
3679 MD_MARK* opener = &ctx->marks[mark_index];
3680 MD_MARK* closer;
3681 OFF off;
3682
3683 /* Cannot be entity if there is no closer as the next mark.
3684 * (Any other mark between would mean strange character which cannot be
3685 * part of the entity.
3686 *
3687 * So we can do all the work on '&' and do not call this later for the
3688 * closing mark ';'.
3689 */
3690 if(mark_index + 1 >= ctx->n_marks)
3691 return;
3692 closer = &ctx->marks[mark_index+1];
3693 if(closer->ch != ';')
3694 return;
3695
3696 if(md_is_entity(ctx, beg: opener->beg, max_end: closer->end, p_end: &off)) {
3697 MD_ASSERT(off == closer->end);
3698
3699 md_resolve_range(ctx, opener_index: mark_index, closer_index: mark_index+1);
3700 opener->end = closer->end;
3701 }
3702}
3703
3704static void
3705md_analyze_table_cell_boundary(MD_CTX* ctx, int mark_index)
3706{
3707 MD_MARK* mark = &ctx->marks[mark_index];
3708 mark->flags |= MD_MARK_RESOLVED;
3709 mark->next = -1;
3710
3711 if(ctx->table_cell_boundaries_head < 0)
3712 ctx->table_cell_boundaries_head = mark_index;
3713 else
3714 ctx->marks[ctx->table_cell_boundaries_tail].next = mark_index;
3715 ctx->table_cell_boundaries_tail = mark_index;
3716 ctx->n_table_cell_boundaries++;
3717}
3718
3719/* Split a longer mark into two. The new mark takes the given count of
3720 * characters. May only be called if an adequate number of dummy 'D' marks
3721 * follows.
3722 */
3723static int
3724md_split_emph_mark(MD_CTX* ctx, int mark_index, SZ n)
3725{
3726 MD_MARK* mark = &ctx->marks[mark_index];
3727 int new_mark_index = mark_index + (mark->end - mark->beg - n);
3728 MD_MARK* dummy = &ctx->marks[new_mark_index];
3729
3730 MD_ASSERT(mark->end - mark->beg > n);
3731 MD_ASSERT(dummy->ch == 'D');
3732
3733 memcpy(dest: dummy, src: mark, n: sizeof(MD_MARK));
3734 mark->end -= n;
3735 dummy->beg = mark->end;
3736
3737 return new_mark_index;
3738}
3739
3740static void
3741md_analyze_emph(MD_CTX* ctx, int mark_index)
3742{
3743 MD_MARK* mark = &ctx->marks[mark_index];
3744
3745 /* If we can be a closer, try to resolve with the preceding opener. */
3746 if(mark->flags & MD_MARK_POTENTIAL_CLOSER) {
3747 MD_MARK* opener = NULL;
3748 int opener_index = 0;
3749 MD_MARKSTACK* opener_stacks[6];
3750 int i, n_opener_stacks;
3751 unsigned flags = mark->flags;
3752
3753 n_opener_stacks = 0;
3754
3755 /* Apply the rule of 3 */
3756 opener_stacks[n_opener_stacks++] = md_emph_stack(ctx, ch: mark->ch, MD_MARK_EMPH_MOD3_0 | MD_MARK_EMPH_OC);
3757 if((flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_2)
3758 opener_stacks[n_opener_stacks++] = md_emph_stack(ctx, ch: mark->ch, MD_MARK_EMPH_MOD3_1 | MD_MARK_EMPH_OC);
3759 if((flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_1)
3760 opener_stacks[n_opener_stacks++] = md_emph_stack(ctx, ch: mark->ch, MD_MARK_EMPH_MOD3_2 | MD_MARK_EMPH_OC);
3761 opener_stacks[n_opener_stacks++] = md_emph_stack(ctx, ch: mark->ch, MD_MARK_EMPH_MOD3_0);
3762 if(!(flags & MD_MARK_EMPH_OC) || (flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_2)
3763 opener_stacks[n_opener_stacks++] = md_emph_stack(ctx, ch: mark->ch, MD_MARK_EMPH_MOD3_1);
3764 if(!(flags & MD_MARK_EMPH_OC) || (flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_1)
3765 opener_stacks[n_opener_stacks++] = md_emph_stack(ctx, ch: mark->ch, MD_MARK_EMPH_MOD3_2);
3766
3767 /* Opener is the most recent mark from the allowed stacks. */
3768 for(i = 0; i < n_opener_stacks; i++) {
3769 if(opener_stacks[i]->top >= 0) {
3770 int m_index = opener_stacks[i]->top;
3771 MD_MARK* m = &ctx->marks[m_index];
3772
3773 if(opener == NULL || m->end > opener->end) {
3774 opener_index = m_index;
3775 opener = m;
3776 }
3777 }
3778 }
3779
3780 /* Resolve, if we have found matching opener. */
3781 if(opener != NULL) {
3782 SZ opener_size = opener->end - opener->beg;
3783 SZ closer_size = mark->end - mark->beg;
3784 MD_MARKSTACK* stack = md_opener_stack(ctx, mark_index: opener_index);
3785
3786 if(opener_size > closer_size) {
3787 opener_index = md_split_emph_mark(ctx, mark_index: opener_index, n: closer_size);
3788 md_mark_stack_push(ctx, stack, mark_index: opener_index);
3789 } else if(opener_size < closer_size) {
3790 md_split_emph_mark(ctx, mark_index, n: closer_size - opener_size);
3791 }
3792
3793 /* Above we were only peeking. */
3794 md_mark_stack_pop(ctx, stack);
3795
3796 md_rollback(ctx, opener_index, closer_index: mark_index, MD_ROLLBACK_CROSSING);
3797 md_resolve_range(ctx, opener_index, closer_index: mark_index);
3798 return;
3799 }
3800 }
3801
3802 /* If we could not resolve as closer, we may be yet be an opener. */
3803 if(mark->flags & MD_MARK_POTENTIAL_OPENER)
3804 md_mark_stack_push(ctx, stack: md_emph_stack(ctx, ch: mark->ch, flags: mark->flags), mark_index);
3805}
3806
3807static void
3808md_analyze_tilde(MD_CTX* ctx, int mark_index)
3809{
3810 MD_MARK* mark = &ctx->marks[mark_index];
3811 MD_MARKSTACK* stack = md_opener_stack(ctx, mark_index);
3812
3813 /* We attempt to be Github Flavored Markdown compatible here. GFM accepts
3814 * only tildes sequences of length 1 and 2, and the length of the opener
3815 * and closer has to match. */
3816
3817 if((mark->flags & MD_MARK_POTENTIAL_CLOSER) && stack->top >= 0) {
3818 int opener_index = stack->top;
3819
3820 md_mark_stack_pop(ctx, stack);
3821 md_rollback(ctx, opener_index, closer_index: mark_index, MD_ROLLBACK_CROSSING);
3822 md_resolve_range(ctx, opener_index, closer_index: mark_index);
3823 return;
3824 }
3825
3826 if(mark->flags & MD_MARK_POTENTIAL_OPENER)
3827 md_mark_stack_push(ctx, stack, mark_index);
3828}
3829
3830static void
3831md_analyze_dollar(MD_CTX* ctx, int mark_index)
3832{
3833 MD_MARK* mark = &ctx->marks[mark_index];
3834
3835 if((mark->flags & MD_MARK_POTENTIAL_CLOSER) && DOLLAR_OPENERS.top >= 0) {
3836 /* If the potential closer has a non-matching number of $, discard */
3837 MD_MARK* opener = &ctx->marks[DOLLAR_OPENERS.top];
3838 int opener_index = DOLLAR_OPENERS.top;
3839 MD_MARK* closer = mark;
3840 int closer_index = mark_index;
3841
3842 if(opener->end - opener->beg == closer->end - closer->beg) {
3843 /* We are the matching closer */
3844 md_mark_stack_pop(ctx, stack: &DOLLAR_OPENERS);
3845 md_rollback(ctx, opener_index, closer_index, MD_ROLLBACK_ALL);
3846 md_resolve_range(ctx, opener_index, closer_index);
3847
3848 /* Discard all pending openers: Latex math span do not allow
3849 * nesting. */
3850 DOLLAR_OPENERS.top = -1;
3851 return;
3852 }
3853 }
3854
3855 if(mark->flags & MD_MARK_POTENTIAL_OPENER)
3856 md_mark_stack_push(ctx, stack: &DOLLAR_OPENERS, mark_index);
3857}
3858
3859static MD_MARK*
3860md_scan_left_for_resolved_mark(MD_CTX* ctx, MD_MARK* mark_from, OFF off, MD_MARK** p_cursor)
3861{
3862 MD_MARK* mark;
3863
3864 for(mark = mark_from; mark >= ctx->marks; mark--) {
3865 if(mark->ch == 'D' || mark->beg > off)
3866 continue;
3867 if(mark->beg <= off && off < mark->end && (mark->flags & MD_MARK_RESOLVED)) {
3868 if(p_cursor != NULL)
3869 *p_cursor = mark;
3870 return mark;
3871 }
3872 if(mark->end <= off)
3873 break;
3874 }
3875
3876 if(p_cursor != NULL)
3877 *p_cursor = mark;
3878 return NULL;
3879}
3880
3881static MD_MARK*
3882md_scan_right_for_resolved_mark(MD_CTX* ctx, MD_MARK* mark_from, OFF off, MD_MARK** p_cursor)
3883{
3884 MD_MARK* mark;
3885
3886 for(mark = mark_from; mark < ctx->marks + ctx->n_marks; mark++) {
3887 if(mark->ch == 'D' || mark->end <= off)
3888 continue;
3889 if(mark->beg <= off && off < mark->end && (mark->flags & MD_MARK_RESOLVED)) {
3890 if(p_cursor != NULL)
3891 *p_cursor = mark;
3892 return mark;
3893 }
3894 if(mark->beg > off)
3895 break;
3896 }
3897
3898 if(p_cursor != NULL)
3899 *p_cursor = mark;
3900 return NULL;
3901}
3902
3903static void
3904md_analyze_permissive_autolink(MD_CTX* ctx, int mark_index)
3905{
3906 static const struct {
3907 const MD_CHAR start_char;
3908 const MD_CHAR delim_char;
3909 const MD_CHAR* allowed_nonalnum_chars;
3910 int min_components;
3911 const MD_CHAR optional_end_char;
3912 } URL_MAP[] = {
3913 { _T('\0'), _T('.'), _T(".-_"), 2, _T('\0') }, /* host, mandatory */
3914 { _T('/'), _T('/'), _T("/.-_"), 0, _T('/') }, /* path */
3915 { _T('?'), _T('&'), _T("&.-+_=()"), 1, _T('\0') }, /* query */
3916 { _T('#'), _T('\0'), _T(".-+_") , 1, _T('\0') } /* fragment */
3917 };
3918
3919 MD_MARK* opener = &ctx->marks[mark_index];
3920 MD_MARK* closer = &ctx->marks[mark_index + 1]; /* The dummy. */
3921 OFF line_beg = closer->beg; /* md_collect_mark() set this for us */
3922 OFF line_end = closer->end; /* ditto */
3923 OFF beg = opener->beg;
3924 OFF end = opener->end;
3925 MD_MARK* left_cursor = opener;
3926 int left_boundary_ok = FALSE;
3927 MD_MARK* right_cursor = opener;
3928 int right_boundary_ok = FALSE;
3929 unsigned i;
3930
3931 MD_ASSERT(closer->ch == 'D');
3932
3933 if(opener->ch == '@') {
3934 MD_ASSERT(CH(opener->beg) == _T('@'));
3935
3936 /* Scan backwards for the user name (before '@'). */
3937 while(beg > line_beg) {
3938 if(ISALNUM(beg-1))
3939 beg--;
3940 else if(beg >= line_beg+2 && ISALNUM(beg-2) &&
3941 ISANYOF(beg-1, _T(".-_+")) &&
3942 md_scan_left_for_resolved_mark(ctx, mark_from: left_cursor, off: beg-1, p_cursor: &left_cursor) == NULL &&
3943 ISALNUM(beg))
3944 beg--;
3945 else
3946 break;
3947 }
3948 if(beg == opener->beg) /* empty user name */
3949 return;
3950 }
3951
3952 /* Verify there's line boundary, whitespace, allowed punctuation or
3953 * resolved emphasis mark just before the suspected autolink. */
3954 if(beg == line_beg || ISUNICODEWHITESPACEBEFORE(beg) || ISANYOF(beg-1, _T("({["))) {
3955 left_boundary_ok = TRUE;
3956 } else if(ISANYOF(beg-1, _T("*_~"))) {
3957 MD_MARK* left_mark;
3958
3959 left_mark = md_scan_left_for_resolved_mark(ctx, mark_from: left_cursor, off: beg-1, p_cursor: &left_cursor);
3960 if(left_mark != NULL && (left_mark->flags & MD_MARK_OPENER))
3961 left_boundary_ok = TRUE;
3962 }
3963 if(!left_boundary_ok)
3964 return;
3965
3966 for(i = 0; i < SIZEOF_ARRAY(URL_MAP); i++) {
3967 int n_components = 0;
3968 int n_open_brackets = 0;
3969
3970 if(URL_MAP[i].start_char != _T('\0')) {
3971 if(end >= line_end || CH(end) != URL_MAP[i].start_char)
3972 continue;
3973 if(URL_MAP[i].min_components > 0 && (end+1 >= line_end || !ISALNUM(end+1)))
3974 continue;
3975 end++;
3976 }
3977
3978 while(end < line_end) {
3979 if(ISALNUM(end)) {
3980 if(n_components == 0)
3981 n_components++;
3982 end++;
3983 } else if(end < line_end &&
3984 ISANYOF(end, URL_MAP[i].allowed_nonalnum_chars) &&
3985 md_scan_right_for_resolved_mark(ctx, mark_from: right_cursor, off: end, p_cursor: &right_cursor) == NULL &&
3986 ((end > line_beg && (ISALNUM(end-1) || CH(end-1) == _T(')'))) || CH(end) == _T('(')) &&
3987 ((end+1 < line_end && (ISALNUM(end+1) || CH(end+1) == _T('('))) || CH(end) == _T(')')))
3988 {
3989 if(CH(end) == URL_MAP[i].delim_char)
3990 n_components++;
3991
3992 /* brackets have to be balanced. */
3993 if(CH(end) == _T('(')) {
3994 n_open_brackets++;
3995 } else if(CH(end) == _T(')')) {
3996 if(n_open_brackets <= 0)
3997 break;
3998 n_open_brackets--;
3999 }
4000
4001 end++;
4002 } else {
4003 break;
4004 }
4005 }
4006
4007 if(end < line_end && URL_MAP[i].optional_end_char != _T('\0') &&
4008 CH(end) == URL_MAP[i].optional_end_char)
4009 end++;
4010
4011 if(n_components < URL_MAP[i].min_components || n_open_brackets != 0)
4012 return;
4013
4014 if(opener->ch == '@') /* E-mail autolinks wants only the host. */
4015 break;
4016 }
4017
4018 /* Verify there's line boundary, whitespace, allowed punctuation or
4019 * resolved emphasis mark just after the suspected autolink. */
4020 if(end == line_end || ISUNICODEWHITESPACE(end) || ISANYOF(end, _T(")}].!?,;"))) {
4021 right_boundary_ok = TRUE;
4022 } else {
4023 MD_MARK* right_mark;
4024
4025 right_mark = md_scan_right_for_resolved_mark(ctx, mark_from: right_cursor, off: end, p_cursor: &right_cursor);
4026 if(right_mark != NULL && (right_mark->flags & MD_MARK_CLOSER))
4027 right_boundary_ok = TRUE;
4028 }
4029 if(!right_boundary_ok)
4030 return;
4031
4032 /* Success, we are an autolink. */
4033 opener->beg = beg;
4034 opener->end = beg;
4035 closer->beg = end;
4036 closer->end = end;
4037 closer->ch = opener->ch;
4038 md_resolve_range(ctx, opener_index: mark_index, closer_index: mark_index + 1);
4039}
4040
4041#define MD_ANALYZE_NOSKIP_EMPH 0x01
4042
4043static inline void
4044md_analyze_marks(MD_CTX* ctx, const MD_LINE* lines, MD_SIZE n_lines,
4045 int mark_beg, int mark_end, const CHAR* mark_chars, unsigned flags)
4046{
4047 int i = mark_beg;
4048 OFF last_end = lines[0].beg;
4049
4050 MD_UNUSED(lines);
4051 MD_UNUSED(n_lines);
4052
4053 while(i < mark_end) {
4054 MD_MARK* mark = &ctx->marks[i];
4055
4056 /* Skip resolved spans. */
4057 if(mark->flags & MD_MARK_RESOLVED) {
4058 if((mark->flags & MD_MARK_OPENER) &&
4059 !((flags & MD_ANALYZE_NOSKIP_EMPH) && ISANYOF_(mark->ch, "*_~")))
4060 {
4061 MD_ASSERT(i < mark->next);
4062 i = mark->next + 1;
4063 } else {
4064 i++;
4065 }
4066 continue;
4067 }
4068
4069 /* Skip marks we do not want to deal with. */
4070 if(!ISANYOF_(mark->ch, mark_chars)) {
4071 i++;
4072 continue;
4073 }
4074
4075 /* The resolving in previous step could have expanded a mark. */
4076 if(mark->beg < last_end) {
4077 i++;
4078 continue;
4079 }
4080
4081 /* Analyze the mark. */
4082 switch(mark->ch) {
4083 case '[': /* Pass through. */
4084 case '!': /* Pass through. */
4085 case ']': md_analyze_bracket(ctx, mark_index: i); break;
4086 case '&': md_analyze_entity(ctx, mark_index: i); break;
4087 case '|': md_analyze_table_cell_boundary(ctx, mark_index: i); break;
4088 case '_': /* Pass through. */
4089 case '*': md_analyze_emph(ctx, mark_index: i); break;
4090 case '~': md_analyze_tilde(ctx, mark_index: i); break;
4091 case '$': md_analyze_dollar(ctx, mark_index: i); break;
4092 case '.': /* Pass through. */
4093 case ':': /* Pass through. */
4094 case '@': md_analyze_permissive_autolink(ctx, mark_index: i); break;
4095 }
4096
4097 if(mark->flags & MD_MARK_RESOLVED) {
4098 if(mark->flags & MD_MARK_OPENER)
4099 last_end = ctx->marks[mark->next].end;
4100 else
4101 last_end = mark->end;
4102 }
4103
4104 i++;
4105 }
4106}
4107
4108/* Analyze marks (build ctx->marks). */
4109static int
4110md_analyze_inlines(MD_CTX* ctx, const MD_LINE* lines, MD_SIZE n_lines, int table_mode)
4111{
4112 int ret;
4113
4114 /* Reset the previously collected stack of marks. */
4115 ctx->n_marks = 0;
4116
4117 /* Collect all marks. */
4118 MD_CHECK(md_collect_marks(ctx, lines, n_lines, table_mode));
4119
4120 /* (1) Links. */
4121 md_analyze_marks(ctx, lines, n_lines, mark_beg: 0, mark_end: ctx->n_marks, _T("[]!"), flags: 0);
4122 MD_CHECK(md_resolve_links(ctx, lines, n_lines));
4123 BRACKET_OPENERS.top = -1;
4124 ctx->unresolved_link_head = -1;
4125 ctx->unresolved_link_tail = -1;
4126
4127 if(table_mode) {
4128 /* (2) Analyze table cell boundaries. */
4129 MD_ASSERT(n_lines == 1);
4130 ctx->n_table_cell_boundaries = 0;
4131 md_analyze_marks(ctx, lines, n_lines, mark_beg: 0, mark_end: ctx->n_marks, _T("|"), flags: 0);
4132 return ret;
4133 }
4134
4135 /* (3) Emphasis and strong emphasis; permissive autolinks. */
4136 md_analyze_link_contents(ctx, lines, n_lines, mark_beg: 0, mark_end: ctx->n_marks);
4137
4138abort:
4139 return ret;
4140}
4141
4142static void
4143md_analyze_link_contents(MD_CTX* ctx, const MD_LINE* lines, MD_SIZE n_lines,
4144 int mark_beg, int mark_end)
4145{
4146 int i;
4147
4148 md_analyze_marks(ctx, lines, n_lines, mark_beg, mark_end, _T("&"), flags: 0);
4149 md_analyze_marks(ctx, lines, n_lines, mark_beg, mark_end, _T("*_~$"), flags: 0);
4150
4151 if((ctx->parser.flags & MD_FLAG_PERMISSIVEAUTOLINKS) != 0) {
4152 /* These have to be processed last, as they may be greedy and expand
4153 * from their original mark. Also their implementation must be careful
4154 * not to cross any (previously) resolved marks when doing so. */
4155 md_analyze_marks(ctx, lines, n_lines, mark_beg, mark_end, _T("@:."), MD_ANALYZE_NOSKIP_EMPH);
4156 }
4157
4158 for(i = 0; i < (int) SIZEOF_ARRAY(ctx->opener_stacks); i++)
4159 ctx->opener_stacks[i].top = -1;
4160}
4161
4162static int
4163md_enter_leave_span_a(MD_CTX* ctx, int enter, MD_SPANTYPE type,
4164 const CHAR* dest, SZ dest_size, int is_autolink,
4165 const CHAR* title, SZ title_size)
4166{
4167 MD_ATTRIBUTE_BUILD href_build = { 0 };
4168 MD_ATTRIBUTE_BUILD title_build = { 0 };
4169 MD_SPAN_A_DETAIL det;
4170 int ret = 0;
4171
4172 /* Note we here rely on fact that MD_SPAN_A_DETAIL and
4173 * MD_SPAN_IMG_DETAIL are binary-compatible. */
4174 memset(s: &det, c: 0, n: sizeof(MD_SPAN_A_DETAIL));
4175 MD_CHECK(md_build_attribute(ctx, dest, dest_size,
4176 (is_autolink ? MD_BUILD_ATTR_NO_ESCAPES : 0),
4177 &det.href, &href_build));
4178 MD_CHECK(md_build_attribute(ctx, title, title_size, 0, &det.title, &title_build));
4179 det.is_autolink = is_autolink;
4180 if(enter)
4181 MD_ENTER_SPAN(type, &det);
4182 else
4183 MD_LEAVE_SPAN(type, &det);
4184
4185abort:
4186 md_free_attribute(ctx, build: &href_build);
4187 md_free_attribute(ctx, build: &title_build);
4188 return ret;
4189}
4190
4191static int
4192md_enter_leave_span_wikilink(MD_CTX* ctx, int enter, const CHAR* target, SZ target_size)
4193{
4194 MD_ATTRIBUTE_BUILD target_build = { 0 };
4195 MD_SPAN_WIKILINK_DETAIL det;
4196 int ret = 0;
4197
4198 memset(s: &det, c: 0, n: sizeof(MD_SPAN_WIKILINK_DETAIL));
4199 MD_CHECK(md_build_attribute(ctx, target, target_size, 0, &det.target, &target_build));
4200
4201 if (enter)
4202 MD_ENTER_SPAN(MD_SPAN_WIKILINK, &det);
4203 else
4204 MD_LEAVE_SPAN(MD_SPAN_WIKILINK, &det);
4205
4206abort:
4207 md_free_attribute(ctx, build: &target_build);
4208 return ret;
4209}
4210
4211
4212/* Render the output, accordingly to the analyzed ctx->marks. */
4213static int
4214md_process_inlines(MD_CTX* ctx, const MD_LINE* lines, MD_SIZE n_lines)
4215{
4216 MD_TEXTTYPE text_type;
4217 const MD_LINE* line = lines;
4218 MD_MARK* prev_mark = NULL;
4219 MD_MARK* mark;
4220 OFF off = lines[0].beg;
4221 OFF end = lines[n_lines-1].end;
4222 OFF tmp;
4223 int enforce_hardbreak = 0;
4224 int ret = 0;
4225
4226 /* Find first resolved mark. Note there is always at least one resolved
4227 * mark, the dummy last one after the end of the latest line we actually
4228 * never really reach. This saves us of a lot of special checks and cases
4229 * in this function. */
4230 mark = ctx->marks;
4231 while(!(mark->flags & MD_MARK_RESOLVED))
4232 mark++;
4233
4234 text_type = MD_TEXT_NORMAL;
4235
4236 while(1) {
4237 /* Process the text up to the next mark or end-of-line. */
4238 tmp = (line->end < mark->beg ? line->end : mark->beg);
4239 if(tmp > off) {
4240 MD_TEXT(text_type, STR(off), tmp - off);
4241 off = tmp;
4242 }
4243
4244 /* If reached the mark, process it and move to next one. */
4245 if(off >= mark->beg) {
4246 switch(mark->ch) {
4247 case '\\': /* Backslash escape. */
4248 if(ISNEWLINE(mark->beg+1))
4249 enforce_hardbreak = 1;
4250 else
4251 MD_TEXT(text_type, STR(mark->beg+1), 1);
4252 break;
4253
4254 case ' ': /* Non-trivial space. */
4255 MD_TEXT(text_type, _T(" "), 1);
4256 break;
4257
4258 case '`': /* Code span. */
4259 if(mark->flags & MD_MARK_OPENER) {
4260 MD_ENTER_SPAN(MD_SPAN_CODE, NULL);
4261 text_type = MD_TEXT_CODE;
4262 } else {
4263 MD_LEAVE_SPAN(MD_SPAN_CODE, NULL);
4264 text_type = MD_TEXT_NORMAL;
4265 }
4266 break;
4267
4268 case '_': /* Underline (or emphasis if we fall through). */
4269 if(ctx->parser.flags & MD_FLAG_UNDERLINE) {
4270 if(mark->flags & MD_MARK_OPENER) {
4271 while(off < mark->end) {
4272 MD_ENTER_SPAN(MD_SPAN_U, NULL);
4273 off++;
4274 }
4275 } else {
4276 while(off < mark->end) {
4277 MD_LEAVE_SPAN(MD_SPAN_U, NULL);
4278 off++;
4279 }
4280 }
4281 break;
4282 }
4283 MD_FALLTHROUGH();
4284
4285 case '*': /* Emphasis, strong emphasis. */
4286 if(mark->flags & MD_MARK_OPENER) {
4287 if((mark->end - off) % 2) {
4288 MD_ENTER_SPAN(MD_SPAN_EM, NULL);
4289 off++;
4290 }
4291 while(off + 1 < mark->end) {
4292 MD_ENTER_SPAN(MD_SPAN_STRONG, NULL);
4293 off += 2;
4294 }
4295 } else {
4296 while(off + 1 < mark->end) {
4297 MD_LEAVE_SPAN(MD_SPAN_STRONG, NULL);
4298 off += 2;
4299 }
4300 if((mark->end - off) % 2) {
4301 MD_LEAVE_SPAN(MD_SPAN_EM, NULL);
4302 off++;
4303 }
4304 }
4305 break;
4306
4307 case '~':
4308 if(mark->flags & MD_MARK_OPENER)
4309 MD_ENTER_SPAN(MD_SPAN_DEL, NULL);
4310 else
4311 MD_LEAVE_SPAN(MD_SPAN_DEL, NULL);
4312 break;
4313
4314 case '$':
4315 if(mark->flags & MD_MARK_OPENER) {
4316 MD_ENTER_SPAN((mark->end - off) % 2 ? MD_SPAN_LATEXMATH : MD_SPAN_LATEXMATH_DISPLAY, NULL);
4317 text_type = MD_TEXT_LATEXMATH;
4318 } else {
4319 MD_LEAVE_SPAN((mark->end - off) % 2 ? MD_SPAN_LATEXMATH : MD_SPAN_LATEXMATH_DISPLAY, NULL);
4320 text_type = MD_TEXT_NORMAL;
4321 }
4322 break;
4323
4324 case '[': /* Link, wiki link, image. */
4325 case '!':
4326 case ']':
4327 {
4328 const MD_MARK* opener = (mark->ch != ']' ? mark : &ctx->marks[mark->prev]);
4329 const MD_MARK* closer = &ctx->marks[opener->next];
4330 const MD_MARK* dest_mark;
4331 const MD_MARK* title_mark;
4332
4333 if ((opener->ch == '[' && closer->ch == ']') &&
4334 opener->end - opener->beg >= 2 &&
4335 closer->end - closer->beg >= 2)
4336 {
4337 int has_label = (opener->end - opener->beg > 2);
4338 SZ target_sz;
4339
4340 if(has_label)
4341 target_sz = opener->end - (opener->beg+2);
4342 else
4343 target_sz = closer->beg - opener->end;
4344
4345 MD_CHECK(md_enter_leave_span_wikilink(ctx, (mark->ch != ']'),
4346 has_label ? STR(opener->beg+2) : STR(opener->end),
4347 target_sz));
4348
4349 break;
4350 }
4351
4352 dest_mark = opener+1;
4353 MD_ASSERT(dest_mark->ch == 'D');
4354 title_mark = opener+2;
4355 MD_ASSERT(title_mark->ch == 'D');
4356
4357 MD_CHECK(md_enter_leave_span_a(ctx, (mark->ch != ']'),
4358 (opener->ch == '!' ? MD_SPAN_IMG : MD_SPAN_A),
4359 STR(dest_mark->beg), dest_mark->end - dest_mark->beg, FALSE,
4360 md_mark_get_ptr(ctx, (int)(title_mark - ctx->marks)),
4361 title_mark->prev));
4362
4363 /* link/image closer may span multiple lines. */
4364 if(mark->ch == ']') {
4365 while(mark->end > line->end)
4366 line++;
4367 }
4368
4369 break;
4370 }
4371
4372 case '<':
4373 case '>': /* Autolink or raw HTML. */
4374 if(!(mark->flags & MD_MARK_AUTOLINK)) {
4375 /* Raw HTML. */
4376 if(mark->flags & MD_MARK_OPENER)
4377 text_type = MD_TEXT_HTML;
4378 else
4379 text_type = MD_TEXT_NORMAL;
4380 break;
4381 }
4382 /* Pass through, if auto-link. */
4383 MD_FALLTHROUGH();
4384
4385 case '@': /* Permissive e-mail autolink. */
4386 case ':': /* Permissive URL autolink. */
4387 case '.': /* Permissive WWW autolink. */
4388 {
4389 MD_MARK* opener = ((mark->flags & MD_MARK_OPENER) ? mark : &ctx->marks[mark->prev]);
4390 MD_MARK* closer = &ctx->marks[opener->next];
4391 const CHAR* dest = STR(opener->end);
4392 SZ dest_size = closer->beg - opener->end;
4393
4394 /* For permissive auto-links we do not know closer mark
4395 * position at the time of md_collect_marks(), therefore
4396 * it can be out-of-order in ctx->marks[].
4397 *
4398 * With this flag, we make sure that we output the closer
4399 * only if we processed the opener. */
4400 if(mark->flags & MD_MARK_OPENER)
4401 closer->flags |= MD_MARK_VALIDPERMISSIVEAUTOLINK;
4402
4403 if(opener->ch == '@' || opener->ch == '.' ||
4404 (opener->ch == '<' && (opener->flags & MD_MARK_AUTOLINK_MISSING_MAILTO)))
4405 {
4406 dest_size += 7;
4407 MD_TEMP_BUFFER(dest_size * sizeof(CHAR));
4408 memcpy(dest: ctx->buffer,
4409 src: (opener->ch == '.' ? _T("http://") : _T("mailto:")),
4410 n: 7 * sizeof(CHAR));
4411 memcpy(dest: ctx->buffer + 7, src: dest, n: (dest_size-7) * sizeof(CHAR));
4412 dest = ctx->buffer;
4413 }
4414
4415 if(closer->flags & MD_MARK_VALIDPERMISSIVEAUTOLINK)
4416 MD_CHECK(md_enter_leave_span_a(ctx, (mark->flags & MD_MARK_OPENER),
4417 MD_SPAN_A, dest, dest_size, TRUE, NULL, 0));
4418 break;
4419 }
4420
4421 case '&': /* Entity. */
4422 MD_TEXT(MD_TEXT_ENTITY, STR(mark->beg), mark->end - mark->beg);
4423 break;
4424
4425 case '\0':
4426 MD_TEXT(MD_TEXT_NULLCHAR, _T(""), 1);
4427 break;
4428
4429 case 127:
4430 goto abort;
4431 }
4432
4433 off = mark->end;
4434
4435 /* Move to next resolved mark. */
4436 prev_mark = mark;
4437 mark++;
4438 while(!(mark->flags & MD_MARK_RESOLVED) || mark->beg < off)
4439 mark++;
4440 }
4441
4442 /* If reached end of line, move to next one. */
4443 if(off >= line->end) {
4444 /* If it is the last line, we are done. */
4445 if(off >= end)
4446 break;
4447
4448 if(text_type == MD_TEXT_CODE || text_type == MD_TEXT_LATEXMATH) {
4449 MD_ASSERT(prev_mark != NULL);
4450 MD_ASSERT(ISANYOF2_(prev_mark->ch, '`', '$') && (prev_mark->flags & MD_MARK_OPENER));
4451 MD_ASSERT(ISANYOF2_(mark->ch, '`', '$') && (mark->flags & MD_MARK_CLOSER));
4452
4453 /* Inside a code span, trailing line whitespace has to be
4454 * outputted. */
4455 tmp = off;
4456 while(off < ctx->size && ISBLANK(off))
4457 off++;
4458 if(off > tmp)
4459 MD_TEXT(text_type, STR(tmp), off-tmp);
4460
4461 /* and new lines are transformed into single spaces. */
4462 if(off == line->end)
4463 MD_TEXT(text_type, _T(" "), 1);
4464 } else if(text_type == MD_TEXT_HTML) {
4465 /* Inside raw HTML, we output the new line verbatim, including
4466 * any trailing spaces. */
4467 tmp = off;
4468 while(tmp < end && ISBLANK(tmp))
4469 tmp++;
4470 if(tmp > off)
4471 MD_TEXT(MD_TEXT_HTML, STR(off), tmp - off);
4472 MD_TEXT(MD_TEXT_HTML, _T("\n"), 1);
4473 } else {
4474 /* Output soft or hard line break. */
4475 MD_TEXTTYPE break_type = MD_TEXT_SOFTBR;
4476
4477 if(text_type == MD_TEXT_NORMAL) {
4478 if(ctx->parser.flags & MD_FLAG_HARD_SOFT_BREAKS)
4479 break_type = MD_TEXT_BR;
4480 else if(enforce_hardbreak)
4481 break_type = MD_TEXT_BR;
4482 else if((CH(line->end) == _T(' ') && CH(line->end+1) == _T(' ')))
4483 break_type = MD_TEXT_BR;
4484 }
4485
4486 MD_TEXT(break_type, _T("\n"), 1);
4487 }
4488
4489 /* Move to the next line. */
4490 line++;
4491 off = line->beg;
4492
4493 enforce_hardbreak = 0;
4494 }
4495 }
4496
4497abort:
4498 return ret;
4499}
4500
4501
4502/***************************
4503 *** Processing Tables ***
4504 ***************************/
4505
4506static void
4507md_analyze_table_alignment(MD_CTX* ctx, OFF beg, OFF end, MD_ALIGN* align, int n_align)
4508{
4509 static const MD_ALIGN align_map[] = { MD_ALIGN_DEFAULT, MD_ALIGN_LEFT, MD_ALIGN_RIGHT, MD_ALIGN_CENTER };
4510 OFF off = beg;
4511
4512 while(n_align > 0) {
4513 int index = 0; /* index into align_map[] */
4514
4515 while(CH(off) != _T('-'))
4516 off++;
4517 if(off > beg && CH(off-1) == _T(':'))
4518 index |= 1;
4519 while(off < end && CH(off) == _T('-'))
4520 off++;
4521 if(off < end && CH(off) == _T(':'))
4522 index |= 2;
4523
4524 *align = align_map[index];
4525 align++;
4526 n_align--;
4527 }
4528
4529}
4530
4531/* Forward declaration. */
4532static int md_process_normal_block_contents(MD_CTX* ctx, const MD_LINE* lines, MD_SIZE n_lines);
4533
4534static int
4535md_process_table_cell(MD_CTX* ctx, MD_BLOCKTYPE cell_type, MD_ALIGN align, OFF beg, OFF end)
4536{
4537 MD_LINE line;
4538 MD_BLOCK_TD_DETAIL det;
4539 int ret = 0;
4540
4541 while(beg < end && ISWHITESPACE(beg))
4542 beg++;
4543 while(end > beg && ISWHITESPACE(end-1))
4544 end--;
4545
4546 det.align = align;
4547 line.beg = beg;
4548 line.end = end;
4549
4550 MD_ENTER_BLOCK(cell_type, &det);
4551 MD_CHECK(md_process_normal_block_contents(ctx, &line, 1));
4552 MD_LEAVE_BLOCK(cell_type, &det);
4553
4554abort:
4555 return ret;
4556}
4557
4558static int
4559md_process_table_row(MD_CTX* ctx, MD_BLOCKTYPE cell_type, OFF beg, OFF end,
4560 const MD_ALIGN* align, int col_count)
4561{
4562 MD_LINE line;
4563 OFF* pipe_offs = NULL;
4564 int i, j, k, n;
4565 int ret = 0;
4566
4567 line.beg = beg;
4568 line.end = end;
4569
4570 /* Break the line into table cells by identifying pipe characters who
4571 * form the cell boundary. */
4572 MD_CHECK(md_analyze_inlines(ctx, &line, 1, TRUE));
4573
4574 /* We have to remember the cell boundaries in local buffer because
4575 * ctx->marks[] shall be reused during cell contents processing. */
4576 n = ctx->n_table_cell_boundaries + 2;
4577 pipe_offs = (OFF*) malloc(size: n * sizeof(OFF));
4578 if(pipe_offs == NULL) {
4579 MD_LOG("malloc() failed.");
4580 ret = -1;
4581 goto abort;
4582 }
4583 j = 0;
4584 pipe_offs[j++] = beg;
4585 for(i = ctx->table_cell_boundaries_head; i >= 0; i = ctx->marks[i].next) {
4586 MD_MARK* mark = &ctx->marks[i];
4587 pipe_offs[j++] = mark->end;
4588 }
4589 pipe_offs[j++] = end+1;
4590
4591 /* Process cells. */
4592 MD_ENTER_BLOCK(MD_BLOCK_TR, NULL);
4593 k = 0;
4594 for(i = 0; i < j-1 && k < col_count; i++) {
4595 if(pipe_offs[i] < pipe_offs[i+1]-1)
4596 MD_CHECK(md_process_table_cell(ctx, cell_type, align[k++], pipe_offs[i], pipe_offs[i+1]-1));
4597 }
4598 /* Make sure we call enough table cells even if the current table contains
4599 * too few of them. */
4600 while(k < col_count)
4601 MD_CHECK(md_process_table_cell(ctx, cell_type, align[k++], 0, 0));
4602 MD_LEAVE_BLOCK(MD_BLOCK_TR, NULL);
4603
4604abort:
4605 free(ptr: pipe_offs);
4606
4607 ctx->table_cell_boundaries_head = -1;
4608 ctx->table_cell_boundaries_tail = -1;
4609
4610 return ret;
4611}
4612
4613static int
4614md_process_table_block_contents(MD_CTX* ctx, int col_count, const MD_LINE* lines, MD_SIZE n_lines)
4615{
4616 MD_ALIGN* align;
4617 MD_SIZE line_index;
4618 int ret = 0;
4619
4620 /* At least two lines have to be present: The column headers and the line
4621 * with the underlines. */
4622 MD_ASSERT(n_lines >= 2);
4623
4624 align = malloc(size: col_count * sizeof(MD_ALIGN));
4625 if(align == NULL) {
4626 MD_LOG("malloc() failed.");
4627 ret = -1;
4628 goto abort;
4629 }
4630
4631 md_analyze_table_alignment(ctx, beg: lines[1].beg, end: lines[1].end, align, n_align: col_count);
4632
4633 MD_ENTER_BLOCK(MD_BLOCK_THEAD, NULL);
4634 MD_CHECK(md_process_table_row(ctx, MD_BLOCK_TH,
4635 lines[0].beg, lines[0].end, align, col_count));
4636 MD_LEAVE_BLOCK(MD_BLOCK_THEAD, NULL);
4637
4638 if(n_lines > 2) {
4639 MD_ENTER_BLOCK(MD_BLOCK_TBODY, NULL);
4640 for(line_index = 2; line_index < n_lines; line_index++) {
4641 MD_CHECK(md_process_table_row(ctx, MD_BLOCK_TD,
4642 lines[line_index].beg, lines[line_index].end, align, col_count));
4643 }
4644 MD_LEAVE_BLOCK(MD_BLOCK_TBODY, NULL);
4645 }
4646
4647abort:
4648 free(ptr: align);
4649 return ret;
4650}
4651
4652
4653/**************************
4654 *** Processing Block ***
4655 **************************/
4656
/* Bits for MD_BLOCK::flags. */

/* The block record opens, respectively closes, a container block. */
#define MD_BLOCK_CONTAINER_OPENER   0x01
#define MD_BLOCK_CONTAINER_CLOSER   0x02
/* Mask for testing whether the record is a container record at all. */
#define MD_BLOCK_CONTAINER          (MD_BLOCK_CONTAINER_OPENER | MD_BLOCK_CONTAINER_CLOSER)
/* The list is loose (as opposed to tight). */
#define MD_BLOCK_LOOSE_LIST         0x04
#define MD_BLOCK_SETEXT_HEADER      0x08
4662
4663struct MD_BLOCK_tag {
4664 MD_BLOCKTYPE type : 8;
4665 unsigned flags : 8;
4666
4667 /* MD_BLOCK_H: Header level (1 - 6)
4668 * MD_BLOCK_CODE: Non-zero if fenced, zero if indented.
4669 * MD_BLOCK_LI: Task mark character (0 if not task list item, 'x', 'X' or ' ').
4670 * MD_BLOCK_TABLE: Column count (as determined by the table underline).
4671 */
4672 unsigned data : 16;
4673
4674 /* Leaf blocks: Count of lines (MD_LINE or MD_VERBATIMLINE) on the block.
4675 * MD_BLOCK_LI: Task mark offset in the input doc.
4676 * MD_BLOCK_OL: Start item number.
4677 */
4678 MD_SIZE n_lines;
4679};
4680
4681struct MD_CONTAINER_tag {
4682 CHAR ch;
4683 unsigned is_loose : 8;
4684 unsigned is_task : 8;
4685 unsigned start;
4686 unsigned mark_indent;
4687 unsigned contents_indent;
4688 OFF block_byte_off;
4689 OFF task_mark_off;
4690};
4691
4692
4693static int
4694md_process_normal_block_contents(MD_CTX* ctx, const MD_LINE* lines, MD_SIZE n_lines)
4695{
4696 int i;
4697 int ret;
4698
4699 MD_CHECK(md_analyze_inlines(ctx, lines, n_lines, FALSE));
4700 MD_CHECK(md_process_inlines(ctx, lines, n_lines));
4701
4702abort:
4703 /* Free any temporary memory blocks stored within some dummy marks. */
4704 for(i = ctx->ptr_stack.top; i >= 0; i = ctx->marks[i].next)
4705 free(ptr: md_mark_get_ptr(ctx, mark_index: i));
4706 ctx->ptr_stack.top = -1;
4707
4708 return ret;
4709}
4710
4711static int
4712md_process_verbatim_block_contents(MD_CTX* ctx, MD_TEXTTYPE text_type, const MD_VERBATIMLINE* lines, MD_SIZE n_lines)
4713{
4714 static const CHAR indent_chunk_str[] = _T(" ");
4715 static const SZ indent_chunk_size = SIZEOF_ARRAY(indent_chunk_str) - 1;
4716
4717 MD_SIZE line_index;
4718 int ret = 0;
4719
4720 for(line_index = 0; line_index < n_lines; line_index++) {
4721 const MD_VERBATIMLINE* line = &lines[line_index];
4722 int indent = line->indent;
4723
4724 MD_ASSERT(indent >= 0);
4725
4726 /* Output code indentation. */
4727 while(indent > (int) indent_chunk_size) {
4728 MD_TEXT(text_type, indent_chunk_str, indent_chunk_size);
4729 indent -= indent_chunk_size;
4730 }
4731 if(indent > 0)
4732 MD_TEXT(text_type, indent_chunk_str, indent);
4733
4734 /* Output the code line itself. */
4735 MD_TEXT_INSECURE(text_type, STR(line->beg), line->end - line->beg);
4736
4737 /* Enforce end-of-line. */
4738 MD_TEXT(text_type, _T("\n"), 1);
4739 }
4740
4741abort:
4742 return ret;
4743}
4744
4745static int
4746md_process_code_block_contents(MD_CTX* ctx, int is_fenced, const MD_VERBATIMLINE* lines, MD_SIZE n_lines)
4747{
4748 if(is_fenced) {
4749 /* Skip the first line in case of fenced code: It is the fence.
4750 * (Only the starting fence is present due to logic in md_analyze_line().) */
4751 lines++;
4752 n_lines--;
4753 } else {
4754 /* Ignore blank lines at start/end of indented code block. */
4755 while(n_lines > 0 && lines[0].beg == lines[0].end) {
4756 lines++;
4757 n_lines--;
4758 }
4759 while(n_lines > 0 && lines[n_lines-1].beg == lines[n_lines-1].end) {
4760 n_lines--;
4761 }
4762 }
4763
4764 if(n_lines == 0)
4765 return 0;
4766
4767 return md_process_verbatim_block_contents(ctx, text_type: MD_TEXT_CODE, lines, n_lines);
4768}
4769
4770static int
4771md_setup_fenced_code_detail(MD_CTX* ctx, const MD_BLOCK* block, MD_BLOCK_CODE_DETAIL* det,
4772 MD_ATTRIBUTE_BUILD* info_build, MD_ATTRIBUTE_BUILD* lang_build)
4773{
4774 const MD_VERBATIMLINE* fence_line = (const MD_VERBATIMLINE*)(block + 1);
4775 OFF beg = fence_line->beg;
4776 OFF end = fence_line->end;
4777 OFF lang_end;
4778 CHAR fence_ch = CH(fence_line->beg);
4779 int ret = 0;
4780
4781 /* Skip the fence itself. */
4782 while(beg < ctx->size && CH(beg) == fence_ch)
4783 beg++;
4784 /* Trim initial spaces. */
4785 while(beg < ctx->size && CH(beg) == _T(' '))
4786 beg++;
4787
4788 /* Trim trailing spaces. */
4789 while(end > beg && CH(end-1) == _T(' '))
4790 end--;
4791
4792 /* Build info string attribute. */
4793 MD_CHECK(md_build_attribute(ctx, STR(beg), end - beg, 0, &det->info, info_build));
4794
4795 /* Build info string attribute. */
4796 lang_end = beg;
4797 while(lang_end < end && !ISWHITESPACE(lang_end))
4798 lang_end++;
4799 MD_CHECK(md_build_attribute(ctx, STR(beg), lang_end - beg, 0, &det->lang, lang_build));
4800
4801 det->fence_char = fence_ch;
4802
4803abort:
4804 return ret;
4805}
4806
4807static int
4808md_process_leaf_block(MD_CTX* ctx, const MD_BLOCK* block)
4809{
4810 union {
4811 MD_BLOCK_H_DETAIL header;
4812 MD_BLOCK_CODE_DETAIL code;
4813 MD_BLOCK_TABLE_DETAIL table;
4814 } det;
4815 MD_ATTRIBUTE_BUILD info_build;
4816 MD_ATTRIBUTE_BUILD lang_build;
4817 int is_in_tight_list;
4818 int clean_fence_code_detail = FALSE;
4819 int ret = 0;
4820
4821 memset(s: &det, c: 0, n: sizeof(det));
4822
4823 if(ctx->n_containers == 0)
4824 is_in_tight_list = FALSE;
4825 else
4826 is_in_tight_list = !ctx->containers[ctx->n_containers-1].is_loose;
4827
4828 switch(block->type) {
4829 case MD_BLOCK_H:
4830 det.header.level = block->data;
4831 break;
4832
4833 case MD_BLOCK_CODE:
4834 /* For fenced code block, we may need to set the info string. */
4835 if(block->data != 0) {
4836 memset(s: &det.code, c: 0, n: sizeof(MD_BLOCK_CODE_DETAIL));
4837 clean_fence_code_detail = TRUE;
4838 MD_CHECK(md_setup_fenced_code_detail(ctx, block, &det.code, &info_build, &lang_build));
4839 }
4840 break;
4841
4842 case MD_BLOCK_TABLE:
4843 det.table.col_count = block->data;
4844 det.table.head_row_count = 1;
4845 det.table.body_row_count = block->n_lines - 2;
4846 break;
4847
4848 default:
4849 /* Noop. */
4850 break;
4851 }
4852
4853 if(!is_in_tight_list || block->type != MD_BLOCK_P)
4854 MD_ENTER_BLOCK(block->type, (void*) &det);
4855
4856 /* Process the block contents accordingly to is type. */
4857 switch(block->type) {
4858 case MD_BLOCK_HR:
4859 /* noop */
4860 break;
4861
4862 case MD_BLOCK_CODE:
4863 MD_CHECK(md_process_code_block_contents(ctx, (block->data != 0),
4864 (const MD_VERBATIMLINE*)(block + 1), block->n_lines));
4865 break;
4866
4867 case MD_BLOCK_HTML:
4868 MD_CHECK(md_process_verbatim_block_contents(ctx, MD_TEXT_HTML,
4869 (const MD_VERBATIMLINE*)(block + 1), block->n_lines));
4870 break;
4871
4872 case MD_BLOCK_TABLE:
4873 MD_CHECK(md_process_table_block_contents(ctx, block->data,
4874 (const MD_LINE*)(block + 1), block->n_lines));
4875 break;
4876
4877 default:
4878 MD_CHECK(md_process_normal_block_contents(ctx,
4879 (const MD_LINE*)(block + 1), block->n_lines));
4880 break;
4881 }
4882
4883 if(!is_in_tight_list || block->type != MD_BLOCK_P)
4884 MD_LEAVE_BLOCK(block->type, (void*) &det);
4885
4886abort:
4887 if(clean_fence_code_detail) {
4888 md_free_attribute(ctx, build: &info_build);
4889 md_free_attribute(ctx, build: &lang_build);
4890 }
4891 return ret;
4892}
4893
4894static int
4895md_process_all_blocks(MD_CTX* ctx)
4896{
4897 int byte_off = 0;
4898 int ret = 0;
4899
4900 /* ctx->containers now is not needed for detection of lists and list items
4901 * so we reuse it for tracking what lists are loose or tight. We rely
4902 * on the fact the vector is large enough to hold the deepest nesting
4903 * level of lists. */
4904 ctx->n_containers = 0;
4905
4906 while(byte_off < ctx->n_block_bytes) {
4907 MD_BLOCK* block = (MD_BLOCK*)((char*)ctx->block_bytes + byte_off);
4908 union {
4909 MD_BLOCK_UL_DETAIL ul;
4910 MD_BLOCK_OL_DETAIL ol;
4911 MD_BLOCK_LI_DETAIL li;
4912 } det;
4913
4914 switch(block->type) {
4915 case MD_BLOCK_UL:
4916 det.ul.is_tight = (block->flags & MD_BLOCK_LOOSE_LIST) ? FALSE : TRUE;
4917 det.ul.mark = (CHAR) block->data;
4918 break;
4919
4920 case MD_BLOCK_OL:
4921 det.ol.start = block->n_lines;
4922 det.ol.is_tight = (block->flags & MD_BLOCK_LOOSE_LIST) ? FALSE : TRUE;
4923 det.ol.mark_delimiter = (CHAR) block->data;
4924 break;
4925
4926 case MD_BLOCK_LI:
4927 det.li.is_task = (block->data != 0);
4928 det.li.task_mark = (CHAR) block->data;
4929 det.li.task_mark_offset = (OFF) block->n_lines;
4930 break;
4931
4932 default:
4933 /* noop */
4934 break;
4935 }
4936
4937 if(block->flags & MD_BLOCK_CONTAINER) {
4938 if(block->flags & MD_BLOCK_CONTAINER_CLOSER) {
4939 MD_LEAVE_BLOCK(block->type, &det);
4940
4941 if(block->type == MD_BLOCK_UL || block->type == MD_BLOCK_OL || block->type == MD_BLOCK_QUOTE)
4942 ctx->n_containers--;
4943 }
4944
4945 if(block->flags & MD_BLOCK_CONTAINER_OPENER) {
4946 MD_ENTER_BLOCK(block->type, &det);
4947
4948 if(block->type == MD_BLOCK_UL || block->type == MD_BLOCK_OL) {
4949 ctx->containers[ctx->n_containers].is_loose = (block->flags & MD_BLOCK_LOOSE_LIST);
4950 ctx->n_containers++;
4951 } else if(block->type == MD_BLOCK_QUOTE) {
4952 /* This causes that any text in a block quote, even if
4953 * nested inside a tight list item, is wrapped with
4954 * <p>...</p>. */
4955 ctx->containers[ctx->n_containers].is_loose = TRUE;
4956 ctx->n_containers++;
4957 }
4958 }
4959 } else {
4960 MD_CHECK(md_process_leaf_block(ctx, block));
4961
4962 if(block->type == MD_BLOCK_CODE || block->type == MD_BLOCK_HTML)
4963 byte_off += block->n_lines * sizeof(MD_VERBATIMLINE);
4964 else
4965 byte_off += block->n_lines * sizeof(MD_LINE);
4966 }
4967
4968 byte_off += sizeof(MD_BLOCK);
4969 }
4970
4971 ctx->n_block_bytes = 0;
4972
4973abort:
4974 return ret;
4975}
4976
4977
4978/************************************
4979 *** Grouping Lines into Blocks ***
4980 ************************************/
4981
4982static void*
4983md_push_block_bytes(MD_CTX* ctx, int n_bytes)
4984{
4985 void* ptr;
4986
4987 if(ctx->n_block_bytes + n_bytes > ctx->alloc_block_bytes) {
4988 void* new_block_bytes;
4989
4990 ctx->alloc_block_bytes = (ctx->alloc_block_bytes > 0
4991 ? ctx->alloc_block_bytes + ctx->alloc_block_bytes / 2
4992 : 512);
4993 new_block_bytes = realloc(ptr: ctx->block_bytes, size: ctx->alloc_block_bytes);
4994 if(new_block_bytes == NULL) {
4995 MD_LOG("realloc() failed.");
4996 return NULL;
4997 }
4998
4999 /* Fix the ->current_block after the reallocation. */
5000 if(ctx->current_block != NULL) {
5001 OFF off_current_block = (OFF) ((char*) ctx->current_block - (char*) ctx->block_bytes);
5002 ctx->current_block = (MD_BLOCK*) ((char*) new_block_bytes + off_current_block);
5003 }
5004
5005 ctx->block_bytes = new_block_bytes;
5006 }
5007
5008 ptr = (char*)ctx->block_bytes + ctx->n_block_bytes;
5009 ctx->n_block_bytes += n_bytes;
5010 return ptr;
5011}
5012
5013static int
5014md_start_new_block(MD_CTX* ctx, const MD_LINE_ANALYSIS* line)
5015{
5016 MD_BLOCK* block;
5017
5018 MD_ASSERT(ctx->current_block == NULL);
5019
5020 block = (MD_BLOCK*) md_push_block_bytes(ctx, n_bytes: sizeof(MD_BLOCK));
5021 if(block == NULL)
5022 return -1;
5023
5024 switch(line->type) {
5025 case MD_LINE_HR:
5026 block->type = MD_BLOCK_HR;
5027 break;
5028
5029 case MD_LINE_ATXHEADER:
5030 case MD_LINE_SETEXTHEADER:
5031 block->type = MD_BLOCK_H;
5032 break;
5033
5034 case MD_LINE_FENCEDCODE:
5035 case MD_LINE_INDENTEDCODE:
5036 block->type = MD_BLOCK_CODE;
5037 break;
5038
5039 case MD_LINE_TEXT:
5040 block->type = MD_BLOCK_P;
5041 break;
5042
5043 case MD_LINE_HTML:
5044 block->type = MD_BLOCK_HTML;
5045 break;
5046
5047 case MD_LINE_BLANK:
5048 case MD_LINE_SETEXTUNDERLINE:
5049 case MD_LINE_TABLEUNDERLINE:
5050 default:
5051 MD_UNREACHABLE();
5052 break;
5053 }
5054
5055 block->flags = 0;
5056 block->data = line->data;
5057 block->n_lines = 0;
5058
5059 ctx->current_block = block;
5060 return 0;
5061}
5062
5063/* Eat from start of current (textual) block any reference definitions and
5064 * remember them so we can resolve any links referring to them.
5065 *
5066 * (Reference definitions can only be at start of it as they cannot break
5067 * a paragraph.)
5068 */
5069static int
5070md_consume_link_reference_definitions(MD_CTX* ctx)
5071{
5072 MD_LINE* lines = (MD_LINE*) (ctx->current_block + 1);
5073 MD_SIZE n_lines = ctx->current_block->n_lines;
5074 MD_SIZE n = 0;
5075
5076 /* Compute how many lines at the start of the block form one or more
5077 * reference definitions. */
5078 while(n < n_lines) {
5079 int n_link_ref_lines;
5080
5081 n_link_ref_lines = md_is_link_reference_definition(ctx,
5082 lines: lines + n, n_lines: n_lines - n);
5083 /* Not a reference definition? */
5084 if(n_link_ref_lines == 0)
5085 break;
5086
5087 /* We fail if it is the ref. def. but it could not be stored due
5088 * a memory allocation error. */
5089 if(n_link_ref_lines < 0)
5090 return -1;
5091
5092 n += n_link_ref_lines;
5093 }
5094
5095 /* If there was at least one reference definition, we need to remove
5096 * its lines from the block, or perhaps even the whole block. */
5097 if(n > 0) {
5098 if(n == n_lines) {
5099 /* Remove complete block. */
5100 ctx->n_block_bytes -= n * sizeof(MD_LINE);
5101 ctx->n_block_bytes -= sizeof(MD_BLOCK);
5102 ctx->current_block = NULL;
5103 } else {
5104 /* Remove just some initial lines from the block. */
5105 memmove(dest: lines, src: lines + n, n: (n_lines - n) * sizeof(MD_LINE));
5106 ctx->current_block->n_lines -= n;
5107 ctx->n_block_bytes -= n * sizeof(MD_LINE);
5108 }
5109 }
5110
5111 return 0;
5112}
5113
5114static int
5115md_end_current_block(MD_CTX* ctx)
5116{
5117 int ret = 0;
5118
5119 if(ctx->current_block == NULL)
5120 return ret;
5121
5122 /* Check whether there is a reference definition. (We do this here instead
5123 * of in md_analyze_line() because reference definition can take multiple
5124 * lines.) */
5125 if(ctx->current_block->type == MD_BLOCK_P ||
5126 (ctx->current_block->type == MD_BLOCK_H && (ctx->current_block->flags & MD_BLOCK_SETEXT_HEADER)))
5127 {
5128 MD_LINE* lines = (MD_LINE*) (ctx->current_block + 1);
5129 if(lines[0].beg < ctx->size && CH(lines[0].beg) == _T('[')) {
5130 MD_CHECK(md_consume_link_reference_definitions(ctx));
5131 if(ctx->current_block == NULL)
5132 return ret;
5133 }
5134 }
5135
5136 if(ctx->current_block->type == MD_BLOCK_H && (ctx->current_block->flags & MD_BLOCK_SETEXT_HEADER)) {
5137 MD_SIZE n_lines = ctx->current_block->n_lines;
5138
5139 if(n_lines > 1) {
5140 /* Get rid of the underline. */
5141 ctx->current_block->n_lines--;
5142 ctx->n_block_bytes -= sizeof(MD_LINE);
5143 } else {
5144 /* Only the underline has left after eating the ref. defs.
5145 * Keep the line as beginning of a new ordinary paragraph. */
5146 ctx->current_block->type = MD_BLOCK_P;
5147 return 0;
5148 }
5149 }
5150
5151 /* Mark we are not building any block anymore. */
5152 ctx->current_block = NULL;
5153
5154abort:
5155 return ret;
5156}
5157
5158static int
5159md_add_line_into_current_block(MD_CTX* ctx, const MD_LINE_ANALYSIS* analysis)
5160{
5161 MD_ASSERT(ctx->current_block != NULL);
5162
5163 if(ctx->current_block->type == MD_BLOCK_CODE || ctx->current_block->type == MD_BLOCK_HTML) {
5164 MD_VERBATIMLINE* line;
5165
5166 line = (MD_VERBATIMLINE*) md_push_block_bytes(ctx, n_bytes: sizeof(MD_VERBATIMLINE));
5167 if(line == NULL)
5168 return -1;
5169
5170 line->indent = analysis->indent;
5171 line->beg = analysis->beg;
5172 line->end = analysis->end;
5173 } else {
5174 MD_LINE* line;
5175
5176 line = (MD_LINE*) md_push_block_bytes(ctx, n_bytes: sizeof(MD_LINE));
5177 if(line == NULL)
5178 return -1;
5179
5180 line->beg = analysis->beg;
5181 line->end = analysis->end;
5182 }
5183 ctx->current_block->n_lines++;
5184
5185 return 0;
5186}
5187
5188static int
5189md_push_container_bytes(MD_CTX* ctx, MD_BLOCKTYPE type, unsigned start,
5190 unsigned data, unsigned flags)
5191{
5192 MD_BLOCK* block;
5193 int ret = 0;
5194
5195 MD_CHECK(md_end_current_block(ctx));
5196
5197 block = (MD_BLOCK*) md_push_block_bytes(ctx, n_bytes: sizeof(MD_BLOCK));
5198 if(block == NULL)
5199 return -1;
5200
5201 block->type = type;
5202 block->flags = flags;
5203 block->data = data;
5204 block->n_lines = start;
5205
5206abort:
5207 return ret;
5208}
5209
5210
5211
5212/***********************
5213 *** Line Analysis ***
5214 ***********************/
5215
5216static int
5217md_is_hr_line(MD_CTX* ctx, OFF beg, OFF* p_end, OFF* p_killer)
5218{
5219 OFF off = beg + 1;
5220 int n = 1;
5221
5222 while(off < ctx->size && (CH(off) == CH(beg) || CH(off) == _T(' ') || CH(off) == _T('\t'))) {
5223 if(CH(off) == CH(beg))
5224 n++;
5225 off++;
5226 }
5227
5228 if(n < 3) {
5229 *p_killer = off;
5230 return FALSE;
5231 }
5232
5233 /* Nothing else can be present on the line. */
5234 if(off < ctx->size && !ISNEWLINE(off)) {
5235 *p_killer = off;
5236 return FALSE;
5237 }
5238
5239 *p_end = off;
5240 return TRUE;
5241}
5242
5243static int
5244md_is_atxheader_line(MD_CTX* ctx, OFF beg, OFF* p_beg, OFF* p_end, unsigned* p_level)
5245{
5246 int n;
5247 OFF off = beg + 1;
5248
5249 while(off < ctx->size && CH(off) == _T('#') && off - beg < 7)
5250 off++;
5251 n = off - beg;
5252
5253 if(n > 6)
5254 return FALSE;
5255 *p_level = n;
5256
5257 if(!(ctx->parser.flags & MD_FLAG_PERMISSIVEATXHEADERS) && off < ctx->size &&
5258 CH(off) != _T(' ') && CH(off) != _T('\t') && !ISNEWLINE(off))
5259 return FALSE;
5260
5261 while(off < ctx->size && CH(off) == _T(' '))
5262 off++;
5263 *p_beg = off;
5264 *p_end = off;
5265 return TRUE;
5266}
5267
5268static int
5269md_is_setext_underline(MD_CTX* ctx, OFF beg, OFF* p_end, unsigned* p_level)
5270{
5271 OFF off = beg + 1;
5272
5273 while(off < ctx->size && CH(off) == CH(beg))
5274 off++;
5275
5276 /* Optionally, space(s) or tabs can follow. */
5277 while(off < ctx->size && ISBLANK(off))
5278 off++;
5279
5280 /* But nothing more is allowed on the line. */
5281 if(off < ctx->size && !ISNEWLINE(off))
5282 return FALSE;
5283
5284 *p_level = (CH(beg) == _T('=') ? 1 : 2);
5285 *p_end = off;
5286 return TRUE;
5287}
5288
5289static int
5290md_is_table_underline(MD_CTX* ctx, OFF beg, OFF* p_end, unsigned* p_col_count)
5291{
5292 OFF off = beg;
5293 int found_pipe = FALSE;
5294 unsigned col_count = 0;
5295
5296 if(off < ctx->size && CH(off) == _T('|')) {
5297 found_pipe = TRUE;
5298 off++;
5299 while(off < ctx->size && ISWHITESPACE(off))
5300 off++;
5301 }
5302
5303 while(1) {
5304 int delimited = FALSE;
5305
5306 /* Cell underline ("-----", ":----", "----:" or ":----:") */
5307 if(off < ctx->size && CH(off) == _T(':'))
5308 off++;
5309 if(off >= ctx->size || CH(off) != _T('-'))
5310 return FALSE;
5311 while(off < ctx->size && CH(off) == _T('-'))
5312 off++;
5313 if(off < ctx->size && CH(off) == _T(':'))
5314 off++;
5315
5316 col_count++;
5317 if(col_count > TABLE_MAXCOLCOUNT) {
5318 MD_LOG("Suppressing table (column_count >" STRINGIZE(TABLE_MAXCOLCOUNT) ")");
5319 return FALSE;
5320 }
5321
5322 /* Pipe delimiter (optional at the end of line). */
5323 while(off < ctx->size && ISWHITESPACE(off))
5324 off++;
5325 if(off < ctx->size && CH(off) == _T('|')) {
5326 delimited = TRUE;
5327 found_pipe = TRUE;
5328 off++;
5329 while(off < ctx->size && ISWHITESPACE(off))
5330 off++;
5331 }
5332
5333 /* Success, if we reach end of line. */
5334 if(off >= ctx->size || ISNEWLINE(off))
5335 break;
5336
5337 if(!delimited)
5338 return FALSE;
5339 }
5340
5341 if(!found_pipe)
5342 return FALSE;
5343
5344 *p_end = off;
5345 *p_col_count = col_count;
5346 return TRUE;
5347}
5348
5349static int
5350md_is_opening_code_fence(MD_CTX* ctx, OFF beg, OFF* p_end)
5351{
5352 OFF off = beg;
5353
5354 while(off < ctx->size && CH(off) == CH(beg))
5355 off++;
5356
5357 /* Fence must have at least three characters. */
5358 if(off - beg < 3)
5359 return FALSE;
5360
5361 ctx->code_fence_length = off - beg;
5362
5363 /* Optionally, space(s) can follow. */
5364 while(off < ctx->size && CH(off) == _T(' '))
5365 off++;
5366
5367 /* Optionally, an info string can follow. */
5368 while(off < ctx->size && !ISNEWLINE(off)) {
5369 /* Backtick-based fence must not contain '`' in the info string. */
5370 if(CH(beg) == _T('`') && CH(off) == _T('`'))
5371 return FALSE;
5372 off++;
5373 }
5374
5375 *p_end = off;
5376 return TRUE;
5377}
5378
5379static int
5380md_is_closing_code_fence(MD_CTX* ctx, CHAR ch, OFF beg, OFF* p_end)
5381{
5382 OFF off = beg;
5383 int ret = FALSE;
5384
5385 /* Closing fence must have at least the same length and use same char as
5386 * opening one. */
5387 while(off < ctx->size && CH(off) == ch)
5388 off++;
5389 if(off - beg < ctx->code_fence_length)
5390 goto out;
5391
5392 /* Optionally, space(s) can follow */
5393 while(off < ctx->size && CH(off) == _T(' '))
5394 off++;
5395
5396 /* But nothing more is allowed on the line. */
5397 if(off < ctx->size && !ISNEWLINE(off))
5398 goto out;
5399
5400 ret = TRUE;
5401
5402out:
5403 /* Note we set *p_end even on failure: If we are not closing fence, caller
5404 * would eat the line anyway without any parsing. */
5405 *p_end = off;
5406 return ret;
5407}
5408
5409
5410/* Helper data for md_is_html_block_start_condition() and
5411 * md_is_html_block_end_condition() */
5412typedef struct TAG_tag TAG;
5413struct TAG_tag {
5414 const CHAR* name;
5415 unsigned len : 8;
5416};
5417
5418#ifdef X
5419 #undef X
5420#endif
5421#define X(name) { _T(name), (sizeof(name)-1) / sizeof(CHAR) }
5422#define Xend { NULL, 0 }
5423
5424static const TAG t1[] = { X("pre"), X("script"), X("style"), X("textarea"), Xend };
5425
5426static const TAG a6[] = { X("address"), X("article"), X("aside"), Xend };
5427static const TAG b6[] = { X("base"), X("basefont"), X("blockquote"), X("body"), Xend };
5428static const TAG c6[] = { X("caption"), X("center"), X("col"), X("colgroup"), Xend };
5429static const TAG d6[] = { X("dd"), X("details"), X("dialog"), X("dir"),
5430 X("div"), X("dl"), X("dt"), Xend };
5431static const TAG f6[] = { X("fieldset"), X("figcaption"), X("figure"), X("footer"),
5432 X("form"), X("frame"), X("frameset"), Xend };
5433static const TAG h6[] = { X("h1"), X("h2"), X("h3"), X("h4"), X("h5"), X("h6"),
5434 X("head"), X("header"), X("hr"), X("html"), Xend };
5435static const TAG i6[] = { X("iframe"), Xend };
5436static const TAG l6[] = { X("legend"), X("li"), X("link"), Xend };
5437static const TAG m6[] = { X("main"), X("menu"), X("menuitem"), Xend };
5438static const TAG n6[] = { X("nav"), X("noframes"), Xend };
5439static const TAG o6[] = { X("ol"), X("optgroup"), X("option"), Xend };
5440static const TAG p6[] = { X("p"), X("param"), Xend };
5441static const TAG s6[] = { X("search"), X("section"), X("summary"), Xend };
5442static const TAG t6[] = { X("table"), X("tbody"), X("td"), X("tfoot"), X("th"),
5443 X("thead"), X("title"), X("tr"), X("track"), Xend };
5444static const TAG u6[] = { X("ul"), Xend };
5445static const TAG xx[] = { Xend };
5446
5447#undef X
5448#undef Xend
5449
5450/* Returns type of the raw HTML block, or FALSE if it is not HTML block.
5451 * (Refer to CommonMark specification for details about the types.)
5452 */
5453static int
5454md_is_html_block_start_condition(MD_CTX* ctx, OFF beg)
5455{
5456 /* Type 6 is started by a long list of allowed tags. We use two-level
5457 * tree to speed-up the search. */
5458 static const TAG* map6[26] = {
5459 a6, b6, c6, d6, xx, f6, xx, h6, i6, xx, xx, l6, m6,
5460 n6, o6, p6, xx, xx, s6, t6, u6, xx, xx, xx, xx, xx
5461 };
5462 OFF off = beg + 1;
5463 int i;
5464
5465 /* Check for type 1: <script, <pre, or <style */
5466 for(i = 0; t1[i].name != NULL; i++) {
5467 if(off + t1[i].len <= ctx->size) {
5468 if(md_ascii_case_eq(STR(off), s2: t1[i].name, n: t1[i].len))
5469 return 1;
5470 }
5471 }
5472
5473 /* Check for type 2: <!-- */
5474 if(off + 3 < ctx->size && CH(off) == _T('!') && CH(off+1) == _T('-') && CH(off+2) == _T('-'))
5475 return 2;
5476
5477 /* Check for type 3: <? */
5478 if(off < ctx->size && CH(off) == _T('?'))
5479 return 3;
5480
5481 /* Check for type 4 or 5: <! */
5482 if(off < ctx->size && CH(off) == _T('!')) {
5483 /* Check for type 4: <! followed by uppercase letter. */
5484 if(off + 1 < ctx->size && ISASCII(off+1))
5485 return 4;
5486
5487 /* Check for type 5: <![CDATA[ */
5488 if(off + 8 < ctx->size) {
5489 if(md_ascii_eq(STR(off), _T("![CDATA["), n: 8))
5490 return 5;
5491 }
5492 }
5493
5494 /* Check for type 6: Many possible starting tags listed above. */
5495 if(off + 1 < ctx->size && (ISALPHA(off) || (CH(off) == _T('/') && ISALPHA(off+1)))) {
5496 int slot;
5497 const TAG* tags;
5498
5499 if(CH(off) == _T('/'))
5500 off++;
5501
5502 slot = (ISUPPER(off) ? CH(off) - 'A' : CH(off) - 'a');
5503 tags = map6[slot];
5504
5505 for(i = 0; tags[i].name != NULL; i++) {
5506 if(off + tags[i].len <= ctx->size) {
5507 if(md_ascii_case_eq(STR(off), s2: tags[i].name, n: tags[i].len)) {
5508 OFF tmp = off + tags[i].len;
5509 if(tmp >= ctx->size)
5510 return 6;
5511 if(ISBLANK(tmp) || ISNEWLINE(tmp) || CH(tmp) == _T('>'))
5512 return 6;
5513 if(tmp+1 < ctx->size && CH(tmp) == _T('/') && CH(tmp+1) == _T('>'))
5514 return 6;
5515 break;
5516 }
5517 }
5518 }
5519 }
5520
5521 /* Check for type 7: any COMPLETE other opening or closing tag. */
5522 if(off + 1 < ctx->size) {
5523 OFF end;
5524
5525 if(md_is_html_tag(ctx, NULL, n_lines: 0, beg, max_end: ctx->size, p_end: &end)) {
5526 /* Only optional whitespace and new line may follow. */
5527 while(end < ctx->size && ISWHITESPACE(end))
5528 end++;
5529 if(end >= ctx->size || ISNEWLINE(end))
5530 return 7;
5531 }
5532 }
5533
5534 return FALSE;
5535}
5536
5537/* Case sensitive check whether there is a substring 'what' between 'beg'
5538 * and end of line. */
5539static int
5540md_line_contains(MD_CTX* ctx, OFF beg, const CHAR* what, SZ what_len, OFF* p_end)
5541{
5542 OFF i;
5543 for(i = beg; i + what_len < ctx->size; i++) {
5544 if(ISNEWLINE(i))
5545 break;
5546 if(memcmp(STR(i), s2: what, n: what_len * sizeof(CHAR)) == 0) {
5547 *p_end = i + what_len;
5548 return TRUE;
5549 }
5550 }
5551
5552 *p_end = i;
5553 return FALSE;
5554}
5555
5556/* Returns type of HTML block end condition or FALSE if not an end condition.
5557 *
5558 * Note it fills p_end even when it is not end condition as the caller
5559 * does not need to analyze contents of a raw HTML block.
5560 */
5561static int
5562md_is_html_block_end_condition(MD_CTX* ctx, OFF beg, OFF* p_end)
5563{
5564 switch(ctx->html_block_type) {
5565 case 1:
5566 {
5567 OFF off = beg;
5568 int i;
5569
5570 while(off+1 < ctx->size && !ISNEWLINE(off)) {
5571 if(CH(off) == _T('<') && CH(off+1) == _T('/')) {
5572 for(i = 0; t1[i].name != NULL; i++) {
5573 if(off + 2 + t1[i].len < ctx->size) {
5574 if(md_ascii_case_eq(STR(off+2), s2: t1[i].name, n: t1[i].len) &&
5575 CH(off+2+t1[i].len) == _T('>'))
5576 {
5577 *p_end = off+2+t1[i].len+1;
5578 return TRUE;
5579 }
5580 }
5581 }
5582 }
5583 off++;
5584 }
5585 *p_end = off;
5586 return FALSE;
5587 }
5588
5589 case 2:
5590 return (md_line_contains(ctx, beg, _T("-->"), what_len: 3, p_end) ? 2 : FALSE);
5591
5592 case 3:
5593 return (md_line_contains(ctx, beg, _T("?>"), what_len: 2, p_end) ? 3 : FALSE);
5594
5595 case 4:
5596 return (md_line_contains(ctx, beg, _T(">"), what_len: 1, p_end) ? 4 : FALSE);
5597
5598 case 5:
5599 return (md_line_contains(ctx, beg, _T("]]>"), what_len: 3, p_end) ? 5 : FALSE);
5600
5601 case 6: /* Pass through */
5602 case 7:
5603 if(beg >= ctx->size || ISNEWLINE(beg)) {
5604 /* Blank line ends types 6 and 7. */
5605 *p_end = beg;
5606 return ctx->html_block_type;
5607 }
5608 return FALSE;
5609
5610 default:
5611 MD_UNREACHABLE();
5612 }
5613 return FALSE;
5614}
5615
5616
5617static int
5618md_is_container_compatible(const MD_CONTAINER* pivot, const MD_CONTAINER* container)
5619{
5620 /* Block quote has no "items" like lists. */
5621 if(container->ch == _T('>'))
5622 return FALSE;
5623
5624 if(container->ch != pivot->ch)
5625 return FALSE;
5626 if(container->mark_indent > pivot->contents_indent)
5627 return FALSE;
5628
5629 return TRUE;
5630}
5631
5632static int
5633md_push_container(MD_CTX* ctx, const MD_CONTAINER* container)
5634{
5635 if(ctx->n_containers >= ctx->alloc_containers) {
5636 MD_CONTAINER* new_containers;
5637
5638 ctx->alloc_containers = (ctx->alloc_containers > 0
5639 ? ctx->alloc_containers + ctx->alloc_containers / 2
5640 : 16);
5641 new_containers = realloc(ptr: ctx->containers, size: ctx->alloc_containers * sizeof(MD_CONTAINER));
5642 if(new_containers == NULL) {
5643 MD_LOG("realloc() failed.");
5644 return -1;
5645 }
5646
5647 ctx->containers = new_containers;
5648 }
5649
5650 memcpy(dest: &ctx->containers[ctx->n_containers++], src: container, n: sizeof(MD_CONTAINER));
5651 return 0;
5652}
5653
5654static int
5655md_enter_child_containers(MD_CTX* ctx, int n_children)
5656{
5657 int i;
5658 int ret = 0;
5659
5660 for(i = ctx->n_containers - n_children; i < ctx->n_containers; i++) {
5661 MD_CONTAINER* c = &ctx->containers[i];
5662 int is_ordered_list = FALSE;
5663
5664 switch(c->ch) {
5665 case _T(')'):
5666 case _T('.'):
5667 is_ordered_list = TRUE;
5668 MD_FALLTHROUGH();
5669
5670 case _T('-'):
5671 case _T('+'):
5672 case _T('*'):
5673 /* Remember offset in ctx->block_bytes so we can revisit the
5674 * block if we detect it is a loose list. */
5675 md_end_current_block(ctx);
5676 c->block_byte_off = ctx->n_block_bytes;
5677
5678 MD_CHECK(md_push_container_bytes(ctx,
5679 (is_ordered_list ? MD_BLOCK_OL : MD_BLOCK_UL),
5680 c->start, c->ch, MD_BLOCK_CONTAINER_OPENER));
5681 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_LI,
5682 c->task_mark_off,
5683 (c->is_task ? CH(c->task_mark_off) : 0),
5684 MD_BLOCK_CONTAINER_OPENER));
5685 break;
5686
5687 case _T('>'):
5688 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_QUOTE, 0, 0, MD_BLOCK_CONTAINER_OPENER));
5689 break;
5690
5691 default:
5692 MD_UNREACHABLE();
5693 break;
5694 }
5695 }
5696
5697abort:
5698 return ret;
5699}
5700
5701static int
5702md_leave_child_containers(MD_CTX* ctx, int n_keep)
5703{
5704 int ret = 0;
5705
5706 while(ctx->n_containers > n_keep) {
5707 MD_CONTAINER* c = &ctx->containers[ctx->n_containers-1];
5708 int is_ordered_list = FALSE;
5709
5710 switch(c->ch) {
5711 case _T(')'):
5712 case _T('.'):
5713 is_ordered_list = TRUE;
5714 MD_FALLTHROUGH();
5715
5716 case _T('-'):
5717 case _T('+'):
5718 case _T('*'):
5719 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_LI,
5720 c->task_mark_off, (c->is_task ? CH(c->task_mark_off) : 0),
5721 MD_BLOCK_CONTAINER_CLOSER));
5722 MD_CHECK(md_push_container_bytes(ctx,
5723 (is_ordered_list ? MD_BLOCK_OL : MD_BLOCK_UL), 0,
5724 c->ch, MD_BLOCK_CONTAINER_CLOSER));
5725 break;
5726
5727 case _T('>'):
5728 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_QUOTE, 0,
5729 0, MD_BLOCK_CONTAINER_CLOSER));
5730 break;
5731
5732 default:
5733 MD_UNREACHABLE();
5734 break;
5735 }
5736
5737 ctx->n_containers--;
5738 }
5739
5740abort:
5741 return ret;
5742}
5743
5744static int
5745md_is_container_mark(MD_CTX* ctx, unsigned indent, OFF beg, OFF* p_end, MD_CONTAINER* p_container)
5746{
5747 OFF off = beg;
5748 OFF max_end;
5749
5750 if(off >= ctx->size || indent >= ctx->code_indent_offset)
5751 return FALSE;
5752
5753 /* Check for block quote mark. */
5754 if(CH(off) == _T('>')) {
5755 off++;
5756 p_container->ch = _T('>');
5757 p_container->is_loose = FALSE;
5758 p_container->is_task = FALSE;
5759 p_container->mark_indent = indent;
5760 p_container->contents_indent = indent + 1;
5761 *p_end = off;
5762 return TRUE;
5763 }
5764
5765 /* Check for list item bullet mark. */
5766 if(ISANYOF(off, _T("-+*")) && (off+1 >= ctx->size || ISBLANK(off+1) || ISNEWLINE(off+1))) {
5767 p_container->ch = CH(off);
5768 p_container->is_loose = FALSE;
5769 p_container->is_task = FALSE;
5770 p_container->mark_indent = indent;
5771 p_container->contents_indent = indent + 1;
5772 *p_end = off+1;
5773 return TRUE;
5774 }
5775
5776 /* Check for ordered list item marks. */
5777 max_end = off + 9;
5778 if(max_end > ctx->size)
5779 max_end = ctx->size;
5780 p_container->start = 0;
5781 while(off < max_end && ISDIGIT(off)) {
5782 p_container->start = p_container->start * 10 + CH(off) - _T('0');
5783 off++;
5784 }
5785 if(off > beg &&
5786 off < ctx->size &&
5787 (CH(off) == _T('.') || CH(off) == _T(')')) &&
5788 (off+1 >= ctx->size || ISBLANK(off+1) || ISNEWLINE(off+1)))
5789 {
5790 p_container->ch = CH(off);
5791 p_container->is_loose = FALSE;
5792 p_container->is_task = FALSE;
5793 p_container->mark_indent = indent;
5794 p_container->contents_indent = indent + off - beg + 1;
5795 *p_end = off+1;
5796 return TRUE;
5797 }
5798
5799 return FALSE;
5800}
5801
5802static unsigned
5803md_line_indentation(MD_CTX* ctx, unsigned total_indent, OFF beg, OFF* p_end)
5804{
5805 OFF off = beg;
5806 unsigned indent = total_indent;
5807
5808 while(off < ctx->size && ISBLANK(off)) {
5809 if(CH(off) == _T('\t'))
5810 indent = (indent + 4) & ~3;
5811 else
5812 indent++;
5813 off++;
5814 }
5815
5816 *p_end = off;
5817 return indent - total_indent;
5818}
5819
5820static const MD_LINE_ANALYSIS md_dummy_blank_line = { MD_LINE_BLANK, 0, 0, 0, 0, 0 };
5821
5822/* Analyze type of the line and find some its properties. This serves as a
5823 * main input for determining type and boundaries of a block. */
5824static int
5825md_analyze_line(MD_CTX* ctx, OFF beg, OFF* p_end,
5826 const MD_LINE_ANALYSIS* pivot_line, MD_LINE_ANALYSIS* line)
5827{
5828 unsigned total_indent = 0;
5829 int n_parents = 0;
5830 int n_brothers = 0;
5831 int n_children = 0;
5832 MD_CONTAINER container = { 0 };
5833 int prev_line_has_list_loosening_effect = ctx->last_line_has_list_loosening_effect;
5834 OFF off = beg;
5835 OFF hr_killer = 0;
5836 int ret = 0;
5837
5838 line->indent = md_line_indentation(ctx, total_indent, beg: off, p_end: &off);
5839 total_indent += line->indent;
5840 line->beg = off;
5841 line->enforce_new_block = FALSE;
5842
5843 /* Given the indentation and block quote marks '>', determine how many of
5844 * the current containers are our parents. */
5845 while(n_parents < ctx->n_containers) {
5846 MD_CONTAINER* c = &ctx->containers[n_parents];
5847
5848 if(c->ch == _T('>') && line->indent < ctx->code_indent_offset &&
5849 off < ctx->size && CH(off) == _T('>'))
5850 {
5851 /* Block quote mark. */
5852 off++;
5853 total_indent++;
5854 line->indent = md_line_indentation(ctx, total_indent, beg: off, p_end: &off);
5855 total_indent += line->indent;
5856
5857 /* The optional 1st space after '>' is part of the block quote mark. */
5858 if(line->indent > 0)
5859 line->indent--;
5860
5861 line->beg = off;
5862
5863 } else if(c->ch != _T('>') && line->indent >= c->contents_indent) {
5864 /* List. */
5865 line->indent -= c->contents_indent;
5866 } else {
5867 break;
5868 }
5869
5870 n_parents++;
5871 }
5872
5873 if(off >= ctx->size || ISNEWLINE(off)) {
5874 /* Blank line does not need any real indentation to be nested inside
5875 * a list. */
5876 if(n_brothers + n_children == 0) {
5877 while(n_parents < ctx->n_containers && ctx->containers[n_parents].ch != _T('>'))
5878 n_parents++;
5879 }
5880 }
5881
5882 while(TRUE) {
5883 /* Check whether we are fenced code continuation. */
5884 if(pivot_line->type == MD_LINE_FENCEDCODE) {
5885 line->beg = off;
5886
5887 /* We are another MD_LINE_FENCEDCODE unless we are closing fence
5888 * which we transform into MD_LINE_BLANK. */
5889 if(line->indent < ctx->code_indent_offset) {
5890 if(md_is_closing_code_fence(ctx, CH(pivot_line->beg), beg: off, p_end: &off)) {
5891 line->type = MD_LINE_BLANK;
5892 ctx->last_line_has_list_loosening_effect = FALSE;
5893 break;
5894 }
5895 }
5896
5897 /* Change indentation accordingly to the initial code fence. */
5898 if(n_parents == ctx->n_containers) {
5899 if(line->indent > pivot_line->indent)
5900 line->indent -= pivot_line->indent;
5901 else
5902 line->indent = 0;
5903
5904 line->type = MD_LINE_FENCEDCODE;
5905 break;
5906 }
5907 }
5908
5909 /* Check whether we are HTML block continuation. */
5910 if(pivot_line->type == MD_LINE_HTML && ctx->html_block_type > 0) {
5911 if(n_parents < ctx->n_containers) {
5912 /* HTML block is implicitly ended if the enclosing container
5913 * block ends. */
5914 ctx->html_block_type = 0;
5915 } else {
5916 int html_block_type;
5917
5918 html_block_type = md_is_html_block_end_condition(ctx, beg: off, p_end: &off);
5919 if(html_block_type > 0) {
5920 MD_ASSERT(html_block_type == ctx->html_block_type);
5921
5922 /* Make sure this is the last line of the block. */
5923 ctx->html_block_type = 0;
5924
5925 /* Some end conditions serve as blank lines at the same time. */
5926 if(html_block_type == 6 || html_block_type == 7) {
5927 line->type = MD_LINE_BLANK;
5928 line->indent = 0;
5929 break;
5930 }
5931 }
5932
5933 line->type = MD_LINE_HTML;
5934 n_parents = ctx->n_containers;
5935 break;
5936 }
5937 }
5938
5939 /* Check for blank line. */
5940 if(off >= ctx->size || ISNEWLINE(off)) {
5941 if(pivot_line->type == MD_LINE_INDENTEDCODE && n_parents == ctx->n_containers) {
5942 line->type = MD_LINE_INDENTEDCODE;
5943 if(line->indent > ctx->code_indent_offset)
5944 line->indent -= ctx->code_indent_offset;
5945 else
5946 line->indent = 0;
5947 ctx->last_line_has_list_loosening_effect = FALSE;
5948 } else {
5949 line->type = MD_LINE_BLANK;
5950 ctx->last_line_has_list_loosening_effect = (n_parents > 0 &&
5951 n_brothers + n_children == 0 &&
5952 ctx->containers[n_parents-1].ch != _T('>'));
5953
5954 #if 1
5955 /* See https://github.com/mity/md4c/issues/6
5956 *
5957 * This ugly checking tests we are in (yet empty) list item but
5958 * not its very first line (i.e. not the line with the list
5959 * item mark).
5960 *
5961 * If we are such a blank line, then any following non-blank
5962 * line which would be part of the list item actually has to
5963 * end the list because according to the specification, "a list
5964 * item can begin with at most one blank line."
5965 */
5966 if(n_parents > 0 && ctx->containers[n_parents-1].ch != _T('>') &&
5967 n_brothers + n_children == 0 && ctx->current_block == NULL &&
5968 ctx->n_block_bytes > (int) sizeof(MD_BLOCK))
5969 {
5970 MD_BLOCK* top_block = (MD_BLOCK*) ((char*)ctx->block_bytes + ctx->n_block_bytes - sizeof(MD_BLOCK));
5971 if(top_block->type == MD_BLOCK_LI)
5972 ctx->last_list_item_starts_with_two_blank_lines = TRUE;
5973 }
5974 #endif
5975 }
5976 break;
5977 } else {
5978 #if 1
5979 /* This is the 2nd half of the hack. If the flag is set (i.e. there
5980 * was a 2nd blank line at the beginning of the list item) and if
5981 * we would otherwise still belong to the list item, we enforce
5982 * the end of the list. */
5983 if(ctx->last_list_item_starts_with_two_blank_lines) {
5984 if(n_parents > 0 && n_parents == ctx->n_containers &&
5985 ctx->containers[n_parents-1].ch != _T('>') &&
5986 n_brothers + n_children == 0 && ctx->current_block == NULL &&
5987 ctx->n_block_bytes > (int) sizeof(MD_BLOCK))
5988 {
5989 MD_BLOCK* top_block = (MD_BLOCK*) ((char*)ctx->block_bytes + ctx->n_block_bytes - sizeof(MD_BLOCK));
5990 if(top_block->type == MD_BLOCK_LI) {
5991 n_parents--;
5992
5993 line->indent = total_indent;
5994 if(n_parents > 0)
5995 line->indent -= MIN(line->indent, ctx->containers[n_parents-1].contents_indent);
5996 }
5997 }
5998
5999 ctx->last_list_item_starts_with_two_blank_lines = FALSE;
6000 }
6001 #endif
6002 ctx->last_line_has_list_loosening_effect = FALSE;
6003 }
6004
6005 /* Check whether we are Setext underline. */
6006 if(line->indent < ctx->code_indent_offset && pivot_line->type == MD_LINE_TEXT
6007 && off < ctx->size && ISANYOF2(off, _T('='), _T('-'))
6008 && (n_parents == ctx->n_containers))
6009 {
6010 unsigned level;
6011
6012 if(md_is_setext_underline(ctx, beg: off, p_end: &off, p_level: &level)) {
6013 line->type = MD_LINE_SETEXTUNDERLINE;
6014 line->data = level;
6015 break;
6016 }
6017 }
6018
6019 /* Check for thematic break line. */
6020 if(line->indent < ctx->code_indent_offset
6021 && off < ctx->size && off >= hr_killer
6022 && ISANYOF(off, _T("-_*")))
6023 {
6024 if(md_is_hr_line(ctx, beg: off, p_end: &off, p_killer: &hr_killer)) {
6025 line->type = MD_LINE_HR;
6026 break;
6027 }
6028 }
6029
6030 /* Check for "brother" container. I.e. whether we are another list item
6031 * in already started list. */
6032 if(n_parents < ctx->n_containers && n_brothers + n_children == 0) {
6033 OFF tmp;
6034
6035 if(md_is_container_mark(ctx, indent: line->indent, beg: off, p_end: &tmp, p_container: &container) &&
6036 md_is_container_compatible(pivot: &ctx->containers[n_parents], container: &container))
6037 {
6038 pivot_line = &md_dummy_blank_line;
6039
6040 off = tmp;
6041
6042 total_indent += container.contents_indent - container.mark_indent;
6043 line->indent = md_line_indentation(ctx, total_indent, beg: off, p_end: &off);
6044 total_indent += line->indent;
6045 line->beg = off;
6046
6047 /* Some of the following whitespace actually still belongs to the mark. */
6048 if(off >= ctx->size || ISNEWLINE(off)) {
6049 container.contents_indent++;
6050 } else if(line->indent <= ctx->code_indent_offset) {
6051 container.contents_indent += line->indent;
6052 line->indent = 0;
6053 } else {
6054 container.contents_indent += 1;
6055 line->indent--;
6056 }
6057
6058 ctx->containers[n_parents].mark_indent = container.mark_indent;
6059 ctx->containers[n_parents].contents_indent = container.contents_indent;
6060
6061 n_brothers++;
6062 continue;
6063 }
6064 }
6065
6066 /* Check for indented code.
6067 * Note indented code block cannot interrupt a paragraph. */
6068 if(line->indent >= ctx->code_indent_offset && (pivot_line->type != MD_LINE_TEXT)) {
6069 line->type = MD_LINE_INDENTEDCODE;
6070 line->indent -= ctx->code_indent_offset;
6071 line->data = 0;
6072 break;
6073 }
6074
6075 /* Check for start of a new container block. */
6076 if(line->indent < ctx->code_indent_offset &&
6077 md_is_container_mark(ctx, indent: line->indent, beg: off, p_end: &off, p_container: &container))
6078 {
6079 if(pivot_line->type == MD_LINE_TEXT && n_parents == ctx->n_containers &&
6080 (off >= ctx->size || ISNEWLINE(off)) && container.ch != _T('>'))
6081 {
6082 /* Noop. List mark followed by a blank line cannot interrupt a paragraph. */
6083 } else if(pivot_line->type == MD_LINE_TEXT && n_parents == ctx->n_containers &&
6084 ISANYOF2_(container.ch, _T('.'), _T(')')) && container.start != 1)
6085 {
6086 /* Noop. Ordered list cannot interrupt a paragraph unless the start index is 1. */
6087 } else {
6088 total_indent += container.contents_indent - container.mark_indent;
6089 line->indent = md_line_indentation(ctx, total_indent, beg: off, p_end: &off);
6090 total_indent += line->indent;
6091
6092 line->beg = off;
6093 line->data = container.ch;
6094
6095 /* Some of the following whitespace actually still belongs to the mark. */
6096 if(off >= ctx->size || ISNEWLINE(off)) {
6097 container.contents_indent++;
6098 } else if(line->indent <= ctx->code_indent_offset) {
6099 container.contents_indent += line->indent;
6100 line->indent = 0;
6101 } else {
6102 container.contents_indent += 1;
6103 line->indent--;
6104 }
6105
6106 if(n_brothers + n_children == 0)
6107 pivot_line = &md_dummy_blank_line;
6108
6109 if(n_children == 0)
6110 MD_CHECK(md_leave_child_containers(ctx, n_parents + n_brothers));
6111
6112 n_children++;
6113 MD_CHECK(md_push_container(ctx, &container));
6114 continue;
6115 }
6116 }
6117
6118 /* Check whether we are table continuation. */
6119 if(pivot_line->type == MD_LINE_TABLE && n_parents == ctx->n_containers) {
6120 line->type = MD_LINE_TABLE;
6121 break;
6122 }
6123
6124 /* Check for ATX header. */
6125 if(line->indent < ctx->code_indent_offset &&
6126 off < ctx->size && CH(off) == _T('#'))
6127 {
6128 unsigned level;
6129
6130 if(md_is_atxheader_line(ctx, beg: off, p_beg: &line->beg, p_end: &off, p_level: &level)) {
6131 line->type = MD_LINE_ATXHEADER;
6132 line->data = level;
6133 break;
6134 }
6135 }
6136
6137 /* Check whether we are starting code fence. */
6138 if(line->indent < ctx->code_indent_offset &&
6139 off < ctx->size && ISANYOF2(off, _T('`'), _T('~')))
6140 {
6141 if(md_is_opening_code_fence(ctx, beg: off, p_end: &off)) {
6142 line->type = MD_LINE_FENCEDCODE;
6143 line->data = 1;
6144 line->enforce_new_block = TRUE;
6145 break;
6146 }
6147 }
6148
6149 /* Check for start of raw HTML block. */
6150 if(off < ctx->size && CH(off) == _T('<')
6151 && !(ctx->parser.flags & MD_FLAG_NOHTMLBLOCKS))
6152 {
6153 ctx->html_block_type = md_is_html_block_start_condition(ctx, beg: off);
6154
6155 /* HTML block type 7 cannot interrupt paragraph. */
6156 if(ctx->html_block_type == 7 && pivot_line->type == MD_LINE_TEXT)
6157 ctx->html_block_type = 0;
6158
6159 if(ctx->html_block_type > 0) {
6160 /* The line itself also may immediately close the block. */
6161 if(md_is_html_block_end_condition(ctx, beg: off, p_end: &off) == ctx->html_block_type) {
6162 /* Make sure this is the last line of the block. */
6163 ctx->html_block_type = 0;
6164 }
6165
6166 line->enforce_new_block = TRUE;
6167 line->type = MD_LINE_HTML;
6168 break;
6169 }
6170 }
6171
6172 /* Check for table underline. */
6173 if((ctx->parser.flags & MD_FLAG_TABLES) && pivot_line->type == MD_LINE_TEXT
6174 && off < ctx->size && ISANYOF3(off, _T('|'), _T('-'), _T(':'))
6175 && n_parents == ctx->n_containers)
6176 {
6177 unsigned col_count;
6178
6179 if(ctx->current_block != NULL && ctx->current_block->n_lines == 1 &&
6180 md_is_table_underline(ctx, beg: off, p_end: &off, p_col_count: &col_count))
6181 {
6182 line->data = col_count;
6183 line->type = MD_LINE_TABLEUNDERLINE;
6184 break;
6185 }
6186 }
6187
6188 /* By default, we are normal text line. */
6189 line->type = MD_LINE_TEXT;
6190 if(pivot_line->type == MD_LINE_TEXT && n_brothers + n_children == 0) {
6191 /* Lazy continuation. */
6192 n_parents = ctx->n_containers;
6193 }
6194
6195 /* Check for task mark. */
6196 if((ctx->parser.flags & MD_FLAG_TASKLISTS) && n_brothers + n_children > 0 &&
6197 ISANYOF_(ctx->containers[ctx->n_containers-1].ch, _T("-+*.)")))
6198 {
6199 OFF tmp = off;
6200
6201 while(tmp < ctx->size && tmp < off + 3 && ISBLANK(tmp))
6202 tmp++;
6203 if(tmp + 2 < ctx->size && CH(tmp) == _T('[') &&
6204 ISANYOF(tmp+1, _T("xX ")) && CH(tmp+2) == _T(']') &&
6205 (tmp + 3 == ctx->size || ISBLANK(tmp+3) || ISNEWLINE(tmp+3)))
6206 {
6207 MD_CONTAINER* task_container = (n_children > 0 ? &ctx->containers[ctx->n_containers-1] : &container);
6208 task_container->is_task = TRUE;
6209 task_container->task_mark_off = tmp + 1;
6210 off = tmp + 3;
6211 while(off < ctx->size && ISWHITESPACE(off))
6212 off++;
6213 line->beg = off;
6214 }
6215 }
6216
6217 break;
6218 }
6219
6220 /* Scan for end of the line.
6221 *
6222 * Note this is quite a bottleneck of the parsing as we here iterate almost
6223 * over complete document.
6224 */
6225#if defined __linux__ && !defined MD4C_USE_UTF16
6226 /* Recent glibc versions have superbly optimized strcspn(), even using
6227 * vectorization if available. */
6228 if(ctx->doc_ends_with_newline && off < ctx->size) {
6229 while(TRUE) {
6230 off += (OFF) strcspn(STR(off), reject: "\r\n");
6231
6232 /* strcspn() can stop on zero terminator; but that can appear
6233 * anywhere in the Markdown input... */
6234 if(CH(off) == _T('\0'))
6235 off++;
6236 else
6237 break;
6238 }
6239 } else
6240#endif
6241 {
6242 /* Optimization: Use some loop unrolling. */
6243 while(off + 3 < ctx->size && !ISNEWLINE(off+0) && !ISNEWLINE(off+1)
6244 && !ISNEWLINE(off+2) && !ISNEWLINE(off+3))
6245 off += 4;
6246 while(off < ctx->size && !ISNEWLINE(off))
6247 off++;
6248 }
6249
6250 /* Set end of the line. */
6251 line->end = off;
6252
6253 /* But for ATX header, we should exclude the optional trailing mark. */
6254 if(line->type == MD_LINE_ATXHEADER) {
6255 OFF tmp = line->end;
6256 while(tmp > line->beg && CH(tmp-1) == _T(' '))
6257 tmp--;
6258 while(tmp > line->beg && CH(tmp-1) == _T('#'))
6259 tmp--;
6260 if(tmp == line->beg || CH(tmp-1) == _T(' ') || (ctx->parser.flags & MD_FLAG_PERMISSIVEATXHEADERS))
6261 line->end = tmp;
6262 }
6263
6264 /* Trim trailing spaces. */
6265 if(line->type != MD_LINE_INDENTEDCODE && line->type != MD_LINE_FENCEDCODE && line->type != MD_LINE_HTML) {
6266 while(line->end > line->beg && CH(line->end-1) == _T(' '))
6267 line->end--;
6268 }
6269
6270 /* Eat also the new line. */
6271 if(off < ctx->size && CH(off) == _T('\r'))
6272 off++;
6273 if(off < ctx->size && CH(off) == _T('\n'))
6274 off++;
6275
6276 *p_end = off;
6277
6278 /* If we belong to a list after seeing a blank line, the list is loose. */
6279 if(prev_line_has_list_loosening_effect && line->type != MD_LINE_BLANK && n_parents + n_brothers > 0) {
6280 MD_CONTAINER* c = &ctx->containers[n_parents + n_brothers - 1];
6281 if(c->ch != _T('>')) {
6282 MD_BLOCK* block = (MD_BLOCK*) (((char*)ctx->block_bytes) + c->block_byte_off);
6283 block->flags |= MD_BLOCK_LOOSE_LIST;
6284 }
6285 }
6286
6287 /* Leave any containers we are not part of anymore. */
6288 if(n_children == 0 && n_parents + n_brothers < ctx->n_containers)
6289 MD_CHECK(md_leave_child_containers(ctx, n_parents + n_brothers));
6290
6291 /* Enter any container we found a mark for. */
6292 if(n_brothers > 0) {
6293 MD_ASSERT(n_brothers == 1);
6294 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_LI,
6295 ctx->containers[n_parents].task_mark_off,
6296 (ctx->containers[n_parents].is_task ? CH(ctx->containers[n_parents].task_mark_off) : 0),
6297 MD_BLOCK_CONTAINER_CLOSER));
6298 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_LI,
6299 container.task_mark_off,
6300 (container.is_task ? CH(container.task_mark_off) : 0),
6301 MD_BLOCK_CONTAINER_OPENER));
6302 ctx->containers[n_parents].is_task = container.is_task;
6303 ctx->containers[n_parents].task_mark_off = container.task_mark_off;
6304 }
6305
6306 if(n_children > 0)
6307 MD_CHECK(md_enter_child_containers(ctx, n_children));
6308
6309abort:
6310 return ret;
6311}
6312
6313static int
6314md_process_line(MD_CTX* ctx, const MD_LINE_ANALYSIS** p_pivot_line, MD_LINE_ANALYSIS* line)
6315{
6316 const MD_LINE_ANALYSIS* pivot_line = *p_pivot_line;
6317 int ret = 0;
6318
6319 /* Blank line ends current leaf block. */
6320 if(line->type == MD_LINE_BLANK) {
6321 MD_CHECK(md_end_current_block(ctx));
6322 *p_pivot_line = &md_dummy_blank_line;
6323 return 0;
6324 }
6325
6326 if(line->enforce_new_block)
6327 MD_CHECK(md_end_current_block(ctx));
6328
6329 /* Some line types form block on their own. */
6330 if(line->type == MD_LINE_HR || line->type == MD_LINE_ATXHEADER) {
6331 MD_CHECK(md_end_current_block(ctx));
6332
6333 /* Add our single-line block. */
6334 MD_CHECK(md_start_new_block(ctx, line));
6335 MD_CHECK(md_add_line_into_current_block(ctx, line));
6336 MD_CHECK(md_end_current_block(ctx));
6337 *p_pivot_line = &md_dummy_blank_line;
6338 return 0;
6339 }
6340
6341 /* MD_LINE_SETEXTUNDERLINE changes meaning of the current block and ends it. */
6342 if(line->type == MD_LINE_SETEXTUNDERLINE) {
6343 MD_ASSERT(ctx->current_block != NULL);
6344 ctx->current_block->type = MD_BLOCK_H;
6345 ctx->current_block->data = line->data;
6346 ctx->current_block->flags |= MD_BLOCK_SETEXT_HEADER;
6347 MD_CHECK(md_add_line_into_current_block(ctx, line));
6348 MD_CHECK(md_end_current_block(ctx));
6349 if(ctx->current_block == NULL) {
6350 *p_pivot_line = &md_dummy_blank_line;
6351 } else {
6352 /* This happens if we have consumed all the body as link ref. defs.
6353 * and downgraded the underline into start of a new paragraph block. */
6354 line->type = MD_LINE_TEXT;
6355 *p_pivot_line = line;
6356 }
6357 return 0;
6358 }
6359
6360 /* MD_LINE_TABLEUNDERLINE changes meaning of the current block. */
6361 if(line->type == MD_LINE_TABLEUNDERLINE) {
6362 MD_ASSERT(ctx->current_block != NULL);
6363 MD_ASSERT(ctx->current_block->n_lines == 1);
6364 ctx->current_block->type = MD_BLOCK_TABLE;
6365 ctx->current_block->data = line->data;
6366 MD_ASSERT(pivot_line != &md_dummy_blank_line);
6367 ((MD_LINE_ANALYSIS*)pivot_line)->type = MD_LINE_TABLE;
6368 MD_CHECK(md_add_line_into_current_block(ctx, line));
6369 return 0;
6370 }
6371
6372 /* The current block also ends if the line has different type. */
6373 if(line->type != pivot_line->type)
6374 MD_CHECK(md_end_current_block(ctx));
6375
6376 /* The current line may start a new block. */
6377 if(ctx->current_block == NULL) {
6378 MD_CHECK(md_start_new_block(ctx, line));
6379 *p_pivot_line = line;
6380 }
6381
6382 /* In all other cases the line is just a continuation of the current block. */
6383 MD_CHECK(md_add_line_into_current_block(ctx, line));
6384
6385abort:
6386 return ret;
6387}
6388
6389static int
6390md_process_doc(MD_CTX *ctx)
6391{
6392 const MD_LINE_ANALYSIS* pivot_line = &md_dummy_blank_line;
6393 MD_LINE_ANALYSIS line_buf[2];
6394 MD_LINE_ANALYSIS* line = &line_buf[0];
6395 OFF off = 0;
6396 int ret = 0;
6397
6398 MD_ENTER_BLOCK(MD_BLOCK_DOC, NULL);
6399
6400 while(off < ctx->size) {
6401 if(line == pivot_line)
6402 line = (line == &line_buf[0] ? &line_buf[1] : &line_buf[0]);
6403
6404 MD_CHECK(md_analyze_line(ctx, off, &off, pivot_line, line));
6405 MD_CHECK(md_process_line(ctx, &pivot_line, line));
6406 }
6407
6408 md_end_current_block(ctx);
6409
6410 MD_CHECK(md_build_ref_def_hashtable(ctx));
6411
6412 /* Process all blocks. */
6413 MD_CHECK(md_leave_child_containers(ctx, 0));
6414 MD_CHECK(md_process_all_blocks(ctx));
6415
6416 MD_LEAVE_BLOCK(MD_BLOCK_DOC, NULL);
6417
6418abort:
6419
6420#if 0
6421 /* Output some memory consumption statistics. */
6422 {
6423 char buffer[256];
6424 sprintf(buffer, "Alloced %u bytes for block buffer.",
6425 (unsigned)(ctx->alloc_block_bytes));
6426 MD_LOG(buffer);
6427
6428 sprintf(buffer, "Alloced %u bytes for containers buffer.",
6429 (unsigned)(ctx->alloc_containers * sizeof(MD_CONTAINER)));
6430 MD_LOG(buffer);
6431
6432 sprintf(buffer, "Alloced %u bytes for marks buffer.",
6433 (unsigned)(ctx->alloc_marks * sizeof(MD_MARK)));
6434 MD_LOG(buffer);
6435
6436 sprintf(buffer, "Alloced %u bytes for aux. buffer.",
6437 (unsigned)(ctx->alloc_buffer * sizeof(MD_CHAR)));
6438 MD_LOG(buffer);
6439 }
6440#endif
6441
6442 return ret;
6443}
6444
6445
6446/********************
6447 *** Public API ***
6448 ********************/
6449
6450int
6451md_parse(const MD_CHAR* text, MD_SIZE size, const MD_PARSER* parser, void* userdata)
6452{
6453 MD_CTX ctx;
6454 int i;
6455 int ret;
6456
6457 if(parser->abi_version != 0) {
6458 if(parser->debug_log != NULL)
6459 parser->debug_log("Unsupported abi_version.", userdata);
6460 return -1;
6461 }
6462
6463 /* Setup context structure. */
6464 memset(s: &ctx, c: 0, n: sizeof(MD_CTX));
6465 ctx.text = text;
6466 ctx.size = size;
6467 memcpy(dest: &ctx.parser, src: parser, n: sizeof(MD_PARSER));
6468 ctx.userdata = userdata;
6469 ctx.code_indent_offset = (ctx.parser.flags & MD_FLAG_NOINDENTEDCODEBLOCKS) ? (OFF)(-1) : 4;
6470 md_build_mark_char_map(ctx: &ctx);
6471 ctx.doc_ends_with_newline = (size > 0 && ISNEWLINE_(text[size-1]));
6472
6473 /* Reset all mark stacks and lists. */
6474 for(i = 0; i < (int) SIZEOF_ARRAY(ctx.opener_stacks); i++)
6475 ctx.opener_stacks[i].top = -1;
6476 ctx.ptr_stack.top = -1;
6477 ctx.unresolved_link_head = -1;
6478 ctx.unresolved_link_tail = -1;
6479 ctx.table_cell_boundaries_head = -1;
6480 ctx.table_cell_boundaries_tail = -1;
6481
6482 /* All the work. */
6483 ret = md_process_doc(ctx: &ctx);
6484
6485 /* Clean-up. */
6486 md_free_ref_defs(ctx: &ctx);
6487 md_free_ref_def_hashtable(ctx: &ctx);
6488 free(ptr: ctx.buffer);
6489 free(ptr: ctx.marks);
6490 free(ptr: ctx.block_bytes);
6491 free(ptr: ctx.containers);
6492
6493 return ret;
6494}
6495

source code of qtbase/src/3rdparty/md4c/md4c.c