/*
 * MD4C: Markdown parser for C
 * (http://github.com/mity/md4c)
 *
 * Copyright (c) 2016-2024 Martin Mitáš
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

| 26 | #ifndef MD4C_H |
| 27 | #define MD4C_H |
| 28 | |
| 29 | #ifdef __cplusplus |
| 30 | extern "C" { |
| 31 | #endif |
| 32 | |
/* Character type of the processed text.
 *
 * By default MD_CHAR is plain char. Defining the macro MD4C_USE_UTF16 switches
 * it to WCHAR. Note that in order to use it, you have to define the macro
 * both when building MD4C as well as when including this header in your
 * code. UTF-16 is only supported on Windows; on any other platform the
 * combination is rejected at compile time. */
#if defined MD4C_USE_UTF16
    #ifdef _WIN32
        #include <windows.h>
        typedef WCHAR MD_CHAR;
    #else
        #error MD4C_USE_UTF16 is only supported on Windows.
    #endif
#else
    typedef char MD_CHAR;
#endif

/* Unsigned types used throughout this header for string sizes (MD_SIZE) and
 * for offsets into strings (MD_OFFSET). */
typedef unsigned MD_SIZE;
typedef unsigned MD_OFFSET;


/* Block represents a part of the document hierarchy structure, like
 * a paragraph or a list item.
 *
 * The enumerators are numbered consecutively, starting with
 * MD_BLOCK_DOC == 0.
 */
typedef enum MD_BLOCKTYPE {
    /* <body>...</body> */
    MD_BLOCK_DOC = 0,

    /* <blockquote>...</blockquote> */
    MD_BLOCK_QUOTE,

    /* <ul>...</ul>
     * Detail: Structure MD_BLOCK_UL_DETAIL. */
    MD_BLOCK_UL,

    /* <ol>...</ol>
     * Detail: Structure MD_BLOCK_OL_DETAIL. */
    MD_BLOCK_OL,

    /* <li>...</li>
     * Detail: Structure MD_BLOCK_LI_DETAIL. */
    MD_BLOCK_LI,

    /* <hr> */
    MD_BLOCK_HR,

    /* <h1>...</h1> (for levels up to 6)
     * Detail: Structure MD_BLOCK_H_DETAIL. */
    MD_BLOCK_H,

    /* <pre><code>...</code></pre>
     * Detail: Structure MD_BLOCK_CODE_DETAIL.
     * Note the text lines within code blocks are terminated with '\n'
     * instead of explicit MD_TEXT_BR. */
    MD_BLOCK_CODE,

    /* Raw HTML block. This itself does not correspond to any particular HTML
     * tag. The contents of it _is_ raw HTML source intended to be put
     * in verbatim form to the HTML output. */
    MD_BLOCK_HTML,

    /* <p>...</p> */
    MD_BLOCK_P,

    /* <table>...</table> and its contents.
     * Detail: Structure MD_BLOCK_TABLE_DETAIL (for MD_BLOCK_TABLE),
     *         structure MD_BLOCK_TD_DETAIL (for MD_BLOCK_TH and MD_BLOCK_TD)
     * Note all of these are used only if extension MD_FLAG_TABLES is enabled. */
    MD_BLOCK_TABLE,
    MD_BLOCK_THEAD,
    MD_BLOCK_TBODY,
    MD_BLOCK_TR,
    MD_BLOCK_TH,
    MD_BLOCK_TD
} MD_BLOCKTYPE;

/* Span represents an in-line piece of a document which should be rendered with
 * the same font, color and other attributes. A sequence of spans forms a block
 * like paragraph or list item.
 *
 * The enumerators are numbered consecutively, starting with MD_SPAN_EM == 0.
 */
typedef enum MD_SPANTYPE {
    /* <em>...</em> */
    MD_SPAN_EM,

    /* <strong>...</strong> */
    MD_SPAN_STRONG,

    /* <a href="xxx">...</a>
     * Detail: Structure MD_SPAN_A_DETAIL. */
    MD_SPAN_A,

    /* <img src="xxx" alt="...">
     * Detail: Structure MD_SPAN_IMG_DETAIL.
     * Note: Image text can contain nested spans and even nested images.
     * If rendered into the ALT attribute of an HTML <IMG> tag, it is the
     * responsibility of the parser to deal with it.
     */
    MD_SPAN_IMG,

    /* <code>...</code> */
    MD_SPAN_CODE,

    /* <del>...</del>
     * Note: Recognized only when MD_FLAG_STRIKETHROUGH is enabled.
     */
    MD_SPAN_DEL,

    /* For recognizing inline ($) and display ($$) equations
     * Note: Recognized only when MD_FLAG_LATEXMATHSPANS is enabled.
     */
    MD_SPAN_LATEXMATH,
    MD_SPAN_LATEXMATH_DISPLAY,

    /* Wiki links
     * Detail: Structure MD_SPAN_WIKILINK_DETAIL.
     * Note: Recognized only when MD_FLAG_WIKILINKS is enabled.
     */
    MD_SPAN_WIKILINK,

    /* <u>...</u>
     * Note: Recognized only when MD_FLAG_UNDERLINE is enabled. */
    MD_SPAN_U
} MD_SPANTYPE;

/* Text is the actual textual contents of span.
 *
 * The enumerators are numbered consecutively, starting with
 * MD_TEXT_NORMAL == 0.
 */
typedef enum MD_TEXTTYPE {
    /* Normal text. */
    MD_TEXT_NORMAL = 0,

    /* NULL character. CommonMark requires replacing NULL character with
     * the replacement char U+FFFD, so this allows caller to do that easily. */
    MD_TEXT_NULLCHAR,

    /* Line breaks.
     * Note these are not sent from blocks with verbatim output (MD_BLOCK_CODE
     * or MD_BLOCK_HTML). In such cases, '\n' is part of the text itself. */
    MD_TEXT_BR,         /* <br> (hard break) */
    MD_TEXT_SOFTBR,     /* '\n' in source text where it is not semantically meaningful (soft break) */

    /* Entity.
     * (a) Named entity, e.g. &nbsp;
     *     (Note MD4C does not have a list of known entities.
     *     Anything matching the regexp /&[A-Za-z][A-Za-z0-9]{1,47};/ is
     *     treated as a named entity.)
     * (b) Numerical entity, e.g. &#1234;
     * (c) Hexadecimal entity, e.g. &#x12AB;
     *
     * As MD4C is mostly encoding agnostic, application gets the verbatim
     * entity text into the MD_PARSER::text() callback. */
    MD_TEXT_ENTITY,

    /* Text in a code block (inside MD_BLOCK_CODE) or inlined code (`code`).
     * If it is inside MD_BLOCK_CODE, it includes spaces for indentation and
     * '\n' for new lines. MD_TEXT_BR and MD_TEXT_SOFTBR are not sent for this
     * kind of text. */
    MD_TEXT_CODE,

    /* Text is a raw HTML. If it is contents of a raw HTML block (i.e. not
     * an inline raw HTML), then MD_TEXT_BR and MD_TEXT_SOFTBR are not used.
     * The text contains verbatim '\n' for the new lines. */
    MD_TEXT_HTML,

    /* Text is inside an equation. This is processed the same way as inlined code
     * spans (`code`). */
    MD_TEXT_LATEXMATH
} MD_TEXTTYPE;


/* Alignment enumeration (see MD_BLOCK_TD_DETAIL::align). */
typedef enum MD_ALIGN {
    MD_ALIGN_DEFAULT = 0,   /* When unspecified. */
    MD_ALIGN_LEFT,
    MD_ALIGN_CENTER,
    MD_ALIGN_RIGHT
} MD_ALIGN;


| 204 | /* String attribute. |
| 205 | * |
| 206 | * This wraps strings which are outside of a normal text flow and which are |
| 207 | * propagated within various detailed structures, but which still may contain |
| 208 | * string portions of different types like e.g. entities. |
| 209 | * |
| 210 | * So, for example, lets consider this image: |
| 211 | * |
| 212 | *  |
| 213 | * |
| 214 | * The image alt text is propagated as a normal text via the MD_PARSER::text() |
| 215 | * callback. However, the image title ('foo " bar') is propagated as |
| 216 | * MD_ATTRIBUTE in MD_SPAN_IMG_DETAIL::title. |
| 217 | * |
| 218 | * Then the attribute MD_SPAN_IMG_DETAIL::title shall provide the following: |
| 219 | * -- [0]: "foo " (substr_types[0] == MD_TEXT_NORMAL; substr_offsets[0] == 0) |
| 220 | * -- [1]: """ (substr_types[1] == MD_TEXT_ENTITY; substr_offsets[1] == 4) |
| 221 | * -- [2]: " bar" (substr_types[2] == MD_TEXT_NORMAL; substr_offsets[2] == 10) |
| 222 | * -- [3]: (n/a) (n/a ; substr_offsets[3] == 14) |
| 223 | * |
| 224 | * Note that these invariants are always guaranteed: |
| 225 | * -- substr_offsets[0] == 0 |
| 226 | * -- substr_offsets[LAST+1] == size |
| 227 | * -- Currently, only MD_TEXT_NORMAL, MD_TEXT_ENTITY, MD_TEXT_NULLCHAR |
| 228 | * substrings can appear. This could change only of the specification |
| 229 | * changes. |
| 230 | */ |
| 231 | typedef struct MD_ATTRIBUTE { |
| 232 | const MD_CHAR* text; |
| 233 | MD_SIZE size; |
| 234 | const MD_TEXTTYPE* substr_types; |
| 235 | const MD_OFFSET* substr_offsets; |
| 236 | } MD_ATTRIBUTE; |
| 237 | |
| 238 | |
| 239 | /* Detailed info for MD_BLOCK_UL. */ |
| 240 | typedef struct MD_BLOCK_UL_DETAIL { |
| 241 | int is_tight; /* Non-zero if tight list, zero if loose. */ |
| 242 | MD_CHAR mark; /* Item bullet character in MarkDown source of the list, e.g. '-', '+', '*'. */ |
| 243 | } MD_BLOCK_UL_DETAIL; |
| 244 | |
| 245 | /* Detailed info for MD_BLOCK_OL. */ |
| 246 | typedef struct MD_BLOCK_OL_DETAIL { |
| 247 | unsigned start; /* Start index of the ordered list. */ |
| 248 | int is_tight; /* Non-zero if tight list, zero if loose. */ |
| 249 | MD_CHAR mark_delimiter; /* Character delimiting the item marks in MarkDown source, e.g. '.' or ')' */ |
| 250 | } MD_BLOCK_OL_DETAIL; |
| 251 | |
| 252 | /* Detailed info for MD_BLOCK_LI. */ |
| 253 | typedef struct MD_BLOCK_LI_DETAIL { |
| 254 | int is_task; /* Can be non-zero only with MD_FLAG_TASKLISTS */ |
| 255 | MD_CHAR task_mark; /* If is_task, then one of 'x', 'X' or ' '. Undefined otherwise. */ |
| 256 | MD_OFFSET task_mark_offset; /* If is_task, then offset in the input of the char between '[' and ']'. */ |
| 257 | } MD_BLOCK_LI_DETAIL; |
| 258 | |
/* Detailed info for MD_BLOCK_H. */
typedef struct MD_BLOCK_H_DETAIL {
    unsigned level;     /* Header level (1 - 6) */
} MD_BLOCK_H_DETAIL;

| 264 | /* Detailed info for MD_BLOCK_CODE. */ |
| 265 | typedef struct MD_BLOCK_CODE_DETAIL { |
| 266 | MD_ATTRIBUTE info; |
| 267 | MD_ATTRIBUTE lang; |
| 268 | MD_CHAR fence_char; /* The character used for fenced code block; or zero for indented code block. */ |
| 269 | } MD_BLOCK_CODE_DETAIL; |
| 270 | |
/* Detailed info for MD_BLOCK_TABLE. */
typedef struct MD_BLOCK_TABLE_DETAIL {
    unsigned col_count;         /* Count of columns in the table. */
    unsigned head_row_count;    /* Count of rows in the table header (currently always 1) */
    unsigned body_row_count;    /* Count of rows in the table body */
} MD_BLOCK_TABLE_DETAIL;

| 278 | /* Detailed info for MD_BLOCK_TH and MD_BLOCK_TD. */ |
| 279 | typedef struct MD_BLOCK_TD_DETAIL { |
| 280 | MD_ALIGN align; |
| 281 | } MD_BLOCK_TD_DETAIL; |
| 282 | |
| 283 | /* Detailed info for MD_SPAN_A. */ |
| 284 | typedef struct MD_SPAN_A_DETAIL { |
| 285 | MD_ATTRIBUTE href; |
| 286 | MD_ATTRIBUTE title; |
| 287 | int is_autolink; /* nonzero if this is an autolink */ |
| 288 | } MD_SPAN_A_DETAIL; |
| 289 | |
| 290 | /* Detailed info for MD_SPAN_IMG. */ |
| 291 | typedef struct MD_SPAN_IMG_DETAIL { |
| 292 | MD_ATTRIBUTE src; |
| 293 | MD_ATTRIBUTE title; |
| 294 | } MD_SPAN_IMG_DETAIL; |
| 295 | |
| 296 | /* Detailed info for MD_SPAN_WIKILINK. */ |
| 297 | typedef struct MD_SPAN_WIKILINK { |
| 298 | MD_ATTRIBUTE target; |
| 299 | } MD_SPAN_WIKILINK_DETAIL; |
| 300 | |
/* Flags specifying extensions/deviations from CommonMark specification.
 *
 * By default (when MD_PARSER::flags == 0), we follow CommonMark specification.
 * The following flags may allow some extensions or deviations from it.
 *
 * Each flag occupies its own bit, so any of them can be combined with
 * bitwise OR.
 */
#define MD_FLAG_COLLAPSEWHITESPACE          0x0001  /* In MD_TEXT_NORMAL, collapse non-trivial whitespace into single ' ' */
#define MD_FLAG_PERMISSIVEATXHEADERS        0x0002  /* Do not require space in ATX headers ( ###header ) */
#define MD_FLAG_PERMISSIVEURLAUTOLINKS      0x0004  /* Recognize URLs as autolinks even without '<', '>' */
#define MD_FLAG_PERMISSIVEEMAILAUTOLINKS    0x0008  /* Recognize e-mails as autolinks even without '<', '>' and 'mailto:' */
#define MD_FLAG_NOINDENTEDCODEBLOCKS        0x0010  /* Disable indented code blocks. (Only fenced code works.) */
#define MD_FLAG_NOHTMLBLOCKS                0x0020  /* Disable raw HTML blocks. */
#define MD_FLAG_NOHTMLSPANS                 0x0040  /* Disable raw HTML (inline). */
#define MD_FLAG_TABLES                      0x0100  /* Enable tables extension. */
#define MD_FLAG_STRIKETHROUGH               0x0200  /* Enable strikethrough extension. */
#define MD_FLAG_PERMISSIVEWWWAUTOLINKS      0x0400  /* Enable WWW autolinks (even without any scheme prefix, if they begin with 'www.') */
#define MD_FLAG_TASKLISTS                   0x0800  /* Enable task list extension. */
#define MD_FLAG_LATEXMATHSPANS              0x1000  /* Enable $ and $$ containing LaTeX equations. */
#define MD_FLAG_WIKILINKS                   0x2000  /* Enable wiki links extension. */
#define MD_FLAG_UNDERLINE                   0x4000  /* Enable underline extension (and disables '_' for normal emphasis). */
#define MD_FLAG_HARD_SOFT_BREAKS            0x8000  /* Force all soft breaks to act as hard breaks. */

/* Shorthands combining several of the flags above. */
#define MD_FLAG_PERMISSIVEAUTOLINKS         (MD_FLAG_PERMISSIVEEMAILAUTOLINKS | MD_FLAG_PERMISSIVEURLAUTOLINKS | MD_FLAG_PERMISSIVEWWWAUTOLINKS)
#define MD_FLAG_NOHTML                      (MD_FLAG_NOHTMLBLOCKS | MD_FLAG_NOHTMLSPANS)

/* Convenient sets of flags corresponding to well-known Markdown dialects.
 *
 * Note we may only support subset of features of the referred dialect.
 * The constant just enables those extensions which bring us as close as
 * possible given what features we implement.
 *
 * ABI compatibility note: Meaning of these can change in time as new
 * extensions, bringing the dialect closer to the original, are implemented.
 */
#define MD_DIALECT_COMMONMARK   0
#define MD_DIALECT_GITHUB       (MD_FLAG_PERMISSIVEAUTOLINKS | MD_FLAG_TABLES | MD_FLAG_STRIKETHROUGH | MD_FLAG_TASKLISTS)

| 337 | /* Parser structure. |
| 338 | */ |
| 339 | typedef struct MD_PARSER { |
| 340 | /* Reserved. Set to zero. |
| 341 | */ |
| 342 | unsigned abi_version; |
| 343 | |
| 344 | /* Dialect options. Bitmask of MD_FLAG_xxxx values. |
| 345 | */ |
| 346 | unsigned flags; |
| 347 | |
| 348 | /* Caller-provided rendering callbacks. |
| 349 | * |
| 350 | * For some block/span types, more detailed information is provided in a |
| 351 | * type-specific structure pointed by the argument 'detail'. |
| 352 | * |
| 353 | * The last argument of all callbacks, 'userdata', is just propagated from |
| 354 | * md_parse() and is available for any use by the application. |
| 355 | * |
| 356 | * Note any strings provided to the callbacks as their arguments or as |
| 357 | * members of any detail structure are generally not zero-terminated. |
| 358 | * Application has to take the respective size information into account. |
| 359 | * |
| 360 | * Any rendering callback may abort further parsing of the document by |
| 361 | * returning non-zero. |
| 362 | */ |
| 363 | int (*enter_block)(MD_BLOCKTYPE /*type*/, void* /*detail*/, void* /*userdata*/); |
| 364 | int (*leave_block)(MD_BLOCKTYPE /*type*/, void* /*detail*/, void* /*userdata*/); |
| 365 | |
| 366 | int (*enter_span)(MD_SPANTYPE /*type*/, void* /*detail*/, void* /*userdata*/); |
| 367 | int (*leave_span)(MD_SPANTYPE /*type*/, void* /*detail*/, void* /*userdata*/); |
| 368 | |
| 369 | int (*text)(MD_TEXTTYPE /*type*/, const MD_CHAR* /*text*/, MD_SIZE /*size*/, void* /*userdata*/); |
| 370 | |
| 371 | /* Debug callback. Optional (may be NULL). |
| 372 | * |
| 373 | * If provided and something goes wrong, this function gets called. |
| 374 | * This is intended for debugging and problem diagnosis for developers; |
| 375 | * it is not intended to provide any errors suitable for displaying to an |
| 376 | * end user. |
| 377 | */ |
| 378 | void (*debug_log)(const char* /*msg*/, void* /*userdata*/); |
| 379 | |
| 380 | /* Reserved. Set to NULL. |
| 381 | */ |
| 382 | void (*syntax)(void); |
| 383 | } MD_PARSER; |
| 384 | |
| 385 | |
| 386 | /* For backward compatibility. Do not use in new code. |
| 387 | */ |
| 388 | typedef MD_PARSER MD_RENDERER; |
| 389 | |
| 390 | |
| 391 | /* Parse the Markdown document stored in the string 'text' of size 'size'. |
| 392 | * The parser provides callbacks to be called during the parsing so the |
| 393 | * caller can render the document on the screen or convert the Markdown |
| 394 | * to another format. |
| 395 | * |
| 396 | * Zero is returned on success. If a runtime error occurs (e.g. a memory |
| 397 | * fails), -1 is returned. If the processing is aborted due any callback |
| 398 | * returning non-zero, the return value of the callback is returned. |
| 399 | */ |
| 400 | int md_parse(const MD_CHAR* text, MD_SIZE size, const MD_PARSER* parser, void* userdata); |
| 401 | |
| 402 | |
| 403 | #ifdef __cplusplus |
| 404 | } /* extern "C" { */ |
| 405 | #endif |
| 406 | |
| 407 | #endif /* MD4C_H */ |
| 408 | |