/**
 * D declarations for the C API of the Gumbo HTML5 parser.
 *
 * Everything in this module has C linkage.  Struct layouts, enum member order
 * and function signatures must stay in step with the C library that is linked
 * in; do not reorder members.
 *
 * Note on string fields: they are declared as `const(char)*` (a mutable
 * pointer to constant characters).  Writing `const char* p;` in D would make
 * the pointer itself const as well, which prevents the containing structs
 * from being assigned or copied over.
 */
module gumbo.capi;

extern (C) {

/**
 * A struct representing a character position within the original text buffer.
 * Line and column numbers are 1-based and offsets are 0-based, which matches
 * how most editors and command-line tools work.  Also, columns measure
 * positions in terms of characters while offsets measure by bytes; this is
 * because the offset field is often used to pull out a particular region of
 * text (which in most languages that bind to C implies pointer arithmetic on a
 * buffer of bytes), while the column field is often used to reference a
 * particular column on a printable display, which nowadays is usually UTF-8.
 */
struct GumboSourcePosition {
  uint line;
  uint column;
  uint offset;
}

/**
 * A SourcePosition used for elements that have no source position, i.e.
 * parser-inserted elements.
 */
extern const GumboSourcePosition kGumboEmptySourcePosition;


/**
 * A struct representing a string or part of a string.  Strings within the
 * parser are represented by a char* and a length; the char* points into
 * an existing data buffer owned by some other code (often the original input).
 * GumboStringPieces are assumed (by convention) to be immutable, because they
 * may share data.  Use GumboStringBuffer if you need to construct a string.
 * Clients should assume that it is not NUL-terminated, and should always use
 * explicit lengths when manipulating them.
 */
struct GumboStringPiece {
  /** A pointer to the beginning of the string.  NULL iff length == 0. */
  const(char)* data;

  /** The length of the string fragment, in bytes.  May be zero. */
  size_t length;
}

/** A constant to represent a 0-length null string. */
extern const GumboStringPiece kGumboEmptyString;

/**
 * Compares two GumboStringPieces, and returns true if they're equal or false
 * otherwise.
 */
bool gumbo_string_equals(
    const GumboStringPiece* str1, const GumboStringPiece* str2);

/**
 * Compares two GumboStringPieces ignoring case, and returns true if they're
 * equal or false otherwise.
 */
bool gumbo_string_equals_ignore_case(
    const GumboStringPiece* str1, const GumboStringPiece* str2);


/**
 * A simple vector implementation.  This stores a pointer to a data array and a
 * length.  All elements are stored as void*; client code must cast to the
 * appropriate type.  Overflows upon addition result in reallocation of the data
 * array, with the size doubling to maintain O(1) amortized cost.  There is no
 * removal function, as this isn't needed for any of the operations within this
 * library.  Iteration can be done through inspecting the structure directly in
 * a for-loop.
 */
struct GumboVector {
  /**
   * Data elements.  This points to a dynamically-allocated array of capacity
   * elements, each a void* to the element itself.
   */
  void** data;

  /** Number of elements currently in the vector. */
  uint length;

  /** Current array capacity. */
  uint capacity;
}

/** An empty (0-length, 0-capacity) GumboVector. */
extern const GumboVector kGumboEmptyVector;

/**
 * Returns the first index at which an element appears in this vector (testing
 * by pointer equality), or -1 if it never does.
 */
int gumbo_vector_index_of(GumboVector* vector, void* element);


/**
 * An enum for all the tags defined in the HTML5 standard.  These correspond to
 * the tag names themselves.  Enum constants exist only for tags which appear in
 * the spec itself (or for tags with special handling in the SVG and MathML
 * namespaces); any other tags appear as GUMBO_TAG_UNKNOWN and the actual tag
 * name can be obtained through original_tag.
 *
 * This is mostly for API convenience, so that clients of this library don't
 * need to perform a strcasecmp to find the normalized tag name.  It also has
 * efficiency benefits, by letting the parser work with enums instead of
 * strings.
 *
 * Member order defines the numeric values shared with the C library; never
 * insert, remove or reorder members here independently of it.
 */
enum GumboTag {
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/semantics.html#the-root-element
  GUMBO_TAG_HTML,
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/semantics.html#document-metadata
  GUMBO_TAG_HEAD,
  GUMBO_TAG_TITLE,
  GUMBO_TAG_BASE,
  GUMBO_TAG_LINK,
  GUMBO_TAG_META,
  GUMBO_TAG_STYLE,
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/scripting-1.html#scripting-1
  GUMBO_TAG_SCRIPT,
  GUMBO_TAG_NOSCRIPT,
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/sections.html#sections
  GUMBO_TAG_BODY,
  GUMBO_TAG_SECTION,
  GUMBO_TAG_NAV,
  GUMBO_TAG_ARTICLE,
  GUMBO_TAG_ASIDE,
  GUMBO_TAG_H1,
  GUMBO_TAG_H2,
  GUMBO_TAG_H3,
  GUMBO_TAG_H4,
  GUMBO_TAG_H5,
  GUMBO_TAG_H6,
  GUMBO_TAG_HGROUP,
  GUMBO_TAG_HEADER,
  GUMBO_TAG_FOOTER,
  GUMBO_TAG_ADDRESS,
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/grouping-content.html#grouping-content
  GUMBO_TAG_P,
  GUMBO_TAG_HR,
  GUMBO_TAG_PRE,
  GUMBO_TAG_BLOCKQUOTE,
  GUMBO_TAG_OL,
  GUMBO_TAG_UL,
  GUMBO_TAG_LI,
  GUMBO_TAG_DL,
  GUMBO_TAG_DT,
  GUMBO_TAG_DD,
  GUMBO_TAG_FIGURE,
  GUMBO_TAG_FIGCAPTION,
  GUMBO_TAG_DIV,
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/text-level-semantics.html#text-level-semantics
  GUMBO_TAG_A,
  GUMBO_TAG_EM,
  GUMBO_TAG_STRONG,
  GUMBO_TAG_SMALL,
  GUMBO_TAG_S,
  GUMBO_TAG_CITE,
  GUMBO_TAG_Q,
  GUMBO_TAG_DFN,
  GUMBO_TAG_ABBR,
  GUMBO_TAG_TIME,
  GUMBO_TAG_CODE,
  GUMBO_TAG_VAR,
  GUMBO_TAG_SAMP,
  GUMBO_TAG_KBD,
  GUMBO_TAG_SUB,
  GUMBO_TAG_SUP,
  GUMBO_TAG_I,
  GUMBO_TAG_B,
  GUMBO_TAG_MARK,
  GUMBO_TAG_RUBY,
  GUMBO_TAG_RT,
  GUMBO_TAG_RP,
  GUMBO_TAG_BDI,
  GUMBO_TAG_BDO,
  GUMBO_TAG_SPAN,
  GUMBO_TAG_BR,
  GUMBO_TAG_WBR,
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/edits.html#edits
  GUMBO_TAG_INS,
  GUMBO_TAG_DEL,
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/embedded-content-1.html#embedded-content-1
  GUMBO_TAG_IMAGE,
  GUMBO_TAG_IMG,
  GUMBO_TAG_IFRAME,
  GUMBO_TAG_EMBED,
  GUMBO_TAG_OBJECT,
  GUMBO_TAG_PARAM,
  GUMBO_TAG_VIDEO,
  GUMBO_TAG_AUDIO,
  GUMBO_TAG_SOURCE,
  GUMBO_TAG_TRACK,
  GUMBO_TAG_CANVAS,
  GUMBO_TAG_MAP,
  GUMBO_TAG_AREA,
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/the-map-element.html#mathml
  GUMBO_TAG_MATH,
  GUMBO_TAG_MI,
  GUMBO_TAG_MO,
  GUMBO_TAG_MN,
  GUMBO_TAG_MS,
  GUMBO_TAG_MTEXT,
  GUMBO_TAG_MGLYPH,
  GUMBO_TAG_MALIGNMARK,
  GUMBO_TAG_ANNOTATION_XML,
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/the-map-element.html#svg-0
  GUMBO_TAG_SVG,
  GUMBO_TAG_FOREIGNOBJECT,
  GUMBO_TAG_DESC,
  // SVG title tags will have GUMBO_TAG_TITLE as with HTML.
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/tabular-data.html#tabular-data
  GUMBO_TAG_TABLE,
  GUMBO_TAG_CAPTION,
  GUMBO_TAG_COLGROUP,
  GUMBO_TAG_COL,
  GUMBO_TAG_TBODY,
  GUMBO_TAG_THEAD,
  GUMBO_TAG_TFOOT,
  GUMBO_TAG_TR,
  GUMBO_TAG_TD,
  GUMBO_TAG_TH,
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/forms.html#forms
  GUMBO_TAG_FORM,
  GUMBO_TAG_FIELDSET,
  GUMBO_TAG_LEGEND,
  GUMBO_TAG_LABEL,
  GUMBO_TAG_INPUT,
  GUMBO_TAG_BUTTON,
  GUMBO_TAG_SELECT,
  GUMBO_TAG_DATALIST,
  GUMBO_TAG_OPTGROUP,
  GUMBO_TAG_OPTION,
  GUMBO_TAG_TEXTAREA,
  GUMBO_TAG_KEYGEN,
  GUMBO_TAG_OUTPUT,
  GUMBO_TAG_PROGRESS,
  GUMBO_TAG_METER,
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/interactive-elements.html#interactive-elements
  GUMBO_TAG_DETAILS,
  GUMBO_TAG_SUMMARY,
  GUMBO_TAG_COMMAND,
  GUMBO_TAG_MENU,
  // Non-conforming elements that nonetheless appear in the HTML5 spec.
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/obsolete.html#non-conforming-features
  GUMBO_TAG_APPLET,
  GUMBO_TAG_ACRONYM,
  GUMBO_TAG_BGSOUND,
  GUMBO_TAG_DIR,
  GUMBO_TAG_FRAME,
  GUMBO_TAG_FRAMESET,
  GUMBO_TAG_NOFRAMES,
  GUMBO_TAG_ISINDEX,
  GUMBO_TAG_LISTING,
  GUMBO_TAG_XMP,
  GUMBO_TAG_NEXTID,
  GUMBO_TAG_NOEMBED,
  GUMBO_TAG_PLAINTEXT,
  GUMBO_TAG_RB,
  GUMBO_TAG_STRIKE,
  GUMBO_TAG_BASEFONT,
  GUMBO_TAG_BIG,
  GUMBO_TAG_BLINK,
  GUMBO_TAG_CENTER,
  GUMBO_TAG_FONT,
  GUMBO_TAG_MARQUEE,
  GUMBO_TAG_MULTICOL,
  GUMBO_TAG_NOBR,
  GUMBO_TAG_SPACER,
  GUMBO_TAG_TT,
  GUMBO_TAG_U,
  // Used for all tags that don't have special handling in HTML.
  GUMBO_TAG_UNKNOWN,
  // A marker value to indicate the end of the enum, for iterating over it.
  // Also used as the terminator for varargs functions that take tags.
  GUMBO_TAG_LAST,
}

/**
 * Returns the normalized (usually all-lowercased, except for foreign content)
 * tag name for a GumboTag enum.  Return value is static data owned by the
 * library, so callers should treat it as read-only even though the declared
 * return type is a mutable char*.
 */
char* gumbo_normalized_tagname(GumboTag tag);

/**
 * Extracts the tag name from the original_text field of an element or token by
 * stripping off </> characters and attributes and adjusting the passed-in
 * GumboStringPiece appropriately.  The tag name is in the original case and
 * shares a buffer with the original text, to simplify memory management.
 * Behavior is undefined if a string-piece that doesn't represent an HTML tag
 * (<tagname> or </tagname>) is passed in.  If the string piece is completely
 * empty (NULL data pointer), then this function will exit successfully as a
 * no-op.
 */
void gumbo_tag_from_original_text(GumboStringPiece* text);

/**
 * Fixes the case of SVG elements that are not all lowercase.
 * http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#parsing-main-inforeign
 * This is not done at parse time because there's no place to store a mutated
 * tag name.  tag_name is an enum (which will be TAG_UNKNOWN for most SVG tags
 * without special handling), while original_tag_name is a pointer into the
 * original buffer.  Instead, we provide this helper function that clients can
 * use to rename SVG tags as appropriate.
 * Returns the case-normalized SVG tagname if a replacement is found, or NULL if
 * no normalization is called for.  The return value is static data and owned by
 * the library.
 */
char* gumbo_normalize_svg_tagname(const GumboStringPiece* tagname);

/**
 * Converts a tag name string (which may be in upper or mixed case) to a tag
 * enum.
 */
GumboTag gumbo_tag_enum(const char* tagname);

/**
 * Attribute namespaces.
 * HTML includes special handling for XLink, XML, and XMLNS namespaces on
 * attributes.  Everything else goes in the generic "NONE" namespace.
 */
enum GumboAttributeNamespaceEnum {
  GUMBO_ATTR_NAMESPACE_NONE,
  GUMBO_ATTR_NAMESPACE_XLINK,
  GUMBO_ATTR_NAMESPACE_XML,
  GUMBO_ATTR_NAMESPACE_XMLNS,
}

/**
 * A struct representing a single attribute on an HTML tag.  This is a
 * name-value pair, but also includes information about source locations and
 * original source text.
 */
struct GumboAttribute {
  /**
   * The namespace for the attribute.  This will usually be
   * GUMBO_ATTR_NAMESPACE_NONE, but some XLink/XMLNS/XML attributes take special
   * values, per:
   * http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#adjust-foreign-attributes
   */
  GumboAttributeNamespaceEnum attr_namespace;

  /**
   * The name of the attribute.  This is in a freshly-allocated buffer to deal
   * with case-normalization, and is null-terminated.
   */
  const(char)* name;

  /**
   * The original text of the attribute name, as a pointer into the original
   * source buffer.
   */
  GumboStringPiece original_name;

  /**
   * The value of the attribute.  This is in a freshly-allocated buffer to deal
   * with unescaping, and is null-terminated.  It does not include any quotes
   * that surround the attribute.  If the attribute has no value (for example,
   * 'selected' on a checkbox), this will be an empty string.
   */
  const(char)* value;

  /**
   * The original text of the value of the attribute.  This points into the
   * original source buffer.  It includes any quotes that surround the
   * attribute, and you can look at original_value.data[0] and
   * original_value.data[original_value.length - 1] to determine what the quote
   * characters were.  If the attribute has no value, this will be a 0-length
   * string.
   */
  GumboStringPiece original_value;

  /** The starting position of the attribute name. */
  GumboSourcePosition name_start;

  /**
   * The ending position of the attribute name.  This is not always derivable
   * from the starting position of the value because of the possibility of
   * whitespace around the = sign.
   */
  GumboSourcePosition name_end;

  /** The starting position of the attribute value. */
  GumboSourcePosition value_start;

  /** The ending position of the attribute value. */
  GumboSourcePosition value_end;
}

/**
 * Given a vector of GumboAttributes, look up the one with the specified name
 * and return it, or NULL if no such attribute exists.  This uses a
 * case-insensitive match, as HTML is case-insensitive.
 */
GumboAttribute* gumbo_get_attribute(
    const GumboVector* attrs, const char* name);

/**
 * Enum denoting the type of node.  This determines the type of the node.v
 * union.
 */
enum GumboNodeType {
  /** Document node.  v will be a GumboDocument. */
  GUMBO_NODE_DOCUMENT,
  /** Element node.  v will be a GumboElement. */
  GUMBO_NODE_ELEMENT,
  /** Text node.  v will be a GumboText. */
  GUMBO_NODE_TEXT,
  /** CDATA node.  v will be a GumboText. */
  GUMBO_NODE_CDATA,
  /** Comment node.  v will be a GumboText, excluding comment delimiters. */
  GUMBO_NODE_COMMENT,
  /** Text node, where all contents is whitespace.  v will be a GumboText. */
  GUMBO_NODE_WHITESPACE
}

// The C-style forward declaration of GumboNode (used there so that
// GumboNode.parent can refer to its own type) is not needed in D:
//   typedef struct _GumboNode GumboNode;

/** http://www.whatwg.org/specs/web-apps/current-work/complete/dom.html#quirks-mode */
enum GumboQuirksModeEnum {
  GUMBO_DOCTYPE_NO_QUIRKS,
  GUMBO_DOCTYPE_QUIRKS,
  GUMBO_DOCTYPE_LIMITED_QUIRKS
}

/**
 * Namespaces.
 * Unlike in X(HT)ML, namespaces in HTML5 are not denoted by a prefix.  Rather,
 * anything inside an <svg> tag is in the SVG namespace, anything inside the
 * <math> tag is in the MathML namespace, and anything else is inside the HTML
 * namespace.  No other namespaces are supported, so this can be an enum only.
 */
enum GumboNamespaceEnum {
  GUMBO_NAMESPACE_HTML,
  GUMBO_NAMESPACE_SVG,
  GUMBO_NAMESPACE_MATHML
}

/**
 * Parse flags.
 * We track the reasons for parser insertion of nodes and store them in a
 * bitvector in the node itself.  This lets client code optimize out nodes that
 * are implied by the HTML structure of the document, or flag constructs that
 * may not be allowed by a style guide, or track the prevalence of incorrect or
 * tricky HTML code.
 */
enum GumboParseFlags {
  /**
   * A normal node - both start and end tags appear in the source, nothing has
   * been reparented.
   */
  GUMBO_INSERTION_NORMAL = 0,

  /**
   * A node inserted by the parser to fulfill some implicit insertion rule.
   * This is usually set in addition to some other flag giving a more specific
   * insertion reason; it's a generic catch-all term meaning "The start tag for
   * this node did not appear in the document source".
   */
  GUMBO_INSERTION_BY_PARSER = 1 << 0,

  /**
   * A flag indicating that the end tag for this node did not appear in the
   * document source.  Note that in some cases, you can still have
   * parser-inserted nodes with an explicit end tag: for example, "Text</html>"
   * has GUMBO_INSERTION_BY_PARSER set on the <html> node, but
   * GUMBO_INSERTION_IMPLICIT_END_TAG is unset, as the </html> tag actually
   * exists.  This flag will be set only if the end tag is completely missing;
   * in some cases, the end tag may be misplaced (eg. a </body> tag with text
   * afterwards), which will leave this flag unset and require clients to
   * inspect the parse errors for that case.
   */
  GUMBO_INSERTION_IMPLICIT_END_TAG = 1 << 1,

  // Value 1 << 2 was for a flag that has since been removed.

  /**
   * A flag for nodes that are inserted because their presence is implied by
   * other tags, eg. <html>, <head>, <body>, <tbody>, etc.
   */
  GUMBO_INSERTION_IMPLIED = 1 << 3,

  /**
   * A flag for nodes that are converted from their end tag equivalents.  For
   * example, </p> when no paragraph is open implies that the parser should
   * create a <p> tag and immediately close it, while </br> means the same thing
   * as <br>.
   */
  GUMBO_INSERTION_CONVERTED_FROM_END_TAG = 1 << 4,

  /** A flag for nodes that are converted from the parse of an <isindex> tag. */
  GUMBO_INSERTION_FROM_ISINDEX = 1 << 5,

  /** A flag for <image> tags that are rewritten as <img>. */
  GUMBO_INSERTION_FROM_IMAGE = 1 << 6,

  /**
   * A flag for nodes that are cloned as a result of the reconstruction of
   * active formatting elements.  This is set only on the clone; the initial
   * portion of the formatting run is a NORMAL node with an IMPLICIT_END_TAG.
   */
  GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT = 1 << 7,

  /** A flag for nodes that are cloned by the adoption agency algorithm. */
  GUMBO_INSERTION_ADOPTION_AGENCY_CLONED = 1 << 8,

  /** A flag for nodes that are moved by the adoption agency algorithm. */
  GUMBO_INSERTION_ADOPTION_AGENCY_MOVED = 1 << 9,

  /**
   * A flag for nodes that have been foster-parented out of a table (or
   * should've been foster-parented, if verbatim mode is set).
   */
  GUMBO_INSERTION_FOSTER_PARENTED = 1 << 10,
}


/**
 * Information specific to document nodes.
 */
struct GumboDocument {
  /**
   * An array of GumboNodes, containing the children of this document node.
   * This will normally consist of the <html> element and any comment nodes
   * found.  Pointers are owned.
   */
  GumboVector /* GumboNode* */ children;

  // True if there was an explicit doctype token as opposed to it being omitted.
  bool has_doctype;

  // Fields from the doctype token, copied verbatim.
  const(char)* name;
  const(char)* public_identifier;
  const(char)* system_identifier;

  /**
   * Whether or not the document is in QuirksMode, as determined by the values
   * in the GumboTokenDocType template.
   */
  GumboQuirksModeEnum doc_type_quirks_mode;
}

/**
 * The struct used to represent TEXT, CDATA, COMMENT, and WHITESPACE elements.
 * This contains just a block of text and its position.
 */
struct GumboText {
  /**
   * The text of this node, after entities have been parsed and decoded.  For
   * comment/cdata nodes, this does not include the comment delimiters.
   */
  const(char)* text;

  /**
   * The original text of this node, as a pointer into the original buffer.  For
   * comment/cdata nodes, this includes the comment delimiters.
   */
  GumboStringPiece original_text;

  /**
   * The starting position of this node.  This corresponds to the position of
   * original_text, before entities are decoded.
   */
  GumboSourcePosition start_pos;
}

/**
 * The struct used to represent all HTML elements.  This contains information
 * about the tag, attributes, and child nodes.
 */
struct GumboElement {
  /**
   * An array of GumboNodes, containing the children of this element.  Pointers
   * are owned.
   */
  GumboVector /* GumboNode* */ children;

  /** The GumboTag enum for this element. */
  GumboTag tag;

  /** The GumboNamespaceEnum for this element. */
  GumboNamespaceEnum tag_namespace;

  /**
   * A GumboStringPiece pointing to the original tag text for this element,
   * pointing directly into the source buffer.  If the tag was inserted
   * algorithmically (for example, <head> or <tbody> insertion), this will be a
   * zero-length string.
   */
  GumboStringPiece original_tag;

  /**
   * A GumboStringPiece pointing to the original end tag text for this element.
   * If the end tag was inserted algorithmically, (for example, closing a
   * self-closing tag), this will be a zero-length string.
   */
  GumboStringPiece original_end_tag;

  /** The source position for the start of the start tag. */
  GumboSourcePosition start_pos;

  /** The source position for the start of the end tag. */
  GumboSourcePosition end_pos;

  /**
   * An array of GumboAttributes, containing the attributes for this tag in the
   * order that they were parsed.  Pointers are owned.
   */
  GumboVector /* GumboAttribute* */ attributes;
}

/**
 * A supertype for GumboElement and GumboText, so that we can include one
 * generic type in lists of children and cast as necessary to subtypes.
 */
struct GumboNode {
  /** The type of node that this is. */
  GumboNodeType type;

  /** Pointer back to parent node.  Not owned. */
  GumboNode* parent;

  /** The index within the parent's children vector of this node. */
  size_t index_within_parent;

  /**
   * A bitvector of flags containing information about why this element was
   * inserted into the parse tree, including a variety of special parse
   * situations.
   */
  GumboParseFlags parse_flags;

  /** The actual node data; which member is valid is selected by `type`. */
  private union NodeData {
    GumboDocument document;  // For GUMBO_NODE_DOCUMENT.
    GumboElement element;    // For GUMBO_NODE_ELEMENT.
    GumboText text;          // For everything else.
  }
  NodeData v;
}

/**
 * The type for an allocator function.  Takes the 'userdata' member of the
 * GumboParser struct as its first argument.  Semantics should be the same as
 * malloc, i.e. return a block of size_t bytes on success or NULL on failure.
 * Allocating a block of 0 bytes behaves as per malloc.
 */
// TODO(jdtang): Add checks throughout the codebase for out-of-memory condition.
// C declaration: typedef void* (*GumboAllocatorFunction)(void* userdata, size_t size);
alias void* function(void* userdata, size_t size) GumboAllocatorFunction;

/**
 * The type for a deallocator function.  Takes the 'userdata' member of the
 * GumboParser struct as its first argument.
 */
// C declaration: typedef void (*GumboDeallocatorFunction)(void* userdata, void* ptr);
alias void function(void* userdata, void* ptr) GumboDeallocatorFunction;

/**
 * Input struct containing configuration options for the parser.
 * These let you specify alternate memory managers, provide different error
 * handling, etc.
 * Use kGumboDefaultOptions for sensible defaults, and only set what you need.
 */
struct GumboOptions {
  /** A memory allocator function.  Default: malloc. */
  GumboAllocatorFunction allocator;

  /** A memory deallocator function.  Default: free. */
  GumboDeallocatorFunction deallocator;

  /**
   * An opaque object that's passed in as the first argument to all callbacks
   * used by this library.  Default: NULL.
   */
  void* userdata;

  /**
   * The tab-stop size, for computing positions in source code that uses tabs.
   * Default: 8.
   */
  int tab_stop;

  /**
   * Whether or not to stop parsing when the first error is encountered.
   * Default: false.
   */
  bool stop_on_first_error;

  /**
   * The maximum number of errors before the parser stops recording them.  This
   * is provided so that if the page is totally borked, we don't completely fill
   * up the errors vector and exhaust memory with useless redundant errors.  Set
   * to -1 to disable the limit.
   * Default: -1
   */
  int max_errors;
}

/** Default options struct; use this with gumbo_parse_with_options. */
extern const GumboOptions kGumboDefaultOptions;

/** The output struct containing the results of the parse. */
struct GumboOutput {
  /**
   * Pointer to the document node.  This is a GumboNode of type NODE_DOCUMENT
   * that contains the entire document as its child.
   */
  GumboNode* document;

  /**
   * Pointer to the root node.  This is the <html> tag that forms the root of
   * the document.
   */
  GumboNode* root;

  /**
   * A list of errors that occurred during the parse.
   * NOTE: In version 1.0 of this library, the API for errors hasn't been fully
   * fleshed out and may change in the future.  For this reason, the GumboError
   * header isn't part of the public API.  Contact us if you need errors
   * reported so we can work out something appropriate for your use-case.
   */
  GumboVector /* GumboError */ errors;
}

/**
 * Parses a buffer of UTF8 text into a GumboNode parse tree.  The buffer must
 * live at least as long as the parse tree, as some fields (eg. original_text)
 * point directly into the original buffer.
 *
 * This doesn't support buffers longer than 4 gigabytes.
 */
GumboOutput* gumbo_parse(const char* buffer);

/**
 * Extended version of gumbo_parse that takes an explicit options structure,
 * buffer, and length.
 */
GumboOutput* gumbo_parse_with_options(
    const GumboOptions* options, const char* buffer, size_t buffer_length);

/** Release the memory used for the parse tree & parse errors. */
void gumbo_destroy_output(
    const GumboOptions* options, GumboOutput* output);

}