Package jodd.lagarto

Class LagartoParser


  • public class LagartoParser
    extends java.lang.Object
    HTML/XML content parser/tokenizer that uses TagVisitor for callbacks. Tokenization follows the HTML5 specification, as described by WHATWG. Differences from the specification:
    • text is emitted as a block of text, and not character by character.
    • the case of tag names (and the letter case of other entities) is not changed, but case-sensitive information exists for matching.
    • the whole tokenization process is implemented here, without going into tree building. This applies to switching to the RAWTEXT state.
    • script tag is emitted separately
    • conditional comments added
    • xml states and callbacks added
    • Field Detail

      • tag

        protected jodd.lagarto.ParsedTag tag
      • doctype

        protected jodd.lagarto.ParsedDoctype doctype
      • in

        protected final jodd.lagarto.CharsInput in
      • parsing

        protected boolean parsing
      • DATA_STATE

        protected State DATA_STATE
        Data state.
      • TAG_OPEN

        protected State TAG_OPEN
      • END_TAG_OPEN

        protected State END_TAG_OPEN
      • TAG_NAME

        protected State TAG_NAME
      • BEFORE_ATTRIBUTE_NAME

        protected State BEFORE_ATTRIBUTE_NAME
      • ATTRIBUTE_NAME

        protected State ATTRIBUTE_NAME
      • AFTER_ATTRIBUTE_NAME

        protected State AFTER_ATTRIBUTE_NAME
      • BEFORE_ATTRIBUTE_VALUE

        protected State BEFORE_ATTRIBUTE_VALUE
      • ATTR_VALUE_UNQUOTED

        protected State ATTR_VALUE_UNQUOTED
      • ATTR_VALUE_SINGLE_QUOTED

        protected State ATTR_VALUE_SINGLE_QUOTED
      • ATTR_VALUE_DOUBLE_QUOTED

        protected State ATTR_VALUE_DOUBLE_QUOTED
      • AFTER_ATTRIBUTE_VALUE_QUOTED

        protected State AFTER_ATTRIBUTE_VALUE_QUOTED
      • SELF_CLOSING_START_TAG

        protected State SELF_CLOSING_START_TAG
      • BOGUS_COMMENT

        protected State BOGUS_COMMENT
      • MARKUP_DECLARATION_OPEN

        protected State MARKUP_DECLARATION_OPEN
      • rawTextStart

        protected int rawTextStart
      • rawTextEnd

        protected int rawTextEnd
      • rawTagName

        protected char[] rawTagName
      • RAWTEXT

        protected State RAWTEXT
      • RAWTEXT_LESS_THAN_SIGN

        protected State RAWTEXT_LESS_THAN_SIGN
      • RAWTEXT_END_TAG_OPEN

        protected State RAWTEXT_END_TAG_OPEN
      • RAWTEXT_END_TAG_NAME

        protected State RAWTEXT_END_TAG_NAME
      • rcdataTagStart

        protected int rcdataTagStart
      • rcdataTagName

        protected char[] rcdataTagName
      • RCDATA

        protected State RCDATA
      • RCDATA_LESS_THAN_SIGN

        protected State RCDATA_LESS_THAN_SIGN
      • RCDATA_END_TAG_OPEN

        protected State RCDATA_END_TAG_OPEN
      • RCDATA_END_TAG_NAME

        protected State RCDATA_END_TAG_NAME
      • commentStart

        protected int commentStart
      • COMMENT_START

        protected State COMMENT_START
      • COMMENT_START_DASH

        protected State COMMENT_START_DASH
      • COMMENT

        protected State COMMENT
      • COMMENT_END_DASH

        protected State COMMENT_END_DASH
      • COMMENT_END

        protected State COMMENT_END
      • COMMENT_END_BANG

        protected State COMMENT_END_BANG
      • DOCTYPE

        protected State DOCTYPE
      • BEFORE_DOCTYPE_NAME

        protected State BEFORE_DOCTYPE_NAME
      • DOCTYPE_NAME

        protected State DOCTYPE_NAME
      • AFTER_DOCUMENT_NAME

        protected State AFTER_DOCUMENT_NAME
      • doctypeIdNameStart

        protected int doctypeIdNameStart
      • AFTER_DOCTYPE_PUBLIC_KEYWORD

        protected State AFTER_DOCTYPE_PUBLIC_KEYWORD
      • BEFORE_DOCTYPE_PUBLIC_IDENTIFIER

        protected State BEFORE_DOCTYPE_PUBLIC_IDENTIFIER
      • DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED

        protected State DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED
      • DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED

        protected State DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED
      • AFTER_DOCTYPE_PUBLIC_IDENTIFIER

        protected State AFTER_DOCTYPE_PUBLIC_IDENTIFIER
      • BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS

        protected State BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS
      • BOGUS_DOCTYPE

        protected State BOGUS_DOCTYPE
      • AFTER_DOCTYPE_SYSTEM_KEYWORD

        protected State AFTER_DOCTYPE_SYSTEM_KEYWORD
      • BEFORE_DOCTYPE_SYSTEM_IDENTIFIER

        protected State BEFORE_DOCTYPE_SYSTEM_IDENTIFIER
      • DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED

        protected State DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED
      • DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED

        protected State DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED
      • AFTER_DOCTYPE_SYSTEM_IDENTIFIER

        protected State AFTER_DOCTYPE_SYSTEM_IDENTIFIER
      • scriptStartNdx

        protected int scriptStartNdx
      • scriptEndNdx

        protected int scriptEndNdx
      • scriptEndTagName

        protected int scriptEndTagName
      • SCRIPT_DATA

        protected State SCRIPT_DATA
      • SCRIPT_DATA_LESS_THAN_SIGN

        protected State SCRIPT_DATA_LESS_THAN_SIGN
      • SCRIPT_DATA_END_TAG_OPEN

        protected State SCRIPT_DATA_END_TAG_OPEN
      • SCRIPT_DATA_END_TAG_NAME

        protected State SCRIPT_DATA_END_TAG_NAME
      • text

        protected char[] text
      • textLen

        protected int textLen
      • attrStartNdx

        protected int attrStartNdx
      • attrEndNdx

        protected int attrEndNdx
      • state

        protected State state
    • Constructor Detail

      • LagartoParser

        public LagartoParser​(LagartoParserConfig parserConfig,
                             char[] input)
        Creates parser on char array.
      • LagartoParser

        public LagartoParser​(char[] input)
        Creates parser on char array.
      • LagartoParser

        public LagartoParser​(LagartoParserConfig parserConfig,
                             java.lang.CharSequence input)
        Creates parser on a char sequence.
      • LagartoParser

        public LagartoParser​(java.lang.CharSequence input)
        Creates parser on a char sequence.
    • Method Detail

      • initialize

        protected void initialize()
        Initializes parser.
      • parse

        public void parse​(TagVisitor visitor)
        Parses content and emits events to the provided TagVisitor.
      • consumeCharacterReference

        protected void consumeCharacterReference​(char allowedChar)
      • consumeCharacterReference

        protected void consumeCharacterReference()
      • textEmitChar

        protected void textEmitChar​(char c)
        Emits a character into the local text buffer.
      • textStart

        protected void textStart()
        Resets text buffer.
      • textEmitChars

        protected void textEmitChars​(int from,
                                     int to)
      • textEmitChars

        protected void textEmitChars​(char[] buffer)
      • textWrap

        protected java.lang.CharSequence textWrap()
      • emitTag

        protected void emitTag()
      • emitComment

        protected void emitComment​(int from,
                                   int to)
        Emits a comment. Also checks for conditional comments!
      • emitText

        protected void emitText()
        Emits text if there is some content.
      • emitScript

        protected void emitScript​(int from,
                                  int to)
      • emitDoctype

        protected void emitDoctype()
      • emitXml

        protected void emitXml()
      • emitCData

        protected void emitCData​(java.lang.CharSequence charSequence)
      • errorEOF

        protected void errorEOF()
      • errorInvalidToken

        protected void errorInvalidToken()
      • errorCharReference

        protected void errorCharReference()
      • _error

        protected void _error​(java.lang.String message)
        Prepares error message and reports it to the visitor.