/*
 * Decompiled with CFR 0.152.
 */
package org.archive.modules.extractor;

import com.google.common.base.Ascii;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLDecoder;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.io.IOUtils;
import org.archive.io.ReplayCharSequence;
import org.archive.modules.CrawlMetadata;
import org.archive.modules.CrawlURI;
import org.archive.modules.extractor.ContentExtractor;
import org.archive.modules.extractor.ExtractorCSS;
import org.archive.modules.extractor.ExtractorJS;
import org.archive.modules.extractor.HTMLLinkContext;
import org.archive.modules.extractor.Hop;
import org.archive.modules.net.RobotsPolicy;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.util.DevUtils;
import org.archive.util.TextUtils;
import org.archive.util.UriUtils;
import org.springframework.beans.factory.InitializingBean;
import org.springframework.beans.factory.annotation.Autowired;

public class ExtractorHTML
extends ContentExtractor
implements InitializingBean {
    private static final long serialVersionUID = 2L;
    private static Logger logger = Logger.getLogger(ExtractorHTML.class.getName());
    // Placeholder tokens inside the regex templates below; afterPropertiesSet()
    // replaces them with the configured numeric limits.
    private static final String MAX_ELEMENT_REPLACE = "MAX_ELEMENT";
    private static final String MAX_ATTR_NAME_REPLACE = "MAX_ATTR_NAME";
    private static final String MAX_ATTR_VAL_REPLACE = "MAX_ATTR_VAL";
    // Key under which processMeta() stores the content of a robots meta tag in the CrawlURI data map.
    public static final String A_META_ROBOTS = "meta-robots";
    // Key of the CrawlURI data list to which extract() appends the offset of each <form> tag.
    public static final String A_FORM_OFFSETS = "form-offsets";
    // Separator used to split the keyword list of a link element's rel attribute.
    private static final Pattern ASCII_WHITESPACE = Pattern.compile("[\t\n\f\r ]+");
    // Template for the tag-level regex. Capture groups:
    //   1: whole script element up to "</script"   2: script open tag (name + attributes)
    //   3: whole style element up to "</style"     4: style open tag (name + attributes)
    //   5: any other tag (name + attributes)       6: that tag's element name
    //   7: set when the element name is "meta"     8: an HTML comment
    static final String RELEVANT_TAG_EXTRACTOR = "(?is)<(?:((script[^>]*+)>.*?</script)|((style[^>]*+)>.*?</style)|(((meta)|(?:\\w{1,MAX_ELEMENT}))\\s+[^>]*+)|(!--(?!\\[if|>).*?--))>";
    // Template for the attribute-level regex. Capture groups:
    //   1: attribute name (whichever alternative matched)
    //   2: href/cite   3: action   4: on* handlers   5: src-like attributes
    //   6: codebase    7: classid/data   8: archive   9: code   10: value
    //   11: style      12: method        13: any other attribute name
    //   14: double-quoted value   15: single-quoted value   16: unquoted value
    static final String EACH_ATTRIBUTE_EXTRACTOR = "(?is)\\s?((href|(?:cite))|(action)|(on\\w*)|((?:src)|(?:srcset)|(?:lowsrc)|(?:background)|(?:longdesc)|(?:usemap)|(?:profile)|(?:datasrc)|(?:data-src)|(?:data-srcset)|(?:data-original)|(?:data-original-set))|(codebase)|((?:classid)|(?:data))|(archive)|(code)|(value)|(style)|(method)|([-\\w]{1,MAX_ATTR_NAME}))\\s*=\\s*(?:(?:\"(.{0,MAX_ATTR_VAL}?)(?:\"|$))|(?:'(.{0,MAX_ATTR_VAL}?)(?:'|$))|(\\S{1,MAX_ATTR_VAL}))";
    // Regex used to split a whitespace-separated attribute value (e.g. "archive").
    static final String WHITESPACE = "\\s";
    // Suffix appended to an applet "code" value that does not already end with it.
    static final String CLASSEXT = ".class";
    // Element names that receive special handling in processGeneralTag().
    static final String APPLET = "applet";
    static final String BASE = "base";
    static final String LINK = "link";
    static final String FRAME = "frame";
    static final String IFRAME = "iframe";
    // Source of the robots policy consulted when a robots meta tag is found.
    protected CrawlMetadata metadata;
    // Optional JavaScript extractor used for script bodies, on* handlers and javascript: links.
    protected transient ExtractorJS extractorJS;
    // Concrete regexes built from the templates above by afterPropertiesSet().
    private String relevantTagPattern;
    private String eachAttributePattern;
    // Matches attribute values that are javascript: pseudo-URLs.
    static final String JAVASCRIPT = "(?i)^javascript:.*";
    // Path extensions for which isHtmlExpectedHere() reports that HTML is not expected.
    static final String NON_HTML_PATH_EXTENSION = "(?i)(gif)|(jp(e)?g)|(png)|(tif(f)?)|(bmp)|(avi)|(mov)|(mp(e)?g)|(mp3)|(mp4)|(swf)|(wav)|(au)|(aiff)|(mid)";

    /**
     * @return the maximum element-name length substituted into the tag regex
     *         by {@link #afterPropertiesSet()}
     */
    public int getMaxElementLength() {
        return (Integer) kp.get("maxElementLength");
    }

    /** @param max maximum element-name length to store under "maxElementLength" */
    public void setMaxElementLength(int max) {
        kp.put("maxElementLength", max);
    }

    /**
     * @return the maximum attribute-name length substituted into the
     *         attribute regex by {@link #afterPropertiesSet()}
     */
    public int getMaxAttributeNameLength() {
        return (Integer) kp.get("maxAttributeNameLength");
    }

    /** @param max maximum attribute-name length to store under "maxAttributeNameLength" */
    public void setMaxAttributeNameLength(int max) {
        kp.put("maxAttributeNameLength", max);
    }

    /**
     * @return the maximum attribute-value length substituted into the
     *         attribute regex by {@link #afterPropertiesSet()}
     */
    public int getMaxAttributeValLength() {
        return (Integer) kp.get("maxAttributeValLength");
    }

    /** @param max maximum attribute-value length to store under "maxAttributeValLength" */
    public void setMaxAttributeValLength(int max) {
        kp.put("maxAttributeValLength", max);
    }

    /**
     * @return whether frame/iframe sources are emitted with the embed hop;
     *         when false they are emitted as navigation links
     */
    public boolean getTreatFramesAsEmbedLinks() {
        return (Boolean) kp.get("treatFramesAsEmbedLinks");
    }

    /** @param asEmbeds value to store under "treatFramesAsEmbedLinks" */
    public void setTreatFramesAsEmbedLinks(boolean asEmbeds) {
        kp.put("treatFramesAsEmbedLinks", asEmbeds);
    }

    /** @return whether "action" attributes are skipped entirely */
    public boolean getIgnoreFormActionUrls() {
        return (Boolean) kp.get("ignoreFormActionUrls");
    }

    /** @param ignoreActions value to store under "ignoreFormActionUrls" */
    public void setIgnoreFormActionUrls(boolean ignoreActions) {
        kp.put("ignoreFormActionUrls", ignoreActions);
    }

    /**
     * @return whether form actions are only followed when the tag's method is
     *         absent or GET
     */
    public boolean getExtractOnlyFormGets() {
        return (Boolean) kp.get("extractOnlyFormGets");
    }

    /** @param onlyGets value to store under "extractOnlyFormGets" */
    public void setExtractOnlyFormGets(boolean onlyGets) {
        kp.put("extractOnlyFormGets", onlyGets);
    }

    /** @return whether script code is handed to the configured ExtractorJS */
    public boolean getExtractJavascript() {
        return (Boolean) kp.get("extractJavascript");
    }

    /** @param extractJavascript value to store under "extractJavascript" */
    public void setExtractJavascript(boolean extractJavascript) {
        kp.put("extractJavascript", extractJavascript);
    }

    /** @return whether "value" attributes are examined for likely URIs */
    public boolean getExtractValueAttributes() {
        return (Boolean) kp.get("extractValueAttributes");
    }

    /** @param extractValueAttributes value to store under "extractValueAttributes" */
    public void setExtractValueAttributes(boolean extractValueAttributes) {
        kp.put("extractValueAttributes", extractValueAttributes);
    }

    /**
     * @return whether {@link #shouldExtract(CrawlURI)} first consults
     *         {@link #isHtmlExpectedHere(CrawlURI)}
     */
    public boolean getIgnoreUnexpectedHtml() {
        return (Boolean) kp.get("ignoreUnexpectedHtml");
    }

    /** @param ignoreUnexpectedHtml value to store under "ignoreUnexpectedHtml" */
    public void setIgnoreUnexpectedHtml(boolean ignoreUnexpectedHtml) {
        kp.put("ignoreUnexpectedHtml", ignoreUnexpectedHtml);
    }

    /** @return the crawl metadata whose robots policy is consulted for robots meta tags */
    public CrawlMetadata getMetadata() {
        return metadata;
    }

    /** @param provider crawl metadata to use; injected by Spring */
    @Autowired
    public void setMetadata(CrawlMetadata provider) {
        metadata = provider;
    }

    /** @return the JavaScript extractor used for script code, or null if none is set */
    public ExtractorJS getExtractorJS() {
        return extractorJS;
    }

    /** @param extractorJS JavaScript extractor to delegate script code to; injected by Spring */
    @Autowired
    public void setExtractorJS(ExtractorJS extractorJS) {
        this.extractorJS = extractorJS;
    }

    /**
     * Creates an extractor with default settings: length limits of 64
     * (element name), 64 (attribute name) and 2048 (attribute value);
     * form-action URLs are not ignored; every other boolean option is enabled.
     */
    public ExtractorHTML() {
        // Regex length limits.
        setMaxElementLength(64);
        setMaxAttributeNameLength(64);
        setMaxAttributeValLength(2048);
        // Behavioural switches.
        setTreatFramesAsEmbedLinks(true);
        setIgnoreFormActionUrls(false);
        setExtractOnlyFormGets(true);
        setExtractJavascript(true);
        setExtractValueAttributes(true);
        setIgnoreUnexpectedHtml(true);
    }

    /**
     * Builds the concrete tag and attribute regexes by substituting the
     * configured length limits for the placeholder tokens in the templates.
     */
    public void afterPropertiesSet() {
        String elementLimit = Integer.toString(getMaxElementLength());
        relevantTagPattern = RELEVANT_TAG_EXTRACTOR.replace(MAX_ELEMENT_REPLACE, elementLimit);

        String attrNameLimit = Integer.toString(getMaxAttributeNameLength());
        String withNameLimit = EACH_ATTRIBUTE_EXTRACTOR.replace(MAX_ATTR_NAME_REPLACE, attrNameLimit);
        String attrValLimit = Integer.toString(getMaxAttributeValLength());
        eachAttributePattern = withNameLimit.replace(MAX_ATTR_VAL_REPLACE, attrValLimit);
    }

    /**
     * Walks every attribute of one tag and emits links/embeds for those that
     * carry URIs.
     *
     * <p>Attributes are classified by which capture group of the attribute
     * regex matched (href/cite, action, on* handlers, src-like, codebase,
     * classid/data, archive, code, value, style, method, anything else).
     * Some results can only be acted on once the whole tag has been seen
     * (applet/object resources relative to a codebase, {@code link} tags that
     * need both href and rel, form actions that depend on the method, and
     * value attributes that depend on the name), so those are collected
     * during the scan and handled after it.
     *
     * @param curi    URI being processed; receives discovered outlinks
     * @param element element name of the tag
     * @param cs      the tag text to scan for attributes (element name plus attributes)
     */
    protected void processGeneralTag(CrawlURI curi, CharSequence element, CharSequence cs) {
        Matcher attr = TextUtils.getMatcher(this.eachAttributePattern, cs);
        String codebase = null;
        ArrayList<String> resources = null;
        CharSequence action = null;
        CharSequence actionContext = null;
        CharSequence method = null;
        CharSequence valueVal = null;
        CharSequence valueContext = null;
        CharSequence nameVal = null;
        CharSequence linkHref = null;
        CharSequence linkRel = null;
        boolean framesAsEmbeds = this.getTreatFramesAsEmbedLinks();
        boolean ignoreFormActions = this.getIgnoreFormActionUrls();
        boolean extractValueAttributes = this.getExtractValueAttributes();
        String elementStr = element.toString();
        // The hop for src-like attributes depends only on the element, so decide it once.
        boolean isFrameElement = elementStr.equalsIgnoreCase(FRAME) || elementStr.equalsIgnoreCase(IFRAME);
        Hop embedHop = !framesAsEmbeds && isFrameElement ? Hop.NAVLINK : Hop.EMBED;
        while (attr.find()) {
            // Exactly one of groups 14/15/16 holds the value (double-quoted, single-quoted, bare).
            int valueGroup = attr.start(14) > -1 ? 14 : (attr.start(15) > -1 ? 15 : 16);
            int start = attr.start(valueGroup);
            int end = attr.end(valueGroup);
            assert (start >= 0) : "Start is: " + start + ", " + curi;
            assert (end >= 0) : "End is :" + end + ", " + curi;
            CharSequence attrName = cs.subSequence(attr.start(1), attr.end(1));
            CharSequence value = TextUtils.unescapeHtml(cs.subSequence(start, end));
            if (attr.start(2) > -1) {
                // href / cite.
                // The back-reference must directly follow "true" so that only an exact
                // data-remote="true" (or 'true') matches, whatever attributes follow it.
                boolean remoteAnchor = "a".equalsIgnoreCase(elementStr)
                        && TextUtils.matches("(?i).*data-remote\\s*=\\s*([\"'])true\\1.*", cs);
                CharSequence context = remoteAnchor
                        ? "a[data-remote='true']/@href"
                        : ExtractorHTML.elementContext(element, attr.group(2));
                if (elementStr.equalsIgnoreCase(LINK)) {
                    // Decided after the loop, once rel is known.
                    linkHref = value;
                } else if (remoteAnchor) {
                    this.processEmbed(curi, value, context);
                } else {
                    this.processLink(curi, value, context);
                }
                // First <base href> seen also becomes the base URI for relative links.
                if (elementStr.equalsIgnoreCase(BASE) && !curi.containsDataKey("html-base-href")) {
                    try {
                        UURI base = UURIFactory.getInstance(curi.getUURI(), value.toString());
                        curi.setBaseURI(base);
                    }
                    catch (URIException e) {
                        this.logUriError(e, curi.getUURI(), value);
                    }
                }
            } else if (attr.start(3) > -1) {
                // action: decided after the loop, once method is known.
                if (!ignoreFormActions) {
                    action = value;
                    actionContext = ExtractorHTML.elementContext(element, attr.group(3));
                }
            } else if (attr.start(4) > -1) {
                // on* event handler.
                this.processScriptCode(curi, value);
            } else if (attr.start(5) > -1) {
                // src-like attribute. Inline data: URIs carry their own content and
                // are not fetchable links, so skip them (check the value, not the context).
                if (value.toString().toLowerCase(Locale.ROOT).startsWith("data:")) {
                    continue;
                }
                CharSequence context = ExtractorHTML.elementContext(element, attr.group(5));
                this.processEmbed(curi, value, context, embedHop);
            } else if (attr.start(6) > -1) {
                // codebase: a link itself and the base for classid/data/archive/code resources.
                codebase = value.toString();
                CharSequence context = ExtractorHTML.elementContext(element, attr.group(6));
                this.processLink(curi, codebase, context);
            } else if (attr.start(7) > -1) {
                // classid / data.
                if (resources == null) {
                    resources = new ArrayList<String>();
                }
                resources.add(value.toString());
            } else if (attr.start(8) > -1) {
                // archive: whitespace-separated list of resources.
                if (resources == null) {
                    resources = new ArrayList<String>();
                }
                for (String archive : TextUtils.split(WHITESPACE, value)) {
                    resources.add(archive);
                }
            } else if (attr.start(9) > -1) {
                // code: applets may omit the ".class" suffix.
                if (resources == null) {
                    resources = new ArrayList<String>();
                }
                String code = value.toString();
                if (elementStr.equalsIgnoreCase(APPLET) && !code.toLowerCase().endsWith(CLASSEXT)) {
                    resources.add(code + CLASSEXT);
                } else {
                    resources.add(code);
                }
            } else if (attr.start(10) > -1) {
                // value: decided after the loop, once name is known.
                valueVal = value;
                valueContext = ExtractorHTML.elementContext(element, attr.group(10));
            } else if (attr.start(11) > -1) {
                // inline style.
                this.numberOfLinksExtracted.addAndGet(ExtractorCSS.processStyleCode(this, curi, value));
            } else if (attr.start(12) > -1) {
                method = value;
            } else if (attr.start(13) > -1) {
                // Any other attribute.
                if (Ascii.equalsIgnoreCase(attrName, "NAME")) {
                    nameVal = value;
                } else if (Ascii.equalsIgnoreCase(attrName, "FLASHVARS")) {
                    valueContext = ExtractorHTML.elementContext(element, attr.group(13));
                    this.considerQueryStringValues(curi, value, valueContext, Hop.SPECULATIVE);
                } else if (Ascii.equalsIgnoreCase(attrName, "REL")) {
                    linkRel = value;
                }
                // Lazy-loading data-* attributes are treated like src attributes.
                if (TextUtils.matches("data-(src|src-small|src-medium|srcset|original|original-set|lazy|lazy-srcset|full-src)", attr.group(13).toLowerCase())) {
                    CharSequence context = ExtractorHTML.elementContext(element, attr.group(13));
                    this.processEmbed(curi, value, context, embedHop);
                }
            }
        }
        TextUtils.recycleMatcher(attr);
        if (resources != null) {
            UURI codebaseURI = null;
            String res = null;
            try {
                if (codebase != null) {
                    codebaseURI = UURIFactory.getInstance(curi.getUURI(), codebase);
                }
                for (String resource : resources) {
                    res = resource;
                    res = TextUtils.unescapeHtml(res).toString();
                    if (codebaseURI != null) {
                        res = codebaseURI.resolve(res).toString();
                    }
                    this.processEmbed(curi, res, element);
                }
            }
            catch (URIException e) {
                curi.getNonFatalFailures().add(e);
            }
            catch (IllegalArgumentException e) {
                DevUtils.logger.log(Level.WARNING, "processGeneralTag()\ncodebase=" + codebase + " res=" + res + "\n" + DevUtils.extraInfo(), e);
            }
        }
        if (linkHref != null && linkRel != null) {
            this.processLinkTagWithRel(curi, linkHref, linkRel);
        }
        if (action != null && (method == null || "GET".equalsIgnoreCase(method.toString()) || !this.getExtractOnlyFormGets())) {
            this.processLink(curi, action, actionContext);
        }
        if (valueVal != null) {
            if ("PARAM".equalsIgnoreCase(elementStr) && nameVal != null && "flashvars".equalsIgnoreCase(nameVal.toString())) {
                String queryStringLike = valueVal.toString();
                this.considerQueryStringValues(curi, queryStringLike, valueContext, Hop.SPECULATIVE);
            } else if (extractValueAttributes) {
                this.considerIfLikelyUri(curi, valueVal, valueContext, Hop.NAVLINK);
            }
        }
    }

    /**
     * Decides how to emit the href of a {@code link} element from the
     * keywords in its rel attribute.
     *
     * <p>Keywords are examined in order. The first of icon, stylesheet,
     * modulepreload, prefetch or prerender emits the href as an embed and
     * stops; pingback stops without emitting anything; dns-prefetch,
     * preconnect and empty tokens are ignored. If scanning completes and any
     * other keyword was seen, the href is emitted as a navigation link.
     *
     * @param curi URI being processed
     * @param href value of the href attribute
     * @param rel  value of the rel attribute
     */
    protected void processLinkTagWithRel(CrawlURI curi, CharSequence href, CharSequence rel) {
        boolean sawOtherKeyword = false;
        for (String token : ASCII_WHITESPACE.split(rel)) {
            String linkType = token.toLowerCase(Locale.ROOT);
            if (linkType.isEmpty() || "dns-prefetch".equals(linkType) || "preconnect".equals(linkType)) {
                continue;
            }
            if ("pingback".equals(linkType)) {
                return;
            }
            boolean embedType = "icon".equals(linkType)
                    || "stylesheet".equals(linkType)
                    || "modulepreload".equals(linkType)
                    || "prefetch".equals(linkType)
                    || "prerender".equals(linkType);
            if (embedType) {
                processEmbed(curi, href, "link[rel='" + linkType + "']/@href");
                return;
            }
            sawOtherKeyword = true;
        }
        if (sawOtherKeyword) {
            processLink(curi, href, "link/@href");
        }
    }

    /**
     * Treats the input as an {@code &}-separated list of {@code key=value}
     * pairs and offers each value as a possible URI.
     *
     * <p>Pairs that do not split into exactly two parts on {@code =} are
     * skipped. Values are URL-decoded as UTF-8; if decoding is rejected the
     * raw value is offered instead.
     *
     * @param curi         URI being processed
     * @param queryString  query-string-like text
     * @param valueContext link context to record for any emitted link
     * @param hop          hop type to record for any emitted link
     */
    protected void considerQueryStringValues(CrawlURI curi, CharSequence queryString, CharSequence valueContext, Hop hop) {
        String[] pairs = queryString.toString().split("&");
        for (int p = 0; p < pairs.length; p++) {
            String[] keyAndValue = pairs[p].split("=");
            if (keyAndValue.length != 2) {
                continue;
            }
            String rawValue = keyAndValue[1];
            try {
                String decoded = URLDecoder.decode(rawValue, "UTF-8");
                considerIfLikelyUri(curi, decoded, valueContext, hop);
            }
            catch (IllegalArgumentException e) {
                // Malformed escape sequence: fall back to the undecoded text.
                considerIfLikelyUri(curi, rawValue, valueContext, hop);
            }
            catch (UnsupportedEncodingException e) {
                throw new AssertionError((Object)("all jvms must support UTF-8, and yet somehow this happened: " + e));
            }
        }
    }

    /**
     * Adds the candidate as a link only if {@code UriUtils.isVeryLikelyUri}
     * accepts it.
     *
     * @param curi         URI being processed
     * @param candidate    text that may be a URI
     * @param valueContext link context to record
     * @param hop          hop type to record
     */
    protected void considerIfLikelyUri(CrawlURI curi, CharSequence candidate, CharSequence valueContext, Hop hop) {
        if (!UriUtils.isVeryLikelyUri(candidate)) {
            return;
        }
        addLinkFromString(curi, candidate, valueContext, hop);
    }

    /**
     * Hands script text to the configured JavaScript extractor and adds the
     * count it returns to the extracted-links counter. Does nothing when no
     * extractor is set or JavaScript extraction is disabled.
     *
     * @param curi URI being processed
     * @param cs   script text
     */
    protected void processScriptCode(CrawlURI curi, CharSequence cs) {
        ExtractorJS js = getExtractorJS();
        if (js == null || !getExtractJavascript()) {
            return;
        }
        numberOfLinksExtracted.addAndGet(js.considerStrings(this, curi, cs));
    }

    /**
     * Emits a navigation link for the value, unless it is a
     * {@code javascript:} pseudo-URL, in which case the code after the
     * scheme is processed as script instead.
     *
     * @param curi    URI being processed
     * @param value   attribute value
     * @param context link context to record
     */
    protected void processLink(CrawlURI curi, CharSequence value, CharSequence context) {
        boolean isJavascriptUrl = TextUtils.matches(JAVASCRIPT, value);
        if (isJavascriptUrl) {
            // 11 == "javascript:".length()
            CharSequence code = value.subSequence(11, value.length());
            processScriptCode(curi, code);
            return;
        }
        if (logger.isLoggable(Level.FINEST)) {
            logger.finest("link: " + value.toString() + " from " + curi);
        }
        addLinkFromString(curi, value, context, Hop.NAVLINK);
        numberOfLinksExtracted.incrementAndGet();
    }

    /**
     * Resolves the given URI text against the base and records it as an
     * outlink; a URI that cannot be built is reported via
     * {@code logUriError} rather than thrown.
     *
     * @param curi    URI being processed
     * @param uri     link text
     * @param context link context string, looked up via {@code HTMLLinkContext.get}
     * @param hop     hop type to record
     */
    protected void addLinkFromString(CrawlURI curi, CharSequence uri, CharSequence context, Hop hop) {
        String contextName = context.toString();
        try {
            HTMLLinkContext linkContext = HTMLLinkContext.get(contextName);
            int outlinkLimit = getExtractorParameters().getMaxOutlinks();
            ExtractorHTML.addRelativeToBase(curi, outlinkLimit, uri, linkContext, hop);
        }
        catch (URIException e) {
            logUriError(e, curi.getUURI(), uri);
        }
    }

    /**
     * Emits the value as an embed using {@code Hop.EMBED}.
     *
     * @param curi    URI being processed
     * @param value   attribute value
     * @param context link context to record
     */
    protected final void processEmbed(CrawlURI curi, CharSequence value, CharSequence context) {
        processEmbed(curi, value, context, Hop.EMBED);
    }

    /**
     * Emits the value as one or more links with the given hop.
     *
     * <p>When the context equals one of the srcset-style contexts listed
     * below, the value is parsed as a list of image candidates and each
     * candidate URL is emitted separately; otherwise the whole value is
     * emitted as a single link.
     *
     * @param curi    URI being processed
     * @param value   attribute value
     * @param context link context to record
     * @param hop     hop type to record
     */
    protected void processEmbed(CrawlURI curi, CharSequence value, CharSequence context, Hop hop) {
        if (logger.isLoggable(Level.FINEST)) {
            logger.finest("embed (" + hop.getHopChar() + "): " + value.toString() + " from " + curi);
        }
        Object[] listContexts = {
            HTMLLinkContext.IMG_SRCSET,
            HTMLLinkContext.IMG_DATA_SRC,
            HTMLLinkContext.SOURCE_SRCSET,
            HTMLLinkContext.IMG_DATA_SRCSET,
            HTMLLinkContext.SOURCE_DATA_SRCSET,
            HTMLLinkContext.SOURCE_DATA_LAZY_SRCSET,
            HTMLLinkContext.IMG_DATA_LAZY_SRCSET,
            HTMLLinkContext.IMG_DATA_SRC_MEDIUM,
            HTMLLinkContext.IMG_DATA_SRC_SMALL,
            HTMLLinkContext.IMG_DATA_ORIGINAL_SET,
            HTMLLinkContext.SOURCE_DATA_ORIGINAL_SET,
            HTMLLinkContext.LINK_IMAGESRCSET,
        };
        boolean isCandidateList = false;
        for (Object listContext : listContexts) {
            if (context.equals(listContext.toString())) {
                isCandidateList = true;
                break;
            }
        }
        if (!isCandidateList) {
            addLinkFromString(curi, value, context, hop);
            numberOfLinksExtracted.incrementAndGet();
            return;
        }
        logger.log(Level.FINE, "Found srcset listing: {0}", value);
        Matcher candidates = TextUtils.getMatcher("[\\s,]*(\\S*[^,\\s])(?:\\s(?:[^,(]+|\\([^)]*(?:\\)|$))*)?", value);
        while (candidates.lookingAt()) {
            CharSequence link = value.subSequence(candidates.start(1), candidates.end(1));
            // Continue matching right after the candidate just consumed.
            candidates.region(candidates.end(), candidates.regionEnd());
            logger.log(Level.FINER, "Found {0} adding to outlinks.", link);
            addLinkFromString(curi, link, context, hop);
            numberOfLinksExtracted.incrementAndGet();
        }
        TextUtils.recycleMatcher(candidates);
    }

    /**
     * Decides whether this extractor should run on the URI.
     *
     * <p>If unexpected HTML is being ignored and the path extension says
     * HTML is not expected, the answer is no. Otherwise the answer is yes
     * when the content type starts with one of the HTML/XHTML/WML types, or
     * when the first 1000 characters of content contain {@code <html} or
     * {@code <!doctype html} (case-insensitively).
     *
     * @param uri URI to test
     * @return true if extraction should be attempted
     */
    @Override
    protected boolean shouldExtract(CrawlURI uri) {
        if (getIgnoreUnexpectedHtml()) {
            try {
                boolean htmlPlausible = isHtmlExpectedHere(uri);
                if (!htmlPlausible) {
                    return false;
                }
            }
            catch (URIException e) {
                // Could not read the path; fall through to the content checks.
                logger.severe("Failed expectedHTML test: " + e.getMessage());
            }
        }
        String mime = uri.getContentType().toLowerCase();
        String[] htmlTypePrefixes = {
            "text/html",
            "application/xhtml",
            "text/vnd.wap.wml",
            "application/vnd.wap.wml",
            "application/vnd.wap.xhtml",
        };
        for (String prefix : htmlTypePrefixes) {
            if (mime.startsWith(prefix)) {
                return true;
            }
        }
        String contentPrefixLC = uri.getRecorder().getContentReplayPrefixString(1000).toLowerCase();
        if (contentPrefixLC.contains("<html")) {
            return true;
        }
        return contentPrefixLC.contains("<!doctype html");
    }

    /**
     * Runs link extraction over the recorded content of the URI.
     *
     * <p>When the content type carried no charset declaration, the first 1000
     * characters are searched for a charset declared in the document. If one
     * is found that differs from the recorder's charset, the prefix is
     * re-read in that charset and searched again; only if the same charset
     * is found is the recorder switched to it. Either outcome is annotated
     * on the URI.
     *
     * @param curi URI to extract from
     * @return true if the content could be read and was processed, false if
     *         obtaining the replay character sequence failed
     */
    @Override
    public boolean innerExtract(CrawlURI curi) {
        if (!curi.containsContentTypeCharsetDeclaration()) {
            String prefix = curi.getRecorder().getContentReplayPrefixString(1000);
            Charset declared = getContentDeclaredCharset(curi, prefix);
            boolean differsFromCurrent = !curi.getRecorder().getCharset().equals(declared);
            if (differsFromCurrent && declared != null) {
                // Re-decode with the declared charset and confirm it declares itself.
                String redecodedPrefix = curi.getRecorder().getContentReplayPrefixString(1000, declared);
                Charset confirmed = getContentDeclaredCharset(curi, redecodedPrefix);
                if (declared.equals(confirmed)) {
                    curi.getAnnotations().add("usingCharsetInHTML:" + declared);
                    curi.getRecorder().setCharset(declared);
                } else {
                    curi.getAnnotations().add("inconsistentCharsetInHTML:" + declared);
                }
            }
        }
        try {
            ReplayCharSequence content = curi.getRecorder().getContentReplayCharSequence();
            extract(curi, content);
            if (content.getDecodeExceptionCount() > 0L) {
                curi.getNonFatalFailures().add(content.getCodingException());
            }
        }
        catch (IOException e) {
            curi.getNonFatalFailures().add(e);
            logger.log(Level.WARNING, "Failed get of replay char sequence in " + Thread.currentThread().getName(), e);
            return false;
        }
        return true;
    }

    /**
     * Looks for a character encoding declared inside the document itself.
     *
     * <p>Three forms are tried in order: a {@code charset=} inside a
     * {@code <meta http-equiv="content-type" ...>} tag, a
     * {@code <meta ... charset="...">} tag, and the {@code encoding}
     * pseudo-attribute of an XML declaration.
     *
     * @param curi          URI being processed; annotated when the declared
     *                      name cannot be turned into a charset
     * @param contentPrefix leading portion of the content to search
     * @return the declared charset, or null if none is declared or the
     *         declared name is illegal or unsupported
     */
    protected Charset getContentDeclaredCharset(CrawlURI curi, String contentPrefix) {
        String charsetName = null;
        String metaContentType = ExtractorHTML.findAndRecycle("(?is)<meta\\s+[^>]*http-equiv\\s*=\\s*['\"]content-type['\"][^>]*>", contentPrefix, 0);
        if (metaContentType != null) {
            charsetName = ExtractorHTML.findAndRecycle("charset=([^'\";\\s>]+)", metaContentType, 1);
        }
        if (charsetName == null) {
            charsetName = ExtractorHTML.findAndRecycle("(?si)<meta\\s+[^>]*charset=['\"]([^'\";\\s>]+)['\"]", contentPrefix, 1);
        }
        if (charsetName == null) {
            charsetName = ExtractorHTML.findAndRecycle("(?is)<\\?xml\\s+[^>]*encoding=['\"]([^'\"]+)['\"]", contentPrefix, 1);
        }
        if (charsetName == null) {
            return null;
        }
        try {
            return Charset.forName(charsetName);
        }
        catch (IllegalArgumentException iae) {
            logger.log(Level.INFO, "Unknown content-encoding '" + charsetName + "' declared; using default");
            curi.getAnnotations().add("unsatisfiableCharsetInHTML:" + charsetName);
            return null;
        }
    }

    /**
     * Returns the requested group of the first match of the regex in the
     * input, or null if there is no match. The matcher is always handed back
     * to {@code TextUtils.recycleMatcher}, including when nothing matched.
     */
    private static String findAndRecycle(String regex, CharSequence input, int group) {
        Matcher matcher = TextUtils.getMatcher(regex, input);
        try {
            return matcher.find() ? matcher.group(group) : null;
        }
        finally {
            TextUtils.recycleMatcher(matcher);
        }
    }

    /**
     * Scans the content for relevant tags and dispatches each one: comments
     * are skipped, meta tags go to {@link #processMeta}, script and style
     * elements to {@link #processScript} and {@link #processStyle}, and
     * every other tag to {@link #processGeneralTag}. The offset of each
     * {@code form} tag is also appended to the URI's form-offsets list.
     *
     * <p>Scanning stops early when {@code processMeta} returns true or when
     * {@code Thread.interrupted()} reports an interrupt.
     *
     * @param curi URI being processed
     * @param cs   content to scan
     */
    protected void extract(CrawlURI curi, CharSequence cs) {
        Matcher tags = TextUtils.getMatcher(relevantTagPattern, cs);
        while (tags.find() && !Thread.interrupted()) {
            if (tags.start(8) > 0) {
                // HTML comment: nothing to extract.
            } else if (tags.start(7) > 0) {
                // <meta ...>
                int start = tags.start(5);
                int end = tags.end(5);
                assert (start >= 0) : "Start is: " + start + ", " + curi;
                assert (end >= 0) : "End is :" + end + ", " + curi;
                if (processMeta(curi, cs.subSequence(start, end))) {
                    break;
                }
            } else if (tags.start(5) > 0) {
                // Any other tag with attributes.
                int start5 = tags.start(5);
                int end5 = tags.end(5);
                assert (start5 >= 0) : "Start is: " + start5 + ", " + curi;
                assert (end5 >= 0) : "End is :" + end5 + ", " + curi;
                int start6 = tags.start(6);
                int end6 = tags.end(6);
                assert (start6 >= 0) : "Start is: " + start6 + ", " + curi;
                assert (end6 >= 0) : "End is :" + end6 + ", " + curi;
                String element = cs.subSequence(start6, end6).toString();
                CharSequence attributes = cs.subSequence(start5, end5);
                processGeneralTag(curi, element, attributes);
                if ("form".equalsIgnoreCase(element)) {
                    // start6 - 1 is the position of the opening '<'.
                    curi.getDataList(A_FORM_OFFSETS).add(start6 - 1);
                }
            } else if (tags.start(1) > 0) {
                // <script ...> ... </script
                int start = tags.start(1);
                int end = tags.end(1);
                assert (start >= 0) : "Start is: " + start + ", " + curi;
                assert (end >= 0) : "End is :" + end + ", " + curi;
                assert (tags.end(2) >= 0) : "Tags.end(2) illegal " + tags.end(2) + ", " + curi;
                processScript(curi, cs.subSequence(start, end), tags.end(2) - start);
            } else if (tags.start(3) > 0) {
                // <style ...> ... </style
                int start = tags.start(3);
                int end = tags.end(3);
                assert (start >= 0) : "Start is: " + start + ", " + curi;
                assert (end >= 0) : "End is :" + end + ", " + curi;
                assert (tags.end(4) >= 0) : "Tags.end(4) illegal " + tags.end(4) + ", " + curi;
                processStyle(curi, cs.subSequence(start, end), tags.end(4) - start);
            }
        }
        TextUtils.recycleMatcher(tags);
    }

    /**
     * Guesses from the URI path whether HTML content is plausible.
     *
     * <p>HTML is considered unexpected only when the path ends in a dot
     * followed by at most four characters that match one of the known
     * non-HTML extensions.
     *
     * @param curi URI to test
     * @return false if the path extension indicates non-HTML content, true otherwise
     * @throws URIException if the path cannot be obtained from the URI
     */
    protected boolean isHtmlExpectedHere(CrawlURI curi) throws URIException {
        String path = curi.getUURI().getPath();
        if (path == null) {
            return true;
        }
        int dot = path.lastIndexOf('.');
        boolean noShortExtension = dot < 0 || dot < path.length() - 5;
        if (noShortExtension) {
            return true;
        }
        String extension = path.substring(dot + 1);
        boolean knownNonHtml = TextUtils.matches(NON_HTML_PATH_EXTENSION, extension);
        return !knownNonHtml;
    }

    /**
     * Handles a script element: the open tag's attributes are processed like
     * any other tag, then the remainder is processed as script code.
     *
     * @param curi         URI being processed
     * @param sequence     the element text, starting at the element name
     * @param endOfOpenTag offset within {@code sequence} where the open tag ends
     */
    protected void processScript(CrawlURI curi, CharSequence sequence, int endOfOpenTag) {
        // The first six characters are the element name "script".
        CharSequence elementName = sequence.subSequence(0, 6);
        CharSequence openTag = sequence.subSequence(0, endOfOpenTag);
        processGeneralTag(curi, elementName, openTag);

        CharSequence body = sequence.subSequence(endOfOpenTag, sequence.length());
        processScriptCode(curi, body);
    }

    /**
     * Handles a meta tag.
     *
     * <p>A robots meta tag has its content stored on the URI under
     * {@link #A_META_ROBOTS}; if the robots policy obeys nofollow and the
     * content contains "nofollow" or "none", extraction is to be abandoned.
     * A refresh meta tag has the text after the first {@code =} (with quotes
     * removed) added as a link. Any other meta content is added as a
     * speculative link when it looks very much like a URI.
     *
     * @param curi URI being processed
     * @param cs   the tag text to scan for attributes
     * @return true if further extraction should be skipped, false otherwise
     */
    protected boolean processMeta(CrawlURI curi, CharSequence cs) {
        String name = null;
        String httpEquiv = null;
        String content = null;
        Matcher attr = TextUtils.getMatcher(eachAttributePattern, cs);
        while (attr.find()) {
            // Value is in group 14 (double-quoted), 15 (single-quoted) or 16 (bare).
            int valueGroup;
            if (attr.start(14) > -1) {
                valueGroup = 14;
            } else if (attr.start(15) > -1) {
                valueGroup = 15;
            } else {
                valueGroup = 16;
            }
            CharSequence raw = cs.subSequence(attr.start(valueGroup), attr.end(valueGroup));
            CharSequence value = TextUtils.unescapeHtml(raw);
            if (attr.group(1).equalsIgnoreCase("name")) {
                name = value.toString();
            } else if (attr.group(1).equalsIgnoreCase("http-equiv")) {
                httpEquiv = value.toString();
            } else if (attr.group(1).equalsIgnoreCase("content")) {
                content = value.toString();
            }
        }
        TextUtils.recycleMatcher(attr);

        if (content == null) {
            // Nothing below applies without a content attribute.
            return false;
        }
        if ("robots".equalsIgnoreCase(name)) {
            curi.getData().put(A_META_ROBOTS, content);
            RobotsPolicy policy = metadata.getRobotsPolicy();
            String contentLower = content.toLowerCase();
            if (policy.obeyMetaRobotsNofollow() && (contentLower.contains("nofollow") || contentLower.contains("none"))) {
                logger.fine("HTML extraction skipped due to robots meta-tag for: " + curi.toString());
                return true;
            }
            return false;
        }
        if ("refresh".equalsIgnoreCase(httpEquiv)) {
            int equalsAt = content.indexOf("=");
            if (equalsAt >= 0) {
                String refreshUri = TextUtils.replaceAll("[\"']", content.substring(equalsAt + 1), "");
                try {
                    int max = getExtractorParameters().getMaxOutlinks();
                    ExtractorHTML.addRelativeToBase(curi, max, refreshUri, HTMLLinkContext.META, Hop.REFER);
                }
                catch (URIException e) {
                    logUriError(e, curi.getUURI(), refreshUri);
                }
            }
            return false;
        }
        try {
            CharSequence fixedUp = UriUtils.speculativeFixup(content, curi.getUURI());
            if (UriUtils.isVeryLikelyUri(fixedUp)) {
                int max = getExtractorParameters().getMaxOutlinks();
                ExtractorHTML.addRelativeToBase(curi, max, content, HTMLLinkContext.META, Hop.SPECULATIVE);
            }
        }
        catch (URIException e) {
            logUriError(e, curi.getUURI(), content);
        }
        return false;
    }

    /**
     * Handles a style element: the open tag's attributes are processed like
     * any other tag, then the remainder is processed as CSS and the link
     * count returned by the CSS extractor is added to the counter.
     *
     * @param curi         URI being processed
     * @param sequence     the element text, starting at the element name
     * @param endOfOpenTag offset within {@code sequence} where the open tag ends
     */
    protected void processStyle(CrawlURI curi, CharSequence sequence, int endOfOpenTag) {
        // "style" is five characters long; taking six would append the following
        // whitespace or attribute character to the element name and so corrupt
        // every link context derived from it.
        this.processGeneralTag(curi, sequence.subSequence(0, 5), sequence.subSequence(0, endOfOpenTag));
        this.numberOfLinksExtracted.addAndGet(ExtractorCSS.processStyleCode(this, curi, sequence.subSequence(endOfOpenTag, sequence.length())));
    }

    /**
     * Builds the lower-cased link-context string {@code element/@attribute}.
     *
     * @param element   element name
     * @param attribute attribute name, may be null
     * @return the context string, or the empty string when {@code attribute} is null
     */
    public static CharSequence elementContext(CharSequence element, CharSequence attribute) {
        if (attribute == null) {
            return "";
        }
        String combined = element + "/@" + attribute;
        return combined.toLowerCase(Locale.ROOT);
    }

    /**
     * Command-line entry point: fetches the given URL, runs HTML link
     * extraction over the response and prints each outlink with its hop and
     * context.
     *
     * <p>Options: {@code -h}/{@code --help} prints usage and exits;
     * {@code --robots POLICY} sets the robots policy name. The last
     * non-option argument is used as the URL.
     *
     * @param args command-line arguments
     * @throws Exception if the URL cannot be parsed or fetched
     */
    public static void main(String[] args) throws Exception {
        String url = null;
        CrawlMetadata metadata = new CrawlMetadata();
        for (int i = 0; i < args.length; ++i) {
            String arg = args[i];
            if (!arg.startsWith("-")) {
                url = arg;
                continue;
            }
            switch (arg) {
                case "-h":
                case "--help": {
                    System.out.println("Usage: ExtractorHTML [options] URL");
                    System.out.println("Extracts and prints links from the given URL");
                    System.out.println("");
                    System.out.println("Options:");
                    System.out.println("  --robots POLICY    Policy for robots meta tags " + RobotsPolicy.STANDARD_POLICIES.keySet());
                    System.exit(0);
                    break;
                }
                case "--robots": {
                    // Report a missing POLICY argument as a usage error rather than
                    // failing with an ArrayIndexOutOfBoundsException.
                    if (i + 1 >= args.length) {
                        System.err.println("ExtractorHTML: Option --robots requires a POLICY argument.");
                        System.err.println("Try --help for usage information.");
                        System.exit(1);
                        return;
                    }
                    metadata.setRobotsPolicyName(args[++i]);
                    break;
                }
                default: {
                    System.err.println("ExtractorHTML: Unknown option: " + arg);
                    System.err.println("Try --help for usage information.");
                    System.exit(1);
                }
            }
        }
        if (url == null) {
            System.err.println("ExtractorHTML: No URL specified.");
            System.err.println("Try --help for usage information.");
            System.exit(1);
        }
        CrawlURI curi = new CrawlURI(UURIFactory.getInstance(url));
        metadata.afterPropertiesSet();
        ExtractorHTML extractor = new ExtractorHTML();
        extractor.setExtractorJS(new ExtractorJS());
        extractor.setMetadata(metadata);
        extractor.afterPropertiesSet();
        String content;
        // The response bytes are decoded as ISO-8859-1 regardless of any declared charset.
        try (InputStream stream = new URL(url).openStream()) {
            content = IOUtils.toString(stream, StandardCharsets.ISO_8859_1);
        }
        extractor.extract(curi, content);
        for (CrawlURI link : curi.getOutLinks()) {
            System.out.println(link.getURI() + " " + link.getLastHop() + " " + link.getViaContext());
        }
    }
}

