/*
 * Decompiled with CFR 0.152.
 */
package it.unimi.di.law.bubing.parser;

import com.google.common.base.Charsets;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hasher;
import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.StringParser;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;
import it.unimi.di.law.bubing.parser.BinaryParser;
import it.unimi.di.law.bubing.parser.Parser;
import it.unimi.di.law.bubing.util.BURL;
import it.unimi.di.law.bubing.util.ByteArrayCharSequence;
import it.unimi.di.law.bubing.util.Util;
import it.unimi.di.law.warc.filters.URIResponse;
import it.unimi.di.law.warc.records.WarcHeader;
import it.unimi.di.law.warc.records.WarcRecord;
import it.unimi.di.law.warc.util.StringHttpMessages;
import it.unimi.dsi.fastutil.io.InspectableFileCachedInputStream;
import it.unimi.dsi.fastutil.objects.ObjectLinkedOpenHashSet;
import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet;
import it.unimi.dsi.fastutil.objects.Reference2ObjectOpenHashMap;
import it.unimi.dsi.lang.ObjectParser;
import it.unimi.dsi.util.TextPattern;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.io.Reader;
import java.lang.reflect.InvocationTargetException;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.security.NoSuchAlgorithmException;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import net.htmlparser.jericho.CharacterReference;
import net.htmlparser.jericho.EndTag;
import net.htmlparser.jericho.EndTagType;
import net.htmlparser.jericho.HTMLElements;
import net.htmlparser.jericho.Segment;
import net.htmlparser.jericho.StartTag;
import net.htmlparser.jericho.StartTagType;
import net.htmlparser.jericho.StreamedSource;
import org.apache.commons.codec.binary.Hex;
import org.apache.commons.io.IOUtils;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.impl.client.HttpClients;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * An HTML {@link Parser} that streams a response body through Jericho's
 * {@link StreamedSource}, reports the URLs it finds to a
 * {@link Parser.LinkReceiver}, optionally feeds page text to a
 * {@link Parser.TextProcessor} and optionally computes a content digest
 * through a {@link DigestAppendable}.
 *
 * <p>Each call to {@code parse} overwrites the per-page state held in
 * {@link #guessedCharset}, {@link #location} and {@link #metaLocation}.
 * NOTE(review): because of that shared state an instance is presumably meant
 * to be used by one thread at a time; use {@link #copy()} to obtain more
 * parsers.
 *
 * @param <T> the type of result produced by the text processor.
 */
public class HTMLParser<T>
implements Parser<T> {
    private static final Logger LOGGER = LoggerFactory.getLogger(HTMLParser.class);
    /** Text pattern locating {@code URL=} inside the content of a META refresh element. */
    protected static final TextPattern URLEQUAL_PATTERN;
    /** The default size, in characters, of the parser buffer. */
    public static final int CHAR_BUFFER_SIZE = 131072;
    /** The character buffer handed to the streamed source, or {@code null} if the parser was built with a buffer size of 0. */
    protected final char[] buffer;
    /** The charset name chosen by the most recent call to {@code parse}; {@code null} until then. */
    protected String guessedCharset;
    /** The digest accumulator, or {@code null} if no hash function was provided. */
    protected final DigestAppendable digestAppendable;
    /** The text processor receiving page text, or {@code null} if none was provided. */
    protected final Parser.TextProcessor<T> textProcessor;
    /** The resolved value of the {@code Location} header of the last parsed response, if any. */
    protected URI location;
    /** If false, {@code <a>} elements whose {@code rel} attribute equals {@code nofollow} (ignoring case) are not reported. */
    private boolean returnNoFollow;
    /** The resolved URL of the last META {@code http-equiv="location"} element found by the last parse, if any. */
    protected URI metaLocation;
    /** If false, the digest is initialised with the page URI so that its host contributes to the hash. */
    protected boolean crossAuthorityDuplicates;
    /** Text pattern locating the start of a {@code <meta} tag in a raw byte buffer. */
    protected static final TextPattern META_PATTERN;
    /** Matches META tag content declaring {@code http-equiv=content-type}. */
    protected static final Pattern HTTP_EQUIV_PATTERN;
    /** Captures (group 2) the quoted value of a {@code content} attribute. */
    protected static final Pattern CONTENT_PATTERN;
    /** Captures (group 1) the value following {@code charset=} in a header-like string. */
    protected static final Pattern CHARSET_PATTERN;

    /**
     * Creates a parser computing digests with the given hash function, with
     * cross-authority duplicates disabled.
     *
     * @param hashFunction the hash function used for digests, or {@code null} for no digest.
     */
    public HTMLParser(HashFunction hashFunction) {
        this(hashFunction, false);
    }

    /**
     * Creates a fully configured parser; every other constructor delegates here.
     *
     * @param hashFunction the hash function used for digests, or {@code null} for no digest.
     * @param textProcessor a text processor, or {@code null}.
     * @param crossAuthorityDuplicates if false, the page URI is passed to the digest initialisation.
     * @param returnNoFollow if true, links marked {@code rel="nofollow"} are reported too.
     * @param bufferSize the size of the parser character buffer; 0 means no fixed buffer is allocated.
     */
    public HTMLParser(HashFunction hashFunction, Parser.TextProcessor<T> textProcessor, boolean crossAuthorityDuplicates, boolean returnNoFollow, int bufferSize) {
        if (bufferSize == 0) {
            this.buffer = null;
        } else {
            this.buffer = new char[bufferSize];
        }
        if (hashFunction == null) {
            this.digestAppendable = null;
        } else {
            this.digestAppendable = new DigestAppendable(hashFunction);
        }
        this.textProcessor = textProcessor;
        this.crossAuthorityDuplicates = crossAuthorityDuplicates;
        this.returnNoFollow = returnNoFollow;
    }

    /**
     * Creates a parser that does not report {@code nofollow} links.
     *
     * @param hashFunction the hash function used for digests, or {@code null} for no digest.
     * @param textProcessor a text processor, or {@code null}.
     * @param crossAuthorityDuplicates if false, the page URI is passed to the digest initialisation.
     * @param bufferSize the size of the parser character buffer; 0 means no fixed buffer is allocated.
     */
    public HTMLParser(HashFunction hashFunction, Parser.TextProcessor<T> textProcessor, boolean crossAuthorityDuplicates, int bufferSize) {
        this(hashFunction, textProcessor, crossAuthorityDuplicates, false, bufferSize);
    }

    /**
     * Creates a parser with the default buffer size.
     *
     * @param hashFunction the hash function used for digests, or {@code null} for no digest.
     * @param textProcessor a text processor, or {@code null}.
     * @param crossAuthorityDuplicates if false, the page URI is passed to the digest initialisation.
     * @param returnNoFollow if true, links marked {@code rel="nofollow"} are reported too.
     */
    public HTMLParser(HashFunction hashFunction, Parser.TextProcessor<T> textProcessor, boolean crossAuthorityDuplicates, boolean returnNoFollow) {
        this(hashFunction, textProcessor, crossAuthorityDuplicates, returnNoFollow, CHAR_BUFFER_SIZE);
    }

    /**
     * Creates a parser with no text processor and the default buffer size.
     *
     * @param hashFunction the hash function used for digests, or {@code null} for no digest.
     * @param crossAuthorityDuplicates if false, the page URI is passed to the digest initialisation.
     */
    public HTMLParser(HashFunction hashFunction, boolean crossAuthorityDuplicates) {
        this(hashFunction, null, crossAuthorityDuplicates, CHAR_BUFFER_SIZE);
    }

    /**
     * Creates a parser with the default buffer size that does not report
     * {@code nofollow} links.
     *
     * @param hashFunction the hash function used for digests, or {@code null} for no digest.
     * @param textProcessor a text processor, or {@code null}.
     * @param crossAuthorityDuplicates if false, the page URI is passed to the digest initialisation.
     */
    public HTMLParser(HashFunction hashFunction, Parser.TextProcessor<T> textProcessor, boolean crossAuthorityDuplicates) {
        this(hashFunction, textProcessor, crossAuthorityDuplicates, CHAR_BUFFER_SIZE);
    }

    /**
     * String-based constructor: the hash function is looked up by name.
     *
     * @param messageDigest the name passed to {@code BinaryParser.forName}.
     * @throws NoSuchAlgorithmException declared by the hash-function lookup.
     */
    public HTMLParser(String messageDigest) throws NoSuchAlgorithmException {
        this(BinaryParser.forName(messageDigest));
    }

    /**
     * String-based constructor.
     *
     * @param messageDigest the name passed to {@code BinaryParser.forName}.
     * @param crossAuthorityDuplicates a boolean in string form, parsed with {@code Util.parseBoolean}.
     * @throws NoSuchAlgorithmException declared by the hash-function lookup.
     */
    public HTMLParser(String messageDigest, String crossAuthorityDuplicates) throws NoSuchAlgorithmException {
        this(BinaryParser.forName(messageDigest), Util.parseBoolean(crossAuthorityDuplicates));
    }

    /**
     * String-based constructor that also instantiates a text processor from
     * an {@link ObjectParser} specification.
     *
     * @param messageDigest the name passed to {@code BinaryParser.forName}.
     * @param textProcessorSpec the specification passed to {@code ObjectParser.fromSpec}.
     * @param crossAuthorityDuplicates a boolean in string form, parsed with {@code Util.parseBoolean}.
     */
    public HTMLParser(String messageDigest, String textProcessorSpec, String crossAuthorityDuplicates) throws NoSuchAlgorithmException, IllegalArgumentException, ClassNotFoundException, IllegalAccessException, InvocationTargetException, InstantiationException, NoSuchMethodException, IOException {
        this(BinaryParser.forName(messageDigest), (Parser.TextProcessor)ObjectParser.fromSpec(textProcessorSpec), Util.parseBoolean(crossAuthorityDuplicates));
    }

    /**
     * String-based constructor that also instantiates a text processor from
     * an {@link ObjectParser} specification and sets the {@code nofollow} policy.
     *
     * @param messageDigest the name passed to {@code BinaryParser.forName}.
     * @param textProcessorSpec the specification passed to {@code ObjectParser.fromSpec}.
     * @param crossAuthorityDuplicates a boolean in string form, parsed with {@code Util.parseBoolean}.
     * @param returnNoFollow a boolean in string form, parsed with {@code Util.parseBoolean}.
     */
    public HTMLParser(String messageDigest, String textProcessorSpec, String crossAuthorityDuplicates, String returnNoFollow) throws NoSuchAlgorithmException, IllegalArgumentException, ClassNotFoundException, IllegalAccessException, InvocationTargetException, InstantiationException, NoSuchMethodException, IOException {
        this(BinaryParser.forName(messageDigest), (Parser.TextProcessor)ObjectParser.fromSpec(textProcessorSpec), Util.parseBoolean(crossAuthorityDuplicates), Util.parseBoolean(returnNoFollow));
    }

    /**
     * Creates a parser with no digest, no text processor and the default
     * buffer size.
     */
    public HTMLParser() {
        this(null, null, false, CHAR_BUFFER_SIZE);
    }

    /**
     * Parses a raw URL string and, if it is usable, reports it to the link
     * receiver after resolving it against the current base.
     *
     * @param linkReceiver the receiver that is notified of the resolved link.
     * @param base the URI against which the parsed URL is resolved.
     * @param s the raw attribute value; {@code null} (attribute absent) is silently ignored.
     */
    protected void process(Parser.LinkReceiver linkReceiver, URI base, String s) {
        // BURL.parse is only attempted for non-null input; a null result means "not a usable URL".
        final URI parsed = s == null ? null : BURL.parse(s);
        if (parsed != null) {
            linkReceiver.link(base.resolve(parsed));
        }
    }

    /**
     * Parses an HTTP response as HTML.
     *
     * <p>The charset is guessed in this order, later sources overriding
     * earlier ones: the default {@code ISO-8859-1}; the {@code charset}
     * parameter of the entity's content type; then either the BUbiNG
     * guessed-charset WARC header (when the response is a {@link WarcRecord}
     * carrying it) or, failing that, a META declaration found in the
     * inspectable prefix of an {@link InspectableFileCachedInputStream}.
     * An illegal or unsupported charset name falls back to ISO-8859-1 for
     * decoding, while {@link #guessedCharset()} still reports the guessed name.
     *
     * <p>While streaming the document, link-bearing elements are reported to
     * {@code linkReceiver}, text outside {@code style}/{@code script} is fed
     * to the text processor (if any) and to the digest (if any).
     *
     * @param uri the URI of the page, used as the initial base for resolving links.
     * @param httpResponse the response to parse.
     * @param linkReceiver the receiver notified of links, redirects and META locations.
     * @return the digest of the page, or {@code null} if this parser has no hash function.
     * @throws IOException if reading the response content fails.
     */
    @Override
    public byte[] parse(URI uri, HttpResponse httpResponse, Parser.LinkReceiver linkReceiver) throws IOException {
        // ---- Charset guessing ----
        this.guessedCharset = "ISO-8859-1";
        final HttpEntity entity = httpResponse.getEntity();
        final Header contentTypeHeader = entity.getContentType();
        if (contentTypeHeader != null) {
            final String headerCharset = HTMLParser.getCharsetNameFromHeader(contentTypeHeader.getValue());
            if (headerCharset != null) {
                this.guessedCharset = headerCharset;
            }
        }
        final InputStream contentStream = entity.getContent();
        final Header bubingGuessedCharsetHeader = httpResponse instanceof WarcRecord
                ? ((WarcRecord)httpResponse).getWarcHeader(WarcHeader.Name.BUBING_GUESSED_CHARSET)
                : null;
        if (bubingGuessedCharsetHeader != null) {
            this.guessedCharset = bubingGuessedCharsetHeader.getValue();
        } else if (contentStream instanceof InspectableFileCachedInputStream) {
            final InspectableFileCachedInputStream inspectableStream = (InspectableFileCachedInputStream)contentStream;
            final String metaCharset = HTMLParser.getCharsetName(inspectableStream.buffer, inspectableStream.inspectable);
            if (metaCharset != null) {
                this.guessedCharset = metaCharset;
            }
        }
        if (LOGGER.isDebugEnabled()) {
            LOGGER.debug("Guessing charset \"{}\" for URL {}", this.guessedCharset, uri);
        }

        // Decode with the guessed charset when the JVM knows it, with ISO-8859-1 otherwise.
        Charset charset = Charsets.ISO_8859_1;
        try {
            charset = Charset.forName(this.guessedCharset);
        }
        catch (IllegalCharsetNameException e) {
            if (LOGGER.isDebugEnabled()) {
                LOGGER.debug("Response for {} contained an illegal charset name: \"{}\"", uri, this.guessedCharset);
            }
        }
        catch (UnsupportedCharsetException e) {
            if (LOGGER.isDebugEnabled()) {
                LOGGER.debug("Response for {} contained an unsupported charset: \"{}\"", uri, this.guessedCharset);
            }
        }

        // ---- Per-page state reset ----
        linkReceiver.init(uri);
        if (this.textProcessor != null) {
            this.textProcessor.init(uri);
        }
        this.location = null;
        this.metaLocation = null;

        // Header dump goes through the logger (debug level) rather than standard error.
        if (LOGGER.isDebugEnabled()) {
            LOGGER.debug("Response headers for {}: {}", uri, Arrays.toString(httpResponse.getAllHeaders()));
        }

        // ---- HTTP Location header ----
        final Header locationHeader = httpResponse.getFirstHeader("Location");
        if (locationHeader != null) {
            final URI headerLocation = BURL.parse(locationHeader.getValue());
            if (headerLocation != null) {
                if (!headerLocation.isAbsolute() && LOGGER.isDebugEnabled()) {
                    LOGGER.debug("Found relative header location URL: \"{}\"", headerLocation);
                }
                this.location = uri.resolve(headerLocation);
                linkReceiver.location(this.location);
            }
        }

        // ---- Streaming parse ----
        final StreamedSource streamedSource = new StreamedSource((Reader)new InputStreamReader(contentStream, charset));
        if (this.buffer != null) {
            streamedSource.setBuffer(this.buffer);
        }
        if (this.digestAppendable != null) {
            this.digestAppendable.init(this.crossAuthorityDuplicates ? null : uri);
        }

        URI base = uri;
        int lastSegmentEnd = 0;
        // Nesting depth of style/script elements; text is ignored while it is positive.
        int inSpecialText = 0;

        for (Segment segment : streamedSource) {
            // Skip segments that do not extend past what has already been handled.
            if (segment.getEnd() <= lastSegmentEnd) {
                continue;
            }
            lastSegmentEnd = segment.getEnd();

            if (segment instanceof StartTag) {
                final StartTag startTag = (StartTag)segment;
                if (startTag.getTagType() != StartTagType.NORMAL) {
                    continue;
                }
                // Element names are compared by identity throughout this class.
                // NOTE(review): this relies on Jericho returning the interned constant
                // for standard element names — confirm against the Jericho documentation.
                final String name = startTag.getName();
                if ((name == "style" || name == "script") && !startTag.isSyntacticalEmptyElementTag()) {
                    ++inSpecialText;
                }
                if (this.digestAppendable != null) {
                    this.digestAppendable.startTag(startTag);
                }

                if (name == "iframe" || name == "frame" || name == "embed" || name == "img" || name == "script") {
                    this.process(linkReceiver, base, startTag.getAttributeValue("src"));
                } else if (name == "object") {
                    this.process(linkReceiver, base, startTag.getAttributeValue("data"));
                } else if (name == "a") {
                    // nofollow anchors are reported only when explicitly requested.
                    if (this.returnNoFollow || !"nofollow".equalsIgnoreCase(startTag.getAttributeValue("rel"))) {
                        this.process(linkReceiver, base, startTag.getAttributeValue("href"));
                    }
                } else if (name == "area" || name == "link") {
                    this.process(linkReceiver, base, startTag.getAttributeValue("href"));
                } else if (name == "base") {
                    final String href = startTag.getAttributeValue("href");
                    final URI link = href == null ? null : BURL.parse(href);
                    if (link != null) {
                        if (link.isAbsolute()) {
                            base = link;
                        } else if (LOGGER.isDebugEnabled()) {
                            // A relative BASE is ignored.
                            LOGGER.debug("Found relative BASE URL: \"{}\"", link);
                        }
                    }
                } else if (name == "meta") {
                    this.processMeta(startTag, base, linkReceiver);
                }
            } else if (segment instanceof EndTag) {
                final EndTag endTag = (EndTag)segment;
                final String name = endTag.getName();
                if (name == "style" || name == "script") {
                    // Never go below zero on unbalanced end tags.
                    inSpecialText = Math.max(0, inSpecialText - 1);
                }
                if (this.digestAppendable != null && endTag.getTagType() == EndTagType.NORMAL) {
                    this.digestAppendable.endTag(endTag);
                }
            } else if (inSpecialText == 0) {
                // Plain text or a character reference outside style/script.
                if (this.textProcessor != null) {
                    if (segment instanceof CharacterReference) {
                        ((CharacterReference)segment).appendCharTo(this.textProcessor);
                    } else {
                        this.textProcessor.append((CharSequence)segment);
                    }
                }
                if (this.digestAppendable != null) {
                    if (segment instanceof CharacterReference) {
                        ((CharacterReference)segment).appendCharTo((Appendable)this.digestAppendable);
                    } else {
                        this.digestAppendable.append((CharSequence)segment);
                    }
                }
            }
        }

        // For 3xx responses the redirect targets become part of the digest.
        if (this.digestAppendable != null && httpResponse.getStatusLine().getStatusCode() / 100 == 3) {
            this.digestAppendable.append('\u0000');
            if (this.location != null) {
                this.digestAppendable.append(BURL.toByteArray(this.location));
            }
            this.digestAppendable.append('\u0000');
            if (this.metaLocation != null) {
                this.digestAppendable.append(BURL.toByteArray(this.metaLocation));
            }
            this.digestAppendable.append('\u0000');
        }
        return this.digestAppendable != null ? this.digestAppendable.digest() : null;
    }

    /**
     * Handles a META element: reports {@code http-equiv="refresh"} and
     * {@code http-equiv="location"} targets to the link receiver and records
     * the latter in {@link #metaLocation}.
     *
     * <p>The {@code http-equiv} value is matched ignoring case (and
     * independently of the default locale), so {@code Refresh} and
     * {@code LOCATION} are recognised too.
     *
     * @param startTag the META start tag.
     * @param base the current base URI used to resolve the targets.
     * @param linkReceiver the receiver to notify.
     */
    private void processMeta(final StartTag startTag, final URI base, final Parser.LinkReceiver linkReceiver) {
        final String equiv = startTag.getAttributeValue("http-equiv");
        final String content = startTag.getAttributeValue("content");
        if (equiv == null || content == null) {
            return;
        }

        // e.g. http-equiv="refresh" content="0;URL=http://example.com/"
        if (equiv.equalsIgnoreCase("refresh")) {
            final int pos = URLEQUAL_PATTERN.search((CharSequence)content);
            if (pos != -1) {
                final String urlPattern = content.substring(pos + URLEQUAL_PATTERN.length());
                final URI refresh = BURL.parse(urlPattern);
                if (refresh != null) {
                    if (!refresh.isAbsolute() && LOGGER.isDebugEnabled()) {
                        LOGGER.debug("Found relative META refresh URL: \"{}\"", urlPattern);
                    }
                    linkReceiver.metaRefresh(base.resolve(refresh));
                }
            }
        }

        // e.g. http-equiv="location" content="http://example.com/"
        if (equiv.equalsIgnoreCase("location")) {
            final URI parsedLocation = BURL.parse(content);
            if (parsedLocation != null) {
                if (!parsedLocation.isAbsolute() && LOGGER.isDebugEnabled()) {
                    LOGGER.debug("Found relative META location URL: \"{}\"", content);
                }
                this.metaLocation = base.resolve(parsedLocation);
                linkReceiver.metaLocation(this.metaLocation);
            }
        }
    }

    /**
     * Returns the charset name chosen by the most recent call to
     * {@code parse}.
     *
     * @return the guessed charset name; {@code null} if nothing has been parsed yet.
     */
    @Override
    public String guessedCharset() {
        return guessedCharset;
    }

    /**
     * Returns the redirect target found by the last parse, preferring the
     * HTTP {@code Location} header over a META location.
     *
     * @return the header location if set, otherwise the META location, which may be {@code null}.
     */
    public URI location() {
        return location != null ? location : metaLocation;
    }

    /**
     * Scans a raw byte buffer for a META tag declaring
     * {@code http-equiv=content-type} and extracts the charset from its
     * {@code content} attribute.
     *
     * <p>The first META tag matching both patterns decides the outcome, even
     * if its content yields no charset. The scan gives up as soon as a META
     * tag is found whose closing {@code >} lies beyond {@code length}.
     *
     * @param buffer the bytes to scan.
     * @param length the number of valid bytes in {@code buffer}.
     * @return the charset name, or {@code null} if none could be found.
     */
    public static String getCharsetName(byte[] buffer, int length) {
        final int metaLength = META_PATTERN.length();
        int from = 0;
        for (;;) {
            final int tagStart = META_PATTERN.search(buffer, from, length);
            if (tagStart == -1) {
                return null;
            }
            // Locate the '>' closing this tag.
            int tagEnd = tagStart;
            while (tagEnd < length && buffer[tagEnd] != '>') {
                tagEnd++;
            }
            if (tagEnd == length) {
                // Truncated tag: nothing more can be learned from this buffer.
                return null;
            }
            final ByteArrayCharSequence tagContent = new ByteArrayCharSequence(buffer, tagStart + metaLength, tagEnd - tagStart - metaLength);
            if (HTTP_EQUIV_PATTERN.matcher(tagContent).matches()) {
                final Matcher contentMatcher = CONTENT_PATTERN.matcher(tagContent);
                if (contentMatcher.matches()) {
                    return HTMLParser.getCharsetNameFromHeader(contentMatcher.group(2));
                }
            }
            from = tagEnd + 1;
        }
    }

    /**
     * Extracts the charset name from a header-like value such as
     * {@code text/html; charset=utf-8}.
     *
     * <p>One leading and one trailing quote character (single or double) are
     * stripped from the captured value.
     *
     * @param headerValue the header value to inspect; must not be {@code null}.
     * @return the charset name, or {@code null} if the value has no charset
     * declaration or the declaration is empty once the quotes are removed.
     */
    public static String getCharsetNameFromHeader(String headerValue) {
        final Matcher matcher = CHARSET_PATTERN.matcher(headerValue);
        if (!matcher.matches()) {
            return null;
        }
        final String raw = matcher.group(1);
        final int length = raw.length();
        if (length == 0) {
            return null;
        }
        final char first = raw.charAt(0);
        final char last = raw.charAt(length - 1);
        final int from = (first == '"' || first == '\'') ? 1 : 0;
        final int to = (last == '"' || last == '\'') ? length - 1 : length;
        // A lone quote character gives from > to: nothing is left to return.
        return from < to ? raw.substring(from, to) : null;
    }

    /**
     * Tells whether this parser accepts a response: it does when the entity
     * declares a content type whose value starts with {@code text/}.
     *
     * @param uriResponse the URI/response pair to test.
     * @return true if the content type is present and starts with {@code text/}.
     */
    public boolean apply(URIResponse uriResponse) {
        final Header contentType = uriResponse.response().getEntity().getContentType();
        if (contentType == null) {
            return false;
        }
        return contentType.getValue().startsWith("text/");
    }

    /**
     * Returns a new parser with the same configuration as this one: same
     * hash function, a copy of the text processor, same cross-authority and
     * {@code nofollow} policies, same buffer size.
     *
     * <p>A parser built without a fixed buffer (size 0) is copied as such,
     * instead of failing on the missing buffer.
     *
     * @return an independent parser configured like this one.
     */
    @SuppressWarnings({ "unchecked", "rawtypes" })
    public HTMLParser<T> clone() {
        final HashFunction hashFunction = this.digestAppendable == null ? null : this.digestAppendable.hashFunction;
        final Parser.TextProcessor<T> processorCopy = this.textProcessor == null ? null : (Parser.TextProcessor)this.textProcessor.copy();
        // buffer is null when the parser was created with bufferSize == 0.
        final int bufferSize = this.buffer == null ? 0 : this.buffer.length;
        return new HTMLParser<T>(hashFunction, processorCopy, this.crossAuthorityDuplicates, this.returnNoFollow, bufferSize);
    }

    /**
     * Returns a copy of this parser; simply delegates to {@link #clone()}.
     *
     * @return a new parser obtained from {@link #clone()}.
     */
    @Override
    public HTMLParser<T> copy() {
        return clone();
    }

    /**
     * Returns the result accumulated by the text processor.
     *
     * @return the text processor's result, or {@code null} if this parser has no text processor.
     */
    @Override
    public T result() {
        if (this.textProcessor == null) {
            return null;
        }
        return (T)this.textProcessor.result();
    }

    /**
     * Command-line entry point: computes and prints the digest and the links
     * of a page that is either downloaded from the given URL or read from a
     * local file ({@code -f}).
     *
     * <p>The HTTP client and the file reader opened here are closed before
     * the method prints its results.
     *
     * @param arg the command-line arguments, parsed with JSAP.
     */
    public static void main(String[] arg) throws IllegalArgumentException, IOException, URISyntaxException, JSAPException, NoSuchAlgorithmException {
        final SimpleJSAP jsap = new SimpleJSAP(HTMLParser.class.getName(), "Produce the digest of a page: the page is downloaded or passed as argument by specifying a file", new Parameter[]{
            new UnflaggedOption("url", (StringParser)JSAP.STRING_PARSER, true, "The url of the page."),
            new Switch("crossAuthorityDuplicates", 'c', "cross-authority-duplicates"),
            new FlaggedOption("charBufferSize", (StringParser)JSAP.INTSIZE_PARSER, Integer.toString(CHAR_BUFFER_SIZE), false, 'b', "buffer", "The size of the parser character buffer (0 for dynamic sizing)."),
            new FlaggedOption("file", (StringParser)JSAP.STRING_PARSER, JSAP.NO_DEFAULT, false, 'f', "file", "The page to be processed."),
            new FlaggedOption("digester", (StringParser)JSAP.STRING_PARSER, "MD5", false, 'd', "digester", "The digester to be used.")
        });
        final JSAPResult jsapResult = jsap.parse(arg);
        if (jsap.messagePrinted()) {
            System.exit(1);
        }

        final String url = jsapResult.getString("url");
        final String digester = jsapResult.getString("digester");
        final boolean crossAuthorityDuplicates = jsapResult.userSpecified("crossAuthorityDuplicates");
        final int charBufferSize = jsapResult.getInt("charBufferSize");

        final HTMLParser<Void> htmlParser = new HTMLParser<Void>(BinaryParser.forName(digester), (Parser.TextProcessor<Void>)null, crossAuthorityDuplicates, charBufferSize);
        final SetLinkReceiver linkReceiver = new SetLinkReceiver();

        byte[] digest;
        if (!jsapResult.userSpecified("file")) {
            final URI uri = new URI(url);
            final HttpGet request = new HttpGet(uri);
            // Redirects are not followed, so a 3xx response is parsed as such.
            request.setConfig(RequestConfig.custom().setRedirectsEnabled(false).build());
            final org.apache.http.impl.client.CloseableHttpClient httpClient = HttpClients.createDefault();
            try {
                // The response is consumed entirely by parse() before the client is closed.
                digest = htmlParser.parse(uri, (HttpResponse)httpClient.execute((HttpUriRequest)request), linkReceiver);
            } finally {
                httpClient.close();
            }
        } else {
            final String file = jsapResult.getString("file");
            // The file is decoded with the platform default charset, as no charset is specified.
            final Reader reader = new InputStreamReader(new FileInputStream(file));
            final String content;
            try {
                content = IOUtils.toString(reader);
            } finally {
                reader.close();
            }
            digest = htmlParser.parse(BURL.parse(url), new StringHttpMessages.HttpResponse(content), linkReceiver);
        }

        System.out.println("DigestHexString: " + Hex.encodeHexString((byte[])digest));
        System.out.println("Links: " + linkReceiver.urls);

        // Sanity check: distinct URIs should map to distinct strings.
        final ObjectOpenHashSet<String> urlStrings = new ObjectOpenHashSet<String>();
        for (URI link : linkReceiver.urls) {
            urlStrings.add(link.toString());
        }
        if (urlStrings.size() != linkReceiver.urls.size()) {
            System.out.println("There are " + linkReceiver.urls.size() + " URIs but " + urlStrings.size() + " strings");
        }
    }

    static {
        // Deregister Jericho's server-side "common" tag types before any parsing takes place.
        // NOTE(review): presumably so that <% ... %>-style sequences are not treated as tags — confirm.
        StartTagType.SERVER_COMMON.deregister();
        StartTagType.SERVER_COMMON_COMMENT.deregister();
        StartTagType.SERVER_COMMON_ESCAPED.deregister();

        // Text patterns; the flag value 1 is presumably TextPattern.CASE_INSENSITIVE — TODO confirm.
        URLEQUAL_PATTERN = new TextPattern((CharSequence)"URL=", 1);
        META_PATTERN = new TextPattern((CharSequence)"<meta", 1);

        // Regular expressions used for charset sniffing, all case-insensitive.
        HTTP_EQUIV_PATTERN = Pattern.compile(".*http-equiv\\s*=\\s*('|\")?content-type('|\")?.*", Pattern.CASE_INSENSITIVE);
        CONTENT_PATTERN = Pattern.compile(".*content\\s*=\\s*('|\")([^'\"]*)('|\").*", Pattern.CASE_INSENSITIVE);
        CHARSET_PATTERN = Pattern.compile(".*charset\\s*=\\s*(([\\041-\\0176&&[^<>\\{\\}\\\\/:,;@?=]])+|\"[^\"]*\").*", Pattern.CASE_INSENSITIVE);
    }

    /**
     * An {@link Appendable} that feeds a normalised view of a page into a
     * Guava {@link Hasher}: every run of whitespace and/or digit characters
     * is collapsed into a single space, and tags are replaced by precomputed
     * byte sequences.
     */
    public static final class DigestAppendable
    implements Appendable {
        // The three debug members below are not referenced anywhere in this class.
        private static final boolean DEBUG = false;
        private PrintStream debugStream;
        private File debugFile;
        /** Maps element names (by identity) to the bytes hashed for their start tag. */
        protected static final Reference2ObjectOpenHashMap<String, byte[]> startTags;
        /** Maps element names (by identity) to the bytes hashed for their end tag. */
        protected static final Reference2ObjectOpenHashMap<String, byte[]> endTags;
        /** The hash function from which hashers are created. */
        protected final HashFunction hashFunction;
        /** The hasher of the page being digested; created by {@link #init(URI)}. */
        protected Hasher hasher;
        /** True if the last character sent to the hasher was a collapsed space. */
        protected boolean lastAppendedWasSpace;
        /** The cached digest, computed on the first call to {@link #digest()} after an {@link #init(URI)}. */
        protected byte[] digest;

        /**
         * Creates a digest appendable based on the given hash function.
         *
         * @param hashFunction the hash function used to create hashers.
         */
        public DigestAppendable(HashFunction hashFunction) {
            this.hashFunction = hashFunction;
        }

        /**
         * Starts a new digest.
         *
         * @param url if not {@code null}, its host followed by a zero byte is hashed first,
         * so that identical pages on different hosts get different digests.
         */
        public void init(URI url) {
            this.hasher = this.hashFunction.newHasher();
            this.digest = null;
            if (url != null) {
                // NOTE(review): url.getHost() is passed through unchecked; callers presumably
                // only supply URIs with a host — confirm.
                this.hasher.putUnencodedChars(url.getHost());
                this.hasher.putByte((byte)0);
            }
            this.lastAppendedWasSpace = false;
        }

        /** Appends the characters of {@code csq} in {@code [start, end)}, one at a time. */
        @Override
        public Appendable append(CharSequence csq, int start, int end) {
            for (int i = start; i < end; ++i) {
                this.append(csq.charAt(i));
            }
            return this;
        }

        /**
         * Appends one character. Whitespace and digits are both treated as
         * a separator: the first of a run is hashed as a single space, the
         * following ones are dropped.
         */
        @Override
        public Appendable append(char c) {
            if (Character.isWhitespace(c) || Character.isDigit(c)) {
                if (!this.lastAppendedWasSpace) {
                    this.hasher.putChar(' ');
                    this.lastAppendedWasSpace = true;
                }
            } else {
                this.hasher.putChar(c);
                this.lastAppendedWasSpace = false;
            }
            return this;
        }

        /** Appends all the characters of {@code csq}. */
        @Override
        public Appendable append(CharSequence csq) {
            return this.append(csq, 0, csq.length());
        }

        /** Sends raw bytes to the hasher, bypassing the whitespace/digit normalisation. */
        private void append(byte[] a) {
            this.hasher.putBytes(a);
        }

        /**
         * Returns the digest of what has been appended since the last
         * {@link #init(URI)}; the value is computed once and then cached.
         *
         * @return the digest bytes.
         */
        public byte[] digest() {
            if (this.digest == null) {
                this.digest = this.hasher.hash().asBytes();
            }
            return this.digest;
        }

        /**
         * Hashes a start tag. Names missing from the table are hashed as
         * {@code <unknown>}; for {@code iframe} and {@code frame} the quoted
         * {@code src} attribute, if present, is hashed as well.
         *
         * @param startTag the start tag to digest.
         */
        public void startTag(StartTag startTag) {
            final String name = startTag.getName();
            this.append(startTags.get(name));
            // Identity comparison, consistent with the reference-based tag tables.
            if (name == "iframe" || name == "frame") {
                final String s = startTag.getAttributeValue("src");
                if (s != null) {
                    this.append('\"');
                    this.append(s);
                    this.append('\"');
                }
            }
            this.lastAppendedWasSpace = false;
        }

        /**
         * Hashes an end tag; names missing from the table are hashed as
         * {@code </unknown>}.
         *
         * @param endTag the end tag to digest.
         */
        public void endTag(EndTag endTag) {
            this.append(endTags.get(endTag.getName()));
            this.lastAppendedWasSpace = false;
        }

        static {
            // Precompute the byte form of every element name known to Jericho.
            final List<String> elementNames = HTMLElements.getElementNames();
            startTags = new Reference2ObjectOpenHashMap<String, byte[]>(elementNames.size());
            endTags = new Reference2ObjectOpenHashMap<String, byte[]>(elementNames.size());
            // Lookups are by reference, so any name that is not one of these instances gets the default.
            startTags.defaultReturnValue(Util.toByteArray("<unknown>"));
            endTags.defaultReturnValue(Util.toByteArray("</unknown>"));
            for (final String name : elementNames) {
                startTags.put(name, Util.toByteArray("<" + name + ">"));
                endTags.put(name, Util.toByteArray("</" + name + ">"));
            }
        }
    }

    /**
     * A {@link Parser.LinkReceiver} that collects every URL it is notified of
     * (links, redirect locations, META locations and META refreshes alike)
     * into a single set.
     */
    public static final class SetLinkReceiver
    implements Parser.LinkReceiver {
        /** The URLs received since the last {@link #init(URI)}. */
        public final Set<URI> urls = new ObjectLinkedOpenHashSet<URI>();

        /** Records the target of an HTTP {@code Location} header. */
        @Override
        public void location(URI location) {
            urls.add(location);
        }

        /** Records the target of a META location element. */
        @Override
        public void metaLocation(URI location) {
            urls.add(location);
        }

        /** Records the target of a META refresh element. */
        @Override
        public void metaRefresh(URI refresh) {
            urls.add(refresh);
        }

        /** Records an ordinary link. */
        @Override
        public void link(URI link) {
            urls.add(link);
        }

        /** Prepares for a new page by discarding everything collected so far. */
        @Override
        public void init(URI responseUrl) {
            urls.clear();
        }

        /** Returns an iterator over the collected URLs. */
        @Override
        public Iterator<URI> iterator() {
            return urls.iterator();
        }

        /** Returns the number of collected URLs. */
        @Override
        public int size() {
            return urls.size();
        }
    }
}

