/*
 * Decompiled with CFR 0.152.
 */
package edu.uci.ics.crawler4j.parser;

import edu.uci.ics.crawler4j.crawler.Configurable;
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.parser.BinaryParseData;
import edu.uci.ics.crawler4j.parser.ExtractedUrlAnchorPair;
import edu.uci.ics.crawler4j.parser.HtmlContentHandler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.parser.TextParseData;
import edu.uci.ics.crawler4j.url.URLCanonicalizer;
import edu.uci.ics.crawler4j.url.WebURL;
import edu.uci.ics.crawler4j.util.Util;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Locale;
import org.apache.log4j.Logger;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.HtmlParser;
import org.xml.sax.ContentHandler;

/**
 * Converts the raw content bytes of a fetched {@link Page} into parse data and
 * attaches the result to the page.
 *
 * <p>Three content categories are distinguished via {@link Util}: binary content,
 * plain text, and everything else, which is run through the Tika
 * {@link HtmlParser}.
 */
public class Parser
extends Configurable {
    protected static final Logger logger = Logger.getLogger(Parser.class.getName());
    private final HtmlParser htmlParser = new HtmlParser();
    private final ParseContext parseContext = new ParseContext();

    /**
     * Creates a parser that reads its settings from the given configuration.
     *
     * @param config crawl configuration handed to the {@link Configurable} base class
     */
    public Parser(CrawlConfig config) {
        super(config);
    }

    /**
     * Parses the content of {@code page} and stores the resulting parse data on it.
     *
     * <ul>
     *   <li>Binary content: accepted only if the configuration includes binary
     *       content in crawling; the shared {@link BinaryParseData} instance is set.</li>
     *   <li>Plain text: the bytes are decoded into a {@link TextParseData}.</li>
     *   <li>Anything else: parsed as HTML into an {@link HtmlParseData} holding the
     *       body text, title, raw HTML and the outgoing links.</li>
     * </ul>
     *
     * @param page       the fetched page; its parse data (and, for HTML, possibly its
     *                   content charset) is updated
     * @param contextURL URL against which relative links are resolved, unless the
     *                   document declares its own base URL
     * @return {@code true} if parse data was set on the page, {@code false} if the
     *         page was skipped or its content could not be decoded
     */
    public boolean parse(Page page, String contextURL) {
        if (Util.hasBinaryContent(page.getContentType())) {
            if (!this.config.isIncludeBinaryContentInCrawling()) {
                return false;
            }
            page.setParseData(BinaryParseData.getInstance());
            return true;
        }
        if (Util.hasPlainTextContent(page.getContentType())) {
            return this.parsePlainText(page);
        }
        return this.parseHtml(page, contextURL);
    }

    /** Decodes a plain-text page into {@link TextParseData}; returns false on any failure. */
    private boolean parsePlainText(Page page) {
        try {
            TextParseData parseData = new TextParseData();
            parseData.setTextContent(Parser.decodeContent(page));
            page.setParseData(parseData);
            return true;
        }
        catch (Exception e) {
            logger.error(e.getMessage() + ", while parsing: " + page.getWebURL().getURL());
            return false;
        }
    }

    /** Runs the Tika HTML parser over the page and fills an {@link HtmlParseData}. */
    private boolean parseHtml(Page page, String contextURL) {
        Metadata metadata = new Metadata();
        HtmlContentHandler contentHandler = new HtmlContentHandler();
        InputStream inputStream = null;
        try {
            inputStream = new ByteArrayInputStream(page.getContentData());
            this.htmlParser.parse(inputStream, contentHandler, metadata, this.parseContext);
        }
        catch (Exception e) {
            // Deliberately best-effort: whatever the handler collected before the
            // failure is still used below.
            logger.error(e.getMessage() + ", while parsing: " + page.getWebURL().getURL());
        }
        finally {
            try {
                if (inputStream != null) {
                    inputStream.close();
                }
            }
            catch (IOException e) {
                logger.error(e.getMessage() + ", while parsing: " + page.getWebURL().getURL());
            }
        }

        // If the page carries no charset yet, adopt the value the parser
        // reported under the "Content-Encoding" metadata key.
        if (page.getContentCharset() == null) {
            page.setContentCharset(metadata.get("Content-Encoding"));
        }

        HtmlParseData parseData = new HtmlParseData();
        parseData.setText(contentHandler.getBodyText().trim());
        parseData.setTitle(metadata.get("title"));

        // A base URL declared by the document overrides the caller-supplied context.
        String baseURL = contentHandler.getBaseUrl();
        if (baseURL != null) {
            contextURL = baseURL;
        }
        parseData.setOutgoingUrls(this.collectOutgoingUrls(contentHandler, contextURL));

        try {
            parseData.setHtml(Parser.decodeContent(page));
        }
        catch (UnsupportedEncodingException e) {
            // Route through the class logger (with stack trace) instead of stderr.
            logger.error(e.getMessage() + ", while parsing: " + page.getWebURL().getURL(), e);
            return false;
        }
        page.setParseData(parseData);
        return true;
    }

    /**
     * Canonicalizes the links gathered by the content handler, dropping empty
     * hrefs, {@code javascript:} / {@code mailto:} links, anything containing
     * {@code @}, and links that cannot be canonicalized. At most
     * {@code getMaxOutgoingLinksToFollow()} links are returned.
     */
    private ArrayList<WebURL> collectOutgoingUrls(HtmlContentHandler contentHandler, String contextURL) {
        ArrayList<WebURL> outgoingUrls = new ArrayList<WebURL>();
        for (ExtractedUrlAnchorPair urlAnchorPair : contentHandler.getOutgoingUrls()) {
            // Check the cap before adding so the list never exceeds the configured maximum.
            if (outgoingUrls.size() >= this.config.getMaxOutgoingLinksToFollow()) {
                break;
            }
            String href = urlAnchorPair.getHref();
            if (href == null) {
                continue;
            }
            href = href.trim();
            if (href.length() == 0) {
                continue;
            }
            // Match case-insensitively and independent of the default locale
            // (e.g. Turkish dotless i would otherwise break "JAVASCRIPT:").
            String lowerHref = href.toLowerCase(Locale.ROOT);
            if (lowerHref.contains("javascript:") || lowerHref.contains("mailto:") || lowerHref.contains("@")) {
                continue;
            }
            String url = URLCanonicalizer.getCanonicalURL(href, contextURL);
            if (url == null) {
                continue;
            }
            WebURL webURL = new WebURL();
            webURL.setURL(url);
            webURL.setAnchor(urlAnchorPair.getAnchor());
            outgoingUrls.add(webURL);
        }
        return outgoingUrls;
    }

    /**
     * Decodes the page's content bytes using the page's charset, falling back to
     * the platform default charset when the page has none.
     */
    private static String decodeContent(Page page) throws UnsupportedEncodingException {
        String charset = page.getContentCharset();
        if (charset == null) {
            return new String(page.getContentData());
        }
        return new String(page.getContentData(), charset);
    }
}

