/*
 * Decompiled with CFR 0.152.
 */
package com.digitalpebble.stormcrawler.tika;

import com.digitalpebble.stormcrawler.Metadata;
import com.digitalpebble.stormcrawler.filtering.URLFilters;
import com.digitalpebble.stormcrawler.parse.Outlink;
import com.digitalpebble.stormcrawler.parse.ParseData;
import com.digitalpebble.stormcrawler.parse.ParseFilter;
import com.digitalpebble.stormcrawler.parse.ParseFilters;
import com.digitalpebble.stormcrawler.parse.ParseResult;
import com.digitalpebble.stormcrawler.persistence.Status;
import com.digitalpebble.stormcrawler.tika.DOMBuilder;
import com.digitalpebble.stormcrawler.util.ConfUtils;
import com.digitalpebble.stormcrawler.util.InitialisationUtil;
import com.digitalpebble.stormcrawler.util.MetadataTransfer;
import com.digitalpebble.stormcrawler.util.URLUtil;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
import org.apache.html.dom.HTMLDocumentImpl;
import org.apache.storm.metric.api.IMetric;
import org.apache.storm.metric.api.MultiCountMetric;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
import org.apache.tika.Tika;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.html.HtmlMapper;
import org.apache.tika.parser.html.IdentityHtmlMapper;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.Link;
import org.apache.tika.sax.LinkContentHandler;
import org.apache.tika.sax.TeeContentHandler;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentFragment;
import org.xml.sax.ContentHandler;

/**
 * Storm bolt that parses the binary content of a document with Apache Tika.
 *
 * <p>Each input tuple is expected to carry the fields {@code url}, {@code content} and
 * {@code metadata}. For a successfully parsed document the bolt:
 * <ul>
 *   <li>copies every Tika metadata entry into the document metadata under the
 *       {@code parse.} prefix,</li>
 *   <li>resolves, filters and de-duplicates the links found by Tika,</li>
 *   <li>runs the configured parse filters,</li>
 *   <li>emits the outlinks on the {@code status} stream (unless disabled) and every
 *       parsed document on the default stream, then acks the tuple.</li>
 * </ul>
 *
 * <p>Documents that are rejected or fail to parse are emitted on the {@code status}
 * stream with {@link Status#ERROR} and the tuple is acked (see
 * {@link #handleException}).
 *
 * <p>Configuration keys read in {@link #prepare}: {@code parser.emitOutlinks},
 * {@code parser.uppercase.element.names}, {@code parser.extract.embedded},
 * {@code parser.htmlmapper.classname}, {@code parser.mimetype.whitelist},
 * {@code protocol.md.prefix} and {@code parser.tika.config.file}.
 */
public class ParserBolt
extends BaseRichBolt {
    private static final Logger LOG = LoggerFactory.getLogger(ParserBolt.class);

    /** Name of the stream on which outlinks and errors are reported. */
    private static final String STATUS_STREAM = "status";

    private Tika tika;
    private URLFilters urlFilters = null;
    private ParseFilter parseFilters = null;
    private OutputCollector collector;
    private MultiCountMetric eventCounter;
    private boolean upperCaseElementNames = true;
    private Class<? extends HtmlMapper> htmlMapperClass = IdentityHtmlMapper.class;
    private boolean extractEmbedded = false;
    private MetadataTransfer metadataTransfer;
    private boolean emitOutlinks = true;
    /** Whitelisted mime-type regular expressions, compiled once in {@link #prepare}. */
    private List<Pattern> mimeTypeWhiteList = new ArrayList<>();
    private String protocolMDprefix;

    /**
     * Reads the configuration and initialises the filters, the Tika instance, the
     * metric counter and the metadata transfer helper.
     *
     * @param conf topology configuration
     * @param context topology context, used to register the event counter metric
     * @param collector collector used to emit and ack tuples
     * @throws RuntimeException if the configured HtmlMapper class cannot be loaded
     * @throws java.util.regex.PatternSyntaxException if an entry of
     *     {@code parser.mimetype.whitelist} is not a valid regular expression
     */
    public void prepare(@NotNull Map<String, Object> conf, @NotNull TopologyContext context, @NotNull OutputCollector collector) {
        this.emitOutlinks = ConfUtils.getBoolean(conf, "parser.emitOutlinks", true);
        this.urlFilters = URLFilters.fromConf(conf);
        this.parseFilters = ParseFilters.fromConf(conf);
        this.upperCaseElementNames = ConfUtils.getBoolean(conf, "parser.uppercase.element.names", true);
        this.extractEmbedded = ConfUtils.getBoolean(conf, "parser.extract.embedded", false);

        String htmlmapperClassName = ConfUtils.getString(conf, "parser.htmlmapper.classname", "org.apache.tika.parser.html.IdentityHtmlMapper");
        try {
            this.htmlMapperClass = InitialisationUtil.getClassFor(htmlmapperClassName, HtmlMapper.class, new Class<?>[0]);
        } catch (RuntimeException e) {
            LOG.error("Can't load class {}", htmlmapperClassName);
            throw e;
        }

        // Compile the whitelist once here instead of calling String.matches()
        // (which recompiles the regex) for every entry on every tuple.
        List<Pattern> compiledWhiteList = new ArrayList<>();
        for (String regex : ConfUtils.loadListFromConf("parser.mimetype.whitelist", conf)) {
            compiledWhiteList.add(Pattern.compile(regex));
        }
        this.mimeTypeWhiteList = compiledWhiteList;

        this.protocolMDprefix = ConfUtils.getString(conf, "protocol.md.prefix", "");
        this.tika = this.instantiateTika(conf);
        this.collector = collector;
        this.eventCounter = context.registerMetric(getClass().getSimpleName(), new MultiCountMetric(), 10);
        this.metadataTransfer = MetadataTransfer.getInstance(conf);
    }

    /**
     * Parses the document carried by the tuple and emits the result.
     *
     * <p>The tuple is always acked: either after the parsed document(s) have been
     * emitted, or from {@link #handleException} when the document is skipped (mime-type
     * not whitelisted, trimmed content), has a malformed URL, or fails during parsing
     * or parse filtering.
     *
     * @param tuple input tuple with the fields {@code url}, {@code content} and
     *     {@code metadata}
     */
    public void execute(Tuple tuple) {
        this.eventCounter.scope("tuple_in").incrBy(1L);

        byte[] content = tuple.getBinaryByField("content");
        String url = tuple.getStringByField("url");
        Metadata metadata = (Metadata) tuple.getValueByField("metadata");

        if (!this.isMimeTypeAccepted(metadata)) {
            this.handleException(url, null, metadata, tuple, "content type");
            return;
        }

        // Content flagged as trimmed by the protocol layer is not parsed.
        if ("true".equalsIgnoreCase(metadata.getFirstValue("http.trimmed", this.protocolMDprefix))) {
            this.handleException(url, null, metadata, tuple, "skipped_trimmed");
            return;
        }

        long start = System.currentTimeMillis();

        // Give Tika the hints it can use for type detection: the content type
        // returned by the protocol and the file part of the URL.
        org.apache.tika.metadata.Metadata md = new org.apache.tika.metadata.Metadata();
        String httpCT = metadata.getFirstValue("content-type", this.protocolMDprefix);
        if (StringUtils.isNotBlank(httpCT)) {
            md.set("Content-Type", httpCT);
        }
        try {
            md.set("resourceName", new URL(url).getFile());
        } catch (MalformedURLException e) {
            // Report the document as an error instead of throwing: an exception
            // escaping execute() would take the worker down for one bad tuple.
            this.handleException(url, e, metadata, tuple, "malformed url");
            return;
        }

        LinkContentHandler linkHandler = new LinkContentHandler();
        BodyContentHandler textHandler = new BodyContentHandler(-1);

        ParseContext parseContext = new ParseContext();
        if (this.extractEmbedded) {
            parseContext.set(Parser.class, this.tika.getParser());
        }
        try {
            HtmlMapper mapper = (HtmlMapper) InitialisationUtil.initializeFromClass(this.htmlMapperClass);
            parseContext.set(HtmlMapper.class, mapper);
        } catch (Exception e) {
            // Best effort: the parse proceeds without a custom mapper.
            LOG.error("Exception while specifying HTMLMapper {}", url, e);
        }

        // A DOM is only built when at least one parse filter asks for it.
        DocumentFragment root = null;
        ContentHandler handler;
        if (this.parseFilters.needsDOM()) {
            HTMLDocumentImpl doc = new HTMLDocumentImpl();
            doc.setErrorChecking(false);
            root = doc.createDocumentFragment();
            DOMBuilder domhandler = new DOMBuilder(doc, root);
            domhandler.setUpperCaseElementNames(this.upperCaseElementNames);
            domhandler.setDefaultNamespaceURI("http://www.w3.org/1999/xhtml");
            handler = new TeeContentHandler(linkHandler, textHandler, domhandler);
        } else {
            handler = new TeeContentHandler(linkHandler, textHandler);
        }

        String text;
        // The stream is opened inside the guarded region so that any failure,
        // including a missing content payload, is reported as a parse error.
        try (ByteArrayInputStream bais = new ByteArrayInputStream(content)) {
            this.tika.getParser().parse(bais, handler, md, parseContext);
            text = textHandler.toString();
        } catch (Throwable e) {
            // Throwable on purpose: Errors raised while parsing are also reported
            // as a failed document rather than propagated.
            this.handleException(url, e, metadata, tuple, "parse error");
            return;
        }

        // Expose everything Tika found under the "parse." prefix.
        for (String name : md.names()) {
            metadata.setValues("parse." + name, md.getValues(name));
        }

        long duration = System.currentTimeMillis() - start;
        LOG.info("Parsed {} in {} msec", url, duration);

        List<Outlink> outlinks = this.toOutlinks(url, linkHandler.getLinks(), metadata);
        ParseResult parse = new ParseResult(outlinks);
        ParseData parseData = parse.get(url);
        parseData.setMetadata(metadata);
        parseData.setText(text);
        parseData.setContent(content);

        try {
            this.parseFilters.filter(url, content, root, parse);
        } catch (RuntimeException e) {
            this.handleException(url, e, metadata, tuple, "parse filters");
            return;
        }

        if (this.emitOutlinks) {
            for (Outlink outlink : parse.getOutlinks()) {
                this.collector.emit(STATUS_STREAM, tuple, new Values(outlink.getTargetURL(), outlink.getMetadata(), Status.DISCOVERED));
            }
        }

        // The parse result may hold more than one document; emit each of them.
        for (Map.Entry<String, ParseData> doc : parse) {
            ParseData parseDoc = doc.getValue();
            this.collector.emit(tuple, new Values(doc.getKey(), parseDoc.getContent(), parseDoc.getMetadata(), parseDoc.getText()));
        }

        this.collector.ack(tuple);
        this.eventCounter.scope("tuple_success").incrBy(1L);
    }

    /**
     * Checks the document mime-type against the configured whitelist.
     *
     * <p>The mime-type is read from {@code parse.Content-Type} and, if absent, from the
     * protocol {@code content-type}. Each whitelist entry must match the whole value.
     *
     * @param metadata metadata of the document being processed
     * @return {@code true} if no whitelist is configured or the mime-type matches one
     *     of its entries; {@code false} if it matches none or is unknown
     */
    private boolean isMimeTypeAccepted(Metadata metadata) {
        if (this.mimeTypeWhiteList.isEmpty()) {
            return true;
        }
        String mimeType = metadata.getFirstValue("parse.Content-Type");
        if (mimeType == null) {
            mimeType = metadata.getFirstValue("content-type", this.protocolMDprefix);
        }
        if (mimeType == null) {
            return false;
        }
        for (Pattern allowed : this.mimeTypeWhiteList) {
            if (allowed.matcher(mimeType).matches()) {
                return true;
            }
        }
        return false;
    }

    /**
     * Creates the Tika instance, using the configuration file named by
     * {@code parser.tika.config.file} (default {@code tika-config.xml}) when it can be
     * found on the classpath and loaded, and the default Tika configuration otherwise.
     *
     * @param conf topology configuration
     * @return a Tika instance, never {@code null}
     */
    private Tika instantiateTika(Map<String, Object> conf) {
        String tikaConfigFile = ConfUtils.getString(conf, "parser.tika.config.file", "tika-config.xml");
        long start = System.currentTimeMillis();

        ClassLoader classLoader = getClass().getClassLoader();
        URL tikaConfigUrl = classLoader.getResource(tikaConfigFile);

        Tika instance = null;
        if (tikaConfigUrl == null) {
            LOG.error("Tika configuration file {} not found on classpath", tikaConfigFile);
        } else {
            LOG.info("Instantiating Tika using custom configuration {}", tikaConfigUrl);
            try {
                instance = new Tika(new TikaConfig(tikaConfigUrl, classLoader));
            } catch (Exception e) {
                LOG.error("Failed to instantiate Tika using custom configuration {}", tikaConfigUrl, e);
            }
        }

        // Fall back to the defaults when the custom configuration is missing or broken.
        if (instance == null) {
            LOG.info("Instantiating Tika with default configuration");
            instance = new Tika();
        }

        long end = System.currentTimeMillis();
        LOG.debug("Tika loaded in {} msec", end - start);
        return instance;
    }

    /**
     * Reports a document that could not be (or was deliberately not) parsed: logs the
     * problem, records the error source and message in the metadata, emits the URL on
     * the {@code status} stream with {@link Status#ERROR}, acks the tuple and updates
     * the counters.
     *
     * @param url URL of the document
     * @param e cause of the failure, or {@code null} when the document was skipped
     * @param metadata metadata of the document; updated in place
     * @param tuple input tuple, used as anchor and acked
     * @param errorType short description of the failure; used as the error message and,
     *     with spaces replaced by underscores, as part of the counter name
     */
    private void handleException(String url, Throwable e, Metadata metadata, Tuple tuple, String errorType) {
        if (e != null) {
            LOG.error("{} -> {}", errorType, url, e);
        } else {
            LOG.info("{} -> {}", errorType, url);
        }
        metadata.setValue("error.source", "TIKA");
        metadata.setValue("error.message", errorType);
        this.collector.emit(STATUS_STREAM, tuple, new Values(url, metadata, Status.ERROR));
        this.collector.ack(tuple);

        String counterName = "error_" + errorType.replace(" ", "_");
        this.eventCounter.scope(counterName).incrBy(1L);
        this.eventCounter.scope("parse exception").incrBy(1L);
    }

    /**
     * Declares the default stream ({@code url, content, metadata, text}) and the
     * {@code status} stream ({@code url, metadata, status}).
     *
     * @param declarer declarer on which the streams are registered
     */
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("url", "content", "metadata", "text"));
        declarer.declareStream(STATUS_STREAM, new Fields("url", "metadata", "status"));
    }

    /**
     * Converts the links extracted by Tika into outlinks: each link is resolved against
     * the parent URL, passed through the URL filters and given its metadata. Only the
     * first outlink seen for a given target URL is kept.
     *
     * @param parentURL URL of the document the links were found in
     * @param links links extracted by Tika
     * @param parentMetadata metadata of the parent document
     * @return the retained outlinks, in no particular order; empty if the parent URL is
     *     malformed
     */
    private List<Outlink> toOutlinks(String parentURL, List<Link> links, Metadata parentMetadata) {
        URL sourceUrl;
        try {
            sourceUrl = new URL(parentURL);
        } catch (MalformedURLException e) {
            LOG.error("MalformedURLException on {}", parentURL);
            this.eventCounter.scope("error_invalid_source_url").incrBy(1L);
            return new ArrayList<>();
        }

        Map<String, Outlink> outlinks = new HashMap<>();
        for (Link link : links) {
            String uri = link.getUri();
            if (StringUtils.isBlank(uri)) {
                continue;
            }

            String target;
            try {
                target = URLUtil.resolveURL(sourceUrl, uri).toExternalForm();
            } catch (MalformedURLException e) {
                LOG.debug("MalformedURLException on {}", uri);
                this.eventCounter.scope("error_outlink_parsing_" + e.getClass().getSimpleName()).incrBy(1L);
                continue;
            }

            // The filters may rewrite the URL, or return null to discard it.
            if (this.urlFilters != null) {
                target = this.urlFilters.filter(sourceUrl, parentMetadata, target);
                if (target == null) {
                    this.eventCounter.scope("outlink_filtered").incrBy(1L);
                    continue;
                }
            }

            this.eventCounter.scope("outlink_kept").incrBy(1L);
            Outlink outlink = new Outlink(target);
            outlink.setAnchor(link.getText());
            outlink.setMetadata(this.metadataTransfer.getMetaForOutlink(target, parentURL, parentMetadata));
            outlinks.putIfAbsent(target, outlink);
        }
        return new ArrayList<>(outlinks.values());
    }
}

