/*
 * Decompiled with CFR 0.152.
 */
package com.iveely.crawler.worker;

import com.iveely.crawler.common.WildcardMatcher;
import com.iveely.crawler.config.Seed;
import com.iveely.crawler.entity.WebUrl;
import com.iveely.crawler.worker.BloomFilter;
import com.iveely.crawler.worker.IndexRequest;
import com.iveely.crawler.worker.SimpleParser;
import com.iveely.crawler.worker.Worker;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import org.apache.commons.lang3.SerializationUtils;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Crawls a single {@link Seed}: starting from the seed URL it fetches pages with jsoup,
 * collects the outgoing links that pass the parser and the seed's wildcard patterns,
 * and posts every fetched page to {@link IndexRequest}.
 *
 * <p>URLs that have already been seen are tracked in a {@link BloomFilter} which is
 * restored from, and persisted to, {@code conf/filter/<seed name>.bin}.
 *
 * <p>Instances keep mutable, unsynchronized state (queue and filter); each instance is
 * meant to be run by one thread.
 */
public class SeedExecutor
implements Runnable {
    private static final Logger logger = LoggerFactory.getLogger(SeedExecutor.class);

    /** User agent sent when the seed does not configure its own. */
    private static final String DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36";

    /** Timeout in milliseconds passed to jsoup for fetching one page. */
    private static final int FETCH_TIMEOUT_MILLIS = 20000;

    private final Seed seed;
    private List<WebUrl> queued;
    private String userAgent;
    private BloomFilter<String> filter;
    private SimpleParser parser;
    private File filterFile;

    /**
     * Creates an executor for the given seed.
     *
     * @param seed   crawl configuration (name, start URL, depth, patterns, user agent)
     * @param parser decides which links to follow and turns documents into index payloads
     */
    public SeedExecutor(Seed seed, SimpleParser parser) {
        this.seed = seed;
        this.queued = new LinkedList<WebUrl>();
        this.filter = SeedExecutor.newFilter();
        this.parser = parser;
        this.userAgent = DEFAULT_USER_AGENT;
        this.filterFile = new File("conf/filter/" + this.seed.getName() + ".bin");
    }

    /**
     * Runs the complete crawl for this seed: restores the URL filter, crawls until the
     * queue is empty, persists the filter and finally unregisters the seed from
     * {@link Worker}.
     */
    @Override
    public void run() {
        Thread.currentThread().setName("Thread_" + this.seed.getName());
        if (!StringUtils.isBlank((CharSequence)this.seed.getUserAgent())) {
            this.userAgent = this.seed.getUserAgent();
        }
        this.loadFilter();
        this.crawl();
        this.saveFilter();
        Worker.remove(this.seed.getName());
        logger.info("Finish crawl seed:{}", this.seed.getName());
    }

    /**
     * Fetches and parses the page behind {@code webUrl}.
     *
     * @param webUrl the URL to download
     * @return the parsed document, or {@code null} when the download failed with an
     *         {@link IOException}
     */
    protected Document getDocument(WebUrl webUrl) {
        try {
            return Jsoup.connect((String)webUrl.getUrl())
                    .userAgent(this.userAgent)
                    .timeout(FETCH_TIMEOUT_MILLIS)
                    .get();
        }
        catch (IOException e) {
            // Placeholder for the URL plus trailing throwable: both the failing address
            // and the cause end up in the log.
            logger.warn("Get document failed. url={}", webUrl.getUrl(), e);
            return null;
        }
    }

    /**
     * Extracts the links of {@code document} that should be crawled next.
     *
     * <p>Every resolvable link that is not yet in the filter is added to the filter; it
     * is returned only if the parser accepts it and it matches one of the seed patterns.
     *
     * @param webUrl   the page the document was loaded from
     * @param document the parsed page
     * @return {@code null} when the seed's depth limit is reached or the page contains
     *         no {@code <a>} elements; otherwise the (possibly empty) list of new URLs
     */
    protected List<WebUrl> getUrls(WebUrl webUrl, Document document) {
        if (webUrl.getDepth() >= this.seed.getDepth()) {
            return null;
        }
        Elements links = document.getElementsByTag("a");
        if (links.isEmpty()) {
            return null;
        }
        List<WebUrl> accepted = new ArrayList<WebUrl>();
        links.forEach(link -> this.collectLink(webUrl, link.absUrl("href"), accepted));
        return accepted;
    }

    /**
     * Checks the URL against the wildcard patterns configured on the seed.
     *
     * @param url the absolute URL to test
     * @return {@code true} if at least one pattern matches
     */
    protected boolean isMatched(String url) {
        for (String pattern : this.seed.getPatterns()) {
            if (WildcardMatcher.match(url, pattern)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Creates an empty filter.
     * NOTE(review): the arguments are presumably the false-positive rate and the expected
     * element count — confirm against BloomFilter's constructor.
     */
    private static BloomFilter<String> newFilter() {
        return new BloomFilter<>(0.05, 1000);
    }

    /** Restores the filter from {@link #filterFile}; falls back to an empty filter on failure. */
    private void loadFilter() {
        if (!this.filterFile.exists()) {
            return;
        }
        // try-with-resources: the file handle is released even if the object stream
        // cannot be constructed (e.g. truncated or corrupt file).
        // NOTE(review): this is Java-native deserialization; the file is assumed to be
        // written only by this application.
        try (InputStream in = new FileInputStream(this.filterFile)) {
            @SuppressWarnings("unchecked")
            BloomFilter<String> restored = (BloomFilter<String>)SerializationUtils.deserialize(in);
            this.filter = restored;
        }
        catch (Exception ex) {
            this.filter = SeedExecutor.newFilter();
            logger.error("Error happen when deserialize bloom filter", (Throwable)ex);
        }
    }

    /** Processes the queue, starting with the seed URL, until no URL is left. */
    private void crawl() {
        WebUrl seedUrl = new WebUrl(1, this.seed.getUrl());
        // Mark the start page as seen so that pages linking back to it do not re-queue it.
        this.filter.add(seedUrl.getUrl());
        this.queued.add(seedUrl);
        while (!this.queued.isEmpty()) {
            WebUrl webUrl = this.queued.remove(0);
            logger.info("Visit web url =>{} (left:{})", webUrl.getUrl(), this.queued.size());
            try {
                Document document = this.getDocument(webUrl);
                if (document == null) {
                    continue;
                }
                List<WebUrl> urls = this.getUrls(webUrl, document);
                if (urls != null && !urls.isEmpty()) {
                    this.queued.addAll(urls);
                }
                IndexRequest.post(this.parser.get(document, webUrl.getUrl()));
            }
            catch (Exception ex) {
                // One broken page must not abort the whole seed.
                logger.error("Crawl url:" + webUrl.getUrl() + " failed.", (Throwable)ex);
            }
        }
    }

    /** Writes the filter to {@link #filterFile}, creating the target directory if needed. */
    private void saveFilter() {
        try {
            File directory = this.filterFile.getParentFile();
            if (directory != null && !directory.isDirectory() && !directory.mkdirs()) {
                throw new IOException("Cannot create directory " + directory);
            }
            // FileOutputStream creates or truncates the file, so no delete/create is needed.
            try (OutputStream out = new FileOutputStream(this.filterFile)) {
                SerializationUtils.serialize(this.filter, out);
            }
        }
        catch (Exception ex) {
            logger.error("Data persistence error happen.", (Throwable)ex);
        }
    }

    /**
     * Handles one link of a page: skips unresolvable and already seen URLs, records the
     * URL in the filter and appends it to {@code accepted} when it should be visited.
     */
    private void collectLink(WebUrl parent, String url, List<WebUrl> accepted) {
        // absUrl yields an empty string when the href cannot be resolved to an absolute
        // URL; such entries can never be fetched.
        if (StringUtils.isBlank((CharSequence)url) || this.filter.contains(url)) {
            return;
        }
        int childDepth = parent.getDepth() + 1;
        WebUrl childUrl = new WebUrl(childDepth, url);
        // Setters kept from the original code; possibly redundant with the constructor.
        childUrl.setDepth(childDepth);
        childUrl.setUrl(url);
        if (this.parser.shouldVisit(parent, childUrl) && this.isMatched(url)) {
            accepted.add(childUrl);
        }
        this.filter.add(url);
    }
}

