/*
 * Decompiled with CFR 0.152.
 */
package org.mule.extension.webcrawler.internal.crawler.mule;

import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Queue;
import java.util.Set;
import org.json.JSONArray;
import org.json.JSONObject;
import org.jsoup.nodes.Document;
import org.mule.extension.webcrawler.internal.connection.WebCrawlerConnection;
import org.mule.extension.webcrawler.internal.constant.Constants;
import org.mule.extension.webcrawler.internal.crawler.Crawler;
import org.mule.extension.webcrawler.internal.helper.page.PageHelper;
import org.mule.extension.webcrawler.internal.util.Utils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Depth-limited {@link Crawler} that walks pages starting at {@code rootURL}.
 *
 * <p>Three traversal styles are offered:
 * <ul>
 *   <li>{@link #crawl()} - recursive walk that saves each page's content and returns a tree of
 *       {@link Crawler.SiteNode}s;</li>
 *   <li>{@link #map()} - recursive walk that only records the link structure;</li>
 *   <li>{@link #documentIterator()} - queue-driven walk that hands out one fetched
 *       {@link Document} per call.</li>
 * </ul>
 *
 * <p>The visited-link bookkeeping lives in fields inherited from {@link Crawler} and is reset at
 * the start of every traversal, so a single instance should not run two traversals at once.
 */
public class MuleCrawler extends Crawler {
    private static final Logger LOGGER = LoggerFactory.getLogger(MuleCrawler.class);
    private static final String CRAWLED_IMAGES_FOLDER = "images/";
    private static final String CRAWLED_DOCUMENTS_FOLDER = "docs/";
    /** Third {@link Crawler.SiteNode} argument used by {@code crawl} for pages whose content was already saved. */
    private static final String DUPLICATE_MARKER = "Duplicate.";

    /**
     * Creates a crawler; every argument is forwarded unchanged to the {@link Crawler} constructor.
     */
    public MuleCrawler(WebCrawlerConnection connection, String originalUrl, int maxDepth, boolean restrictToPath, int delayMillis, boolean enforceRobotsTxt, boolean downloadImages, int maxImageNumber, boolean downloadDocuments, int maxDocumentNumber, String downloadPath, List<String> contentTags, Constants.OutputFormat outputFormat, boolean getMetaTags, Constants.RegexUrlsFilterLogic regexUrlsFilterLogic, List<String> regexUrls) {
        super(connection, originalUrl, maxDepth, restrictToPath, delayMillis, enforceRobotsTxt, downloadImages, maxImageNumber, downloadDocuments, maxDocumentNumber, downloadPath, contentTags, outputFormat, getMetaTags, regexUrlsFilterLogic, regexUrls);
    }

    /**
     * Crawls from {@code rootURL}, saving the content of every page that is reached.
     *
     * @return the root node of the crawled tree, or {@code null} when the root URL is skipped or
     *         fails to load
     */
    @Override
    public Crawler.SiteNode crawl() {
        this.resetVisitedState();
        return this.crawl(this.rootURL, 0, this.connection.getReferrer());
    }

    /**
     * Crawls a single URL and, while below {@code maxDepth}, recurses into its links.
     *
     * @param url          page to fetch
     * @param currentDepth depth of {@code url} relative to the root (root is 0)
     * @param referrer     referrer passed to the page fetch
     * @return the node for {@code url}, or {@code null} when the URL is skipped, was already
     *         visited at this depth, or processing it threw
     */
    private Crawler.SiteNode crawl(String url, int currentDepth, String referrer) {
        if (this.shouldSkip(url, currentDepth)) {
            return null;
        }
        Set<String> visitedAtDepth = this.visitedAtDepth(currentDepth);
        if (visitedAtDepth.contains(url)) {
            return null;
        }
        try {
            Utils.addDelay(this.delayMillis);
            visitedAtDepth.add(url);
            // The page is fetched even for globally known URLs because its links are still needed.
            Document document = PageHelper.getDocument(this.connection, url, referrer);
            Crawler.SiteNode siteNode;
            if (this.visitedLinksGlobal.add(url)) {
                // First time this URL is seen in this crawl: extract and persist its content.
                // The (Object) casts pin the JSONObject.put(String, Object) overload.
                JSONObject pageData = new JSONObject();
                LOGGER.debug("Fetching content for : {}", url);
                String title = document.title();
                pageData.put("url", (Object)url);
                pageData.put("title", (Object)title);
                if (this.downloadImages) {
                    LOGGER.debug("Downloading images for : {}", url);
                    pageData.put("imageFiles", (Object)PageHelper.downloadWebsiteImages(document, this.downloadPath, CRAWLED_IMAGES_FOLDER, this.maxImageNumber));
                }
                if (this.downloadDocuments) {
                    LOGGER.debug("Downloading documents for : {}", url);
                    pageData.put("documentFiles", (Object)PageHelper.downloadFiles(document, this.downloadPath, CRAWLED_DOCUMENTS_FOLDER, this.maxDocumentNumber));
                }
                if (this.getMetaTags) {
                    JSONArray pageMetaTags = PageHelper.getPageMetaTags(document);
                    pageData.put("metaTags", (Object)pageMetaTags);
                }
                pageData.put("content", (Object)PageHelper.getPageContent(document, this.contentTags, this.outputFormat));
                String filename = PageHelper.savePageContents(pageData, this.downloadPath, title);
                siteNode = new Crawler.SiteNode(url, currentDepth, filename);
            } else {
                siteNode = new Crawler.SiteNode(url, currentDepth, DUPLICATE_MARKER);
            }
            // Children of a page at maxDepth would be rejected by the depth guard straight away,
            // so link extraction is skipped there; this also keeps a link-extraction failure from
            // discarding a page that has already been saved.
            if (currentDepth < this.maxDepth) {
                Set<String> links = this.getPageLinks(document);
                if (links != null) {
                    for (String childURL : links) {
                        Crawler.SiteNode childSiteNode = this.crawl(childURL, currentDepth + 1, url);
                        if (childSiteNode != null) {
                            siteNode.addChild(childSiteNode);
                        }
                    }
                }
            }
            return siteNode;
        }
        catch (Exception e) {
            // Best effort: one failing page must not abort the whole crawl.
            MuleCrawler.logFailure("crawl", url, e);
            return null;
        }
    }

    /**
     * Builds the link structure reachable from {@code rootURL} without saving page content.
     *
     * @return the root node of the site map, or {@code null} when the root URL is skipped or
     *         fails to load
     */
    @Override
    public Crawler.SiteNode map() {
        this.resetVisitedState();
        return this.map(this.rootURL, 0, this.connection.getReferrer());
    }

    /**
     * Maps a single URL. Links found on a page below {@code maxDepth} are mapped recursively;
     * links found on a page at {@code maxDepth} are attached as leaf nodes without being fetched.
     *
     * @param url          page to fetch
     * @param currentDepth depth of {@code url} relative to the root (root is 0)
     * @param referrer     referrer passed to the page fetch and stored on the created node
     * @return the node for {@code url}, or {@code null} when the URL is skipped, was already
     *         visited at this depth, or processing it threw
     */
    private Crawler.SiteNode map(String url, int currentDepth, String referrer) {
        if (this.shouldSkip(url, currentDepth)) {
            return null;
        }
        Set<String> visitedAtDepth = this.visitedAtDepth(currentDepth);
        if (visitedAtDepth.contains(url)) {
            return null;
        }
        try {
            Utils.addDelay(this.delayMillis);
            visitedAtDepth.add(url);
            Document document = PageHelper.getDocument(this.connection, url, referrer);
            Crawler.SiteNode node = new Crawler.SiteNode(url, currentDepth, referrer);
            LOGGER.debug("Found url: {}", url);
            Set<String> links = this.getPageLinks(document);
            if (links != null) {
                LOGGER.debug("Found {} links on page: {}", links.size(), url);
                boolean recurse = currentDepth < this.maxDepth;
                for (String childURL : links) {
                    Crawler.SiteNode childNode = recurse
                            ? this.map(childURL, currentDepth + 1, url)
                            : new Crawler.SiteNode(childURL, currentDepth + 1, url);
                    if (childNode != null) {
                        node.addChild(childNode);
                    }
                }
            }
            return node;
        }
        catch (Exception e) {
            // Best effort: one failing page must not abort the whole mapping run.
            MuleCrawler.logFailure("map", url, e);
            return null;
        }
    }

    /** Clears the global and per-depth visited-link bookkeeping before a new traversal. */
    private void resetVisitedState() {
        this.visitedLinksGlobal = new HashSet<>();
        this.visitedLinksByDepth = new HashMap<>();
    }

    /**
     * Applies the guards shared by {@code crawl} and {@code map}: depth limit, robots.txt and
     * path restriction.
     *
     * @return {@code true} when {@code url} must not be processed
     */
    private boolean shouldSkip(String url, int currentDepth) {
        if (currentDepth > this.maxDepth) {
            return true;
        }
        if (this.enforceRobotsTxt && !PageHelper.canCrawl(url, this.connection.getUserAgent())) {
            LOGGER.debug("SKIPPING due to robots.txt: {}", url);
            return true;
        }
        if (this.restrictToPath && !url.startsWith(this.rootURL)) {
            LOGGER.debug("SKIPPING due to strict crawling: {}", url);
            return true;
        }
        return false;
    }

    /**
     * Returns the set of URLs already visited at {@code depth}, creating it on first use.
     */
    @SuppressWarnings("unchecked") // the per-depth sets are only ever filled with URL strings by this class
    private Set<String> visitedAtDepth(int depth) {
        return (Set<String>)this.visitedLinksByDepth.computeIfAbsent(depth, d -> new HashSet<>());
    }

    /**
     * Logs a page-processing failure together with its stack trace.
     */
    private static void logFailure(String action, String url, Exception e) {
        if (e instanceof InterruptedException) {
            // In case the delay helper propagates interruption, keep the flag visible to callers.
            Thread.currentThread().interrupt();
        }
        LOGGER.error("Failed to {} {}: {}", action, url, e.toString(), e);
    }

    /**
     * Collects the links to follow from {@code document}.
     *
     * <p>With {@code restrictToPath} only the "internal" links reported by the page insights are
     * returned; otherwise the "internal", "external" and "iframe" groups are merged. A missing
     * group is treated as empty.
     *
     * @return the links to follow; never {@code null}
     */
    @SuppressWarnings("unchecked") // "links" is read as a map of link-group name to URL set, as before
    private Set<String> getPageLinks(Document document) {
        Constants.PageInsightType insightType = this.restrictToPath
                ? Constants.PageInsightType.INTERNALLINKS
                : Constants.PageInsightType.ALL;
        Map<String, Object> pageInsights = PageHelper.getPageInsights(document, null, insightType, this.regexUrlsFilterLogic, this.regexUrls);
        Map<String, Set<String>> linksMap = pageInsights == null ? null : (Map<String, Set<String>>)pageInsights.get("links");
        if (linksMap == null) {
            return new HashSet<>();
        }
        if (this.restrictToPath) {
            Set<String> internal = linksMap.get("internal");
            return internal != null ? internal : new HashSet<>();
        }
        Set<String> links = new HashSet<>();
        for (String group : new String[]{"internal", "external", "iframe"}) {
            Set<String> groupLinks = linksMap.get(group);
            if (groupLinks != null) {
                links.addAll(groupLinks);
            }
        }
        return links;
    }

    /**
     * Creates an iterator that walks the site starting at {@code rootURL}. Creating the iterator
     * resets the global visited-link set.
     */
    @Override
    public DocumentIterator documentIterator() {
        return new DocumentIterator();
    }

    /**
     * Queue-driven traversal that fetches one page per {@link #next()} call and enqueues the
     * links of each fetched page while below {@code maxDepth}.
     */
    public class DocumentIterator
    extends Crawler.DocumentIterator {
        private final Queue<Crawler.SiteNode> siteNodeQueue;

        /**
         * Seeds the queue with the root URL.
         *
         * @throws IllegalArgumentException if {@code rootURL} is {@code null}
         */
        public DocumentIterator() {
            // NOTE(review): explicit outer-instance argument comes from the decompiler output;
            // confirm against the Crawler.DocumentIterator constructor.
            super(MuleCrawler.this);
            this.siteNodeQueue = new LinkedList<Crawler.SiteNode>();
            MuleCrawler.this.visitedLinksGlobal = new HashSet<>();
            if (MuleCrawler.this.rootURL == null) {
                throw new IllegalArgumentException("Root URL cannot be null.");
            }
            this.siteNodeQueue.add(new Crawler.SiteNode(MuleCrawler.this.rootURL, 0, MuleCrawler.this.connection.getReferrer()));
        }

        /** Returns {@code true} while queued URLs remain (including ones that will be skipped). */
        @Override
        public boolean hasNext() {
            return !this.siteNodeQueue.isEmpty();
        }

        /**
         * Takes the next queued URL and fetches it.
         *
         * @return the fetched document, or {@code null} when the URL was already visited or is
         *         disallowed by robots.txt; callers must be prepared for {@code null}
         * @throws NoSuchElementException if the queue is empty
         * @throws RuntimeException       wrapping the {@link IOException} of a failed fetch
         */
        @Override
        public Document next() {
            if (!this.hasNext()) {
                throw new NoSuchElementException("No more documents to iterate.");
            }
            Crawler.SiteNode currentNode = this.siteNodeQueue.poll();
            String currentUrl = currentNode.getUrl();
            if (MuleCrawler.this.visitedLinksGlobal.contains(currentUrl)) {
                LOGGER.debug("SKIPPING {} since already visited.", currentUrl);
                return null;
            }
            if (MuleCrawler.this.enforceRobotsTxt && !PageHelper.canCrawl(currentUrl, MuleCrawler.this.connection.getUserAgent())) {
                LOGGER.debug("SKIPPING {} due to robots.txt.", currentUrl);
                return null;
            }
            Document document;
            try {
                document = PageHelper.getDocument(MuleCrawler.this.connection, currentUrl, currentNode.getReferrer());
            }
            catch (IOException e) {
                throw new RuntimeException("Failed to fetch " + currentUrl, e);
            }
            int childDepth = currentNode.getCurrentDepth() + 1;
            if (childDepth <= MuleCrawler.this.maxDepth) {
                Set<String> links = MuleCrawler.this.getPageLinks(document);
                if (links != null && !links.isEmpty()) {
                    LOGGER.debug("Found {} links on page: {}", links.size(), currentUrl);
                    for (String childURL : links) {
                        this.siteNodeQueue.add(new Crawler.SiteNode(childURL, childDepth, currentUrl));
                    }
                }
            }
            // Marked visited only after a successful fetch, so a failed URL may be retried if queued again.
            MuleCrawler.this.visitedLinksGlobal.add(currentUrl);
            return document;
        }
    }
}

