/*
 * Decompiled with CFR 0.152.
 */
package org.mule.extension.webcrawler.internal.crawler.mule;

import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Queue;
import java.util.Set;
import org.json.JSONArray;
import org.json.JSONObject;
import org.jsoup.nodes.Document;
import org.mule.extension.webcrawler.internal.config.PageLoadOptions;
import org.mule.extension.webcrawler.internal.config.WebCrawlerConfiguration;
import org.mule.extension.webcrawler.internal.connection.WebCrawlerConnection;
import org.mule.extension.webcrawler.internal.constant.Constants;
import org.mule.extension.webcrawler.internal.crawler.Crawler;
import org.mule.extension.webcrawler.internal.helper.page.PageHelper;
import org.mule.extension.webcrawler.internal.util.URLUtils;
import org.mule.extension.webcrawler.internal.util.Utils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class MuleCrawler
extends Crawler {
    private static final Logger LOGGER = LoggerFactory.getLogger(MuleCrawler.class);
    private static final String CRAWLED_IMAGES_FOLDER = "images/";
    private static final String CRAWLED_DOCUMENTS_FOLDER = "docs/";

    public MuleCrawler(WebCrawlerConfiguration configuration, WebCrawlerConnection connection, String originalUrl, Long waitOnPageLoad, String waitForXPath, boolean extractShadowDom, String shadowHostXPath, int maxDepth, boolean restrictToPath, boolean downloadImages, int maxImageNumber, boolean downloadDocuments, int maxDocumentNumber, String downloadPath, List<String> contentTags, Constants.OutputFormat outputFormat, boolean getMetaTags, Constants.RegexUrlsFilterLogic regexUrlsFilterLogic, List<String> regexUrls) {
        super(configuration, connection, originalUrl, waitOnPageLoad, waitForXPath, extractShadowDom, shadowHostXPath, maxDepth, restrictToPath, downloadImages, maxImageNumber, downloadDocuments, maxDocumentNumber, downloadPath, contentTags, outputFormat, getMetaTags, regexUrlsFilterLogic, regexUrls);
    }

    @Override
    public Crawler.SiteNode crawl() {
        String rootURLCleaned = URLUtils.removeFragment(this.rootURL);
        this.visitedLinksGlobal = new HashSet();
        this.visitedLinksByDepth = new HashMap();
        return this.crawl(rootURLCleaned, 0, this.connection.getReferrer());
    }

    private Crawler.SiteNode crawl(String url, int currentDepth, String referrer) {
        if (this.configuration.getCrawlerOptions().isEnforceRobotsTxt() && !PageHelper.canCrawl(url, this.connection.getUserAgent())) {
            LOGGER.debug("SKIPPING due to robots.txt: " + url);
            return null;
        }
        try {
            Set<String> links;
            Utils.addDelay(this.configuration.getCrawlerOptions().getDelayMillis());
            Crawler.SiteNode siteNode = null;
            this.visitedLinksByDepth.put(url, currentDepth);
            Document document = PageHelper.getDocument(this.configuration, this.connection, url, referrer, new PageLoadOptions(this.waitOnPageLoad, this.waitForXPath, this.extractShadowDom, this.shadowHostXPath));
            if (!this.visitedLinksGlobal.contains(url)) {
                this.visitedLinksGlobal.add(url);
                JSONObject pageData = new JSONObject();
                LOGGER.debug("Fetching content for : " + url);
                String title = document.title();
                pageData.put("url", (Object)url);
                pageData.put("title", (Object)title);
                if (this.downloadImages) {
                    LOGGER.debug("Downloading images for : " + url);
                    pageData.put("imageFiles", (Object)PageHelper.downloadWebsiteImages(document, this.downloadPath, CRAWLED_IMAGES_FOLDER, this.maxImageNumber));
                }
                if (this.downloadDocuments) {
                    LOGGER.debug("Downloading documents for : " + url);
                    pageData.put("documentFiles", (Object)PageHelper.downloadFiles(document, this.downloadPath, CRAWLED_DOCUMENTS_FOLDER, this.maxDocumentNumber));
                }
                if (this.getMetaTags) {
                    JSONArray pageMetaTags = PageHelper.getPageMetaTags(document);
                    pageData.put("metaTags", (Object)pageMetaTags);
                }
                pageData.put("content", (Object)PageHelper.getPageContent(document, this.contentTags, this.outputFormat));
                String filename = PageHelper.savePageContents(pageData, this.downloadPath, title);
                siteNode = new Crawler.SiteNode(url, currentDepth, filename);
            } else {
                siteNode = new Crawler.SiteNode(url, currentDepth, "already visited");
            }
            if (currentDepth < this.maxDepth && (links = this.getPageLinks(document)) != null) {
                LOGGER.debug(String.format("Found %d links on page: %s", links.size(), url));
                for (String childURL : links) {
                    Crawler.SiteNode childNode;
                    String childURLCleaned = URLUtils.removeFragment(childURL);
                    if (this.visitedLinksByDepth.containsKey(childURLCleaned) && (Integer)this.visitedLinksByDepth.get(childURLCleaned) <= currentDepth + 1 || (childNode = this.crawl(childURLCleaned, currentDepth + 1, url)) == null) continue;
                    siteNode.addChild(childNode);
                }
            }
            return siteNode;
        }
        catch (Exception e) {
            LOGGER.error(e.toString());
            return null;
        }
    }

    @Override
    public Crawler.SiteNode map() {
        String rootURLCleaned = URLUtils.removeFragment(this.rootURL);
        this.visitedLinksByDepth = new HashMap();
        return this.map(rootURLCleaned, 0, this.connection.getReferrer());
    }

    private Crawler.SiteNode map(String url, int currentDepth, String referrer) {
        if (this.configuration.getCrawlerOptions().isEnforceRobotsTxt() && !PageHelper.canCrawl(url, this.connection.getUserAgent())) {
            LOGGER.debug("SKIPPING due to robots.txt: " + url);
            return null;
        }
        try {
            Set<String> links;
            Utils.addDelay(this.configuration.getCrawlerOptions().getDelayMillis());
            Crawler.SiteNode node = null;
            this.visitedLinksByDepth.put(url, currentDepth);
            Document document = PageHelper.getDocument(this.configuration, this.connection, url, referrer, new PageLoadOptions(this.waitOnPageLoad, this.waitForXPath, this.extractShadowDom, this.shadowHostXPath));
            node = new Crawler.SiteNode(url, currentDepth, referrer);
            LOGGER.debug("Found url: " + url);
            if (currentDepth <= this.maxDepth && (links = this.getPageLinks(document)) != null) {
                LOGGER.debug(String.format("Found %d links on page: %s", links.size(), url));
                for (String childURL : links) {
                    String childURLCleaned = URLUtils.removeFragment(childURL);
                    if (this.visitedLinksByDepth.containsKey(childURLCleaned) && (Integer)this.visitedLinksByDepth.get(childURLCleaned) <= currentDepth + 1) continue;
                    Crawler.SiteNode childNode = null;
                    childNode = currentDepth < this.maxDepth ? this.map(childURLCleaned, currentDepth + 1, url) : new Crawler.SiteNode(childURLCleaned, currentDepth + 1, url);
                    if (childNode == null) continue;
                    node.addChild(childNode);
                }
            }
            return node;
        }
        catch (Exception e) {
            LOGGER.error(e.toString());
            return null;
        }
    }

    private Set<String> getPageLinks(Document document) {
        Set<String> links = new HashSet<String>();
        if (this.restrictToPath) {
            HashMap<String, Object> pageInsights = PageHelper.getPageInsights(document, null, Constants.PageInsightType.INTERNALLINKS, this.regexUrlsFilterLogic, this.regexUrls);
            Map linksMap = (Map)pageInsights.get("links");
            if (linksMap != null) {
                links = (Set)linksMap.get("internal");
            }
        } else {
            HashMap<String, Object> pageInsights = PageHelper.getPageInsights(document, null, Constants.PageInsightType.ALL, this.regexUrlsFilterLogic, this.regexUrls);
            Map linksMap = (Map)pageInsights.get("links");
            if (linksMap != null) {
                links.addAll((Set)linksMap.get("internal"));
                links.addAll((Set)linksMap.get("external"));
                links.addAll((Set)linksMap.get("iframe"));
            }
        }
        return links;
    }

    @Override
    public DocumentIterator documentIterator() {
        return new DocumentIterator();
    }

    public class DocumentIterator
    extends Crawler.DocumentIterator {
        private final Queue<Crawler.SiteNode> siteNodeQueue;

        public DocumentIterator() {
            super(MuleCrawler.this);
            this.siteNodeQueue = new LinkedList<Crawler.SiteNode>();
            String rootURLCleaned = URLUtils.removeFragment(MuleCrawler.this.rootURL);
            MuleCrawler.this.visitedLinksGlobal = new HashSet();
            MuleCrawler.this.visitedLinksGlobal.add(rootURLCleaned);
            if (rootURLCleaned == null) {
                throw new IllegalArgumentException("Root URL cannot be null.");
            }
            this.siteNodeQueue.add(new Crawler.SiteNode(rootURLCleaned, 0, MuleCrawler.this.connection.getReferrer()));
        }

        @Override
        public boolean hasNext() {
            return !this.siteNodeQueue.isEmpty();
        }

        @Override
        public Document next() {
            if (!this.hasNext()) {
                throw new NoSuchElementException("No more documents to iterate.");
            }
            Crawler.SiteNode currentNode = this.siteNodeQueue.poll();
            Document document = null;
            try {
                Set links;
                if (MuleCrawler.this.configuration.getCrawlerOptions().isEnforceRobotsTxt() && !PageHelper.canCrawl(currentNode.getUrl(), MuleCrawler.this.connection.getUserAgent())) {
                    LOGGER.debug(String.format("SKIPPING %s due to robots.txt.", MuleCrawler.this.rootURL));
                    return null;
                }
                document = PageHelper.getDocument(MuleCrawler.this.configuration, MuleCrawler.this.connection, currentNode.getUrl(), currentNode.getReferrer(), new PageLoadOptions(MuleCrawler.this.waitOnPageLoad, MuleCrawler.this.waitForXPath, MuleCrawler.this.extractShadowDom, MuleCrawler.this.shadowHostXPath));
                if (currentNode.getCurrentDepth() < MuleCrawler.this.maxDepth && (links = MuleCrawler.this.getPageLinks(document)) != null && !links.isEmpty()) {
                    LOGGER.debug(String.format("Found %d links on page: %s", links.size(), currentNode.getUrl()));
                    for (String childURL : links) {
                        String childURLCleaned = URLUtils.removeFragment(childURL);
                        if (MuleCrawler.this.visitedLinksGlobal.contains(childURLCleaned)) continue;
                        MuleCrawler.this.visitedLinksGlobal.add(childURLCleaned);
                        this.siteNodeQueue.add(new Crawler.SiteNode(childURLCleaned, currentNode.getCurrentDepth() + 1, currentNode.getUrl()));
                    }
                }
            }
            catch (IOException e) {
                throw new RuntimeException(e);
            }
            return document;
        }
    }
}

