/*
 * Decompiled with CFR 0.152.
 */
package org.mule.extension.webcrawler.internal.crawler.mule;

import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Set;
import org.json.JSONArray;
import org.json.JSONObject;
import org.jsoup.nodes.Document;
import org.mule.extension.webcrawler.internal.config.PageLoadOptions;
import org.mule.extension.webcrawler.internal.config.WebCrawlerConfiguration;
import org.mule.extension.webcrawler.internal.connection.WebCrawlerConnection;
import org.mule.extension.webcrawler.internal.constant.Constants;
import org.mule.extension.webcrawler.internal.crawler.Crawler;
import org.mule.extension.webcrawler.internal.error.WebCrawlerErrorType;
import org.mule.extension.webcrawler.internal.helper.page.PageHelper;
import org.mule.extension.webcrawler.internal.util.URLUtils;
import org.mule.extension.webcrawler.internal.util.Utils;
import org.mule.runtime.extension.api.error.ErrorTypeDefinition;
import org.mule.runtime.extension.api.exception.ModuleException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * {@link Crawler} implementation that walks a site breadth-first: nodes are taken from a FIFO
 * queue seeded with the cleaned root URL, and links found on each page are enqueued until
 * {@code maxDepth} is reached.
 *
 * <p>Three traversal styles are offered:
 * <ul>
 *   <li>{@link #crawl()} loads every page, saves its contents and returns the site tree;</li>
 *   <li>{@link #map()} only builds the site tree, validating leaf URLs without loading them;</li>
 *   <li>{@link #documentIterator()} hands out the loaded pages one at a time.</li>
 * </ul>
 *
 * <p>All traversals use the {@code siteNodeQueue} and {@code visitedLinksGlobal} fields inherited
 * from {@link Crawler}, so traversals started on the same instance share that state.
 */
public class MuleCrawler
extends Crawler {
    private static final Logger LOGGER = LoggerFactory.getLogger(MuleCrawler.class);
    /** Sub-folder argument passed to {@code PageHelper.downloadWebsiteImages}. */
    private static final String CRAWLED_IMAGES_FOLDER = "images/";
    /** Sub-folder argument passed to {@code PageHelper.downloadFiles}. */
    private static final String CRAWLED_DOCUMENTS_FOLDER = "docs/";

    /**
     * Creates a crawler; every argument is forwarded unchanged to the {@link Crawler} constructor.
     */
    public MuleCrawler(WebCrawlerConfiguration configuration, WebCrawlerConnection connection, String originalUrl, Long waitOnPageLoad, String waitForXPath, boolean extractShadowDom, String shadowHostXPath, int maxDepth, boolean restrictToPath, boolean downloadImages, int maxImageNumber, boolean downloadDocuments, int maxDocumentNumber, String downloadPath, List<String> contentTags, Constants.OutputFormat outputFormat, boolean getMetaTags, Constants.RegexUrlsFilterLogic regexUrlsFilterLogic, List<String> regexUrls) {
        super(configuration, connection, originalUrl, waitOnPageLoad, waitForXPath, extractShadowDom, shadowHostXPath, maxDepth, restrictToPath, downloadImages, maxImageNumber, downloadDocuments, maxDocumentNumber, downloadPath, contentTags, outputFormat, getMetaTags, regexUrlsFilterLogic, regexUrls);
    }

    /**
     * Crawls the site starting at the root URL, saving each reachable page via
     * {@code PageHelper.savePageContents} and recording the resulting file name on its node.
     *
     * <p>A failure on any page (the root included) is logged and the crawl moves on to the next
     * queued node; this method does not propagate per-page exceptions.
     *
     * @return the root node of the crawled site tree; children are attached as links are discovered
     */
    @Override
    public Crawler.SiteNode crawl() {
        this.siteNodeQueue = new LinkedList<>();
        this.visitedLinksGlobal = new HashSet<>();
        String rootURLCleaned = URLUtils.cleanURL(this.rootURL);
        Crawler.SiteNode rootNode = new Crawler.SiteNode(rootURLCleaned, 0, this.connection.getReferrer());
        this.siteNodeQueue.add(rootNode);
        this.visitedLinksGlobal.add(rootURLCleaned);
        while (!this.siteNodeQueue.isEmpty()) {
            Crawler.SiteNode currentNode = null;
            try {
                currentNode = (Crawler.SiteNode)this.siteNodeQueue.poll();
                if (this.isDeniedByRobotsTxt(currentNode)) {
                    LOGGER.debug("SKIPPING url due to robots.txt: {}", currentNode.getUrl());
                    continue;
                }
                LOGGER.debug("CRAWLING url: {}", currentNode.getUrl());
                Utils.addDelay(this.configuration.getCrawlerOptions().getDelayMillis());
                Document document = this.loadDocumentFor(currentNode);

                // The (Object) casts pin the JSONObject.put(String, Object) overload.
                JSONObject pageData = new JSONObject();
                pageData.put("url", (Object)currentNode.getUrl());
                pageData.put("title", (Object)document.title());
                if (this.downloadImages) {
                    LOGGER.debug("Downloading images for : {}", currentNode.getUrl());
                    pageData.put("imageFiles", (Object)PageHelper.downloadWebsiteImages(document, this.downloadPath, CRAWLED_IMAGES_FOLDER, this.maxImageNumber));
                }
                if (this.downloadDocuments) {
                    LOGGER.debug("Downloading documents for : {}", currentNode.getUrl());
                    pageData.put("documentFiles", (Object)PageHelper.downloadFiles(document, this.downloadPath, CRAWLED_DOCUMENTS_FOLDER, this.maxDocumentNumber));
                }
                if (this.getMetaTags) {
                    JSONArray pageMetaTags = PageHelper.getPageMetaTags(document);
                    pageData.put("metaTags", (Object)pageMetaTags);
                }
                pageData.put("content", (Object)PageHelper.getPageContent(document, this.contentTags, this.outputFormat));
                String filename = PageHelper.savePageContents(pageData, this.downloadPath, document.title());
                currentNode.setFilename(filename);

                // Nodes at the depth limit are saved but their links are not followed.
                if (currentNode.getCurrentDepth() >= this.maxDepth) {
                    continue;
                }
                Set<String> links = this.getPageLinks(document);
                if (links == null || links.isEmpty()) {
                    continue;
                }
                LOGGER.debug("Found {} links on page: {}", links.size(), currentNode.getUrl());
                for (String childURL : links) {
                    String childURLCleaned = URLUtils.cleanURL(childURL);
                    // add() returns false when the URL was already recorded, i.e. already visited/queued.
                    if (!this.visitedLinksGlobal.add(childURLCleaned)) {
                        continue;
                    }
                    Crawler.SiteNode childNode = new Crawler.SiteNode(childURLCleaned, currentNode.getCurrentDepth() + 1, currentNode.getUrl());
                    this.siteNodeQueue.add(childNode);
                    currentNode.getChildren().add(childNode);
                }
            }
            catch (Exception e) {
                // Best-effort crawl: keep going, but log the throwable so the stack trace is not lost.
                LOGGER.error("Failed to crawl {}", currentNode == null ? null : currentNode.getUrl(), e);
            }
        }
        return rootNode;
    }

    /**
     * Builds the site tree without saving any page contents. Pages above the depth limit are loaded
     * to discover their links; pages exactly at the depth limit are only checked with
     * {@code PageHelper.isURLValid} and attached to their parent when valid.
     *
     * @return the root node of the mapped tree, or {@code null} when {@code maxDepth} is 0 and the
     *         root URL is not valid
     * @throws ModuleException with {@code WEBCRAWLER_OPERATIONS_FAILURE} when processing the root
     *         node fails; failures on deeper nodes are logged and skipped
     */
    @Override
    public Crawler.SiteNode map() {
        this.siteNodeQueue = new LinkedList<>();
        this.visitedLinksGlobal = new HashSet<>();
        String rootURLCleaned = URLUtils.cleanURL(this.rootURL);
        Crawler.SiteNode rootNode = new Crawler.SiteNode(rootURLCleaned, 0, this.connection.getReferrer());
        // Validate before enqueuing so the early return does not leave the root in the shared queue.
        if (this.maxDepth == 0 && !PageHelper.isURLValid(this.configuration, this.connection, rootNode.getUrl(), rootNode.getReferrer())) {
            return null;
        }
        this.siteNodeQueue.add(rootNode);
        this.visitedLinksGlobal.add(rootURLCleaned);
        while (!this.siteNodeQueue.isEmpty()) {
            Crawler.SiteNode currentNode = null;
            try {
                currentNode = (Crawler.SiteNode)this.siteNodeQueue.poll();
                if (this.isDeniedByRobotsTxt(currentNode)) {
                    LOGGER.debug("SKIPPING url due to robots.txt: {}", currentNode.getUrl());
                    continue;
                }
                LOGGER.debug("MAPPING url: {}", currentNode.getUrl());
                Utils.addDelay(this.configuration.getCrawlerOptions().getDelayMillis());

                if (currentNode.getCurrentDepth() >= this.maxDepth) {
                    // Leaf handling: only nodes exactly at the limit are validated and attached.
                    if (currentNode.getCurrentDepth() == this.maxDepth) {
                        if (PageHelper.isURLValid(this.configuration, this.connection, currentNode.getUrl(), currentNode.getReferrer())) {
                            MuleCrawler.attachToParent(currentNode);
                        } else {
                            LOGGER.debug("SKIPPING {} due to invalid URL", currentNode.getUrl());
                        }
                    }
                    continue;
                }

                // Loading the page doubles as validation: the node is attached only if this succeeds.
                Document document = this.loadDocumentFor(currentNode);
                MuleCrawler.attachToParent(currentNode);

                Set<String> links = this.getPageLinks(document);
                if (links == null || links.isEmpty()) {
                    continue;
                }
                LOGGER.debug("Found {} links on page: {}", links.size(), currentNode.getUrl());
                for (String childURL : links) {
                    String childURLCleaned = URLUtils.cleanURL(childURL);
                    if (!this.visitedLinksGlobal.add(childURLCleaned)) {
                        continue;
                    }
                    // Children are attached to the tree later, when they are themselves processed.
                    Crawler.SiteNode childNode = new Crawler.SiteNode(childURLCleaned, currentNode.getCurrentDepth() + 1, currentNode.getUrl(), currentNode);
                    this.siteNodeQueue.add(childNode);
                }
            }
            catch (Exception e) {
                LOGGER.error("Failed to map {}", currentNode == null ? null : currentNode.getUrl(), e);
                // Only a failure on the root node (or before a node was obtained) aborts the mapping.
                if (currentNode != null && currentNode.getCurrentDepth() != 0) {
                    continue;
                }
                throw new ModuleException(e.toString(), (ErrorTypeDefinition)WebCrawlerErrorType.WEBCRAWLER_OPERATIONS_FAILURE, (Throwable)e);
            }
        }
        return rootNode;
    }

    /**
     * Tells whether robots.txt enforcement is enabled and {@code PageHelper.canCrawl} rejects the
     * node's URL for the connection's user agent.
     */
    private boolean isDeniedByRobotsTxt(Crawler.SiteNode node) throws IOException {
        return this.configuration.getCrawlerOptions().isEnforceRobotsTxt()
            && !PageHelper.canCrawl(node.getUrl(), this.connection.getUserAgent());
    }

    /** Loads the page for the given node using this crawler's page-load settings. */
    private Document loadDocumentFor(Crawler.SiteNode node) throws IOException {
        PageLoadOptions loadOptions = new PageLoadOptions(this.waitOnPageLoad, this.waitForXPath, this.extractShadowDom, this.shadowHostXPath);
        return PageHelper.getDocument(this.configuration, this.connection, node.getUrl(), node.getReferrer(), loadOptions);
    }

    /** Adds the node to its parent's children when it has a parent (the root has none). */
    private static void attachToParent(Crawler.SiteNode node) {
        Crawler.SiteNode parentNode = node.getParent();
        if (parentNode != null) {
            parentNode.addChild(node);
        }
    }

    /**
     * Collects the links to follow from a loaded page, requesting only internal links from
     * {@code PageHelper.getPageInsights} when {@code restrictToPath} is set.
     */
    private Set<String> getPageLinks(Document document) {
        Map<String, Object> pageInsights = PageHelper.getPageInsights(document, null, this.restrictToPath ? Constants.PageInsightType.INTERNALLINKS : Constants.PageInsightType.ALL, this.regexUrlsFilterLogic, this.regexUrls);
        return this.getPageLinks(pageInsights);
    }

    /**
     * Extracts the links to follow from a page-insights map.
     *
     * @param pageInsights map expected to hold a {@code "links"} map keyed by link category
     * @return the {@code "internal"} set when {@code restrictToPath} is set, otherwise the union of
     *         the {@code "internal"}, {@code "external"} and {@code "iframe"} sets; never
     *         {@code null}, missing categories are treated as empty
     */
    private Set<String> getPageLinks(Map<String, Object> pageInsights) {
        Set<String> links = new HashSet<String>();
        if (pageInsights == null) {
            return links;
        }
        Map<?, ?> linksMap = (Map<?, ?>)pageInsights.get("links");
        if (linksMap == null) {
            return links;
        }
        if (this.restrictToPath) {
            // Return the provided set itself so its iteration order is kept.
            Set<String> internalLinks = MuleCrawler.linksOfCategory(linksMap, "internal");
            return internalLinks != null ? internalLinks : links;
        }
        for (String category : new String[]{"internal", "external", "iframe"}) {
            Set<String> categoryLinks = MuleCrawler.linksOfCategory(linksMap, category);
            if (categoryLinks != null) {
                links.addAll(categoryLinks);
            }
        }
        return links;
    }

    /**
     * Collects the document links of a page from the {@code "documents"} category of its insights.
     *
     * @return the document links; empty when none are reported
     */
    private Set<String> getDocumentLinks(Document document) {
        Set<String> links = new HashSet<String>();
        Map<String, Object> pageInsights = PageHelper.getPageInsights(document, null, Constants.PageInsightType.DOCUMENTLINKS);
        Map<?, ?> linksMap = pageInsights == null ? null : (Map<?, ?>)pageInsights.get("links");
        if (linksMap != null) {
            Set<String> documentLinks = MuleCrawler.linksOfCategory(linksMap, "documents");
            if (documentLinks != null) {
                links.addAll(documentLinks);
            }
        }
        return links;
    }

    /** Reads one link category from the links map; {@code null} when the category is absent. */
    @SuppressWarnings("unchecked")
    private static Set<String> linksOfCategory(Map<?, ?> linksMap, String category) {
        // Unchecked: the insights map is declared with Object values, as in the original casts.
        return (Set<String>)linksMap.get(category);
    }

    /**
     * Creates an iterator over the documents reachable from the root URL.
     *
     * @return a new iterator seeded with the root URL
     */
    @Override
    public DocumentIterator documentIterator() {
        return new DocumentIterator();
    }

    /**
     * Iterator that loads one queued page per {@link #next()} call and enqueues the page's
     * not-yet-seen links while the depth limit allows it.
     */
    public class DocumentIterator
    extends Crawler.DocumentIterator {
        /**
         * Resets the visited-links set and enqueues the root URL.
         *
         * @throws IllegalArgumentException when the cleaned root URL is {@code null}
         */
        public DocumentIterator() {
            super(MuleCrawler.this);
            String rootURLCleaned = URLUtils.cleanURL(MuleCrawler.this.rootURL);
            // Validate before touching the crawler's shared state.
            if (rootURLCleaned == null) {
                throw new IllegalArgumentException("Root URL cannot be null.");
            }
            MuleCrawler.this.visitedLinksGlobal = new HashSet<>();
            MuleCrawler.this.visitedLinksGlobal.add(rootURLCleaned);
            // NOTE(review): the queue is not initialised in this class outside crawl()/map();
            // guard against it still being null — confirm whether the base class sets it up.
            if (MuleCrawler.this.siteNodeQueue == null) {
                MuleCrawler.this.siteNodeQueue = new LinkedList<>();
            }
            MuleCrawler.this.siteNodeQueue.add(new Crawler.SiteNode(rootURLCleaned, 0, MuleCrawler.this.connection.getReferrer()));
        }

        /** @return {@code true} while there are queued nodes left to load */
        @Override
        public boolean hasNext() {
            return !MuleCrawler.this.siteNodeQueue.isEmpty();
        }

        /**
         * Loads the next queued page.
         *
         * @return the loaded document, or {@code null} when the page is skipped because of
         *         robots.txt
         * @throws NoSuchElementException when the queue is empty
         * @throws RuntimeException wrapping the {@link IOException} raised while loading the page
         */
        @Override
        public Document next() {
            if (!this.hasNext()) {
                throw new NoSuchElementException("No more documents to iterate.");
            }
            Crawler.SiteNode currentNode = (Crawler.SiteNode)MuleCrawler.this.siteNodeQueue.poll();
            try {
                if (MuleCrawler.this.isDeniedByRobotsTxt(currentNode)) {
                    // Report the URL actually skipped, not the crawl's root URL.
                    LOGGER.debug("SKIPPING {} due to robots.txt.", currentNode.getUrl());
                    return null;
                }
                Document document = MuleCrawler.this.loadDocumentFor(currentNode);
                if (currentNode.getCurrentDepth() < MuleCrawler.this.maxDepth) {
                    Set<String> links = MuleCrawler.this.getPageLinks(document);
                    if (links != null && !links.isEmpty()) {
                        LOGGER.debug("Found {} links on page: {}", links.size(), currentNode.getUrl());
                        for (String childURL : links) {
                            String childURLCleaned = URLUtils.cleanURL(childURL);
                            if (!MuleCrawler.this.visitedLinksGlobal.add(childURLCleaned)) {
                                continue;
                            }
                            MuleCrawler.this.siteNodeQueue.add(new Crawler.SiteNode(childURLCleaned, currentNode.getCurrentDepth() + 1, currentNode.getUrl()));
                        }
                    }
                }
                return document;
            }
            catch (IOException e) {
                throw new RuntimeException(e);
            }
        }
    }
}

