/*
 * Decompiled with CFR 0.152.
 */
package org.archive.modules.extractor;

import crawlercommons.sitemaps.AbstractSiteMap;
import crawlercommons.sitemaps.SiteMap;
import crawlercommons.sitemaps.SiteMapIndex;
import crawlercommons.sitemaps.SiteMapParser;
import crawlercommons.sitemaps.SiteMapURL;
import crawlercommons.sitemaps.UnknownFormatException;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.Collection;
import java.util.Date;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.io.IOUtils;
import org.archive.modules.CrawlURI;
import org.archive.modules.extractor.ContentExtractor;
import org.archive.modules.extractor.Hop;
import org.archive.modules.extractor.LinkContext;

/**
 * Content extractor that recognises sitemap documents and records the URLs
 * they list as outlinks of the fetched {@link CrawlURI}.
 *
 * <p>A URI is treated as a sitemap when it carries the {@code isSitemap}
 * annotation and was fetched successfully, when it matches the optional
 * {@link #setUrlPattern(String) URL pattern}, or when its content type is XML
 * and the beginning of its content looks like a sitemap or sitemap index.
 */
public class ExtractorSitemap
extends ContentExtractor {
    private static final Logger LOGGER = Logger.getLogger(ExtractorSitemap.class.getName());

    /** Annotation that marks a URI as a declared sitemap. */
    private static final String IS_SITEMAP_ANNOTATION = "isSitemap";

    /** Size threshold (50 MiB) above which a warning is logged for a sitemap. */
    private static final int MAX_SITEMAP_BYTES = 50 * 1024 * 1024;

    /** Outlink limit handed to {@code addRelativeToBase} for sitemap links. */
    private static final int MAX_SITEMAP_LINKS = 50000;

    /** Prefix size requested from the recorder for content sniffing. */
    private static final int SNIFF_PREFIX_LENGTH = 400;

    /** Matches content that starts (after an optional BOM) with an XML declaration. */
    private static final Pattern XML_PREAMBLE = Pattern.compile("(?is)[\\ufeff]?<\\?xml\\s.*");

    /** Matches content that contains a sitemap root element. */
    private static final Pattern SITEMAP_ROOT = Pattern.compile("(?is).*(?:<urlset|<sitemapindex[>\\s]).*");

    /** Optional regular expression; URIs fully matching it are treated as sitemaps. */
    private String urlPattern = null;

    /** When true, the parser is created in non-strict mode. */
    private boolean enableLenientExtraction = false;

    /**
     * Decides whether the given URI should be parsed as a sitemap.
     *
     * @param uri the fetched URI to inspect
     * @return {@code true} if the URI is annotated as a sitemap and is a 2XX
     *         success, matches the configured URL pattern, or has an XML
     *         content type and sitemap-looking content; {@code false} otherwise
     */
    @Override
    protected boolean shouldExtract(CrawlURI uri) {
        if (uri.getAnnotations().contains(IS_SITEMAP_ANNOTATION)) {
            if (uri.is2XXSuccess()) {
                LOGGER.fine("This url (" + uri + ") is declared to be a sitemap (via robots.txt) and is a HTTP 200.");
                return true;
            }
            // A failed fetch does not short-circuit: the checks below still apply.
            LOGGER.fine("This url (" + uri + ") is declared to be a sitemap (via robots.txt) but is a HTTP " + uri.getFetchStatus() + ".");
        }

        // The pattern is user-supplied, so it is evaluated as given on every call.
        if (this.urlPattern != null && uri.getURI().matches(this.urlPattern)) {
            return true;
        }

        String mimeType = uri.getContentType();
        if (mimeType == null) {
            return false;
        }
        // Locale.ROOT keeps the comparison stable regardless of the JVM default
        // locale (e.g. Turkish casing rules would break "APPLICATION/XML").
        String normalizedType = mimeType.toLowerCase(Locale.ROOT);
        if (!normalizedType.startsWith("text/xml") && !normalizedType.startsWith("application/xml")) {
            return false;
        }

        String contentStartingChunk = uri.getRecorder().getContentReplayPrefixString(SNIFF_PREFIX_LENGTH);
        if (XML_PREAMBLE.matcher(contentStartingChunk).matches()
                && SITEMAP_ROOT.matcher(contentStartingChunk).matches()) {
            LOGGER.info("Based on content sniffing, this is a sitemap: " + uri);
            return true;
        }
        return false;
    }

    /**
     * Parses the sitemap behind {@code uri} and records every entry as an
     * outlink. Entries of a sitemap index are additionally annotated as
     * sitemaps themselves.
     *
     * @param uri the URI whose recorded content is parsed
     * @return always {@code false}
     */
    @Override
    protected boolean innerExtract(CrawlURI uri) {
        AbstractSiteMap sitemap = this.parseSiteMap(uri);
        if (sitemap == null) {
            // Parsing failed; the failure has already been logged.
            return false;
        }

        if (sitemap.isIndex()) {
            for (AbstractSiteMap child : ((SiteMapIndex) sitemap).getSitemaps()) {
                if (child != null) {
                    this.recordOutlink(uri, child.getUrl(), child.getLastModified(), true);
                }
            }
        } else {
            for (SiteMapURL entry : ((SiteMap) sitemap).getSiteMapUrls()) {
                if (entry != null) {
                    this.recordOutlink(uri, entry.getUrl(), entry.getLastModified(), false);
                }
            }
        }
        return false;
    }

    /**
     * Reads the recorded content of {@code uri} fully into memory and hands it
     * to the sitemap parser.
     *
     * @param uri the URI whose recorded content is parsed
     * @return the parsed sitemap, or {@code null} if reading or parsing failed
     */
    private AbstractSiteMap parseSiteMap(CrawlURI uri) {
        // Arguments are presumably (strict, allowPartial) — confirm against the
        // crawler-commons version in use.
        SiteMapParser parser = new SiteMapParser(!this.isEnableLenientExtraction(), true);

        // try-with-resources ensures the replay stream is closed on every path.
        try (InputStream in = uri.getRecorder().getContentReplayInputStream()) {
            byte[] content = IOUtils.toByteArray(in);
            if (content.length > MAX_SITEMAP_BYTES) {
                // Oversized sitemaps are only reported; they are still parsed.
                LOGGER.warning("Found sitemap exceeding 50MB " + uri + " " + content.length);
            }
            return parser.parseSiteMap(content, new URL(uri.getURI()));
        }
        catch (IOException e) {
            LOGGER.log(Level.WARNING, "I/O Exception when parsing sitemap " + uri, e);
        }
        catch (UnknownFormatException e) {
            LOGGER.log(Level.WARNING, "UnknownFormatException when parsing sitemap " + uri, e);
        }
        return null;
    }

    /**
     * Adds {@code newUri} as an outlink of {@code curi} and counts it.
     *
     * @param curi the sitemap URI the link was found in
     * @param newUri the discovered URL; entries without a URL are skipped
     * @param lastModified last-modified date of the entry, used for logging only
     * @param isSitemap whether the new link should be annotated as a sitemap
     */
    private void recordOutlink(CrawlURI curi, URL newUri, Date lastModified, boolean isSitemap) {
        if (newUri == null) {
            // Without this guard both the add call and the error handler below
            // would fail with a NullPointerException.
            LOGGER.fine("Skipping sitemap entry without a URL from " + curi);
            return;
        }

        String target = newUri.toString();
        try {
            CrawlURI newCuri = ExtractorSitemap.addRelativeToBase(curi, MAX_SITEMAP_LINKS, target, LinkContext.MANIFEST_MISC, Hop.MANIFEST);
            if (newCuri == null) {
                return;
            }
            if (isSitemap) {
                newCuri.getAnnotations().add(IS_SITEMAP_ANNOTATION);
            }
            // This runs once per link, so avoid building the message when FINE is off.
            if (LOGGER.isLoggable(Level.FINE)) {
                LOGGER.fine("Found " + newUri + " from " + curi + " Dated " + lastModified + " and with isSitemap = " + isSitemap);
            }
            this.numberOfLinksExtracted.incrementAndGet();
        }
        catch (URIException e) {
            this.logUriError(e, curi.getUURI(), target);
        }
    }

    /**
     * @return the regular expression used to recognise sitemap URLs, or
     *         {@code null} if none is configured
     */
    public String getUrlPattern() {
        return this.urlPattern;
    }

    /**
     * @param urlPattern regular expression a URI must fully match to be treated
     *        as a sitemap; {@code null} disables the check
     */
    public void setUrlPattern(String urlPattern) {
        this.urlPattern = urlPattern;
    }

    /**
     * @return whether the sitemap parser is created in non-strict mode
     */
    public boolean isEnableLenientExtraction() {
        return this.enableLenientExtraction;
    }

    /**
     * @param enableLenientExtraction {@code true} to create the sitemap parser
     *        in non-strict mode
     */
    public void setEnableLenientExtraction(boolean enableLenientExtraction) {
        this.enableLenientExtraction = enableLenientExtraction;
    }
}

