/*
 * Decompiled with CFR 0.152.
 */
package org.archive.modules.extractor;

import java.io.IOException;
import java.nio.charset.Charset;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.lang.StringEscapeUtils;
import org.archive.io.ReplayCharSequence;
import org.archive.modules.CrawlURI;
import org.archive.modules.extractor.ContentExtractor;
import org.archive.modules.extractor.Extractor;
import org.archive.modules.extractor.Hop;
import org.archive.modules.extractor.LinkContext;
import org.archive.util.TextUtils;
import org.archive.util.UriUtils;

/**
 * Content extractor that speculatively pulls URI-looking strings out of XML
 * content using a regular expression rather than an XML parser.
 *
 * <p>Candidates are added as outlinks with {@link LinkContext#SPECULATIVE_MISC}
 * and {@link Hop#SPECULATIVE}.
 */
public class ExtractorXML extends ContentExtractor {
    private static final long serialVersionUID = 3L;
    private static final Logger logger = Logger.getLogger(ExtractorXML.class.getName());

    /**
     * Matches a run of characters containing no whitespace, angle brackets,
     * quotes or {@code @}, preceded by a quote or {@code >} and followed by a
     * quote or {@code <}; an optional CDATA wrapper around the run is skipped.
     * Group 1 is the candidate string.
     */
    static final Pattern XML_URI_EXTRACTOR = Pattern.compile("(?s)[\"'>]\\s*(?:<!\\[CDATA\\[)?([^<>\\s'\"@]+)\\s*(?:\\]\\]>)?[\"'<]");

    // Precompiled once; shouldExtract runs per URI and String.matches would
    // recompile each of these on every call.
    private static final Pattern OPENXML_MIME_TYPE = Pattern.compile("(?i)application/vnd.openxmlformats.*");
    private static final Pattern XML_DECLARATION_START = Pattern.compile("(?is)[\\ufeff]?<\\?xml\\s.*");
    private static final Pattern HTML_MARKER = Pattern.compile("(?is).*(?:<!doctype\\s+html|<html[>\\s]).*");

    /**
     * Decides whether this extractor should run on the given URI.
     *
     * <p>Returns {@code true} when any of the following holds, checked in
     * this order:
     * <ol>
     *   <li>the content type contains "xml" (case-insensitively) and is not
     *       an {@code application/vnd.openxmlformats...} type;</li>
     *   <li>the URI's string form ends with ".rss" or ".xml"
     *       (case-insensitively);</li>
     *   <li>the content prefix (requested with size 400) starts with an XML
     *       declaration, optionally preceded by a byte-order mark, and
     *       contains neither an HTML doctype nor an {@code <html} tag.</li>
     * </ol>
     *
     * @param curi the URI being processed
     * @return {@code true} if the content should be treated as XML
     */
    @Override
    protected boolean shouldExtract(CrawlURI curi) {
        String mimeType = curi.getContentType();
        if (mimeType != null
                && mimeType.toLowerCase().indexOf("xml") >= 0
                && !OPENXML_MIME_TYPE.matcher(mimeType).matches()) {
            return true;
        }

        String uri = curi.toString().toLowerCase();
        if (uri.endsWith(".rss") || uri.endsWith(".xml")) {
            return true;
        }

        // Fall back to sniffing the beginning of the content itself.
        String contentStartingChunk = curi.getRecorder().getContentReplayPrefixString(400);
        return XML_DECLARATION_START.matcher(contentStartingChunk).matches()
                && !HTML_MARKER.matcher(contentStartingChunk).matches();
    }

    /**
     * Runs link extraction over the recorded content of {@code curi}.
     *
     * <p>If the content type carries no charset declaration, the charset
     * named in the XML declaration (if any) is considered. It is adopted on
     * the recorder only when re-reading the prefix with that charset yields
     * the same declaration; the outcome is recorded as a
     * {@code usingCharsetInXML:} or {@code inconsistentCharsetInXML:}
     * annotation.
     *
     * @param curi the URI being processed
     * @return {@code true} once the content has been processed;
     *         {@code false} if an {@link IOException} occurred
     */
    @Override
    protected boolean innerExtract(CrawlURI curi) {
        try {
            if (!curi.containsContentTypeCharsetDeclaration()) {
                String contentPrefix = curi.getRecorder().getContentReplayPrefixString(50);
                Charset contentDeclaredEncoding = this.getContentDeclaredCharset(curi, contentPrefix);
                // Null check first, and equals() invoked on the known non-null
                // side, so the comparison itself cannot dereference null.
                if (contentDeclaredEncoding != null
                        && !contentDeclaredEncoding.equals(curi.getRecorder().getCharset())) {
                    // Re-read the prefix using the declared charset and make
                    // sure the declaration still says the same thing.
                    String newContentPrefix = curi.getRecorder().getContentReplayPrefixString(50, contentDeclaredEncoding);
                    Charset reflexiveCharset = this.getContentDeclaredCharset(curi, newContentPrefix);
                    if (contentDeclaredEncoding.equals(reflexiveCharset)) {
                        curi.getAnnotations().add("usingCharsetInXML:" + contentDeclaredEncoding);
                        curi.getRecorder().setCharset(contentDeclaredEncoding);
                    } else {
                        curi.getAnnotations().add("inconsistentCharsetInXML:" + contentDeclaredEncoding);
                    }
                }
            }
            // NOTE(review): cs is not closed here - presumably the recorder
            // owns its lifecycle; confirm before adding a close().
            ReplayCharSequence cs = curi.getRecorder().getContentReplayCharSequence();
            this.numberOfLinksExtracted.addAndGet(ExtractorXML.processXml(this, curi, cs));
            return true;
        } catch (IOException e) {
            // Pass the exception itself so the stack trace is not lost.
            logger.log(Level.SEVERE, "Failed getting ReplayCharSequence: " + e.getMessage(), e);
            return false;
        }
    }

    /**
     * Looks for an {@code encoding="..."} attribute inside an XML declaration
     * in {@code contentPrefix} and resolves it to a {@link Charset}.
     *
     * <p>When the declared name is rejected by {@link Charset#forName(String)},
     * an {@code unsatisfiableCharsetInXML:} annotation is added to
     * {@code curi} and {@code null} is returned.
     *
     * @param curi the URI being processed; receives the annotation on failure
     * @param contentPrefix the leading portion of the content to inspect
     * @return the declared charset, or {@code null} if none is declared or
     *         the declared name cannot be resolved
     */
    protected Charset getContentDeclaredCharset(CrawlURI curi, String contentPrefix) {
        Matcher m = TextUtils.getMatcher("(?s)<\\?xml\\s+[^>]*encoding=['\"]([^'\"]+)['\"]", contentPrefix);
        String charsetName = null;
        try {
            if (m.find()) {
                charsetName = m.group(1);
                return Charset.forName(charsetName);
            }
        } catch (IllegalArgumentException iae) {
            logger.log(Level.INFO, "Unknown content-encoding '" + charsetName + "' declared; using default");
            curi.getAnnotations().add("unsatisfiableCharsetInXML:" + charsetName);
        } finally {
            // Always hand the matcher back, including on the early return.
            TextUtils.recycleMatcher(m);
        }
        return null;
    }

    /**
     * Scans {@code cs} with {@link #XML_URI_EXTRACTOR} and adds every
     * XML-unescaped candidate accepted by
     * {@code UriUtils.isVeryLikelyUri} as a speculative outlink of
     * {@code curi}.
     *
     * <p>A candidate is counted as soon as it passes the likelihood check,
     * so the returned count includes candidates whose addition failed with a
     * {@link URIException}; such failures are reported through
     * {@code ext.logUriError}.
     *
     * @param ext the extractor supplying parameters and error logging
     * @param curi the URI whose content is being scanned
     * @param cs the content to scan
     * @return the number of candidates that passed the likelihood check
     */
    public static long processXml(Extractor ext, CrawlURI curi, CharSequence cs) {
        long foundLinks = 0L;
        Matcher matcher = XML_URI_EXTRACTOR.matcher(cs);
        while (matcher.find()) {
            String xmlUri = StringEscapeUtils.unescapeXml(matcher.group(1));
            if (!UriUtils.isVeryLikelyUri(xmlUri)) {
                continue;
            }
            ++foundLinks;
            try {
                int max = ext.getExtractorParameters().getMaxOutlinks();
                ExtractorXML.addRelativeToBase(curi, max, xmlUri, LinkContext.SPECULATIVE_MISC, Hop.SPECULATIVE);
            } catch (URIException e) {
                ext.logUriError(e, curi.getUURI(), xmlUri);
            }
        }
        return foundLinks;
    }
}

