/*
 * Decompiled with CFR 0.152.
 */
package org.codelibs.fess.crawler.extractor.impl;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.text.translate.AggregateTranslator;
import org.apache.commons.lang3.text.translate.CharSequenceTranslator;
import org.apache.commons.lang3.text.translate.EntityArrays;
import org.apache.commons.lang3.text.translate.LookupTranslator;
import org.apache.commons.lang3.text.translate.NumericEntityUnescaper;
import org.codelibs.core.io.InputStreamUtil;
import org.codelibs.core.lang.StringUtil;
import org.codelibs.fess.crawler.entity.ExtractData;
import org.codelibs.fess.crawler.exception.CrawlerSystemException;
import org.codelibs.fess.crawler.exception.ExtractException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Base class for extractors that pull plain text out of XML-like markup.
 *
 * <p>The input stream is decoded using an encoding sniffed from the first
 * {@link #preloadSizeForCharset} bytes (falling back to {@link #encoding}),
 * HTML/XML entities are unescaped, and every match of {@link #getTagPattern()}
 * is replaced by the values of the double-quoted attributes found inside it.
 * Subclasses supply the two patterns.</p>
 */
public abstract class AbstractXmlExtractor {
    protected static final Logger logger = LoggerFactory.getLogger(AbstractXmlExtractor.class);

    /** Unescapes basic XML, ISO-8859-1, HTML 4.0 extended and numeric entities. */
    protected static final CharSequenceTranslator UNESCAPE_HTML4 = new AggregateTranslator(
            new LookupTranslator(EntityArrays.BASIC_UNESCAPE()),
            new LookupTranslator(EntityArrays.ISO8859_1_UNESCAPE()),
            new LookupTranslator(EntityArrays.HTML40_EXTENDED_UNESCAPE()),
            new NumericEntityUnescaper());

    /** Matches a single CR or LF character. */
    private static final Pattern LINE_BREAK_PATTERN = Pattern.compile("[\\r\\n]");

    /**
     * Matches a whole comment, including ones whose body contains {@code >}
     * (e.g. commented-out markup) and the empty comment.
     */
    private static final Pattern COMMENT_PATTERN = Pattern.compile("<!--.*?-->", Pattern.DOTALL);

    /** Matches {@code name="value"} preceded by whitespace; group 1 is the value. */
    private static final Pattern ATTRIBUTE_PATTERN = Pattern.compile("\\s[^ ]+=\"([^\"]*)\"");

    /** Matches a run of whitespace. */
    private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");

    /** Encoding used when none can be detected from the content. */
    protected String encoding = "UTF-8";

    /** Number of leading bytes inspected when detecting the encoding. */
    protected int preloadSizeForCharset = 2048;

    /**
     * When {@code true}, comments are removed together with their content;
     * when {@code false}, only the comment delimiters are removed and the
     * commented text is kept.
     */
    protected boolean ignoreCommentTag = false;

    /**
     * Returns the pattern used to find the encoding declaration in the head
     * of the document. Group 1 must capture the charset name.
     *
     * @return the encoding pattern
     */
    protected abstract Pattern getEncodingPattern();

    /**
     * Returns the pattern that matches one markup tag. Each match is replaced
     * by the attribute values it contains.
     *
     * @return the tag pattern
     */
    protected abstract Pattern getTagPattern();

    /**
     * Extracts the text content of the given stream.
     *
     * <p>The stream is read to the end but is not closed by this method.
     * Entities are unescaped before tags are stripped.</p>
     *
     * @param in the content to read; must not be {@code null}
     * @param params not used by this implementation
     * @return the extracted text wrapped in an {@link ExtractData}
     * @throws CrawlerSystemException if {@code in} is {@code null}
     * @throws ExtractException if reading, decoding or extraction fails
     */
    public ExtractData getText(InputStream in, Map<String, String> params) {
        if (in == null) {
            throw new CrawlerSystemException("The inputstream is null.");
        }
        try {
            final BufferedInputStream bis = new BufferedInputStream(in);
            final String enc = getEncoding(bis);
            final String raw = new String(InputStreamUtil.getBytes(bis), enc);
            final String content = UNESCAPE_HTML4.translate(raw);
            return new ExtractData(extractString(content));
        } catch (Exception e) {
            throw new ExtractException(e);
        }
    }

    /**
     * Detects the encoding declared in the first {@link #preloadSizeForCharset}
     * bytes of the stream. The stream is marked before reading and reset
     * afterwards, so the caller can read the content from the beginning.
     *
     * @param bis the stream to inspect
     * @return the declared encoding if one is found and supported by this JVM,
     *         otherwise the default {@link #encoding}
     * @throws ExtractException if the stream cannot be reset
     */
    protected String getEncoding(BufferedInputStream bis) {
        final byte[] head = new byte[this.preloadSizeForCharset];
        try {
            bis.mark(this.preloadSizeForCharset);
            // A single read() may return fewer bytes than requested even when
            // more data follows, so keep reading until the buffer is full or
            // the stream ends; otherwise the declaration could be missed.
            final int length = readHead(bis, head);
            if (length > 0) {
                final String headText = new String(head, 0, length, this.encoding);
                if (!StringUtil.isBlank(headText)) {
                    final Matcher matcher = getEncodingPattern().matcher(headText);
                    if (matcher.find()) {
                        final String declared = matcher.group(1);
                        if (Charset.isSupported(declared)) {
                            return declared;
                        }
                    }
                }
            }
        } catch (Exception e) {
            // Best effort: any failure while sniffing falls back to the default.
            logger.info("Use a default encoding: {}", this.encoding, e);
        } finally {
            try {
                bis.reset();
            } catch (IOException e) {
                throw new ExtractException(e);
            }
        }
        return this.encoding;
    }

    /**
     * Fills {@code buffer} from {@code in} until it is full or the stream ends.
     *
     * @return the number of bytes read, {@code 0} if the stream is already at
     *         its end or the buffer is empty
     */
    private static int readHead(final InputStream in, final byte[] buffer) throws IOException {
        int total = 0;
        while (total < buffer.length) {
            final int count = in.read(buffer, total, buffer.length - total);
            if (count == -1) {
                break;
            }
            total += count;
        }
        return total;
    }

    /**
     * Strips markup from {@code content}.
     *
     * <p>Line breaks become spaces, comments are handled according to
     * {@link #ignoreCommentTag}, each match of {@link #getTagPattern()} is
     * replaced by the space-separated values of its double-quoted attributes,
     * and finally whitespace runs are collapsed and the result is trimmed.</p>
     *
     * @param content the unescaped document text
     * @return the extracted plain text
     */
    protected String extractString(String content) {
        String input = LINE_BREAK_PATTERN.matcher(content).replaceAll(" ");
        if (this.ignoreCommentTag) {
            input = COMMENT_PATTERN.matcher(input).replaceAll("");
        } else {
            input = input.replace("<!--", "").replace("-->", "");
        }

        final Matcher tagMatcher = getTagPattern().matcher(input);
        final StringBuffer result = new StringBuffer();
        while (tagMatcher.find()) {
            final Matcher attrMatcher = ATTRIBUTE_PATTERN.matcher(tagMatcher.group());
            final StringBuilder values = new StringBuilder(100);
            while (attrMatcher.find()) {
                values.append(attrMatcher.group(1)).append(' ');
            }
            // Attribute values are literal text, so neutralise '\' and '$'
            // before they are used as a replacement string.
            tagMatcher.appendReplacement(result, Matcher.quoteReplacement(values.toString()));
        }
        tagMatcher.appendTail(result);
        return WHITESPACE_PATTERN.matcher(result).replaceAll(" ").trim();
    }

    /**
     * @return the default encoding used when none is detected
     */
    public String getEncoding() {
        return this.encoding;
    }

    /**
     * @param encoding the default encoding to use when none is detected
     */
    public void setEncoding(String encoding) {
        this.encoding = encoding;
    }

    /**
     * @return the number of leading bytes inspected for an encoding declaration
     */
    public int getPreloadSizeForCharset() {
        return this.preloadSizeForCharset;
    }

    /**
     * @param preloadSizeForCharset the number of leading bytes to inspect for
     *        an encoding declaration
     */
    public void setPreloadSizeForCharset(int preloadSizeForCharset) {
        this.preloadSizeForCharset = preloadSizeForCharset;
    }

    /**
     * @return {@code true} if comments are removed together with their content
     */
    public boolean isIgnoreCommentTag() {
        return this.ignoreCommentTag;
    }

    /**
     * @param ignoreCommentTag {@code true} to drop comments entirely,
     *        {@code false} to keep the commented text and remove only the
     *        comment delimiters
     */
    public void setIgnoreCommentTag(boolean ignoreCommentTag) {
        this.ignoreCommentTag = ignoreCommentTag;
    }
}

