/*
 * Decompiled with CFR 0.152.
 */
package org.apache.tika.langdetect.opennlp;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import opennlp.tools.langdetect.Language;
import opennlp.tools.langdetect.LanguageDetectorModel;
import opennlp.tools.util.normalizer.CharSequenceNormalizer;
import opennlp.tools.util.normalizer.EmojiCharSequenceNormalizer;
import opennlp.tools.util.normalizer.NumberCharSequenceNormalizer;
import opennlp.tools.util.normalizer.ShrinkCharSequenceNormalizer;
import opennlp.tools.util.normalizer.TwitterCharSequenceNormalizer;
import org.apache.tika.langdetect.opennlp.ProbingLanguageDetector;
import org.apache.tika.language.detect.LanguageConfidence;
import org.apache.tika.language.detect.LanguageDetector;
import org.apache.tika.language.detect.LanguageResult;

/**
 * {@link LanguageDetector} implementation that delegates to a
 * {@link ProbingLanguageDetector} built on a shared OpenNLP
 * {@link LanguageDetectorModel}.
 *
 * <p>The built-in model is loaded from the classpath once, when this class is
 * initialized. Text supplied through {@link #addText(char[], int, int)} is
 * accumulated in an internal buffer and evaluated by {@link #detectAll()}.
 * Instances hold a mutable buffer with no synchronization.
 */
public class OpenNLPDetector extends LanguageDetector {
    /** Classpath location of the bundled language detection model. */
    private static final String MODEL_RESOURCE = "/opennlp-langdetect-20210413.bin";

    /** Model shared by all instances; assigned by {@link #loadBuiltInModels()}. */
    static LanguageDetectorModel LANG_MODEL;

    static {
        try {
            loadBuiltInModels();
        } catch (IOException e) {
            // Keep the original exception as the cause so the real failure is diagnosable.
            throw new RuntimeException("Can't find built-in language models", e);
        }
    }

    private final ProbingLanguageDetector detector =
            new ProbingLanguageDetector(LANG_MODEL, getNormalizers());
    private final StringBuilder buffer = new StringBuilder();

    /**
     * Loads the bundled model from the classpath into {@link #LANG_MODEL}.
     *
     * @throws IOException if the model resource is missing or cannot be read
     */
    static void loadBuiltInModels() throws IOException {
        try (InputStream is = OpenNLPDetector.class.getResourceAsStream(MODEL_RESOURCE)) {
            if (is == null) {
                // getResourceAsStream signals a missing resource with null rather than an exception.
                throw new IOException(
                        "Built-in language model not found on classpath: " + MODEL_RESOURCE);
            }
            LANG_MODEL = new LanguageDetectorModel(is);
        }
    }

    /**
     * Returns the normalizers handed to the underlying detector, in the order
     * they are listed here.
     */
    private static CharSequenceNormalizer[] getNormalizers() {
        return new CharSequenceNormalizer[] {
            TikaUrlCharSequenceNormalizer.getInstance(),
            AlphaIdeographSequenceNormalizer.getInstance(),
            EmojiCharSequenceNormalizer.getInstance(),
            TwitterCharSequenceNormalizer.getInstance(),
            NumberCharSequenceNormalizer.getInstance(),
            ShrinkCharSequenceNormalizer.getInstance()
        };
    }

    /**
     * Maps a raw confidence score to a {@link LanguageConfidence} bucket.
     *
     * @param confidence raw score reported for a language
     * @return {@code HIGH} above 0.9, {@code MEDIUM} above 0.85, {@code LOW}
     *         above 0.2, otherwise {@code NONE}
     */
    private static LanguageConfidence getConfidence(double confidence) {
        if (confidence > 0.9) {
            return LanguageConfidence.HIGH;
        }
        if (confidence > 0.85) {
            return LanguageConfidence.MEDIUM;
        }
        if (confidence > 0.2) {
            return LanguageConfidence.LOW;
        }
        return LanguageConfidence.NONE;
    }

    /**
     * Returns a new detector instance; the model itself is loaded during class
     * initialization.
     *
     * @return a fresh {@code OpenNLPDetector}
     * @throws IOException declared by the signature; not thrown by this implementation
     */
    public LanguageDetector loadModels() throws IOException {
        return new OpenNLPDetector();
    }

    /**
     * Not supported by this detector.
     *
     * @param languages ignored
     * @throws UnsupportedOperationException always
     */
    public LanguageDetector loadModels(Set<String> languages) throws IOException {
        throw new UnsupportedOperationException("This lang detector doesn't allow subsetting models");
    }

    /**
     * Reports whether the given language is among the detector's supported languages.
     *
     * @param language language identifier to look up; must not be {@code null}
     * @return {@code true} if an equal identifier is reported by the detector
     */
    public boolean hasModel(String language) {
        for (String lang : this.detector.getSupportedLanguages()) {
            if (language.equals(lang)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Not supported by this detector.
     *
     * @param languageProbabilities ignored
     * @throws UnsupportedOperationException always
     */
    public LanguageDetector setPriors(Map<String, Float> languageProbabilities) throws IOException {
        throw new UnsupportedOperationException();
    }

    /** Discards all text accumulated so far. */
    public void reset() {
        this.buffer.setLength(0);
    }

    /**
     * Appends text to the internal buffer, never growing it beyond the
     * detector's maximum length.
     *
     * @param cbuf source characters
     * @param off  index of the first character to copy
     * @param len  number of characters offered
     */
    public void addText(char[] cbuf, int off, int len) {
        int remaining = this.detector.getMaxLength() - this.buffer.length();
        int newLen = Math.min(len, remaining);
        // Guard on the computed length: it is non-positive both when nothing was
        // offered and when the buffer is already at (or beyond) the maximum, and a
        // negative length would make StringBuilder.append throw.
        if (newLen <= 0) {
            return;
        }
        this.buffer.append(cbuf, off, newLen);
    }

    /**
     * Runs detection over the buffered text.
     *
     * @return one {@link LanguageResult} per language returned by the
     *         underlying detector, in the order the detector returned them
     */
    public List<LanguageResult> detectAll() {
        Language[] langs = this.detector.predictLanguages(this.buffer.toString());
        List<LanguageResult> results = new ArrayList<>(langs.length);
        for (Language lang : langs) {
            double confidence = lang.getConfidence();
            results.add(new LanguageResult(lang.getLang(), getConfidence(confidence), (float) confidence));
        }
        return results;
    }

    /**
     * Forwards the maximum length to the underlying detector; the same value
     * caps the buffer in {@link #addText(char[], int, int)}.
     *
     * @param maxLength new maximum length
     */
    public void setMaxLength(int maxLength) {
        this.detector.setMaxLength(maxLength);
    }

    /**
     * @return the languages reported as supported by the underlying detector
     */
    public String[] getSupportedLanguages() {
        return this.detector.getSupportedLanguages();
    }

    /**
     * Normalizer that replaces every run of characters that are neither
     * alphabetic nor ideographic with a single space.
     */
    private static class AlphaIdeographSequenceNormalizer
            implements CharSequenceNormalizer {
        private static final Pattern REGEX = Pattern.compile("[^\\p{IsAlphabetic}\\p{IsIdeographic}]+");
        private static final AlphaIdeographSequenceNormalizer INSTANCE = new AlphaIdeographSequenceNormalizer();

        private AlphaIdeographSequenceNormalizer() {
        }

        /** @return the shared instance */
        public static AlphaIdeographSequenceNormalizer getInstance() {
            return INSTANCE;
        }

        /**
         * @param charSequence text to normalize
         * @return the text with each non-alphabetic, non-ideographic run replaced by a space
         */
        public CharSequence normalize(CharSequence charSequence) {
            return REGEX.matcher(charSequence).replaceAll(" ");
        }
    }

    /**
     * Normalizer that replaces http/https URLs and e-mail-like tokens with a
     * space. Both patterns use bounded quantifiers.
     */
    private static class TikaUrlCharSequenceNormalizer
            implements CharSequenceNormalizer {
        private static final Pattern URL_REGEX = Pattern.compile("https?://[-_.?&~;+=/#0-9A-Za-z]{10,10000}");
        private static final Pattern MAIL_REGEX = Pattern.compile("[-_.0-9A-Za-z]{1,100}@[-_0-9A-Za-z]{1,100}[-_.0-9A-Za-z]{1,100}");
        private static final TikaUrlCharSequenceNormalizer INSTANCE = new TikaUrlCharSequenceNormalizer();

        private TikaUrlCharSequenceNormalizer() {
        }

        /** @return the shared instance */
        public static TikaUrlCharSequenceNormalizer getInstance() {
            return INSTANCE;
        }

        /**
         * @param charSequence text to normalize
         * @return the text with URL matches, then e-mail matches, each replaced by a space
         */
        public CharSequence normalize(CharSequence charSequence) {
            String modified = URL_REGEX.matcher(charSequence).replaceAll(" ");
            return MAIL_REGEX.matcher(modified).replaceAll(" ");
        }
    }
}

