/*
 * Decompiled with CFR 0.152.
 */
package com.worksap.nlp.sudachi.sentdetect;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Finds the end of the first sentence in a piece of text using regular
 * expressions for sentence-ending punctuation, bracket nesting and a few
 * continuation rules.
 *
 * <p>All patterns are assembled from the small fragment constants declared at
 * the top of the class, so each fragment is defined in exactly one place.
 */
public class SentenceDetector {
    /** Alternation of sentence-ending marks: ideographic full stop, full-width and ASCII '?' and '!', eighth note, ellipsis. */
    private static final String PERIODS = "\u3002|\uff1f|\uff01|\u266a|\u2026|\\?|\\!";
    /** ASCII or full-width full stop. */
    private static final String DOT = "(\\.|\uff0e)";
    /** Katakana middle dot; three or more in a row act as a sentence break. */
    private static final String CDOT = "\u30fb";
    /** ASCII, full-width or ideographic comma. */
    private static final String COMMA = "(,|\uff0c|\u3001)";
    /** Two or more consecutive br tags (all lower case or all upper case). */
    private static final String BR_TAG = "(<br>|<BR>){2,}";
    /** ASCII and full-width letters and digits, plus kanji numerals. */
    private static final String ALPHABET_OR_NUMBER = "[a-z]|[A-Z]|[0-9]|[\uff41-\uff5a]|[\uff21-\uff3a]|[\uff10-\uff19]|\u3007|\u4e00|\u4e8c|\u4e09|\u56db|\u4e94|\u516d|\u4e03|\u516b|\u4e5d|\u5341|\u767e|\u5343|\u4e07|\u5104|\u5146";

    /**
     * Candidate sentence breaks: a period mark, a run of three or more middle
     * dots, or a dot that is neither preceded by a letter/number nor followed
     * by a letter/number/comma; each optionally followed by further dots and
     * period marks. A run of two or more br tags is also a candidate.
     */
    private static final Pattern SENTENCE_BREAKER_PATTERN = Pattern.compile(
            "(" + PERIODS + "|" + CDOT + "{3,}+|((?<!(" + ALPHABET_OR_NUMBER + "))" + DOT + "(?!("
                    + ALPHABET_OR_NUMBER + "|" + COMMA + "))))(" + DOT + "|" + PERIODS + ")*|" + BR_TAG);

    private static final String OPEN_PARENTHESIS = "\\(|\\{|\uff5b|\\[|\uff08|\u300c|\u3010|\u300e|\uff3b|\u226a|\u3014|\u201c";
    private static final String CLOSE_PARENTHESIS = "\\)|\\}|\\]|\uff09|\u300d|\uff5d|\u3011|\u300f|\uff3d|\u3015|\u226b|\u201d";

    /** A single letter/number followed by a dot, e.g. the "1." of a list item. */
    private static final String ITEMIZE_HEADER = "(" + ALPHABET_OR_NUMBER + ")" + "(" + DOT + ")";
    private static final Pattern ITEMIZE_HEADER_PATTERN = Pattern.compile(ITEMIZE_HEADER);

    /** Limit used when the constructor is given a non-positive value. */
    public static final int DEFAULT_LIMIT = 4096;

    /** Maximum number of leading chars of the input that are examined. */
    private final int limit;

    /** Group 1 captures an opening bracket, group 2 a closing bracket. */
    private static final Pattern PARENTHESIS_PATTERN = Pattern
            .compile("(" + OPEN_PARENTHESIS + ")|(" + CLOSE_PARENTHESIS + ")");

    /** Leading run of characters that must not start a sentence (closing brackets, commas, period marks). */
    private static final Pattern PROHIBITED_BOS_PATTERN = Pattern
            .compile("\\A(" + CLOSE_PARENTHESIS + "|" + COMMA + "|" + PERIODS + ")+");

    /** An exclamation/question mark or closing bracket directly followed by a quoting continuation. */
    private static final Pattern QUOTE_MARKER_PATTERN = Pattern
            .compile("(\uff01|\uff1f|\\!|\\?|" + CLOSE_PARENTHESIS + ")(\u3068|\u3063|\u3067\u3059)");

    /** An itemize header located at the very end of the examined text. */
    private static final Pattern EOS_ITEMIZE_HEADER_PATTERN = Pattern.compile(ITEMIZE_HEADER + "\\z");

    /** Fallback used when no sentence break is accepted: text up to and including a whitespace run. */
    private static final Pattern TRAILING_SPACES_PATTERN = Pattern.compile(".+\\s+");

    /** Creates a detector that uses {@link #DEFAULT_LIMIT}. */
    public SentenceDetector() {
        this(-1);
    }

    /**
     * Creates a detector with the given limit.
     *
     * @param limit maximum number of chars examined per call; a value that is
     *              zero or negative selects {@link #DEFAULT_LIMIT}
     */
    public SentenceDetector(int limit) {
        this.limit = limit > 0 ? limit : DEFAULT_LIMIT;
    }

    /**
     * Returns the index just past the first sentence of {@code input}.
     *
     * <p>Only the first {@code limit} chars are examined. Each candidate break
     * is skipped when it lies inside an unclosed bracket, when the text up to
     * it is nothing but an itemize header, when the following text continues
     * the phrase, or when {@code checker} vetoes it.
     *
     * @param input   the text to examine
     * @param checker optional veto callback, may be {@code null}; it receives
     *                the candidate end offset
     * @return {@code 0} for empty input; a positive end offset when a break is
     *         accepted; otherwise a negative value whose magnitude is the end
     *         of the first whitespace-terminated match in the examined text,
     *         or, if there is none, the smaller of the input length and the
     *         limit
     */
    public int getEos(CharSequence input, NonBreakCheker checker) {
        if (input.length() == 0) {
            return 0;
        }

        CharSequence s = input.length() > limit ? input.subSequence(0, limit) : input;
        Matcher breaker = SENTENCE_BREAKER_PATTERN.matcher(s);
        while (breaker.find()) {
            int eos = breaker.end();
            if (parenthesisLevel(s.subSequence(0, eos)) != 0) {
                // Still inside an open bracket: not a sentence end.
                continue;
            }
            if (eos < s.length()) {
                // Pull trailing closers/commas/periods into this sentence.
                eos += prohibitedBOS(s.subSequence(eos, s.length()));
            }
            if (ITEMIZE_HEADER_PATTERN.matcher(s.subSequence(0, eos)).matches()) {
                continue;
            }
            if (eos < s.length() && isContinuousPhrase(s, eos)) {
                continue;
            }
            if (checker != null && checker.hasNonBreakWord(eos)) {
                continue;
            }
            return eos;
        }

        Matcher spaces = TRAILING_SPACES_PATTERN.matcher(s);
        if (spaces.find()) {
            return -spaces.end();
        }
        return -Math.min(input.length(), limit);
    }

    /**
     * Computes the bracket nesting depth at the end of {@code s}.
     *
     * <p>A closing bracket with no matching opener is ignored, so the depth
     * never goes below zero.
     *
     * @param s the text to scan
     * @return the number of brackets still open at the end of {@code s}
     */
    int parenthesisLevel(CharSequence s) {
        Matcher matcher = PARENTHESIS_PATTERN.matcher(s);
        int level = 0;
        while (matcher.find()) {
            if (matcher.group(1) != null) {
                level++;
            } else if (level > 0) {
                level--;
            }
        }
        return level;
    }

    /**
     * Measures the leading run of characters in {@code s} that may not begin
     * a sentence.
     *
     * @param s the text that follows a candidate sentence end
     * @return the length of that leading run, or {@code 0} if there is none
     */
    int prohibitedBOS(CharSequence s) {
        Matcher m = PROHIBITED_BOS_PATTERN.matcher(s);
        return m.find() ? m.end() : 0;
    }

    /**
     * Tells whether the text at {@code eos} continues the phrase that ends
     * there, so the candidate break should be skipped.
     *
     * <p>This is the case when the char before {@code eos} and the text
     * starting at {@code eos} form a quote marker, or when the char at
     * {@code eos} is one of three hiragana particles and the text before
     * {@code eos} ends with an itemize header.
     *
     * @param s   the examined text
     * @param eos the candidate sentence end offset
     * @return {@code true} if the phrase continues; {@code false} otherwise,
     *         including when {@code eos} is not strictly inside {@code s}
     */
    boolean isContinuousPhrase(CharSequence s, int eos) {
        if (eos <= 0 || eos >= s.length()) {
            // No preceding char or no following char: nothing can continue.
            return false;
        }

        // Anchor the attempt at eos - 1 instead of scanning the whole remainder.
        Matcher m = QUOTE_MARKER_PATTERN.matcher(s).region(eos - 1, s.length());
        if (m.lookingAt()) {
            return true;
        }

        char c = s.charAt(eos);
        return (c == '\u3068' || c == '\u3084' || c == '\u306e')
                && EOS_ITEMIZE_HEADER_PATTERN.matcher(s.subSequence(0, eos)).find();
    }

    /**
     * Callback that can veto a candidate sentence end.
     *
     * <p>The misspelled name is kept because it is part of the public API.
     */
    public static interface NonBreakCheker {
        /**
         * @param eos the candidate sentence end offset
         * @return {@code true} to reject {@code eos} as a sentence end
         */
        public boolean hasNonBreakWord(int eos);
    }
}

