/*
 * Decompiled with CFR 0.152.
 */
package smile.nlp.tokenizer;

import java.util.regex.Pattern;
import smile.nlp.tokenizer.EnglishAbbreviations;
import smile.nlp.tokenizer.Tokenizer;

/**
 * Rule-based English word tokenizer (named for the Penn Treebank tokenization
 * conventions). Tokenization is done by a fixed sequence of regular-expression
 * rewrites that insert spaces around contractions and punctuation, followed by
 * a split on whitespace.
 *
 * <p>The class has no instance fields and exposes a single shared instance via
 * {@link #getInstance()}.
 */
public class PennTreebankTokenizer
implements Tokenizer {
    /**
     * Two-part contractions. Each pattern captures the two pieces that are
     * rewritten as {@code "$1 $2"}, e.g. {@code "can't" -> "ca n't"},
     * {@code "cannot" -> "can not"}, {@code "gonna" -> "gon na"}.
     */
    private static final Pattern[] CONTRACTIONS2 = new Pattern[]{
        Pattern.compile("(?i)(.)('ll|'re|'ve|n't|'s|'m|'d)\\b"),
        Pattern.compile("(?i)\\b(can)(not)\\b"),
        Pattern.compile("(?i)\\b(D)('ye)\\b"),
        Pattern.compile("(?i)\\b(Gim)(me)\\b"),
        Pattern.compile("(?i)\\b(Gon)(na)\\b"),
        Pattern.compile("(?i)\\b(Got)(ta)\\b"),
        Pattern.compile("(?i)\\b(Lem)(me)\\b"),
        Pattern.compile("(?i)\\b(Mor)('n)\\b"),
        Pattern.compile("(?i)\\b(T)(is)\\b"),
        Pattern.compile("(?i)\\b(T)(was)\\b"),
        Pattern.compile("(?i)\\b(Wan)(na)\\b")
    };

    /**
     * Three-part contractions, rewritten as {@code "$1 $2 $3"}.
     *
     * <p>NOTE(review): the first pattern as written only matches the spelling
     * "whadddya" (three d's), not "whaddya" -- confirm the intended split
     * before changing it.
     */
    private static final Pattern[] CONTRACTIONS3 = new Pattern[]{
        Pattern.compile("(?i)\\b(Whad)(dd)(ya)\\b"),
        Pattern.compile("(?i)\\b(Wha)(t)(cha)\\b")
    };

    /**
     * Any single character that is not a word character, period, apostrophe,
     * hyphen, slash, comma or ampersand. Such characters are surrounded by
     * spaces so they become their own tokens.
     */
    private static final Pattern PUNCTUATION = Pattern.compile("(?U)([^\\w\\.\\'\\-\\/,&])");

    /** A comma followed by whitespace (so the comma in "2,500" is left alone). */
    private static final Pattern COMMA_BEFORE_SPACE = Pattern.compile("(?U)(,\\s)");

    /** An apostrophe followed by whitespace. */
    private static final Pattern APOSTROPHE_BEFORE_SPACE = Pattern.compile("(?U)('\\s)");

    /** A period, optionally followed by spaces, right before a newline or the end of input. */
    private static final Pattern LINE_FINAL_PERIOD = Pattern.compile("(?U)\\. *(\\n|$)");

    /** One or more whitespace characters; the final token separator. */
    private static final Pattern WHITESPACE = Pattern.compile("(?U)\\s+");

    /** The shared instance returned by {@link #getInstance()}. */
    private static final PennTreebankTokenizer INSTANCE = new PennTreebankTokenizer();

    /** Not instantiable from outside; use {@link #getInstance()}. */
    private PennTreebankTokenizer() {
    }

    /**
     * Returns the shared tokenizer instance.
     *
     * @return the single instance of this class
     */
    public static PennTreebankTokenizer getInstance() {
        return INSTANCE;
    }

    /**
     * Splits the text into word and punctuation tokens.
     *
     * <p>Steps, in order: split contractions, put spaces around punctuation,
     * detach commas and apostrophes that precede whitespace, detach a period
     * that ends a line or the text, then split on whitespace. If the last two
     * tokens are a known abbreviation followed by ".", a period is appended to
     * the abbreviation token (the standalone "." token is kept as well).
     *
     * @param text the text to tokenize; must not be {@code null}
     * @return the tokens in order of appearance; never contains empty strings,
     *         and is an empty array when the text has no non-whitespace content
     */
    @Override
    public String[] split(String text) {
        for (Pattern contraction : CONTRACTIONS2) {
            text = contraction.matcher(text).replaceAll("$1 $2");
        }
        for (Pattern contraction : CONTRACTIONS3) {
            text = contraction.matcher(text).replaceAll("$1 $2 $3");
        }

        text = PUNCTUATION.matcher(text).replaceAll(" $1 ");
        text = COMMA_BEFORE_SPACE.matcher(text).replaceAll(" $1");
        text = APOSTROPHE_BEFORE_SPACE.matcher(text).replaceAll(" $1");
        text = LINE_FINAL_PERIOD.matcher(text).replaceAll(" . ");

        // The rewrites above can leave whitespace at the very start of the
        // text (e.g. when it begins with a quote or parenthesis), in which case
        // Pattern.split yields a leading "" element; empty input yields [""].
        // Neither is a real token, so drop them.
        String[] words = withoutEmptyTokens(WHITESPACE.split(text));

        int last = words.length - 1;
        if (last >= 1 && words[last].equals(".") && EnglishAbbreviations.contains(words[last - 1])) {
            // The detached final period also belonged to the abbreviation.
            words[last - 1] = words[last - 1] + ".";
        }
        return words;
    }

    /**
     * Returns the tokens with all empty strings removed, preserving order.
     * The input array is returned unchanged when it contains no empty string.
     */
    private static String[] withoutEmptyTokens(String[] tokens) {
        int kept = 0;
        for (String token : tokens) {
            if (!token.isEmpty()) {
                kept++;
            }
        }
        if (kept == tokens.length) {
            return tokens;
        }
        String[] result = new String[kept];
        int next = 0;
        for (String token : tokens) {
            if (!token.isEmpty()) {
                result[next++] = token;
            }
        }
        return result;
    }
}

