/*
 * Decompiled with CFR 0.152.
 */
package hivemall.tools.text;

import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.StringTokenizer;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.hive.ql.udf.UDFType;
import org.apache.hadoop.io.Text;

/**
 * Hive UDF that splits a piece of text into word tokens.
 *
 * <p>Tokens are the maximal runs of characters that contain none of the
 * characters listed in {@link #DELIM}. Delimiters themselves are never
 * returned, and consecutive delimiters do not produce empty tokens.
 */
@Description(name="tokenize", value="_FUNC_(string englishText [, boolean toLowerCase]) - Returns tokenized words in array<string>")
@UDFType(deterministic=true, stateful=false)
public final class TokenizeUDF
extends UDF {
    /**
     * Characters treated as token separators: space, common punctuation
     * ({@code . , ? ! : ;}), round/angle/square brackets, the control
     * characters backspace, tab, newline, form feed and carriage return,
     * double and single quotes, and the backslash.
     */
    private static final String DELIM = " .,?!:;()<>[]\b\t\n\f\r\"'\\";

    /**
     * Tokenizes {@code input} without changing the case of the tokens.
     *
     * @param input the text to tokenize; may be {@code null}
     * @return the tokens in order of appearance, or {@code null} when
     *         {@code input} is {@code null}
     */
    public List<Text> evaluate(Text input) {
        return this.evaluate(input, false);
    }

    /**
     * Tokenizes {@code input}, optionally lower-casing every token.
     *
     * @param input the text to tokenize; may be {@code null}
     * @param toLowerCase when {@code true}, each token is converted to lower case
     * @return the tokens in order of appearance (empty when {@code input}
     *         holds nothing but delimiters), or {@code null} when
     *         {@code input} is {@code null}
     */
    public List<Text> evaluate(Text input, boolean toLowerCase) {
        if (input == null) {
            // SQL-style null propagation: a null argument yields a null result.
            return null;
        }
        List<Text> tokens = new ArrayList<Text>();
        StringTokenizer tokenizer = new StringTokenizer(input.toString(), DELIM);
        while (tokenizer.hasMoreTokens()) {
            String word = tokenizer.nextToken();
            if (toLowerCase) {
                // Locale.ROOT keeps the case mapping independent of the JVM's
                // default locale (e.g. under a Turkish locale "I" would
                // otherwise become a dotless i), so the function stays
                // deterministic across machines as its annotation declares.
                word = word.toLowerCase(Locale.ROOT);
            }
            tokens.add(new Text(word));
        }
        return tokens;
    }
}

