/*
 * Decompiled with CFR 0.152.
 */
package hivemall.nlp.tokenizer;

import hivemall.utils.hadoop.HiveUtils;
import hivemall.utils.io.IOUtils;
import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.UDFType;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.io.Text;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;

@Description(name="tokenize_cn", value="_FUNC_(String line [, const list<string> stopWords]) - returns tokenized strings in array<string>")
@UDFType(deterministic=true, stateful=false)
public final class SmartcnUDF
extends GenericUDF {
    private String[] _stopWordsArray;
    private transient SmartChineseAnalyzer _analyzer;

    public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
        int arglen = arguments.length;
        if (arglen < 1 || arglen > 2) {
            throw new UDFArgumentException("Invalid number of arguments for `tokenize_cn`: " + arglen);
        }
        this._stopWordsArray = arglen >= 2 ? HiveUtils.getConstStringArray(arguments[1]) : null;
        this._analyzer = null;
        return ObjectInspectorFactory.getStandardListObjectInspector((ObjectInspector)PrimitiveObjectInspectorFactory.writableStringObjectInspector);
    }

    public List<Text> evaluate(GenericUDF.DeferredObject[] arguments) throws HiveException {
        Object arg0;
        SmartChineseAnalyzer analyzer = this._analyzer;
        if (analyzer == null) {
            CharArraySet stopwords = SmartcnUDF.stopWords(this._stopWordsArray);
            this._analyzer = analyzer = new SmartChineseAnalyzer(stopwords);
        }
        if ((arg0 = arguments[0].get()) == null) {
            return null;
        }
        String line = arg0.toString();
        ArrayList<Text> results = new ArrayList<Text>(32);
        TokenStream stream = null;
        try {
            stream = analyzer.tokenStream("", line);
            if (stream != null) {
                SmartcnUDF.analyzeTokens(stream, results);
            }
        }
        catch (IOException e) {
            try {
                IOUtils.closeQuietly((Closeable)analyzer);
                throw new HiveException((Throwable)e);
            }
            catch (Throwable throwable) {
                IOUtils.closeQuietly(stream);
                throw throwable;
            }
        }
        IOUtils.closeQuietly((Closeable)stream);
        return results;
    }

    public void close() throws IOException {
        IOUtils.closeQuietly((Closeable)this._analyzer);
    }

    @Nonnull
    private static CharArraySet stopWords(@Nullable String[] array) throws UDFArgumentException {
        if (array == null) {
            return SmartChineseAnalyzer.getDefaultStopSet();
        }
        if (array.length == 0) {
            return CharArraySet.EMPTY_SET;
        }
        CharArraySet results = new CharArraySet(Arrays.asList(array), true);
        return results;
    }

    private static void analyzeTokens(@Nonnull TokenStream stream, @Nonnull List<Text> results) throws IOException {
        CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            String term = termAttr.toString();
            results.add(new Text(term));
        }
    }

    public String getDisplayString(String[] children) {
        return "tokenize_cn(" + Arrays.toString(children) + ')';
    }
}

