/*
 * Decompiled with CFR 0.152.
 */
package hivemall.nlp.tokenizer;

import hivemall.UDFWithOptions;
import hivemall.utils.hadoop.HiveUtils;
import hivemall.utils.io.HttpUtils;
import hivemall.utils.io.IOUtils;
import hivemall.utils.lang.ExceptionUtils;
import hivemall.utils.lang.Preconditions;
import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.net.HttpURLConnection;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;
import java.util.Set;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Options;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.UDFType;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StandardListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.io.Text;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer;
import org.apache.lucene.analysis.ja.dict.UserDictionary;
import org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;

/**
 * Hive UDF {@code tokenize_ja} that tokenizes Japanese text with Lucene's Kuromoji
 * {@link JapaneseAnalyzer}.
 *
 * <p>Arguments (all but the first are optional and must be constants):
 * <ol>
 * <li>the line to tokenize;</li>
 * <li>either a tokenization mode name, or an option string starting with {@code -}
 * (see {@link #getOptions()});</li>
 * <li>stop words ({@code array<string>});</li>
 * <li>stop tags ({@code array<string>});</li>
 * <li>a user dictionary, given either as {@code array<string>} rows or as a URL string.</li>
 * </ol>
 *
 * <p>The analyzer is built lazily on the first {@link #evaluate} call and reused for
 * subsequent rows.
 */
@Description(name="tokenize_ja", value="_FUNC_(String line [, const string mode = \"normal\", const array<string> stopWords, const array<string> stopTags, const array<string> userDict (or string userDictURL)]) - returns tokenized strings in array<string>", extended="select tokenize_ja(\"kuromoji\u3092\u4f7f\u3063\u305f\u5206\u304b\u3061\u66f8\u304d\u306e\u30c6\u30b9\u30c8\u3067\u3059\u3002\u7b2c\u4e8c\u5f15\u6570\u306b\u306fnormal/search/extended\u3092\u6307\u5b9a\u3067\u304d\u307e\u3059\u3002\u30c7\u30d5\u30a9\u30eb\u30c8\u3067\u306fnormal\u30e2\u30fc\u30c9\u3067\u3059\u3002\");\n\n> [\"kuromoji\",\"\u4f7f\u3046\",\"\u5206\u304b\u3061\u66f8\u304d\",\"\u30c6\u30b9\u30c8\",\"\u7b2c\",\"\u4e8c\",\"\u5f15\u6570\",\"normal\",\"search\",\"extended\",\"\u6307\u5b9a\",\"\u30c7\u30d5\u30a9\u30eb\u30c8\",\"normal\",\" \u30e2\u30fc\u30c9\"]\n")
@UDFType(deterministic=true, stateful=false)
public final class KuromojiUDF
extends UDFWithOptions {
    /** Connect timeout, in milliseconds, used when downloading a user dictionary. */
    private static final int CONNECT_TIMEOUT_MS = 10000;
    /** Read timeout, in milliseconds, used when downloading a user dictionary. */
    private static final int READ_TIMEOUT_MS = 60000;
    /** Size limit (32 MiB) handed to {@code HttpUtils.getLimitedInputStream} for a downloaded dictionary. */
    private static final long MAX_INPUT_STREAM_SIZE = 0x2000000L;

    /** Tokenization mode; reset to {@code NORMAL} at the start of {@link #initialize}. */
    private JapaneseTokenizer.Mode _mode;
    /** Whether the {@code -pos} option was given, i.e. a struct of (tokens, pos) is returned. */
    private boolean _returnPos;
    /** Reused two-slot holder for the (tokens, pos) result; only allocated when {@link #_returnPos} is set. */
    private transient Object[] _result;
    /** Stop words given as the 3rd argument, or {@code null} to use the analyzer defaults. */
    @Nullable
    private String[] _stopWordsArray;
    /** Stop tags resolved from the 4th argument, or the analyzer defaults. */
    private Set<String> _stopTags;
    /** User dictionary from the 5th argument: a {@code String[]} of rows, a URL {@code String}, or {@code null}. */
    @Nullable
    private Object _userDictObj;
    /** Lazily created analyzer; {@code null} until the first {@link #evaluate} call and after it is closed. */
    private transient JapaneseAnalyzer _analyzer;

    /**
     * Declares the options accepted when the 2nd argument starts with {@code -}.
     *
     * @return the option definitions ({@code -mode <name>} and {@code -pos})
     */
    @Override
    protected Options getOptions() {
        Options opts = new Options();
        opts.addOption("mode", true, "The tokenization mode. One of ['normal', 'search', 'extended', 'default' (normal)]");
        opts.addOption("pos", false, "Return part-of-speech information");
        return opts;
    }

    /**
     * Parses the option string and updates {@link #_mode} (only when {@code -mode} is present)
     * and {@link #_returnPos}.
     *
     * @param optionValue the raw option string
     * @return the parsed command line
     * @throws UDFArgumentException if the mode name is not recognized
     */
    @Override
    protected CommandLine processOptions(String optionValue) throws UDFArgumentException {
        CommandLine cl = this.parseOptions(optionValue);
        if (cl.hasOption("mode")) {
            String modeStr = cl.getOptionValue("mode");
            this._mode = KuromojiUDF.tokenizationMode(modeStr);
        }
        this._returnPos = cl.hasOption("pos");
        return cl;
    }

    /**
     * Validates the constant arguments and decides the return type.
     *
     * @param arguments object inspectors of the call arguments (1 to 5 are accepted)
     * @return a {@code struct<tokens:array<string>,pos:array<string>>} inspector when {@code -pos}
     *         was given, otherwise an {@code array<string>} inspector
     * @throws UDFArgumentException if an argument is invalid
     */
    @Override
    public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
        final int arglen = arguments.length;
        if (arglen < 1 || arglen > 5) {
            // NOTE(review): showHelp is presumably expected to throw; confirm in UDFWithOptions.
            this.showHelp("Invalid number of arguments for `tokenize_ja`: " + arglen);
        }

        // Start from a clean configuration so that a repeated initialize() call does not
        // inherit settings from a previous one.
        this._mode = JapaneseTokenizer.Mode.NORMAL;
        this._returnPos = false;
        this._stopWordsArray = null;
        this._userDictObj = null;

        if (arglen >= 2) {
            final String arg1 = HiveUtils.getConstString(arguments[1]);
            if (arg1 != null) {
                if (arg1.startsWith("-")) {
                    this.processOptions(arg1);
                } else {
                    this._mode = KuromojiUDF.tokenizationMode(arg1);
                }
            }
        }

        if (arglen >= 3 && !HiveUtils.isVoidOI(arguments[2])) {
            this._stopWordsArray = HiveUtils.getConstStringArray(arguments[2]);
        }

        this._stopTags = arglen >= 4 ? KuromojiUDF.stopTags(arguments[3]) : JapaneseAnalyzer.getDefaultStopTags();

        if (arglen >= 5) {
            if (HiveUtils.isConstListOI(arguments[4])) {
                this._userDictObj = HiveUtils.getConstStringArray(arguments[4]);
            } else if (HiveUtils.isConstString(arguments[4])) {
                this._userDictObj = HiveUtils.getConstString(arguments[4]);
            } else {
                throw new UDFArgumentException("User dictionary MUST be given as an array of constant string or constant string (URL)");
            }
        }

        // The analyzer is (re)built lazily in evaluate(); release any previous instance first.
        if (this._analyzer != null) {
            IOUtils.closeQuietly((Closeable)this._analyzer);
        }
        this._analyzer = null;

        if (this._returnPos) {
            this._result = new Object[2];
            List<String> fieldNames = new ArrayList<String>();
            List<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
            fieldNames.add("tokens");
            fieldOIs.add(ObjectInspectorFactory.getStandardListObjectInspector((ObjectInspector)PrimitiveObjectInspectorFactory.writableStringObjectInspector));
            fieldNames.add("pos");
            fieldOIs.add(ObjectInspectorFactory.getStandardListObjectInspector((ObjectInspector)PrimitiveObjectInspectorFactory.writableStringObjectInspector));
            return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
        }
        return ObjectInspectorFactory.getStandardListObjectInspector((ObjectInspector)PrimitiveObjectInspectorFactory.writableStringObjectInspector);
    }

    /**
     * Tokenizes one input line.
     *
     * @param arguments the deferred call arguments; only the first one is read here
     * @return {@code null} for a {@code null} input; otherwise the token list, or the reused
     *         two-element array of (tokens, pos) lists when {@code -pos} was given
     * @throws HiveException if the analyzer cannot be built or tokenization fails with an I/O error
     */
    @Override
    public Object evaluate(GenericUDF.DeferredObject[] arguments) throws HiveException {
        if (this._analyzer == null) {
            CharArraySet stopWords = KuromojiUDF.stopWords(this._stopWordsArray);
            UserDictionary userDict = null;
            if (this._userDictObj instanceof String[]) {
                userDict = KuromojiUDF.userDictionary((String[])this._userDictObj);
            } else if (this._userDictObj instanceof String) {
                userDict = KuromojiUDF.userDictionary((String)this._userDictObj);
            }
            this._analyzer = new JapaneseAnalyzer(userDict, this._mode, stopWords, this._stopTags);
        }

        final Object arg0 = arguments[0].get();
        if (arg0 == null) {
            return null;
        }
        final String line = arg0.toString();

        try {
            if (this._returnPos) {
                return KuromojiUDF.parseLine(this._analyzer, line, this._result);
            }
            return KuromojiUDF.parseLine(this._analyzer, line);
        } catch (IOException e) {
            // Discard the failed analyzer AND forget it, so that a later call builds a fresh
            // one instead of calling into an already-closed analyzer.
            IOUtils.closeQuietly((Closeable)this._analyzer);
            this._analyzer = null;
            throw new HiveException((Throwable)e);
        }
    }

    /**
     * Tokenizes {@code line} and stores the token list in {@code result[0]} and the
     * part-of-speech list in {@code result[1]}.
     *
     * @param analyzer the analyzer to use
     * @param line the text to tokenize
     * @param result a two-element array that receives the output
     * @return {@code result}
     * @throws IOException if the token stream fails; the stream is closed in any case
     */
    @Nonnull
    private static Object[] parseLine(@Nonnull JapaneseAnalyzer analyzer, @Nonnull String line, @Nonnull Object[] result) throws IOException {
        Objects.requireNonNull(result);
        Preconditions.checkArgument(result.length == 2);
        List<Text> tokens = new ArrayList<Text>(32);
        List<Text> pos = new ArrayList<Text>(32);
        TokenStream stream = null;
        try {
            stream = analyzer.tokenStream("", line);
            if (stream != null) {
                KuromojiUDF.analyzeTokens(stream, tokens, pos);
            }
        } finally {
            IOUtils.closeQuietly((Closeable)stream);
        }
        result[0] = tokens;
        result[1] = pos;
        return result;
    }

    /**
     * Tokenizes {@code line} and returns the surface terms.
     *
     * @param analyzer the analyzer to use
     * @param line the text to tokenize
     * @return the tokens in stream order
     * @throws IOException if the token stream fails; the stream is closed in any case
     */
    @Nonnull
    private static List<Text> parseLine(@Nonnull JapaneseAnalyzer analyzer, @Nonnull String line) throws IOException {
        List<Text> tokens = new ArrayList<Text>(32);
        TokenStream stream = null;
        try {
            stream = analyzer.tokenStream("", line);
            if (stream != null) {
                KuromojiUDF.analyzeTokens(stream, tokens);
            }
        } finally {
            IOUtils.closeQuietly((Closeable)stream);
        }
        return tokens;
    }

    /**
     * Closes the analyzer, if any, and drops the reference so it cannot be reused after closing.
     */
    @Override
    public void close() throws IOException {
        IOUtils.closeQuietly((Closeable)this._analyzer);
        this._analyzer = null;
    }

    /**
     * Maps a case-insensitive mode name to a tokenizer mode.
     *
     * @param arg one of NORMAL, SEARCH, EXTENDED or DEFAULT
     * @return the matching mode
     * @throws UDFArgumentException if {@code arg} is none of the accepted names
     */
    @Nonnull
    private static JapaneseTokenizer.Mode tokenizationMode(@Nonnull String arg) throws UDFArgumentException {
        if ("NORMAL".equalsIgnoreCase(arg)) {
            return JapaneseTokenizer.Mode.NORMAL;
        }
        if ("SEARCH".equalsIgnoreCase(arg)) {
            return JapaneseTokenizer.Mode.SEARCH;
        }
        if ("EXTENDED".equalsIgnoreCase(arg)) {
            return JapaneseTokenizer.Mode.EXTENDED;
        }
        if ("DEFAULT".equalsIgnoreCase(arg)) {
            return JapaneseTokenizer.DEFAULT_MODE;
        }
        throw new UDFArgumentException("Expected NORMAL|SEARCH|EXTENDED|DEFAULT but got an unexpected mode: " + arg);
    }

    /**
     * Builds the stop-word set.
     *
     * @param array the user-supplied stop words, or {@code null}
     * @return the analyzer's default stop set for {@code null}, the empty set for an empty array,
     *         otherwise a set built from {@code array} with the ignore-case flag enabled
     */
    @Nonnull
    private static CharArraySet stopWords(@Nullable String[] array) throws UDFArgumentException {
        if (array == null) {
            return JapaneseAnalyzer.getDefaultStopSet();
        }
        if (array.length == 0) {
            return CharArraySet.EMPTY_SET;
        }
        return new CharArraySet(Arrays.asList(array), true);
    }

    /**
     * Builds the stop-tag set from the 4th argument.
     *
     * @param oi the object inspector of the stop-tags argument
     * @return the analyzer's default stop tags when the argument is void or yields no array,
     *         an empty set for an empty array, otherwise the non-null entries of the array
     */
    @Nonnull
    private static Set<String> stopTags(@Nonnull ObjectInspector oi) throws UDFArgumentException {
        if (HiveUtils.isVoidOI(oi)) {
            return JapaneseAnalyzer.getDefaultStopTags();
        }
        String[] array = HiveUtils.getConstStringArray(oi);
        if (array == null) {
            return JapaneseAnalyzer.getDefaultStopTags();
        }
        int length = array.length;
        if (length == 0) {
            return Collections.emptySet();
        }
        Set<String> results = new HashSet<String>(length);
        for (String s : array) {
            if (s != null) {
                results.add(s);
            }
        }
        return results;
    }

    /**
     * Builds a user dictionary from in-line rows, joined with newlines before parsing.
     *
     * @param userDictArray the dictionary rows, or {@code null}
     * @return the dictionary, or {@code null} when {@code userDictArray} is {@code null}
     * @throws UDFArgumentException if the rows cannot be parsed
     */
    @Nullable
    private static UserDictionary userDictionary(@Nullable String[] userDictArray) throws UDFArgumentException {
        if (userDictArray == null) {
            return null;
        }
        StringBuilder builder = new StringBuilder();
        for (String row : userDictArray) {
            builder.append(row).append('\n');
        }
        StringReader reader = new StringReader(builder.toString());
        try {
            return UserDictionary.open(reader);
        }
        catch (Throwable e) {
            throw new UDFArgumentException("Failed to create user dictionary based on the given array<string>: " + builder.toString() + '\n' + ExceptionUtils.prettyPrintStackTrace(e));
        }
    }

    /**
     * Downloads a user dictionary over HTTP and parses it as strictly decoded UTF-8.
     *
     * @param userDictURL the dictionary URL, or {@code null}
     * @return the dictionary, or {@code null} when {@code userDictURL} is {@code null}
     * @throws UDFArgumentException if the connection fails, the response code is not 200,
     *         or the content cannot be read or parsed
     */
    @Nullable
    private static UserDictionary userDictionary(@Nullable String userDictURL) throws UDFArgumentException {
        if (userDictURL == null) {
            return null;
        }

        final HttpURLConnection conn;
        try {
            conn = HttpUtils.getHttpURLConnection(userDictURL);
        }
        catch (IOException | IllegalArgumentException e) {
            throw new UDFArgumentException("Failed to create HTTP connection to the URL: " + userDictURL + '\n' + ExceptionUtils.prettyPrintStackTrace(e));
        }
        conn.setRequestProperty("Accept-Encoding", "gzip");
        conn.setConnectTimeout(CONNECT_TIMEOUT_MS);
        conn.setReadTimeout(READ_TIMEOUT_MS);

        final int responseCode;
        try {
            responseCode = conn.getResponseCode();
        }
        catch (IOException e) {
            throw new UDFArgumentException("Failed to get response code: " + userDictURL + '\n' + ExceptionUtils.prettyPrintStackTrace(e));
        }
        if (responseCode != 200) {
            throw new UDFArgumentException("Got invalid response code: " + responseCode);
        }

        final InputStream is;
        try {
            // NOTE(review): decodeInputStream presumably handles the gzip encoding requested above; confirm in IOUtils.
            is = IOUtils.decodeInputStream(HttpUtils.getLimitedInputStream(conn, MAX_INPUT_STREAM_SIZE));
        }
        catch (IOException | NullPointerException e) {
            throw new UDFArgumentException("Failed to get input stream from the connection: " + userDictURL + '\n' + ExceptionUtils.prettyPrintStackTrace(e));
        }

        // REPORT makes malformed or unmappable bytes fail the read rather than being replaced.
        CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder().onMalformedInput(CodingErrorAction.REPORT).onUnmappableCharacter(CodingErrorAction.REPORT);
        InputStreamReader reader = new InputStreamReader(is, decoder);
        try {
            return UserDictionary.open(reader);
        }
        catch (Throwable e) {
            throw new UDFArgumentException("Failed to parse the file in CSV format (UTF-8 encoding is expected): " + userDictURL + '\n' + ExceptionUtils.prettyPrintStackTrace(e));
        }
        finally {
            // Release the HTTP response stream whether or not parsing succeeded.
            IOUtils.closeQuietly((Closeable)reader);
        }
    }

    /**
     * Drains {@code stream}, appending each term to {@code tokens}. The caller closes the stream.
     *
     * @param stream the token stream to consume
     * @param tokens receives one entry per token
     * @throws IOException if the stream fails
     */
    private static void analyzeTokens(@Nonnull TokenStream stream, @Nonnull List<Text> tokens) throws IOException {
        CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            String term = termAttr.toString();
            tokens.add(new Text(term));
        }
        // Complete the TokenStream consumer workflow (reset, incrementToken, end, close).
        stream.end();
    }

    /**
     * Drains {@code stream}, appending each term to {@code tokenResult} and its part-of-speech
     * to {@code posResult}. The caller closes the stream.
     *
     * @param stream the token stream to consume
     * @param tokenResult receives one entry per token
     * @param posResult receives the part-of-speech of each token, in the same order
     * @throws IOException if the stream fails
     */
    private static void analyzeTokens(@Nonnull TokenStream stream, @Nonnull List<Text> tokenResult, @Nonnull List<Text> posResult) throws IOException {
        CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
        PartOfSpeechAttribute posAttr = stream.addAttribute(PartOfSpeechAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            String term = termAttr.toString();
            tokenResult.add(new Text(term));
            String pos = posAttr.getPartOfSpeech();
            posResult.add(new Text(pos));
        }
        // Complete the TokenStream consumer workflow (reset, incrementToken, end, close).
        stream.end();
    }

    /**
     * Renders the call for display.
     *
     * @param children the string forms of the arguments
     * @return {@code tokenize_ja(} followed by {@code Arrays.toString(children)} and {@code )}
     */
    @Override
    public String getDisplayString(String[] children) {
        return "tokenize_ja(" + Arrays.toString(children) + ')';
    }
}

