/*
 * Decompiled with CFR 0.152.
 */
package com.worksap.nlp.sudachi;

import com.worksap.nlp.sudachi.InputTextPlugin;
import com.worksap.nlp.sudachi.LatticeImpl;
import com.worksap.nlp.sudachi.LatticeNode;
import com.worksap.nlp.sudachi.LatticeNodeImpl;
import com.worksap.nlp.sudachi.Morpheme;
import com.worksap.nlp.sudachi.MorphemeList;
import com.worksap.nlp.sudachi.OovProviderPlugin;
import com.worksap.nlp.sudachi.PathRewritePlugin;
import com.worksap.nlp.sudachi.Tokenizer;
import com.worksap.nlp.sudachi.UTF8InputText;
import com.worksap.nlp.sudachi.UTF8InputTextBuilder;
import com.worksap.nlp.sudachi.dictionary.CategoryType;
import com.worksap.nlp.sudachi.dictionary.Grammar;
import com.worksap.nlp.sudachi.dictionary.Lexicon;
import com.worksap.nlp.sudachi.sentdetect.SentenceDetector;
import java.io.IOException;
import java.io.PrintStream;
import java.io.Reader;
import java.io.StringWriter;
import java.io.Writer;
import java.nio.CharBuffer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import javax.json.Json;
import javax.json.JsonArrayBuilder;
import javax.json.JsonObjectBuilder;
import javax.json.JsonWriter;

/**
 * {@link Tokenizer} implementation that builds a lattice over the UTF-8 bytes
 * of the (plugin-rewritten) input, takes the lattice's best path, lets the
 * path-rewrite plugins modify it and finally wraps the result in a
 * {@link MorphemeList}.
 *
 * <p>
 * The instance keeps mutable working state (the shared {@link #lattice}, the
 * optional dump stream and JSON builder), so nothing in this class makes it
 * safe for concurrent use.
 */
class JapaneseTokenizer implements Tokenizer {
    /** Capacity, in chars, of the buffer used when reading from a {@link Reader}. */
    private static final int READ_BUFFER_SIZE = 4096;

    /** How many bytes before a candidate sentence end are scanned for lexicon entries. */
    private static final int NON_BREAK_LOOKBACK_BYTES = 64;

    Grammar grammar;
    Lexicon lexicon;
    List<InputTextPlugin> inputTextPlugins;
    List<OovProviderPlugin> oovProviderPlugins;
    List<PathRewritePlugin> pathRewritePlugins;
    /** Last entry of {@link #oovProviderPlugins}, or {@code null} when that list is empty. */
    OovProviderPlugin defaultOovProvider;
    /** When non-null, human-readable dumps of the input, lattice and paths are printed here. */
    PrintStream dumpOutput;
    /** Non-null only while {@link #dumpInternalStructures(String)} is running. */
    JsonObjectBuilder jsonBuilder;
    /** Passed through to every {@link MorphemeList}; {@code true} until {@link #disableEmptyMorpheme()}. */
    boolean allowEmptyMorpheme;
    /** Working lattice, resized and cleared for every tokenized sentence. */
    LatticeImpl lattice;

    /**
     * Creates a tokenizer.
     *
     * @param grammar
     *            grammar used to build input texts, the lattice and morpheme lists
     * @param lexicon
     *            lexicon used for word lookup
     * @param inputTextPlugins
     *            plugins applied, in order, to the input text builder
     * @param oovProviderPlugins
     *            plugins asked for out-of-vocabulary nodes; the last one is also
     *            used as the fallback provider
     * @param pathRewritePlugins
     *            plugins applied, in order, to the best path
     */
    JapaneseTokenizer(Grammar grammar, Lexicon lexicon, List<InputTextPlugin> inputTextPlugins, List<OovProviderPlugin> oovProviderPlugins, List<PathRewritePlugin> pathRewritePlugins) {
        this.grammar = grammar;
        this.lexicon = lexicon;
        this.inputTextPlugins = inputTextPlugins;
        this.oovProviderPlugins = oovProviderPlugins;
        this.pathRewritePlugins = pathRewritePlugins;
        this.lattice = new LatticeImpl(grammar);
        this.allowEmptyMorpheme = true;
        if (!oovProviderPlugins.isEmpty()) {
            this.defaultOovProvider = oovProviderPlugins.get(oovProviderPlugins.size() - 1);
        }
    }

    /**
     * Tokenizes {@code text} as a single sentence.
     *
     * @param mode
     *            split mode
     * @param text
     *            text to tokenize
     * @return the morphemes; an empty list when {@code text} is empty
     */
    @Override
    public List<Morpheme> tokenize(Tokenizer.SplitMode mode, String text) {
        if (text.isEmpty()) {
            return Collections.emptyList();
        }
        return tokenizeSentence(mode, buildInputText(text));
    }

    /**
     * Splits {@code text} into sentences with a {@link SentenceDetector} and
     * tokenizes each of them.
     *
     * @param mode
     *            split mode
     * @param text
     *            text to tokenize
     * @return one morpheme list per detected sentence; empty when {@code text} is
     *         empty
     */
    @Override
    public Iterable<List<Morpheme>> tokenizeSentences(Tokenizer.SplitMode mode, String text) {
        if (text.isEmpty()) {
            return Collections.emptyList();
        }
        UTF8InputText input = buildInputText(text);
        String normalized = input.getText();
        List<List<Morpheme>> sentences = new ArrayList<>();
        SentenceDetector detector = new SentenceDetector();
        int bos = 0;
        NonBreakChecker checker = new NonBreakChecker(input);
        checker.setBos(bos);

        int length;
        while ((length = detector.getEos(normalized, checker)) != 0) {
            // A negative result is treated here as a sentence of that absolute length.
            if (length < 0) {
                length = -length;
            }
            int eos = bos + length;
            // NOTE(review): eos is an offset into the whole text, while normalized is
            // only the not yet consumed remainder -- confirm this comparison is intended.
            if (eos < normalized.length()) {
                eos = input.getNextInOriginal(eos - 1);
                length = eos - bos;
            }
            sentences.add(tokenizeSentence(mode, input.slice(bos, eos)));
            normalized = normalized.substring(length);
            bos = eos;
            checker.setBos(bos);
        }
        return sentences;
    }

    /**
     * Reads {@code reader} in chunks, splits the content into sentences and
     * tokenizes each of them. Text after the last detected sentence end of a chunk
     * is carried over to the next chunk; whatever is left when the reader is
     * exhausted is tokenized as the final sentence.
     *
     * @param mode
     *            split mode
     * @param reader
     *            source of the text
     * @return one morpheme list per sentence
     * @throws IOException
     *             if reading from {@code reader} fails
     */
    @Override
    public Iterable<List<Morpheme>> tokenizeSentences(Tokenizer.SplitMode mode, Reader reader) throws IOException {
        List<List<Morpheme>> sentences = new ArrayList<>();
        CharBuffer buffer = CharBuffer.allocate(READ_BUFFER_SIZE);
        SentenceDetector detector = new SentenceDetector();

        while (reader.read(buffer) > 0) {
            buffer.flip();
            UTF8InputText input = buildInputText(buffer);
            String normalized = input.getText();
            int bos = 0;
            NonBreakChecker checker = new NonBreakChecker(input);
            checker.setBos(bos);

            int length;
            while ((length = detector.getEos(normalized, checker)) > 0) {
                int eos = bos + length;
                // NOTE(review): eos is an offset into the whole chunk, while normalized
                // is only the not yet consumed remainder -- confirm this is intended.
                if (eos < normalized.length()) {
                    eos = input.getNextInOriginal(eos - 1);
                    length = eos - bos;
                }
                sentences.add(tokenizeSentence(mode, input.slice(bos, eos)));
                normalized = normalized.substring(length);
                bos = eos;
                checker.setBos(bos);
            }

            boolean chunkConsumed = length == 0;
            boolean bufferFull = buffer.limit() == buffer.capacity();
            if (length < 0 && bos == 0 && bufferFull) {
                // Nothing was consumed and there is no room to read more: compacting
                // would leave the buffer full, the next read would return 0 and the
                // rest of the reader would be dropped. Force a split at the fallback
                // length instead, as the String overload does for negative results.
                int eos = -length;
                if (eos < normalized.length()) {
                    eos = input.getNextInOriginal(eos - 1);
                }
                sentences.add(tokenizeSentence(mode, input.slice(0, eos)));
                normalized = normalized.substring(eos);
                bos = eos;
                chunkConsumed = normalized.isEmpty();
            }

            if (chunkConsumed) {
                // Restore the full capacity; otherwise the limit set by flip() would
                // cap every following read at the size of this chunk.
                buffer.clear();
            } else {
                // Keep the unconsumed tail at the start of the buffer for the next read.
                buffer.position(input.textIndexToOriginalTextIndex(bos));
                buffer.compact();
            }
        }

        buffer.flip();
        if (buffer.hasRemaining()) {
            sentences.add(tokenizeSentence(mode, buildInputText(buffer)));
        }
        return sentences;
    }

    /**
     * Sets the stream that receives debug dumps; {@code null} disables dumping.
     *
     * @param output
     *            dump destination, or {@code null}
     */
    @Override
    public void setDumpOutput(PrintStream output) {
        this.dumpOutput = output;
    }

    /**
     * Tokenizes {@code text} in split mode C while recording the input text, the
     * lattice and the paths as JSON.
     *
     * @param text
     *            text to analyze
     * @return the recorded structures as a JSON object string
     */
    @Override
    public String dumpInternalStructures(String text) {
        this.jsonBuilder = Json.createObjectBuilder();
        try {
            tokenize(Tokenizer.SplitMode.C, text);
            StringWriter stringWriter = new StringWriter();
            try (JsonWriter writer = Json.createWriter(stringWriter)) {
                writer.writeObject(this.jsonBuilder.build());
            }
            return stringWriter.toString();
        } finally {
            // Stop recording; otherwise every later tokenize call would keep
            // serializing its lattice into this stale builder.
            this.jsonBuilder = null;
        }
    }

    /**
     * Builds the input text for {@code text}, applying every input text plugin in
     * order, and records it in the active dump outputs.
     *
     * @param text
     *            raw text
     * @return the built input text
     */
    UTF8InputText buildInputText(CharSequence text) {
        UTF8InputTextBuilder builder = new UTF8InputTextBuilder(text, grammar);
        for (InputTextPlugin plugin : inputTextPlugins) {
            plugin.rewrite(builder);
        }
        UTF8InputText input = builder.build();
        if (dumpOutput != null) {
            dumpOutput.println("=== Input dump:");
            dumpOutput.println(input.getText());
        }
        if (jsonBuilder != null) {
            jsonBuilder.add("inputText", Json.createObjectBuilder().add("originalText", input.getOriginalText()).add("modifiedText", input.getText()));
        }
        return input;
    }

    /**
     * Tokenizes one sentence: builds the lattice, takes its best path, applies
     * the path-rewrite plugins, clears the lattice and, for modes other than C,
     * splits the path nodes into their A or B units.
     *
     * @param mode
     *            split mode
     * @param input
     *            the sentence
     * @return the morphemes of the sentence
     */
    List<Morpheme> tokenizeSentence(Tokenizer.SplitMode mode, UTF8InputText input) {
        buildLattice(input);
        if (dumpOutput != null) {
            dumpOutput.println("=== Lattice dump:");
            lattice.dump(dumpOutput);
        }
        if (jsonBuilder != null) {
            jsonBuilder.add("lattice", lattice.toJson());
        }

        List<LatticeNode> path = lattice.getBestPath();
        if (dumpOutput != null) {
            dumpOutput.println("=== Before rewriting:");
            dumpPath(path);
        }
        if (jsonBuilder != null) {
            jsonBuilder.add("bestPath", pathToJson(path, lattice));
        }

        for (PathRewritePlugin plugin : pathRewritePlugins) {
            plugin.rewrite(input, path, lattice);
        }
        lattice.clear();

        if (mode != Tokenizer.SplitMode.C) {
            path = splitPath(path, mode);
        }
        if (dumpOutput != null) {
            dumpOutput.println("=== After rewriting:");
            dumpPath(path);
            dumpOutput.println("===");
        }
        if (jsonBuilder != null) {
            jsonBuilder.add("rewrittenPath", pathToJson(path, lattice));
        }
        return new MorphemeList(input, grammar, lexicon, path, allowEmptyMorpheme);
    }

    /**
     * Fills {@link #lattice} for {@code input}. At every byte offset where a word
     * may begin and a previous node exists, lexicon entries and out-of-vocabulary
     * nodes are inserted; the end-of-sentence node is connected at the end.
     *
     * @param input
     *            the sentence
     * @return the shared lattice
     * @throws IllegalStateException
     *             if no node at all can be produced at such an offset
     */
    LatticeImpl buildLattice(UTF8InputText input) {
        byte[] bytes = input.getByteText();
        lattice.resize(bytes.length);
        for (int i = 0; i < bytes.length; ++i) {
            if (!input.canBow(i) || !lattice.hasPreviousNode(i)) {
                continue;
            }

            boolean hasWords = false;
            Iterator<int[]> iterator = lexicon.lookup(bytes, i);
            while (iterator.hasNext()) {
                int[] r = iterator.next();
                int wordId = r[0];
                int end = r[1];
                // Skip entries that end inside the text at a position where no word may begin.
                if (end < bytes.length && !input.canBow(end)) {
                    continue;
                }
                LatticeNodeImpl n = new LatticeNodeImpl(lexicon, lexicon.getLeftId(wordId), lexicon.getRightId(wordId), lexicon.getCost(wordId), wordId);
                lattice.insert(i, end, n);
                hasWords = true;
            }

            if (!input.getCharCategoryTypes(i).contains(CategoryType.NOOOVBOW)) {
                for (OovProviderPlugin plugin : oovProviderPlugins) {
                    for (LatticeNode node : plugin.getOOV(input, i, hasWords)) {
                        hasWords = true;
                        lattice.insert(node.getBegin(), node.getEnd(), node);
                    }
                }
            }

            // Nothing found so far: fall back to the default provider.
            if (!hasWords && defaultOovProvider != null) {
                for (LatticeNode node : defaultOovProvider.getOOV(input, i, hasWords)) {
                    hasWords = true;
                    lattice.insert(node.getBegin(), node.getEnd(), node);
                }
            }

            if (!hasWords) {
                throw new IllegalStateException("there is no morpheme at " + i);
            }
        }
        lattice.connectEosNode();
        return lattice;
    }

    /**
     * Replaces every node of {@code path} that has more than one split unit by
     * nodes for those units. Mode A uses the A-unit split, any other mode the
     * B-unit split.
     *
     * @param path
     *            path to split
     * @param mode
     *            split mode
     * @return a new list holding the split path
     */
    List<LatticeNode> splitPath(List<LatticeNode> path, Tokenizer.SplitMode mode) {
        List<LatticeNode> newPath = new ArrayList<>();
        for (LatticeNode node : path) {
            int[] wids = mode == Tokenizer.SplitMode.A ? node.getWordInfo().getAunitSplit() : node.getWordInfo().getBunitSplit();
            if (wids.length <= 1) {
                newPath.add(node);
                continue;
            }
            // Lay the units out back to back, starting where the original node began.
            int offset = node.getBegin();
            for (int wid : wids) {
                LatticeNodeImpl n = new LatticeNodeImpl(lexicon, 0, 0, 0, wid);
                n.begin = offset;
                offset += n.getWordInfo().getLength();
                n.end = offset;
                newPath.add(n);
            }
        }
        return newPath;
    }

    /**
     * Prints every node of {@code path}, prefixed with its index, to
     * {@link #dumpOutput}, which must not be {@code null}.
     *
     * @param path
     *            path to print
     */
    void dumpPath(List<LatticeNode> path) {
        int i = 0;
        for (LatticeNode node : path) {
            dumpOutput.println(String.format("%d: %s", i, node.toString()));
            ++i;
        }
    }

    /**
     * Converts {@code path} to a JSON array using {@code lattice} to serialize the
     * nodes; every node must be a {@link LatticeNodeImpl}.
     *
     * @param path
     *            path to convert
     * @param lattice
     *            lattice used to serialize each node
     * @return the array builder holding one entry per node
     */
    JsonArrayBuilder pathToJson(List<LatticeNode> path, LatticeImpl lattice) {
        JsonArrayBuilder builder = Json.createArrayBuilder();
        for (LatticeNode node : path) {
            builder.add(lattice.nodeToJson((LatticeNodeImpl) node));
        }
        return builder;
    }

    /** Makes all subsequently created morpheme lists disallow empty morphemes. */
    void disableEmptyMorpheme() {
        this.allowEmptyMorpheme = false;
    }

    /**
     * Checker handed to the {@link SentenceDetector}: reports whether a lexicon
     * entry speaks against breaking the sentence at a candidate position.
     */
    class NonBreakChecker implements SentenceDetector.NonBreakCheker {
        private final UTF8InputText input;
        /** Offset at which the current sentence starts; candidate lengths are relative to it. */
        private int bos;

        NonBreakChecker(UTF8InputText input) {
            this.input = input;
        }

        /**
         * Sets the start offset of the current sentence.
         *
         * @param bos
         *            start offset
         */
        public void setBos(int bos) {
            this.bos = bos;
        }

        /**
         * Looks up the lexicon at each of the last {@value #NON_BREAK_LOOKBACK_BYTES}
         * byte offsets before the candidate end.
         *
         * @param length
         *            candidate sentence length, counted from the current start offset
         * @return {@code true} if an entry ends beyond the candidate end, or ends
         *         exactly at it while starting more than one position (as reported by
         *         {@code getOffsetTextLength}) before it
         */
        @Override
        public boolean hasNonBreakWord(int length) {
            int eos = bos + length;
            // Presumably the byte offset of the candidate end -- it is compared with
            // the end offsets returned by the byte-based lexicon lookup.
            int byteEOS = input.getCodePointsOffsetLength(0, eos);
            byte[] bytes = input.getByteText();
            for (int i = Math.max(0, byteEOS - NON_BREAK_LOOKBACK_BYTES); i < byteEOS; ++i) {
                Iterator<int[]> iterator = lexicon.lookup(bytes, i);
                while (iterator.hasNext()) {
                    int wordEnd = iterator.next()[1];
                    if (wordEnd > byteEOS) {
                        return true;
                    }
                    if (wordEnd == byteEOS && eos - input.getOffsetTextLength(i) > 1) {
                        return true;
                    }
                }
            }
            return false;
        }
    }
}

