/*
 * Decompiled with CFR 0.152.
 */
package com.code972.hebmorph;

import com.code972.hebmorph.HebrewCharacters;
import com.code972.hebmorph.HebrewUtils;
import com.code972.hebmorph.Reference;
import com.code972.hebmorph.datastructures.DictRadix;
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;

/**
 * Character-level tokenizer that reads from a {@link Reader} and emits one token per
 * {@link #nextToken(Reference)} call, together with a bitmask of {@link TokenType} flags
 * describing the token.
 *
 * <p>Besides plain runs of letters/digits, the tokenizer recognizes caller-registered
 * "special cases" (see {@link #addSpecialCase(String)}) that may contain characters which
 * would otherwise terminate a token.
 *
 * <p>The class keeps mutable per-instance buffers and offsets and contains no
 * synchronization.
 */
public class Tokenizer {
    /** Apostrophe-like characters treated as a geresh; normalized to {@code '} inside tokens. */
    public static final char[] Geresh = new char[]{'\'', '\u05f3', '\u2018', '\u2019', '\u201b', '\uff07'};
    /** Double-quote-like characters treated as gershayim; normalized to {@code "} inside tokens. */
    public static final char[] Gershayim = new char[]{'\"', '\u05f4', '\u201c', '\u201d', '\u201f', '\u275e', '\uff02'};
    /** Hyphen/dash-like characters treated as a makaf; one of these ends a token and flags it as Construct. */
    public static final char[] Makaf = new char[]{'-', '\u2012', '\u2013', '\u2014', '\u2015', '\u05be'};
    /** Concatenation of {@link #Geresh}, {@link #Gershayim} and {@link #Makaf}. */
    public static final char[] CharsFollowingPrefixes = HebrewUtils.concatenateCharArrays(Geresh, Gershayim, Makaf);
    /** Letters after which a trailing geresh is kept on a Hebrew token instead of being stripped. */
    public static final char[] LettersAcceptingGeresh = new char[]{'\u05d6', '\u05d2', '\u05e5', '\u05e6', '\u05d7'};

    private static final Byte dummyData = 0;
    private static final int IO_BUFFER_SIZE = 4096;
    /** Maximum length of a special-case token; also the size of the scratch lookup buffer. */
    static final int TOKENIZATION_EXCEPTION_MAX_LENGTH = 25;

    private Reader input;
    /** Number of valid chars currently held in {@link #ioBuffer}. */
    private int dataLen = 0;
    /** Offset in the input of the first char of {@link #ioBuffer}. */
    private int inputOffset = 0;
    private int tokenOffset = 0;
    private int tokenLengthInSource = 0;
    private Character suffixForExactMatch = null;
    private final HashMap<String, Integer> hebrewPrefixes;
    private final DictRadix<Byte> specialCases;
    private char[] ioBuffer = new char[IO_BUFFER_SIZE];
    /** Index of the next unread char in {@link #ioBuffer}. */
    private int ioBufferIndex = 0;
    private final char[] wordBuffer = new char[127];
    private byte currentTokenLength = 0;
    private int tokenType = 0;
    /** Scratch buffer used to test "current token + one more char" against the special cases. */
    private char[] tokenizationExceptionBuffer = new char[TOKENIZATION_EXCEPTION_MAX_LENGTH];

    /**
     * Creates a tokenizer with an empty set of special cases.
     *
     * @param input    source of characters
     * @param prefixes map whose keys are the legal prefixes
     */
    public Tokenizer(Reader input, HashMap<String, Integer> prefixes) {
        this(input, prefixes, null);
    }

    /**
     * Creates a tokenizer.
     *
     * @param input        source of characters
     * @param prefixes     map whose keys are the legal prefixes
     * @param specialCases special tokenization cases to use; when {@code null} a new, empty
     *                     radix is created
     */
    public Tokenizer(Reader input, HashMap<String, Integer> prefixes, DictRadix<Byte> specialCases) {
        this.input = input;
        this.specialCases = specialCases != null ? specialCases : new DictRadix<Byte>(false);
        this.hebrewPrefixes = prefixes;
    }

    /** @return the offset recorded for the most recently returned token */
    public final int getOffset() {
        return this.tokenOffset;
    }

    /** @return the length recorded for the most recently returned token, measured in source chars */
    public int getLengthInSource() {
        return this.tokenLengthInSource;
    }

    /** @return the character that marks a token as {@link TokenType#Exact}, or {@code null} if unset */
    public Character getSuffixForExactMatch() {
        return this.suffixForExactMatch;
    }

    /** @param suffixForExactMatch character that marks a token as {@link TokenType#Exact}; may be {@code null} */
    public void setSuffixForExactMatch(Character suffixForExactMatch) {
        this.suffixForExactMatch = suffixForExactMatch;
    }

    /**
     * Registers a token that should be kept whole even though it contains characters that
     * would normally end a token.
     *
     * @param token the special token
     * @throws IllegalArgumentException if the token is longer than
     *         {@link #TOKENIZATION_EXCEPTION_MAX_LENGTH} or contains a space
     */
    public void addSpecialCase(String token) {
        if (token.length() > TOKENIZATION_EXCEPTION_MAX_LENGTH) {
            throw new IllegalArgumentException("Special tokenization rule must be at most " + TOKENIZATION_EXCEPTION_MAX_LENGTH + " in length");
        }
        if (token.contains(" ")) {
            throw new IllegalArgumentException("Special tokenization rule cannot contain spaces");
        }
        this.specialCases.addNode(token, dummyData);
    }

    /** Removes all registered special cases. */
    public void clearSpecialCases() {
        this.specialCases.clear();
    }

    /**
     * @param prefix       candidate prefix
     * @param prefixesTree map whose keys are the legal prefixes
     * @return {@code true} if {@code prefix} is a key of {@code prefixesTree}
     */
    public static boolean isLegalPrefix(String prefix, HashMap<String, Integer> prefixesTree) {
        return prefixesTree.containsKey(prefix);
    }

    /**
     * @param prefix       buffer holding the candidate prefix starting at index 0
     * @param length       number of chars of {@code prefix} to consider
     * @param prefixesTree map whose keys are the legal prefixes
     * @return {@code true} if the first {@code length} chars of {@code prefix} are a key of {@code prefixesTree}
     */
    public static boolean isLegalPrefix(char[] prefix, int length, HashMap<String, Integer> prefixesTree) {
        return prefixesTree.containsKey(new String(prefix, 0, length));
    }

    /**
     * Tests whether {@code prefix[0..length)} followed by {@code c} is (the beginning of) a
     * special case. Candidates that would not fit the scratch buffer are rejected.
     */
    private boolean isRecognizedException(char[] prefix, byte length, char c) {
        if (length >= TOKENIZATION_EXCEPTION_MAX_LENGTH) {
            return false;
        }
        System.arraycopy(prefix, 0, this.tokenizationExceptionBuffer, 0, length);
        this.tokenizationExceptionBuffer[length] = c;
        return this.isRecognizedException(this.tokenizationExceptionBuffer, length + 1, (byte)(length + 1));
    }

    /** Tests whether the single char {@code c} is (the beginning of) a special case. */
    private boolean isRecognizedException(char c) {
        this.tokenizationExceptionBuffer[0] = c;
        return this.isRecognizedException(this.tokenizationExceptionBuffer, 1, (byte)1);
    }

    /** Non-exact variant of {@link #isRecognizedException(char[], int, byte, boolean)}. */
    private boolean isRecognizedException(char[] token, int tokenLen, byte length) {
        return this.isRecognizedException(token, tokenLen, length, false);
    }

    /**
     * Looks the token up in the special cases, after skipping a leading run of Hebrew letters
     * as long as every step of that run is a legal prefix.
     *
     * @param token    buffer holding the candidate
     * @param tokenLen upper bound for the prefix scan
     * @param length   number of chars of {@code token} that make up the candidate
     * @param exact    passed negated as the last argument of the radix lookup
     *                 (NOTE(review): presumably toggles partial matching - confirm against DictRadix)
     * @return {@code true} if the lookup completed without throwing
     */
    private boolean isRecognizedException(char[] token, int tokenLen, byte length, boolean exact) {
        int prefixLen = 0;
        while (prefixLen < tokenLen && HebrewUtils.isHebrewLetter(token[prefixLen])) {
            if (!Tokenizer.isLegalPrefix(token, prefixLen + 1, this.hebrewPrefixes)) {
                // The leading letters are not a legal prefix: look up the token from its start.
                prefixLen = 0;
                break;
            }
            ++prefixLen;
        }
        try {
            // Only success/failure matters here; the looked-up value is discarded.
            this.specialCases.lookup(token, prefixLen, length - prefixLen, prefixLen, !exact);
            return true;
        }
        catch (IllegalArgumentException ignored) {
            // An IllegalArgumentException from the lookup is treated as "not a special case".
            return false;
        }
    }

    /**
     * Reads the next token from the input.
     *
     * @param tokenString receives the token text in its {@code ref} field (empty string at end of input)
     * @return bitmask of {@link TokenType} flags for the token, or {@code 0} when the input is
     *         exhausted and no token was collected
     * @throws IOException if reading from the underlying reader fails
     */
    public int nextToken(Reference<String> tokenString) throws IOException {
        this.currentTokenLength = 0;
        this.tokenOffset = 0;
        this.tokenType = 0;
        // Set once a custom (special-case) match failed, to avoid retrying it on the re-scan.
        boolean avoidTryingCustom = false;
        while (true) {
            if (this.ioBufferIndex >= this.dataLen) {
                // Buffer exhausted: refill it from the reader.
                this.inputOffset += this.dataLen;
                this.dataLen = this.input.read(this.ioBuffer, 0, this.ioBuffer.length);
                if (this.dataLen <= 0) {
                    this.dataLen = 0;
                    // A pending custom token must match a special case exactly, otherwise
                    // fall back to its plain letters/digits.
                    if ((this.tokenType & TokenType.Custom) > 0 && this.currentTokenLength > 0
                            && !this.isRecognizedException(this.wordBuffer, this.wordBuffer.length, this.currentTokenLength, true)) {
                        this.abortCustomToken();
                    }
                    if (this.currentTokenLength != 0) {
                        break;
                    }
                    tokenString.ref = "";
                    this.tokenLengthInSource = 0;
                    this.tokenOffset = this.inputOffset;
                    return 0;
                }
                this.ioBufferIndex = 0;
            }

            char c = HebrewCharacters.collapseAlternate(this.ioBuffer[this.ioBufferIndex++]);
            boolean appendCurrentChar = false;

            if (this.currentTokenLength == 0) {
                // First char of a token: decide what kind of token this is.
                if (HebrewUtils.isHebrewLetter(c)) {
                    // A final-form Hebrew letter does not start a token.
                    if (!HebrewUtils.isFinalHebrewLetter(c)) {
                        this.tokenType |= TokenType.Hebrew;
                        appendCurrentChar = true;
                    }
                } else if (Character.isLetterOrDigit(c)) {
                    this.tokenType |= TokenType.NonHebrew;
                    if (Character.isDigit(c)) {
                        this.tokenType |= TokenType.Numeric;
                    }
                    appendCurrentChar = true;
                } else if (!avoidTryingCustom && !Character.isWhitespace(c) && this.isRecognizedException(c)) {
                    this.tokenType |= TokenType.Custom;
                    appendCurrentChar = true;
                }
            } else if (!avoidTryingCustom && (this.tokenType & TokenType.Custom) > 0 && !Character.isSpaceChar(c)) {
                // Inside a custom token: keep going only while it still matches a special case.
                this.wordBuffer[this.currentTokenLength] = c;
                if (!this.isRecognizedException(this.wordBuffer, this.wordBuffer.length, (byte)(this.currentTokenLength + 1))) {
                    if (!Character.isLetterOrDigit(c)) {
                        break;
                    }
                    // The match failed on a letter/digit: drop the Custom flag and re-scan the
                    // token as a regular one.
                    this.tokenType &= ~TokenType.Custom;
                    avoidTryingCustom = true;
                    --this.ioBufferIndex;
                    if (this.ioBufferIndex >= this.currentTokenLength) {
                        // The index can be moved back by the token length within the buffer.
                        this.ioBufferIndex -= this.currentTokenLength;
                        this.currentTokenLength = 0;
                        continue;
                    }
                    // Otherwise clean up what was already collected in wordBuffer.
                    this.abortCustomToken();
                    continue;
                }
                appendCurrentChar = true;
            } else if (HebrewUtils.isHebrewLetter(c) || HebrewUtils.isNiqqudChar(c)) {
                appendCurrentChar = true;
            } else if (Character.isLetterOrDigit(c)) {
                if (this.tokenType == TokenType.Hebrew) {
                    this.tokenType |= TokenType.Mixed;
                }
                appendCurrentChar = true;
            } else if (HebrewUtils.isOfChars(c, Gershayim)) {
                c = '\"';
                // Gershayim is only accepted right after a Hebrew letter or niqqud.
                char previous = this.wordBuffer[this.currentTokenLength - 1];
                if (!HebrewUtils.isHebrewLetter(previous) && !HebrewUtils.isNiqqudChar(previous)) {
                    break;
                }
                this.tokenType |= TokenType.Acronym;
                appendCurrentChar = true;
            } else if (HebrewUtils.isOfChars(c, Geresh)) {
                c = '\'';
                // In a Hebrew token a geresh must follow a Hebrew letter, niqqud or another geresh.
                char previous = this.wordBuffer[this.currentTokenLength - 1];
                if ((this.tokenType & TokenType.Hebrew) > 0 && !HebrewUtils.isHebrewLetter(previous)
                        && !HebrewUtils.isNiqqudChar(previous) && !HebrewUtils.isOfChars(previous, Geresh)) {
                    break;
                }
                appendCurrentChar = true;
            } else if (!avoidTryingCustom && !this.isSuffixForExactMatch(c) && !Character.isSpaceChar(c)
                    && this.isRecognizedException(this.wordBuffer, this.currentTokenLength, c)) {
                // Token so far plus this char starts a special case: switch to custom mode.
                this.tokenType |= TokenType.Custom;
                appendCurrentChar = true;
            } else {
                // Any other char ends the token; a makaf or the exact-match suffix also flags it.
                if (HebrewUtils.isOfChars(c, Makaf)) {
                    this.tokenType |= TokenType.Construct;
                } else if (this.isSuffixForExactMatch(c)) {
                    this.tokenType |= TokenType.Exact;
                }
                break;
            }

            if (!appendCurrentChar) {
                continue;
            }
            if (this.currentTokenLength == 0) {
                this.tokenOffset = this.inputOffset + this.ioBufferIndex - 1;
            } else if (this.currentTokenLength == this.wordBuffer.length - 1) {
                // Word buffer is full: silently drop further chars of this token.
                continue;
            }
            // Two consecutive identical geresh chars collapse into a single gershayim.
            // The length guard matters when a custom token starts with a geresh: there is no
            // previous char to inspect (the unguarded read indexed wordBuffer[-1]).
            if (this.currentTokenLength > 0 && HebrewUtils.isOfChars(c, Geresh)
                    && this.wordBuffer[this.currentTokenLength - 1] == c) {
                this.wordBuffer[this.currentTokenLength - 1] = '\"';
                this.tokenType |= TokenType.Acronym;
                continue;
            }
            this.wordBuffer[this.currentTokenLength++] = c;
        }

        // At end of input dataLen was reset to 0 and inputOffset already covers everything read;
        // otherwise the terminating char (already consumed) is excluded via the "- 1".
        this.tokenLengthInSource = this.dataLen <= 0
                ? Math.max(this.inputOffset - this.tokenOffset, 0)
                : Math.max(this.inputOffset + this.ioBufferIndex - 1 - this.tokenOffset, 0);

        // Strip a trailing gershayim.
        if (HebrewUtils.isOfChars(this.wordBuffer[this.currentTokenLength - 1], Gershayim)) {
            this.currentTokenLength = (byte)(this.currentTokenLength - 1);
            this.wordBuffer[this.currentTokenLength] = '\u0000';
            this.tokenLengthInSource = Math.max(this.tokenLengthInSource - 1, 0);
        }
        // Strip a trailing geresh from tokens longer than 2 chars, unless the token is Hebrew
        // and the preceding letter is one of LettersAcceptingGeresh.
        if (this.currentTokenLength > 2 && this.wordBuffer[this.currentTokenLength - 1] == '\''
                && !((this.tokenType & TokenType.Hebrew) != 0
                        && HebrewUtils.isOfChars(this.wordBuffer[this.currentTokenLength - 2], LettersAcceptingGeresh))) {
            this.currentTokenLength = (byte)(this.currentTokenLength - 1);
            this.wordBuffer[this.currentTokenLength] = '\u0000';
            this.tokenLengthInSource = Math.max(this.tokenLengthInSource - 1, 0);
        }
        tokenString.ref = new String(this.wordBuffer, 0, (int)this.currentTokenLength);
        return this.tokenType;
    }

    /**
     * Rewrites {@link #wordBuffer} in place after a custom token failed to match: skips
     * leading chars that are not letters/digits/niqqud, then keeps letters, digits, niqqud,
     * geresh and gershayim (normalized) up to the first other char, updating the token type
     * flags and {@link #currentTokenLength} accordingly.
     */
    private void abortCustomToken() {
        int start = 0;
        int pos = 0;
        boolean started = false;
        while (pos + start < this.currentTokenLength) {
            if (!started) {
                char first = this.wordBuffer[start];
                if (!HebrewUtils.isHebrewLetter(first) && !HebrewUtils.isNiqqudChar(first) && !Character.isLetterOrDigit(first)) {
                    ++start;
                    continue;
                }
                started = true;
            }
            char c = this.wordBuffer[pos + start];
            if (HebrewUtils.isHebrewLetter(c) || HebrewUtils.isNiqqudChar(c)) {
                this.tokenType |= TokenType.Hebrew;
            } else if (Character.isLetterOrDigit(c)) {
                if (this.tokenType == TokenType.Hebrew) {
                    this.tokenType |= TokenType.Mixed;
                } else {
                    this.tokenType |= TokenType.NonHebrew;
                }
            } else if (HebrewUtils.isOfChars(c, Gershayim)) {
                c = '\"';
                this.tokenType |= TokenType.Acronym;
            } else if (HebrewUtils.isOfChars(c, Geresh)) {
                c = '\'';
            } else {
                // Any other char ends the salvaged token.
                break;
            }
            this.wordBuffer[pos] = c;
            ++pos;
        }
        this.currentTokenLength = (byte)pos;
    }

    /** @return {@code true} if an exact-match suffix is configured and equals {@code c} */
    private boolean isSuffixForExactMatch(char c) {
        if (this.suffixForExactMatch == null) {
            return false;
        }
        return c == this.suffixForExactMatch.charValue();
    }

    /**
     * Points the tokenizer at a new input and clears all positional state. Registered special
     * cases and the exact-match suffix are left untouched.
     *
     * @param _input the new source of characters
     */
    public final void reset(Reader _input) {
        this.input = _input;
        this.inputOffset = 0;
        this.dataLen = 0;
        this.ioBufferIndex = 0;
        this.tokenOffset = 0;
        this.tokenLengthInSource = 0;
        this.currentTokenLength = 0;
        this.tokenType = 0;
    }

    /**
     * Bit flags combined into the value returned by {@link Tokenizer#nextToken(Reference)}.
     * NOTE(review): the fields are not {@code final}; they are kept mutable here to leave the
     * public interface unchanged.
     */
    public static class TokenType {
        public static int Hebrew = 1;
        public static int NonHebrew = 2;
        public static int Numeric = 4;
        public static int Mixed = 8;
        public static int Construct = 16;
        public static int Acronym = 32;
        public static int Exact = 64;
        public static int Custom = 128;
    }
}

