/*
 * Decompiled with CFR 0.152.
 */
package com.johnsnowlabs.nlp.annotators.tokenizer.wordpiece;

import com.johnsnowlabs.nlp.annotators.common.IndexedToken;
import com.johnsnowlabs.nlp.annotators.common.Sentence;
import com.johnsnowlabs.nlp.annotators.tokenizer.wordpiece.BasicTokenizer$;
import java.text.Normalizer;
import scala.Function1;
import scala.Predef$;
import scala.Serializable;
import scala.collection.Seq;
import scala.collection.immutable.Nil$;
import scala.collection.immutable.StringOps;
import scala.collection.immutable.StringOps$;
import scala.collection.mutable.ArrayBuffer;
import scala.collection.mutable.ArrayBuffer$;
import scala.reflect.ClassTag$;
import scala.reflect.ScalaSignature;
import scala.runtime.BoxesRunTime;

@ScalaSignature(bytes="\u0006\u0001]4Q!\u0005\n\u00015yA\u0001\"\n\u0001\u0003\u0002\u0003\u0006Ia\n\u0005\tU\u0001\u0011\t\u0011)A\u0005O!)1\u0006\u0001C\u0001Y!)\u0011\u0007\u0001C\u0001e!)\u0001\b\u0001C\u0001s!)1\b\u0001C\u0001y!)a\b\u0001C\u0001\u007f!)\u0011\t\u0001C\u0001\u0005\")\u0001\u000b\u0001C\u0001#\")1\u000b\u0001C\u0001)\")a\u000b\u0001C\u0001/\u001eAaMEA\u0001\u0012\u0003QrM\u0002\u0005\u0012%\u0005\u0005\t\u0012\u0001\u000ei\u0011\u0015YS\u0002\"\u0001j\u0011\u001dQW\"%A\u0005\u0002-DqA^\u0007\u0012\u0002\u0013\u00051N\u0001\bCCNL7\rV8lK:L'0\u001a:\u000b\u0005M!\u0012!C<pe\u0012\u0004\u0018.Z2f\u0015\t)b#A\u0005u_.,g.\u001b>fe*\u0011q\u0003G\u0001\u000bC:tw\u000e^1u_J\u001c(BA\r\u001b\u0003\rqG\u000e\u001d\u0006\u00037q\tAB[8i]Ntwn\u001e7bENT\u0011!H\u0001\u0004G>l7C\u0001\u0001 !\t\u00013%D\u0001\"\u0015\u0005\u0011\u0013!B:dC2\f\u0017B\u0001\u0013\"\u0005\u0019\te.\u001f*fM\u0006i1-Y:f'\u0016t7/\u001b;jm\u0016\u001c\u0001\u0001\u0005\u0002!Q%\u0011\u0011&\t\u0002\b\u0005>|G.Z1o\u0003-A\u0017m\u001d\"fO&tWI\u001c3\u0002\rqJg.\u001b;?)\ris\u0006\r\t\u0003]\u0001i\u0011A\u0005\u0005\bK\r\u0001\n\u00111\u0001(\u0011\u001dQ3\u0001%AA\u0002\u001d\nA\"[:XQ&$Xm\u001d9bG\u0016$\"aJ\u001a\t\u000bQ\"\u0001\u0019A\u001b\u0002\t\rD\u0017M\u001d\t\u0003AYJ!aN\u0011\u0003\t\rC\u0017M]\u0001\nSN\u001cuN\u001c;s_2$\"a\n\u001e\t\u000bQ*\u0001\u0019A\u001b\u0002\u0015%\u001cHk\u001c$jYR,'\u000f\u0006\u0002({!)AG\u0002a\u0001k\u0005i\u0011n\u001d)v]\u000e$X/\u0019;j_:$\"a\n!\t\u000bQ:\u0001\u0019A\u001b\u0002\u0019M$(/\u001b9BG\u000e,g\u000e^:\u0015\u0005\rs\u0005C\u0001#L\u001d\t)\u0015\n\u0005\u0002GC5\tqI\u0003\u0002IM\u00051AH]8pizJ!AS\u0011\u0002\rA\u0013X\rZ3g\u0013\taUJ\u0001\u0004TiJLgn\u001a\u0006\u0003\u0015\u0006BQa\u0014\u0005A\u0002\r\u000bA\u0001^3yi\u0006I\u0011n]\"iS:,7/\u001a\u000b\u0003OICQ\u0001N\u0005A\u0002U\n\u0011B\\8s[\u0006d\u0017N_3\u0015\u0005\r+\u0006\"B(\u000b\u0001\u0004\u0019\u0015\u0001\u0003;pW\u0016t\u0017N_3\u0015\u0005a\u000b\u0007c\u0001\u0011Z7&\u0011!,\t\u0002\u0006\u0003J\u0014\u0018-\u001f\t\u00039~k\u0011!\u0018\u0006\u0003=Z\taaY8n[>t\u0017B\u00011^\u00051Ie\u000eZ3yK\u0012$vn[3o\u0011\u0015\u00117\u00021\u0001d\u0003!\u0019XM\u001c;f]\u000e,\u0007C\u0001/e\u0013\t)WL\u0001\u0005TK:$XM\\2f\u00039\u0011\u0015m]5d)>\\WM\\5{KJ\u0004\"AL\u0007\u0014\u00055yB#A4\u00027\u0011bWm]:j]&$He\u001a:fCR,'\u000f\n3fM\u0006,H\u000e\u001e\u00132+\u0005a'FA\u0014nW\u0005q\u0007CA8u\u001b\u0005\u0001(BA9s\u0003%)hn\u00195fG.,GM\u0003\u0002tC\u0005Q\u0011M\u001c8pi\u0006$\u0018n\u001c8\n\u0005U\u0004(!E;oG\",7m[3e-\u0006\u0014\u0018.\u00198dK\u0006YB\u0005\\3tg&t\u0017\u000e\u001e\u0013he\u0016\fG/\u001a:%I\u00164\u0017-\u001e7uII\u0002")
public class BasicTokenizer {
    private final boolean caseSensitive;
    private final boolean hasBeginEnd;

    public static boolean $lessinit$greater$default$2() {
        return BasicTokenizer$.MODULE$.$lessinit$greater$default$2();
    }

    public static boolean $lessinit$greater$default$1() {
        return BasicTokenizer$.MODULE$.$lessinit$greater$default$1();
    }

    public boolean isWhitespace(char c) {
        return c == ' ' || c == '\t' || c == '\n' || c == '\r' || Character.isWhitespace(c);
    }

    public boolean isControl(char c) {
        if (c == '\t' || c == '\n' || c == '\r') {
            return false;
        }
        return Character.isISOControl(c);
    }

    public boolean isToFilter(char c) {
        char cp = c;
        return cp == '\u0000' || cp == '\ufffd' || this.isControl(c);
    }

    public boolean isPunctuation(char c) {
        boolean bl;
        char cp = c;
        if (cp >= '!' && cp <= '/' || cp >= ':' && cp <= '@' || cp >= '[' && cp <= '`' || cp >= '{' && cp <= '~') {
            return true;
        }
        try {
            String string;
            String charCategory;
            String string2 = charCategory = Character.getName(c);
            String string3 = string2 != null ? (string = string2) : "";
            String charCategoryString = string3;
            bl = charCategoryString.contains("PUNCTUATION");
        }
        catch (Exception exception) {
            bl = false;
        }
        return bl;
    }

    public String stripAccents(String text) {
        return Normalizer.normalize(text, Normalizer.Form.NFD).replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
    }

    public boolean isChinese(char c) {
        char c2 = c;
        return c2 >= '\u4e00' && c2 <= '\u9fff' || c2 >= '\u3400' && c2 <= '\u4dbf' || c2 >= '\u20000' && c2 <= '\u2a6df' || c2 >= '\u2a700' && c2 <= '\u2b73f' || c2 >= '\u2b740' && c2 <= '\u2b81f' || c2 >= '\u2b820' && c2 <= '\u2ceaf' || c2 >= '\uf900' && c2 <= '\ufaff' || c2 >= '\u2f800' && c2 <= '\u2fa1f';
    }

    public String normalize(String text) {
        String result = new StringOps(Predef$.MODULE$.augmentString((String)new StringOps(Predef$.MODULE$.augmentString(this.stripAccents(text.trim()))).filter((Function1 & java.io.Serializable & Serializable)c -> BoxesRunTime.boxToBoolean((boolean)BasicTokenizer.$anonfun$normalize$1(this, BoxesRunTime.unboxToChar((Object)c)))))).mkString("");
        return this.caseSensitive ? result : result.toLowerCase();
    }

    public IndexedToken[] tokenize(Sentence sentence) {
        ArrayBuffer tokens = (ArrayBuffer)ArrayBuffer$.MODULE$.apply((Seq)Nil$.MODULE$);
        String s = sentence.content();
        int i = 0;
        while (i < s.length()) {
            int end;
            while (i < s.length() && this.isWhitespace(StringOps$.MODULE$.apply$extension(Predef$.MODULE$.augmentString(s), i)) && !this.isPunctuation(StringOps$.MODULE$.apply$extension(Predef$.MODULE$.augmentString(s), i))) {
                ++i;
            }
            for (end = i; !(end >= s.length() || this.isToFilter(StringOps$.MODULE$.apply$extension(Predef$.MODULE$.augmentString(s), end)) || this.isPunctuation(StringOps$.MODULE$.apply$extension(Predef$.MODULE$.augmentString(s), end)) || this.isChinese(StringOps$.MODULE$.apply$extension(Predef$.MODULE$.augmentString(s), end)) || this.isWhitespace(StringOps$.MODULE$.apply$extension(Predef$.MODULE$.augmentString(s), end))); ++end) {
            }
            if (end > i) {
                this.append$1(i, end, s, sentence, tokens);
            }
            if (end < s.length() && (this.isPunctuation(StringOps$.MODULE$.apply$extension(Predef$.MODULE$.augmentString(s), end)) || this.isChinese(StringOps$.MODULE$.apply$extension(Predef$.MODULE$.augmentString(s), end)))) {
                this.append$1(end, end + 1, s, sentence, tokens);
            }
            i = end + 1;
        }
        return (IndexedToken[])tokens.toArray(ClassTag$.MODULE$.apply(IndexedToken.class));
    }

    public static final /* synthetic */ boolean $anonfun$normalize$1(BasicTokenizer $this, char c) {
        return !$this.isToFilter(c);
    }

    private final void append$1(int start, int end, String s$1, Sentence sentence$1, ArrayBuffer tokens$1) {
        block0: {
            Predef$.MODULE$.assert(end > start);
            String text = s$1.substring(start, end);
            String normalized = this.normalize(text);
            if (!new StringOps(Predef$.MODULE$.augmentString(normalized)).nonEmpty()) break block0;
            IndexedToken token = this.hasBeginEnd ? new IndexedToken(normalized, sentence$1.start(), end - 1 + sentence$1.start()) : new IndexedToken(normalized, start + sentence$1.start(), end - 1 + sentence$1.start());
            tokens$1.append((Seq)Predef$.MODULE$.wrapRefArray((Object[])new IndexedToken[]{token}));
        }
    }

    public BasicTokenizer(boolean caseSensitive, boolean hasBeginEnd) {
        this.caseSensitive = caseSensitive;
        this.hasBeginEnd = hasBeginEnd;
    }
}

