/*
 * Decompiled with CFR 0.152.
 */
package com.johnsnowlabs.nlp.annotators;

import com.johnsnowlabs.nlp.Annotation;
import com.johnsnowlabs.nlp.AnnotatorModel;
import com.johnsnowlabs.nlp.AnnotatorType$;
import com.johnsnowlabs.nlp.HasSimpleAnnotate;
import com.johnsnowlabs.nlp.annotators.DocumentTokenSplitter$;
import com.johnsnowlabs.nlp.annotators.TextSplitter;
import com.johnsnowlabs.nlp.functions$;
import java.io.Serializable;
import org.apache.spark.ml.param.BooleanParam;
import org.apache.spark.ml.param.IntParam;
import org.apache.spark.ml.param.Param;
import org.apache.spark.ml.param.ParamPair;
import org.apache.spark.ml.util.Identifiable;
import org.apache.spark.ml.util.Identifiable$;
import org.apache.spark.ml.util.MLReader;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.expressions.UserDefinedFunction;
import scala.Function0;
import scala.Function1;
import scala.MatchError;
import scala.None$;
import scala.Option;
import scala.Predef;
import scala.Predef$;
import scala.Some;
import scala.Tuple2;
import scala.collection.GenTraversableOnce;
import scala.collection.Map;
import scala.collection.Seq;
import scala.collection.Seq$;
import scala.collection.SeqLike;
import scala.collection.TraversableLike;
import scala.collection.immutable.StringOps;
import scala.collection.mutable.ArrayOps;
import scala.math.Ordering;
import scala.reflect.ScalaSignature;
import scala.reflect.api.TypeTags;
import scala.reflect.runtime.package$;
import scala.runtime.BoxesRunTime;
import scala.util.matching.Regex;
import scala.util.matching.Regex$;

@ScalaSignature(bytes="\u0006\u0001\u0005Uf\u0001\u0002\u0012$\u00011B\u0001B\u000e\u0001\u0003\u0006\u0004%\te\u000e\u0005\t\u000b\u0002\u0011\t\u0011)A\u0005q!)a\t\u0001C\u0001\u000f\")a\t\u0001C\u0001\u0013\"9!\n\u0001b\u0001\n\u0003Z\u0005B\u0002,\u0001A\u0003%A\nC\u0004X\u0001\t\u0007I\u0011\t-\t\re\u0003\u0001\u0015!\u0003Q\u0011\u001dQ\u0006A1A\u0005\u0002mCaA\u001b\u0001!\u0002\u0013a\u0006\"B6\u0001\t\u0003a\u0007\"\u0002:\u0001\t\u0003\u0019\bb\u0002;\u0001\u0005\u0004%\ta\u0017\u0005\u0007k\u0002\u0001\u000b\u0011\u0002/\t\u000bY\u0004A\u0011A<\t\u000be\u0004A\u0011A:\t\u000fi\u0004!\u0019!C\u0001w\"1q\u0010\u0001Q\u0001\nqDq!!\u0001\u0001\t\u0003\t\u0019\u0001C\u0004\u0002\u000e\u0001!\t!a\u0004\t\u0011\u0005E\u0001A1A\u0005\u0002mDq!a\u0005\u0001A\u0003%A\u0010C\u0004\u0002\u0016\u0001!\t!a\u0006\t\u000f\u0005m\u0001\u0001\"\u0001\u0002\u0010!I\u0011Q\u0004\u0001C\u0002\u0013%\u0011q\u0004\u0005\t\u0003c\u0001\u0001\u0015!\u0003\u0002\"!9\u00111\u0007\u0001\u0005\u0002\u0005U\u0002bBA\u001e\u0001\u0011\u0005\u0013Q\b\u0005\b\u00037\u0002A\u0011KA/\u000f\u001d\t\u0019i\tE\u0001\u0003\u000b3aAI\u0012\t\u0002\u0005\u001d\u0005B\u0002$ 
\t\u0003\ty\nC\u0005\u0002\"~\t\t\u0011\"\u0003\u0002$\n)Bi\\2v[\u0016tG\u000fV8lK:\u001c\u0006\u000f\\5ui\u0016\u0014(B\u0001\u0013&\u0003)\tgN\\8uCR|'o\u001d\u0006\u0003M\u001d\n1A\u001c7q\u0015\tA\u0013&\u0001\u0007k_\"t7O\\8xY\u0006\u00147OC\u0001+\u0003\r\u0019w.\\\u0002\u0001'\r\u0001Qf\r\t\u0004]=\nT\"A\u0013\n\u0005A*#AD!o]>$\u0018\r^8s\u001b>$W\r\u001c\t\u0003e\u0001i\u0011a\t\t\u0004]Q\n\u0014BA\u001b&\u0005EA\u0015m]*j[BdW-\u00118o_R\fG/Z\u0001\u0004k&$W#\u0001\u001d\u0011\u0005e\u0012eB\u0001\u001eA!\tYd(D\u0001=\u0015\ti4&\u0001\u0004=e>|GO\u0010\u0006\u0002\u007f\u0005)1oY1mC&\u0011\u0011IP\u0001\u0007!J,G-\u001a4\n\u0005\r#%AB*ue&twM\u0003\u0002B}\u0005!Q/\u001b3!\u0003\u0019a\u0014N\\5u}Q\u0011\u0011\u0007\u0013\u0005\u0006m\r\u0001\r\u0001\u000f\u000b\u0002c\u0005\u0019\u0012N\u001c9vi\u0006sgn\u001c;bi>\u0014H+\u001f9fgV\tA\nE\u0002N\u001dBk\u0011AP\u0005\u0003\u001fz\u0012Q!\u0011:sCf\u0004\"!\u0015*\u000e\u0003\u0001I!a\u0015+\u0003\u001b\u0005sgn\u001c;bi>\u0014H+\u001f9f\u0013\t)VE\u0001\fICN|U\u000f\u001e9vi\u0006sgn\u001c;bi>\u0014H+\u001f9f\u0003QIg\u000e];u\u0003:tw\u000e^1u_J$\u0016\u0010]3tA\u0005\u0019r.\u001e;qkR\feN\\8uCR|'\u000fV=qKV\t\u0001+\u0001\u000bpkR\u0004X\u000f^!o]>$\u0018\r^8s)f\u0004X\rI\u0001\n]VlGk\\6f]N,\u0012\u0001\u0018\t\u0003;\"l\u0011A\u0018\u0006\u0003?\u0002\fQ\u0001]1sC6T!!\u00192\u0002\u00055d'BA2e\u0003\u0015\u0019\b/\u0019:l\u0015\t)g-\u0001\u0004ba\u0006\u001c\u0007.\u001a\u0006\u0002O\u0006\u0019qN]4\n\u0005%t&\u0001C%oiB\u000b'/Y7\u0002\u00159,X\u000eV8lK:\u001c\b%\u0001\u0007tKRtU/\u001c+pW\u0016t7\u000f\u0006\u0002R[\")an\u0003a\u0001_\u0006)a/\u00197vKB\u0011Q\n]\u0005\u0003cz\u00121!\u00138u\u000319W\r\u001e(v[R{7.\u001a8t+\u0005y\u0017\u0001\u0004;pW\u0016twJ^3sY\u0006\u0004\u0018!\u0004;pW\u0016twJ^3sY\u0006\u0004\b%A\btKR$vn[3o\u001fZ,'\u000f\\1q)\t\t\u0006\u0010C\u0003o\u001f\u0001\u0007q.A\bhKR$vn[3o\u001fZ,'\u000f\\1q\u00035)\u0007\u0010\u001d7pI\u0016\u001c\u0006\u000f\\5ugV\tA\u0010\u0005\u0002^
{&\u0011aP\u0018\u0002\r\u0005>|G.Z1o!\u0006\u0014\u0018-\\\u0001\u000fKb\u0004Hn\u001c3f'Bd\u0017\u000e^:!\u0003A\u0019X\r^#ya2|G-Z*qY&$8\u000fF\u0002R\u0003\u000bAaA\\\nA\u0002\u0005\u001d\u0001cA'\u0002\n%\u0019\u00111\u0002 \u0003\u000f\t{w\u000e\\3b]\u0006\u0001r-\u001a;FqBdw\u000eZ3Ta2LGo]\u000b\u0003\u0003\u000f\ta\u0002\u001e:j[^C\u0017\u000e^3ta\u0006\u001cW-A\bue&lw\u000b[5uKN\u0004\u0018mY3!\u0003E\u0019X\r\u001e+sS6<\u0006.\u001b;fgB\f7-\u001a\u000b\u0004#\u0006e\u0001B\u00028\u0018\u0001\u0004\t9!A\thKR$&/[7XQ&$Xm\u001d9bG\u0016\f\u0011\u0003^8lK:\u001c\u0006\u000f\\5u!\u0006$H/\u001a:o+\t\t\t\u0003\u0005\u0003\u0002$\u00055RBAA\u0013\u0015\u0011\t9#!\u000b\u0002\u00115\fGo\u00195j]\u001eT1!a\u000b?\u0003\u0011)H/\u001b7\n\t\u0005=\u0012Q\u0005\u0002\u0006%\u0016<W\r_\u0001\u0013i>\\WM\\*qY&$\b+\u0019;uKJt\u0007%\u0001\tmK:<G\u000f\u001b$s_6$vn[3ogR\u0019q.a\u000e\t\r\u0005e2\u00041\u00019\u0003\u0011!X\r\u001f;\u0002\u0011\u0005tgn\u001c;bi\u0016$B!a\u0010\u0002XA1\u0011\u0011IA&\u0003#rA!a\u0011\u0002H9\u00191(!\u0012\n\u0003}J1!!\u0013?\u0003\u001d\u0001\u0018mY6bO\u0016LA!!\u0014\u0002P\t\u00191+Z9\u000b\u0007\u0005%c\bE\u0002/\u0003'J1!!\u0016&\u0005)\teN\\8uCRLwN\u001c\u0005\b\u00033b\u0002\u0019AA \u0003-\tgN\\8uCRLwN\\:\u0002\u001b\u00054G/\u001a:B]:|G/\u0019;f)\u0011\ty&a \u0011\t\u0005\u0005\u0014\u0011\u0010\b\u0005\u0003G\n)H\u0004\u0003\u0002f\u0005Ed\u0002BA4\u0003_rA!!\u001b\u0002n9\u00191(a\u001b\n\u0003\u001dL!!\u001a4\n\u0005\r$\u0017bAA:E\u0006\u00191/\u001d7\n\t\u0005%\u0013q\u000f\u0006\u0004\u0003g\u0012\u0017\u0002BA>\u0003{\u0012\u0011\u0002R1uC\u001a\u0013\u0018-\\3\u000b\t\u0005%\u0013q\u000f\u0005\b\u0003\u0003k\u0002\u0019AA0\u0003\u001d!\u0017\r^1tKR\fQ\u0003R8dk6,g\u000e\u001e+pW\u0016t7\u000b\u001d7jiR,'\u000f\u0005\u00023?M9q$!#\u0002\u0010\u0006e\u0005cA'\u0002\f&\u0019\u0011Q\u0012 
\u0003\r\u0005s\u0017PU3g!\u0015\t\t*!&2\u001b\t\t\u0019JC\u0002\u0002,\u0001LA!a&\u0002\u0014\n)B)\u001a4bk2$\b+\u0019:b[N\u0014V-\u00193bE2,\u0007cA'\u0002\u001c&\u0019\u0011Q\u0014 \u0003\u0019M+'/[1mSj\f'\r\\3\u0015\u0005\u0005\u0015\u0015a\u0003:fC\u0012\u0014Vm]8mm\u0016$\"!!*\u0011\t\u0005\u001d\u0016\u0011W\u0007\u0003\u0003SSA!a+\u0002.\u0006!A.\u00198h\u0015\t\ty+\u0001\u0003kCZ\f\u0017\u0002BAZ\u0003S\u0013aa\u00142kK\u000e$\b")
/**
 * Annotator that consumes DOCUMENT annotations and emits smaller DOCUMENT annotations
 * ("chunks"), where chunk length is measured in whitespace-separated tokens
 * (see {@link #lengthFromTokens(String)}).
 *
 * <p>The actual splitting is delegated to {@link TextSplitter}; this class supplies the
 * configured parameters and the token-counting length function, then wraps every produced
 * chunk in a new {@link Annotation}.
 *
 * <p>Parameters and the defaults registered in the constructor:
 * <ul>
 *   <li>{@code numTokens} - "Limit of the number of tokens in a text" (no default is set here)</li>
 *   <li>{@code tokenOverlap} - "Length of the overlap between text chunks" (default 0)</li>
 *   <li>{@code explodeSplits} - "Whether to explode split chunks to separate rows" (default false)</li>
 *   <li>{@code trimWhitespace} - "Whether to trim whitespaces of extracted chunks" (default true)</li>
 * </ul>
 *
 * <p>This source is decompiler output of a Scala class, hence the {@code MODULE$} accessors,
 * boxing helpers and raw-type casts.
 */
public class DocumentTokenSplitter
extends AnnotatorModel<DocumentTokenSplitter>
implements HasSimpleAnnotate<DocumentTokenSplitter> {
    /** Unique identifier of this annotator instance. */
    private final String uid;
    /** Annotation types accepted as input; initialised to a single DOCUMENT entry. */
    private final String[] inputAnnotatorTypes;
    /** Annotation type produced by this annotator; initialised to DOCUMENT. */
    private final String outputAnnotatorType;
    /** Param "numTokens": limit of the number of tokens in a text. */
    private final IntParam numTokens;
    /** Param "tokenOverlap": length of the overlap between text chunks. */
    private final IntParam tokenOverlap;
    /** Param "explodeSplits": whether to explode split chunks to separate rows. */
    private final BooleanParam explodeSplits;
    /** Param "trimWhitespace": whether to trim whitespaces of extracted chunks. */
    private final BooleanParam trimWhitespace;
    /** Regex {@code \s+} used by {@link #lengthFromTokens(String)} to separate tokens. */
    private final Regex tokenSplitPattern;

    /**
     * Returns the reader provided by the companion object.
     *
     * @return reader obtained from {@code DocumentTokenSplitter$.MODULE$.read()}
     */
    public static MLReader<DocumentTokenSplitter> read() {
        return DocumentTokenSplitter$.MODULE$.read();
    }

    /**
     * Loads an instance through the companion object.
     *
     * @param string argument forwarded to the companion's {@code load}
     *               (presumably a storage path - confirm against the companion)
     * @return whatever the companion's {@code load} returns
     */
    public static Object load(String string) {
        return DocumentTokenSplitter$.MODULE$.load(string);
    }

    /** Delegates to the default {@code dfAnnotate} implementation of {@link HasSimpleAnnotate}. */
    @Override
    public UserDefinedFunction dfAnnotate() {
        return HasSimpleAnnotate.dfAnnotate$(this);
    }

    /** @return the unique identifier given at construction time */
    public String uid() {
        return this.uid;
    }

    /** @return the accepted input annotation types (a single DOCUMENT entry) */
    @Override
    public String[] inputAnnotatorTypes() {
        return this.inputAnnotatorTypes;
    }

    /** @return the produced annotation type (DOCUMENT) */
    @Override
    public String outputAnnotatorType() {
        return this.outputAnnotatorType;
    }

    /** @return the {@code numTokens} param definition */
    public IntParam numTokens() {
        return this.numTokens;
    }

    /**
     * Sets the {@code numTokens} param.
     *
     * @param value token limit; must be strictly positive
     * @return the result of {@code set}, cast to this type
     * @throws IllegalArgumentException via {@code require} when {@code value <= 0}
     */
    public DocumentTokenSplitter setNumTokens(int value) {
        Predef$.MODULE$.require(value > 0, (Function0 & Serializable & scala.Serializable)() -> "Number of tokens should be larger than 0.");
        return (DocumentTokenSplitter)this.set((Param)this.numTokens(), BoxesRunTime.boxToInteger(value));
    }

    /** @return the current value of the {@code numTokens} param */
    public int getNumTokens() {
        return BoxesRunTime.unboxToInt((Object)this.$((Param)this.numTokens()));
    }

    /** @return the {@code tokenOverlap} param definition */
    public IntParam tokenOverlap() {
        return this.tokenOverlap;
    }

    /**
     * Sets the {@code tokenOverlap} param.
     *
     * <p>The bound is checked against the current {@code numTokens} value at call time.
     * NOTE(review): {@code numTokens} has no default registered in this class, so it presumably
     * has to be set before this setter is called - confirm. Negative values are not rejected here.
     *
     * @param value overlap; must not exceed {@link #getNumTokens()}
     * @return the result of {@code set}, cast to this type
     * @throws IllegalArgumentException via {@code require} when {@code value > getNumTokens()}
     */
    public DocumentTokenSplitter setTokenOverlap(int value) {
        Predef$.MODULE$.require(value <= this.getNumTokens(), (Function0 & Serializable & scala.Serializable)() -> "Token overlap can't be larger than number of tokens.");
        return (DocumentTokenSplitter)this.set((Param)this.tokenOverlap(), BoxesRunTime.boxToInteger(value));
    }

    /** @return the current value of the {@code tokenOverlap} param */
    public int getTokenOverlap() {
        return BoxesRunTime.unboxToInt((Object)this.$((Param)this.tokenOverlap()));
    }

    /** @return the {@code explodeSplits} param definition */
    public BooleanParam explodeSplits() {
        return this.explodeSplits;
    }

    /**
     * Sets the {@code explodeSplits} param.
     *
     * @param value new flag value
     * @return the result of {@code set}, cast to this type
     */
    public DocumentTokenSplitter setExplodeSplits(boolean value) {
        return (DocumentTokenSplitter)this.set((Param)this.explodeSplits(), BoxesRunTime.boxToBoolean(value));
    }

    /** @return the current value of the {@code explodeSplits} param */
    public boolean getExplodeSplits() {
        return BoxesRunTime.unboxToBoolean((Object)this.$((Param)this.explodeSplits()));
    }

    /** @return the {@code trimWhitespace} param definition */
    public BooleanParam trimWhitespace() {
        return this.trimWhitespace;
    }

    /**
     * Sets the {@code trimWhitespace} param.
     *
     * @param value new flag value
     * @return the result of {@code set}, cast to this type
     */
    public DocumentTokenSplitter setTrimWhitespace(boolean value) {
        return (DocumentTokenSplitter)this.set((Param)this.trimWhitespace(), BoxesRunTime.boxToBoolean(value));
    }

    /** @return the current value of the {@code trimWhitespace} param */
    public boolean getTrimWhitespace() {
        return BoxesRunTime.unboxToBoolean((Object)this.$((Param)this.trimWhitespace()));
    }

    /** @return the whitespace regex ({@code \s+}) used for token counting */
    private Regex tokenSplitPattern() {
        return this.tokenSplitPattern;
    }

    /**
     * Counts the tokens of a text: the text is split on runs of whitespace and the non-empty
     * pieces are counted.
     *
     * @param text text to measure
     * @return number of non-empty pieces produced by splitting {@code text} on {@code \s+}
     */
    public int lengthFromTokens(String text) {
        return new ArrayOps.ofRef(Predef$.MODULE$.refArrayOps((Object[])this.tokenSplitPattern().split((CharSequence)text))).count((Function1 & Serializable & scala.Serializable)x$1 -> BoxesRunTime.boxToBoolean((boolean)DocumentTokenSplitter.$anonfun$lengthFromTokens$1(x$1)));
    }

    /**
     * Splits the text of every input annotation into chunks and wraps each chunk in a new
     * DOCUMENT annotation.
     *
     * <p>For each produced chunk:
     * <ul>
     *   <li>{@code begin} is the index of the first occurrence of the chunk text inside the
     *       source annotation's text, or -1 when the chunk text does not occur verbatim;</li>
     *   <li>{@code end} is {@code begin + chunk.length()}, or -1 when {@code begin} is -1;</li>
     *   <li>the metadata is the source metadata plus {@code "document"} (index of the chunk
     *       within its source annotation) and {@code "numTokens"} (token count of the chunk);</li>
     *   <li>the embeddings of the source annotation are passed through.</li>
     * </ul>
     *
     * <p>NOTE(review): because the first occurrence is used, a chunk whose text also appears
     * earlier in the same document is reported with the offsets of that earlier occurrence.
     *
     * @param annotations input annotations whose {@code result()} text is to be split
     * @return chunk annotations, sorted by the index of the input annotation they came from
     */
    @Override
    public Seq<Annotation> annotate(Seq<Annotation> annotations) {
        // The length function handed to the splitter measures text in tokens, not characters.
        TextSplitter textSplitter = new TextSplitter(this.getNumTokens(), this.getTokenOverlap(), true, true, this.getTrimWhitespace(), (Function1<String, Object>)(Function1 & Serializable & scala.Serializable)text -> BoxesRunTime.boxToInteger((int)this.lengthFromTokens(text)));
        String[] documentSplitPatterns = new String[]{"\\s+"};
        return (Seq)((TraversableLike)((SeqLike)((TraversableLike)annotations.zipWithIndex(Seq$.MODULE$.canBuildFrom())).flatMap((Function1 & Serializable & scala.Serializable)x0$1 -> {
            Tuple2 indexedAnnotation = x0$1;
            if (indexedAnnotation == null) {
                throw new MatchError((Object)indexedAnnotation);
            }
            Annotation annotation = (Annotation)indexedAnnotation._1();
            int i = indexedAnnotation._2$mcI$sp();
            String text = annotation.result();
            Seq<String> textChunks = textSplitter.splitText(text, (Seq<String>)Predef$.MODULE$.wrapRefArray((Object[])documentSplitPatterns));
            Seq seq = (Seq)((TraversableLike)textChunks.zipWithIndex(Seq$.MODULE$.canBuildFrom())).map((Function1 & Serializable & scala.Serializable)x0$2 -> {
                Tuple2 indexedChunk = x0$2;
                if (indexedChunk == null) throw new MatchError((Object)indexedChunk);
                String textChunk = (String)indexedChunk._1();
                int index = indexedChunk._2$mcI$sp();
                // Literal first-occurrence lookup; yields -1 when the chunk is not found verbatim.
                // This replaces compiling a quoted regex per chunk, which produced the same index.
                int textChunkBegin = text.indexOf(textChunk);
                int textChunkEnd = textChunkBegin >= 0 ? textChunkBegin + textChunk.length() : -1;
                // Extra metadata entries: chunk index within its source annotation and token count.
                Tuple2[] extraEntries = new Tuple2[]{
                    Predef.ArrowAssoc$.MODULE$.$minus$greater$extension(Predef$.MODULE$.ArrowAssoc((Object)"document"), (Object)Integer.toString(index)),
                    Predef.ArrowAssoc$.MODULE$.$minus$greater$extension(Predef$.MODULE$.ArrowAssoc((Object)"numTokens"), (Object)Integer.toString(this.lengthFromTokens(textChunk)))
                };
                Map<String, String> metadata = (Map<String, String>)annotation.metadata().$plus$plus((GenTraversableOnce)Predef$.MODULE$.Map().apply((Seq)Predef$.MODULE$.wrapRefArray((Object[])extraEntries)));
                Annotation chunkAnnotation = new Annotation(AnnotatorType$.MODULE$.DOCUMENT(), textChunkBegin, textChunkEnd, textChunk, metadata, annotation.embeddings());
                // Pair each chunk with the index of its source annotation so the result can be sorted by it.
                return new Tuple2((Object)BoxesRunTime.boxToInteger(i), (Object)chunkAnnotation);
            }, Seq$.MODULE$.canBuildFrom());
            return seq;
        }, Seq$.MODULE$.canBuildFrom())).sortBy((Function1 & Serializable & scala.Serializable)x$2 -> BoxesRunTime.boxToInteger((int)x$2._1$mcI$sp()), (Ordering)Ordering.Int$.MODULE$)).map((Function1 & Serializable & scala.Serializable)x$3 -> (Annotation)x$3._2(), Seq$.MODULE$.canBuildFrom());
    }

    /**
     * Post-processes the annotated dataset: when {@code explodeSplits} is enabled, the output
     * column is passed to {@code explodeAnnotationsCol} (same column as source and target);
     * otherwise the dataset is returned unchanged.
     *
     * @param dataset dataset produced by the annotation step
     * @return the exploded dataset, or {@code dataset} itself when {@code explodeSplits} is false
     */
    @Override
    public Dataset<Row> afterAnnotate(Dataset<Row> dataset) {
        return this.getExplodeSplits() ? functions$.MODULE$.ExplodeAnnotations(dataset).explodeAnnotationsCol(this.getOutputCol(), this.getOutputCol(), ((TypeTags)package$.MODULE$.universe()).TypeTag().Nothing()) : dataset;
    }

    /**
     * Compiler-generated predicate used by {@link #lengthFromTokens(String)}.
     *
     * @param x$1 a piece produced by the whitespace split
     * @return {@code true} when the piece is non-empty
     */
    public static final /* synthetic */ boolean $anonfun$lengthFromTokens$1(String x$1) {
        return new StringOps(Predef$.MODULE$.augmentString(x$1)).nonEmpty();
    }

    /**
     * Creates the annotator with an explicit identifier, defines its params and registers the
     * defaults {@code tokenOverlap = 0}, {@code explodeSplits = false}, {@code trimWhitespace = true}.
     *
     * @param uid unique identifier for this instance
     */
    public DocumentTokenSplitter(String uid) {
        this.uid = uid;
        HasSimpleAnnotate.$init$(this);
        this.inputAnnotatorTypes = new String[]{AnnotatorType$.MODULE$.DOCUMENT()};
        this.outputAnnotatorType = AnnotatorType$.MODULE$.DOCUMENT();
        this.numTokens = new IntParam((Identifiable)this, "numTokens", "Limit of the number of tokens in a text");
        this.tokenOverlap = new IntParam((Identifiable)this, "tokenOverlap", "Length of the overlap between text chunks");
        this.explodeSplits = new BooleanParam((Identifiable)this, "explodeSplits", "Whether to explode split chunks to separate rows");
        this.trimWhitespace = new BooleanParam((Identifiable)this, "trimWhitespace", "Whether to trim whitespaces of extracted chunks");
        this.setDefault((Seq)Predef$.MODULE$.wrapRefArray((Object[])new ParamPair[]{this.tokenOverlap().$minus$greater((Object)BoxesRunTime.boxToInteger(0)), this.explodeSplits().$minus$greater((Object)BoxesRunTime.boxToBoolean(false)), this.trimWhitespace().$minus$greater((Object)BoxesRunTime.boxToBoolean(true))}));
        this.tokenSplitPattern = new StringOps(Predef$.MODULE$.augmentString("\\s+")).r();
    }

    /** Creates the annotator with a random identifier prefixed by {@code "DocumentTokenSplitter"}. */
    public DocumentTokenSplitter() {
        this(Identifiable$.MODULE$.randomUID("DocumentTokenSplitter"));
    }
}

