/*
 * Decompiled with CFR 0.152.
 */
package org.apache.spark.ml.odkl.texts;

import org.apache.spark.annotation.DeveloperApi;
import org.apache.spark.ml.Transformer;
import org.apache.spark.ml.linalg.BLAS$;
import org.apache.spark.ml.linalg.Vector;
import org.apache.spark.ml.linalg.Vectors$;
import org.apache.spark.ml.odkl.texts.HashBasedDeduplicator$;
import org.apache.spark.ml.odkl.texts.HashBasedDeduplicator$$anonfun$1$;
import org.apache.spark.ml.param.DoubleParam;
import org.apache.spark.ml.param.Param;
import org.apache.spark.ml.param.ParamMap;
import org.apache.spark.ml.param.ParamPair;
import org.apache.spark.ml.param.ParamValidators$;
import org.apache.spark.ml.util.Identifiable;
import org.apache.spark.ml.util.Identifiable$;
import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.Row$;
import org.apache.spark.sql.types.StructType;
import scala.Function1;
import scala.Predef$;
import scala.Serializable;
import scala.collection.Iterator;
import scala.collection.Seq;
import scala.collection.mutable.ArrayBuffer;
import scala.reflect.ClassTag$;
import scala.reflect.ScalaSignature;
import scala.runtime.BoxesRunTime;
import scala.runtime.LongRef;

@ScalaSignature(bytes="\u0006\u0001\u0005Mb\u0001B\u0001\u0003\u0001=\u0011Q\u0003S1tQ\n\u000b7/\u001a3EK\u0012,\b\u000f\\5dCR|'O\u0003\u0002\u0004\t\u0005)A/\u001a=ug*\u0011QAB\u0001\u0005_\u0012\\GN\u0003\u0002\b\u0011\u0005\u0011Q\u000e\u001c\u0006\u0003\u0013)\tQa\u001d9be.T!a\u0003\u0007\u0002\r\u0005\u0004\u0018m\u00195f\u0015\u0005i\u0011aA8sO\u000e\u00011c\u0001\u0001\u0011)A\u0011\u0011CE\u0007\u0002\r%\u00111C\u0002\u0002\f)J\fgn\u001d4pe6,'\u000f\u0005\u0002\u001615\taC\u0003\u0002\u0018\r\u0005)\u0001/\u0019:b[&\u0011\u0011D\u0006\u0002\u0007!\u0006\u0014\u0018-\\:\t\u0011m\u0001!Q1A\u0005Bq\t1!^5e+\u0005i\u0002C\u0001\u0010%\u001d\ty\"%D\u0001!\u0015\u0005\t\u0013!B:dC2\f\u0017BA\u0012!\u0003\u0019\u0001&/\u001a3fM&\u0011QE\n\u0002\u0007'R\u0014\u0018N\\4\u000b\u0005\r\u0002\u0003\u0002\u0003\u0015\u0001\u0005\u0003\u0005\u000b\u0011B\u000f\u0002\tULG\r\t\u0005\u0006U\u0001!\taK\u0001\u0007y%t\u0017\u000e\u001e \u0015\u00051r\u0003CA\u0017\u0001\u001b\u0005\u0011\u0001\"B\u000e*\u0001\u0004i\u0002b\u0002\u0019\u0001\u0005\u0004%\t!M\u0001\u0014g&l\u0017\u000e\\1sSRLH\u000b\u001b:fg\"|G\u000eZ\u000b\u0002eA\u0011QcM\u0005\u0003iY\u00111\u0002R8vE2,\u0007+\u0019:b[\"1a\u0007\u0001Q\u0001\nI\nAc]5nS2\f'/\u001b;z)\"\u0014Xm\u001d5pY\u0012\u0004\u0003b\u0002\u001d\u0001\u0005\u0004%\t!O\u0001\rS:\u0004X\u000f^\"pY\"\u000b7\u000f[\u000b\u0002uA\u0019QcO\u000f\n\u0005q2\"!\u0002)be\u0006l\u0007B\u0002 
\u0001A\u0003%!(A\u0007j]B,HoQ8m\u0011\u0006\u001c\b\u000e\t\u0005\b\u0001\u0002\u0011\r\u0011\"\u0001:\u00039Ig\u000e];u\u0007>dg+Z2u_JDaA\u0011\u0001!\u0002\u0013Q\u0014aD5oaV$8i\u001c7WK\u000e$xN\u001d\u0011\t\u000b\u0011\u0003A\u0011A#\u0002#M,G/\u00138qkR\u001cu\u000e\u001c,fGR|'\u000f\u0006\u0002G\u000f6\t\u0001\u0001C\u0003I\u0007\u0002\u0007Q$A\u0003wC2,X\rC\u0003K\u0001\u0011\u00051*A\btKRLe\u000e];u\u0007>d\u0007*Y:i)\t1E\nC\u0003I\u0013\u0002\u0007Q\u0004C\u0003O\u0001\u0011\u0005q*A\u000btKR\u001c\u0016.\\5mCJLG/\u001f+sKNDw\u000e\u001c3\u0015\u0005\u0019\u0003\u0006\"\u0002%N\u0001\u0004\t\u0006CA\u0010S\u0013\t\u0019\u0006E\u0001\u0004E_V\u0014G.\u001a\u0005\u0006U\u0001!\t!\u0016\u000b\u0002Y!)q\u000b\u0001C!1\u0006IAO]1og\u001a|'/\u001c\u000b\u000336\u0004\"A\u00176\u000f\u0005m;gB\u0001/f\u001d\tiFM\u0004\u0002_G:\u0011qLY\u0007\u0002A*\u0011\u0011MD\u0001\u0007yI|w\u000e\u001e \n\u00035I!a\u0003\u0007\n\u0005%Q\u0011B\u00014\t\u0003\r\u0019\u0018\u000f\\\u0005\u0003Q&\fq\u0001]1dW\u0006<WM\u0003\u0002g\u0011%\u00111\u000e\u001c\u0002\n\t\u0006$\u0018M\u0012:b[\u0016T!\u0001[5\t\u000b94\u0006\u0019A8\u0002\u000f\u0011\fG/Y:fiB\u0012\u0001O\u001e\t\u0004cJ$X\"A5\n\u0005ML'a\u0002#bi\u0006\u001cX\r\u001e\t\u0003kZd\u0001\u0001B\u0005x[\u0006\u0005\t\u0011!B\u0001q\n\u0019q\fJ\u0019\u0012\u0005ed\bCA\u0010{\u0013\tY\bEA\u0004O_RD\u0017N\\4\u0011\u0005}i\u0018B\u0001@!\u0005\r\te.\u001f\u0005\b\u0003\u0003\u0001A\u0011IA\u0002\u0003=!(/\u00198tM>\u0014XnU2iK6\fG\u0003BA\u0003\u0003#\u0001B!a\u0002\u0002\u000e5\u0011\u0011\u0011\u0002\u0006\u0004\u0003\u0017I\u0017!\u0002;za\u0016\u001c\u0018\u0002BA\b\u0003\u0013\u0011!b\u0015;sk\u000e$H+\u001f9f\u0011\u001d\t\u0019b 
a\u0001\u0003\u000b\taa]2iK6\f\u0007fA@\u0002\u0018A!\u0011\u0011DA\u0010\u001b\t\tYBC\u0002\u0002\u001e!\t!\"\u00198o_R\fG/[8o\u0013\u0011\t\t#a\u0007\u0003\u0019\u0011+g/\u001a7pa\u0016\u0014\u0018\t]5\t\u000f\u0005\u0015\u0002\u0001\"\u0011\u0002(\u0005!1m\u001c9z)\r\u0001\u0012\u0011\u0006\u0005\t\u0003W\t\u0019\u00031\u0001\u0002.\u0005)Q\r\u001f;sCB\u0019Q#a\f\n\u0007\u0005EbC\u0001\u0005QCJ\fW.T1q\u0001")
/**
 * Spark ML {@link Transformer} that removes near-duplicate rows inside groups of rows
 * sharing the same hash value.
 *
 * <p>The input is repartitioned by the hash column and sorted by it within each partition,
 * so rows with an equal hash are visited one after another. For every run of equal hashes
 * the first row is always kept; each following row is kept only if the cosine similarity
 * between its vector and every vector already kept for that hash is strictly below
 * {@code similarityThreshold}. Rejected rows are replaced by an empty {@link Row} and
 * filtered out afterwards. The output schema equals the input schema.
 *
 * <p>This file is decompiler output for a Scala class: the anonymous {@code Serializable}
 * classes below correspond to compiler-generated closures, and the {@code $outer} fields
 * are their references to the enclosing scope.
 */
public class HashBasedDeduplicator
extends Transformer {
    // Unique identifier of this transformer instance.
    private final String uid;
    // Cosine-similarity cut-off; registered under the param name "simTresh".
    private final DoubleParam similarityThreshold;
    // Name of the column holding the hash (read as a long in transform()).
    private final Param<String> inputColHash;
    // Name of the column holding the Vector used for the similarity check.
    private final Param<String> inputColVector;

    /** Returns the unique identifier of this transformer. */
    public String uid() {
        return this.uid;
    }

    /** Returns the param holding the cosine-similarity threshold. */
    public DoubleParam similarityThreshold() {
        return this.similarityThreshold;
    }

    /** Returns the param holding the name of the hash column. */
    public Param<String> inputColHash() {
        return this.inputColHash;
    }

    /** Returns the param holding the name of the vector column. */
    public Param<String> inputColVector() {
        return this.inputColVector;
    }

    /**
     * Sets the name of the column containing the vectors to compare.
     *
     * @param value column name
     * @return this transformer
     */
    public HashBasedDeduplicator setInputColVector(String value) {
        return (HashBasedDeduplicator)this.set(this.inputColVector(), value);
    }

    /**
     * Sets the name of the column containing the hash used for bucketing.
     *
     * @param value column name
     * @return this transformer
     */
    public HashBasedDeduplicator setInputColHash(String value) {
        return (HashBasedDeduplicator)this.set(this.inputColHash(), value);
    }

    /**
     * Sets the cosine-similarity threshold.
     *
     * <p>The method name is spelled "Treshold" (sic); it is part of the public interface.
     *
     * @param value new threshold; checked by the validator attached to the param
     * @return this transformer
     */
    public HashBasedDeduplicator setSimilarityTreshold(double value) {
        return (HashBasedDeduplicator)this.set((Param)this.similarityThreshold(), BoxesRunTime.boxToDouble((double)value));
    }

    /**
     * Drops rows whose vector is too similar to an earlier kept row with the same hash.
     *
     * @param dataset input data; must contain the columns named by {@code inputColHash}
     *                and {@code inputColVector}
     * @return a DataFrame with the surviving rows and the schema of {@code dataset}
     */
    public Dataset<Row> transform(Dataset<?> dataset) {
        // Repartition by the hash column, then sort each partition by it, so that rows with
        // the same hash are contiguous when a partition is iterated.
        RDD qual$1 = dataset.toDF().repartition((Seq)Predef$.MODULE$.wrapRefArray((Object[])new Column[]{dataset.col((String)this.$(this.inputColHash()))})).sortWithinPartitions((String)this.$(this.inputColHash()), (Seq)Predef$.MODULE$.wrapRefArray((Object[])new String[0])).rdd();
        // Per-partition function: Iterator<Row> -> Iterator<Row>.
        Serializable x$2 = new Serializable(this){
            public static final long serialVersionUID = 0L;
            private final /* synthetic */ HashBasedDeduplicator $outer;

            public final Iterator<Row> apply(Iterator<Row> f) {
                Iterator iterator;
                if (f.hasNext()) {
                    // State shared by all rows of this partition: the hash of the current run
                    // (initially -1) and the vectors kept so far for that hash.
                    LongRef curHash = LongRef.create((long)-1L);
                    ArrayBuffer vectorsBuffer = new ArrayBuffer(0);
                    // The mapping function mutates curHash/vectorsBuffer, so the result depends
                    // on the rows being visited in iterator order.
                    iterator = f.map((Function1)new Serializable(this, curHash, vectorsBuffer){
                        public static final long serialVersionUID = 0L;
                        private final /* synthetic */ $anonfun$1 $outer;
                        private final LongRef curHash$1;
                        private final ArrayBuffer vectorsBuffer$1;

                        // Returns the row itself if it is kept, or Row.empty() if it is a duplicate.
                        public final Row apply(Row it) {
                            Row row;
                            long newHash = BoxesRunTime.unboxToLong((Object)it.getAs((String)this.$outer.org$apache$spark$ml$odkl$texts$HashBasedDeduplicator$$anonfun$$$outer().$(this.$outer.org$apache$spark$ml$odkl$texts$HashBasedDeduplicator$$anonfun$$$outer().inputColHash())));
                            if (newHash == this.curHash$1.elem) {
                                // Same hash as the previous row: compare against every kept vector.
                                Vector currentVector = (Vector)it.getAs((String)this.$outer.org$apache$spark$ml$odkl$texts$HashBasedDeduplicator$$anonfun$$$outer().$(this.$outer.org$apache$spark$ml$odkl$texts$HashBasedDeduplicator$$anonfun$$$outer().inputColVector()));
                                boolean isUnique = this.vectorsBuffer$1.forall((Function1)new Serializable(this, currentVector){
                                    public static final long serialVersionUID = 0L;
                                    private final /* synthetic */ $anonfun$1$$anonfun$apply$1 $outer;
                                    private final Vector currentVector$1;

                                    // True when dot(a, b) / (norm2(a) * norm2(b)) is strictly below the threshold.
                                    // NOTE(review): if either vector has zero norm the quotient is presumably NaN,
                                    // the comparison is then false and the row is dropped - confirm this is intended.
                                    public final boolean apply(Vector storedVector) {
                                        return BLAS$.MODULE$.dot(storedVector, this.currentVector$1) / (Vectors$.MODULE$.norm(storedVector, 2.0) * Vectors$.MODULE$.norm(this.currentVector$1, 2.0)) < BoxesRunTime.unboxToDouble((Object)this.$outer.org$apache$spark$ml$odkl$texts$HashBasedDeduplicator$$anonfun$$anonfun$$$outer().org$apache$spark$ml$odkl$texts$HashBasedDeduplicator$$anonfun$$$outer().$((Param)this.$outer.org$apache$spark$ml$odkl$texts$HashBasedDeduplicator$$anonfun$$anonfun$$$outer().org$apache$spark$ml$odkl$texts$HashBasedDeduplicator$$anonfun$$$outer().similarityThreshold()));
                                    }
                                    {
                                        if ($outer == null) {
                                            throw null;
                                        }
                                        this.$outer = $outer;
                                        this.currentVector$1 = currentVector$1;
                                    }
                                });
                                if (isUnique) {
                                    // Sufficiently different from everything kept so far: remember and keep it.
                                    this.vectorsBuffer$1.append((Seq)Predef$.MODULE$.wrapRefArray((Object[])new Vector[]{currentVector}));
                                    row = it;
                                } else {
                                    // Duplicate: emit the empty-row marker that is filtered out later.
                                    row = Row$.MODULE$.empty();
                                }
                            } else {
                                // A new hash starts: reset the buffer; the first row of a run is always kept.
                                this.vectorsBuffer$1.clear();
                                this.vectorsBuffer$1.append((Seq)Predef$.MODULE$.wrapRefArray((Object[])new Vector[]{(Vector)it.getAs((String)this.$outer.org$apache$spark$ml$odkl$texts$HashBasedDeduplicator$$anonfun$$$outer().$(this.$outer.org$apache$spark$ml$odkl$texts$HashBasedDeduplicator$$anonfun$$$outer().inputColVector()))}));
                                this.curHash$1.elem = newHash;
                                row = it;
                            }
                            return row;
                        }

                        // Synthetic accessor used by the nested closure to reach the enclosing closure.
                        public /* synthetic */ $anonfun$1 org$apache$spark$ml$odkl$texts$HashBasedDeduplicator$$anonfun$$anonfun$$$outer() {
                            return this.$outer;
                        }
                        {
                            if ($outer == null) {
                                throw null;
                            }
                            this.$outer = $outer;
                            this.curHash$1 = curHash$1;
                            this.vectorsBuffer$1 = vectorsBuffer$1;
                        }
                    });
                } else {
                    // Empty partition: return an empty iterator.
                    iterator = Predef$.MODULE$.refArrayOps((Object[])new Row[0]).toIterator();
                }
                return iterator;
            }

            // Synthetic accessor used by nested closures to reach the transformer and its params.
            public /* synthetic */ HashBasedDeduplicator org$apache$spark$ml$odkl$texts$HashBasedDeduplicator$$anonfun$$$outer() {
                return this.$outer;
            }
            {
                if ($outer == null) {
                    throw null;
                }
                this.$outer = $outer;
            }
        };
        boolean x$3 = qual$1.mapPartitions$default$2();
        // Apply the per-partition function, drop the empty-row markers and rebuild a DataFrame
        // using the schema returned by transformSchema (the input schema).
        return dataset.sqlContext().createDataFrame(qual$1.mapPartitions((Function1)x$2, x$3, ClassTag$.MODULE$.apply(Row.class)).filter((Function1)new Serializable(this){
            public static final long serialVersionUID = 0L;

            public final boolean apply(Row x$1) {
                return !x$1.equals((Object)Row$.MODULE$.empty());
            }
        }), this.transformSchema(dataset.schema()));
    }

    /**
     * Returns the given schema unchanged; no validation of the input columns is done here.
     *
     * @param schema input schema
     * @return the same {@code schema} instance
     */
    @DeveloperApi
    public StructType transformSchema(StructType schema) {
        return schema;
    }

    /**
     * Creates a copy of this transformer via {@code defaultCopy}.
     *
     * @param extra extra params passed on to {@code defaultCopy}
     * @return the copied transformer
     */
    public Transformer copy(ParamMap extra) {
        return (Transformer)this.defaultCopy(extra);
    }

    /**
     * Creates the transformer with the given uid, declares its params and sets defaults.
     *
     * @param uid unique identifier of this instance
     */
    public HashBasedDeduplicator(String uid) {
        this.uid = uid;
        // Validator arguments are (0.0, 1.0, false, true); per the Spark ParamValidators.inRange
        // signature this presumably means the range (0, 1] - lower bound exclusive, upper inclusive.
        this.similarityThreshold = new DoubleParam((Identifiable)this, "simTresh", "cosine similarity Treshold for dedupolication in one hash-bucket for vectors to be marked as 'similar' \n 0.9 by default", ParamValidators$.MODULE$.inRange(0.0, 1.0, false, true));
        this.inputColHash = new Param((Identifiable)this, "inputColHash", "column with LSH(local sensitive hashing) as Long \n \"hash\" by default");
        this.inputColVector = new Param((Identifiable)this, "inputColVector", "column with Vector data representation");
        // Defaults: inputColHash = "hash", similarityThreshold = 0.9. No default is set for
        // inputColVector here.
        this.setDefault((Seq)Predef$.MODULE$.wrapRefArray((Object[])new ParamPair[]{new ParamPair(this.inputColHash(), (Object)"hash"), new ParamPair((Param)this.similarityThreshold(), (Object)BoxesRunTime.boxToDouble((double)0.9))}));
    }

    /** Creates the transformer with a random uid based on the prefix "hashBasedDeduplication". */
    public HashBasedDeduplicator() {
        this(Identifiable$.MODULE$.randomUID("hashBasedDeduplication"));
    }
}

