/*
 * Decompiled with CFR 0.152.
 */
package brickhouse.udf.sketch;

import brickhouse.analytics.uniques.SketchSet;
import brickhouse.udf.sketch.SketchSetUDAF;
import java.util.List;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;

/**
 * Hive UDF that estimates the Jaccard similarity |A ∩ B| / |A ∪ B| of two
 * collections of strings by loading them into {@link SketchSet} instances and
 * comparing the sketches' reach estimates.
 */
@Description(name="set_similarity", value="_FUNC_(a,b) - Compute the Jaccard set similarity of two sketch sets. ")
public class SetSimilarityUDF
extends UDF {
    /**
     * Estimates the Jaccard similarity of {@code a} and {@code b}.
     *
     * <p>Three sketches of identical capacity are built: one per input and one
     * for the union of both inputs. The intersection size is then derived by
     * inclusion-exclusion ({@code |A| + |B| - |A ∪ B|}) from the three reach
     * estimates.
     *
     * @param a items of the first set; may be {@code null}
     * @param b items of the second set; may be {@code null}
     * @return {@code null} if either argument is {@code null}; {@code 0.0} if
     *         either argument is empty or the union estimate is not positive;
     *         otherwise the estimated similarity, clamped to {@code [0, 1]}
     */
    public Double evaluate(List<String> a, List<String> b) {
        if (a == null || b == null) {
            return null;
        }
        if (a.isEmpty() || b.isEmpty()) {
            return 0.0;
        }

        // All three sketches share one capacity: the larger input size, but
        // never less than the UDAF's default sketch size.
        int sketchSize = Math.max(
                Math.max(a.size(), b.size()),
                SketchSetUDAF.DEFAULT_SKETCH_SET_SIZE);

        SketchSet sketchA = new SketchSet(sketchSize);
        SketchSet sketchB = new SketchSet(sketchSize);
        SketchSet sketchAUB = new SketchSet(sketchSize);
        for (String aStr : a) {
            sketchA.addItem(aStr);
            sketchAUB.addItem(aStr);
        }
        for (String bStr : b) {
            sketchB.addItem(bStr);
            sketchAUB.addItem(bStr);
        }

        double aEst = sketchA.estimateReach();
        double bEst = sketchB.estimateReach();
        double aubEst = sketchAUB.estimateReach();

        // Guard the division: a zero (or NaN) union estimate would otherwise
        // produce NaN/Infinity. Written as !(x > 0) so NaN is caught as well.
        if (!(aubEst > 0.0)) {
            return 0.0;
        }

        // Inclusion-exclusion over three separate estimates; the result can
        // drift slightly outside the valid range when the estimates disagree
        // (e.g. a small negative intersection for disjoint inputs).
        double ainterb = aEst + bEst - aubEst;
        double sim = ainterb / aubEst;

        // A Jaccard similarity is only meaningful within [0, 1].
        return Math.max(0.0, Math.min(1.0, sim));
    }
}

