/*
 * Decompiled with CFR 0.152.
 */
package elki.application.greedyensemble;

import elki.application.AbstractApplication;
import elki.data.DoubleVector;
import elki.data.NumberVector;
import elki.data.type.TypeInformation;
import elki.data.type.TypeUtil;
import elki.database.Database;
import elki.database.DatabaseUtil;
import elki.database.datastore.DataStore;
import elki.database.datastore.DataStoreUtil;
import elki.database.datastore.WritableDataStore;
import elki.database.ids.ArrayModifiableDBIDs;
import elki.database.ids.DBID;
import elki.database.ids.DBIDIter;
import elki.database.ids.DBIDMIter;
import elki.database.ids.DBIDRef;
import elki.database.ids.DBIDUtil;
import elki.database.ids.DBIDs;
import elki.database.ids.DoubleDBIDListMIter;
import elki.database.ids.HashSetModifiableDBIDs;
import elki.database.ids.ModifiableDBIDs;
import elki.database.ids.ModifiableDoubleDBIDList;
import elki.database.relation.MaterializedRelation;
import elki.database.relation.Relation;
import elki.database.relation.RelationUtil;
import elki.distance.PrimitiveDistance;
import elki.distance.correlation.WeightedPearsonCorrelationDistance;
import elki.distance.minkowski.WeightedEuclideanDistance;
import elki.distance.minkowski.WeightedManhattanDistance;
import elki.distance.minkowski.WeightedSquaredEuclideanDistance;
import elki.evaluation.scores.ROCEvaluation;
import elki.evaluation.scores.ScoreEvaluation;
import elki.evaluation.scores.adapter.DecreasingVectorIter;
import elki.logging.Logging;
import elki.math.MeanVariance;
import elki.utilities.datastructures.arraylike.ArrayLikeUtil;
import elki.utilities.documentation.Reference;
import elki.utilities.ensemble.EnsembleVoting;
import elki.utilities.ensemble.EnsembleVotingMean;
import elki.utilities.exceptions.AbortException;
import elki.utilities.io.FormatUtil;
import elki.utilities.optionhandling.OptionID;
import elki.utilities.optionhandling.parameterization.Parameterization;
import elki.utilities.optionhandling.parameters.DoubleParameter;
import elki.utilities.optionhandling.parameters.EnumParameter;
import elki.utilities.optionhandling.parameters.ObjectParameter;
import elki.utilities.scaling.ScalingFunction;
import elki.utilities.scaling.outlier.OutlierScaling;
import elki.workflow.InputStep;
import java.util.ArrayList;
import java.util.Arrays;

@Reference(authors="Erich Schubert, Remigius Wojdanowski, Arthur Zimek, Hans-Peter Kriegel", title="On Evaluation of Outlier Rankings and Outlier Scores", booktitle="Proc. 12th SIAM Int. Conf. on Data Mining (SDM 2012)", url="https://doi.org/10.1137/1.9781611972825.90", bibkey="DBLP:conf/sdm/SchubertWZK12")
public class GreedyEnsembleExperiment
extends AbstractApplication {
    private static final Logging LOG = Logging.getLogger(GreedyEnsembleExperiment.class);
    private InputStep inputstep;
    boolean refine_truth = false;
    EnsembleVoting voting;
    ScalingFunction prescaling;
    ScalingFunction scaling;
    double rate;
    int minvote = 1;
    Distance distance = Distance.PEARSON;

    public GreedyEnsembleExperiment(InputStep inputstep, EnsembleVoting voting, Distance distance, ScalingFunction prescaling, ScalingFunction scaling, double rate) {
        this.inputstep = inputstep;
        this.voting = voting;
        this.distance = distance;
        this.prescaling = prescaling;
        this.scaling = scaling;
        this.rate = rate;
    }

    public void run() {
        PrimitiveDistance<NumberVector> wdist;
        DBID firstid;
        Database database = this.inputstep.getDatabase();
        Relation<NumberVector> relation = database.getRelation((TypeInformation)TypeUtil.NUMBER_VECTOR_FIELD, new Object[0]);
        Relation labels = DatabaseUtil.guessLabelRepresentation((Database)database);
        String firstlabel = (String)labels.get((DBIDRef)(firstid = DBIDUtil.deref((DBIDRef)labels.iterDBIDs())));
        if (!firstlabel.matches("bylabel")) {
            throw new AbortException("No 'by label' reference outlier found, which is needed for weighting!");
        }
        relation = GreedyEnsembleExperiment.applyPrescaling(this.prescaling, relation, (DBIDs)firstid);
        int numcand = relation.size() - 1;
        int dim = RelationUtil.dimensionality(relation);
        NumberVector refvec = (NumberVector)relation.get((DBIDRef)firstid);
        int desired_outliers = (int)(this.rate * (double)dim);
        int union_outliers = 0;
        int[] outliers_seen = new int[dim];
        int k = 0;
        ArrayList<DecreasingVectorIter> iters = new ArrayList<DecreasingVectorIter>(numcand);
        if (this.minvote >= numcand) {
            this.minvote = Math.max(1, numcand - 1);
        }
        DBIDIter iditer = relation.iterDBIDs();
        while (iditer.valid()) {
            if (!DBIDUtil.equal((DBIDRef)firstid, (DBIDRef)iditer)) {
                iters.add(new DecreasingVectorIter(refvec, (NumberVector)relation.get((DBIDRef)iditer)));
            }
            iditer.advance();
        }
        block1: while (union_outliers < desired_outliers) {
            for (DecreasingVectorIter iter : iters) {
                int cur;
                if (!iter.valid()) {
                    LOG.warning((CharSequence)("Union_outliers=" + union_outliers + " < desired_outliers=" + desired_outliers + " minvote=" + this.minvote));
                    break block1;
                }
                int n = cur = iter.dim();
                outliers_seen[n] = outliers_seen[n] + 1;
                if (outliers_seen[cur] == this.minvote) {
                    ++union_outliers;
                }
                iter.advance();
            }
            ++k;
        }
        LOG.verbose((CharSequence)("Merged top " + k + " outliers to: " + union_outliers + " outliers (desired: at least " + desired_outliers + ")"));
        double[] estimated_weights = new double[dim];
        double[] estimated_truth = new double[dim];
        this.updateEstimations(outliers_seen, union_outliers, estimated_weights, estimated_truth);
        DoubleVector estimated_truth_vec = DoubleVector.wrap((double[])estimated_truth);
        PrimitiveDistance<NumberVector> tdist = wdist = this.getDistance(estimated_weights);
        double[] naiveensemble = new double[dim];
        double[] buf = new double[numcand];
        for (int d = 0; d < dim; ++d) {
            int i = 0;
            DBIDIter iditer2 = relation.iterDBIDs();
            while (iditer2.valid()) {
                if (!DBIDUtil.equal((DBIDRef)firstid, (DBIDRef)iditer2)) {
                    buf[i++] = ((NumberVector)relation.get((DBIDRef)iditer2)).doubleValue(d);
                }
                iditer2.advance();
            }
            naiveensemble[d] = this.voting.combine(buf, i);
            if (!Double.isNaN(naiveensemble[d])) continue;
            LOG.warning((CharSequence)("NaN after combining: " + FormatUtil.format((double[])buf) + " i=" + i + " " + this.voting.toString()));
        }
        DoubleVector naivevec = DoubleVector.wrap((double[])naiveensemble);
        double bestauc = 0.0;
        double bestcost = Double.POSITIVE_INFINITY;
        String bestaucstr = "";
        String bestcoststr = "";
        DBID bestid = null;
        double bestest = Double.POSITIVE_INFINITY;
        double[] greedyensemble = new double[dim];
        DBIDIter iditer3 = relation.iterDBIDs();
        while (iditer3.valid()) {
            if (!DBIDUtil.equal((DBIDRef)firstid, (DBIDRef)iditer3)) {
                this.singleEnsemble(greedyensemble, (NumberVector)relation.get((DBIDRef)iditer3));
                double auc = ROCEvaluation.computeAUROC((ScoreEvaluation.Adapter)new DecreasingVectorIter(refvec, (NumberVector)DoubleVector.wrap((double[])greedyensemble)));
                double estimated = wdist.distance((Object)DoubleVector.wrap((double[])greedyensemble), (Object)estimated_truth_vec);
                double cost = tdist.distance((Object)DoubleVector.wrap((double[])greedyensemble), (Object)refvec);
                LOG.verbose((CharSequence)("AUROC: " + auc + " estimated " + estimated + " cost " + cost + " " + (String)labels.get((DBIDRef)iditer3)));
                if (auc > bestauc) {
                    bestauc = auc;
                    bestaucstr = (String)labels.get((DBIDRef)iditer3);
                }
                if (cost < bestcost) {
                    bestcost = cost;
                    bestcoststr = (String)labels.get((DBIDRef)iditer3);
                }
                if (estimated < bestest || bestid == null) {
                    bestest = estimated;
                    bestid = DBIDUtil.deref((DBIDRef)iditer3);
                }
            }
            iditer3.advance();
        }
        if (this.prescaling != null) {
            LOG.verbose((CharSequence)("Input prescaling: " + this.prescaling));
        }
        LOG.verbose((CharSequence)("Distance function: " + wdist));
        LOG.verbose((CharSequence)("Ensemble voting: " + this.voting));
        if (this.scaling != null) {
            LOG.verbose((CharSequence)("Ensemble rescaling: " + this.scaling));
        }
        LOG.verbose((CharSequence)("Initial estimation of outliers: " + union_outliers));
        LOG.verbose((CharSequence)("Initializing ensemble with: " + (String)labels.get(bestid)));
        ArrayModifiableDBIDs ensemble = DBIDUtil.newArray(bestid);
        HashSetModifiableDBIDs enscands = DBIDUtil.newHashSet((DBIDs)relation.getDBIDs());
        HashSetModifiableDBIDs dropped = DBIDUtil.newHashSet((int)relation.size());
        dropped.add((DBIDRef)firstid);
        enscands.remove(bestid);
        enscands.remove((DBIDRef)firstid);
        double[] greedyensemble2 = new double[dim];
        this.singleEnsemble(greedyensemble2, (NumberVector)relation.get((DBIDRef)bestid));
        double[] testensemble = new double[dim];
        block6: while (enscands.size() > 0) {
            DoubleVector greedyvec = DoubleVector.wrap((double[])greedyensemble2);
            double oldd = wdist.distance((Object)estimated_truth_vec, (Object)greedyvec);
            int heapsize = enscands.size();
            ModifiableDoubleDBIDList heap = DBIDUtil.newDistanceDBIDList((int)heapsize);
            double[] tmp = new double[dim];
            DBIDMIter iter = enscands.iter();
            while (iter.valid()) {
                this.singleEnsemble(tmp, (NumberVector)relation.get((DBIDRef)iter));
                heap.add(wdist.distance((Object)DoubleVector.wrap((double[])greedyensemble2), (Object)greedyvec), (DBIDRef)iter);
                iter.advance();
            }
            heap.sort();
            DoubleDBIDListMIter it = heap.iter();
            while (heap.size() > 0) {
                enscands.remove((DBIDRef)it.seek(heap.size() - 1));
                NumberVector vec = (NumberVector)relation.get((DBIDRef)it);
                double[] buf2 = new double[ensemble.size() + 1];
                for (int i = 0; i < dim; ++i) {
                    int j = 0;
                    DBIDMIter iter2 = ensemble.iter();
                    while (iter2.valid()) {
                        buf2[j++] = ((NumberVector)relation.get((DBIDRef)iter2)).doubleValue(i);
                        iter2.advance();
                    }
                    buf2[j] = vec.doubleValue(i);
                    testensemble[i] = this.voting.combine(buf2, j + 1);
                }
                GreedyEnsembleExperiment.applyScaling(testensemble, this.scaling);
                DoubleVector testvec = DoubleVector.wrap((double[])testensemble);
                double newd = wdist.distance((Object)estimated_truth_vec, (Object)testvec);
                if (newd < oldd) {
                    System.arraycopy(testensemble, 0, greedyensemble2, 0, dim);
                    ensemble.add((DBIDRef)it);
                    continue block6;
                }
                dropped.add((DBIDRef)it);
                if (this.refine_truth) {
                    ArrayList<DecreasingVectorIter> iters2 = new ArrayList<DecreasingVectorIter>(numcand);
                    Object iditer4 = relation.iterDBIDs();
                    while (iditer4.valid()) {
                        if (!DBIDUtil.equal((DBIDRef)firstid, (DBIDRef)iditer4) && !dropped.contains((DBIDRef)iditer4)) {
                            iters2.add(new DecreasingVectorIter(refvec, (NumberVector)relation.get((DBIDRef)iditer4)));
                        }
                        iditer4.advance();
                    }
                    if (this.minvote >= iters2.size()) {
                        this.minvote = iters2.size() - 1;
                    }
                    union_outliers = 0;
                    Arrays.fill(outliers_seen, 0);
                    while (union_outliers < desired_outliers) {
                        DecreasingVectorIter iter3;
                        iditer4 = iters2.iterator();
                        while (iditer4.hasNext() && (iter3 = (DecreasingVectorIter)iditer4.next()).valid()) {
                            int n = iter3.dim();
                            outliers_seen[n] = outliers_seen[n] + 1;
                            if (outliers_seen[n] == this.minvote) {
                                ++union_outliers;
                            }
                            iter3.advance();
                        }
                    }
                    LOG.warning((CharSequence)("New num outliers: " + union_outliers));
                    this.updateEstimations(outliers_seen, union_outliers, estimated_weights, estimated_truth);
                    estimated_truth_vec = DoubleVector.wrap((double[])estimated_truth);
                }
                it.remove();
            }
        }
        StringBuilder greedylbl = new StringBuilder();
        DBIDMIter iter = ensemble.iter();
        while (iter.valid()) {
            greedylbl.append((String)labels.get((DBIDRef)iter)).append(' ');
            iter.advance();
        }
        greedylbl.setLength(greedylbl.length() - 1);
        DoubleVector greedyvec = DoubleVector.wrap((double[])greedyensemble2);
        if (this.refine_truth) {
            LOG.verbose((CharSequence)("Estimated outliers remaining: " + union_outliers));
        }
        LOG.verbose((CharSequence)("Greedy ensemble (" + ensemble.size() + "): " + greedylbl.toString()));
        LOG.verbose((CharSequence)("Best single AUROC:    " + bestauc + " (" + bestaucstr + ")"));
        LOG.verbose((CharSequence)("Best single cost:     " + bestcost + " (" + bestcoststr + ")"));
        double naiveauc = ROCEvaluation.computeAUROC((ScoreEvaluation.Adapter)new DecreasingVectorIter(refvec, (NumberVector)naivevec));
        double naivecost = tdist.distance((Object)naivevec, (Object)refvec);
        LOG.verbose((CharSequence)("Naive ensemble AUROC:  " + naiveauc + " cost: " + naivecost));
        LOG.verbose((CharSequence)("Naive ensemble Gain:   " + this.gain(naiveauc, bestauc, 1.0) + " cost gain: " + this.gain(naivecost, bestcost, 0.0)));
        double greedyauc = ROCEvaluation.computeAUROC((ScoreEvaluation.Adapter)new DecreasingVectorIter(refvec, (NumberVector)greedyvec));
        double greedycost = tdist.distance((Object)greedyvec, (Object)refvec);
        LOG.verbose((CharSequence)("Greedy ensemble AUROC: " + greedyauc + " cost: " + greedycost));
        LOG.verbose((CharSequence)("Greedy ensemble Gain to best:  " + this.gain(greedyauc, bestauc, 1.0) + " cost gain: " + this.gain(greedycost, bestcost, 0.0)));
        LOG.verbose((CharSequence)("Greedy ensemble Gain to naive: " + this.gain(greedyauc, naiveauc, 1.0) + " cost gain: " + this.gain(greedycost, naivecost, 0.0)));
        MeanVariance meanauc = new MeanVariance();
        MeanVariance meancost = new MeanVariance();
        HashSetModifiableDBIDs candidates = DBIDUtil.newHashSet((DBIDs)relation.getDBIDs());
        candidates.remove((DBIDRef)firstid);
        for (int i = 0; i < 1000; ++i) {
            double[] randomensemble = new double[dim];
            ModifiableDBIDs random = DBIDUtil.randomSample((DBIDs)candidates, (int)ensemble.size(), (Long)Long.valueOf(i));
            double[] buf3 = new double[random.size()];
            for (int d = 0; d < dim; ++d) {
                int j = 0;
                DBIDIter iter4 = random.iter();
                while (iter4.valid()) {
                    assert (!DBIDUtil.equal((DBIDRef)firstid, (DBIDRef)iter4));
                    buf3[j++] = ((NumberVector)relation.get((DBIDRef)iter4)).doubleValue(d);
                    iter4.advance();
                }
                randomensemble[d] = this.voting.combine(buf3, j);
            }
            GreedyEnsembleExperiment.applyScaling(randomensemble, this.scaling);
            DoubleVector randomvec = DoubleVector.wrap((double[])randomensemble);
            meanauc.put(ROCEvaluation.computeAUROC((ScoreEvaluation.Adapter)new DecreasingVectorIter(refvec, (NumberVector)randomvec)));
            meancost.put(tdist.distance((Object)randomvec, (Object)refvec));
        }
        LOG.verbose((CharSequence)("Random ensemble AUROC: " + meanauc.getMean() + " + stddev: " + meanauc.getSampleStddev() + " = " + (meanauc.getMean() + meanauc.getSampleStddev())));
        LOG.verbose((CharSequence)("Random ensemble Gain:  " + this.gain(meanauc.getMean(), bestauc, 1.0)));
        LOG.verbose((CharSequence)("Greedy improvement:    " + (greedyauc - meanauc.getMean()) / meanauc.getSampleStddev() + " standard deviations."));
        LOG.verbose((CharSequence)("Random ensemble Cost:  " + meancost.getMean() + " + stddev: " + meancost.getSampleStddev() + " = " + (meancost.getMean() + meanauc.getSampleStddev())));
        LOG.verbose((CharSequence)("Random ensemble Gain:  " + this.gain(meancost.getMean(), bestcost, 0.0)));
        LOG.verbose((CharSequence)("Greedy improvement:    " + (meancost.getMean() - greedycost) / meancost.getSampleStddev() + " standard deviations."));
        LOG.verbose((CharSequence)("Naive ensemble Gain to random: " + this.gain(naiveauc, meanauc.getMean(), 1.0) + " cost gain: " + this.gain(naivecost, meancost.getMean(), 0.0)));
        LOG.verbose((CharSequence)("Random ensemble Gain to naive: " + this.gain(meanauc.getMean(), naiveauc, 1.0) + " cost gain: " + this.gain(meancost.getMean(), naivecost, 0.0)));
        LOG.verbose((CharSequence)("Greedy ensemble Gain to random: " + this.gain(greedyauc, meanauc.getMean(), 1.0) + " cost gain: " + this.gain(greedycost, meancost.getMean(), 0.0)));
    }

    protected void singleEnsemble(double[] ensemble, NumberVector vec) {
        double[] buf = new double[1];
        for (int i = 0; i < ensemble.length; ++i) {
            buf[0] = vec.doubleValue(i);
            ensemble[i] = this.voting.combine(buf, 1);
            if (!Double.isNaN(ensemble[i])) continue;
            LOG.warning((CharSequence)("NaN after combining: " + FormatUtil.format((double[])buf) + " " + this.voting.toString()));
        }
        GreedyEnsembleExperiment.applyScaling(ensemble, this.scaling);
    }

    public static Relation<NumberVector> applyPrescaling(ScalingFunction scaling, Relation<NumberVector> relation, DBIDs skip) {
        if (scaling == null) {
            return relation;
        }
        NumberVector.Factory factory = RelationUtil.getNumberVectorFactory(relation);
        DBIDs ids = relation.getDBIDs();
        WritableDataStore contents = DataStoreUtil.makeStorage((DBIDs)ids, (int)2, NumberVector.class);
        DBIDIter iter = ids.iter();
        while (iter.valid()) {
            double[] raw = ((NumberVector)relation.get((DBIDRef)iter)).toArray();
            if (!skip.contains((DBIDRef)iter)) {
                GreedyEnsembleExperiment.applyScaling(raw, scaling);
            }
            contents.put((DBIDRef)iter, (Object)factory.newNumberVector((Object)raw, ArrayLikeUtil.DOUBLEARRAYADAPTER));
            iter.advance();
        }
        return new MaterializedRelation("rescaled", relation.getDataTypeInformation(), ids, (DataStore)contents);
    }

    private static void applyScaling(double[] raw, ScalingFunction scaling) {
        if (scaling == null) {
            return;
        }
        if (scaling instanceof OutlierScaling) {
            ((OutlierScaling)scaling).prepare((Object)raw, ArrayLikeUtil.DOUBLEARRAYADAPTER);
        }
        for (int i = 0; i < raw.length; ++i) {
            double newval = scaling.getScaled(raw[i]);
            if (Double.isNaN(newval)) {
                LOG.warning((CharSequence)("NaN after prescaling: " + raw[i] + " " + scaling.toString() + " -> " + newval));
            }
            raw[i] = newval;
        }
    }

    protected void updateEstimations(int[] outliers, int numoutliers, double[] weights, double[] truth) {
        double oweight = 0.5 / (double)numoutliers;
        double iweight = 0.5 / (double)(outliers.length - numoutliers);
        double oscore = 1.0;
        double iscore = 0.0;
        for (int i = 0; i < outliers.length; ++i) {
            if (outliers[i] >= this.minvote) {
                weights[i] = oweight;
                truth[i] = 1.0;
                continue;
            }
            weights[i] = iweight;
            truth[i] = 0.0;
        }
    }

    private PrimitiveDistance<NumberVector> getDistance(double[] estimated_weights) {
        switch (this.distance) {
            case SQEUCLIDEAN: {
                return new WeightedSquaredEuclideanDistance(estimated_weights);
            }
            case EUCLIDEAN: {
                return new WeightedEuclideanDistance(estimated_weights);
            }
            case MANHATTAN: {
                return new WeightedManhattanDistance(estimated_weights);
            }
            case PEARSON: {
                return new WeightedPearsonCorrelationDistance(estimated_weights);
            }
        }
        throw new AbortException("Unsupported distance mode: " + (Object)((Object)this.distance));
    }

    double gain(double score, double ref, double optimal) {
        return 1.0 - (optimal - score) / (optimal - ref);
    }

    public static void main(String[] args) {
        GreedyEnsembleExperiment.runCLIApplication(GreedyEnsembleExperiment.class, (String[])args);
    }

    public static class Par
    extends AbstractApplication.Par {
        public static final OptionID RATE_ID = new OptionID("greedy.rate", "Expected rate of outliers.");
        public static final OptionID VOTING_ID = new OptionID("ensemble.voting", "Ensemble voting function.");
        public static final OptionID PRESCALING_ID = new OptionID("ensemble.prescaling", "Prescaling to apply to input scores.");
        public static final OptionID SCALING_ID = new OptionID("ensemble.scaling", "Scaling to apply to ensemble.");
        public static final OptionID DISTANCE_ID = new OptionID("ensemble.measure", "Similarity measure.");
        InputStep inputstep;
        EnsembleVoting voting;
        Distance distance = Distance.PEARSON;
        ScalingFunction prescaling;
        ScalingFunction scaling;
        double rate = 0.01;

        public void configure(Parameterization config) {
            super.configure(config);
            this.inputstep = (InputStep)config.tryInstantiate(InputStep.class);
            new ObjectParameter(VOTING_ID, EnsembleVoting.class, EnsembleVotingMean.class).grab(config, x -> {
                this.voting = x;
            });
            new EnumParameter(DISTANCE_ID, Distance.class).grab(config, x -> {
                this.distance = x;
            });
            new ObjectParameter(PRESCALING_ID, ScalingFunction.class).setOptional(true).grab(config, x -> {
                this.prescaling = x;
            });
            new ObjectParameter(SCALING_ID, ScalingFunction.class).setOptional(true).grab(config, x -> {
                this.scaling = x;
            });
            new DoubleParameter(RATE_ID, 0.01).grab(config, x -> {
                this.rate = x;
            });
        }

        public GreedyEnsembleExperiment make() {
            return new GreedyEnsembleExperiment(this.inputstep, this.voting, this.distance, this.prescaling, this.scaling, this.rate);
        }
    }

    public static enum Distance {
        PEARSON,
        SQEUCLIDEAN,
        EUCLIDEAN,
        MANHATTAN;

    }
}

