/*
 * Decompiled with CFR 0.152.
 */
package org.datavec.spark.transform;

import java.util.ArrayList;
import java.util.List;
import org.apache.spark.api.java.JavaDoubleRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.DoubleFunction;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.datavec.api.transform.ColumnType;
import org.datavec.api.transform.analysis.DataAnalysis;
import org.datavec.api.transform.analysis.SequenceDataAnalysis;
import org.datavec.api.transform.analysis.columns.BytesAnalysis;
import org.datavec.api.transform.analysis.columns.CategoricalAnalysis;
import org.datavec.api.transform.analysis.columns.ColumnAnalysis;
import org.datavec.api.transform.analysis.columns.DoubleAnalysis;
import org.datavec.api.transform.analysis.columns.IntegerAnalysis;
import org.datavec.api.transform.analysis.columns.LongAnalysis;
import org.datavec.api.transform.analysis.columns.StringAnalysis;
import org.datavec.api.transform.analysis.columns.TimeAnalysis;
import org.datavec.api.transform.analysis.sequence.SequenceLengthAnalysis;
import org.datavec.api.transform.metadata.CategoricalMetaData;
import org.datavec.api.transform.metadata.ColumnMetaData;
import org.datavec.api.transform.metadata.DoubleMetaData;
import org.datavec.api.transform.metadata.IntegerMetaData;
import org.datavec.api.transform.metadata.LongMetaData;
import org.datavec.api.transform.metadata.StringMetaData;
import org.datavec.api.transform.metadata.TimeMetaData;
import org.datavec.api.transform.quality.DataQualityAnalysis;
import org.datavec.api.transform.quality.columns.BytesQuality;
import org.datavec.api.transform.quality.columns.CategoricalQuality;
import org.datavec.api.transform.quality.columns.ColumnQuality;
import org.datavec.api.transform.quality.columns.DoubleQuality;
import org.datavec.api.transform.quality.columns.IntegerQuality;
import org.datavec.api.transform.quality.columns.LongQuality;
import org.datavec.api.transform.quality.columns.StringQuality;
import org.datavec.api.transform.quality.columns.TimeQuality;
import org.datavec.api.transform.schema.Schema;
import org.datavec.api.writable.Writable;
import org.datavec.spark.transform.analysis.SelectColumnFunction;
import org.datavec.spark.transform.analysis.SequenceFlatMapFunction;
import org.datavec.spark.transform.analysis.SequenceLengthFunction;
import org.datavec.spark.transform.analysis.aggregate.AnalysisAddFunction;
import org.datavec.spark.transform.analysis.aggregate.AnalysisCombineFunction;
import org.datavec.spark.transform.analysis.columns.BytesAnalysisCounter;
import org.datavec.spark.transform.analysis.columns.CategoricalAnalysisCounter;
import org.datavec.spark.transform.analysis.columns.DoubleAnalysisCounter;
import org.datavec.spark.transform.analysis.columns.IntegerAnalysisCounter;
import org.datavec.spark.transform.analysis.columns.LongAnalysisCounter;
import org.datavec.spark.transform.analysis.histogram.HistogramAddFunction;
import org.datavec.spark.transform.analysis.histogram.HistogramCombineFunction;
import org.datavec.spark.transform.analysis.histogram.HistogramCounter;
import org.datavec.spark.transform.analysis.seqlength.IntToDoubleFunction;
import org.datavec.spark.transform.analysis.seqlength.SequenceLengthAnalysisAddFunction;
import org.datavec.spark.transform.analysis.seqlength.SequenceLengthAnalysisCounter;
import org.datavec.spark.transform.analysis.seqlength.SequenceLengthAnalysisMergeFunction;
import org.datavec.spark.transform.analysis.string.StringAnalysisCounter;
import org.datavec.spark.transform.filter.FilterWritablesBySchemaFunction;
import org.datavec.spark.transform.quality.categorical.CategoricalQualityAddFunction;
import org.datavec.spark.transform.quality.categorical.CategoricalQualityMergeFunction;
import org.datavec.spark.transform.quality.integer.IntegerQualityAddFunction;
import org.datavec.spark.transform.quality.integer.IntegerQualityMergeFunction;
import org.datavec.spark.transform.quality.longq.LongQualityAddFunction;
import org.datavec.spark.transform.quality.longq.LongQualityMergeFunction;
import org.datavec.spark.transform.quality.real.RealQualityAddFunction;
import org.datavec.spark.transform.quality.real.RealQualityMergeFunction;
import org.datavec.spark.transform.quality.string.StringQualityAddFunction;
import org.datavec.spark.transform.quality.string.StringQualityMergeFunction;
import org.datavec.spark.transform.quality.time.TimeQualityAddFunction;
import org.datavec.spark.transform.quality.time.TimeQualityMergeFunction;
import scala.Tuple2;

public class AnalyzeSpark {
    /**
     * Default maximum number of histogram buckets. It has the same value (30) that the
     * {@code analyze} and {@code analyzeSequence} overloads without an explicit bucket
     * argument pass on to their three-argument counterparts.
     */
    public static final int DEFAULT_HISTOGRAM_BUCKETS = 30;

    /**
     * Analyze sequence data with the default histogram bucket limit.
     *
     * @param schema schema describing the columns of each sequence step
     * @param data   sequences to analyze
     * @return the result of the three-argument overload invoked with
     *         {@link #DEFAULT_HISTOGRAM_BUCKETS}
     */
    public static SequenceDataAnalysis analyzeSequence(Schema schema, JavaRDD<List<List<Writable>>> data) {
        return analyzeSequence(schema, data, DEFAULT_HISTOGRAM_BUCKETS);
    }

    /**
     * Analyze sequence data: per-column statistics over all sequence steps, plus
     * statistics and a histogram of the sequence lengths.
     * <p>
     * The input RDD is cached (and left cached) because it is read once for the column
     * analysis and once for the length analysis. The intermediate RDD of sequence lengths
     * is cached for the duration of this call and always unpersisted afterwards.
     *
     * @param schema              schema describing the columns of each sequence step
     * @param data                sequences to analyze
     * @param maxHistogramBuckets upper bound on the number of histogram buckets, applied
     *                            both to the per-column histograms and to the
     *                            sequence-length histogram
     * @return column analysis combined with the sequence length analysis
     */
    public static SequenceDataAnalysis analyzeSequence(Schema schema, JavaRDD<List<List<Writable>>> data, int maxHistogramBuckets) {
        data.cache();

        // Column statistics are computed on the flat-mapped sequences. The bucket limit is
        // forwarded so the caller's setting is honoured for the column histograms too
        // (previously the default limit was always used here).
        JavaRDD fmSeq = data.flatMap((FlatMapFunction)new SequenceFlatMapFunction());
        DataAnalysis da = AnalyzeSpark.analyze(schema, (JavaRDD<List<Writable>>)fmSeq, maxHistogramBuckets);

        JavaRDD seqLengths = data.map((Function)new SequenceLengthFunction());
        seqLengths.cache();

        SequenceLengthAnalysisCounter counter;
        Tuple2<double[], long[]> hist;
        try {
            counter = (SequenceLengthAnalysisCounter)seqLengths.aggregate((Object)new SequenceLengthAnalysisCounter(), (Function2)new SequenceLengthAnalysisAddFunction(), (Function2)new SequenceLengthAnalysisMergeFunction());
            int max = counter.getMaxLengthSeen();
            int min = counter.getMinLengthSeen();
            if (max == min) {
                // All sequences have the same length: build the single-bucket histogram by
                // hand instead of asking Spark for a histogram over a zero-width range.
                hist = new Tuple2<double[], long[]>(new double[]{min}, new long[]{counter.getCountTotal()});
            } else {
                // One bucket per distinct length, capped at the caller's limit.
                int nBuckets = Math.min(max - min, maxHistogramBuckets);
                JavaDoubleRDD drdd = seqLengths.mapToDouble((DoubleFunction)new IntToDoubleFunction());
                hist = drdd.histogram(nBuckets);
            }
        } finally {
            // Release the cached lengths even when one of the Spark actions above fails.
            seqLengths.unpersist();
        }

        SequenceLengthAnalysis lengthAnalysis = SequenceLengthAnalysis.builder()
                .totalNumSequences(counter.getCountTotal())
                .minSeqLength(counter.getMinLengthSeen())
                .maxSeqLength(counter.getMaxLengthSeen())
                .countZeroLength(counter.getCountZeroLength())
                .countOneLength(counter.getCountOneLength())
                .meanLength(counter.getMean())
                .histogramBuckets(hist._1())
                .histogramBucketCounts(hist._2())
                .build();
        return new SequenceDataAnalysis(schema, da.getColumnAnalysis(), lengthAnalysis);
    }

    /**
     * Analyze non-sequence data with the default histogram bucket limit.
     *
     * @param schema schema describing the columns of {@code data}
     * @param data   records to analyze
     * @return the result of the three-argument overload invoked with
     *         {@link #DEFAULT_HISTOGRAM_BUCKETS}
     */
    public static DataAnalysis analyze(Schema schema, JavaRDD<List<Writable>> data) {
        return analyze(schema, data, DEFAULT_HISTOGRAM_BUCKETS);
    }

    /**
     * Compute per-column summary statistics and histograms for non-sequence data.
     * <p>
     * Two aggregations are run over {@code data} (which is cached first): one that yields
     * a counter object per column, and a second one that yields a histogram counter per
     * column, using the min/max values collected from the first pass. Categorical and
     * Bytes columns contribute no min/max values and receive no histogram.
     *
     * @param schema              schema describing the columns of {@code data}
     * @param data                records to analyze
     * @param maxHistogramBuckets bucket limit handed to {@code HistogramAddFunction}
     * @return analysis holding one column analysis per schema column, in schema order
     * @throws IllegalStateException if the schema contains a column type not handled here
     */
    public static DataAnalysis analyze(Schema schema, JavaRDD<List<Writable>> data, int maxHistogramBuckets) {
        data.cache();
        List columnTypes = schema.getColumnTypes();

        // First pass: one counter object per column.
        List counters = (List)data.aggregate(null, (Function2)new AnalysisAddFunction(schema), (Function2)new AnalysisCombineFunction());

        // minsMaxes[col] = {min, max}; consumed by the histogram pass further down.
        double[][] minsMaxes = new double[counters.size()][2];
        int nColumns = schema.numColumns();
        ArrayList<Object> list = new ArrayList<Object>(nColumns);

        for (int col = 0; col < nColumns; col++) {
            ColumnType type = (ColumnType)columnTypes.get(col);
            switch (type) {
                case String: {
                    StringAnalysisCounter strCounter = (StringAnalysisCounter)counters.get(col);
                    list.add(new StringAnalysis.Builder()
                            .countTotal(strCounter.getCountTotal())
                            .minLength(strCounter.getMinLengthSeen())
                            .maxLength(strCounter.getMaxLengthSeen())
                            .meanLength((double)strCounter.getSumLength() / (double)strCounter.getCountTotal())
                            .build());
                    // For strings the histogram range is the range of string lengths.
                    minsMaxes[col][0] = strCounter.getMinLengthSeen();
                    minsMaxes[col][1] = strCounter.getMaxLengthSeen();
                    break;
                }
                case Integer: {
                    IntegerAnalysisCounter intCounter = (IntegerAnalysisCounter)counters.get(col);
                    IntegerAnalysis.Builder builder = (IntegerAnalysis.Builder)new IntegerAnalysis.Builder()
                            .min(intCounter.getMinValueSeen())
                            .max(intCounter.getMaxValueSeen())
                            .mean((double)intCounter.getSum() / (double)intCounter.getCountTotal());
                    builder = (IntegerAnalysis.Builder)builder.sampleStdev(intCounter.getSampleStdev());
                    builder = (IntegerAnalysis.Builder)builder.sampleVariance(intCounter.getSampleVariance());
                    builder = (IntegerAnalysis.Builder)builder.countZero(intCounter.getCountZero());
                    builder = (IntegerAnalysis.Builder)builder.countNegative(intCounter.getCountNegative());
                    builder = (IntegerAnalysis.Builder)builder.countPositive(intCounter.getCountPositive());
                    builder = (IntegerAnalysis.Builder)builder.countMinValue(intCounter.getCountMinValue());
                    builder = (IntegerAnalysis.Builder)builder.countMaxValue(intCounter.getCountMaxValue());
                    builder = (IntegerAnalysis.Builder)builder.countTotal(intCounter.getCountTotal());
                    IntegerAnalysis intAnalysis = builder.build();
                    list.add(intAnalysis);
                    minsMaxes[col][0] = intCounter.getMinValueSeen();
                    minsMaxes[col][1] = intCounter.getMaxValueSeen();
                    break;
                }
                case Long: {
                    LongAnalysisCounter longCounter = (LongAnalysisCounter)counters.get(col);
                    LongAnalysis.Builder builder = (LongAnalysis.Builder)new LongAnalysis.Builder()
                            .min(longCounter.getMinValueSeen())
                            .max(longCounter.getMaxValueSeen())
                            .mean(longCounter.getSum().doubleValue() / (double)longCounter.getCountTotal());
                    builder = (LongAnalysis.Builder)builder.sampleStdev(longCounter.getSampleStdev());
                    builder = (LongAnalysis.Builder)builder.sampleVariance(longCounter.getSampleVariance());
                    builder = (LongAnalysis.Builder)builder.countZero(longCounter.getCountZero());
                    builder = (LongAnalysis.Builder)builder.countNegative(longCounter.getCountNegative());
                    builder = (LongAnalysis.Builder)builder.countPositive(longCounter.getCountPositive());
                    builder = (LongAnalysis.Builder)builder.countMinValue(longCounter.getCountMinValue());
                    builder = (LongAnalysis.Builder)builder.countMaxValue(longCounter.getCountMaxValue());
                    builder = (LongAnalysis.Builder)builder.countTotal(longCounter.getCountTotal());
                    LongAnalysis longAnalysis = builder.build();
                    list.add(longAnalysis);
                    minsMaxes[col][0] = longCounter.getMinValueSeen();
                    minsMaxes[col][1] = longCounter.getMaxValueSeen();
                    break;
                }
                case Double: {
                    DoubleAnalysisCounter dblCounter = (DoubleAnalysisCounter)counters.get(col);
                    DoubleAnalysis.Builder builder = (DoubleAnalysis.Builder)new DoubleAnalysis.Builder()
                            .min(dblCounter.getMinValueSeen())
                            .max(dblCounter.getMaxValueSeen())
                            .mean(dblCounter.getSum() / (double)dblCounter.getCountTotal());
                    builder = (DoubleAnalysis.Builder)builder.sampleStdev(dblCounter.getSampleStdev());
                    builder = (DoubleAnalysis.Builder)builder.sampleVariance(dblCounter.getSampleVariance());
                    builder = (DoubleAnalysis.Builder)builder.countZero(dblCounter.getCountZero());
                    builder = (DoubleAnalysis.Builder)builder.countNegative(dblCounter.getCountNegative());
                    builder = (DoubleAnalysis.Builder)builder.countPositive(dblCounter.getCountPositive());
                    builder = (DoubleAnalysis.Builder)builder.countMinValue(dblCounter.getCountMinValue());
                    builder = (DoubleAnalysis.Builder)builder.countMaxValue(dblCounter.getCountMaxValue());
                    builder = (DoubleAnalysis.Builder)builder.countNaN(dblCounter.getCountNaN()).countTotal(dblCounter.getCountTotal());
                    DoubleAnalysis dblAnalysis = builder.build();
                    list.add(dblAnalysis);
                    minsMaxes[col][0] = dblCounter.getMinValueSeen();
                    minsMaxes[col][1] = dblCounter.getMaxValueSeen();
                    break;
                }
                case Categorical: {
                    CategoricalAnalysisCounter catCounter = (CategoricalAnalysisCounter)counters.get(col);
                    list.add(new CategoricalAnalysis(catCounter.getCounts()));
                    break;
                }
                case Time: {
                    // Time columns are backed by the same counter type as Long columns.
                    LongAnalysisCounter timeCounter = (LongAnalysisCounter)counters.get(col);
                    TimeAnalysis.Builder builder = (TimeAnalysis.Builder)new TimeAnalysis.Builder()
                            .min(timeCounter.getMinValueSeen())
                            .max(timeCounter.getMaxValueSeen())
                            .mean(timeCounter.getSum().doubleValue() / (double)timeCounter.getCountTotal());
                    builder = (TimeAnalysis.Builder)builder.sampleStdev(timeCounter.getSampleStdev());
                    builder = (TimeAnalysis.Builder)builder.sampleVariance(timeCounter.getSampleVariance());
                    builder = (TimeAnalysis.Builder)builder.countZero(timeCounter.getCountZero());
                    builder = (TimeAnalysis.Builder)builder.countNegative(timeCounter.getCountNegative());
                    builder = (TimeAnalysis.Builder)builder.countPositive(timeCounter.getCountPositive());
                    builder = (TimeAnalysis.Builder)builder.countMinValue(timeCounter.getCountMinValue());
                    builder = (TimeAnalysis.Builder)builder.countMaxValue(timeCounter.getCountMaxValue());
                    builder = (TimeAnalysis.Builder)builder.countTotal(timeCounter.getCountTotal());
                    TimeAnalysis timeAnalysis = builder.build();
                    list.add(timeAnalysis);
                    minsMaxes[col][0] = timeCounter.getMinValueSeen();
                    minsMaxes[col][1] = timeCounter.getMaxValueSeen();
                    break;
                }
                case Bytes: {
                    BytesAnalysisCounter bytesCounter = (BytesAnalysisCounter)counters.get(col);
                    list.add(new BytesAnalysis.Builder().countTotal(bytesCounter.getCountTotal()).build());
                    break;
                }
                default: {
                    throw new IllegalStateException("Unknown column type: " + type);
                }
            }
        }

        // Second pass: histogram counters, one per column, built from the collected ranges.
        List histogramCounters = (List)data.aggregate(null, (Function2)new HistogramAddFunction(maxHistogramBuckets, schema, minsMaxes), (Function2)new HistogramCombineFunction());
        for (int col = 0; col < list.size(); col++) {
            HistogramCounter hc = (HistogramCounter)histogramCounters.get(col);
            ColumnAnalysis analysis = (ColumnAnalysis)list.get(col);
            if (analysis instanceof IntegerAnalysis) {
                IntegerAnalysis target = (IntegerAnalysis)analysis;
                target.setHistogramBuckets(hc.getBins());
                target.setHistogramBucketCounts(hc.getCounts());
            } else if (analysis instanceof DoubleAnalysis) {
                DoubleAnalysis target = (DoubleAnalysis)analysis;
                target.setHistogramBuckets(hc.getBins());
                target.setHistogramBucketCounts(hc.getCounts());
            } else if (analysis instanceof LongAnalysis) {
                LongAnalysis target = (LongAnalysis)analysis;
                target.setHistogramBuckets(hc.getBins());
                target.setHistogramBucketCounts(hc.getCounts());
            } else if (analysis instanceof TimeAnalysis) {
                TimeAnalysis target = (TimeAnalysis)analysis;
                target.setHistogramBuckets(hc.getBins());
                target.setHistogramBucketCounts(hc.getCounts());
            } else if (analysis instanceof StringAnalysis) {
                StringAnalysis target = (StringAnalysis)analysis;
                target.setHistogramBuckets(hc.getBins());
                target.setHistogramBucketCounts(hc.getCounts());
            }
            // Any other analysis type (categorical, bytes) carries no histogram.
        }
        return new DataAnalysis(schema, list);
    }

    /**
     * Take a random sample of values from a single column.
     *
     * @param count      number of values requested from {@code takeSample}
     * @param columnName name of the column to sample from
     * @param schema     schema used to resolve {@code columnName} to a column index
     * @param data       records to sample from
     * @return values of the selected column, sampled with the "with replacement" flag
     *         set to {@code false}
     */
    public static List<Writable> sampleFromColumn(int count, String columnName, Schema schema, JavaRDD<List<Writable>> data) {
        int columnIndex = schema.getIndexOfColumn(columnName);
        return data.map((Function)new SelectColumnFunction(columnIndex)).takeSample(false, count);
    }

    /**
     * Take a random sample of values from a single column of sequence data.
     * <p>
     * The sequences are flat-mapped through {@code SequenceFlatMapFunction} and the result
     * is delegated to {@link #sampleFromColumn(int, String, Schema, JavaRDD)}.
     *
     * @param count        number of values requested
     * @param columnName   name of the column to sample from
     * @param schema       schema used to resolve {@code columnName}
     * @param sequenceData sequences to sample from
     * @return sampled values of the selected column
     */
    public static List<Writable> sampleFromColumnSequence(int count, String columnName, Schema schema, JavaRDD<List<List<Writable>>> sequenceData) {
        JavaRDD<List<Writable>> steps = (JavaRDD<List<Writable>>)sequenceData.flatMap((FlatMapFunction)new SequenceFlatMapFunction());
        return sampleFromColumn(count, columnName, schema, steps);
    }

    /**
     * Collect the distinct values of a single column to the driver.
     *
     * @param columnName name of the column to inspect
     * @param schema     schema used to resolve {@code columnName} to a column index
     * @param data       records to inspect
     * @return the collected result of {@code distinct()} on the selected column
     */
    public static List<Writable> getUnique(String columnName, Schema schema, JavaRDD<List<Writable>> data) {
        int columnIndex = schema.getIndexOfColumn(columnName);
        return data.map((Function)new SelectColumnFunction(columnIndex)).distinct().collect();
    }

    /**
     * Collect the distinct values of a single column of sequence data.
     * <p>
     * The sequences are flat-mapped through {@code SequenceFlatMapFunction} and the result
     * is delegated to {@link #getUnique(String, Schema, JavaRDD)}.
     *
     * @param columnName   name of the column to inspect
     * @param schema       schema used to resolve {@code columnName}
     * @param sequenceData sequences to inspect
     * @return distinct values of the selected column
     */
    public static List<Writable> getUniqueSequence(String columnName, Schema schema, JavaRDD<List<List<Writable>>> sequenceData) {
        JavaRDD<List<Writable>> steps = (JavaRDD<List<Writable>>)sequenceData.flatMap((FlatMapFunction)new SequenceFlatMapFunction());
        return getUnique(columnName, schema, steps);
    }

    /**
     * Take a random sample of whole records.
     *
     * @param count number of records requested from {@code takeSample}
     * @param data  records to sample from
     * @return the sampled records
     */
    public static List<List<Writable>> sample(int count, JavaRDD<List<Writable>> data) {
        // First argument of takeSample is the "with replacement" flag.
        final boolean withReplacement = false;
        return data.takeSample(withReplacement, count);
    }

    /**
     * Take a random sample of whole sequences.
     *
     * @param count number of sequences requested from {@code takeSample}
     * @param data  sequences to sample from
     * @return the sampled sequences
     */
    public static List<List<List<Writable>>> sampleSequence(int count, JavaRDD<List<List<Writable>>> data) {
        // First argument of takeSample is the "with replacement" flag.
        final boolean withReplacement = false;
        return data.takeSample(withReplacement, count);
    }

    /**
     * Compute the quality summary for a single column.
     * <p>
     * Each column type is aggregated with its type-specific add/merge function pair,
     * starting from an empty quality object. String columns additionally get a distinct
     * value count, which requires a second pass over the column; the column RDD is cached
     * for those two passes and unpersisted again before returning. Bytes columns are not
     * inspected at all and yield a fresh {@code BytesQuality}.
     *
     * @param meta      metadata of the column; its column type selects the aggregation
     * @param ithColumn values of that column
     * @return quality summary for the column
     * @throws RuntimeException if the column type is not handled here
     */
    private static ColumnQuality analyze(ColumnMetaData meta, JavaRDD<Writable> ithColumn) {
        switch (meta.getColumnType()) {
            case String: {
                // Two actions run on this column (distinct count + aggregate), so cache it,
                // and make sure the cache is released even if one of them fails.
                ithColumn.cache();
                try {
                    long countUnique = ithColumn.distinct().count();
                    StringQuality initialString = new StringQuality();
                    StringQuality stringQuality = (StringQuality)ithColumn.aggregate((Object)initialString, (Function2)new StringQualityAddFunction((StringMetaData)meta), (Function2)new StringQualityMergeFunction());
                    // Fold the unique count in by adding a quality object that is zero everywhere else.
                    return stringQuality.add(new StringQuality(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, countUnique));
                } finally {
                    ithColumn.unpersist();
                }
            }
            case Integer: {
                IntegerQuality initialInt = new IntegerQuality(0L, 0L, 0L, 0L, 0L);
                return (ColumnQuality)ithColumn.aggregate((Object)initialInt, (Function2)new IntegerQualityAddFunction((IntegerMetaData)meta), (Function2)new IntegerQualityMergeFunction());
            }
            case Long: {
                LongQuality initialLong = new LongQuality();
                return (ColumnQuality)ithColumn.aggregate((Object)initialLong, (Function2)new LongQualityAddFunction((LongMetaData)meta), (Function2)new LongQualityMergeFunction());
            }
            case Double: {
                DoubleQuality initialReal = new DoubleQuality();
                return (ColumnQuality)ithColumn.aggregate((Object)initialReal, (Function2)new RealQualityAddFunction((DoubleMetaData)meta), (Function2)new RealQualityMergeFunction());
            }
            case Categorical: {
                CategoricalQuality initialCat = new CategoricalQuality();
                return (ColumnQuality)ithColumn.aggregate((Object)initialCat, (Function2)new CategoricalQualityAddFunction((CategoricalMetaData)meta), (Function2)new CategoricalQualityMergeFunction());
            }
            case Time: {
                TimeQuality initTimeQuality = new TimeQuality();
                return (ColumnQuality)ithColumn.aggregate((Object)initTimeQuality, (Function2)new TimeQualityAddFunction((TimeMetaData)meta), (Function2)new TimeQualityMergeFunction());
            }
            case Bytes: {
                return new BytesQuality();
            }
            default: {
                throw new RuntimeException("Unknown or not implemented column type: " + meta.getColumnType());
            }
        }
    }

    /**
     * Analyze the data quality of sequence data.
     * <p>
     * The sequences are flat-mapped through {@code SequenceFlatMapFunction} and the result
     * is delegated to {@link #analyzeQuality(Schema, JavaRDD)}.
     *
     * @param schema schema describing the columns of each sequence step
     * @param data   sequences to analyze
     * @return quality analysis over the flat-mapped data
     */
    public static DataQualityAnalysis analyzeQualitySequence(Schema schema, JavaRDD<List<List<Writable>>> data) {
        JavaRDD<List<Writable>> steps = (JavaRDD<List<Writable>>)data.flatMap((FlatMapFunction)new SequenceFlatMapFunction());
        return analyzeQuality(schema, steps);
    }

    /**
     * Analyze the data quality of non-sequence data, one column at a time.
     * <p>
     * The input RDD is cached because every column triggers its own pass over it.
     *
     * @param schema schema describing the columns of {@code data}
     * @param data   records to analyze
     * @return quality analysis holding one entry per schema column, in schema order
     */
    public static DataQualityAnalysis analyzeQuality(Schema schema, JavaRDD<List<Writable>> data) {
        data.cache();
        int nColumns = schema.numColumns();
        ArrayList<ColumnQuality> qualities = new ArrayList<ColumnQuality>(nColumns);
        int col = 0;
        while (col < nColumns) {
            ColumnMetaData columnMeta = schema.getMetaData(col);
            JavaRDD<Writable> columnValues = (JavaRDD<Writable>)data.map((Function)new SelectColumnFunction(col));
            qualities.add(AnalyzeSpark.analyze(columnMeta, columnValues));
            col++;
        }
        return new DataQualityAnalysis(schema, qualities);
    }

    /**
     * Sample invalid values from a column, with {@code ignoreMissing} set to {@code false}.
     *
     * @param numToSample number of values requested
     * @param columnName  name of the column to sample from
     * @param schema      schema used to resolve the column and its metadata
     * @param data        records to sample from
     * @return the result of the five-argument overload called with
     *         {@code ignoreMissing = false}
     */
    public static List<Writable> sampleInvalidFromColumn(int numToSample, String columnName, Schema schema, JavaRDD<List<Writable>> data) {
        final boolean ignoreMissing = false;
        return sampleInvalidFromColumn(numToSample, columnName, schema, data, ignoreMissing);
    }

    /**
     * Sample values from a column after filtering it with
     * {@code FilterWritablesBySchemaFunction}.
     *
     * @param numToSample   number of values requested from {@code takeSample}
     * @param columnName    name of the column to sample from
     * @param schema        schema used to resolve the column index and its metadata
     * @param data          records to sample from
     * @param ignoreMissing passed through to {@code FilterWritablesBySchemaFunction}
     * @return sample of the values that pass the filter
     */
    public static List<Writable> sampleInvalidFromColumn(int numToSample, String columnName, Schema schema, JavaRDD<List<Writable>> data, boolean ignoreMissing) {
        int columnIndex = schema.getIndexOfColumn(columnName);
        JavaRDD columnValues = data.map((Function)new SelectColumnFunction(columnIndex));
        // NOTE(review): the literal false presumably makes the filter keep the invalid
        // entries rather than the valid ones - confirm against FilterWritablesBySchemaFunction.
        FilterWritablesBySchemaFunction filter = new FilterWritablesBySchemaFunction(schema.getMetaData(columnName), false, ignoreMissing);
        return columnValues.filter((Function)filter).takeSample(false, numToSample);
    }

    /**
     * Sample invalid values from a column of sequence data.
     * <p>
     * The sequences are flat-mapped through {@code SequenceFlatMapFunction} and the result
     * is delegated to the four-argument
     * {@link #sampleInvalidFromColumn(int, String, Schema, JavaRDD)}.
     *
     * @param numToSample number of values requested
     * @param columnName  name of the column to sample from
     * @param schema      schema used to resolve the column and its metadata
     * @param data        sequences to sample from
     * @return sampled invalid values of the selected column
     */
    public static List<Writable> sampleInvalidFromColumnSequence(int numToSample, String columnName, Schema schema, JavaRDD<List<List<Writable>>> data) {
        JavaRDD<List<Writable>> steps = (JavaRDD<List<Writable>>)data.flatMap((FlatMapFunction)new SequenceFlatMapFunction());
        return sampleInvalidFromColumn(numToSample, columnName, schema, steps);
    }
}

