/*
 * Decompiled with CFR 0.152.
 */
package org.broadinstitute.hellbender.tools.copynumber;

import htsjdk.samtools.SAMSequenceDictionary;
import java.io.File;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.ListIterator;
import java.util.stream.Collectors;
import org.apache.commons.math3.linear.Array2DRowRealMatrix;
import org.apache.commons.math3.linear.RealMatrix;
import org.apache.logging.log4j.Logger;
import org.apache.spark.api.java.JavaSparkContext;
import org.broadinstitute.barclay.argparser.Advanced;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hdf5.HDF5Library;
import org.broadinstitute.hellbender.cmdline.programgroups.CopyNumberProgramGroup;
import org.broadinstitute.hellbender.engine.spark.SparkCommandLineProgram;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.copynumber.arguments.CopyNumberArgumentValidationUtils;
import org.broadinstitute.hellbender.tools.copynumber.denoising.HDF5SVDReadCountPanelOfNormals;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AnnotatedIntervalCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SimpleCountCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleLocatableMetadata;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.annotation.CopyNumberAnnotations;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;

@CommandLineProgramProperties(summary="Creates a panel of normals for read-count denoising given the read counts for samples in the panel", oneLineSummary="Creates a panel of normals for read-count denoising", programGroup=CopyNumberProgramGroup.class)
@DocumentedFeature
public final class CreateReadCountPanelOfNormals
extends SparkCommandLineProgram {
    private static final long serialVersionUID = 1L;
    private static final double DEFAULT_MINIMUM_INTERVAL_MEDIAN_PERCENTILE = 10.0;
    private static final double DEFAULT_MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE = 5.0;
    private static final double DEFAULT_MAXIMUM_ZEROS_IN_INTERVAL_PERCENTAGE = 5.0;
    private static final double DEFAULT_EXTREME_SAMPLE_MEDIAN_PERCENTILE = 2.5;
    private static final boolean DEFAULT_DO_IMPUTE_ZEROS = true;
    private static final double DEFAULT_EXTREME_OUTLIER_TRUNCATION_PERCENTILE = 0.1;
    private static final int DEFAULT_NUMBER_OF_EIGENSAMPLES = 20;
    private static final int DEFAULT_CHUNK_DIVISOR = 16;
    private static final int DEFAULT_MAXIMUM_CHUNK_SIZE = 0xFFFFFF;
    public static final String MINIMUM_INTERVAL_MEDIAN_PERCENTILE_LONG_NAME = "minimum-interval-median-percentile";
    public static final String MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE_LONG_NAME = "maximum-zeros-in-sample-percentage";
    public static final String MAXIMUM_ZEROS_IN_INTERVAL_PERCENTAGE_LONG_NAME = "maximum-zeros-in-interval-percentage";
    public static final String EXTREME_SAMPLE_MEDIAN_PERCENTILE_LONG_NAME = "extreme-sample-median-percentile";
    public static final String IMPUTE_ZEROS_LONG_NAME = "do-impute-zeros";
    public static final String EXTREME_OUTLIER_TRUNCATION_PERCENTILE_LONG_NAME = "extreme-outlier-truncation-percentile";
    public static final String MAXIMUM_CHUNK_SIZE = "maximum-chunk-size";
    @Argument(doc="Input TSV or HDF5 files containing integer read counts in genomic intervals for all samples in the panel of normals (output of CollectReadCounts).  Intervals must be identical and in the same order for all samples.", fullName="input", shortName="I", minElements=1)
    private List<File> inputReadCountFiles = new ArrayList<File>();
    @Argument(doc="Input file containing annotations for GC content in genomic intervals (output of AnnotateIntervals).  If provided, explicit GC correction will be performed before performing SVD.  Intervals must be identical to and in the same order as those in the input read-counts files.", fullName="annotated-intervals", optional=true)
    private File inputAnnotatedIntervalsFile = null;
    @Argument(doc="Output file for the panel of normals.", fullName="output", shortName="O")
    private File outputPanelOfNormalsFile;
    @Argument(doc="Genomic intervals with a median (across samples) of fractional coverage (optionally corrected for GC bias) less than or equal to this percentile are filtered out.  (This is the first filter applied.)", fullName="minimum-interval-median-percentile", minValue=0.0, maxValue=100.0, optional=true)
    private double minimumIntervalMedianPercentile = 10.0;
    @Argument(doc="Samples with a fraction of zero-coverage genomic intervals greater than or equal to this percentage are filtered out.  (This is the second filter applied.)", fullName="maximum-zeros-in-sample-percentage", minValue=0.0, maxValue=100.0, optional=true)
    private double maximumZerosInSamplePercentage = 5.0;
    @Argument(doc="Genomic intervals with a fraction of zero-coverage samples greater than or equal to this percentage are filtered out.  (This is the third filter applied.)", fullName="maximum-zeros-in-interval-percentage", minValue=0.0, maxValue=100.0, optional=true)
    private double maximumZerosInIntervalPercentage = 5.0;
    @Argument(doc="Samples with a median (across genomic intervals) of fractional coverage normalized by genomic-interval medians  strictly below this percentile or strictly above the complementary percentile are filtered out.  (This is the fourth filter applied.)", fullName="extreme-sample-median-percentile", minValue=0.0, maxValue=50.0, optional=true)
    private double extremeSampleMedianPercentile = 2.5;
    @Argument(doc="If true, impute zero-coverage values as the median of the non-zero values in the corresponding interval.  (This is applied after all filters.)", fullName="do-impute-zeros", optional=true)
    private boolean doImputeZeros = true;
    @Argument(doc="Fractional coverages normalized by genomic-interval medians that are strictly below this percentile or strictly above the complementary percentile are set to the corresponding percentile value.  (This is applied after all filters and imputation.)", fullName="extreme-outlier-truncation-percentile", minValue=0.0, maxValue=50.0, optional=true)
    private double extremeOutlierTruncationPercentile = 0.1;
    @Argument(doc="Number of eigensamples to use for truncated SVD and to store in the panel of normals.  The number of samples retained after filtering will be used instead if it is smaller than this.", fullName="number-of-eigensamples", minValue=0.0, optional=true)
    private int numEigensamplesRequested = 20;
    @Advanced
    @Argument(doc="Maximum HDF5 matrix chunk size.  Large matrices written to HDF5 are chunked into equally sized subsets of rows (plus a subset containing the remainder, if necessary) to avoid a hard limit in Java HDF5 on the number of elements in a matrix.  However, since a single row is not allowed to be split across multiple chunks, the number of columns must be less than the maximum number of values in each chunk.  Decreasing this number will reduce heap usage when writing chunks.", fullName="maximum-chunk-size", minValue=1.0, maxValue=2.68435455E8, optional=true)
    private int maximumChunkSize = 0xFFFFFF;

    @Override
    protected void runPipeline(JavaSparkContext ctx) {
        if (!new HDF5Library().load(null)) {
            throw new UserException.HardwareFeatureException("Cannot load the required HDF5 library. HDF5 is currently supported on x86-64 architecture and Linux or OSX systems.");
        }
        this.validateArguments();
        List<String> sampleFilenames = this.inputReadCountFiles.stream().map(File::getAbsolutePath).collect(Collectors.toList());
        File firstReadCountFile = this.inputReadCountFiles.get(0);
        this.logger.info(String.format("Retrieving intervals from first read-counts file (%s)...", firstReadCountFile));
        SimpleCountCollection firstReadCounts = SimpleCountCollection.read(firstReadCountFile);
        SAMSequenceDictionary sequenceDictionary = ((SampleLocatableMetadata)firstReadCounts.getMetadata()).getSequenceDictionary();
        List<SimpleInterval> intervals = firstReadCounts.getIntervals();
        Utils.validateArg(firstReadCounts.size() <= this.maximumChunkSize, String.format("The number of intervals (%d) in each read-counts file cannot exceed the maximum chunk size (%d).", firstReadCounts.size(), this.maximumChunkSize));
        AnnotatedIntervalCollection annotatedIntervals = CopyNumberArgumentValidationUtils.validateAnnotatedIntervals(this.inputAnnotatedIntervalsFile, firstReadCounts, this.logger);
        double[] intervalGCContent = annotatedIntervals == null ? null : annotatedIntervals.getRecords().stream().mapToDouble(i -> i.getAnnotationMap().getValue(CopyNumberAnnotations.GC_CONTENT)).toArray();
        RealMatrix readCountMatrix = CreateReadCountPanelOfNormals.constructReadCountMatrix(this.logger, this.inputReadCountFiles, sequenceDictionary, intervals);
        this.logger.info("Creating the panel of normals...");
        HDF5SVDReadCountPanelOfNormals.create(this.outputPanelOfNormalsFile, this.getCommandLine(), sequenceDictionary, readCountMatrix, sampleFilenames, intervals, intervalGCContent, this.minimumIntervalMedianPercentile, this.maximumZerosInSamplePercentage, this.maximumZerosInIntervalPercentage, this.extremeSampleMedianPercentile, this.doImputeZeros, this.extremeOutlierTruncationPercentile, this.numEigensamplesRequested, this.maximumChunkSize, ctx);
        this.logger.info(String.format("%s complete.", this.getClass().getSimpleName()));
    }

    private void validateArguments() {
        if (this.numEigensamplesRequested > this.inputReadCountFiles.size()) {
            this.logger.warn(String.format("Number of eigensamples (%d) is greater than the number of input samples (%d); the number of samples retained after filtering will be used instead.", this.numEigensamplesRequested, this.inputReadCountFiles.size()));
        }
        Utils.validateArg(this.inputReadCountFiles.size() == new HashSet<File>(this.inputReadCountFiles).size(), "List of input read-counts files cannot contain duplicates.");
        this.inputReadCountFiles.forEach(xva$0 -> CopyNumberArgumentValidationUtils.validateInputs(xva$0));
        CopyNumberArgumentValidationUtils.validateInputs(this.inputAnnotatedIntervalsFile);
        CopyNumberArgumentValidationUtils.validateOutputFiles(this.outputPanelOfNormalsFile);
    }

    private static RealMatrix constructReadCountMatrix(Logger logger, List<File> inputReadCountFiles, SAMSequenceDictionary sequenceDictionary, List<SimpleInterval> intervals) {
        logger.info("Validating and aggregating input read-counts files...");
        int numSamples = inputReadCountFiles.size();
        int numIntervals = intervals.size();
        Array2DRowRealMatrix readCountMatrix = new Array2DRowRealMatrix(numSamples, numIntervals);
        ListIterator<File> inputReadCountFilesIterator = inputReadCountFiles.listIterator();
        while (inputReadCountFilesIterator.hasNext()) {
            int sampleIndex = inputReadCountFilesIterator.nextIndex();
            File inputReadCountFile = inputReadCountFilesIterator.next();
            logger.info(String.format("Aggregating read-counts file %s (%d / %d)", inputReadCountFile, sampleIndex + 1, numSamples));
            SimpleCountCollection readCounts = SimpleCountCollection.read(inputReadCountFile);
            if (!CopyNumberArgumentValidationUtils.isSameDictionary(((SampleLocatableMetadata)readCounts.getMetadata()).getSequenceDictionary(), sequenceDictionary)) {
                logger.warn(String.format("Sequence dictionary for read-counts file %s does not match those in other read-counts files.", inputReadCountFile));
            }
            Utils.validateArg(readCounts.getIntervals().equals(intervals), String.format("Intervals for read-counts file %s do not match those in other read-counts files.", inputReadCountFile));
            readCountMatrix.setRow(sampleIndex, readCounts.getCounts());
        }
        return readCountMatrix;
    }
}

