/*
 * Decompiled with CFR 0.152.
 */
package org.broadinstitute.hellbender.tools.spark.sv.evidence;

import com.google.common.annotations.VisibleForTesting;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMSequenceDictionary;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import org.apache.spark.HashPartitioner;
import org.apache.spark.Partitioner;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.BetaFeature;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.engine.spark.GATKSparkTool;
import org.broadinstitute.hellbender.engine.spark.datasources.ReferenceMultiSparkSource;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.tools.spark.sv.utils.KmerAndCount;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVDUSTFilteredKmerizer;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVFileUtils;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVKmer;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVKmerLong;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVReferenceUtils;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVUtils;
import org.broadinstitute.hellbender.tools.spark.utils.HopscotchMap;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.gcs.BucketUtils;
import picard.cmdline.programgroups.ReferenceProgramGroup;
import scala.Tuple2;

/**
 * Spark tool that searches a reference for kmers (fixed-length substrings) that occur more
 * than {@link #MAX_KMER_FREQ} times and writes them to an output file. Optionally, every kmer
 * from an additional "high copy" FASTA file is merged into that list before it is written.
 */
@DocumentedFeature
@BetaFeature
@CommandLineProgramProperties(oneLineSummary="Identifies sequences that occur at high frequency in a reference", summary="Search the reference for kmers (fixed-length substrings) that occur more than a specified number of times, and list them to an output file.  The resulting output file is appropriate for use as the --kmers-to-ignore input file by the StructuralVariationDiscoveryPipelineSpark tool, which will ignore these kmers when trying to produce candidate reads for local assemblies.", programGroup=ReferenceProgramGroup.class)
public final class FindBadGenomicKmersSpark
extends GATKSparkTool {
    private static final long serialVersionUID = 1L;

    /** Record-length argument handed to {@code SVReferenceUtils.getReferenceBasesRDD}. */
    public static final int REF_RECORD_LEN = 10000;

    /** Records-per-partition argument handed to {@code SVReferenceUtils.getReferenceBasesRDD}. */
    public static final int REF_RECORDS_PER_PARTITION = 104;

    /** A kmer is reported only when its total count in the reference is strictly greater than this. */
    public static final int MAX_KMER_FREQ = 3;

    /**
     * Constructor argument for each per-partition {@link HopscotchMap} (presumably a capacity
     * hint -- confirm against HopscotchMap). Twice {@link #REF_RECORDS_PER_PARTITION}, i.e. 208.
     */
    private static final int KMER_MAP_CAPACITY = 2 * REF_RECORDS_PER_PARTITION;

    /**
     * Upper bound on the pre-sized capacity of the kmer list built by {@link #processFasta}.
     * The list still grows on demand; this only keeps a huge file from forcing a huge
     * (or negative, after int overflow) up-front allocation.
     */
    private static final int MAX_KMER_LIST_PRESIZE = 1 << 20;

    @Argument(doc="file for ubiquitous kmer output", shortName="O", fullName="output")
    private String outputFile;

    @Argument(doc="kmer size", fullName="k-size")
    private int kSize = 51;

    @Argument(doc="maximum kmer DUST score", fullName="kmer-max-dust-score")
    private int maxDUSTScore = 49;

    @Argument(doc="additional high copy kmers (mitochondrion, e.g.) fasta file name", fullName="high-copy-fasta", optional=true)
    private String highCopyFastaFilename;

    /** This tool cannot run without a reference. */
    @Override
    public boolean requiresReference() {
        return true;
    }

    /**
     * Finds the over-represented reference kmers, merges in the kmers of the optional high-copy
     * FASTA file, and writes the resulting list to {@code outputFile}.
     *
     * @param ctx the Spark context to run on
     */
    @Override
    protected void runTool(final JavaSparkContext ctx) {
        // The reads header may be absent; in that case no reads dictionary is passed along.
        final SAMFileHeader hdr = this.getHeaderForReads();
        final SAMSequenceDictionary dict = hdr == null ? null : hdr.getSequenceDictionary();

        final ReferenceMultiSparkSource referenceMultiSource = this.getReference();
        Collection<SVKmer> killList =
                findBadGenomicKmers(ctx, this.kSize, this.maxDUSTScore, referenceMultiSource, dict);
        if (this.highCopyFastaFilename != null) {
            killList = SVUtils.uniquify(killList,
                    processFasta(this.kSize, this.maxDUSTScore, this.highCopyFastaFilename));
        }
        SVFileUtils.writeKmersFile(this.outputFile, this.kSize, killList);
    }

    /**
     * Loads the reference bases into an RDD and collects the kmers that occur more than
     * {@link #MAX_KMER_FREQ} times in it.
     *
     * @param ctx          the Spark context to run on
     * @param kSize        kmer length
     * @param maxDUSTScore maximum DUST score passed on to the kmerizer
     * @param ref          the reference source
     * @param readsDict    sequence dictionary from the reads header; may be {@code null}
     * @return the over-represented kmers
     * @throws GATKException if the reference yields no sequence dictionary
     */
    @VisibleForTesting
    static List<SVKmer> findBadGenomicKmers(final JavaSparkContext ctx, final int kSize, final int maxDUSTScore,
                                            final ReferenceMultiSparkSource ref, final SAMSequenceDictionary readsDict) {
        final SAMSequenceDictionary dict = ref.getReferenceSequenceDictionary(readsDict);
        if (dict == null) {
            throw new GATKException("No reference dictionary available");
        }
        final JavaRDD<byte[]> refRDD =
                SVReferenceUtils.getReferenceBasesRDD(ctx, kSize, ref, dict, REF_RECORD_LEN, REF_RECORDS_PER_PARTITION);
        return collectUbiquitousKmersInReference(kSize, maxDUSTScore, MAX_KMER_FREQ, refRDD);
    }

    /**
     * Counts the kmers produced by {@code SVDUSTFilteredKmerizer.canonicalStream} over every
     * sequence in {@code refRDD} and returns those whose total count exceeds {@code maxKmerFreq}.
     *
     * <p>The count is done in two passes: each partition first tallies its own kmers, then the
     * (kmer, partial count) pairs are hash-partitioned by kmer so that all partial counts for a
     * given kmer meet in one partition, where they are summed and filtered.
     *
     * @param kSize        kmer length; must be positive
     * @param maxDUSTScore maximum DUST score passed on to the kmerizer; must be positive
     * @param maxKmerFreq  kmers with a total count strictly greater than this are returned; must be positive
     * @param refRDD       reference sequences as byte arrays; must not be {@code null}
     * @return the kmers whose total count exceeds {@code maxKmerFreq}
     */
    @VisibleForTesting
    static List<SVKmer> collectUbiquitousKmersInReference(final int kSize, final int maxDUSTScore,
                                                          final int maxKmerFreq, final JavaRDD<byte[]> refRDD) {
        Utils.nonNull(refRDD, "reference bases RDD is null");
        Utils.validateArg(kSize > 0, "provided kmer size is non positive");
        Utils.validateArg(maxDUSTScore > 0, "provided DUST filter score is non positive");
        Utils.validateArg(maxKmerFreq > 0, "provided kmer frequency is non positive");

        final int nPartitions = refRDD.getNumPartitions();
        return refRDD
                // Pass 1: per-partition tally of every kmer seen in that partition's sequences.
                .mapPartitions(seqItr -> {
                    final HopscotchMap<SVKmer, Integer, KmerAndCount> kmerCounts =
                            new HopscotchMap<>(KMER_MAP_CAPACITY);
                    while (seqItr.hasNext()) {
                        final byte[] seq = seqItr.next();
                        final SVKmer kmerSeed = new SVKmerLong();
                        SVDUSTFilteredKmerizer.canonicalStream(seq, kSize, maxDUSTScore, kmerSeed)
                                .forEach(kmer -> {
                                    final KmerAndCount entry = kmerCounts.find(kmer);
                                    if (entry == null) {
                                        kmerCounts.add(new KmerAndCount((SVKmerLong) kmer));
                                    } else {
                                        entry.bumpCount();
                                    }
                                });
                    }
                    return kmerCounts.iterator();
                })
                .mapToPair(entry -> new Tuple2<>(entry.getKey(), entry.getValue()))
                // Bring all partial counts for the same kmer into the same partition.
                .partitionBy(new HashPartitioner(nPartitions))
                // Pass 2: sum the partial counts and keep only the over-represented kmers.
                .mapPartitions(pairItr -> {
                    final HopscotchMap<SVKmer, Integer, KmerAndCount> kmerCounts =
                            new HopscotchMap<>(KMER_MAP_CAPACITY);
                    while (pairItr.hasNext()) {
                        final Tuple2<SVKmer, Integer> pair = pairItr.next();
                        final SVKmer kmer = pair._1();
                        final int count = pair._2();
                        final KmerAndCount entry = kmerCounts.find(kmer);
                        if (entry == null) {
                            kmerCounts.add(new KmerAndCount((SVKmerLong) kmer, count));
                        } else {
                            entry.bumpCount(count);
                        }
                    }
                    return kmerCounts.stream()
                            .filter(kmerAndCount -> kmerAndCount.grabCount() > maxKmerFreq)
                            .map(KmerAndCount::getKey)
                            .iterator();
                })
                .collect();
    }

    /**
     * Reads a FASTA file and returns the kmers that
     * {@code SVDUSTFilteredKmerizer.canonicalStream} produces for each of its sequences.
     *
     * <p>Lines starting with {@code '>'} are treated as record headers; all other lines are
     * concatenated into the current sequence. Blank lines are tolerated and contribute nothing.
     *
     * @param kSize         kmer length
     * @param maxDUSTScore  maximum DUST score passed on to the kmerizer
     * @param fastaFilename path of the FASTA file, opened through {@code BucketUtils}
     * @return the kmers of every sequence in the file, in file order
     * @throws GATKException if the file cannot be read
     */
    @VisibleForTesting
    static List<SVKmer> processFasta(final int kSize, final int maxDUSTScore, final String fastaFilename) {
        try (BufferedReader rdr = new BufferedReader(
                new InputStreamReader(BucketUtils.openFile(fastaFilename), StandardCharsets.UTF_8))) {
            // The file size bounds the number of kmers, but it is only a pre-sizing hint: clamp it
            // so that a multi-gigabyte file can neither overflow the int cast nor trigger a giant
            // allocation before a single kmer has been read.
            final long fileSize = BucketUtils.fileSize(fastaFilename);
            final int initialCapacity = (int) Math.max(0L, Math.min(fileSize, MAX_KMER_LIST_PRESIZE));
            final List<SVKmer> kmers = new ArrayList<>(initialCapacity);

            final StringBuilder sb = new StringBuilder();
            final SVKmer kmerSeed = new SVKmerLong();
            String line;
            while ((line = rdr.readLine()) != null) {
                // startsWith (rather than charAt(0)) keeps an empty line from throwing;
                // appending an empty line is a no-op.
                if (!line.startsWith(">")) {
                    sb.append(line);
                    continue;
                }
                // A header line ends the previous record: kmerize what has been accumulated.
                if (sb.length() > 0) {
                    SVDUSTFilteredKmerizer.canonicalStream(sb, kSize, maxDUSTScore, kmerSeed).forEach(kmers::add);
                    sb.setLength(0);
                }
            }
            // The last record has no header after it, so flush it explicitly.
            if (sb.length() > 0) {
                SVDUSTFilteredKmerizer.canonicalStream(sb, kSize, maxDUSTScore, kmerSeed).forEach(kmers::add);
            }
            return kmers;
        }
        catch (IOException ioe) {
            throw new GATKException("Can't read high copy kmers fasta file " + fastaFilename, ioe);
        }
    }
}

