/*
 * Decompiled with CFR 0.152.
 */
package com.metaeffekt.artifact.analysis.preprocess.filter.wordlist;

import com.metaeffekt.artifact.terms.model.Evidence;
import com.metaeffekt.artifact.terms.model.Masks;
import com.metaeffekt.artifact.terms.model.MatchSet;
import com.metaeffekt.artifact.terms.model.NormalizationMetaData;
import com.metaeffekt.artifact.terms.model.TermsMetaData;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.LinkOption;
import java.nio.file.OpenOption;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentSkipListSet;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Collects words from license text files and from the textual fields of {@link TermsMetaData}
 * entries into a single wordlist.
 *
 * <p>All members are static; the class keeps no state between calls.
 */
public class WordlistGenerator {
    private static final Logger log = LoggerFactory.getLogger(WordlistGenerator.class);

    /**
     * Delimiters at which text is broken into words: space separators, line breaks, punctuation,
     * tabs, modifier symbols, math symbols and format characters.
     */
    protected static final Pattern SPLIT_PATTERN = Pattern.compile("(\\p{Zs}|\\R|\\p{P}|\t|\\p{Sk}|\\p{Sm}|\\p{Cf})");

    /** Matches words that consist of digits only; compiled once instead of once per word. */
    private static final Pattern DIGITS_ONLY = Pattern.compile("\\d+");

    /** Words with fewer UTF-16 code units than this are skipped unless they contain an ideograph. */
    private static final int MINIMUM_WORD_LENGTH = 4;

    /** Line length (in UTF-16 code units) from which a line is reported as "very long". */
    private static final int VERY_LONG_LINE_LENGTH = 8192;

    /**
     * Creates the wordlist for all license metadata held by the given normalization metadata.
     *
     * @param normalizationMetaData source of the license metadata map; must not be null
     * @return the words as produced by {@link #createWordlist(Map)}
     * @throws IOException if a license file cannot be read
     */
    public static Collection<String> createWordlist(NormalizationMetaData normalizationMetaData) throws IOException {
        return createWordlist(normalizationMetaData.getLicenseMetaDataMap());
    }

    /**
     * Creates the wordlist from license texts and metadata.
     *
     * <p>The result starts with the license text words (lower-cased, most frequent first) followed
     * by the metadata words (original case, unspecified order). A word that occurs in both sources
     * appears twice.
     *
     * @param licenseMetaDataMap metadata keyed by license name; must not be null
     * @return a new mutable list without null or empty entries
     * @throws IOException if a license file cannot be read
     */
    public static List<String> createWordlist(Map<String, TermsMetaData> licenseMetaDataMap) throws IOException {
        List<String> wordlist = new ArrayList<>();
        wordlist.addAll(getLicenseTextWords(licenseMetaDataMap));
        wordlist.addAll(getMetadataWords(licenseMetaDataMap));
        wordlist.removeIf(word -> word == null || word.isEmpty());
        return wordlist;
    }

    /**
     * Collects words from the metadata itself rather than from the license files.
     *
     * <p>Names (alternative names, canonical name and its history, SPDX identifier, short name) are
     * taken over as a whole. Mask matches, mapping keys and values, and evidence matches, excludes
     * and one-of match sets are split with {@link #getSplit(String)} first. Everything rejected by
     * {@link #shouldSkip(String)} is dropped. Case is left untouched.
     *
     * @param licenseMetaDataMap metadata keyed by license name; must not be null
     * @return the distinct relevant words
     */
    public static Collection<String> getMetadataWords(Map<String, TermsMetaData> licenseMetaDataMap) {
        Collection<String> names = new HashSet<>();
        Collection<String> textPieces = new HashSet<>();

        for (TermsMetaData termsMetaData : licenseMetaDataMap.values()) {
            addAllIfPresent(names, termsMetaData.getAlternativeNames());
            addAllIfPresent(names, termsMetaData.getCanonicalNameHistory());
            // null names are tolerated here; shouldSkip removes them below
            names.add(termsMetaData.getCanonicalName());
            names.add(termsMetaData.getSpdxIdentifier());
            names.add(termsMetaData.getShortName());

            Masks masks = termsMetaData.getMasks();
            if (masks != null) {
                addAllIfPresent(textPieces, masks.getMatches());
            }

            Map<String, String> mappings = termsMetaData.getMappings();
            if (mappings != null) {
                textPieces.addAll(mappings.keySet());
                textPieces.addAll(mappings.values());
            }

            Evidence evidence = termsMetaData.getEvidence();
            if (evidence != null) {
                addAllIfPresent(textPieces, evidence.getMatches());
                addAllIfPresent(textPieces, evidence.getExcludes());
                List<MatchSet> oneOfMatchSetList = evidence.getOneOf();
                if (oneOfMatchSetList != null) {
                    for (MatchSet matchSet : oneOfMatchSetList) {
                        if (matchSet != null) {
                            addAllIfPresent(textPieces, matchSet.getMatches());
                        }
                    }
                }
            }
        }

        Collection<String> relevantWords = new HashSet<>();
        for (String name : names) {
            if (!shouldSkip(name)) {
                relevantWords.add(name);
            }
        }
        for (String text : textPieces) {
            if (shouldSkip(text)) {
                continue;
            }
            for (String word : getSplit(text)) {
                if (!shouldSkip(word)) {
                    relevantWords.add(word);
                }
            }
        }
        return relevantWords;
    }

    /** Adds all elements of {@code source} to {@code target}; a null {@code source} adds nothing. */
    private static void addAllIfPresent(Collection<String> target, Collection<? extends String> source) {
        if (source != null) {
            target.addAll(source);
        }
    }

    /**
     * Splits text at every match of {@link #SPLIT_PATTERN}.
     *
     * @param toSplit the text to split; must not be null
     * @return the pieces between delimiters; may contain empty strings where delimiters are adjacent
     */
    public static String[] getSplit(String toSplit) {
        return SPLIT_PATTERN.split(toSplit);
    }

    /**
     * Decides whether a word is left out of the wordlist.
     *
     * @param word the candidate; may be null
     * @return {@code true} for null or empty words, for words shorter than four code units that
     *         contain no ideographic character, and for longer words consisting of digits only
     */
    public static boolean shouldSkip(String word) {
        if (word == null || word.isEmpty()) {
            return true;
        }
        if (word.length() < MINIMUM_WORD_LENGTH) {
            // short words are only meaningful in ideographic scripts
            return word.codePoints().noneMatch(Character::isIdeographic);
        }
        return DIGITS_ONLY.matcher(word).matches();
    }

    /**
     * Turns word counts into a list of words ordered by descending count.
     *
     * <p>Removes the empty-string key from the passed map as a side effect.
     *
     * @param wordToCount occurrences per word; must be mutable and not null
     * @return the words, most frequent first; ties keep the iteration order of the map
     */
    public static List<String> finalizeWordCollection(Map<String, AtomicInteger> wordToCount) {
        wordToCount.remove("");
        List<String> sortedWords = wordToCount.entrySet().stream()
                // Integer.compare instead of subtraction: cannot overflow
                .sorted((left, right) -> Integer.compare(right.getValue().get(), left.getValue().get()))
                .map(Map.Entry::getKey)
                .collect(Collectors.toList());

        if (log.isDebugEnabled()) {
            List<String> shortWords = sortedWords.stream()
                    .filter(word -> word.length() <= 1)
                    .collect(Collectors.toList());
            log.debug("Number of short \"words\": {}", shortWords.size());
            log.debug("{}", shortWords);
            log.debug("Number of words generated from license texts: {}", sortedWords.size());
        }
        return sortedWords;
    }

    /**
     * Reads every license file referenced by the metadata (as UTF-8), splits the text with
     * {@link #getSplit(String)}, lower-cases the pieces and counts those not rejected by
     * {@link #shouldSkip(String)}.
     *
     * <p>Entries without a license file path, with a blank path, or whose path is not a regular
     * file are skipped; the latter is reported at error level.
     *
     * @param licenseMetaDataMap metadata keyed by license name; must not be null
     * @return the words ordered by descending number of occurrences
     * @throws IOException if an existing license file cannot be read
     */
    public static Collection<String> getLicenseTextWords(Map<String, TermsMetaData> licenseMetaDataMap) throws IOException {
        Map<String, AtomicInteger> wordToCount = new ConcurrentHashMap<>();
        int longestLineOverall = 0;

        for (Map.Entry<String, TermsMetaData> entry : licenseMetaDataMap.entrySet()) {
            String licenseName = entry.getKey();
            String licenseFilePathString = entry.getValue().getLicenseFile();
            if (licenseFilePathString == null) {
                continue;
            }
            if (StringUtils.isBlank(licenseFilePathString)) {
                log.debug("Non-null but blank licenseFilePathString in '{}'. Skipping.", licenseName);
                continue;
            }

            File licenseFile = new File(licenseFilePathString);
            if (!Files.isRegularFile(licenseFile.toPath())) {
                log.error("Failed to include [{}]'s license at [{}]: Not a file.", licenseName, licenseFile);
                // skip instead of falling through to an open that is bound to fail
                continue;
            }

            String licenseText;
            try (InputStream inputStream = Files.newInputStream(licenseFile.toPath())) {
                licenseText = IOUtils.toString(inputStream, StandardCharsets.UTF_8);
            }

            int longestLine = Arrays.stream(licenseText.split("\n")).mapToInt(String::length).max().orElse(0);
            longestLineOverall = Math.max(longestLineOverall, longestLine);
            if (longestLine >= VERY_LONG_LINE_LENGTH) {
                log.trace("very long line(s) in file [{}].", licenseFilePathString);
            }
            if (licenseText.contains("\ufffd")) {
                log.debug("Unicode 'REPLACEMENT CHARACTER' found in license text at [{}].", licenseFilePathString);
            }

            for (String rawWord : getSplit(licenseText)) {
                String word = rawWord.toLowerCase(Locale.ENGLISH);
                if (shouldSkip(word)) {
                    continue;
                }
                wordToCount.computeIfAbsent(word, key -> new AtomicInteger()).incrementAndGet();
            }
        }

        log.debug("Longest line found: {} code units.", longestLineOverall);
        return finalizeWordCollection(wordToCount);
    }
}

