/*
 * Decompiled with CFR 0.152.
 */
package com.metaeffekt.artifact.analysis.preprocess.filter;

import com.metaeffekt.artifact.analysis.preprocess.filter.TextFilterFunctions;
import com.metaeffekt.artifact.analysis.preprocess.filter.charfile.CharFileHelper;
import com.metaeffekt.artifact.analysis.preprocess.filter.charfile.CharSequenceFile;
import com.metaeffekt.artifact.analysis.preprocess.filter.matcher.StringMatchGeneratorAhoCorasick;
import com.metaeffekt.artifact.analysis.preprocess.filter.range.IntegerDiscreteRangeMap;
import com.metaeffekt.artifact.terms.model.NormalizationMetaData;
import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.UUID;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Reduces a text file to the regions reported by a wordlist-based matcher.
 *
 * <p>A {@link StringMatchGeneratorAhoCorasick} is built once from the configured wordlist and
 * {@code includeReach}. For every file loaded, the ranges reported by that generator are copied
 * into a {@link StringBuilder}, each followed by a single space, and the result is passed through
 * {@link TextFilterFunctions#removeInvalidUnicode}.
 *
 * <p>Instances are created through {@link #builder()}.
 */
public class TextSieve {
    /** Matcher that produces the ranges of the input that are kept. */
    protected final StringMatchGeneratorAhoCorasick generator;
    /** Reach value handed to the matcher (presumably the context kept around a match; confirm in the matcher). */
    protected final int includeReach;
    /** Words handed to the matcher; the builder supplies an unmodifiable list without null or empty entries. */
    protected final List<String> wordlist;

    /**
     * Creates a sieve from the builder's settings and constructs the matcher from them.
     *
     * @param builder source of the wordlist and the include reach
     */
    protected TextSieve(TextSieveBuilder builder) {
        this.includeReach = builder.includeReach;
        this.wordlist = builder.wordlist;
        this.generator = new StringMatchGeneratorAhoCorasick(this.wordlist, this.includeReach);
    }

    /**
     * Starts the configuration of a new sieve.
     *
     * @return a fresh builder
     */
    public static TextSieveBuilder builder() {
        return new TextSieveBuilder();
    }

    /**
     * Loads and filters a file using a scratch file; delegates to
     * {@link #loadFilteredToStringBuilder(File, Charset, File)}.
     *
     * @param file the file to filter
     * @param charset the charset used to decode the file
     * @param scratchFileDir directory in which the intermediate scratch file is placed
     * @return the filtered text
     * @throws IOException if reading the file or handling the scratch file fails
     */
    public StringBuilder loadFiltered(File file, Charset charset, File scratchFileDir) throws IOException {
        return this.loadFilteredToStringBuilder(file, charset, scratchFileDir);
    }

    /**
     * Filters a file through a {@link CharSequenceFile} backed by a scratch file.
     *
     * <p>The scratch file is named from a random UUID, the input file name (non-word characters
     * replaced by {@code _}, truncated to 128 characters) and the suffix {@code .jchars.bin}.
     *
     * @param file the file to filter
     * @param charset the charset used to decode the file
     * @param scratchFileDir directory in which the scratch file is placed
     * @return the filtered text after invalid unicode has been removed
     * @throws IOException if reading the file or handling the scratch file fails
     */
    protected StringBuilder loadFilteredToStringBuilder(File file, Charset charset, File scratchFileDir) throws IOException {
        // The random UUID prefix keeps scratch names distinct when the same input name is processed repeatedly.
        String sanitizedName = StringUtils.truncate(file.getName().replaceAll("\\W", "_"), 128);
        String scratchName = UUID.randomUUID().toString() + sanitizedName + ".jchars.bin";
        File scratchFile = new File(scratchFileDir, scratchName);

        StringBuilder filtered;
        // NOTE(review): this method never deletes scratchFile itself; whether closing CharFileHelper
        // removes it depends on the boolean passed to createDataFile -- confirm in CharFileHelper.
        try (CharFileHelper charFileHelper = CharFileHelper.createDataFile(file, scratchFile, charset, true);
             CharSequenceFile charSequenceFile = new CharSequenceFile(charFileHelper, false)) {
            // The ranges must be copied while the backing file is still open.
            filtered = collectMatches(charSequenceFile, this.generator.getMatchMap(charSequenceFile));
        }
        return TextFilterFunctions.removeInvalidUnicode(filtered);
    }

    /**
     * Loads and filters a file entirely in memory; delegates to
     * {@link #loadFilteredDirectlyToStringBuilder(File, Charset)}.
     *
     * @param file the file to filter
     * @param charset the charset used to decode the file
     * @return the filtered text
     * @throws IOException if the file cannot be read
     */
    public StringBuilder loadFilteredDirectly(File file, Charset charset) throws IOException {
        return this.loadFilteredDirectlyToStringBuilder(file, charset);
    }

    /**
     * Reads the whole file into a string and filters it without using a scratch file.
     *
     * @param file the file to filter
     * @param charset the charset used to decode the file
     * @return the filtered text after invalid unicode has been removed
     * @throws IOException if the file cannot be read
     */
    public StringBuilder loadFilteredDirectlyToStringBuilder(File file, Charset charset) throws IOException {
        String content = FileUtils.readFileToString(file, charset);
        StringBuilder filtered = collectMatches(content, this.generator.getMatchMap(content));
        return TextFilterFunctions.removeInvalidUnicode(filtered);
    }

    /**
     * Copies every range of {@code rangeMap} lying within {@code text} into a new builder,
     * appending a single space after each range.
     *
     * @param text the character data the ranges refer to
     * @param rangeMap ranges to keep; each entry is read as (start index, inclusive end index)
     * @return the concatenated ranges, each followed by a space
     */
    private static StringBuilder collectMatches(CharSequence text, IntegerDiscreteRangeMap<Integer> rangeMap) {
        StringBuilder collected = new StringBuilder();
        for (Map.Entry<Integer, Integer> range : rangeMap.subRangeSet(0, text.length() - 1)) {
            // The entry value is an inclusive end index, while append expects an exclusive end.
            collected.append(text, range.getKey(), range.getValue() + 1).append(" ");
        }
        return collected;
    }

    /**
     * Builder for {@link TextSieve}. A wordlist must be set before {@link #build()} is called;
     * the include reach defaults to 128.
     */
    public static class TextSieveBuilder {
        public static final Logger LOG = LoggerFactory.getLogger(TextSieveBuilder.class);
        private List<String> wordlist = null;
        private int includeReach = 128;

        /**
         * Sets the reach value passed to the matcher.
         *
         * @param includeReach the reach value
         * @return this builder
         */
        public TextSieveBuilder includeReach(int includeReach) {
            this.includeReach = includeReach;
            return this;
        }

        /**
         * Sets the wordlist from a copy of the given collection. Null and empty words are
         * dropped, and a warning is logged for each kind that was removed.
         *
         * @param wordlist the words to match; must not be null
         * @return this builder
         * @throws NullPointerException if {@code wordlist} is null
         */
        public TextSieveBuilder wordlist(Collection<String> wordlist) {
            Objects.requireNonNull(wordlist, "wordlist must not be null");
            List<String> intermediate = new ArrayList<>(wordlist);
            // Nulls are removed first so that String::isEmpty is never invoked on a null element.
            if (intermediate.removeIf(Objects::isNull)) {
                LOG.warn("Removed null word from wordlist while loading into TextSieveBuilder.");
            }
            if (intermediate.removeIf(String::isEmpty)) {
                LOG.warn("Removed empty word from wordlist while loading into TextSieveBuilder.");
            }
            this.wordlist = Collections.unmodifiableList(intermediate);
            return this;
        }

        /**
         * Sets the wordlist from the wordlist carried by the given normalization metadata.
         *
         * @param normalizationMetaData metadata providing the wordlist; must not be null
         * @return this builder
         * @throws NullPointerException if {@code normalizationMetaData} or its wordlist is null
         */
        public TextSieveBuilder wordlist(NormalizationMetaData normalizationMetaData) {
            Objects.requireNonNull(normalizationMetaData, "normalizationMetaData must not be null");
            return this.wordlist(Collections.unmodifiableCollection(normalizationMetaData.getWordlist()));
        }

        /**
         * Creates the sieve from the current settings.
         *
         * @return a new {@link TextSieve}
         * @throws IllegalArgumentException if no wordlist was set, or if it is empty after null
         *         and empty words were removed
         */
        public TextSieve build() {
            if (this.wordlist == null) {
                throw new IllegalArgumentException("Can't build: wordlist is unset.");
            }
            if (this.wordlist.isEmpty()) {
                throw new IllegalArgumentException("Can't build: wordlist is empty.");
            }
            return new TextSieve(this);
        }
    }
}

