/*
 * Decompiled with CFR 0.152.
 */
package com.quasiris.qsf.commons.text.normalizer;

import com.quasiris.qsf.commons.text.normalizer.NormalizerConfig;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilterFactory;
import org.apache.lucene.analysis.charfilter.MappingCharFilterFactory;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.synonym.SynonymGraphFilterFactory;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

/**
 * Normalizes free text into cleaned tokens using a Lucene {@link CustomAnalyzer} that is
 * assembled once, in the constructor, from a {@link NormalizerConfig}.
 *
 * <p>The analysis chain is: a series of char filters (line-break replacement, HTML stripping,
 * URL removal, removal of characters outside an allowed set, optional number removal, hyphen
 * and abbreviation clean-up, optional umlaut mapping), a whitespace tokenizer, and token filters
 * (lowercase, optional stop words, optional stemming, optional synonyms, optional duplicate
 * removal, trim). The filters are registered in exactly that order.
 */
public class TextNormalizerService {
    /** SPI name of the regex-replacing char filter used throughout the chain. */
    private static final String PATTERN_REPLACE = "patternReplace";

    /** Letters that are kept in addition to ASCII alphanumerics. */
    private static final String GERMAN_LETTERS = "\u00c4\u00d6\u00dc\u00e4\u00f6\u00fc\u00df";

    /** Matches runs of characters to drop when punctuation is kept. */
    private static final String DISALLOWED_KEEP_PUNCTUATION = "[^a-zA-Z0-9" + GERMAN_LETTERS + ".,!?:-]+";

    /** Matches runs of characters to drop when punctuation is not kept. */
    private static final String DISALLOWED_NO_PUNCTUATION = "[^a-zA-Z0-9" + GERMAN_LETTERS + "-]+";

    /** Matches literal backslash-escaped line breaks as well as real line breaks. */
    private static final String LINE_BREAKS = "\\\\n|\\\\r\\\\n|\\n|\\r\\n";

    /** Matches two word runs joined by a single '.', '|' or '!'; the groups are the two runs. */
    private static final String JOINED_WORDS = "(\\w+)[.|!]{1}(\\w+)";

    /** Matches http, https, ftp and file URLs. */
    private static final String URLS = "\\b(https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]";

    /** Matches a digit run optionally followed by one separator and further digits. */
    private static final String NUMBERS = "\\b(\\d+[.,-/]?\\d*?)\\b";

    /** Matches a hyphen that has whitespace on at least one side. */
    private static final String DANGLING_HYPHENS = "(?<=[\\s])-(?=[\\s])|(?<=[\\S])-(?=[\\s])|(?<=[\\s])-(?=[\\S])";

    /** Matches sequences of single characters separated by '-', '!', '.' or '_' (e.g. "a.b.c"). */
    private static final String SINGLE_CHAR_SEQUENCES = "\\b((?:[\\w|" + GERMAN_LETTERS + "]{1}[-!._]{1})+[\\w|" + GERMAN_LETTERS + "]{1})\\b";

    private final Analyzer analyzer;

    /**
     * Builds the analyzer described by {@code config}.
     *
     * @param config switches and resource paths that select the optional filters
     * @throws RuntimeException wrapping the {@link IOException} raised while the analyzer is built
     */
    public TextNormalizerService(NormalizerConfig config) {
        try {
            this.analyzer = buildAnalyzer(config);
        }
        catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Assembles the analysis chain. The registration order below is the processing order and is
     * significant: later patterns operate on the output of earlier ones.
     */
    private static Analyzer buildAnalyzer(NormalizerConfig config) throws IOException {
        String disallowedChars = config.isKeepPunctuation() ? DISALLOWED_KEEP_PUNCTUATION : DISALLOWED_NO_PUNCTUATION;

        // Resources such as stop word and synonym files are resolved relative to the working directory.
        CustomAnalyzer.Builder builder = CustomAnalyzer.builder(Paths.get("."));
        builder.withTokenizer("whitespace");

        replaceWithSpace(builder, LINE_BREAKS);
        // Join "foo.bar" / "foo!bar" with a hyphen so the two parts stay one token.
        builder.addCharFilter(PATTERN_REPLACE, "pattern", JOINED_WORDS, "replacement", "$1-$2");
        builder.addCharFilter(HTMLStripCharFilterFactory.class);
        replaceWithSpace(builder, URLS);
        replaceWithSpace(builder, disallowedChars);
        if (config.isRemoveNumbers()) {
            replaceWithSpace(builder, NUMBERS);
        }
        replaceWithSpace(builder, DANGLING_HYPHENS);
        replaceWithSpace(builder, SINGLE_CHAR_SEQUENCES);
        if (config.isNormalizeUmlaut()) {
            builder.addCharFilter(MappingCharFilterFactory.class, "mapping", "normalizer/umlaute-mapping.txt");
        }

        builder.addTokenFilter("lowercase");
        if (StringUtils.isNotEmpty(config.getStopwordFilepath())) {
            // Runs after lowercasing with ignoreCase=false, so stop words are compared in lowercase form.
            builder.addTokenFilter("stop", "ignoreCase", "false", "words", config.getStopwordFilepath());
        }
        if (config.isStem()) {
            builder.addTokenFilter("germanLightStem");
        }
        if (StringUtils.isNotEmpty(config.getSynonymsFilepath())) {
            builder.addTokenFilter(SynonymGraphFilterFactory.class, "synonyms", config.getSynonymsFilepath());
        }
        if (config.isRemoveDuplicates()) {
            builder.addTokenFilter("removeDuplicates");
        }
        builder.addTokenFilter("trim");
        return builder.build();
    }

    /** Registers a pattern-replace char filter that substitutes every match with a single space. */
    private static void replaceWithSpace(CustomAnalyzer.Builder builder, String pattern) throws IOException {
        builder.addCharFilter(PATTERN_REPLACE, "pattern", pattern, "replacement", " ");
    }

    /**
     * Runs {@code text} through the configured analyzer.
     *
     * @param text the text to normalize; may be {@code null} or empty
     * @return the emitted tokens in order; an empty list if {@code text} is {@code null} or empty,
     *         or if the analyzer fails with an {@link IOException}
     */
    public List<String> normalizeToken(String text) {
        if (StringUtils.isEmpty(text)) {
            return Collections.emptyList();
        }
        try {
            return TextNormalizerService.analyze(text, this.analyzer);
        }
        catch (IOException ignored) {
            // Best effort: callers of this method get "no tokens" instead of an exception.
            return Collections.emptyList();
        }
    }

    /**
     * Normalizes {@code text} and joins the resulting tokens with single spaces.
     *
     * @param text the text to normalize; may be {@code null} or empty
     * @return the space-joined tokens, or an empty string when there are none
     */
    public String normalize(String text) {
        List<String> tokens = this.normalizeToken(text);
        return String.join(" ", tokens);
    }

    /**
     * Trims {@code text} and collapses every run of space characters into one space. Only the
     * space character is collapsed; tabs and line breaks inside the text are left untouched.
     *
     * @param text the text to clean up; may be {@code null} or empty
     * @return the cleaned text, or {@code text} itself when it is {@code null} or empty
     */
    public static String normalizeWhitespace(String text) {
        if (StringUtils.isEmpty(text)) {
            return text;
        }
        return text.trim().replaceAll(" +", " ");
    }

    /**
     * Collects the terms that {@code analyzer} produces for {@code text}.
     *
     * @param text the text to analyze
     * @param analyzer the analyzer to run
     * @return the term text of every emitted token, in emission order
     * @throws IOException if the token stream fails
     */
    public static List<String> analyze(String text, Analyzer analyzer) throws IOException {
        List<String> result = new ArrayList<>();
        try (TokenStream tokenStream = analyzer.tokenStream("", text)) {
            CharTermAttribute attr = tokenStream.addAttribute(CharTermAttribute.class);
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                result.add(attr.toString());
            }
            // The TokenStream contract requires end() after the last token and before close().
            tokenStream.end();
        }
        return result;
    }
}

