/*
 * Decompiled with CFR 0.152.
 */
package org.predict4all.nlp.words;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.text.DecimalFormat;
import java.util.Collection;
import java.util.List;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.LongAdder;
import java.util.function.Consumer;
import java.util.stream.Collectors;
import org.predict4all.nlp.Tag;
import org.predict4all.nlp.io.TokenFileInputStream;
import org.predict4all.nlp.io.TokenFileOutputStream;
import org.predict4all.nlp.io.WordFileOutputStream;
import org.predict4all.nlp.language.BaseWordDictionary;
import org.predict4all.nlp.language.LanguageModel;
import org.predict4all.nlp.parser.token.TagToken;
import org.predict4all.nlp.parser.token.Token;
import org.predict4all.nlp.parser.token.WordToken;
import org.predict4all.nlp.trainer.TrainerTask;
import org.predict4all.nlp.trainer.configuration.TrainingConfiguration;
import org.predict4all.nlp.trainer.corpus.AbstractTrainingDocument;
import org.predict4all.nlp.trainer.corpus.TrainingCorpus;
import org.predict4all.nlp.trainer.step.TrainingStep;
import org.predict4all.nlp.utils.Predict4AllUtils;
import org.predict4all.nlp.utils.progressindicator.LoggingProgressIndicator;
import org.predict4all.nlp.utils.progressindicator.ProgressIndicator;
import org.predict4all.nlp.words.WordDictionary;
import org.predict4all.nlp.words.model.Word;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class WordDictionaryGenerator {
    private static final Logger LOGGER = LoggerFactory.getLogger(WordDictionaryGenerator.class);
    public static final DecimalFormat PERCENT_FORMAT = new DecimalFormat("##0.00");
    public static final DecimalFormat COUNT_FORMAT = new DecimalFormat("###,###,###,###,###,###,###,###");
    private final LanguageModel languageModel;
    private final TrainingConfiguration trainingConfiguration;
    private final BaseWordDictionary baseWordDictionary;

    public WordDictionaryGenerator(LanguageModel languageModel, TrainingConfiguration trainingConfiguration) throws IOException {
        this.languageModel = Predict4AllUtils.checkNull(languageModel, "WordsConverter needs a language model, languageModel shouldn't be null");
        this.trainingConfiguration = Predict4AllUtils.checkNull(trainingConfiguration, "WordsConverter needs a training configuration, trainingConfiguration shouldn't be null");
        this.baseWordDictionary = languageModel.getBaseWordDictionary(trainingConfiguration);
        if (!this.baseWordDictionary.isInitialized()) {
            this.baseWordDictionary.initialize();
        }
    }

    public void createWordDictionary(TrainingCorpus corpus, Consumer<List<TrainerTask>> taskExecutor, File dictionaryOuputFile) throws FileNotFoundException, IOException {
        corpus.initStep(TrainingStep.WORDS_DICTIONARY);
        LoggingProgressIndicator progressIndicator = new LoggingProgressIndicator("Words converter and dictionary generation", corpus.getTotalCountFor(TrainingStep.WORDS_DICTIONARY) * 2);
        ConcurrentHashMap wordCounts = new ConcurrentHashMap();
        taskExecutor.accept(corpus.getDocuments(TrainingStep.WORDS_DICTIONARY).stream().map(d -> new CountWordTask(wordCounts, progressIndicator, (AbstractTrainingDocument)d)).collect(Collectors.toList()));
        long totalWordCount = wordCounts.values().parallelStream().mapToLong(LongAdder::sum).sum();
        LOGGER.info("Word count created, found {} differents words (before validation), corpus contained {} total words", (Object)wordCounts.size(), (Object)COUNT_FORMAT.format(totalWordCount));
        ConcurrentHashMap<String, Boolean> finalWordSet = new ConcurrentHashMap<String, Boolean>();
        taskExecutor.accept(corpus.getDocuments(TrainingStep.WORDS_DICTIONARY).stream().map(d -> new ReplaceInvalidWordsAndCreateDictionaryTask(progressIndicator, (AbstractTrainingDocument)d, wordCounts, finalWordSet, totalWordCount)).collect(Collectors.toList()));
        LOGGER.info("Detected {} different valid words in corpus", (Object)finalWordSet.size());
        if (dictionaryOuputFile != null) {
            long start = System.currentTimeMillis();
            String dictionaryID = UUID.randomUUID().toString();
            WordDictionary generatedWordDictionary = new WordDictionary(this.languageModel, dictionaryID);
            finalWordSet.forEach((word, count) -> generatedWordDictionary.putWordTraining((String)word));
            generatedWordDictionary.compact();
            LOGGER.info("Will save word dictionary to {}", (Object)dictionaryOuputFile);
            try (WordFileOutputStream wfos = new WordFileOutputStream(dictionaryOuputFile);){
                Collection<Word> words = generatedWordDictionary.getAllWords();
                wfos.writeUTF(dictionaryID);
                wfos.writeInt(generatedWordDictionary.getIDGeneratorState());
                for (Word word2 : words) {
                    if (!word2.isValidForSaving()) continue;
                    wfos.writeWord(word2);
                }
            }
            LOGGER.info("Word dictionary saved to {}, {} words added to dictionary and saved in {} ms", new Object[]{dictionaryOuputFile, generatedWordDictionary.size(), System.currentTimeMillis() - start});
        } else {
            LOGGER.info("Word dictionary will not be saved because dictionaryOutputFile is null");
        }
    }

    private Token getReplacingTokenFor(Token token, boolean sentenceStart, ConcurrentHashMap<String, LongAdder> wordCounts, ConcurrentHashMap<String, Boolean> finalWordSet, long totalWordCount) {
        if (token.isWord()) {
            String replacingWord;
            LongAdder countAdder = wordCounts.get(replacingWord = this.lowerCaseWordIfNeeded(wordCounts, token.getText(), sentenceStart));
            long count = countAdder != null ? countAdder.sum() : 0L;
            if ((replacingWord = this.checkWordCount(this.getReplacingWord(replacingWord, count), count)) == null) {
                return TagToken.create(Tag.UNKNOWN);
            }
            if (!token.getText().equals(replacingWord)) {
                this.addToDictionarySet(replacingWord, finalWordSet);
                return WordToken.create(replacingWord);
            }
            this.addToDictionarySet(token.getText(), finalWordSet);
        }
        return token;
    }

    private void addToDictionarySet(String word, ConcurrentHashMap<String, Boolean> finalWordSet) {
        if (word != null && (Predict4AllUtils.length(word) > 1 || this.languageModel.getValidOneCharWords().contains(Predict4AllUtils.lowerCase(word)))) {
            finalWordSet.put(word, true);
        }
    }

    private String checkWordCount(String word, long count) {
        return word != null && count > (long)this.trainingConfiguration.getUnknownWordCountThreshold() ? word : null;
    }

    private String getReplacingWord(String word, long count) {
        String lowerCaseWord;
        double freqLowerCase;
        String uncapitalizedWord;
        double freqUncapitalized;
        if (this.baseWordDictionary.containsWord(word) || count > (long)this.trainingConfiguration.getDirectlyValidWordCountThreshold()) {
            return word;
        }
        if (Character.isUpperCase(word.charAt(0)) && (freqUncapitalized = this.baseWordDictionary.getFrequency(uncapitalizedWord = Predict4AllUtils.uncapitalize(word))) > 0.0) {
            return freqUncapitalized > this.trainingConfiguration.getConvertCaseFromDictionaryModelThreshold() ? uncapitalizedWord : word;
        }
        if (Predict4AllUtils.containsUpperCase(word) && (freqLowerCase = this.baseWordDictionary.getFrequency(lowerCaseWord = Predict4AllUtils.lowerCase(word))) > 0.0) {
            return freqLowerCase > this.trainingConfiguration.getConvertCaseFromDictionaryModelThreshold() ? lowerCaseWord : word;
        }
        return null;
    }

    private String lowerCaseWordIfNeeded(ConcurrentHashMap<String, LongAdder> wordCounts, String word, boolean sentenceStart) {
        if (Character.isUpperCase(word.charAt(0))) {
            long countUpperCase;
            long countLowerCase;
            double percentLowerCase;
            String loweredWord = word.toLowerCase();
            if (sentenceStart) {
                return loweredWord;
            }
            if (wordCounts.containsKey(loweredWord) && (percentLowerCase = 1.0 * (double)(countLowerCase = wordCounts.get(loweredWord).sum()) / (double)(countLowerCase + (countUpperCase = wordCounts.get(word).sum()))) >= this.trainingConfiguration.getUpperCaseReplacementThreshold()) {
                return loweredWord;
            }
        }
        return word;
    }

    private class CountWordTask
    extends TrainerTask {
        private final ConcurrentHashMap<String, LongAdder> wordCounts;

        public CountWordTask(ConcurrentHashMap<String, LongAdder> wordCounts, ProgressIndicator progressIndicator, AbstractTrainingDocument document) {
            super(progressIndicator, document);
            this.wordCounts = wordCounts;
        }

        @Override
        public void run() throws Exception {
            try (TokenFileInputStream tokenFis = new TokenFileInputStream(this.document.getInputFile());){
                boolean sentenceStart = true;
                for (Token token = tokenFis.readToken(); token != null; token = token.getNext(tokenFis)) {
                    if (token.isWord()) {
                        String tokenText = token.getText();
                        if (sentenceStart && Character.isUpperCase(tokenText.charAt(0))) {
                            tokenText = tokenText.toLowerCase();
                        }
                        this.wordCounts.computeIfAbsent(tokenText, k -> new LongAdder()).increment();
                    }
                    sentenceStart = token.isSeparator() && (sentenceStart || token.getSeparator().isSentenceSeparator());
                    this.progressIndicator.increment();
                }
            }
        }
    }

    private class ReplaceInvalidWordsAndCreateDictionaryTask
    extends TrainerTask {
        private final ConcurrentHashMap<String, LongAdder> wordCounts;
        private final ConcurrentHashMap<String, Boolean> finalWordSet;
        private final long totalWordCount;

        public ReplaceInvalidWordsAndCreateDictionaryTask(ProgressIndicator progressIndicator, AbstractTrainingDocument document, ConcurrentHashMap<String, LongAdder> wordCounts, ConcurrentHashMap<String, Boolean> finalWordCounts, long totalWordCount) {
            super(progressIndicator, document);
            this.wordCounts = wordCounts;
            this.totalWordCount = totalWordCount;
            this.finalWordSet = finalWordCounts;
        }

        @Override
        public void run() throws Exception {
            try (TokenFileInputStream tokenFis = new TokenFileInputStream(this.document.getInputFile());
                 TokenFileOutputStream tokenFos = new TokenFileOutputStream(this.document.getOutputFile());){
                boolean sentenceStart = true;
                for (Token token = tokenFis.readToken(); token != null; token = token.getNext(tokenFis)) {
                    tokenFos.writeToken(WordDictionaryGenerator.this.getReplacingTokenFor(token, sentenceStart, this.wordCounts, this.finalWordSet, this.totalWordCount));
                    sentenceStart = token.isSeparator() && (sentenceStart || token.getSeparator().isSentenceSeparator());
                    this.progressIndicator.increment();
                }
                this.document.writeInformations(this.document.getCount());
            }
        }
    }
}

