/*
 * Decompiled with CFR 0.152.
 */
package org.predict4all.nlp.parser;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import org.predict4all.nlp.Separator;
import org.predict4all.nlp.io.TokenFileOutputStream;
import org.predict4all.nlp.language.LanguageModel;
import org.predict4all.nlp.parser.StringProducer;
import org.predict4all.nlp.parser.TokenAppender;
import org.predict4all.nlp.parser.TokenListAppender;
import org.predict4all.nlp.parser.token.SeparatorToken;
import org.predict4all.nlp.parser.token.Token;
import org.predict4all.nlp.parser.token.WordToken;
import org.predict4all.nlp.trainer.TrainerTask;
import org.predict4all.nlp.trainer.corpus.AbstractTrainingDocument;
import org.predict4all.nlp.trainer.corpus.TrainingCorpus;
import org.predict4all.nlp.trainer.step.TrainingStep;
import org.predict4all.nlp.utils.Predict4AllUtils;
import org.predict4all.nlp.utils.progressindicator.LoggingProgressIndicator;
import org.predict4all.nlp.utils.progressindicator.NoOpProgressIndicator;
import org.predict4all.nlp.utils.progressindicator.ProgressIndicator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Splits raw text into a flat sequence of {@link WordToken} and {@link SeparatorToken}.
 * <p>
 * Every character for which {@link Separator#getSeparatorFor(char)} returns a separator becomes
 * its own separator token; the characters between two separators are grouped into one word token.
 * The tokenizer can work on a single in-memory string or create one task per document of a
 * {@link TrainingCorpus}.
 */
public class Tokenizer {
    /**
     * Format used to render token counts with grouping separators in log output.
     * {@link DecimalFormat} is not synchronized, so code that may run on several threads should
     * format through a clone instead of calling this shared instance directly.
     */
    public static final DecimalFormat TOKEN_COUNT_FORMAT = new DecimalFormat("###,###,###,###,###");
    private static final Logger LOGGER = LoggerFactory.getLogger(Tokenizer.class);
    /** Byte order mark character (U+FEFF) as it shows up at the start of a decoded line. */
    private static final String UTF8_BOM = "\ufeff";
    /** Language model; only its average word length is read here, to presize the word buffer. */
    private final LanguageModel languageModel;

    /**
     * Creates a tokenizer.
     *
     * @param languageModel the language model to use, checked with {@code Predict4AllUtils.checkNull}
     */
    public Tokenizer(LanguageModel languageModel) {
        this.languageModel = Predict4AllUtils.checkNull(languageModel, "Tokenizer needs a language model, languageModel shouldn't be null");
    }

    /**
     * Tokenizes a single in-memory text.
     *
     * @param rawText the text to tokenize; {@code null} is handled like an empty text
     * @return the tokens found in the text, in order (empty for an empty or {@code null} text)
     * @throws IOException if the token appender fails
     */
    public List<Token> tokenize(final String rawText) throws IOException {
        // Without this substitution a null text would be concatenated as the literal word "null".
        final String text = rawText == null ? "" : rawText;
        ArrayList<Token> tokens = new ArrayList<Token>();
        StringProducer singleTextProducer = new StringProducer(){
            private boolean pending = true;

            @Override
            public void close() {
                // nothing to release for an in-memory text
            }

            @Override
            public int size() {
                return 1;
            }

            @Override
            public String next() {
                this.pending = false;
                return text;
            }

            @Override
            public boolean hasNext() {
                return this.pending;
            }
        };
        this.tokenize(singleTextProducer, new TokenListAppender(tokens), NoOpProgressIndicator.INSTANCE);
        return tokens;
    }

    /**
     * Prepares the tokenization of a whole corpus: initializes the {@link TrainingStep#PARSER} step
     * and creates one task per document of that step. The tasks are only created here, not run.
     *
     * @param corpus the corpus to tokenize
     * @return one task per document, all sharing the same progress indicator
     */
    public List<TrainerTask> tokenize(TrainingCorpus corpus) {
        corpus.initStep(TrainingStep.PARSER);
        LoggingProgressIndicator progressIndicator = new LoggingProgressIndicator("Tokenization", corpus.getTotalCountFor(TrainingStep.PARSER));
        return corpus.getDocuments(TrainingStep.PARSER).stream().map(d -> new TokenizeTask(progressIndicator, (AbstractTrainingDocument)d, corpus)).collect(Collectors.toList());
    }

    /**
     * Core tokenization loop.
     * <p>
     * Strings are pulled from the producer one by one; every string except the last one gets a
     * line break appended before it is scanned. A word that is still open at the end of one string
     * continues into the next one unless a separator closes it.
     *
     * @param stringProducer source of the strings to tokenize
     * @param tokenAppender sink receiving every created token, in order
     * @param progressIndicator incremented once per consumed string
     * @return the number of tokens given to the appender
     * @throws IOException if the appender fails
     */
    private int tokenize(StringProducer stringProducer, TokenAppender tokenAppender, ProgressIndicator progressIndicator) throws IOException {
        LOGGER.debug("Start tokenization for {} string", stringProducer.size());
        long start = System.currentTimeMillis();
        int tokenCount = 0;
        // One buffer is reused for every word: toString() copies the content, so clearing is safe.
        StringBuilder currentContent = new StringBuilder(this.languageModel.getAverageWordLength());
        while (stringProducer.hasNext()) {
            // next() has to be called before hasNext() to know whether a line break must follow
            String produced = stringProducer.next();
            String rawText = produced + (stringProducer.hasNext() ? "\n" : "");
            for (int i = 0; i < rawText.length(); ++i) {
                char current = rawText.charAt(i);
                Separator separator = Separator.getSeparatorFor(current);
                if (separator == null) {
                    currentContent.append(current);
                    continue;
                }
                // A separator closes the pending word (if any) and is emitted as its own token.
                if (currentContent.length() > 0) {
                    tokenAppender.append(WordToken.create(currentContent.toString()));
                    ++tokenCount;
                    currentContent.setLength(0);
                }
                tokenAppender.append(SeparatorToken.create(separator));
                ++tokenCount;
            }
            progressIndicator.increment();
        }
        // Flush the last word when the input doesn't end with a separator.
        if (currentContent.length() > 0) {
            ++tokenCount;
            tokenAppender.append(WordToken.create(currentContent.toString()));
        }
        if (LOGGER.isDebugEnabled()) {
            // NOTE(review): tasks created by tokenize(TrainingCorpus) share this Tokenizer and may be
            // run concurrently by their caller; the shared DecimalFormat is not synchronized, so a
            // private clone is used for formatting.
            DecimalFormat countFormat = (DecimalFormat)TOKEN_COUNT_FORMAT.clone();
            LOGGER.debug("{} tokens created in {} s,", countFormat.format(tokenCount), (System.currentTimeMillis() - start) / 1000.0);
        }
        return tokenCount;
    }

    /**
     * Task tokenizing one training document: reads the document input file line by line and writes
     * the resulting tokens to the document output file.
     */
    private class TokenizeTask
    extends TrainerTask {
        /** Corpus the document belongs to; provides the encoding used to read the input file. */
        private final TrainingCorpus trainingCorpus;

        public TokenizeTask(ProgressIndicator progressIndicator, AbstractTrainingDocument document, TrainingCorpus trainingCorpus) {
            super(progressIndicator, document);
            this.trainingCorpus = trainingCorpus;
        }

        /**
         * Tokenizes the document into its output file, then hands the token count to
         * {@code writeInformations}. Both the output stream and the line producer are closed when
         * the method ends, whether it succeeds or fails.
         */
        @Override
        public void run() throws Exception {
            try (TokenFileOutputStream tokenOutput = new TokenFileOutputStream(this.document.getOutputFile());
                 StringProducer lineProducer = this.getProducerFor(this.document);){
                int tokenCount = Tokenizer.this.tokenize(lineProducer, tokenOutput, this.progressIndicator);
                this.document.writeInformations(tokenCount);
            }
        }

        /**
         * Creates a producer returning the lines of the document input file one by one.
         * <p>
         * A leading byte order mark character is removed from any line that starts with one. If
         * reading fails, the error is logged and the producer reports that it has no more lines.
         *
         * @param document the document whose input file should be read
         * @return a producer over the lines of the input file; closing it closes the file
         * @throws IOException if the input file can't be opened
         */
        public StringProducer getProducerFor(final AbstractTrainingDocument document) throws IOException {
            final BufferedReader bufferedReader = this.openReader(document);
            return new StringProducer(){
                /** Line read ahead by hasNext() and not yet returned by next(). */
                private String cachedLine;
                /** Set once a read failed, so the failing read is not retried (and logged) again. */
                private boolean readFailed;

                @Override
                public void close() throws Exception {
                    bufferedReader.close();
                }

                @Override
                public String next() {
                    if (!this.hasNext()) {
                        return null;
                    }
                    String line = this.cachedLine;
                    this.cachedLine = null;
                    return line.startsWith(Tokenizer.UTF8_BOM) ? line.substring(1) : line;
                }

                @Override
                public boolean hasNext() {
                    if (this.cachedLine != null) {
                        return true;
                    }
                    if (this.readFailed) {
                        return false;
                    }
                    try {
                        this.cachedLine = bufferedReader.readLine();
                        return this.cachedLine != null;
                    }
                    catch (IOException e) {
                        // Kept best-effort (the document ends here), but no longer silent.
                        this.readFailed = true;
                        LOGGER.error("Could not read next line from " + document.getInputFile() + ", tokenization of this document stops here", e);
                        return false;
                    }
                }

                @Override
                public int size() {
                    return document.getCount();
                }
            };
        }

        /**
         * Opens a buffered reader on the document input file using the corpus encoding. The file
         * stream is closed again if the reader can't be created.
         */
        private BufferedReader openReader(AbstractTrainingDocument document) throws IOException {
            FileInputStream fileInput = new FileInputStream(document.getInputFile());
            boolean opened = false;
            try {
                BufferedReader reader = new BufferedReader(new InputStreamReader((InputStream)fileInput, this.trainingCorpus.getEncoding()));
                opened = true;
                return reader;
            }
            finally {
                if (!opened) {
                    fileInput.close();
                }
            }
        }
    }
}

