/*
 * Decompiled with CFR 0.152.
 */
package org.tribuo.data.text.impl;

import com.oracle.labs.mlrg.olcut.config.Config;
import com.oracle.labs.mlrg.olcut.config.Configurable;
import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance;
import com.oracle.labs.mlrg.olcut.provenance.impl.ConfiguredObjectProvenanceImpl;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.tribuo.Feature;
import org.tribuo.data.text.FeatureAggregator;
import org.tribuo.data.text.FeatureTransformer;
import org.tribuo.data.text.TextPipeline;
import org.tribuo.data.text.TextProcessingException;
import org.tribuo.data.text.TextProcessor;
import org.tribuo.data.text.impl.FeatureHasher;
import org.tribuo.data.text.impl.NgramProcessor;
import org.tribuo.data.text.impl.SumAggregator;
import org.tribuo.data.text.impl.UniqueAggregator;
import org.tribuo.util.tokens.Tokenizer;

/**
 * A {@link TextPipeline} built from three stages that are applied in order:
 * <ol>
 *   <li>one {@link NgramProcessor} for each n-gram size from 1 up to {@code ngram},
 *       all sharing the configured {@link Tokenizer} and emitting a value of 1.0;</li>
 *   <li>an optional {@link FeatureHasher}, present only when {@code hashDim > 0};</li>
 *   <li>a {@link FeatureAggregator}: {@link SumAggregator} when term counting is
 *       enabled, otherwise {@link UniqueAggregator} constructed with 1.0.</li>
 * </ol>
 * The stages are derived from the configured fields by {@link #postConfig()}.
 */
public class TokenPipeline implements TextPipeline {
    private static final Logger logger = Logger.getLogger(TokenPipeline.class.getName());

    // Derived state, rebuilt by postConfig(); the lists themselves are never reassigned.
    private final List<TextProcessor> processors = new ArrayList<>();
    private final List<FeatureTransformer> transformers = new ArrayList<>();
    private FeatureAggregator aggregator;

    @Config(mandatory=true, description="Use term counting, otherwise emit binary features.")
    private boolean termCounting;
    @Config(description="Dimension to map the hash into.")
    private int hashDim = -1;
    @Config(description="Should feature hashing preserve the value?")
    private boolean hashPreserveValue = true;
    @Config(mandatory=true, description="Tokenizer to use.")
    private Tokenizer tokenizer;
    @Config(description="n in the n-gram to emit.")
    private int ngram = 2;

    /**
     * Creates a pipeline without feature hashing (the hash dimension is set to -1).
     *
     * @param tokenizer    the tokenizer handed to every n-gram processor
     * @param ngram        the largest n-gram size; processors are created for 1..ngram
     * @param termCounting if true features are aggregated with {@link SumAggregator},
     *                     otherwise with {@link UniqueAggregator}
     */
    public TokenPipeline(Tokenizer tokenizer, int ngram, boolean termCounting) {
        this(tokenizer, ngram, termCounting, -1);
    }

    /**
     * Creates a pipeline whose feature hasher (if any) is constructed with
     * {@code hashPreserveValue = true}.
     *
     * @param tokenizer    the tokenizer handed to every n-gram processor
     * @param ngram        the largest n-gram size; processors are created for 1..ngram
     * @param termCounting if true features are aggregated with {@link SumAggregator},
     *                     otherwise with {@link UniqueAggregator}
     * @param dimension    the hash dimension; hashing is only enabled when this is positive
     */
    public TokenPipeline(Tokenizer tokenizer, int ngram, boolean termCounting, int dimension) {
        this(tokenizer, ngram, termCounting, dimension, true);
    }

    /**
     * Creates a fully specified pipeline and immediately builds its stages via
     * {@link #postConfig()}.
     *
     * @param tokenizer         the tokenizer handed to every n-gram processor
     * @param ngram             the largest n-gram size; processors are created for 1..ngram
     * @param termCounting      if true features are aggregated with {@link SumAggregator},
     *                          otherwise with {@link UniqueAggregator}
     * @param dimension         the hash dimension; hashing is only enabled when this is positive
     * @param hashPreserveValue passed through to the {@link FeatureHasher} constructor
     */
    public TokenPipeline(Tokenizer tokenizer, int ngram, boolean termCounting, int dimension, boolean hashPreserveValue) {
        this.tokenizer = tokenizer;
        this.ngram = ngram;
        this.hashDim = dimension;
        this.termCounting = termCounting;
        this.hashPreserveValue = hashPreserveValue;
        this.postConfig();
    }

    /**
     * No-argument constructor that leaves the stages unbuilt; presumably used by the
     * configuration system, which is then expected to call {@link #postConfig()}.
     */
    private TokenPipeline() {
    }

    /**
     * Builds the processors, the optional hashing transformer and the aggregator from
     * the current field values.
     * <p>
     * Any previously built stages are discarded first, so calling this method more than
     * once rebuilds the pipeline rather than appending duplicate processors and hashers.
     */
    public void postConfig() {
        // Reset derived state: without this a second call would double every stage.
        processors.clear();
        transformers.clear();

        for (int n = 1; n <= ngram; ++n) {
            processors.add(new NgramProcessor(tokenizer, n, 1.0));
        }
        // A non-positive hash dimension (the default is -1) means "no hashing".
        if (hashDim > 0) {
            transformers.add(new FeatureHasher(hashDim, hashPreserveValue));
        }
        aggregator = termCounting ? new SumAggregator() : new UniqueAggregator(1.0);
    }

    /**
     * Returns a short description of the form {@code NgramPipeline({1..N}-grams)}, with
     * {@code ,hashing} appended inside the parentheses when a transformer is installed.
     */
    @Override
    public String toString() {
        if (!transformers.isEmpty()) {
            return ngram + "gramPipeline({1.." + ngram + "}-grams,hashing)";
        }
        return ngram + "gramPipeline({1.." + ngram + "}-grams)";
    }

    /**
     * Runs every processor over the text, applies each transformer in order to the
     * combined feature list, and returns the aggregated result.
     * <p>
     * A processor that throws {@link TextProcessingException} is logged at INFO level and
     * contributes no features; the remaining processors still run.
     *
     * @param tag  the tag passed to each processor and transformer
     * @param data the text to process
     * @return the features produced by the aggregator
     */
    @Override
    public List<Feature> process(String tag, String data) {
        List<Feature> features = new ArrayList<>();
        for (TextProcessor processor : processors) {
            try {
                features.addAll(processor.process(tag, data));
            }
            catch (TextProcessingException e) {
                // Best effort: record the failure and carry on with the other processors.
                logger.log(Level.INFO, String.format("TextProcessingException thrown by processor %s with text %s", processor, data), e);
            }
        }
        for (FeatureTransformer transformer : transformers) {
            features = transformer.map(tag, features);
        }
        return aggregator.aggregate(features);
    }

    /**
     * Returns a {@link ConfiguredObjectProvenanceImpl} constructed from this object and
     * the string {@code "TextPipeline"}.
     */
    public ConfiguredObjectProvenance getProvenance() {
        return new ConfiguredObjectProvenanceImpl((Configurable)this, "TextPipeline");
    }
}

