/*
 * Decompiled with CFR 0.152.
 */
package elki.datasource.parser;

import elki.data.LabelList;
import elki.data.NumberVector;
import elki.datasource.bundle.BundleStreamSource;
import elki.datasource.parser.CSVReaderFormat;
import elki.datasource.parser.NumberVectorLabelParser;
import elki.logging.Logging;
import elki.utilities.documentation.Description;
import elki.utilities.io.ParseUtil;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import java.util.Collection;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Parser that turns every non-label column into a number, even when the
 * column contains arbitrary strings.
 * <p>
 * Tokens that parse as a double are used as-is. A token matching the pattern
 * {@code ?} becomes {@link Double#NaN}. Every other token is replaced by an
 * integer id; a given string always receives the same id until the end of the
 * stream is reached. A single string-to-id table is used for all columns.
 *
 * @param <V> vector type produced by this parser
 */
@Description(value="This parser expects data in roughly the same format as the NumberVectorLabelParser,\nexcept that it will enumerate all unique strings to always produce numerical values.\nThis way, it can for example handle files that contain lines like 'y,n,y,y,n,y,n'.")
public class CategorialDataAsNumberVectorParser<V extends NumberVector>
extends NumberVectorLabelParser<V> {
    /** Class logger. */
    private static final Logging LOG = Logging.getLogger(CategorialDataAsNumberVectorParser.class);

    /** Ids assigned so far, keyed by the string token they stand for. */
    Object2IntOpenHashMap<String> unique = new Object2IntOpenHashMap<>();

    /**
     * First id to hand out. It is at least 1 and strictly greater than the
     * map's "not found" value, so an assigned id can never be mistaken for a
     * missing entry.
     */
    int ustart = Math.max(this.unique.defaultReturnValue() + 1, 1);

    /** Reusable matcher recognising the missing-value marker {@code ?}. */
    Matcher nanpattern = Pattern.compile("\\?").matcher("Dummy text");

    /**
     * Creates a parser using {@link CSVReaderFormat#DEFAULT_FORMAT} and
     * {@code null} label indices.
     *
     * @param factory factory used to build the resulting vectors
     */
    public CategorialDataAsNumberVectorParser(NumberVector.Factory<V> factory) {
        this(CSVReaderFormat.DEFAULT_FORMAT, null, factory);
    }

    /**
     * Creates a parser; all arguments are passed through to the superclass.
     *
     * @param format input format
     * @param labelIndices column indices to treat as labels
     * @param factory factory used to build the resulting vectors
     */
    public CategorialDataAsNumberVectorParser(CSVReaderFormat format, long[] labelIndices, NumberVector.Factory<V> factory) {
        super(format, labelIndices, factory);
    }

    /**
     * Delegates to the superclass and discards all string-to-id assignments
     * once the end of the stream has been reported.
     *
     * @return the event reported by the superclass
     */
    @Override
    public BundleStreamSource.Event nextEvent() {
        BundleStreamSource.Event event = super.nextEvent();
        if (event == BundleStreamSource.Event.END_OF_STREAM) {
            this.unique.clear();
        }
        return event;
    }

    /**
     * Consumes the remaining tokens of the current line, filling the attribute
     * and label buffers, then publishes the result in {@code curvec} and
     * {@code curlbl} and empties both buffers.
     *
     * @return always {@code true}
     */
    @Override
    protected boolean parseLineInternal() {
        for (int column = 0; this.tokenizer.valid(); this.tokenizer.advance(), ++column) {
            if (this.isLabelColumn(column)) {
                this.haslabels = true;
                this.labels.add(this.tokenizer.getSubstring());
                continue;
            }
            try {
                this.attributes.add(this.tokenizer.getDouble());
            }
            catch (NumberFormatException e) {
                String token = this.tokenizer.getSubstring();
                // "?" marks a missing value.
                if (this.nanpattern.reset(token).matches()) {
                    this.attributes.add(Double.NaN);
                    continue;
                }
                // Warn once when a token was rejected only because of an
                // overflow, as it probably was meant to be a number. The
                // comparison is by identity against the ParseUtil instances.
                boolean overflow = e == ParseUtil.PRECISION_OVERFLOW || e == ParseUtil.EXPONENT_OVERFLOW;
                if (!this.warnedPrecision && overflow) {
                    this.getLogger().warning("Too many digits in what looked like a double number - treating as string: " + token);
                    this.warnedPrecision = true;
                }
                // Reuse the id of a known string, otherwise assign the next one.
                int id = this.unique.getInt(token);
                if (id == this.unique.defaultReturnValue()) {
                    id = this.ustart + this.unique.size();
                    this.unique.put(token, id);
                }
                this.attributes.add((double) id);
            }
        }
        this.curvec = this.createVector();
        this.curlbl = LabelList.make(this.labels);
        this.attributes.clear();
        this.labels.clear();
        return true;
    }

    /**
     * @return the logger of this class
     */
    @Override
    protected Logging getLogger() {
        return LOG;
    }

    /**
     * Parameterization class.
     *
     * @param <V> vector type produced by the configured parser
     */
    public static class Par<V extends NumberVector>
    extends NumberVectorLabelParser.Par<V> {
        /**
         * Builds the parser from the inherited {@code format},
         * {@code labelIndices} and {@code factory} settings.
         *
         * @return a new parser instance
         */
        @Override
        public CategorialDataAsNumberVectorParser<V> make() {
            return new CategorialDataAsNumberVectorParser<>(this.format, this.labelIndices, this.factory);
        }
    }
}

