001package javax.visrec.ml.data;
002
003import java.util.Collection;
004import java.util.Collections;
005import java.util.Iterator;
006import java.util.List;
007import java.util.Objects;
008import java.util.Random;
009
010/**
011 * Generic interface for all data sets for machine learning, independent of type of elements.
012 *
013 * @author Zoran Sevarac
014 * @param <E> type of data set elements
015 * @since 1.0
016 */
public interface DataSet<E> extends Iterable<E> {

    // TODO: add stream for filtering elements in data set

    /**
     * Get the list of items backing this {@link DataSet}.
     * <p>
     * The default methods of this interface ({@link #add(Object)}, {@link #addAll(DataSet)},
     * {@link #clear()}, {@link #shuffle()}, ...) operate directly on the returned list.
     *
     * @return {@link List} holding the items of this data set
     */
    List<E> getItems();

    /**
     * Adds an element to this data set.
     *
     * @param item data set item to add to the data set
     * @return current instance of {@link DataSet}
     * @throws NullPointerException if {@code item} is {@code null}
     */
    default DataSet<E> add(E item) {
        Objects.requireNonNull(item, "Null items are not allowed in dataset");
        getItems().add(item);
        return this;
    }

    /**
     * Add all items of an existing {@link DataSet} to the current {@link DataSet}.
     * <p>
     * The incoming items are checked for {@code null} before anything is added, so the same
     * "no null items" rule as {@link #add(Object)} applies and a rejected call leaves this
     * data set unchanged.
     *
     * @param dataSet existing {@link DataSet}
     * @return current instance of {@link DataSet}
     * @throws NullPointerException if {@code dataSet} is {@code null} or contains a {@code null} item
     */
    default DataSet<E> addAll(DataSet<E> dataSet) {
        Objects.requireNonNull(dataSet, "Dataset is null. Cannot add items from null dataset");
        List<E> incoming = dataSet.getItems();
        // Validate everything up front so a failure cannot leave a partially added batch behind.
        for (E item : incoming) {
            Objects.requireNonNull(item, "Null items are not allowed in dataset");
        }
        getItems().addAll(incoming);
        return this;
    }

    /**
     * Get an item from the {@link DataSet}
     *
     * @param idx index as {@code int} which corresponds with
     *              the index of the {@link DataSet}
     * @return item from the {@link DataSet}
     */
    default E get(int idx) {
        return getItems().get(idx);
    }

    /**
     * Clear items of the {@link DataSet}
     */
    default void clear() {
        getItems().clear();
    }

    /**
     * Determines whether the {@link DataSet} is empty or not.
     *
     * @return {@code true} if the {@link DataSet} is empty, otherwise {@code false}
     */
    default boolean isEmpty() {
        return getItems().isEmpty();
    }

    /**
     * Get the number of elements in {@link DataSet}
     *
     * @return size in {@code int}
     */
    default int size() {
        return getItems().size();
    }

    /**
     * Returns an iterator over the items of this data set, as provided by {@link #getItems()}.
     *
     * @return iterator over the items of this {@link DataSet}
     */
    @Override
    default Iterator<E> iterator() {
        return getItems().iterator();
    }

    /**
     * Split dataset into specified number of equally sized parts.
     * <p>
     * Every part is requested with the same share {@code 1.0 / numParts} and the actual
     * splitting is delegated to {@link #split(double...)}.
     *
     * @param numParts number of parts to be returned, must be at least 1
     * @return multiple {@link DataSet} in an array.
     * @throws IllegalArgumentException if {@code numParts} is less than 1
     */
    default DataSet<E>[] split(int numParts) {
        if (numParts < 1) {
            throw new IllegalArgumentException("Number of parts must be at least 1 but was " + numParts);
        }
        double share = 1.0 / numParts;
        double[] parts = new double[numParts];

        for (int i = 0; i < numParts; i++) {
            parts[i] = share;
        }

        return split(parts);
    }

    /**
     * Split dataset into specified number of equally sized parts, using specified random generator.
     * <p>
     * Every part is requested with the same share {@code 1.0 / numParts} and the actual
     * splitting is delegated to {@link #split(Random, double...)}.
     *
     * @param numParts number of parts/subsets to return, must be at least 1
     * @param rnd random number generator
     * @return multiple {@link DataSet} in an array.
     * @throws IllegalArgumentException if {@code numParts} is less than 1
     * @throws NullPointerException if {@code rnd} is {@code null}
     */
    default DataSet<E>[] split(int numParts, Random rnd) {
        if (numParts < 1) {
            throw new IllegalArgumentException("Number of parts must be at least 1 but was " + numParts);
        }
        double share = 1.0 / numParts;
        double[] parts = new double[numParts];

        for (int i = 0; i < numParts; i++) {
            parts[i] = share;
        }

        return split(rnd, parts);
    }

    /**
     * Split data set in two parts, one with size of the specified fraction, and other with
     * the rest of the data set ({@code 1 - part}).
     *
     * @param part fraction of the data set that goes into the first {@link DataSet},
     *             expressed as a value between 0.0 and 1.0 (inclusive)
     * @return multiple {@link DataSet} in an array.
     * @throws IllegalArgumentException if {@code part} is NaN or outside the range [0, 1]
     */
    default DataSet<E>[] split(double part) {
        // Written as a negated range check so that NaN is rejected as well.
        if (!(part >= 0.0 && part <= 1.0)) {
            throw new IllegalArgumentException("Part must be between 0.0 and 1.0 but was " + part);
        }
        return split(part, 1 - part);
    }

    /**
     * Split data set into parts of specified sizes
     *
     * @param parts specific sizes of {@link DataSet}
     * @return array of {@link DataSet}
     */
    DataSet<E>[] split(double... parts);

    /**
     * Split data set into parts of specified sizes using specified random generator.
     * <p>
     * The data set is first shuffled in place with {@link #shuffle(Random)} and then split
     * with {@link #split(double...)}.
     *
     * @param rnd random generator
     * @param parts specific sizes of {@link DataSet}
     * @return array of {@link DataSet}
     * @throws NullPointerException if {@code rnd} is {@code null}
     */
    default DataSet<E>[] split(Random rnd, double... parts) {
        Objects.requireNonNull(rnd, "Random generator is null. Cannot shuffle dataset before split");
        shuffle(rnd);
        return split(parts);
    }

    /**
     * Shuffles the data set.
     */
    default void shuffle() {
        Collections.shuffle(getItems());
    }

    /**
     * Shuffles the data set using the specified random number generator.
     *
     * @param rnd random generator
     * @throws NullPointerException if {@code rnd} is {@code null}
     */
    default void shuffle(Random rnd) {
        Objects.requireNonNull(rnd, "Random generator is null. Cannot shuffle dataset");
        Collections.shuffle(getItems(), rnd);
    }

    /**
     * Get labels of target/output columns.
     *
     * @return array with labels of target/output columns
     */
    String[] getTargetNames();
    // also add setTargetNames(String ...) and setTargetColumns(int ...)

    /**
     * Set the names of the columns of this data set.
     *
     * @param columnNames array with column names
     */
    void setColumnNames(String[] columnNames);

    /**
     * Get the names of the columns of this data set.
     *
     * @return array with column names
     */
    String[] getColumnNames();

    /**
     * Immutable description of a single data set column: its name, its value type and
     * a flag telling whether it is a target column.
     */
    class Column {
        private final String name;
        private final Type type;
        private final boolean isTarget;

        /**
         * Creates a non-target column with the given name and no type ({@code null}).
         *
         * @param name column name
         */
        public Column(String name) {
            this(name, null, false);
        }

        /**
         * Creates a column with the given name, type and target flag.
         *
         * @param name column name
         * @param type type of the values in the column
         * @param isTarget {@code true} if this is a target column
         */
        public Column(String name, Type type, boolean isTarget) {
            this.name = name;
            this.type = type;
            this.isTarget = isTarget;
        }

        /**
         * @return column name
         */
        public String getName() {
            return name;
        }

        /**
         * @return column type, {@code null} if the column was created without a type
         */
        public Type getType() {
            return type;
        }

        /**
         * @return {@code true} if this is a target column, otherwise {@code false}
         */
        public boolean isTarget() {
            return isTarget;
        }
    }

    /**
     * Value types a {@link Column} can have.
     */
    enum Type {
        DECIMAL, INTEGER, BINARY, STRING
    }

}