001package javax.visrec.ml.data; 002 003import java.util.Collection; 004import java.util.Collections; 005import java.util.Iterator; 006import java.util.List; 007import java.util.Objects; 008import java.util.Random; 009 010/** 011 * Generic interface for all data sets for machine learning, independent of type of elements. 012 * 013 * @author Zoran Sevarac 014 * @param <E> type of data set elements 015 * @since 1.0 016 */ 017public interface DataSet<E> extends Iterable<E> { 018 019 // TODO: add stream for filtering elements in data set 020 021 /** 022 * Get a collection of the items in the {@link DataSet} 023 * @return {@link Collection} 024 */ 025 List<E> getItems(); 026 027 /** 028 * Adds an element to this data set. 029 * 030 * @param item data set item to add to the data set 031 * @return current instance of {@link DataSet} 032 */ 033 default DataSet<E> add(E item) { 034 Objects.requireNonNull(item, "Null items are not allowed in dataset"); 035 getItems().add(item); 036 return this; 037 } 038 039 /** 040 * Add an existing {@link DataSet} to the current {@link DataSet} 041 * @param dataSet existing {@link DataSet} 042 * @return current instance of {@link DataSet} 043 */ 044 default DataSet<E> addAll(DataSet<E> dataSet) { 045 Objects.requireNonNull(dataSet, "Dataset is null. Cannot add items from null dataset"); 046 getItems().addAll(dataSet.getItems()); 047 return this; 048 } 049 050 /** 051 * Get an item from the {@link DataSet} 052 * @param idx index as {@code int} which corresponds with 053 * the index of the {@link DataSet} 054 * @return item from the {@link DataSet} 055 */ 056 default E get(int idx) { 057 return getItems().get(idx); 058 } 059 060 /** 061 * Clear items of the {@link DataSet} 062 */ 063 default void clear() { 064 getItems().clear(); 065 } 066 067 /** 068 * Determines whether the {@link DataSet} is empty or not. 069 * @return {@code true} if the {@link DataSet} is empty, otherwise {@code false} 070 */ 071 default boolean isEmpty() { 072 return getItems().isEmpty(); 073 } 074 075 /** 076 * Get the number of elements in {@link DataSet} 077 * @return size in {@code int} 078 */ 079 default int size() { 080 return getItems().size(); 081 } 082 083 @Override 084 default Iterator<E> iterator() { 085 return getItems().iterator(); 086 } 087 088 /** 089 * Split dataset into specified number of equally sized parts. 090 * @param numParts number of parts to be returned 091 * @return multiple {@link DataSet} in an array. 092 */ 093 default DataSet<E>[] split(int numParts) { 094 double part = 1.0 / (double)numParts; 095 double[] parts = new double[numParts]; 096 097 for (int i=0; i<numParts; i++) { 098 parts[i] = part; 099 } 100 101 return split(parts); 102 } 103 104 /** 105 * Split dataset into specified number of equally sized parts, using specified random generator. 106 * @param numParts number of parts/subsets to return 107 * @param rnd random number generator 108 * @return multiple {@link DataSet} in an array. 109 */ 110 default DataSet<E>[] split(int numParts, Random rnd) { 111 double part = 1.0 / (double)numParts; 112 double[] parts = new double[numParts]; 113 114 for (int i=0; i<numParts; i++) { 115 parts[i] = part; 116 } 117 118 return split(rnd, parts); 119 } 120 121 /** 122 * Split data set in two parts, one with size of specified percentage, and other with rest of the data set 123 * 124 * @param part specified percentage of the first {@link DataSet} 125 * @return multiple {@link DataSet} in an array. 126 */ 127 default DataSet<E>[] split(double part) { 128 return split(part, 1-part); 129 } 130 131 /** 132 * Split data set into parts of specified sizes 133 * @param parts specific sizes of {@link DataSet} 134 * @return array of {@link DataSet} 135 */ 136 DataSet<E>[] split(double... parts); 137 138 /** 139 * Split data set into parts of specified sizes using specified random generator 140 * @param rnd random generator 141 * @param parts specific sizes of {@link DataSet} 142 * @return array of {@link DataSet} 143 */ 144 default DataSet<E>[] split(Random rnd, double... parts) { 145 shuffle(rnd); 146 return split(parts); 147 } 148 149 150 /** 151 * Shuffles the data set. 152 */ 153 default void shuffle() { 154 Collections.shuffle(getItems()); 155 } 156 157 /** 158 * Shuffles the data set using the specified random number generator. 159 * @param rnd random generator 160 */ 161 default void shuffle(Random rnd) { 162 Collections.shuffle(getItems(), rnd); 163 } 164 165 /** 166 * Get labels of target/output columns. 167 * @return array with labels of target/output columns 168 */ 169 public String[] getTargetNames(); 170 // also add setTargetNames(String ...) and setTargetColumns(int ...) 171 172 public void setColumnNames(String[] columnNames); 173 174 public String[] getColumnNames(); 175 176 177 public static class Column { 178 private final String name; 179 private final Type type; 180 private final boolean isTarget; 181 182 public Column(String name) { 183 this.name = name; 184 this.type = null; 185 this.isTarget = false; 186 } 187 188 public Column(String name, Type type, boolean isTarget) { 189 this.name = name; 190 this.type = type; 191 this.isTarget = isTarget; 192 } 193 194 public String getName() { 195 return name; 196 } 197 198 public Type getType() { 199 return type; 200 } 201 202 public boolean isTarget() { 203 return isTarget; 204 } 205 } 206 207 public static enum Type { 208 DECIMAL, INTEGER, BINARY, STRING; // ENUM? 209 } 210 211}