public final class DHistogram
extends water.Iced
A DHistogram bins every value added to it, and computes the
vec min and max (for use in the next split), and response mean and variance
for each bin. DHistograms are initialized with a min, max and
number-of-elements to be added (all of which are generally available from
a Vec). Bins run from min to max in uniform sizes. If the DHistogram can determine that fewer bins are needed (e.g. boolean columns
run from 0 to 1, but only ever take on 2 values, so only 2 bins are
needed), then fewer bins are used.
DHistograms are shared per-node, and atomically updated. There's
an add call to help cross-node reductions. The data is stored in
primitive arrays, so it can be sent over the wire.
If we are successively splitting rows (e.g. in a decision tree), then a
fresh DHistogram for each split will dynamically re-bin the data.
Each successive split will logarithmically divide the data. At the first
split, outliers will end up in their own bins - but perhaps some central
bins may be very full. At the next split(s), the full bins will get split,
and again until (with a log number of splits) each bin holds roughly the
same amount of data. This dynamic binning resolves a lot of problems with
picking the proper bin count or limits - generally a few more tree levels
will equal any fancy but fixed-size binning strategy.
| Modifier and Type | Field and Description |
|---|---|
double[] |
_bins |
byte |
_isInt |
double |
_maxEx |
protected double |
_maxIn |
double |
_min |
protected double |
_min2 |
double |
_minSplitImprovement |
java.lang.String |
_name |
char |
_nbin |
double |
_step |
| Constructor and Description |
|---|
DHistogram(java.lang.String name,
int nbins,
int nbins_cats,
byte isInt,
double min,
double maxEx,
double minSplitImprovement) |
| Modifier and Type | Method and Description |
|---|---|
static int[] |
activeColumns(DHistogram[] hist) |
void |
add(DHistogram dsh) |
void |
add0(DHistogram dsh) |
int |
bin(double col_data) |
double |
binAt(int b) |
double |
bins(int b) |
double |
find_maxEx() |
static double |
find_maxEx(double maxIn,
int isInt) |
double |
find_maxIn() |
double |
find_min() |
void |
incr0(int b,
double y,
double w) |
void |
incr1(int b,
double y,
double yy) |
void |
init() |
static DHistogram[] |
initialHist(water.fvec.Frame fr,
int ncols,
int nbins,
int nbins_cats,
double minSplitImprovement,
DHistogram[] hs) |
static DHistogram |
make(java.lang.String name,
int nbins,
int nbins_cats,
byte isInt,
double min,
double maxEx,
double minSplitImprovement) |
int |
nbins() |
DTree.Split |
scoreMSE(int col,
double min_rows,
int nid) |
void |
setMax(double max) |
void |
setMin(double min) |
java.lang.String |
toString() |
double |
var(int b)
compute the sample variance within a given bin
|
public final transient java.lang.String _name
public final double _minSplitImprovement
public final byte _isInt
public final char _nbin
public final double _step
public final double _min
public final double _maxEx
public double[] _bins
protected double _min2
protected double _maxIn
public DHistogram(java.lang.String name,
int nbins,
int nbins_cats,
byte isInt,
double min,
double maxEx,
double minSplitImprovement)
public static int[] activeColumns(DHistogram[] hist)
public void setMin(double min)
public void setMax(double max)
public int bin(double col_data)
public double binAt(int b)
public int nbins()
public double bins(int b)
public void init()
public void add(DHistogram dsh)
public double find_min()
public double find_maxIn()
public double find_maxEx()
public static double find_maxEx(double maxIn,
int isInt)
public static DHistogram[] initialHist(water.fvec.Frame fr, int ncols, int nbins, int nbins_cats, double minSplitImprovement, DHistogram[] hs)
public static DHistogram make(java.lang.String name, int nbins, int nbins_cats, byte isInt, double min, double maxEx, double minSplitImprovement)
public java.lang.String toString()
toString in class java.lang.Object
public double var(int b)
b - bin id
public void incr0(int b,
double y,
double w)
public void incr1(int b,
double y,
double yy)
public void add0(DHistogram dsh)
public DTree.Split scoreMSE(int col, double min_rows, int nid)