@KFStep(name="Join", category="Flow", toolTipText="Performs an inner join on two incoming datasets/instance streams (IMPORTANT: assumes that both datasets are sorted in ascending order of the key fields). If data is not sorted then use a Sorter step to sort both into ascending order of the key fields. Does not handle the case where keys are not unique in one or both inputs.", iconPath="weka/gui/knowledgeflow/icons/Join.gif")
public class Join extends BaseStep
| Modifier and Type | Field and Description |
|---|---|
static java.lang.String |
KEY_SPEC_SEPARATOR
Separator used to separate first and second input key specifications
|
protected java.util.concurrent.atomic.AtomicInteger |
m_count
Used to cycle over the headers in the header pool
|
protected java.util.Queue<Sorter.InstanceHolder> |
m_firstBuffer
Buffer for the first input (capped at 100 for incremental)
|
protected boolean |
m_firstFinished
Whether the first is finished (incremental mode)
|
protected StepManager |
m_firstInput
First source of data
|
protected java.lang.String |
m_firstInputConnectionType
Connection type of the first input
|
protected boolean |
m_firstIsWaiting
True if the first input stream is waiting due to a full buffer (incremental
mode only)
|
protected Instances |
m_headerOne
The structure of the first incoming dataset
|
protected java.util.List<Instances> |
m_headerPool
A set of copied outgoing structure instances.
|
protected Instances |
m_headerTwo
The structure of the second incoming dataset
|
protected int[] |
m_keyIndexesOne
Indexes of the key fields for the first input
|
protected int[] |
m_keyIndexesTwo
Indexes of the key fields for the second input
|
protected java.lang.String |
m_keySpec
Holds the internal representation of the key specification
|
protected Instances |
m_mergedHeader
The structure of the outgoing dataset
|
protected boolean |
m_runningIncrementally
True if the step is running incrementally
|
protected java.util.Queue<Sorter.InstanceHolder> |
m_secondBuffer
Buffer for the second input (capped at 100 for incremental)
|
protected boolean |
m_secondFinished
Whether the second is finished (incremental mode)
|
protected StepManager |
m_secondInput
Second source of data
|
protected java.lang.String |
m_secondInputConnectionType
Connection type of the second input
|
protected boolean |
m_secondIsWaiting
True if the second input stream is waiting due to a full buffer
(incremental mode only)
|
protected Data |
m_streamingData
Reusable data object for streaming output
|
protected java.util.Map<java.lang.String,java.lang.Integer> |
m_stringAttIndexesOne
Holds indexes of string attributes, keyed by attribute name
|
protected java.util.Map<java.lang.String,java.lang.Integer> |
m_stringAttIndexesTwo
Holds indexes of string attributes, keyed by attribute name
|
protected boolean |
m_stringAttsPresent
True if string attributes are present in the incoming data
|
m_stepIsResourceIntensive, m_stepManager, m_stepName

| Constructor and Description |
|---|
Join() |
| Modifier and Type | Method and Description |
|---|---|
protected void |
addToFirstBuffer(Instance inst)
Add an instance to the first buffer
|
protected void |
addToSecondBuffer(Instance inst)
Add an instance to the second buffer
|
protected void |
clearBuffers()
Clear the buffers
|
protected int |
compare(Instance one,
Instance two,
Sorter.InstanceHolder oneH,
Sorter.InstanceHolder twoH)
Compares two instances according to the keys
|
protected void |
establishFirstAndSecondConnectedInputs()
Look for, and configure with respect to, first and second inputs
|
protected void |
generateMergedHeader()
Generate the header of the output instance structure
|
protected Instance |
generateMergedInstance(Sorter.InstanceHolder one,
Sorter.InstanceHolder two)
Generate a merged instance from two input instances that match on the key
fields
|
java.util.List<java.lang.String> |
getConnectedInputNames()
Get the names of the connected steps as a list
|
java.lang.String |
getCustomEditorForStep()
Return the fully qualified name of a custom editor component (JComponent)
to use for editing the properties of the step.
|
Instances |
getFirstInputStructure()
Get the Instances structure being produced by the first input
|
java.util.List<java.lang.String> |
getIncomingConnectionTypes()
Get a list of incoming connection types that this step can accept.
|
java.lang.String |
getKeySpec()
Get the key specification (in internal format -
k11,k12,... for the first input and k21,k22,... for the second, separated by KEY_SPEC_SEPARATOR)
|
java.util.List<java.lang.String> |
getOutgoingConnectionTypes()
Get a list of outgoing connection types that this step can produce.
|
Instances |
getSecondInputStructure()
Get the Instances structure being produced by the second input
|
protected void |
processBatch(Data data)
Process batch data.
|
protected Instance |
processBuffers()
Check both buffers and return a joined instance (if possible at this time)
or null
|
void |
processIncoming(Data data)
Process some incoming data
|
protected void |
processStreaming(Data data)
Handle streaming data
|
void |
setKeySpec(java.lang.String ks)
Set the key specification (in internal format -
k11,k12,... for the first input and k21,k22,... for the second, separated by KEY_SPEC_SEPARATOR)
|
void |
stepInit()
Initialize the step
|
environmentSubstitute, getDefaultSettings, getInteractiveViewers, getInteractiveViewersImpls, getName, getStepManager, globalInfo, isResourceIntensive, isStopRequested, outputStructureForConnectionType, setName, setStepIsResourceIntensive, setStepManager, setStepMustRunSingleThreaded, start, stepMustRunSingleThreaded, stop

public static final java.lang.String KEY_SPEC_SEPARATOR
protected StepManager m_firstInput
protected StepManager m_secondInput
protected transient boolean m_firstFinished
protected transient boolean m_secondFinished
protected java.lang.String m_firstInputConnectionType
protected java.lang.String m_secondInputConnectionType
protected transient java.util.Queue<Sorter.InstanceHolder> m_firstBuffer
protected transient java.util.Queue<Sorter.InstanceHolder> m_secondBuffer
protected Data m_streamingData
protected transient Instances m_headerOne
protected transient Instances m_headerTwo
protected transient Instances m_mergedHeader
protected transient java.util.List<Instances> m_headerPool
protected transient java.util.concurrent.atomic.AtomicInteger m_count
protected boolean m_stringAttsPresent
protected boolean m_runningIncrementally
protected int[] m_keyIndexesOne
protected int[] m_keyIndexesTwo
protected java.lang.String m_keySpec
protected java.util.Map<java.lang.String,java.lang.Integer> m_stringAttIndexesOne
protected java.util.Map<java.lang.String,java.lang.Integer> m_stringAttIndexesTwo
protected boolean m_firstIsWaiting
protected boolean m_secondIsWaiting
public void setKeySpec(java.lang.String ks)
ks - the keys specification
public java.lang.String getKeySpec()
public java.util.List<java.lang.String> getConnectedInputNames()
public Instances getFirstInputStructure() throws WekaException
WekaException - if a problem occurs
public Instances getSecondInputStructure() throws WekaException
WekaException - if a problem occurs
protected void establishFirstAndSecondConnectedInputs()
public void stepInit()
throws WekaException
WekaException - if a problem occurs
public void processIncoming(Data data) throws WekaException
processIncoming in interface BaseStepExtender
processIncoming in interface Step
processIncoming in class BaseStep
data - the data to process
WekaException - if a problem occurs
protected void processStreaming(Data data) throws WekaException
data - an instance of streaming data
WekaException - if a problem occurs
protected void addToFirstBuffer(Instance inst)
inst - the instance to add
protected void addToSecondBuffer(Instance inst)
inst - the instance to add
protected void clearBuffers()
throws WekaException
WekaException - if a problem occurs
protected void processBatch(Data data) throws WekaException
data - the data to process
WekaException - if a problem occurs
protected Instance processBuffers()
protected int compare(Instance one, Instance two, Sorter.InstanceHolder oneH, Sorter.InstanceHolder twoH)
one - the first instance
two - the second instance
oneH - the first instance holder (in case string attributes are present and we are running incrementally)
twoH - the second instance holder
protected Instance generateMergedInstance(Sorter.InstanceHolder one, Sorter.InstanceHolder two)
one - the first input instance
two - the second input instance
protected void generateMergedHeader()
throws WekaException
WekaException
public java.util.List<java.lang.String> getIncomingConnectionTypes()
public java.util.List<java.lang.String> getOutgoingConnectionTypes()
public java.lang.String getCustomEditorForStep()
getCustomEditorForStep in interface Step
getCustomEditorForStep in class BaseStep