@EventDriven @SupportsBatching @InputRequirement(value=INPUT_REQUIRED) @SystemResourceConsideration(resource=MEMORY,description="The HashSet filter type will grow memory space proportionate to the number of unique records processed. The BloomFilter type will use constant memory regardless of the number of records processed.") @SystemResourceConsideration(resource=CPU,description="If a more advanced hash algorithm is chosen, the amount of time required to hash any particular record could increase substantially.") @Tags(value={"text","record","update","change","replace","modify","distinct","unique","filter","hash","dupe","duplicate","dedupe"}) @CapabilityDescription(value="This processor attempts to deduplicate a record set in memory using either a hashset or a bloom filter. It operates on a per-file basis rather than across an entire data set that spans multiple files.") @WritesAttribute(attribute="record.count", description="The number of records processed.") @DynamicProperty(name="RecordPath", value="An expression language statement used to determine how the RecordPath is resolved. The following variables are available: ${field.name}, ${field.value}, ${field.type}", description="The name of each user-defined property must be a valid RecordPath.") @SeeAlso(classNames={"org.apache.nifi.distributed.cache.client.DistributedMapCacheClientService","org.apache.nifi.distributed.cache.server.map.DistributedMapCacheServer","org.apache.nifi.processors.standard.DetectDuplicate"}) public class DeduplicateRecord extends AbstractProcessor
| Modifier and Type | Class and Description |
|---|---|
| private static class | DeduplicateRecord.BloomFilterWrapper |
| private static class | DeduplicateRecord.CacheValue |
| private static class | DeduplicateRecord.DistributedMapCacheClientWrapper |
| private static class | DeduplicateRecord.FilterWrapper |
| private static class | DeduplicateRecord.HashSetFilterWrapper |
| Constructor and Description |
|---|
| DeduplicateRecord() |
| Modifier and Type | Method and Description |
|---|---|
| protected Collection<ValidationResult> | customValidate(ValidationContext context) |
| private String | executeDynamicRecordPaths(ProcessContext context, Record record, FlowFile flowFile) |
| private DeduplicateRecord.FilterWrapper | getFilter(ProcessContext context) |
| Set<Relationship> | getRelationships() |
| protected PropertyDescriptor | getSupportedDynamicPropertyDescriptor(String propertyDescriptorName) |
| List<PropertyDescriptor> | getSupportedPropertyDescriptors() |
| protected void | init(ProcessorInitializationContext context) |
| void | onScheduled(ProcessContext context) |
| void | onTrigger(ProcessContext context, ProcessSession session) |
| private void | sendOrRemove(ProcessSession session, FlowFile outputFlowFile, Relationship targetRelationship, String mimeType, boolean includeZeroRecordFlowFiles, WriteResult writeResult) |
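Methods inherited from class AbstractProcessor: onTrigger
Methods inherited from class AbstractSessionFactoryProcessor: getControllerServiceLookup, getIdentifier, getLogger, getNodeTypeProvider, initialize, isConfigurationRestored, isScheduled, toString, updateConfiguredRestoredTrue, updateScheduledFalse, updateScheduledTrue
Methods inherited from class AbstractConfigurableComponent: equals, getPropertyDescriptor, getPropertyDescriptors, hashCode, onPropertyModified, validate
Methods inherited from class java.lang.Object: clone, finalize, getClass, notify, notifyAll, wait, wait, wait
Methods inherited from interface Processor: isStateful
Methods inherited from interface ConfigurableComponent: getPropertyDescriptor, getPropertyDescriptors, onPropertyModified, validate
public static final char JOIN_CHAR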
private static final String FIELD_NAME
private static final String FIELD_VALUE
private static final String FIELD_TYPE
private volatile RecordPathCache recordPathCache
private volatile List<PropertyDescriptor> dynamicProperties
static final AllowableValue NONE_ALGORITHM_VALUE
static final AllowableValue SHA256_ALGORITHM_VALUE
static final AllowableValue SHA512_ALGORITHM_VALUE
static final AllowableValue HASH_SET_VALUE
static final AllowableValue BLOOM_FILTER_VALUE
static final PropertyDescriptor RECORD_READER
static final PropertyDescriptor RECORD_WRITER
static final AllowableValue OPTION_SINGLE_FILE
static final AllowableValue OPTION_MULTIPLE_FILES
static final PropertyDescriptor DEDUPLICATION_STRATEGY
static final PropertyDescriptor DISTRIBUTED_MAP_CACHE
static final PropertyDescriptor CACHE_IDENTIFIER
static final PropertyDescriptor INCLUDE_ZERO_RECORD_FLOWFILES
static final PropertyDescriptor RECORD_HASHING_ALGORITHM
static final PropertyDescriptor FILTER_TYPE
static final PropertyDescriptor FILTER_CAPACITY_HINT
static final PropertyDescriptor BLOOM_FILTER_FPP
static final Relationship REL_DUPLICATE
static final Relationship REL_NON_DUPLICATE
static final Relationship REL_ORIGINAL
static final Relationship REL_FAILURE
private List<PropertyDescriptor> descriptors
private Set<Relationship> relationships
private DistributedMapCacheClient mapCacheClient
private RecordReaderFactory readerFactory
private RecordSetWriterFactory writerFactory
private boolean useInMemoryStrategy
private static final Serializer<String> STRING_SERIALIZER
private static final Serializer<Boolean> BOOLEAN_SERIALIZER
protected void init(ProcessorInitializationContext context)
Overrides: init in class AbstractSessionFactoryProcessor
public Set<Relationship> getRelationships()
Specified by: getRelationships in interface Processor
Overrides: getRelationships in class AbstractSessionFactoryProcessor
public final List<PropertyDescriptor> getSupportedPropertyDescriptors()
Overrides: getSupportedPropertyDescriptors in class AbstractConfigurableComponent
protected PropertyDescriptor getSupportedDynamicPropertyDescriptor(String propertyDescriptorName)
Overrides: getSupportedDynamicPropertyDescriptor in class AbstractConfigurableComponent
protected Collection<ValidationResult> customValidate(ValidationContext context)
Overrides: customValidate in class AbstractConfigurableComponent
@OnScheduled public void onScheduled(ProcessContext context)
private DeduplicateRecord.FilterWrapper getFilter(ProcessContext context)
public void onTrigger(ProcessContext context, ProcessSession session) throws ProcessException
Specified by: onTrigger in class AbstractProcessor
Throws: ProcessException
private void sendOrRemove(ProcessSession session, FlowFile outputFlowFile, Relationship targetRelationship, String mimeType, boolean includeZeroRecordFlowFiles, WriteResult writeResult)
private String executeDynamicRecordPaths(ProcessContext context, Record record, FlowFile flowFile)
Copyright © 2023 Apache NiFi Project. All rights reserved.