public class SparkDistributedDataScan
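extends org.apache.iceberg.SnapshotScan<org.apache.iceberg.BatchScan,org.apache.iceberg.ScanTask,org.apache.iceberg.ScanTaskGroup<org.apache.iceberg.ScanTask>>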
This scan remotely filters manifests, fetching only the relevant data and delete files to the driver. The delete file assignment is done locally after the remote filtering step. Such an approach is beneficial if the remote parallelism is much higher than the number of driver cores.
This scan is best suited for queries with selective filters on lower/upper bounds across all partitions, or against poorly clustered metadata. This allows job planning to benefit from highly concurrent remote filtering while not incurring high serialization and data transfer costs. This class is also useful for full table scans over large tables, but the cost of bringing data and delete file details to the driver may become noticeable. Make sure to follow the performance tips below in such cases.
Ensure the filtered metadata size doesn't exceed the driver's max result size. For large table scans, consider increasing `spark.driver.maxResultSize` to avoid job failures.
Performance tips:
| Modifier and Type | Field and Description |
|---|---|
| protected static java.util.List<java.lang.String> | DELETE_SCAN_COLUMNS |
| protected static java.util.List<java.lang.String> | DELETE_SCAN_WITH_STATS_COLUMNS |
| protected static boolean | PLAN_SCANS_WITH_WORKER_POOL |
| protected static java.util.List<java.lang.String> | SCAN_COLUMNS |
| protected static java.util.List<java.lang.String> | SCAN_WITH_STATS_COLUMNS |
| Constructor and Description |
|---|
| SparkDistributedDataScan(org.apache.spark.sql.SparkSession spark, org.apache.iceberg.Table table, SparkReadConf readConf) |
| Modifier and Type | Method and Description |
|---|---|
| protected java.util.Set<java.lang.Integer> | columnsToKeepStats() |
| protected org.apache.iceberg.TableScanContext | context() |
| protected org.apache.iceberg.PlanningMode | dataPlanningMode() |
| protected org.apache.iceberg.PlanningMode | deletePlanningMode() |
| protected org.apache.iceberg.io.CloseableIterable<org.apache.iceberg.ScanTask> | doPlanFiles() |
| protected org.apache.iceberg.io.FileIO | io() |
| protected org.apache.iceberg.ManifestGroup | newManifestGroup(java.util.List<org.apache.iceberg.ManifestFile> arg0, boolean arg1) |
| protected org.apache.iceberg.ManifestGroup | newManifestGroup(java.util.List<org.apache.iceberg.ManifestFile> arg0, java.util.List<org.apache.iceberg.ManifestFile> arg1) |
| protected org.apache.iceberg.ManifestGroup | newManifestGroup(java.util.List<org.apache.iceberg.ManifestFile> arg0, java.util.List<org.apache.iceberg.ManifestFile> arg1, boolean arg2) |
| protected org.apache.iceberg.BatchScan | newRefinedScan(org.apache.iceberg.Table newTable, org.apache.iceberg.Schema newSchema, org.apache.iceberg.TableScanContext newContext) |
| protected java.util.Map<java.lang.String,java.lang.String> | options() |
| protected java.lang.Iterable<org.apache.iceberg.io.CloseableIterable<org.apache.iceberg.DataFile>> | planDataRemotely(java.util.List<org.apache.iceberg.ManifestFile> dataManifests, boolean withColumnStats) |
| protected org.apache.iceberg.DeleteFileIndex | planDeletesRemotely(java.util.List<org.apache.iceberg.ManifestFile> deleteManifests) |
| protected java.util.concurrent.ExecutorService | planExecutor() |
| org.apache.iceberg.io.CloseableIterable<org.apache.iceberg.ScanTaskGroup<org.apache.iceberg.ScanTask>> | planTasks() |
| protected int | remoteParallelism() |
| protected org.apache.iceberg.expressions.Expression | residualFilter() |
| protected java.util.List<java.lang.String> | scanColumns() |
| protected boolean | shouldCopyRemotelyPlannedDataFiles() |
| protected boolean | shouldIgnoreResiduals() |
| protected boolean | shouldPlanWithExecutor() |
| protected boolean | shouldReturnColumnStats() |
| protected org.apache.iceberg.Schema | tableSchema() |
| protected boolean | useSnapshotSchema() |
planFiles, scanMetrics, snapshot, snapshotId, toString
protected static final java.util.List<java.lang.String> SCAN_COLUMNS
protected static final java.util.List<java.lang.String> SCAN_WITH_STATS_COLUMNS
protected static final java.util.List<java.lang.String> DELETE_SCAN_COLUMNS
protected static final java.util.List<java.lang.String> DELETE_SCAN_WITH_STATS_COLUMNS
protected static final boolean PLAN_SCANS_WITH_WORKER_POOL
public SparkDistributedDataScan(org.apache.spark.sql.SparkSession spark,
org.apache.iceberg.Table table,
SparkReadConf readConf)
protected org.apache.iceberg.BatchScan newRefinedScan(org.apache.iceberg.Table newTable,
org.apache.iceberg.Schema newSchema,
org.apache.iceberg.TableScanContext newContext)
protected int remoteParallelism()
protected org.apache.iceberg.PlanningMode dataPlanningMode()
protected boolean shouldCopyRemotelyPlannedDataFiles()
protected java.lang.Iterable<org.apache.iceberg.io.CloseableIterable<org.apache.iceberg.DataFile>> planDataRemotely(java.util.List<org.apache.iceberg.ManifestFile> dataManifests,
boolean withColumnStats)
protected org.apache.iceberg.PlanningMode deletePlanningMode()
protected org.apache.iceberg.DeleteFileIndex planDeletesRemotely(java.util.List<org.apache.iceberg.ManifestFile> deleteManifests)
protected org.apache.iceberg.io.CloseableIterable<org.apache.iceberg.ScanTask> doPlanFiles()
doPlanFiles in class org.apache.iceberg.SnapshotScan<org.apache.iceberg.BatchScan,org.apache.iceberg.ScanTask,org.apache.iceberg.ScanTaskGroup<org.apache.iceberg.ScanTask>>
public org.apache.iceberg.io.CloseableIterable<org.apache.iceberg.ScanTaskGroup<org.apache.iceberg.ScanTask>> planTasks()
planTasks in interface org.apache.iceberg.Scan<org.apache.iceberg.BatchScan,org.apache.iceberg.ScanTask,org.apache.iceberg.ScanTaskGroup<org.apache.iceberg.ScanTask>>
protected boolean useSnapshotSchema()
useSnapshotSchema in class org.apache.iceberg.SnapshotScan<ThisT,T extends org.apache.iceberg.ScanTask,G extends org.apache.iceberg.ScanTaskGroup<T>>
protected org.apache.iceberg.ManifestGroup newManifestGroup(java.util.List<org.apache.iceberg.ManifestFile> arg0,
java.util.List<org.apache.iceberg.ManifestFile> arg1)
protected org.apache.iceberg.ManifestGroup newManifestGroup(java.util.List<org.apache.iceberg.ManifestFile> arg0,
boolean arg1)
protected org.apache.iceberg.ManifestGroup newManifestGroup(java.util.List<org.apache.iceberg.ManifestFile> arg0,
java.util.List<org.apache.iceberg.ManifestFile> arg1,
boolean arg2)
protected org.apache.iceberg.io.FileIO io()
protected org.apache.iceberg.Schema tableSchema()
protected org.apache.iceberg.TableScanContext context()
protected java.util.Map<java.lang.String,java.lang.String> options()
protected java.util.List<java.lang.String> scanColumns()
protected boolean shouldReturnColumnStats()
protected java.util.Set<java.lang.Integer> columnsToKeepStats()
protected boolean shouldIgnoreResiduals()
protected org.apache.iceberg.expressions.Expression residualFilter()
protected boolean shouldPlanWithExecutor()
protected java.util.concurrent.ExecutorService planExecutor()