public class ExpireSnapshotsSparkAction
extends java.lang.Object
implements org.apache.iceberg.actions.ExpireSnapshots
An action that performs the same operation as ExpireSnapshots but uses
Spark to determine the delta in files between the pre- and post-expiration table metadata. All of
the same restrictions of ExpireSnapshots also apply to this action.
This action first leverages ExpireSnapshots to expire snapshots and
then uses metadata tables to find files that can be safely deleted. This is done by anti-joining
two Datasets that contain all manifest and content files before and after the expiration. The
snapshot expiration will be fully committed before any deletes are issued.
This operation performs a shuffle so the parallelism can be controlled through 'spark.sql.shuffle.partitions'.
Deletes are still performed locally after retrieving the results from the Spark executors.
| Modifier and Type | Field and Description |
|---|---|
protected static org.apache.iceberg.relocated.com.google.common.base.Joiner |
COMMA_JOINER |
protected static org.apache.iceberg.relocated.com.google.common.base.Splitter |
COMMA_SPLITTER |
protected static java.lang.String |
FILE_PATH |
protected static java.lang.String |
LAST_MODIFIED |
protected static java.lang.String |
MANIFEST |
protected static java.lang.String |
MANIFEST_LIST |
protected static java.lang.String |
OTHERS |
protected static java.lang.String |
STATISTICS_FILES |
static java.lang.String |
STREAM_RESULTS |
static boolean |
STREAM_RESULTS_DEFAULT |
| Modifier and Type | Method and Description |
|---|---|
protected org.apache.spark.sql.Dataset<FileInfo> |
allReachableOtherMetadataFileDS(org.apache.iceberg.Table table) |
protected org.apache.spark.sql.Dataset<FileInfo> |
contentFileDS(org.apache.iceberg.Table table) |
protected org.apache.spark.sql.Dataset<FileInfo> |
contentFileDS(org.apache.iceberg.Table table,
java.util.Set<java.lang.Long> snapshotIds) |
protected org.apache.iceberg.spark.actions.BaseSparkAction.DeleteSummary |
deleteFiles(java.util.concurrent.ExecutorService executorService,
java.util.function.Consumer<java.lang.String> deleteFunc,
java.util.Iterator<FileInfo> files)
Deletes files and keeps track of how many files were removed for each file type.
|
protected org.apache.iceberg.spark.actions.BaseSparkAction.DeleteSummary |
deleteFiles(org.apache.iceberg.io.SupportsBulkOperations io,
java.util.Iterator<FileInfo> files) |
ExpireSnapshotsSparkAction |
deleteWith(java.util.function.Consumer<java.lang.String> newDeleteFunc) |
org.apache.iceberg.actions.ExpireSnapshots.Result |
execute() |
ExpireSnapshotsSparkAction |
executeDeleteWith(java.util.concurrent.ExecutorService executorService) |
org.apache.spark.sql.Dataset<FileInfo> |
expireFiles()
Expires snapshots and commits the changes to the table, returning a Dataset of files to delete.
|
ExpireSnapshotsSparkAction |
expireOlderThan(long timestampMillis) |
ExpireSnapshotsSparkAction |
expireSnapshotId(long snapshotId) |
protected org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> |
loadMetadataTable(org.apache.iceberg.Table table,
org.apache.iceberg.MetadataTableType type) |
protected org.apache.spark.sql.Dataset<FileInfo> |
manifestDS(org.apache.iceberg.Table table) |
protected org.apache.spark.sql.Dataset<FileInfo> |
manifestDS(org.apache.iceberg.Table table,
java.util.Set<java.lang.Long> snapshotIds) |
protected org.apache.spark.sql.Dataset<FileInfo> |
manifestListDS(org.apache.iceberg.Table table) |
protected org.apache.spark.sql.Dataset<FileInfo> |
manifestListDS(org.apache.iceberg.Table table,
java.util.Set<java.lang.Long> snapshotIds) |
protected JobGroupInfo |
newJobGroupInfo(java.lang.String groupId,
java.lang.String desc) |
protected org.apache.iceberg.Table |
newStaticTable(org.apache.iceberg.TableMetadata metadata,
org.apache.iceberg.io.FileIO io) |
ThisT |
option(java.lang.String name,
java.lang.String value) |
protected java.util.Map<java.lang.String,java.lang.String> |
options() |
ThisT |
options(java.util.Map<java.lang.String,java.lang.String> newOptions) |
protected org.apache.spark.sql.Dataset<FileInfo> |
otherMetadataFileDS(org.apache.iceberg.Table table) |
ExpireSnapshotsSparkAction |
retainLast(int numSnapshots) |
protected ExpireSnapshotsSparkAction |
self() |
protected org.apache.spark.sql.SparkSession |
spark() |
protected org.apache.spark.api.java.JavaSparkContext |
sparkContext() |
protected org.apache.spark.sql.Dataset<FileInfo> |
statisticsFileDS(org.apache.iceberg.Table table,
java.util.Set<java.lang.Long> snapshotIds) |
protected <T> T |
withJobGroupInfo(JobGroupInfo info,
java.util.function.Supplier<T> supplier) |
public static final java.lang.String STREAM_RESULTS
public static final boolean STREAM_RESULTS_DEFAULT
protected static final java.lang.String MANIFEST
protected static final java.lang.String MANIFEST_LIST
protected static final java.lang.String STATISTICS_FILES
protected static final java.lang.String OTHERS
protected static final java.lang.String FILE_PATH
protected static final java.lang.String LAST_MODIFIED
protected static final org.apache.iceberg.relocated.com.google.common.base.Splitter COMMA_SPLITTER
protected static final org.apache.iceberg.relocated.com.google.common.base.Joiner COMMA_JOINER
protected ExpireSnapshotsSparkAction self()
public ExpireSnapshotsSparkAction executeDeleteWith(java.util.concurrent.ExecutorService executorService)
executeDeleteWith in interface org.apache.iceberg.actions.ExpireSnapshots
public ExpireSnapshotsSparkAction expireSnapshotId(long snapshotId)
expireSnapshotId in interface org.apache.iceberg.actions.ExpireSnapshots
public ExpireSnapshotsSparkAction expireOlderThan(long timestampMillis)
expireOlderThan in interface org.apache.iceberg.actions.ExpireSnapshots
public ExpireSnapshotsSparkAction retainLast(int numSnapshots)
retainLast in interface org.apache.iceberg.actions.ExpireSnapshots
public ExpireSnapshotsSparkAction deleteWith(java.util.function.Consumer<java.lang.String> newDeleteFunc)
deleteWith in interface org.apache.iceberg.actions.ExpireSnapshots
public org.apache.spark.sql.Dataset<FileInfo> expireFiles()
This does not delete data files. To delete data files, run execute().
This may be called before or after execute() to return the expired files.
public org.apache.iceberg.actions.ExpireSnapshots.Result execute()
execute in interface org.apache.iceberg.actions.Action<org.apache.iceberg.actions.ExpireSnapshots,org.apache.iceberg.actions.ExpireSnapshots.Result>
protected org.apache.spark.sql.SparkSession spark()
protected org.apache.spark.api.java.JavaSparkContext sparkContext()
public ThisT option(java.lang.String name,
java.lang.String value)
public ThisT options(java.util.Map<java.lang.String,java.lang.String> newOptions)
protected java.util.Map<java.lang.String,java.lang.String> options()
protected <T> T withJobGroupInfo(JobGroupInfo info, java.util.function.Supplier<T> supplier)
protected JobGroupInfo newJobGroupInfo(java.lang.String groupId, java.lang.String desc)
protected org.apache.iceberg.Table newStaticTable(org.apache.iceberg.TableMetadata metadata,
org.apache.iceberg.io.FileIO io)
protected org.apache.spark.sql.Dataset<FileInfo> contentFileDS(org.apache.iceberg.Table table)
protected org.apache.spark.sql.Dataset<FileInfo> contentFileDS(org.apache.iceberg.Table table, java.util.Set<java.lang.Long> snapshotIds)
protected org.apache.spark.sql.Dataset<FileInfo> manifestDS(org.apache.iceberg.Table table)
protected org.apache.spark.sql.Dataset<FileInfo> manifestDS(org.apache.iceberg.Table table, java.util.Set<java.lang.Long> snapshotIds)
protected org.apache.spark.sql.Dataset<FileInfo> manifestListDS(org.apache.iceberg.Table table)
protected org.apache.spark.sql.Dataset<FileInfo> manifestListDS(org.apache.iceberg.Table table, java.util.Set<java.lang.Long> snapshotIds)
protected org.apache.spark.sql.Dataset<FileInfo> statisticsFileDS(org.apache.iceberg.Table table, java.util.Set<java.lang.Long> snapshotIds)
protected org.apache.spark.sql.Dataset<FileInfo> otherMetadataFileDS(org.apache.iceberg.Table table)
protected org.apache.spark.sql.Dataset<FileInfo> allReachableOtherMetadataFileDS(org.apache.iceberg.Table table)
protected org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> loadMetadataTable(org.apache.iceberg.Table table,
org.apache.iceberg.MetadataTableType type)
protected org.apache.iceberg.spark.actions.BaseSparkAction.DeleteSummary deleteFiles(java.util.concurrent.ExecutorService executorService,
java.util.function.Consumer<java.lang.String> deleteFunc,
java.util.Iterator<FileInfo> files)
executorService - an executor service to use for parallel deletes
deleteFunc - a delete function
files - an iterator of Spark rows of the structure (path: String, type: String)
protected org.apache.iceberg.spark.actions.BaseSparkAction.DeleteSummary deleteFiles(org.apache.iceberg.io.SupportsBulkOperations io,
java.util.Iterator<FileInfo> files)