public class DeleteOrphanFilesSparkAction
extends java.lang.Object
implements org.apache.iceberg.actions.DeleteOrphanFiles
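An action that removes orphan metadata, data and delete files by listing a given location and
comparing the actual files in that location with content and metadata files referenced by all
valid snapshots. The location must be accessible for listing via the Hadoop FileSystem.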
By default, this action cleans up the table location returned by Table.location() and
removes unreachable files that are older than 3 days using Table.io(). The behavior can
be modified by passing a custom location to location(String) and a custom timestamp to olderThan(long). For example, someone might point this action to the data folder to clean up
only orphan data files.
Configure an alternative delete method using deleteWith(Consumer).
For full control of the set of files being evaluated, use the compareToFileList(Dataset) argument. This skips the directory listing - any files in the
dataset provided which are not found in table metadata will be deleted, using the same Table.location() and olderThan(long) filtering as above.
Note: It is dangerous to call this action with a short retention interval as it might corrupt the state of the table if another operation is writing at the same time.
| Modifier and Type | Class and Description |
|---|---|
| static class | DeleteOrphanFilesSparkAction.FileURI |
| Modifier and Type | Field and Description |
|---|---|
| protected static org.apache.iceberg.relocated.com.google.common.base.Joiner | COMMA_JOINER |
| protected static org.apache.iceberg.relocated.com.google.common.base.Splitter | COMMA_SPLITTER |
| protected static java.lang.String | FILE_PATH |
| protected static java.lang.String | LAST_MODIFIED |
| protected static java.lang.String | MANIFEST |
| protected static java.lang.String | MANIFEST_LIST |
| protected static java.lang.String | OTHERS |
| protected static java.lang.String | STATISTICS_FILES |
| Modifier and Type | Method and Description |
|---|---|
| protected org.apache.spark.sql.Dataset<FileInfo> | allReachableOtherMetadataFileDS(org.apache.iceberg.Table table) |
| DeleteOrphanFilesSparkAction | compareToFileList(org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> files) |
| protected org.apache.spark.sql.Dataset<FileInfo> | contentFileDS(org.apache.iceberg.Table table) |
| protected org.apache.spark.sql.Dataset<FileInfo> | contentFileDS(org.apache.iceberg.Table table, java.util.Set<java.lang.Long> snapshotIds) |
| protected org.apache.iceberg.spark.actions.BaseSparkAction.DeleteSummary | deleteFiles(java.util.concurrent.ExecutorService executorService, java.util.function.Consumer<java.lang.String> deleteFunc, java.util.Iterator<FileInfo> files) - Deletes files and keeps track of how many files were removed for each file type. |
| protected org.apache.iceberg.spark.actions.BaseSparkAction.DeleteSummary | deleteFiles(org.apache.iceberg.io.SupportsBulkOperations io, java.util.Iterator<FileInfo> files) |
| DeleteOrphanFilesSparkAction | deleteWith(java.util.function.Consumer<java.lang.String> newDeleteFunc) |
| DeleteOrphanFilesSparkAction | equalAuthorities(java.util.Map<java.lang.String,java.lang.String> newEqualAuthorities) |
| DeleteOrphanFilesSparkAction | equalSchemes(java.util.Map<java.lang.String,java.lang.String> newEqualSchemes) |
| org.apache.iceberg.actions.DeleteOrphanFiles.Result | execute() |
| DeleteOrphanFilesSparkAction | executeDeleteWith(java.util.concurrent.ExecutorService executorService) |
| protected org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> | loadMetadataTable(org.apache.iceberg.Table table, org.apache.iceberg.MetadataTableType type) |
| DeleteOrphanFilesSparkAction | location(java.lang.String newLocation) |
| protected org.apache.spark.sql.Dataset<FileInfo> | manifestDS(org.apache.iceberg.Table table) |
| protected org.apache.spark.sql.Dataset<FileInfo> | manifestDS(org.apache.iceberg.Table table, java.util.Set<java.lang.Long> snapshotIds) |
| protected org.apache.spark.sql.Dataset<FileInfo> | manifestListDS(org.apache.iceberg.Table table) |
| protected org.apache.spark.sql.Dataset<FileInfo> | manifestListDS(org.apache.iceberg.Table table, java.util.Set<java.lang.Long> snapshotIds) |
| protected JobGroupInfo | newJobGroupInfo(java.lang.String groupId, java.lang.String desc) |
| protected org.apache.iceberg.Table | newStaticTable(org.apache.iceberg.TableMetadata metadata, org.apache.iceberg.io.FileIO io) |
| DeleteOrphanFilesSparkAction | olderThan(long newOlderThanTimestamp) |
| ThisT | option(java.lang.String name, java.lang.String value) |
| protected java.util.Map<java.lang.String,java.lang.String> | options() |
| ThisT | options(java.util.Map<java.lang.String,java.lang.String> newOptions) |
| protected org.apache.spark.sql.Dataset<FileInfo> | otherMetadataFileDS(org.apache.iceberg.Table table) |
| DeleteOrphanFilesSparkAction | prefixMismatchMode(org.apache.iceberg.actions.DeleteOrphanFiles.PrefixMismatchMode newPrefixMismatchMode) |
| protected DeleteOrphanFilesSparkAction | self() |
| protected org.apache.spark.sql.SparkSession | spark() |
| protected org.apache.spark.api.java.JavaSparkContext | sparkContext() |
| protected org.apache.spark.sql.Dataset<FileInfo> | statisticsFileDS(org.apache.iceberg.Table table, java.util.Set<java.lang.Long> snapshotIds) |
| protected <T> T | withJobGroupInfo(JobGroupInfo info, java.util.function.Supplier<T> supplier) |
protected static final java.lang.String MANIFEST
protected static final java.lang.String MANIFEST_LIST
protected static final java.lang.String STATISTICS_FILES
protected static final java.lang.String OTHERS
protected static final java.lang.String FILE_PATH
protected static final java.lang.String LAST_MODIFIED
protected static final org.apache.iceberg.relocated.com.google.common.base.Splitter COMMA_SPLITTER
protected static final org.apache.iceberg.relocated.com.google.common.base.Joiner COMMA_JOINER
protected DeleteOrphanFilesSparkAction self()
public DeleteOrphanFilesSparkAction executeDeleteWith(java.util.concurrent.ExecutorService executorService)
executeDeleteWith in interface org.apache.iceberg.actions.DeleteOrphanFiles
public DeleteOrphanFilesSparkAction prefixMismatchMode(org.apache.iceberg.actions.DeleteOrphanFiles.PrefixMismatchMode newPrefixMismatchMode)
prefixMismatchMode in interface org.apache.iceberg.actions.DeleteOrphanFiles
public DeleteOrphanFilesSparkAction equalSchemes(java.util.Map<java.lang.String,java.lang.String> newEqualSchemes)
equalSchemes in interface org.apache.iceberg.actions.DeleteOrphanFiles
public DeleteOrphanFilesSparkAction equalAuthorities(java.util.Map<java.lang.String,java.lang.String> newEqualAuthorities)
equalAuthorities in interface org.apache.iceberg.actions.DeleteOrphanFiles
public DeleteOrphanFilesSparkAction location(java.lang.String newLocation)
location in interface org.apache.iceberg.actions.DeleteOrphanFiles
public DeleteOrphanFilesSparkAction olderThan(long newOlderThanTimestamp)
olderThan in interface org.apache.iceberg.actions.DeleteOrphanFiles
public DeleteOrphanFilesSparkAction deleteWith(java.util.function.Consumer<java.lang.String> newDeleteFunc)
deleteWith in interface org.apache.iceberg.actions.DeleteOrphanFiles
public DeleteOrphanFilesSparkAction compareToFileList(org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> files)
public org.apache.iceberg.actions.DeleteOrphanFiles.Result execute()
execute in interface org.apache.iceberg.actions.Action<org.apache.iceberg.actions.DeleteOrphanFiles,org.apache.iceberg.actions.DeleteOrphanFiles.Result>
protected org.apache.spark.sql.SparkSession spark()
protected org.apache.spark.api.java.JavaSparkContext sparkContext()
public ThisT option(java.lang.String name,
java.lang.String value)
public ThisT options(java.util.Map<java.lang.String,java.lang.String> newOptions)
protected java.util.Map<java.lang.String,java.lang.String> options()
protected <T> T withJobGroupInfo(JobGroupInfo info, java.util.function.Supplier<T> supplier)
protected JobGroupInfo newJobGroupInfo(java.lang.String groupId, java.lang.String desc)
protected org.apache.iceberg.Table newStaticTable(org.apache.iceberg.TableMetadata metadata,
org.apache.iceberg.io.FileIO io)
protected org.apache.spark.sql.Dataset<FileInfo> contentFileDS(org.apache.iceberg.Table table)
protected org.apache.spark.sql.Dataset<FileInfo> contentFileDS(org.apache.iceberg.Table table, java.util.Set<java.lang.Long> snapshotIds)
protected org.apache.spark.sql.Dataset<FileInfo> manifestDS(org.apache.iceberg.Table table)
protected org.apache.spark.sql.Dataset<FileInfo> manifestDS(org.apache.iceberg.Table table, java.util.Set<java.lang.Long> snapshotIds)
protected org.apache.spark.sql.Dataset<FileInfo> manifestListDS(org.apache.iceberg.Table table)
protected org.apache.spark.sql.Dataset<FileInfo> manifestListDS(org.apache.iceberg.Table table, java.util.Set<java.lang.Long> snapshotIds)
protected org.apache.spark.sql.Dataset<FileInfo> statisticsFileDS(org.apache.iceberg.Table table, java.util.Set<java.lang.Long> snapshotIds)
protected org.apache.spark.sql.Dataset<FileInfo> otherMetadataFileDS(org.apache.iceberg.Table table)
protected org.apache.spark.sql.Dataset<FileInfo> allReachableOtherMetadataFileDS(org.apache.iceberg.Table table)
protected org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> loadMetadataTable(org.apache.iceberg.Table table,
org.apache.iceberg.MetadataTableType type)
protected org.apache.iceberg.spark.actions.BaseSparkAction.DeleteSummary deleteFiles(java.util.concurrent.ExecutorService executorService,
java.util.function.Consumer<java.lang.String> deleteFunc,
java.util.Iterator<FileInfo> files)
executorService - an executor service to use for parallel deletes
deleteFunc - a delete func
files - an iterator of Spark rows of the structure (path: String, type: String)
protected org.apache.iceberg.spark.actions.BaseSparkAction.DeleteSummary deleteFiles(org.apache.iceberg.io.SupportsBulkOperations io,
java.util.Iterator<FileInfo> files)