public class HadoopFSUtils extends Object
| Constructor and Description |
|---|
HadoopFSUtils() |
| Modifier and Type | Method and Description |
|---|---|
static org.apache.hadoop.fs.Path |
addSchemeIfLocalPath(String path) |
static org.apache.hadoop.fs.Path |
constructAbsolutePathInHadoopPath(String basePath,
String relativePartitionPath) |
static org.apache.hadoop.fs.FileStatus |
convertToHadoopFileStatus(StoragePathInfo pathInfo) |
static org.apache.hadoop.fs.Path |
convertToHadoopPath(StoragePath path) |
static StoragePath |
convertToStoragePath(org.apache.hadoop.fs.Path path) |
static StoragePathInfo |
convertToStoragePathInfo(org.apache.hadoop.fs.FileStatus fileStatus) |
static Map<String,Boolean> |
deleteFilesParallelize(HoodieTableMetaClient metaClient,
List<String> paths,
HoodieEngineContext context,
int parallelism,
boolean ignoreFailed) |
static HoodieFileStatus |
fromFileStatus(org.apache.hadoop.fs.FileStatus fileStatus) |
static HoodieFSPermission |
fromFSPermission(org.apache.hadoop.fs.permission.FsPermission fsPermission) |
static HoodiePath |
fromPath(org.apache.hadoop.fs.Path path) |
static org.apache.hadoop.fs.FileStatus[] |
getAllDataFilesInPartition(org.apache.hadoop.fs.FileSystem fs,
org.apache.hadoop.fs.Path partitionPath)
Get the names of all the base and log files in the given partition path.
|
static String |
getDFSFullPartitionPath(org.apache.hadoop.fs.FileSystem fs,
org.apache.hadoop.fs.Path fullPartitionPath)
Get DFS full partition path (e.g.
|
static String |
getFileIdFromFilePath(org.apache.hadoop.fs.Path filePath)
Check if the file is a base file or a log file.
|
static String |
getFileIdFromLogPath(org.apache.hadoop.fs.Path path)
Get the first part of the file name in the log file.
|
static long |
getFileSize(org.apache.hadoop.fs.FileSystem fs,
org.apache.hadoop.fs.Path path) |
static List<org.apache.hadoop.fs.FileStatus> |
getFileStatusAtLevel(HoodieEngineContext hoodieEngineContext,
org.apache.hadoop.fs.FileSystem fs,
org.apache.hadoop.fs.Path rootPath,
int expectLevel,
int parallelism)
Lists file status at a certain level in the directory hierarchy.
|
static org.apache.hadoop.fs.FileSystem |
getFs(org.apache.hadoop.fs.Path path,
org.apache.hadoop.conf.Configuration conf) |
static <T> org.apache.hadoop.fs.FileSystem |
getFs(org.apache.hadoop.fs.Path path,
StorageConfiguration<T> storageConf) |
static <T> org.apache.hadoop.fs.FileSystem |
getFs(org.apache.hadoop.fs.Path path,
StorageConfiguration<T> storageConf,
boolean newCopy) |
static org.apache.hadoop.fs.FileSystem |
getFs(StoragePath path,
org.apache.hadoop.conf.Configuration conf) |
static org.apache.hadoop.fs.FileSystem |
getFs(String pathStr,
org.apache.hadoop.conf.Configuration conf) |
static org.apache.hadoop.fs.FileSystem |
getFs(String pathStr,
org.apache.hadoop.conf.Configuration conf,
boolean localByDefault) |
static <T> org.apache.hadoop.fs.FileSystem |
getFs(String pathStr,
StorageConfiguration<T> storageConf) |
static org.apache.hadoop.fs.FSDataInputStream |
getFSDataInputStream(org.apache.hadoop.fs.FileSystem fs,
StoragePath filePath,
int bufferSize,
boolean wrapStream)
Fetch the right
FSDataInputStream to be used by wrapping with required input streams. |
static String |
getRelativePartitionPath(org.apache.hadoop.fs.Path basePath,
org.apache.hadoop.fs.Path fullPartitionPath)
Given a base partition and a partition path, return relative path of partition path to the base path.
|
static StorageConfiguration<org.apache.hadoop.conf.Configuration> |
getStorageConf(org.apache.hadoop.conf.Configuration conf) |
static StorageConfiguration<org.apache.hadoop.conf.Configuration> |
getStorageConfWithCopy(org.apache.hadoop.conf.Configuration conf) |
static boolean |
isBaseFile(org.apache.hadoop.fs.Path path) |
static boolean |
isCHDFileSystem(org.apache.hadoop.fs.FileSystem fs)
Chdfs will throw
IOException instead of EOFException. |
static boolean |
isDataFile(org.apache.hadoop.fs.Path path)
Returns true if the given path is a Base file or a Log file.
|
static boolean |
isGCSFileSystem(org.apache.hadoop.fs.FileSystem fs)
This is due to HUDI-140 GCS has a different behavior for detecting EOF during seek().
|
static boolean |
isLogFile(org.apache.hadoop.fs.Path logPath) |
static <T> Map<String,T> |
parallelizeFilesProcess(HoodieEngineContext hoodieEngineContext,
org.apache.hadoop.fs.FileSystem fs,
int parallelism,
FSUtils.SerializableFunction<Pair<String,StorageConfiguration<org.apache.hadoop.conf.Configuration>>,T> pairFunction,
List<String> subPaths) |
static org.apache.hadoop.conf.Configuration |
prepareHadoopConf(org.apache.hadoop.conf.Configuration conf) |
static boolean |
recoverDFSFileLease(org.apache.hadoop.hdfs.DistributedFileSystem dfs,
org.apache.hadoop.fs.Path p)
When a file was opened and the task died without closing the stream, another task executor cannot open because the
existing lease will be active.
|
static org.apache.hadoop.conf.Configuration |
registerFileSystem(StoragePath file,
org.apache.hadoop.conf.Configuration conf) |
static org.apache.hadoop.fs.permission.FsPermission |
toFSPermission(HoodieFSPermission fsPermission) |
static org.apache.hadoop.fs.Path |
toPath(HoodiePath path) |
public static org.apache.hadoop.conf.Configuration prepareHadoopConf(org.apache.hadoop.conf.Configuration conf)
public static StorageConfiguration<org.apache.hadoop.conf.Configuration> getStorageConf(org.apache.hadoop.conf.Configuration conf)
public static StorageConfiguration<org.apache.hadoop.conf.Configuration> getStorageConfWithCopy(org.apache.hadoop.conf.Configuration conf)
public static <T> org.apache.hadoop.fs.FileSystem getFs(String pathStr, StorageConfiguration<T> storageConf)
public static <T> org.apache.hadoop.fs.FileSystem getFs(org.apache.hadoop.fs.Path path,
StorageConfiguration<T> storageConf)
public static <T> org.apache.hadoop.fs.FileSystem getFs(org.apache.hadoop.fs.Path path,
StorageConfiguration<T> storageConf,
boolean newCopy)
public static org.apache.hadoop.fs.FileSystem getFs(String pathStr, org.apache.hadoop.conf.Configuration conf)
public static org.apache.hadoop.fs.FileSystem getFs(StoragePath path, org.apache.hadoop.conf.Configuration conf)
public static org.apache.hadoop.fs.FileSystem getFs(org.apache.hadoop.fs.Path path,
org.apache.hadoop.conf.Configuration conf)
public static org.apache.hadoop.fs.FileSystem getFs(String pathStr, org.apache.hadoop.conf.Configuration conf, boolean localByDefault)
public static org.apache.hadoop.fs.Path addSchemeIfLocalPath(String path)
public static org.apache.hadoop.fs.Path convertToHadoopPath(StoragePath path)
Parameters: path - StoragePath instance.
Returns: Path instance after conversion.
public static StoragePath convertToStoragePath(org.apache.hadoop.fs.Path path)
Parameters: path - Hadoop Path instance.
Returns: StoragePath instance after conversion.
public static StoragePathInfo convertToStoragePathInfo(org.apache.hadoop.fs.FileStatus fileStatus)
Parameters: fileStatus - Hadoop FileStatus instance.
Returns: StoragePathInfo instance after conversion.
public static org.apache.hadoop.fs.FileStatus convertToHadoopFileStatus(StoragePathInfo pathInfo)
Parameters: pathInfo - StoragePathInfo instance.
Returns: FileStatus instance after conversion.
public static org.apache.hadoop.fs.FSDataInputStream getFSDataInputStream(org.apache.hadoop.fs.FileSystem fs,
StoragePath filePath,
int bufferSize,
boolean wrapStream)
Fetch the right FSDataInputStream to be used by wrapping with required input streams.
Parameters: fs - instance of FileSystem in use. filePath - path of the file. bufferSize - buffer size to be used. wrapStream - if false, don't attempt to wrap the stream.
Returns: FSDataInputStream as required.
public static boolean isGCSFileSystem(org.apache.hadoop.fs.FileSystem fs)
Parameters: fs - fileSystem instance.
public static boolean isCHDFileSystem(org.apache.hadoop.fs.FileSystem fs)
IOException instead of EOFException. It will cause error in isBlockCorrupted().
Wrapped by BoundedFsDataInputStream, to check whether the desired offset is out of the file size in advance.
public static org.apache.hadoop.conf.Configuration registerFileSystem(StoragePath file, org.apache.hadoop.conf.Configuration conf)
public static org.apache.hadoop.fs.Path toPath(HoodiePath path)
public static HoodiePath fromPath(org.apache.hadoop.fs.Path path)
public static org.apache.hadoop.fs.permission.FsPermission toFSPermission(HoodieFSPermission fsPermission)
public static HoodieFSPermission fromFSPermission(org.apache.hadoop.fs.permission.FsPermission fsPermission)
public static HoodieFileStatus fromFileStatus(org.apache.hadoop.fs.FileStatus fileStatus)
public static long getFileSize(org.apache.hadoop.fs.FileSystem fs,
org.apache.hadoop.fs.Path path)
throws IOException
Throws: IOException
public static String getRelativePartitionPath(org.apache.hadoop.fs.Path basePath, org.apache.hadoop.fs.Path fullPartitionPath)
public static String getFileIdFromLogPath(org.apache.hadoop.fs.Path path)
public static String getFileIdFromFilePath(org.apache.hadoop.fs.Path filePath)
public static boolean isBaseFile(org.apache.hadoop.fs.Path path)
public static boolean isLogFile(org.apache.hadoop.fs.Path logPath)
public static boolean isDataFile(org.apache.hadoop.fs.Path path)
public static org.apache.hadoop.fs.FileStatus[] getAllDataFilesInPartition(org.apache.hadoop.fs.FileSystem fs,
org.apache.hadoop.fs.Path partitionPath)
throws IOException
Throws: IOException
public static org.apache.hadoop.fs.Path constructAbsolutePathInHadoopPath(String basePath, String relativePartitionPath)
public static String getDFSFullPartitionPath(org.apache.hadoop.fs.FileSystem fs, org.apache.hadoop.fs.Path fullPartitionPath)
public static <T> Map<String,T> parallelizeFilesProcess(HoodieEngineContext hoodieEngineContext, org.apache.hadoop.fs.FileSystem fs, int parallelism, FSUtils.SerializableFunction<Pair<String,StorageConfiguration<org.apache.hadoop.conf.Configuration>>,T> pairFunction, List<String> subPaths)
public static List<org.apache.hadoop.fs.FileStatus> getFileStatusAtLevel(HoodieEngineContext hoodieEngineContext, org.apache.hadoop.fs.FileSystem fs, org.apache.hadoop.fs.Path rootPath, int expectLevel, int parallelism)
E.g., given "/tmp/hoodie_table" as the rootPath, and 3 as the expected level,
this method gives back the FileStatus of all files under
"/tmp/hoodie_table/[*]/[*]/[*]/" folders.
Parameters: hoodieEngineContext - HoodieEngineContext instance. fs - FileSystem instance. rootPath - Root path for the file listing. expectLevel - Expected level of directory hierarchy for files to be added. parallelism - Parallelism for the file listing.
public static Map<String,Boolean> deleteFilesParallelize(HoodieTableMetaClient metaClient, List<String> paths, HoodieEngineContext context, int parallelism, boolean ignoreFailed)
public static boolean recoverDFSFileLease(org.apache.hadoop.hdfs.DistributedFileSystem dfs,
org.apache.hadoop.fs.Path p)
throws IOException,
InterruptedException
Throws: IOException, InterruptedException
Copyright © 2024 The Apache Software Foundation. All rights reserved.