public class HoodieInputFormatUtils extends Object
| Modifier and Type | Field and Description |
|---|---|
static int |
HOODIE_COMMIT_TIME_COL_POS |
static int |
HOODIE_PARTITION_PATH_COL_POS |
static String |
HOODIE_READ_COLUMNS_PROP |
static int |
HOODIE_RECORD_KEY_COL_POS |
| Constructor and Description |
|---|
HoodieInputFormatUtils() |
| Modifier and Type | Method and Description |
|---|---|
static HoodieMetadataConfig |
buildMetadataConfig(org.apache.hadoop.conf.Configuration conf) |
static List<org.apache.hadoop.fs.FileStatus> |
filterIncrementalFileStatus(org.apache.hadoop.mapreduce.Job job,
HoodieTableMetaClient tableMetaClient,
HoodieTimeline timeline,
org.apache.hadoop.fs.FileStatus[] fileStatuses,
List<HoodieInstant> commitsToCheck)
Filter a list of FileStatus based on commitsToCheck for incremental view.
|
static HoodieDefaultTimeline |
filterInstantsTimeline(HoodieDefaultTimeline timeline)
Filter any specific instants that we do not want to process.
|
static Option<String> |
getAffectedPartitions(List<HoodieInstant> commitsToCheck,
HoodieTableMetaClient tableMetaClient,
HoodieTimeline timeline,
List<org.apache.hadoop.fs.Path> inputPaths)
Extract partitions touched by the commitsToCheck.
|
static HoodieCommitMetadata |
getCommitMetadata(HoodieInstant instant,
HoodieTimeline timeline)
Returns the commit metadata of the given instant.
|
static Option<List<HoodieInstant>> |
getCommitsForIncrementalQuery(org.apache.hadoop.mapreduce.Job job,
String tableName,
HoodieTimeline timeline)
Get commits for incremental query from Hive map reduce configuration.
|
static org.apache.hadoop.fs.FileStatus |
getFileStatus(HoodieBaseFile baseFile) |
static Option<HoodieTimeline> |
getFilteredCommitsTimeline(org.apache.hadoop.mapreduce.JobContext job,
HoodieTableMetaClient tableMetaClient)
Extract HoodieTimeline based on HoodieTableMetaClient.
|
static HoodieTimeline |
getHoodieTimelineForIncrementalQuery(org.apache.hadoop.mapreduce.JobContext job,
String tableName,
HoodieTimeline timeline)
Get HoodieTimeline for incremental query from Hive map reduce configuration.
|
static org.apache.hadoop.mapred.FileInputFormat |
getInputFormat(HoodieFileFormat baseFileFormat,
boolean realtime,
org.apache.hadoop.conf.Configuration conf) |
static org.apache.hadoop.mapred.FileInputFormat |
getInputFormat(String path,
boolean realtime,
org.apache.hadoop.conf.Configuration conf) |
static String |
getInputFormatClassName(HoodieFileFormat baseFileFormat,
boolean realtime) |
static String |
getOutputFormatClassName(HoodieFileFormat baseFileFormat) |
static String |
getSerDeClassName(HoodieFileFormat baseFileFormat) |
static Map<org.apache.hadoop.fs.Path,HoodieTableMetaClient> |
getTableMetaClientByPartitionPath(org.apache.hadoop.conf.Configuration conf,
Set<org.apache.hadoop.fs.Path> partitions)
Extract HoodieTableMetaClient by partition path.
|
static HoodieTableMetaClient |
getTableMetaClientForBasePathUnchecked(org.apache.hadoop.conf.Configuration conf,
org.apache.hadoop.fs.Path partitionPath)
Extract HoodieTableMetaClient from a partition path (not base path).
|
static Set<String> |
getWritePartitionPaths(List<HoodieCommitMetadata> metadataList)
Returns all the incremental write partition paths as a set with the given commits metadata.
|
static Map<HoodieTableMetaClient,List<org.apache.hadoop.fs.FileStatus>> |
groupFileStatusForSnapshotPaths(org.apache.hadoop.fs.FileStatus[] fileStatuses,
String fileExtension,
Collection<HoodieTableMetaClient> metaClientList)
Takes in a list of FileStatus objects and a list of table meta clients.
|
static Map<HoodieTableMetaClient,List<org.apache.hadoop.fs.Path>> |
groupSnapshotPathsByMetaClient(Collection<HoodieTableMetaClient> metaClientList,
List<org.apache.hadoop.fs.Path> snapshotPaths) |
static org.apache.hadoop.fs.FileStatus[] |
listAffectedFilesForCommits(org.apache.hadoop.conf.Configuration hadoopConf,
org.apache.hadoop.fs.Path basePath,
List<HoodieCommitMetadata> metadataList)
Iterate through a list of commit metadata in natural order, and extract the file status of
all affected files from the commits metadata grouping by file full path.
|
static HoodieBaseFile |
refreshFileStatus(org.apache.hadoop.conf.Configuration conf,
HoodieBaseFile dataFile)
Checks the file status for a race condition which can set the file size to 0.
|
public static final int HOODIE_COMMIT_TIME_COL_POS
public static final int HOODIE_RECORD_KEY_COL_POS
public static final int HOODIE_PARTITION_PATH_COL_POS
public static final String HOODIE_READ_COLUMNS_PROP
public static org.apache.hadoop.mapred.FileInputFormat getInputFormat(HoodieFileFormat baseFileFormat, boolean realtime, org.apache.hadoop.conf.Configuration conf)
public static String getInputFormatClassName(HoodieFileFormat baseFileFormat, boolean realtime)
public static String getOutputFormatClassName(HoodieFileFormat baseFileFormat)
public static String getSerDeClassName(HoodieFileFormat baseFileFormat)
public static org.apache.hadoop.mapred.FileInputFormat getInputFormat(String path, boolean realtime, org.apache.hadoop.conf.Configuration conf)
public static HoodieDefaultTimeline filterInstantsTimeline(HoodieDefaultTimeline timeline)
Parameters: timeline -

public static Option<String> getAffectedPartitions(List<HoodieInstant> commitsToCheck, HoodieTableMetaClient tableMetaClient, HoodieTimeline timeline, List<org.apache.hadoop.fs.Path> inputPaths) throws IOException
Parameters: commitsToCheck - tableMetaClient - timeline - inputPaths -
Throws: IOException

public static Option<HoodieTimeline> getFilteredCommitsTimeline(org.apache.hadoop.mapreduce.JobContext job, HoodieTableMetaClient tableMetaClient)
Parameters: job - tableMetaClient -

public static Option<List<HoodieInstant>> getCommitsForIncrementalQuery(org.apache.hadoop.mapreduce.Job job, String tableName, HoodieTimeline timeline)
Parameters: job - tableName - timeline -

public static HoodieTimeline getHoodieTimelineForIncrementalQuery(org.apache.hadoop.mapreduce.JobContext job, String tableName, HoodieTimeline timeline)
Parameters: job - tableName - timeline -

public static Map<org.apache.hadoop.fs.Path,HoodieTableMetaClient> getTableMetaClientByPartitionPath(org.apache.hadoop.conf.Configuration conf, Set<org.apache.hadoop.fs.Path> partitions)
Parameters: conf - The hadoop conf; partitions - The partitions

public static HoodieTableMetaClient getTableMetaClientForBasePathUnchecked(org.apache.hadoop.conf.Configuration conf, org.apache.hadoop.fs.Path partitionPath) throws IOException
Throws: IOException

public static org.apache.hadoop.fs.FileStatus getFileStatus(HoodieBaseFile baseFile) throws IOException
Throws: IOException

public static List<org.apache.hadoop.fs.FileStatus> filterIncrementalFileStatus(org.apache.hadoop.mapreduce.Job job, HoodieTableMetaClient tableMetaClient, HoodieTimeline timeline, org.apache.hadoop.fs.FileStatus[] fileStatuses, List<HoodieInstant> commitsToCheck) throws IOException
Parameters: job - tableMetaClient - timeline - fileStatuses - commitsToCheck -
Throws: IOException

public static Map<HoodieTableMetaClient,List<org.apache.hadoop.fs.FileStatus>> groupFileStatusForSnapshotPaths(org.apache.hadoop.fs.FileStatus[] fileStatuses, String fileExtension, Collection<HoodieTableMetaClient> metaClientList)
Parameters: fileStatuses - fileExtension - metaClientList -
Throws: IOException

public static Map<HoodieTableMetaClient,List<org.apache.hadoop.fs.Path>> groupSnapshotPathsByMetaClient(Collection<HoodieTableMetaClient> metaClientList, List<org.apache.hadoop.fs.Path> snapshotPaths)
public static HoodieMetadataConfig buildMetadataConfig(org.apache.hadoop.conf.Configuration conf)
public static HoodieBaseFile refreshFileStatus(org.apache.hadoop.conf.Configuration conf, HoodieBaseFile dataFile)
Parameters: conf - dataFile -

public static org.apache.hadoop.fs.FileStatus[] listAffectedFilesForCommits(org.apache.hadoop.conf.Configuration hadoopConf, org.apache.hadoop.fs.Path basePath, List<HoodieCommitMetadata> metadataList)
Parameters: basePath - The table base path; metadataList - The metadata list to read the data from

public static Set<String> getWritePartitionPaths(List<HoodieCommitMetadata> metadataList)
Parameters: metadataList - The commits metadata

public static HoodieCommitMetadata getCommitMetadata(HoodieInstant instant, HoodieTimeline timeline) throws IOException
Parameters: instant - The hoodie instant; timeline - The timeline
Throws: IOException

Copyright © 2022 The Apache Software Foundation. All rights reserved.