class DeltaInputFormat extends FileInputFormat[NullWritable, ArrayWritable]
A special InputFormat to wrap ParquetInputFormat to read a Delta table.
The underlying files in a Delta table are in Parquet format. However, we cannot use the existing ParquetInputFormat to read them directly, because the Parquet files only store values for the data columns. The values of the partition columns are stored in Delta's metadata. Hence, we need to read the partition values from Delta's metadata and re-assemble each row so that it includes both the partition values and the data values from the raw Parquet files.
Note: We cannot use the file name to infer partition values because the Delta Transaction Log Protocol requires that "Actual partition values for a file must be read from the transaction log".
In the current implementation, when listing files, we also read the partition values and put them
into an Array[PartitionColumnInfo]. We then create a temporary Map that stores the mapping from each
file path to its PartitionColumnInfos. When creating an InputSplit, we create a special
FileSplit called DeltaInputSplit to carry the PartitionColumnInfos along with the split.
For each reader created from a DeltaInputSplit, we can get the types of all partition columns, the location of each partition column in the schema, and their string values. The reader can then build an org.apache.hadoop.io.Writable for each partition value and insert them into the raw row returned by org.apache.parquet.hadoop.ParquetRecordReader.
- Alphabetic
- By Inheritance
- DeltaInputFormat
- FileInputFormat
- InputFormat
- AnyRef
- Any
- Hide All
- Show All
- Public
- All
Instance Constructors
Value Members
-
final
def
!=(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
final
def
##(): Int
- Definition Classes
- AnyRef → Any
-
final
def
==(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
def
addInputPathRecursively(arg0: List[FileStatus], arg1: FileSystem, arg2: Path, arg3: PathFilter): Unit
- Attributes
- protected[mapred]
- Definition Classes
- FileInputFormat
- Annotations
- @throws( classOf[java.io.IOException] )
-
final
def
asInstanceOf[T0]: T0
- Definition Classes
- Any
-
def
clone(): AnyRef
- Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws( ... ) @native()
-
def
computeSplitSize(arg0: Long, arg1: Long, arg2: Long): Long
- Attributes
- protected[mapred]
- Definition Classes
- FileInputFormat
-
final
def
eq(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
-
def
equals(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
def
finalize(): Unit
- Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws( classOf[java.lang.Throwable] )
-
def
getBlockIndex(arg0: Array[BlockLocation], arg1: Long): Int
- Attributes
- protected[mapred]
- Definition Classes
- FileInputFormat
-
final
def
getClass(): Class[_]
- Definition Classes
- AnyRef → Any
- Annotations
- @native()
-
def
getRecordReader(split: InputSplit, job: JobConf, reporter: Reporter): RecordReader[NullWritable, ArrayWritable]
- Definition Classes
- DeltaInputFormat → FileInputFormat → InputFormat
-
def
getSplitHosts(arg0: Array[BlockLocation], arg1: Long, arg2: Long, arg3: NetworkTopology): Array[String]
- Attributes
- protected[mapred]
- Definition Classes
- FileInputFormat
- Annotations
- @throws( classOf[java.io.IOException] )
-
def
getSplits(job: JobConf, numSplits: Int): Array[InputSplit]
- Definition Classes
- DeltaInputFormat → FileInputFormat → InputFormat
-
def
hashCode(): Int
- Definition Classes
- AnyRef → Any
- Annotations
- @native()
-
final
def
isInstanceOf[T0]: Boolean
- Definition Classes
- Any
-
def
isSplitable(arg0: FileSystem, arg1: Path): Boolean
- Attributes
- protected[mapred]
- Definition Classes
- FileInputFormat
-
def
listStatus(job: JobConf): Array[FileStatus]
- Definition Classes
- DeltaInputFormat → FileInputFormat
- Annotations
- @throws( classOf[IOException] )
-
def
makeSplit(file: Path, start: Long, length: Long, hosts: Array[String], inMemoryHosts: Array[String]): FileSplit
- Definition Classes
- DeltaInputFormat → FileInputFormat
-
def
makeSplit(file: Path, start: Long, length: Long, hosts: Array[String]): FileSplit
- Definition Classes
- DeltaInputFormat → FileInputFormat
-
final
def
ne(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
-
final
def
notify(): Unit
- Definition Classes
- AnyRef
- Annotations
- @native()
-
final
def
notifyAll(): Unit
- Definition Classes
- AnyRef
- Annotations
- @native()
-
def
setMinSplitSize(arg0: Long): Unit
- Attributes
- protected[mapred]
- Definition Classes
- FileInputFormat
-
final
def
synchronized[T0](arg0: ⇒ T0): T0
- Definition Classes
- AnyRef
-
def
toString(): String
- Definition Classes
- AnyRef → Any
-
final
def
wait(): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... )
-
final
def
wait(arg0: Long, arg1: Int): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... )
-
final
def
wait(arg0: Long): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... ) @native()