c

io.delta.hive

DeltaInputFormat

class DeltaInputFormat extends FileInputFormat[NullWritable, ArrayWritable]

A special InputFormat to wrap ParquetInputFormat to read a Delta table.

The underlying files in a Delta table are in Parquet format. However, we cannot use the existing ParquetInputFormat to read them directly because they only store data for data columns. The values of partition columns are in Delta's metadata. Hence, we need to read the partition values from Delta's metadata and re-assemble rows so that they include both the partition values and the data values from the raw Parquet files.

Note: We cannot use the file name to infer partition values because Delta Transaction Log Protocol requires "Actual partition values for a file must be read from the transaction log".

In the current implementation, when listing files, we also read the partition values and put them into an Array[PartitionColumnInfo]. We then create a temporary Map to store the mapping from each file path to its PartitionColumnInfos. When creating an InputSplit, we create a special FileSplit called DeltaInputSplit to carry over the PartitionColumnInfos.

For each reader created from a DeltaInputSplit, we can get all partition column types, the location of each partition column in the schema, and their string values. The reader can build an org.apache.hadoop.io.Writable for each partition value and insert them into the raw row returned by org.apache.parquet.hadoop.ParquetRecordReader.

Linear Supertypes
FileInputFormat[NullWritable, ArrayWritable], InputFormat[NullWritable, ArrayWritable], AnyRef, Any
Ordering
  1. Alphabetic
  2. By Inheritance
Inherited
  1. DeltaInputFormat
  2. FileInputFormat
  3. InputFormat
  4. AnyRef
  5. Any
  1. Hide All
  2. Show All
Visibility
  1. Public
  2. All

Instance Constructors

  1. new DeltaInputFormat()
  2. new DeltaInputFormat(realInput: ParquetInputFormat[ArrayWritable])

Value Members

  1. final def !=(arg0: Any): Boolean
    Definition Classes
    AnyRef → Any
  2. final def ##(): Int
    Definition Classes
    AnyRef → Any
  3. final def ==(arg0: Any): Boolean
    Definition Classes
    AnyRef → Any
  4. def addInputPathRecursively(arg0: List[FileStatus], arg1: FileSystem, arg2: Path, arg3: PathFilter): Unit
    Attributes
    protected[mapred]
    Definition Classes
    FileInputFormat
    Annotations
    @throws( classOf[java.io.IOException] )
  5. final def asInstanceOf[T0]: T0
    Definition Classes
    Any
  6. def clone(): AnyRef
    Attributes
    protected[lang]
    Definition Classes
    AnyRef
    Annotations
    @throws( ... ) @native()
  7. def computeSplitSize(arg0: Long, arg1: Long, arg2: Long): Long
    Attributes
    protected[mapred]
    Definition Classes
    FileInputFormat
  8. final def eq(arg0: AnyRef): Boolean
    Definition Classes
    AnyRef
  9. def equals(arg0: Any): Boolean
    Definition Classes
    AnyRef → Any
  10. def finalize(): Unit
    Attributes
    protected[lang]
    Definition Classes
    AnyRef
    Annotations
    @throws( classOf[java.lang.Throwable] )
  11. def getBlockIndex(arg0: Array[BlockLocation], arg1: Long): Int
    Attributes
    protected[mapred]
    Definition Classes
    FileInputFormat
  12. final def getClass(): Class[_]
    Definition Classes
    AnyRef → Any
    Annotations
    @native()
  13. def getRecordReader(split: InputSplit, job: JobConf, reporter: Reporter): RecordReader[NullWritable, ArrayWritable]
    Definition Classes
    DeltaInputFormat → FileInputFormat → InputFormat
  14. def getSplitHosts(arg0: Array[BlockLocation], arg1: Long, arg2: Long, arg3: NetworkTopology): Array[String]
    Attributes
    protected[mapred]
    Definition Classes
    FileInputFormat
    Annotations
    @throws( classOf[java.io.IOException] )
  15. def getSplits(job: JobConf, numSplits: Int): Array[InputSplit]
    Definition Classes
    DeltaInputFormat → FileInputFormat → InputFormat
  16. def hashCode(): Int
    Definition Classes
    AnyRef → Any
    Annotations
    @native()
  17. final def isInstanceOf[T0]: Boolean
    Definition Classes
    Any
  18. def isSplitable(arg0: FileSystem, arg1: Path): Boolean
    Attributes
    protected[mapred]
    Definition Classes
    FileInputFormat
  19. def listStatus(job: JobConf): Array[FileStatus]
    Definition Classes
    DeltaInputFormat → FileInputFormat
    Annotations
    @throws( classOf[IOException] )
  20. def makeSplit(file: Path, start: Long, length: Long, hosts: Array[String], inMemoryHosts: Array[String]): FileSplit
    Definition Classes
    DeltaInputFormat → FileInputFormat
  21. def makeSplit(file: Path, start: Long, length: Long, hosts: Array[String]): FileSplit
    Definition Classes
    DeltaInputFormat → FileInputFormat
  22. final def ne(arg0: AnyRef): Boolean
    Definition Classes
    AnyRef
  23. final def notify(): Unit
    Definition Classes
    AnyRef
    Annotations
    @native()
  24. final def notifyAll(): Unit
    Definition Classes
    AnyRef
    Annotations
    @native()
  25. def setMinSplitSize(arg0: Long): Unit
    Attributes
    protected[mapred]
    Definition Classes
    FileInputFormat
  26. final def synchronized[T0](arg0: ⇒ T0): T0
    Definition Classes
    AnyRef
  27. def toString(): String
    Definition Classes
    AnyRef → Any
  28. final def wait(): Unit
    Definition Classes
    AnyRef
    Annotations
    @throws( ... )
  29. final def wait(arg0: Long, arg1: Int): Unit
    Definition Classes
    AnyRef
    Annotations
    @throws( ... )
  30. final def wait(arg0: Long): Unit
    Definition Classes
    AnyRef
    Annotations
    @throws( ... ) @native()

Inherited from FileInputFormat[NullWritable, ArrayWritable]

Inherited from InputFormat[NullWritable, ArrayWritable]

Inherited from AnyRef

Inherited from Any

Ungrouped