object OrcUtils extends Logging
- Alphabetic
- By Inheritance
- OrcUtils
- Logging
- AnyRef
- Any
- Hide All
- Show All
- Public
- All
Value Members
-
final
def
!=(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
final
def
##(): Int
- Definition Classes
- AnyRef → Any
-
final
def
==(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
- val CATALYST_TYPE_ATTRIBUTE_NAME: String
-
def
addSparkVersionMetadata(writer: Writer): Unit
Adds metadata specifying the Spark version.
-
final
def
asInstanceOf[T0]: T0
- Definition Classes
- Any
-
def
clone(): AnyRef
- Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws( ... ) @native()
-
def
createAggInternalRowFromFooter(reader: Reader, filePath: String, dataSchema: StructType, partitionSchema: StructType, aggregation: Aggregation, aggSchema: StructType, partitionValues: InternalRow): InternalRow
When the partial aggregates (Max/Min/Count) are pushed down to ORC, we don't need to read data from ORC and aggregate at Spark layer.
When the partial aggregates (Max/Min/Count) are pushed down to ORC, we don't need to read data from ORC and aggregate at Spark layer. Instead we want to get the partial aggregates (Max/Min/Count) result using the statistics information from ORC file footer, and then construct an InternalRow from these aggregate results.
NOTE: if statistics is missing from ORC file footer, exception would be thrown.
- returns
Aggregate results in the format of InternalRow
-
final
def
eq(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
-
def
equals(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
- val extensionsForCompressionCodecNames: Map[String, String]
-
def
finalize(): Unit
- Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws( classOf[java.lang.Throwable] )
-
final
def
getClass(): Class[_]
- Definition Classes
- AnyRef → Any
- Annotations
- @native()
-
def
getOrcSchemaString(dt: DataType): String
Given a StructType object, this method converts it to the corresponding string representation in ORC.
-
def
hashCode(): Int
- Definition Classes
- AnyRef → Any
- Annotations
- @native()
- def inferSchema(sparkSession: SparkSession, files: Seq[FileStatus], options: Map[String, String]): Option[StructType]
-
def
initializeLogIfNecessary(isInterpreter: Boolean, silent: Boolean): Boolean
- Attributes
- protected
- Definition Classes
- Logging
-
def
initializeLogIfNecessary(isInterpreter: Boolean): Unit
- Attributes
- protected
- Definition Classes
- Logging
-
final
def
isInstanceOf[T0]: Boolean
- Definition Classes
- Any
-
def
isTraceEnabled(): Boolean
- Attributes
- protected
- Definition Classes
- Logging
- def listOrcFiles(pathStr: String, conf: Configuration): Seq[Path]
-
def
log: Logger
- Attributes
- protected
- Definition Classes
- Logging
-
def
logDebug(msg: ⇒ String, throwable: Throwable): Unit
- Attributes
- protected
- Definition Classes
- Logging
-
def
logDebug(msg: ⇒ String): Unit
- Attributes
- protected
- Definition Classes
- Logging
-
def
logError(msg: ⇒ String, throwable: Throwable): Unit
- Attributes
- protected
- Definition Classes
- Logging
-
def
logError(msg: ⇒ String): Unit
- Attributes
- protected
- Definition Classes
- Logging
-
def
logInfo(msg: ⇒ String, throwable: Throwable): Unit
- Attributes
- protected
- Definition Classes
- Logging
-
def
logInfo(msg: ⇒ String): Unit
- Attributes
- protected
- Definition Classes
- Logging
-
def
logName: String
- Attributes
- protected
- Definition Classes
- Logging
-
def
logTrace(msg: ⇒ String, throwable: Throwable): Unit
- Attributes
- protected
- Definition Classes
- Logging
-
def
logTrace(msg: ⇒ String): Unit
- Attributes
- protected
- Definition Classes
- Logging
-
def
logWarning(msg: ⇒ String, throwable: Throwable): Unit
- Attributes
- protected
- Definition Classes
- Logging
-
def
logWarning(msg: ⇒ String): Unit
- Attributes
- protected
- Definition Classes
- Logging
-
final
def
ne(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
-
final
def
notify(): Unit
- Definition Classes
- AnyRef
- Annotations
- @native()
-
final
def
notifyAll(): Unit
- Definition Classes
- AnyRef
- Annotations
- @native()
-
def
orcResultSchemaString(canPruneCols: Boolean, dataSchema: StructType, resultSchema: StructType, partitionSchema: StructType, conf: Configuration): String
Returns the result schema to read from the ORC file.
Returns the result schema to read from the ORC file. In addition, it sets the schema string to 'orc.mapred.input.schema' so the ORC reader can use it later.
- canPruneCols
Flag to decide whether the pruned columns schema is sent to resultSchema or the entire dataSchema is sent to resultSchema.
- dataSchema
Schema of the orc files.
- resultSchema
Result data schema created after pruning cols.
- partitionSchema
Schema of partitions.
- conf
Hadoop Configuration.
- returns
Returns the result schema as string.
- def orcTypeDescription(dt: DataType): TypeDescription
- def readCatalystSchema(file: Path, conf: Configuration, ignoreCorruptFiles: Boolean): Option[StructType]
-
def
readOrcSchemasInParallel(files: Seq[FileStatus], conf: Configuration, ignoreCorruptFiles: Boolean): Seq[StructType]
Reads ORC file schemas in a multi-threaded manner, using the native version of ORC.
Reads ORC file schemas in a multi-threaded manner, using the native version of ORC. This is visible for testing.
- def readSchema(sparkSession: SparkSession, files: Seq[FileStatus], options: Map[String, String]): Option[StructType]
- def readSchema(file: Path, conf: Configuration, ignoreCorruptFiles: Boolean): Option[TypeDescription]
-
def
requestedColumnIds(isCaseSensitive: Boolean, dataSchema: StructType, requiredSchema: StructType, orcSchema: TypeDescription, conf: Configuration): Option[(Array[Int], Boolean)]
- returns
Returns the combination of requested column ids from the given ORC file and boolean flag to find if the pruneCols is allowed or not. Requested Column id can be -1, which means the requested column doesn't exist in the ORC file. Returns None if the given ORC file is empty.
-
def
supportColumnarReads(dataType: DataType, nestedColumnEnabled: Boolean): Boolean
Checks if dataType supports columnar reads.
Checks if dataType supports columnar reads.
- dataType
Data type of the orc files.
- nestedColumnEnabled
True if columnar reads is enabled for nested column types.
- returns
Returns true if data type supports columnar reads.
-
final
def
synchronized[T0](arg0: ⇒ T0): T0
- Definition Classes
- AnyRef
- def toCatalystSchema(schema: TypeDescription): StructType
-
def
toString(): String
- Definition Classes
- AnyRef → Any
-
final
def
wait(): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... )
-
final
def
wait(arg0: Long, arg1: Int): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... )
-
final
def
wait(arg0: Long): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... ) @native()