class FolderCompaction extends Logging
- Alphabetic
- By Inheritance
- FolderCompaction
- Logging
- AnyRef
- Any
- Hide All
- Show All
- Public
- All
Instance Constructors
- new FolderCompaction()
Value Members
-
final
def
!=(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
final
def
##(): Int
- Definition Classes
- AnyRef → Any
-
final
def
==(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
final
def
asInstanceOf[T0]: T0
- Definition Classes
- Any
-
def
clone(): AnyRef
- Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws( ... ) @native() @HotSpotIntrinsicCandidate()
-
def
compact(inputModel: RawModel, outputModel: RawModel, partitions: Map[String, List[String]], numPartitions: Int, spark: SparkSession): Unit
See compact(conf: Config, spark: SparkSession)
- inputModel
the input model to read
- outputModel
the output model to write
- partitions
the partitions to compact and the values of those partitions to include in the compaction
- numPartitions
number of output partitions
- spark
the Spark session
-
def
compact(conf: Config, spark: SparkSession): Unit
Receives the following conf: { "inputModel" : "name of the input model" | ModelConf, "outputModel" : "name of the output model" | ModelConf, "partitions" : { "String1" : ["value1", "value2"], "String2" : ["value1", "value2"] }, "numPartitions" : "integer" }
ModelConf has the following structure:
{
  "name": "a name you like",
  "uri": "URI of the dataset: basePath if the dataset is partitioned",
  "schema": "Spark JSON representation of the schema",
  "timed": true/false,
  "options": {
    "saveMode": "Spark save mode",
    "format": "Spark data format",
    "extraOptions": {          // extra options passed to the Spark reader/writer
      "key": "value"
    },
    "partitionBy": [           // partition columns
      "partitionColumn1", "partitionColumn2", ..., "partitionColumnN"
    ]
  }
}
The function will retrieve or build the two indicated models, check that they are file-based, and check that the indicated columns are partition columns. If the previous requirements are met then it will generate each combination of column values and then perform read, repartition to the "numPartitions" number, write to the outputModel and delete the files read.
- conf
the Config
- spark
the Spark session
-
def
delete(spark: SparkSession, rootDir: String, dfs: List[DataFrame]): Unit
Deletes all files that have been read, and afterwards deletes all folders left empty.
- spark
the Spark session
- rootDir
the root dir all dataframes originated from
- dfs
Dataframes that have been read and filtered
-
def
deleteEmptyPartitionFolders(fs: FileSystem, dataframeRoot: Path, deletedFiles: List[Path]): List[Path]
- fs
the filesystem to be used for deletion
- dataframeRoot
the root path of the dataset
- deletedFiles
the files that have been just deleted from the fs
- returns
the paths that have been deleted
-
final
def
eq(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
-
def
equals(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
final
def
getClass(): Class[_]
- Definition Classes
- AnyRef → Any
- Annotations
- @native() @HotSpotIntrinsicCandidate()
-
def
hashCode(): Int
- Definition Classes
- AnyRef → Any
- Annotations
- @native() @HotSpotIntrinsicCandidate()
-
final
def
isInstanceOf[T0]: Boolean
- Definition Classes
- Any
-
final
def
isParent(child: Path, parentToFind: Path): Boolean
- Annotations
- @tailrec()
-
val
logger: WaspLogger
- Attributes
- protected
- Definition Classes
- Logging
-
final
def
ne(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
-
final
def
notify(): Unit
- Definition Classes
- AnyRef
- Annotations
- @native() @HotSpotIntrinsicCandidate()
-
final
def
notifyAll(): Unit
- Definition Classes
- AnyRef
- Annotations
- @native() @HotSpotIntrinsicCandidate()
-
def
read(spark: SparkSession, inputModel: RawModel, partitions: Map[String, List[String]], whereConditions: List[WhereCondition]): (List[DataFrame], List[Path])
- spark
the Spark session
- inputModel
the input model to read
- partitions
the partitions to compact and the values of those partitions to include in the compaction
- whereConditions
the where conditions to filter the Dataframe read from the inputModel
- returns
the list of DataFrames read, the list of files read from the inputModel
-
final
def
synchronized[T0](arg0: ⇒ T0): T0
- Definition Classes
- AnyRef
-
def
toString(): String
- Definition Classes
- AnyRef → Any
-
final
def
wait(arg0: Long, arg1: Int): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... )
-
final
def
wait(arg0: Long): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... ) @native()
-
final
def
wait(): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... )
- def write(writer: RawSparkBatchWriter, dataframes: List[DataFrame]): Unit
Deprecated Value Members
-
def
finalize(): Unit
- Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws( classOf[java.lang.Throwable] ) @Deprecated @deprecated
- Deprecated
(Deprecated since an unspecified version) see the corresponding Javadoc for more information.