/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.server.datanode.fsdataset;

import java.io.Closeable;
import java.io.EOFException;
import java.io.File;
import java.io.FileDescriptor;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.nio.channels.ClosedChannelException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Set;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.StorageType;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.BlockListAsLongs;
import org.apache.hadoop.hdfs.protocol.BlockLocalPathInfo;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.protocol.HdfsBlocksMetadata;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState;
import org.apache.hadoop.hdfs.server.datanode.DataNode;
import org.apache.hadoop.hdfs.server.datanode.DataStorage;
import org.apache.hadoop.hdfs.server.datanode.FinalizedReplica;
import org.apache.hadoop.hdfs.server.datanode.Replica;
import org.apache.hadoop.hdfs.server.datanode.ReplicaInPipelineInterface;
import org.apache.hadoop.hdfs.server.datanode.ReplicaHandler;
import org.apache.hadoop.hdfs.server.datanode.ReplicaInfo;
import org.apache.hadoop.hdfs.server.datanode.ReplicaNotFoundException;
import org.apache.hadoop.hdfs.server.datanode.StorageLocation;
import org.apache.hadoop.hdfs.server.datanode.UnexpectedReplicaStateException;
import org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.FsDatasetFactory;
import org.apache.hadoop.hdfs.server.datanode.metrics.FSDatasetMBean;
import org.apache.hadoop.hdfs.server.protocol.BlockRecoveryCommand.RecoveringBlock;
import org.apache.hadoop.hdfs.server.protocol.DatanodeStorage;
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
import org.apache.hadoop.hdfs.server.protocol.ReplicaRecoveryInfo;
import org.apache.hadoop.hdfs.server.protocol.StorageReport;
import org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary;
import org.apache.hadoop.util.AutoCloseableLock;
import org.apache.hadoop.util.ReflectionUtils;

/**
 * This is a service provider interface for the underlying storage that
 * stores replicas for a data node.
 * The default implementation stores replicas on local drives.
 */
@InterfaceAudience.Private
public interface FsDatasetSpi<V extends FsVolumeSpi> extends FSDatasetMBean {
  /**
   * A factory for creating {@link FsDatasetSpi} objects.
   */
  abstract class Factory<D extends FsDatasetSpi<?>> {
    /** @return the configured factory. */
    public static Factory<?> getFactory(Configuration conf) {
      @SuppressWarnings("rawtypes")
      final Class<? extends Factory> clazz = conf.getClass(
          DFSConfigKeys.DFS_DATANODE_FSDATASET_FACTORY_KEY,
          FsDatasetFactory.class,
          Factory.class);
      return ReflectionUtils.newInstance(clazz, conf);
    }

    /** Create a new object. */
    public abstract D newInstance(DataNode datanode, DataStorage storage,
        Configuration conf) throws IOException;

    /** Does the factory create simulated objects? */
    public boolean isSimulated() {
      return false;
    }
  }

  /**
   * It behaves as an unmodifiable list of FsVolume. Individual FsVolume can
   * be obtained by using {@link #get(int)}.
   *
   * This also holds the reference counts for these volumes. It releases all the
   * reference counts in {@link #close()}.
   */
  class FsVolumeReferences implements Iterable<FsVolumeSpi>, Closeable {
    private final List<FsVolumeReference> references;

    public <S extends FsVolumeSpi> FsVolumeReferences(List<S> curVolumes) {
      references = new ArrayList<>();
      for (FsVolumeSpi v : curVolumes) {
        try {
          references.add(v.obtainReference());
        } catch (ClosedChannelException e) {
          // This volume has been closed.
        }
      }
    }

    private static class FsVolumeSpiIterator implements
        Iterator<FsVolumeSpi> {
      private final List<FsVolumeReference> references;
      private int idx = 0;

      FsVolumeSpiIterator(List<FsVolumeReference> refs) {
        references = refs;
      }

      @Override
      public boolean hasNext() {
        return idx < references.size();
      }

      @Override
      public FsVolumeSpi next() {
        // Honor the Iterator contract: signal exhaustion explicitly instead
        // of leaking an IndexOutOfBoundsException from List#get.
        if (!hasNext()) {
          throw new NoSuchElementException();
        }
        int refIdx = idx++;
        return references.get(refIdx).getVolume();
      }

      @Override
      public void remove() {
        throw new UnsupportedOperationException();
      }
    }

    @Override
    public Iterator<FsVolumeSpi> iterator() {
      return new FsVolumeSpiIterator(references);
    }

    /**
     * Get the number of volumes.
     */
    public int size() {
      return references.size();
    }

    /**
     * Get the volume for a given index.
     */
    public FsVolumeSpi get(int index) {
      return references.get(index).getVolume();
    }

    @Override
    public void close() throws IOException {
      // Attempt to release every reference even if some fail. Keep the first
      // failure as the primary exception and attach subsequent failures as
      // suppressed exceptions, rather than silently discarding them.
      IOException ioe = null;
      for (FsVolumeReference ref : references) {
        try {
          ref.close();
        } catch (IOException e) {
          if (ioe == null) {
            ioe = e;
          } else {
            ioe.addSuppressed(e);
          }
        }
      }
      references.clear();
      if (ioe != null) {
        throw ioe;
      }
    }
  }

  /**
   * Returns a list of FsVolumes that hold reference counts.
   *
   * The caller must release the reference of each volume by calling
   * {@link FsVolumeReferences#close()}.
   */
  FsVolumeReferences getFsVolumeReferences();

  /**
   * Add a new volume to the FsDataset.<p/>
   *
   * If the FSDataset supports block scanning, this function registers
   * the new volume with the block scanner.
   *
   * @param location      The storage location for the new volume.
   * @param nsInfos       Namespace information for the new volume.
   */
  void addVolume(
      final StorageLocation location,
      final List<NamespaceInfo> nsInfos) throws IOException;

  /**
   * Removes a collection of volumes from FsDataset.
   *
   * If the FSDataset supports block scanning, this function removes
   * the volumes from the block scanner.
   *
   * @param volumes  The paths of the volumes to be removed.
   * @param clearFailure set true to clear the failure information about the
   *                     volumes.
   */
  void removeVolumes(Set<File> volumes, boolean clearFailure);

  /** @return a storage with the given storage ID */
  DatanodeStorage getStorage(final String storageUuid);

  /** @return one or more storage reports for attached volumes. */
  StorageReport[] getStorageReports(String bpid)
      throws IOException;

  /** @return the volume that contains a replica of the block. */
  V getVolume(ExtendedBlock b);

  /** @return a volume information map (name => info). */
  Map<String, Object> getVolumeInfoMap();

  /**
   * Returns info about volume failures.
   *
   * @return info about volume failures, possibly null
   */
  VolumeFailureSummary getVolumeFailureSummary();

  /**
   * Gets a list of references to the finalized blocks for the given block pool.
   * <p>
   * Callers of this function should call
   * {@link FsDatasetSpi#acquireDatasetLock} to avoid blocks' status being
   * changed during list iteration.
   * </p>
   * @return a list of references to the finalized blocks for the given block
   *         pool.
   */
  List<FinalizedReplica> getFinalizedBlocks(String bpid);

  /**
   * Check whether the in-memory block record matches the block on the disk,
   * and, in case that they are not matched, update the record or mark it
   * as corrupted.
   */
  void checkAndUpdate(String bpid, long blockId, File diskFile,
      File diskMetaFile, FsVolumeSpi vol) throws IOException;

  /**
   * @param b - the block
   * @return a stream if the meta-data of the block exists;
   *         otherwise, return null.
   * @throws IOException
   */
  LengthInputStream getMetaDataInputStream(ExtendedBlock b
      ) throws IOException;

  /**
   * Returns the specified block's on-disk length (excluding metadata).
   * @return the specified block's on-disk length (excluding metadata)
   * @throws IOException on error
   */
  long getLength(ExtendedBlock b) throws IOException;

  /**
   * Get reference to the replica meta info in the replicasMap.
   * To be called from methods that are synchronized on {@link FSDataset}
   * @return replica from the replicas map
   */
  @Deprecated
  Replica getReplica(String bpid, long blockId);

  /**
   * @return replica meta information
   */
  String getReplicaString(String bpid, long blockId);

  /**
   * @return the generation stamp stored with the block.
   */
  Block getStoredBlock(String bpid, long blkid) throws IOException;

  /**
   * Returns an input stream at specified offset of the specified block.
   * @param b block
   * @param seekOffset offset with in the block to seek to
   * @return an input stream to read the contents of the specified block,
   *         starting at the offset
   * @throws IOException
   */
  InputStream getBlockInputStream(ExtendedBlock b, long seekOffset)
      throws IOException;

  /**
   * Returns an input stream at specified offset of the specified block.
   * The block is still in the tmp directory and is not finalized
   * @return an input stream to read the contents of the specified block,
   *         starting at the offset
   * @throws IOException
   */
  ReplicaInputStreams getTmpInputStreams(ExtendedBlock b, long blkoff,
      long ckoff) throws IOException;

  /**
   * Creates a temporary replica and returns the meta information of the
   * replica.
   *
   * @param b block
   * @return the meta info of the replica which is being written to
   * @throws IOException if an error occurs
   */
  ReplicaHandler createTemporary(StorageType storageType,
      ExtendedBlock b) throws IOException;

  /**
   * Creates a RBW replica and returns the meta info of the replica
   *
   * @param b block
   * @return the meta info of the replica which is being written to
   * @throws IOException if an error occurs
   */
  ReplicaHandler createRbw(StorageType storageType,
      ExtendedBlock b, boolean allowLazyPersist) throws IOException;

  /**
   * Recovers a RBW replica and returns the meta info of the replica.
   *
   * @param b block
   * @param newGS the new generation stamp for the replica
   * @param minBytesRcvd the minimum number of bytes that the replica could have
   * @param maxBytesRcvd the maximum number of bytes that the replica could have
   * @return the meta info of the replica which is being written to
   * @throws IOException if an error occurs
   */
  ReplicaHandler recoverRbw(ExtendedBlock b,
      long newGS, long minBytesRcvd, long maxBytesRcvd) throws IOException;

  /**
   * Convert a temporary replica to a RBW.
   * @param temporary the temporary replica being converted
   * @return the result RBW
   */
  ReplicaInPipelineInterface convertTemporaryToRbw(
      ExtendedBlock temporary) throws IOException;

  /**
   * Append to a finalized replica and returns the meta info of the replica.
   *
   * @param b block
   * @param newGS the new generation stamp for the replica
   * @param expectedBlockLen the number of bytes the replica is expected to have
   * @return the meta info of the replica which is being written to
   * @throws IOException
   */
  ReplicaHandler append(ExtendedBlock b, long newGS,
      long expectedBlockLen) throws IOException;

  /**
   * Recover a failed append to a finalized replica and returns the meta
   * info of the replica.
   *
   * @param b block
   * @param newGS the new generation stamp for the replica
   * @param expectedBlockLen the number of bytes the replica is expected to have
   * @return the meta info of the replica which is being written to
   * @throws IOException
   */
  ReplicaHandler recoverAppend(
      ExtendedBlock b, long newGS, long expectedBlockLen) throws IOException;

  /**
   * Recover a failed pipeline close.
   * It bumps the replica's generation stamp and finalize it if RBW replica
   *
   * @param b block
   * @param newGS the new generation stamp for the replica
   * @param expectedBlockLen the number of bytes the replica is expected to have
   * @return the storage uuid of the replica.
   * @throws IOException
   */
  Replica recoverClose(ExtendedBlock b, long newGS, long expectedBlockLen
      ) throws IOException;

  /**
   * Finalizes the block previously opened for writing using writeToBlock.
   * The block size is what is in the parameter b and it must match the amount
   * of data written
   * @throws IOException
   * @throws ReplicaNotFoundException if the replica can not be found when the
   * block is being finalized. For instance, the block resides on an HDFS volume
   * that has been removed.
   */
  void finalizeBlock(ExtendedBlock b) throws IOException;

  /**
   * Unfinalizes the block previously opened for writing using writeToBlock.
   * The temporary file associated with this block is deleted.
   * @throws IOException
   */
  void unfinalizeBlock(ExtendedBlock b) throws IOException;

  /**
   * Returns one block report per volume.
   * @param bpid Block Pool Id
   * @return - a map of DatanodeStorage to block report for the volume.
   */
  Map<DatanodeStorage, BlockListAsLongs> getBlockReports(String bpid);

  /**
   * Returns the cache report - the full list of cached block IDs of a
   * block pool.
   * @param bpid Block Pool Id
   * @return the cache report - the full list of cached block IDs.
   */
  List<Long> getCacheReport(String bpid);

  /** Does the dataset contain the block? */
  boolean contains(ExtendedBlock block);

  /**
   * Check if a block is valid.
   *
   * @param b           The block to check.
   * @param minLength   The minimum length that the block must have.  May be 0.
   * @param state       If this is null, it is ignored.  If it is non-null, we
   *                        will check that the replica has this state.
   *
   * @throws ReplicaNotFoundException          If the replica is not found
   *
   * @throws UnexpectedReplicaStateException   If the replica is not in the
   *                                             expected state.
   * @throws FileNotFoundException             If the block file is not found or there
   *                                              was an error locating it.
   * @throws EOFException                      If the replica length is too short.
   *
   * @throws IOException                       May be thrown from the methods called.
   */
  void checkBlock(ExtendedBlock b, long minLength, ReplicaState state)
      throws ReplicaNotFoundException, UnexpectedReplicaStateException,
      FileNotFoundException, EOFException, IOException;

  /**
   * Is the block valid?
   * @return - true if the specified block is valid
   */
  boolean isValidBlock(ExtendedBlock b);

  /**
   * Is the block a valid RBW?
   * @return - true if the specified block is a valid RBW
   */
  boolean isValidRbw(ExtendedBlock b);

  /**
   * Invalidates the specified blocks
   * @param bpid Block pool Id
   * @param invalidBlks - the blocks to be invalidated
   * @throws IOException
   */
  void invalidate(String bpid, Block[] invalidBlks) throws IOException;

  /**
   * Caches the specified blocks
   * @param bpid Block pool id
   * @param blockIds - block ids to cache
   */
  void cache(String bpid, long[] blockIds);

  /**
   * Uncaches the specified blocks
   * @param bpid Block pool id
   * @param blockIds - blocks ids to uncache
   */
  void uncache(String bpid, long[] blockIds);

  /**
   * Determine if the specified block is cached.
   * @param bpid Block pool id
   * @param blockIds - block id
   * @return true if the block is cached
   */
  boolean isCached(String bpid, long blockId);

  /**
   * Check if all the data directories are healthy
   * @return A set of unhealthy data directories.
   */
  Set<File> checkDataDir();

  /**
   * Shutdown the FSDataset
   */
  void shutdown();

  /**
   * Sets the file pointer of the checksum stream so that the last checksum
   * will be overwritten
   * @param b block
   * @param outs The streams for the data file and checksum file
   * @param checksumSize number of bytes each checksum has
   * @throws IOException
   */
  void adjustCrcChannelPosition(ExtendedBlock b,
      ReplicaOutputStreams outs, int checksumSize) throws IOException;

  /**
   * Checks how many valid storage volumes there are in the DataNode.
   * @return true if more than the minimum number of valid volumes are left
   * in the FSDataSet.
   */
  boolean hasEnoughResource();

  /**
   * Get visible length of the specified replica.
   */
  long getReplicaVisibleLength(final ExtendedBlock block) throws IOException;

  /**
   * Initialize a replica recovery.
   * @return actual state of the replica on this data-node or
   * null if data-node does not have the replica.
   */
  ReplicaRecoveryInfo initReplicaRecovery(RecoveringBlock rBlock
      ) throws IOException;

  /**
   * Update replica's generation stamp and length and finalize it.
   * @return the ID of storage that stores the block
   */
  Replica updateReplicaUnderRecovery(ExtendedBlock oldBlock,
      long recoveryId, long newBlockId, long newLength) throws IOException;

  /**
   * add new block pool ID
   * @param bpid Block pool Id
   * @param conf Configuration
   */
  void addBlockPool(String bpid, Configuration conf) throws IOException;

  /**
   * Shutdown and remove the block pool from underlying storage.
   * @param bpid Block pool Id to be removed
   */
  void shutdownBlockPool(String bpid);

  /**
   * Deletes the block pool directories.  If force is false, directories are
   * deleted only if no block files exist for the block pool.  If force
   * is true entire directory for the blockpool is deleted along with its
   * contents.
   * @param bpid BlockPool Id to be deleted.
   * @param force If force is false, directories are deleted only if no
   *        block files exist for the block pool, otherwise entire
   *        directory for the blockpool is deleted along with its contents.
   * @throws IOException
   */
  void deleteBlockPool(String bpid, boolean force) throws IOException;

  /**
   * Get {@link BlockLocalPathInfo} for the given block.
   */
  BlockLocalPathInfo getBlockLocalPathInfo(ExtendedBlock b
      ) throws IOException;

  /**
   * Get a {@link HdfsBlocksMetadata} corresponding to the list of blocks in
   * <code>blocks</code>.
   *
   * @param bpid pool to query
   * @param blockIds List of block ids for which to return metadata
   * @return metadata Metadata for the list of blocks
   * @throws IOException
   */
  HdfsBlocksMetadata getHdfsBlocksMetadata(String bpid,
      long[] blockIds) throws IOException;

  /**
   * Enable 'trash' for the given dataset.  When trash is enabled, files are
   * moved to a separate trash directory instead of being deleted immediately.
   * This can be useful for example during rolling upgrades.
   */
  void enableTrash(String bpid);

  /**
   * Clear trash
   */
  void clearTrash(String bpid);

  /**
   * @return true when trash is enabled
   */
  boolean trashEnabled(String bpid);

  /**
   * Create a marker file indicating that a rolling upgrade is in progress.
   */
  void setRollingUpgradeMarker(String bpid) throws IOException;

  /**
   * Delete the rolling upgrade marker file if it exists.
   * @param bpid
   */
  void clearRollingUpgradeMarker(String bpid) throws IOException;

  /**
   * submit a sync_file_range request to AsyncDiskService.
   */
  void submitBackgroundSyncFileRangeRequest(final ExtendedBlock block,
      final FileDescriptor fd, final long offset, final long nbytes,
      final int flags);

  /**
   * Callback from RamDiskAsyncLazyPersistService upon async lazy persist task
   * end
   */
  void onCompleteLazyPersist(String bpId, long blockId,
      long creationTime, File[] savedFiles, V targetVolume);

  /**
   * Callback from RamDiskAsyncLazyPersistService upon async lazy persist task
   * fail
   */
  void onFailLazyPersist(String bpId, long blockId);

  /**
   * Move block from one storage to another storage
   */
  ReplicaInfo moveBlockAcrossStorage(final ExtendedBlock block,
      StorageType targetStorageType) throws IOException;

  /**
   * Set a block to be pinned on this datanode so that it cannot be moved
   * by Balancer/Mover.
   *
   * It is a no-op when dfs.datanode.block-pinning.enabled is set to false.
   */
  void setPinning(ExtendedBlock block) throws IOException;

  /**
   * Check whether the block was pinned
   */
  boolean getPinning(ExtendedBlock block) throws IOException;

  /**
   * Confirm whether the block is deleting
   */
  boolean isDeletingBlock(String bpid, long blockId);

  /**
   * Acquire the lock of the dataset.
   */
  AutoCloseableLock acquireDatasetLock();
}