001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hdfs.server.datanode.fsdataset;
019
020
021import java.io.Closeable;
022import java.io.EOFException;
023import java.io.File;
024import java.io.FileDescriptor;
025import java.io.FileNotFoundException;
026import java.io.IOException;
027import java.io.InputStream;
028import java.nio.channels.ClosedChannelException;
029import java.util.ArrayList;
030import java.util.Iterator;
031import java.util.List;
032import java.util.Map;
033import java.util.Set;
034
035import org.apache.hadoop.classification.InterfaceAudience;
036import org.apache.hadoop.conf.Configuration;
037import org.apache.hadoop.fs.StorageType;
038import org.apache.hadoop.hdfs.DFSConfigKeys;
039import org.apache.hadoop.hdfs.protocol.Block;
040import org.apache.hadoop.hdfs.protocol.BlockListAsLongs;
041import org.apache.hadoop.hdfs.protocol.BlockLocalPathInfo;
042import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
043import org.apache.hadoop.hdfs.protocol.HdfsBlocksMetadata;
044import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState;
045import org.apache.hadoop.hdfs.server.datanode.DataNode;
046import org.apache.hadoop.hdfs.server.datanode.DataStorage;
047import org.apache.hadoop.hdfs.server.datanode.FinalizedReplica;
048import org.apache.hadoop.hdfs.server.datanode.Replica;
049import org.apache.hadoop.hdfs.server.datanode.ReplicaInPipelineInterface;
050import org.apache.hadoop.hdfs.server.datanode.ReplicaHandler;
051import org.apache.hadoop.hdfs.server.datanode.ReplicaInfo;
052import org.apache.hadoop.hdfs.server.datanode.ReplicaNotFoundException;
053import org.apache.hadoop.hdfs.server.datanode.StorageLocation;
054import org.apache.hadoop.hdfs.server.datanode.UnexpectedReplicaStateException;
055import org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.FsDatasetFactory;
056import org.apache.hadoop.hdfs.server.datanode.metrics.FSDatasetMBean;
057import org.apache.hadoop.hdfs.server.protocol.BlockRecoveryCommand.RecoveringBlock;
058import org.apache.hadoop.hdfs.server.protocol.DatanodeStorage;
059import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
060import org.apache.hadoop.hdfs.server.protocol.ReplicaRecoveryInfo;
061import org.apache.hadoop.hdfs.server.protocol.StorageReport;
062import org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary;
063import org.apache.hadoop.util.AutoCloseableLock;
064import org.apache.hadoop.util.ReflectionUtils;
065
066/**
067 * This is a service provider interface for the underlying storage that
068 * stores replicas for a data node.
069 * The default implementation stores replicas on local drives. 
070 */
071@InterfaceAudience.Private
072public interface FsDatasetSpi<V extends FsVolumeSpi> extends FSDatasetMBean {
073  /**
074   * A factory for creating {@link FsDatasetSpi} objects.
075   */
076  abstract class Factory<D extends FsDatasetSpi<?>> {
077    /** @return the configured factory. */
078    public static Factory<?> getFactory(Configuration conf) {
079      @SuppressWarnings("rawtypes")
080      final Class<? extends Factory> clazz = conf.getClass(
081          DFSConfigKeys.DFS_DATANODE_FSDATASET_FACTORY_KEY,
082          FsDatasetFactory.class,
083          Factory.class);
084      return ReflectionUtils.newInstance(clazz, conf);
085    }
086
087    /** Create a new object. */
088    public abstract D newInstance(DataNode datanode, DataStorage storage,
089        Configuration conf) throws IOException;
090
091    /** Does the factory create simulated objects? */
092    public boolean isSimulated() {
093      return false;
094    }
095  }
096
097  /**
098   * It behaviors as an unmodifiable list of FsVolume. Individual FsVolume can
099   * be obtained by using {@link #get(int)}.
100   *
101   * This also holds the reference counts for these volumes. It releases all the
102   * reference counts in {@link #close()}.
103   */
104  class FsVolumeReferences implements Iterable<FsVolumeSpi>, Closeable {
105    private final List<FsVolumeReference> references;
106
107    public <S extends FsVolumeSpi> FsVolumeReferences(List<S> curVolumes) {
108      references = new ArrayList<>();
109      for (FsVolumeSpi v : curVolumes) {
110        try {
111          references.add(v.obtainReference());
112        } catch (ClosedChannelException e) {
113          // This volume has been closed.
114        }
115      }
116    }
117
118    private static class FsVolumeSpiIterator implements
119        Iterator<FsVolumeSpi> {
120      private final List<FsVolumeReference> references;
121      private int idx = 0;
122
123      FsVolumeSpiIterator(List<FsVolumeReference> refs) {
124        references = refs;
125      }
126
127      @Override
128      public boolean hasNext() {
129        return idx < references.size();
130      }
131
132      @Override
133      public FsVolumeSpi next() {
134        int refIdx = idx++;
135        return references.get(refIdx).getVolume();
136      }
137
138      @Override
139      public void remove() {
140        throw new UnsupportedOperationException();
141      }
142    }
143
144    @Override
145    public Iterator<FsVolumeSpi> iterator() {
146      return new FsVolumeSpiIterator(references);
147    }
148
149    /**
150     * Get the number of volumes.
151     */
152    public int size() {
153      return references.size();
154    }
155
156    /**
157     * Get the volume for a given index.
158     */
159    public FsVolumeSpi get(int index) {
160      return references.get(index).getVolume();
161    }
162
163    @Override
164    public void close() throws IOException {
165      IOException ioe = null;
166      for (FsVolumeReference ref : references) {
167        try {
168          ref.close();
169        } catch (IOException e) {
170          ioe = e;
171        }
172      }
173      references.clear();
174      if (ioe != null) {
175        throw ioe;
176      }
177    }
178  }
179
180  /**
181   * Returns a list of FsVolumes that hold reference counts.
182   *
183   * The caller must release the reference of each volume by calling
184   * {@link FsVolumeReferences#close()}.
185   */
186  FsVolumeReferences getFsVolumeReferences();
187
188  /**
189   * Add a new volume to the FsDataset.<p/>
190   *
191   * If the FSDataset supports block scanning, this function registers
192   * the new volume with the block scanner.
193   *
194   * @param location      The storage location for the new volume.
195   * @param nsInfos       Namespace information for the new volume.
196   */
197  void addVolume(
198      final StorageLocation location,
199      final List<NamespaceInfo> nsInfos) throws IOException;
200
201  /**
202   * Removes a collection of volumes from FsDataset.
203   *
204   * If the FSDataset supports block scanning, this function removes
205   * the volumes from the block scanner.
206   *
207   * @param volumes  The paths of the volumes to be removed.
208   * @param clearFailure set true to clear the failure information about the
209   *                     volumes.
210   */
211  void removeVolumes(Set<File> volumes, boolean clearFailure);
212
213  /** @return a storage with the given storage ID */
214  DatanodeStorage getStorage(final String storageUuid);
215
216  /** @return one or more storage reports for attached volumes. */
217  StorageReport[] getStorageReports(String bpid)
218      throws IOException;
219
220  /** @return the volume that contains a replica of the block. */
221  V getVolume(ExtendedBlock b);
222
223  /** @return a volume information map (name => info). */
224  Map<String, Object> getVolumeInfoMap();
225
226  /**
227   * Returns info about volume failures.
228   *
229   * @return info about volume failures, possibly null
230   */
231  VolumeFailureSummary getVolumeFailureSummary();
232
233  /**
234   * Gets a list of references to the finalized blocks for the given block pool.
235   * <p>
236   * Callers of this function should call
237   * {@link FsDatasetSpi#acquireDatasetLock} to avoid blocks' status being
238   * changed during list iteration.
239   * </p>
240   * @return a list of references to the finalized blocks for the given block
241   *         pool.
242   */
243  List<FinalizedReplica> getFinalizedBlocks(String bpid);
244
245  /**
246   * Check whether the in-memory block record matches the block on the disk,
247   * and, in case that they are not matched, update the record or mark it
248   * as corrupted.
249   */
250  void checkAndUpdate(String bpid, long blockId, File diskFile,
251      File diskMetaFile, FsVolumeSpi vol) throws IOException;
252
253  /**
254   * @param b - the block
255   * @return a stream if the meta-data of the block exists;
256   *         otherwise, return null.
257   * @throws IOException
258   */
259  LengthInputStream getMetaDataInputStream(ExtendedBlock b
260      ) throws IOException;
261
262  /**
263   * Returns the specified block's on-disk length (excluding metadata).
   * @return   the specified block's on-disk length (excluding metadata)
265   * @throws IOException on error
266   */
267  long getLength(ExtendedBlock b) throws IOException;
268
269  /**
270   * Get reference to the replica meta info in the replicasMap. 
271   * To be called from methods that are synchronized on {@link FSDataset}
272   * @return replica from the replicas map
273   */
274  @Deprecated
275  Replica getReplica(String bpid, long blockId);
276
277  /**
278   * @return replica meta information
279   */
280  String getReplicaString(String bpid, long blockId);
281
282  /**
283   * @return the generation stamp stored with the block.
284   */
285  Block getStoredBlock(String bpid, long blkid) throws IOException;
286
287  /**
288   * Returns an input stream at specified offset of the specified block.
289   * @param b block
290   * @param seekOffset offset with in the block to seek to
291   * @return an input stream to read the contents of the specified block,
292   *  starting at the offset
293   * @throws IOException
294   */
295  InputStream getBlockInputStream(ExtendedBlock b, long seekOffset)
296            throws IOException;
297
298  /**
299   * Returns an input stream at specified offset of the specified block.
300   * The block is still in the tmp directory and is not finalized
301   * @return an input stream to read the contents of the specified block,
302   *  starting at the offset
303   * @throws IOException
304   */
305  ReplicaInputStreams getTmpInputStreams(ExtendedBlock b, long blkoff,
306      long ckoff) throws IOException;
307
308  /**
309   * Creates a temporary replica and returns the meta information of the replica
310   * .
311   * 
312   * @param b block
313   * @return the meta info of the replica which is being written to
314   * @throws IOException if an error occurs
315   */
316  ReplicaHandler createTemporary(StorageType storageType,
317      ExtendedBlock b) throws IOException;
318
319  /**
320   * Creates a RBW replica and returns the meta info of the replica
321   * 
322   * @param b block
323   * @return the meta info of the replica which is being written to
324   * @throws IOException if an error occurs
325   */
326  ReplicaHandler createRbw(StorageType storageType,
327      ExtendedBlock b, boolean allowLazyPersist) throws IOException;
328
329  /**
330   * Recovers a RBW replica and returns the meta info of the replica.
331   * 
332   * @param b block
333   * @param newGS the new generation stamp for the replica
334   * @param minBytesRcvd the minimum number of bytes that the replica could have
335   * @param maxBytesRcvd the maximum number of bytes that the replica could have
336   * @return the meta info of the replica which is being written to
337   * @throws IOException if an error occurs
338   */
339  ReplicaHandler recoverRbw(ExtendedBlock b,
340      long newGS, long minBytesRcvd, long maxBytesRcvd) throws IOException;
341
342  /**
   * Convert a temporary replica to a RBW.
344   * @param temporary the temporary replica being converted
345   * @return the result RBW
346   */
347  ReplicaInPipelineInterface convertTemporaryToRbw(
348      ExtendedBlock temporary) throws IOException;
349
350  /**
351   * Append to a finalized replica and returns the meta info of the replica.
352   * 
353   * @param b block
354   * @param newGS the new generation stamp for the replica
355   * @param expectedBlockLen the number of bytes the replica is expected to have
   * @return the meta info of the replica which is being written to
357   * @throws IOException
358   */
359  ReplicaHandler append(ExtendedBlock b, long newGS,
360      long expectedBlockLen) throws IOException;
361
362  /**
363   * Recover a failed append to a finalized replica and returns the meta
364   * info of the replica.
365   * 
366   * @param b block
367   * @param newGS the new generation stamp for the replica
368   * @param expectedBlockLen the number of bytes the replica is expected to have
369   * @return the meta info of the replica which is being written to
370   * @throws IOException
371   */
372  ReplicaHandler recoverAppend(
373      ExtendedBlock b, long newGS, long expectedBlockLen) throws IOException;
374  
375  /**
376   * Recover a failed pipeline close.
377   * It bumps the replica's generation stamp and finalize it if RBW replica
378   * 
379   * @param b block
380   * @param newGS the new generation stamp for the replica
381   * @param expectedBlockLen the number of bytes the replica is expected to have
382   * @return the storage uuid of the replica.
383   * @throws IOException
384   */
385  Replica recoverClose(ExtendedBlock b, long newGS, long expectedBlockLen
386      ) throws IOException;
387  
388  /**
389   * Finalizes the block previously opened for writing using writeToBlock.
390   * The block size is what is in the parameter b and it must match the amount
391   *  of data written
392   * @throws IOException
393   * @throws ReplicaNotFoundException if the replica can not be found when the
394   * block is been finalized. For instance, the block resides on an HDFS volume
395   * that has been removed.
396   */
397  void finalizeBlock(ExtendedBlock b) throws IOException;
398
399  /**
400   * Unfinalizes the block previously opened for writing using writeToBlock.
401   * The temporary file associated with this block is deleted.
402   * @throws IOException
403   */
404  void unfinalizeBlock(ExtendedBlock b) throws IOException;
405
406  /**
407   * Returns one block report per volume.
408   * @param bpid Block Pool Id
409   * @return - a map of DatanodeStorage to block report for the volume.
410   */
411  Map<DatanodeStorage, BlockListAsLongs> getBlockReports(String bpid);
412
413  /**
414   * Returns the cache report - the full list of cached block IDs of a
415   * block pool.
416   * @param   bpid Block Pool Id
417   * @return  the cache report - the full list of cached block IDs.
418   */
419  List<Long> getCacheReport(String bpid);
420
421  /** Does the dataset contain the block? */
422  boolean contains(ExtendedBlock block);
423
424  /**
425   * Check if a block is valid.
426   *
427   * @param b           The block to check.
428   * @param minLength   The minimum length that the block must have.  May be 0.
429   * @param state       If this is null, it is ignored.  If it is non-null, we
430   *                        will check that the replica has this state.
431   *
432   * @throws ReplicaNotFoundException          If the replica is not found
433   *
434   * @throws UnexpectedReplicaStateException   If the replica is not in the 
435   *                                             expected state.
436   * @throws FileNotFoundException             If the block file is not found or there 
437   *                                              was an error locating it.
438   * @throws EOFException                      If the replica length is too short.
439   * 
440   * @throws IOException                       May be thrown from the methods called. 
441   */
442  void checkBlock(ExtendedBlock b, long minLength, ReplicaState state)
443      throws ReplicaNotFoundException, UnexpectedReplicaStateException,
444      FileNotFoundException, EOFException, IOException;
445      
446  
447  /**
448   * Is the block valid?
449   * @return - true if the specified block is valid
450   */
451  boolean isValidBlock(ExtendedBlock b);
452
453  /**
454   * Is the block a valid RBW?
455   * @return - true if the specified block is a valid RBW
456   */
457  boolean isValidRbw(ExtendedBlock b);
458
459  /**
460   * Invalidates the specified blocks
461   * @param bpid Block pool Id
462   * @param invalidBlks - the blocks to be invalidated
463   * @throws IOException
464   */
465  void invalidate(String bpid, Block invalidBlks[]) throws IOException;
466
467  /**
468   * Caches the specified blocks
469   * @param bpid Block pool id
470   * @param blockIds - block ids to cache
471   */
472  void cache(String bpid, long[] blockIds);
473
474  /**
475   * Uncaches the specified blocks
476   * @param bpid Block pool id
477   * @param blockIds - blocks ids to uncache
478   */
479  void uncache(String bpid, long[] blockIds);
480
481  /**
482   * Determine if the specified block is cached.
483   * @param bpid Block pool id
484   * @param blockIds - block id
485   * @return true if the block is cached
486   */
487  boolean isCached(String bpid, long blockId);
488
489    /**
490     * Check if all the data directories are healthy
491     * @return A set of unhealthy data directories.
492     */
493  Set<File> checkDataDir();
494
495  /**
496   * Shutdown the FSDataset
497   */
498  void shutdown();
499
500  /**
501   * Sets the file pointer of the checksum stream so that the last checksum
502   * will be overwritten
503   * @param b block
504   * @param outs The streams for the data file and checksum file
505   * @param checksumSize number of bytes each checksum has
506   * @throws IOException
507   */
508  void adjustCrcChannelPosition(ExtendedBlock b,
509      ReplicaOutputStreams outs, int checksumSize) throws IOException;
510
511  /**
512   * Checks how many valid storage volumes there are in the DataNode.
513   * @return true if more than the minimum number of valid volumes are left 
514   * in the FSDataSet.
515   */
516  boolean hasEnoughResource();
517
518  /**
519   * Get visible length of the specified replica.
520   */
521  long getReplicaVisibleLength(final ExtendedBlock block) throws IOException;
522
523  /**
524   * Initialize a replica recovery.
525   * @return actual state of the replica on this data-node or 
526   * null if data-node does not have the replica.
527   */
528  ReplicaRecoveryInfo initReplicaRecovery(RecoveringBlock rBlock
529      ) throws IOException;
530
531  /**
532   * Update replica's generation stamp and length and finalize it.
533   * @return the ID of storage that stores the block
534   */
535  Replica updateReplicaUnderRecovery(ExtendedBlock oldBlock,
536      long recoveryId, long newBlockId, long newLength) throws IOException;
537
538  /**
539   * add new block pool ID
540   * @param bpid Block pool Id
541   * @param conf Configuration
542   */
543  void addBlockPool(String bpid, Configuration conf) throws IOException;
544
545  /**
546   * Shutdown and remove the block pool from underlying storage.
547   * @param bpid Block pool Id to be removed
548   */
549  void shutdownBlockPool(String bpid) ;
550
551  /**
552   * Deletes the block pool directories. If force is false, directories are 
553   * deleted only if no block files exist for the block pool. If force 
554   * is true entire directory for the blockpool is deleted along with its
555   * contents.
556   * @param bpid BlockPool Id to be deleted.
557   * @param force If force is false, directories are deleted only if no
558   *        block files exist for the block pool, otherwise entire 
559   *        directory for the blockpool is deleted along with its contents.
560   * @throws IOException
561   */
562  void deleteBlockPool(String bpid, boolean force) throws IOException;
563
564  /**
565   * Get {@link BlockLocalPathInfo} for the given block.
566   */
567  BlockLocalPathInfo getBlockLocalPathInfo(ExtendedBlock b
568      ) throws IOException;
569
570  /**
571   * Get a {@link HdfsBlocksMetadata} corresponding to the list of blocks in 
572   * <code>blocks</code>.
573   * 
574   * @param bpid pool to query
575   * @param blockIds List of block ids for which to return metadata
576   * @return metadata Metadata for the list of blocks
577   * @throws IOException
578   */
579  HdfsBlocksMetadata getHdfsBlocksMetadata(String bpid,
580      long[] blockIds) throws IOException;
581
582  /**
583   * Enable 'trash' for the given dataset. When trash is enabled, files are
584   * moved to a separate trash directory instead of being deleted immediately.
585   * This can be useful for example during rolling upgrades.
586   */
587  void enableTrash(String bpid);
588
589  /**
590   * Clear trash
591   */
592  void clearTrash(String bpid);
593
594  /**
595   * @return true when trash is enabled
596   */
597  boolean trashEnabled(String bpid);
598
599  /**
600   * Create a marker file indicating that a rolling upgrade is in progress.
601   */
602  void setRollingUpgradeMarker(String bpid) throws IOException;
603
604  /**
605   * Delete the rolling upgrade marker file if it exists.
606   * @param bpid
607   */
608  void clearRollingUpgradeMarker(String bpid) throws IOException;
609
610  /**
611   * submit a sync_file_range request to AsyncDiskService.
612   */
613  void submitBackgroundSyncFileRangeRequest(final ExtendedBlock block,
614      final FileDescriptor fd, final long offset, final long nbytes,
615      final int flags);
616
617  /**
618   * Callback from RamDiskAsyncLazyPersistService upon async lazy persist task end
619   */
620  void onCompleteLazyPersist(String bpId, long blockId,
621      long creationTime, File[] savedFiles, V targetVolume);
622
623   /**
624    * Callback from RamDiskAsyncLazyPersistService upon async lazy persist task fail
625    */
626   void onFailLazyPersist(String bpId, long blockId);
627
628    /**
629     * Move block from one storage to another storage
630     */
631   ReplicaInfo moveBlockAcrossStorage(final ExtendedBlock block,
632        StorageType targetStorageType) throws IOException;
633
634  /**
635   * Set a block to be pinned on this datanode so that it cannot be moved
636   * by Balancer/Mover.
637   *
638   * It is a no-op when dfs.datanode.block-pinning.enabled is set to false.
639   */
640  void setPinning(ExtendedBlock block) throws IOException;
641
642  /**
643   * Check whether the block was pinned
644   */
645  boolean getPinning(ExtendedBlock block) throws IOException;
646
647  /**
648   * Confirm whether the block is deleting
649   */
650  boolean isDeletingBlock(String bpid, long blockId);
651
652  /**
653   * Acquire the lock of the dataset.
654   */
655  AutoCloseableLock acquireDatasetLock();
656}