001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    package org.apache.hadoop.hdfs.server.blockmanagement;
019    
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Queue;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.DatanodeID;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.server.namenode.CachedBlock;
import org.apache.hadoop.hdfs.server.protocol.DatanodeStorage;
import org.apache.hadoop.hdfs.server.protocol.StorageReport;
import org.apache.hadoop.hdfs.util.LightWeightHashSet;
import org.apache.hadoop.util.IntrusiveCollection;
import org.apache.hadoop.util.Time;

import com.google.common.annotations.VisibleForTesting;
046    
047    /**
048     * This class extends the DatanodeInfo class with ephemeral information (eg
049     * health, capacity, what blocks are associated with the Datanode) that is
050     * private to the Namenode, ie this class is not exposed to clients.
051     */
052    @InterfaceAudience.Private
053    @InterfaceStability.Evolving
054    public class DatanodeDescriptor extends DatanodeInfo {
  public static final Log LOG = LogFactory.getLog(DatanodeDescriptor.class);
  // Shared immutable empty array; avoids allocating a fresh one per caller.
  public static final DatanodeDescriptor[] EMPTY_ARRAY = {};

  // Stores status of decommissioning.
  // If node is not decommissioning, do not use this object for anything.
  public final DecommissioningStatus decommissioningStatus = new DecommissioningStatus();
061      
  /** Block and targets pair: one unit of replication work for a datanode. */
  @InterfaceAudience.Private
  @InterfaceStability.Evolving
  public static class BlockTargetPair {
    /** The block to be transferred. */
    public final Block block;
    /** The destination storages the block should be transferred to. */
    public final DatanodeStorageInfo[] targets;

    BlockTargetPair(Block block, DatanodeStorageInfo[] targets) {
      this.block = block;
      this.targets = targets;
    }
  }
074    
075      /** A BlockTargetPair queue. */
076      private static class BlockQueue<E> {
077        private final Queue<E> blockq = new LinkedList<E>();
078    
079        /** Size of the queue */
080        synchronized int size() {return blockq.size();}
081    
082        /** Enqueue */
083        synchronized boolean offer(E e) { 
084          return blockq.offer(e);
085        }
086    
087        /** Dequeue */
088        synchronized List<E> poll(int numBlocks) {
089          if (numBlocks <= 0 || blockq.isEmpty()) {
090            return null;
091          }
092    
093          List<E> results = new ArrayList<E>();
094          for(; !blockq.isEmpty() && numBlocks > 0; numBlocks--) {
095            results.add(blockq.poll());
096          }
097          return results;
098        }
099    
100        /**
101         * Returns <tt>true</tt> if the queue contains the specified element.
102         */
103        boolean contains(E e) {
104          return blockq.contains(e);
105        }
106    
107        synchronized void clear() {
108          blockq.clear();
109        }
110      }
111    
  // Maps storage ID -> DatanodeStorageInfo for this datanode.
  // All accesses synchronize on the map itself.
  private final Map<String, DatanodeStorageInfo> storageMap = 
      new HashMap<String, DatanodeStorageInfo>();
114    
115      /**
116       * A list of CachedBlock objects on this datanode.
117       */
118      public static class CachedBlocksList extends IntrusiveCollection<CachedBlock> {
119        public enum Type {
120          PENDING_CACHED,
121          CACHED,
122          PENDING_UNCACHED
123        }
124    
125        private final DatanodeDescriptor datanode;
126    
127        private final Type type;
128    
129        CachedBlocksList(DatanodeDescriptor datanode, Type type) {
130          this.datanode = datanode;
131          this.type = type;
132        }
133    
134        public DatanodeDescriptor getDatanode() {
135          return datanode;
136        }
137    
138        public Type getType() {
139          return type;
140        }
141      }
142    
143      /**
144       * The blocks which we want to cache on this DataNode.
145       */
146      private final CachedBlocksList pendingCached = 
147          new CachedBlocksList(this, CachedBlocksList.Type.PENDING_CACHED);
148    
149      /**
150       * The blocks which we know are cached on this datanode.
151       * This list is updated by periodic cache reports.
152       */
153      private final CachedBlocksList cached = 
154          new CachedBlocksList(this, CachedBlocksList.Type.CACHED);
155    
156      /**
157       * The blocks which we want to uncache on this DataNode.
158       */
159      private final CachedBlocksList pendingUncached = 
160          new CachedBlocksList(this, CachedBlocksList.Type.PENDING_UNCACHED);
161    
  /** @return the list of blocks scheduled to be cached on this datanode */
  public CachedBlocksList getPendingCached() {
    return pendingCached;
  }

  /** @return the list of blocks known (via cache reports) to be cached */
  public CachedBlocksList getCached() {
    return cached;
  }

  /** @return the list of blocks scheduled to be uncached on this datanode */
  public CachedBlocksList getPendingUncached() {
    return pendingUncached;
  }
173    
174      /**
175       * The time when the last batch of caching directives was sent, in
176       * monotonic milliseconds.
177       */
178      private long lastCachingDirectiveSentTimeMs;
179    
180      // isAlive == heartbeats.contains(this)
181      // This is an optimization, because contains takes O(n) time on Arraylist
182      public boolean isAlive = false;
183      public boolean needKeyUpdate = false;
184    
185      
186      // A system administrator can tune the balancer bandwidth parameter
187      // (dfs.balance.bandwidthPerSec) dynamically by calling
188      // "dfsadmin -setBalanacerBandwidth <newbandwidth>", at which point the
189      // following 'bandwidth' variable gets updated with the new value for each
190      // node. Once the heartbeat command is issued to update the value on the
191      // specified datanode, this value will be set back to 0.
192      private long bandwidth;
193    
194      /** A queue of blocks to be replicated by this datanode */
195      private final BlockQueue<BlockTargetPair> replicateBlocks = new BlockQueue<BlockTargetPair>();
196      /** A queue of blocks to be recovered by this datanode */
197      private final BlockQueue<BlockInfoUnderConstruction> recoverBlocks =
198                                    new BlockQueue<BlockInfoUnderConstruction>();
199      /** A set of blocks to be invalidated by this datanode */
200      private final LightWeightHashSet<Block> invalidateBlocks = new LightWeightHashSet<Block>();
201    
202      /* Variables for maintaining number of blocks scheduled to be written to
203       * this storage. This count is approximate and might be slightly bigger
204       * in case of errors (e.g. datanode does not report if an error occurs
205       * while writing the block).
206       */
207      private int currApproxBlocksScheduled = 0;
208      private int prevApproxBlocksScheduled = 0;
209      private long lastBlocksScheduledRollTime = 0;
210      private static final int BLOCKS_SCHEDULED_ROLL_INTERVAL = 600*1000; //10min
211      private int volumeFailures = 0;
212      
213      /** 
214       * When set to true, the node is not in include list and is not allowed
215       * to communicate with the namenode
216       */
217      private boolean disallowed = false;
218    
219      /**
220       * DatanodeDescriptor constructor
221       * @param nodeID id of the data node
222       */
223      public DatanodeDescriptor(DatanodeID nodeID) {
224        super(nodeID);
225        updateHeartbeat(StorageReport.EMPTY_ARRAY, 0L, 0L, 0, 0);
226      }
227    
228      /**
229       * DatanodeDescriptor constructor
230       * @param nodeID id of the data node
231       * @param networkLocation location of the data node in network
232       */
233      public DatanodeDescriptor(DatanodeID nodeID, 
234                                String networkLocation) {
235        super(nodeID, networkLocation);
236        updateHeartbeat(StorageReport.EMPTY_ARRAY, 0L, 0L, 0, 0);
237      }
238    
239      /**
240       * Add data-node to the block. Add block to the head of the list of blocks
241       * belonging to the data-node.
242       */
243      public boolean addBlock(String storageID, BlockInfo b) {
244        DatanodeStorageInfo s = getStorageInfo(storageID);
245        if (s != null) {
246          return s.addBlock(b);
247        }
248        return false;
249      }
250    
251      @VisibleForTesting
252      public DatanodeStorageInfo getStorageInfo(String storageID) {
253        synchronized (storageMap) {
254          return storageMap.get(storageID);
255        }
256      }
257      DatanodeStorageInfo[] getStorageInfos() {
258        synchronized (storageMap) {
259          final Collection<DatanodeStorageInfo> storages = storageMap.values();
260          return storages.toArray(new DatanodeStorageInfo[storages.size()]);
261        }
262      }
263    
264      boolean hasStaleStorages() {
265        synchronized (storageMap) {
266          for (DatanodeStorageInfo storage : storageMap.values()) {
267            if (storage.areBlockContentsStale()) {
268              return true;
269            }
270          }
271          return false;
272        }
273      }
274    
275      /**
276       * Remove block from the list of blocks belonging to the data-node. Remove
277       * data-node from the block.
278       */
279      boolean removeBlock(BlockInfo b) {
280        int index = b.findStorageInfo(this);
281        // if block exists on this datanode
282        if (index >= 0) {
283          DatanodeStorageInfo s = b.getStorageInfo(index);
284          if (s != null) {
285            return s.removeBlock(b);
286          }
287        }
288        return false;
289      }
290      
291      /**
292       * Remove block from the list of blocks belonging to the data-node. Remove
293       * data-node from the block.
294       */
295      boolean removeBlock(String storageID, BlockInfo b) {
296        DatanodeStorageInfo s = getStorageInfo(storageID);
297        if (s != null) {
298          return s.removeBlock(b);
299        }
300        return false;
301      }
302    
303      /**
304       * Replace specified old block with a new one in the DataNodeDescriptor.
305       *
306       * @param oldBlock - block to be replaced
307       * @param newBlock - a replacement block
308       * @return the new block
309       */
310      public BlockInfo replaceBlock(BlockInfo oldBlock, BlockInfo newBlock) {
311        int index = oldBlock.findStorageInfo(this);
312        DatanodeStorageInfo s = oldBlock.getStorageInfo(index);
313        boolean done = s.removeBlock(oldBlock);
314        assert done : "Old block should belong to the data-node when replacing";
315    
316        done = s.addBlock(newBlock);
317        assert done : "New block should not belong to the data-node when replacing";
318        return newBlock;
319      }
320    
  /**
   * Reset the per-node usage stats to zero and clear all block bookkeeping.
   */
  public void resetBlocks() {
    setCapacity(0);
    setRemaining(0);
    setBlockPoolUsed(0);
    setDfsUsed(0);
    setXceiverCount(0);
    // NOTE(review): invalidateBlocks is cleared here without synchronizing
    // on it, unlike clearBlockQueues() — confirm callers hold an outer lock.
    this.invalidateBlocks.clear();
    this.volumeFailures = 0;
    // pendingCached, cached, and pendingUncached are protected by the
    // FSN lock.
    this.pendingCached.clear();
    this.cached.clear();
    this.pendingUncached.clear();
  }
335      
  /**
   * Discard all pending replication, recovery, invalidation, and caching
   * work queued for this datanode.
   */
  public void clearBlockQueues() {
    // invalidateBlocks is its own lock; the BlockQueues synchronize
    // internally, so taking invalidateBlocks here covers all three clears.
    synchronized (invalidateBlocks) {
      this.invalidateBlocks.clear();
      this.recoverBlocks.clear();
      this.replicateBlocks.clear();
    }
    // pendingCached, cached, and pendingUncached are protected by the
    // FSN lock.
    this.pendingCached.clear();
    this.cached.clear();
    this.pendingUncached.clear();
  }
348    
349      public int numBlocks() {
350        int blocks = 0;
351        for (DatanodeStorageInfo entry : getStorageInfos()) {
352          blocks += entry.numBlocks();
353        }
354        return blocks;
355      }
356    
357      /**
358       * Updates stats from datanode heartbeat.
359       */
360      public void updateHeartbeat(StorageReport[] reports, long cacheCapacity,
361          long cacheUsed, int xceiverCount, int volFailures) {
362        long totalCapacity = 0;
363        long totalRemaining = 0;
364        long totalBlockPoolUsed = 0;
365        long totalDfsUsed = 0;
366    
367        setCacheCapacity(cacheCapacity);
368        setCacheUsed(cacheUsed);
369        setXceiverCount(xceiverCount);
370        setLastUpdate(Time.now());    
371        this.volumeFailures = volFailures;
372        for (StorageReport report : reports) {
373          DatanodeStorageInfo storage = updateStorage(report.getStorage());
374          storage.receivedHeartbeat(report);
375          totalCapacity += report.getCapacity();
376          totalRemaining += report.getRemaining();
377          totalBlockPoolUsed += report.getBlockPoolUsed();
378          totalDfsUsed += report.getDfsUsed();
379        }
380        rollBlocksScheduled(getLastUpdate());
381    
382        // Update total metrics for the node.
383        setCapacity(totalCapacity);
384        setRemaining(totalRemaining);
385        setBlockPoolUsed(totalBlockPoolUsed);
386        setDfsUsed(totalDfsUsed);
387      }
388    
  /**
   * Iterates over all blocks of this datanode by chaining the per-storage
   * block iterators of the given storages, in order.
   */
  private static class BlockIterator implements Iterator<BlockInfo> {
    // Index of the storage iterator currently being consumed.
    private int index = 0;
    private final List<Iterator<BlockInfo>> iterators;
    
    private BlockIterator(final DatanodeStorageInfo... storages) {
      List<Iterator<BlockInfo>> iterators = new ArrayList<Iterator<BlockInfo>>();
      for (DatanodeStorageInfo e : storages) {
        iterators.add(e.getBlockIterator());
      }
      this.iterators = Collections.unmodifiableList(iterators);
    }

    @Override
    public boolean hasNext() {
      update();
      return !iterators.isEmpty() && iterators.get(index).hasNext();
    }

    @Override
    public BlockInfo next() {
      update();
      return iterators.get(index).next();
    }
    
    @Override
    public void remove() {
      throw new UnsupportedOperationException("Remove unsupported.");
    }
    
    /** Skip past exhausted iterators, stopping at the last one. */
    private void update() {
      while(index < iterators.size() - 1 && !iterators.get(index).hasNext()) {
        index++;
      }
    }
  }
424    
  /** @return an iterator over all blocks on all of this node's storages */
  Iterator<BlockInfo> getBlockIterator() {
    return new BlockIterator(getStorageInfos());
  }
  /**
   * @return an iterator over the blocks of the given storage only.
   * NOTE(review): an unknown storageID yields a null element and would
   * presumably NPE in the BlockIterator constructor — confirm callers only
   * pass known IDs.
   */
  Iterator<BlockInfo> getBlockIterator(final String storageID) {
    return new BlockIterator(getStorageInfo(storageID));
  }
431    
432      /**
433       * Store block replication work.
434       */
435      void addBlockToBeReplicated(Block block, DatanodeStorageInfo[] targets) {
436        assert(block != null && targets != null && targets.length > 0);
437        replicateBlocks.offer(new BlockTargetPair(block, targets));
438      }
439    
440      /**
441       * Store block recovery work.
442       */
443      void addBlockToBeRecovered(BlockInfoUnderConstruction block) {
444        if(recoverBlocks.contains(block)) {
445          // this prevents adding the same block twice to the recovery queue
446          BlockManager.LOG.info(block + " is already in the recovery queue");
447          return;
448        }
449        recoverBlocks.offer(block);
450      }
451    
452      /**
453       * Store block invalidation work.
454       */
455      void addBlocksToBeInvalidated(List<Block> blocklist) {
456        assert(blocklist != null && blocklist.size() > 0);
457        synchronized (invalidateBlocks) {
458          for(Block blk : blocklist) {
459            invalidateBlocks.add(blk);
460          }
461        }
462      }
463      
464      /**
465       * The number of work items that are pending to be replicated
466       */
467      int getNumberOfBlocksToBeReplicated() {
468        return replicateBlocks.size();
469      }
470    
471      /**
472       * The number of block invalidation items that are pending to 
473       * be sent to the datanode
474       */
475      int getNumberOfBlocksToBeInvalidated() {
476        synchronized (invalidateBlocks) {
477          return invalidateBlocks.size();
478        }
479      }
480    
  /**
   * Dequeue up to maxTransfers pending replication work items.
   * @param maxTransfers maximum number of items to dequeue
   * @return the dequeued work, or null if none is pending
   */
  public List<BlockTargetPair> getReplicationCommand(int maxTransfers) {
    return replicateBlocks.poll(maxTransfers);
  }
484    
485      public BlockInfoUnderConstruction[] getLeaseRecoveryCommand(int maxTransfers) {
486        List<BlockInfoUnderConstruction> blocks = recoverBlocks.poll(maxTransfers);
487        if(blocks == null)
488          return null;
489        return blocks.toArray(new BlockInfoUnderConstruction[blocks.size()]);
490      }
491    
492      /**
493       * Remove the specified number of blocks to be invalidated
494       */
495      public Block[] getInvalidateBlocks(int maxblocks) {
496        synchronized (invalidateBlocks) {
497          Block[] deleteList = invalidateBlocks.pollToArray(new Block[Math.min(
498              invalidateBlocks.size(), maxblocks)]);
499          return deleteList.length == 0 ? null : deleteList;
500        }
501      }
502    
503      /**
504       * @return Approximate number of blocks currently scheduled to be written 
505       * to this datanode.
506       */
507      public int getBlocksScheduled() {
508        return currApproxBlocksScheduled + prevApproxBlocksScheduled;
509      }
510    
511      /** Increment the number of blocks scheduled. */
512      void incrementBlocksScheduled() {
513        currApproxBlocksScheduled++;
514      }
515      
516      /** Decrement the number of blocks scheduled. */
517      void decrementBlocksScheduled() {
518        if (prevApproxBlocksScheduled > 0) {
519          prevApproxBlocksScheduled--;
520        } else if (currApproxBlocksScheduled > 0) {
521          currApproxBlocksScheduled--;
522        } 
523        // its ok if both counters are zero.
524      }
525      
526      /** Adjusts curr and prev number of blocks scheduled every few minutes. */
527      private void rollBlocksScheduled(long now) {
528        if (now - lastBlocksScheduledRollTime > BLOCKS_SCHEDULED_ROLL_INTERVAL) {
529          prevApproxBlocksScheduled = currApproxBlocksScheduled;
530          currApproxBlocksScheduled = 0;
531          lastBlocksScheduledRollTime = now;
532        }
533      }
534      
  @Override
  public int hashCode() {
    // Super implementation is sufficient
    return super.hashCode();
  }
  
  @Override
  public boolean equals(Object obj) {
    // Sufficient to use super equality as datanodes are uniquely identified
    // by DatanodeID
    return (this == obj) || super.equals(obj);
  }
547    
548      /** Decommissioning status */
549      public class DecommissioningStatus {
550        private int underReplicatedBlocks;
551        private int decommissionOnlyReplicas;
552        private int underReplicatedInOpenFiles;
553        private long startTime;
554        
555        synchronized void set(int underRep,
556            int onlyRep, int underConstruction) {
557          if (isDecommissionInProgress() == false) {
558            return;
559          }
560          underReplicatedBlocks = underRep;
561          decommissionOnlyReplicas = onlyRep;
562          underReplicatedInOpenFiles = underConstruction;
563        }
564    
565        /** @return the number of under-replicated blocks */
566        public synchronized int getUnderReplicatedBlocks() {
567          if (isDecommissionInProgress() == false) {
568            return 0;
569          }
570          return underReplicatedBlocks;
571        }
572        /** @return the number of decommission-only replicas */
573        public synchronized int getDecommissionOnlyReplicas() {
574          if (isDecommissionInProgress() == false) {
575            return 0;
576          }
577          return decommissionOnlyReplicas;
578        }
579        /** @return the number of under-replicated blocks in open files */
580        public synchronized int getUnderReplicatedInOpenFiles() {
581          if (isDecommissionInProgress() == false) {
582            return 0;
583          }
584          return underReplicatedInOpenFiles;
585        }
586        /** Set start time */
587        public synchronized void setStartTime(long time) {
588          startTime = time;
589        }
590        /** @return start time */
591        public synchronized long getStartTime() {
592          if (isDecommissionInProgress() == false) {
593            return 0;
594          }
595          return startTime;
596        }
597      }  // End of class DecommissioningStatus
598    
599      /**
600       * Set the flag to indicate if this datanode is disallowed from communicating
601       * with the namenode.
602       */
603      public void setDisallowed(boolean flag) {
604        disallowed = flag;
605      }
606      /** Is the datanode disallowed from communicating with the namenode? */
607      public boolean isDisallowed() {
608        return disallowed;
609      }
610    
611      /**
612       * @return number of failed volumes in the datanode.
613       */
614      public int getVolumeFailures() {
615        return volumeFailures;
616      }
617    
618      /**
619       * @param nodeReg DatanodeID to update registration for.
620       */
621      @Override
622      public void updateRegInfo(DatanodeID nodeReg) {
623        super.updateRegInfo(nodeReg);
624        
625        // must re-process IBR after re-registration
626        for(DatanodeStorageInfo storage : getStorageInfos()) {
627          storage.setBlockReportCount(0);
628        }
629      }
630    
631      /**
632       * @return balancer bandwidth in bytes per second for this datanode
633       */
634      public long getBalancerBandwidth() {
635        return this.bandwidth;
636      }
637    
638      /**
639       * @param bandwidth balancer bandwidth in bytes per second for this datanode
640       */
641      public void setBalancerBandwidth(long bandwidth) {
642        this.bandwidth = bandwidth;
643      }
644    
645      @Override
646      public String dumpDatanode() {
647        StringBuilder sb = new StringBuilder(super.dumpDatanode());
648        int repl = replicateBlocks.size();
649        if (repl > 0) {
650          sb.append(" ").append(repl).append(" blocks to be replicated;");
651        }
652        int inval = invalidateBlocks.size();
653        if (inval > 0) {
654          sb.append(" ").append(inval).append(" blocks to be invalidated;");      
655        }
656        int recover = recoverBlocks.size();
657        if (recover > 0) {
658          sb.append(" ").append(recover).append(" blocks to be recovered;");
659        }
660        return sb.toString();
661      }
662    
  /**
   * Look up the DatanodeStorageInfo for the reported storage, creating it
   * on first sight and refreshing its type/state when they differ from the
   * report (for reports from older datanodes).
   *
   * @param s the storage as reported by the datanode
   * @return the (possibly newly created) storage info
   */
  DatanodeStorageInfo updateStorage(DatanodeStorage s) {
    synchronized (storageMap) {
      DatanodeStorageInfo storage = storageMap.get(s.getStorageID());
      if (storage == null) {
        LOG.info("Adding new storage ID " + s.getStorageID() +
                 " for DN " + getXferAddr());
        storage = new DatanodeStorageInfo(this, s);
        storageMap.put(s.getStorageID(), storage);
      } else if (storage.getState() != s.getState() ||
                 storage.getStorageType() != s.getStorageType()) {
        // For backwards compatibility, make sure that the type and
        // state are updated. Some reports from older datanodes do
        // not include these fields so we may have assumed defaults.
        // This check can be removed in the next major release after
        // 2.4.
        storage.updateFromStorage(s);
        // NOTE(review): this put re-inserts the same key/value pair and
        // looks redundant — confirm before removing.
        storageMap.put(storage.getStorageID(), storage);
      }
      return storage;
    }
  }
684    
685      /**
686       * @return   The time at which we last sent caching directives to this 
687       *           DataNode, in monotonic milliseconds.
688       */
689      public long getLastCachingDirectiveSentTimeMs() {
690        return this.lastCachingDirectiveSentTimeMs;
691      }
692    
693      /**
694       * @param time  The time at which we last sent caching directives to this 
695       *              DataNode, in monotonic milliseconds.
696       */
697      public void setLastCachingDirectiveSentTimeMs(long time) {
698        this.lastCachingDirectiveSentTimeMs = time;
699      }
700    }
701