001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    package org.apache.hadoop.hdfs.server.blockmanagement;
019    
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Queue;
import java.util.Set;
import java.util.TreeSet;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.DatanodeID;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.util.LightWeightHashSet;
import org.apache.hadoop.util.Time;
036    
037    /**
038     * This class extends the DatanodeInfo class with ephemeral information (eg
039     * health, capacity, what blocks are associated with the Datanode) that is
040     * private to the Namenode, ie this class is not exposed to clients.
041     */
042    @InterfaceAudience.Private
043    @InterfaceStability.Evolving
044    public class DatanodeDescriptor extends DatanodeInfo {
045      
  // Stores status of decommissioning.
  // If node is not decommissioning, do not use this object for anything.
  // NOTE(review): public mutable field, read/written directly by callers.
  public DecommissioningStatus decommissioningStatus = new DecommissioningStatus();
049      
050      /** Block and targets pair */
051      @InterfaceAudience.Private
052      @InterfaceStability.Evolving
053      public static class BlockTargetPair {
054        public final Block block;
055        public final DatanodeDescriptor[] targets;    
056    
057        BlockTargetPair(Block block, DatanodeDescriptor[] targets) {
058          this.block = block;
059          this.targets = targets;
060        }
061      }
062    
063      /** A BlockTargetPair queue. */
064      private static class BlockQueue<E> {
065        private final Queue<E> blockq = new LinkedList<E>();
066    
067        /** Size of the queue */
068        synchronized int size() {return blockq.size();}
069    
070        /** Enqueue */
071        synchronized boolean offer(E e) { 
072          return blockq.offer(e);
073        }
074    
075        /** Dequeue */
076        synchronized List<E> poll(int numBlocks) {
077          if (numBlocks <= 0 || blockq.isEmpty()) {
078            return null;
079          }
080    
081          List<E> results = new ArrayList<E>();
082          for(; !blockq.isEmpty() && numBlocks > 0; numBlocks--) {
083            results.add(blockq.poll());
084          }
085          return results;
086        }
087    
088        /**
089         * Returns <tt>true</tt> if the queue contains the specified element.
090         */
091        boolean contains(E e) {
092          return blockq.contains(e);
093        }
094    
095        synchronized void clear() {
096          blockq.clear();
097        }
098      }
099    
  /** Head of this datanode's block list; links maintained through
   *  BlockInfo.listInsert/listRemove (see addBlock/removeBlock). */
  private volatile BlockInfo blockList = null;
  /** Number of blocks currently linked into blockList. */
  private int numBlocks = 0;
  // isAlive == heartbeats.contains(this)
  // This is an optimization, because contains takes O(n) time on Arraylist
  public boolean isAlive = false;
  public boolean needKeyUpdate = false;

  /**
   * Set to false on any NN failover, and reset to true
   * whenever a heartbeat is received (see updateHeartbeat()).
   */
  private boolean heartbeatedSinceFailover = false;
  
  /**
   * At startup or at any failover, the DNs in the cluster may
   * have pending block deletions from a previous incarnation
   * of the NameNode. Thus, we consider their block contents
   * stale until we have received a block report. When a DN
   * is considered stale, any replicas on it are transitively
   * considered stale. If any block has at least one stale replica,
   * then no invalidations will be processed for this block.
   * See HDFS-1972.
   */
  private boolean blockContentsStale = true;
  
  // A system administrator can tune the balancer bandwidth parameter
  // (dfs.balance.bandwidthPerSec) dynamically by calling
  // "dfsadmin -setBalanacerBandwidth <newbandwidth>", at which point the
  // following 'bandwidth' variable gets updated with the new value for each
  // node. Once the heartbeat command is issued to update the value on the
  // specified datanode, this value will be set back to 0.
  private long bandwidth;

  /** A queue of blocks to be replicated by this datanode */
  private BlockQueue<BlockTargetPair> replicateBlocks = new BlockQueue<BlockTargetPair>();
  /** A queue of blocks to be recovered by this datanode */
  private BlockQueue<BlockInfoUnderConstruction> recoverBlocks =
                                new BlockQueue<BlockInfoUnderConstruction>();
  /** A set of blocks to be invalidated by this datanode; accessed under
   *  synchronized (invalidateBlocks) by its users in this class. */
  private LightWeightHashSet<Block> invalidateBlocks = new LightWeightHashSet<Block>();

  /* Variables for maintaining number of blocks scheduled to be written to
   * this datanode. This count is approximate and might be slightly bigger
   * in case of errors (e.g. datanode does not report if an error occurs
   * while writing the block).
   */
  private int currApproxBlocksScheduled = 0;
  private int prevApproxBlocksScheduled = 0;
  private long lastBlocksScheduledRollTime = 0;
  private static final int BLOCKS_SCHEDULED_ROLL_INTERVAL = 600*1000; //10min
  private int volumeFailures = 0;
  
  /** Set to false after processing first block report */
  private boolean firstBlockReport = true;
  
  /** 
   * When set to true, the node is not in include list and is not allowed
   * to communicate with the namenode
   */
  private boolean disallowed = false;
160    
161      /**
162       * DatanodeDescriptor constructor
163       * @param nodeID id of the data node
164       */
165      public DatanodeDescriptor(DatanodeID nodeID) {
166        this(nodeID, 0L, 0L, 0L, 0L, 0, 0);
167      }
168    
169      /**
170       * DatanodeDescriptor constructor
171       * @param nodeID id of the data node
172       * @param networkLocation location of the data node in network
173       */
174      public DatanodeDescriptor(DatanodeID nodeID, 
175                                String networkLocation) {
176        this(nodeID, networkLocation, 0L, 0L, 0L, 0L, 0, 0);
177      }
178      
179      /**
180       * DatanodeDescriptor constructor
181       * @param nodeID id of the data node
182       * @param capacity capacity of the data node
183       * @param dfsUsed space used by the data node
184       * @param remaining remaining capacity of the data node
185       * @param bpused space used by the block pool corresponding to this namenode
186       * @param xceiverCount # of data transfers at the data node
187       */
188      public DatanodeDescriptor(DatanodeID nodeID, 
189                                long capacity,
190                                long dfsUsed,
191                                long remaining,
192                                long bpused,
193                                int xceiverCount,
194                                int failedVolumes) {
195        super(nodeID);
196        updateHeartbeat(capacity, dfsUsed, remaining, bpused, xceiverCount, 
197            failedVolumes);
198      }
199    
200      /**
201       * DatanodeDescriptor constructor
202       * @param nodeID id of the data node
203       * @param networkLocation location of the data node in network
204       * @param capacity capacity of the data node, including space used by non-dfs
205       * @param dfsUsed the used space by dfs datanode
206       * @param remaining remaining capacity of the data node
207       * @param bpused space used by the block pool corresponding to this namenode
208       * @param xceiverCount # of data transfers at the data node
209       */
210      public DatanodeDescriptor(DatanodeID nodeID,
211                                String networkLocation,
212                                long capacity,
213                                long dfsUsed,
214                                long remaining,
215                                long bpused,
216                                int xceiverCount,
217                                int failedVolumes) {
218        super(nodeID, networkLocation);
219        updateHeartbeat(capacity, dfsUsed, remaining, bpused, xceiverCount, 
220            failedVolumes);
221      }
222    
223      /**
224       * Add datanode to the block.
225       * Add block to the head of the list of blocks belonging to the data-node.
226       */
227      public boolean addBlock(BlockInfo b) {
228        if(!b.addNode(this))
229          return false;
230        // add to the head of the data-node list
231        blockList = b.listInsert(blockList, this);
232        numBlocks++;
233        return true;
234      }
235      
236      /**
237       * Remove block from the list of blocks belonging to the data-node.
238       * Remove datanode from the block.
239       */
240      public boolean removeBlock(BlockInfo b) {
241        blockList = b.listRemove(blockList, this);
242        if ( b.removeNode(this) ) {
243          numBlocks--;
244          return true;
245        } else {
246          return false;
247        }
248      }
249    
250      /**
251       * Move block to the head of the list of blocks belonging to the data-node.
252       * @return the index of the head of the blockList
253       */
254      int moveBlockToHead(BlockInfo b, int curIndex, int headIndex) {
255        blockList = b.moveBlockToHead(blockList, this, curIndex, headIndex);
256        return curIndex;
257      }
258    
259      /**
260       * Used for testing only
261       * @return the head of the blockList
262       */
263      protected BlockInfo getHead(){
264        return blockList;
265      }
266    
267      /**
268       * Replace specified old block with a new one in the DataNodeDescriptor.
269       *
270       * @param oldBlock - block to be replaced
271       * @param newBlock - a replacement block
272       * @return the new block
273       */
274      public BlockInfo replaceBlock(BlockInfo oldBlock, BlockInfo newBlock) {
275        boolean done = removeBlock(oldBlock);
276        assert done : "Old block should belong to the data-node when replacing";
277        done = addBlock(newBlock);
278        assert done : "New block should not belong to the data-node when replacing";
279        return newBlock;
280      }
281    
282      public void resetBlocks() {
283        setCapacity(0);
284        setRemaining(0);
285        setBlockPoolUsed(0);
286        setDfsUsed(0);
287        setXceiverCount(0);
288        this.blockList = null;
289        this.invalidateBlocks.clear();
290        this.volumeFailures = 0;
291      }
292      
  /**
   * Drop all pending replication, recovery and invalidation work queued
   * for this datanode.
   */
  public void clearBlockQueues() {
    synchronized (invalidateBlocks) {
      this.invalidateBlocks.clear();
      // recoverBlocks and replicateBlocks synchronize internally; they are
      // cleared here under the invalidateBlocks lock as well.
      this.recoverBlocks.clear();
      this.replicateBlocks.clear();
    }
  }

  /** @return the number of blocks currently associated with this datanode. */
  public int numBlocks() {
    return numBlocks;
  }
304    
305      /**
306       * Updates stats from datanode heartbeat.
307       */
308      public void updateHeartbeat(long capacity, long dfsUsed, long remaining,
309          long blockPoolUsed, int xceiverCount, int volFailures) {
310        setCapacity(capacity);
311        setRemaining(remaining);
312        setBlockPoolUsed(blockPoolUsed);
313        setDfsUsed(dfsUsed);
314        setXceiverCount(xceiverCount);
315        setLastUpdate(Time.now());    
316        this.volumeFailures = volFailures;
317        this.heartbeatedSinceFailover = true;
318        rollBlocksScheduled(getLastUpdate());
319      }
320    
321      /**
322       * Iterates over the list of blocks belonging to the datanode.
323       */
324      public static class BlockIterator implements Iterator<BlockInfo> {
325        private BlockInfo current;
326        private DatanodeDescriptor node;
327          
328        BlockIterator(BlockInfo head, DatanodeDescriptor dn) {
329          this.current = head;
330          this.node = dn;
331        }
332    
333        @Override
334        public boolean hasNext() {
335          return current != null;
336        }
337    
338        @Override
339        public BlockInfo next() {
340          BlockInfo res = current;
341          current = current.getNext(current.findDatanode(node));
342          return res;
343        }
344    
345        @Override
346        public void remove()  {
347          throw new UnsupportedOperationException("Sorry. can't remove.");
348        }
349      }
350    
  /** @return an iterator over the blocks belonging to this datanode. */
  public Iterator<BlockInfo> getBlockIterator() {
    return new BlockIterator(this.blockList, this);
  }
354      
355      /**
356       * Store block replication work.
357       */
358      void addBlockToBeReplicated(Block block, DatanodeDescriptor[] targets) {
359        assert(block != null && targets != null && targets.length > 0);
360        replicateBlocks.offer(new BlockTargetPair(block, targets));
361      }
362    
363      /**
364       * Store block recovery work.
365       */
366      void addBlockToBeRecovered(BlockInfoUnderConstruction block) {
367        if(recoverBlocks.contains(block)) {
368          // this prevents adding the same block twice to the recovery queue
369          BlockManager.LOG.info(block + " is already in the recovery queue");
370          return;
371        }
372        recoverBlocks.offer(block);
373      }
374    
375      /**
376       * Store block invalidation work.
377       */
378      void addBlocksToBeInvalidated(List<Block> blocklist) {
379        assert(blocklist != null && blocklist.size() > 0);
380        synchronized (invalidateBlocks) {
381          for(Block blk : blocklist) {
382            invalidateBlocks.add(blk);
383          }
384        }
385      }
386    
387      /**
388       * The number of work items that are pending to be replicated
389       */
390      int getNumberOfBlocksToBeReplicated() {
391        return replicateBlocks.size();
392      }
393    
394      /**
395       * The number of block invalidation items that are pending to 
396       * be sent to the datanode
397       */
398      int getNumberOfBlocksToBeInvalidated() {
399        synchronized (invalidateBlocks) {
400          return invalidateBlocks.size();
401        }
402      }
403      
404      public List<BlockTargetPair> getReplicationCommand(int maxTransfers) {
405        return replicateBlocks.poll(maxTransfers);
406      }
407    
408      public BlockInfoUnderConstruction[] getLeaseRecoveryCommand(int maxTransfers) {
409        List<BlockInfoUnderConstruction> blocks = recoverBlocks.poll(maxTransfers);
410        if(blocks == null)
411          return null;
412        return blocks.toArray(new BlockInfoUnderConstruction[blocks.size()]);
413      }
414    
415      /**
416       * Remove the specified number of blocks to be invalidated
417       */
418      public Block[] getInvalidateBlocks(int maxblocks) {
419        synchronized (invalidateBlocks) {
420          Block[] deleteList = invalidateBlocks.pollToArray(new Block[Math.min(
421              invalidateBlocks.size(), maxblocks)]);
422          return deleteList.length == 0 ? null : deleteList;
423        }
424      }
425    
426      /**
427       * @return Approximate number of blocks currently scheduled to be written 
428       * to this datanode.
429       */
430      public int getBlocksScheduled() {
431        return currApproxBlocksScheduled + prevApproxBlocksScheduled;
432      }
433      
434      /**
435       * Increments counter for number of blocks scheduled. 
436       */
437      public void incBlocksScheduled() {
438        currApproxBlocksScheduled++;
439      }
440      
441      /**
442       * Decrements counter for number of blocks scheduled.
443       */
444      void decBlocksScheduled() {
445        if (prevApproxBlocksScheduled > 0) {
446          prevApproxBlocksScheduled--;
447        } else if (currApproxBlocksScheduled > 0) {
448          currApproxBlocksScheduled--;
449        } 
450        // its ok if both counters are zero.
451      }
452      
453      /**
454       * Adjusts curr and prev number of blocks scheduled every few minutes.
455       */
456      private void rollBlocksScheduled(long now) {
457        if ((now - lastBlocksScheduledRollTime) > 
458            BLOCKS_SCHEDULED_ROLL_INTERVAL) {
459          prevApproxBlocksScheduled = currApproxBlocksScheduled;
460          currApproxBlocksScheduled = 0;
461          lastBlocksScheduledRollTime = now;
462        }
463      }
464      
465      @Override
466      public int hashCode() {
467        // Super implementation is sufficient
468        return super.hashCode();
469      }
470      
471      @Override
472      public boolean equals(Object obj) {
473        // Sufficient to use super equality as datanodes are uniquely identified
474        // by DatanodeID
475        return (this == obj) || super.equals(obj);
476      }
477    
478      /** Decommissioning status */
479      public class DecommissioningStatus {
480        private int underReplicatedBlocks;
481        private int decommissionOnlyReplicas;
482        private int underReplicatedInOpenFiles;
483        private long startTime;
484        
485        synchronized void set(int underRep,
486            int onlyRep, int underConstruction) {
487          if (isDecommissionInProgress() == false) {
488            return;
489          }
490          underReplicatedBlocks = underRep;
491          decommissionOnlyReplicas = onlyRep;
492          underReplicatedInOpenFiles = underConstruction;
493        }
494    
495        /** @return the number of under-replicated blocks */
496        public synchronized int getUnderReplicatedBlocks() {
497          if (isDecommissionInProgress() == false) {
498            return 0;
499          }
500          return underReplicatedBlocks;
501        }
502        /** @return the number of decommission-only replicas */
503        public synchronized int getDecommissionOnlyReplicas() {
504          if (isDecommissionInProgress() == false) {
505            return 0;
506          }
507          return decommissionOnlyReplicas;
508        }
509        /** @return the number of under-replicated blocks in open files */
510        public synchronized int getUnderReplicatedInOpenFiles() {
511          if (isDecommissionInProgress() == false) {
512            return 0;
513          }
514          return underReplicatedInOpenFiles;
515        }
516        /** Set start time */
517        public synchronized void setStartTime(long time) {
518          startTime = time;
519        }
520        /** @return start time */
521        public synchronized long getStartTime() {
522          if (isDecommissionInProgress() == false) {
523            return 0;
524          }
525          return startTime;
526        }
527      }  // End of class DecommissioningStatus
528    
529      /**
530       * Set the flag to indicate if this datanode is disallowed from communicating
531       * with the namenode.
532       */
533      public void setDisallowed(boolean flag) {
534        disallowed = flag;
535      }
536      /** Is the datanode disallowed from communicating with the namenode? */
537      public boolean isDisallowed() {
538        return disallowed;
539      }
540    
541      /**
542       * @return number of failed volumes in the datanode.
543       */
544      public int getVolumeFailures() {
545        return volumeFailures;
546      }
547    
548      /**
549       * @param nodeReg DatanodeID to update registration for.
550       */
551      @Override
552      public void updateRegInfo(DatanodeID nodeReg) {
553        super.updateRegInfo(nodeReg);
554        firstBlockReport = true; // must re-process IBR after re-registration
555      }
556    
557      /**
558       * @return balancer bandwidth in bytes per second for this datanode
559       */
560      public long getBalancerBandwidth() {
561        return this.bandwidth;
562      }
563    
564      /**
565       * @param bandwidth balancer bandwidth in bytes per second for this datanode
566       */
567      public void setBalancerBandwidth(long bandwidth) {
568        this.bandwidth = bandwidth;
569      }
570    
571      public boolean areBlockContentsStale() {
572        return blockContentsStale;
573      }
574    
575      public void markStaleAfterFailover() {
576        heartbeatedSinceFailover = false;
577        blockContentsStale = true;
578      }
579    
580      public void receivedBlockReport() {
581        if (heartbeatedSinceFailover) {
582          blockContentsStale = false;
583        }
584        firstBlockReport = false;
585      }
586      
587      boolean isFirstBlockReport() {
588        return firstBlockReport;
589      }
590    
591      @Override
592      public String dumpDatanode() {
593        StringBuilder sb = new StringBuilder(super.dumpDatanode());
594        int repl = replicateBlocks.size();
595        if (repl > 0) {
596          sb.append(" ").append(repl).append(" blocks to be replicated;");
597        }
598        int inval = invalidateBlocks.size();
599        if (inval > 0) {
600          sb.append(" ").append(inval).append(" blocks to be invalidated;");      
601        }
602        int recover = recoverBlocks.size();
603        if (recover > 0) {
604          sb.append(" ").append(recover).append(" blocks to be recovered;");
605        }
606        return sb.toString();
607      }
608    }