001 /**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License. You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018 package org.apache.hadoop.hdfs.server.blockmanagement;
019
020 import java.util.ArrayList;
021 import java.util.Collection;
022 import java.util.Iterator;
023 import java.util.LinkedList;
024 import java.util.List;
025 import java.util.Queue;
026 import java.util.Set;
027 import java.util.TreeSet;
028
029 import org.apache.hadoop.classification.InterfaceAudience;
030 import org.apache.hadoop.classification.InterfaceStability;
031 import org.apache.hadoop.hdfs.protocol.Block;
032 import org.apache.hadoop.hdfs.protocol.DatanodeID;
033 import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
034 import org.apache.hadoop.hdfs.util.LightWeightHashSet;
035 import org.apache.hadoop.util.Time;
036
/**
 * This class extends the DatanodeInfo class with ephemeral information (e.g.
 * health, capacity, what blocks are associated with the Datanode) that is
 * private to the Namenode, i.e. this class is not exposed to clients.
 */
042 @InterfaceAudience.Private
043 @InterfaceStability.Evolving
044 public class DatanodeDescriptor extends DatanodeInfo {
045
046 // Stores status of decommissioning.
047 // If node is not decommissioning, do not use this object for anything.
048 public DecommissioningStatus decommissioningStatus = new DecommissioningStatus();
049
050 /** Block and targets pair */
051 @InterfaceAudience.Private
052 @InterfaceStability.Evolving
053 public static class BlockTargetPair {
054 public final Block block;
055 public final DatanodeDescriptor[] targets;
056
057 BlockTargetPair(Block block, DatanodeDescriptor[] targets) {
058 this.block = block;
059 this.targets = targets;
060 }
061 }
062
063 /** A BlockTargetPair queue. */
064 private static class BlockQueue<E> {
065 private final Queue<E> blockq = new LinkedList<E>();
066
067 /** Size of the queue */
068 synchronized int size() {return blockq.size();}
069
070 /** Enqueue */
071 synchronized boolean offer(E e) {
072 return blockq.offer(e);
073 }
074
075 /** Dequeue */
076 synchronized List<E> poll(int numBlocks) {
077 if (numBlocks <= 0 || blockq.isEmpty()) {
078 return null;
079 }
080
081 List<E> results = new ArrayList<E>();
082 for(; !blockq.isEmpty() && numBlocks > 0; numBlocks--) {
083 results.add(blockq.poll());
084 }
085 return results;
086 }
087
088 /**
089 * Returns <tt>true</tt> if the queue contains the specified element.
090 */
091 boolean contains(E e) {
092 return blockq.contains(e);
093 }
094
095 synchronized void clear() {
096 blockq.clear();
097 }
098 }
099
100 private volatile BlockInfo blockList = null;
101 private int numBlocks = 0;
102 // isAlive == heartbeats.contains(this)
103 // This is an optimization, because contains takes O(n) time on Arraylist
104 public boolean isAlive = false;
105 public boolean needKeyUpdate = false;
106
107 /**
108 * Set to false on any NN failover, and reset to true
109 * whenever a block report is received.
110 */
111 private boolean heartbeatedSinceFailover = false;
112
113 /**
114 * At startup or at any failover, the DNs in the cluster may
115 * have pending block deletions from a previous incarnation
116 * of the NameNode. Thus, we consider their block contents
117 * stale until we have received a block report. When a DN
118 * is considered stale, any replicas on it are transitively
119 * considered stale. If any block has at least one stale replica,
120 * then no invalidations will be processed for this block.
121 * See HDFS-1972.
122 */
123 private boolean blockContentsStale = true;
124
125 // A system administrator can tune the balancer bandwidth parameter
126 // (dfs.balance.bandwidthPerSec) dynamically by calling
127 // "dfsadmin -setBalanacerBandwidth <newbandwidth>", at which point the
128 // following 'bandwidth' variable gets updated with the new value for each
129 // node. Once the heartbeat command is issued to update the value on the
130 // specified datanode, this value will be set back to 0.
131 private long bandwidth;
132
133 /** A queue of blocks to be replicated by this datanode */
134 private BlockQueue<BlockTargetPair> replicateBlocks = new BlockQueue<BlockTargetPair>();
135 /** A queue of blocks to be recovered by this datanode */
136 private BlockQueue<BlockInfoUnderConstruction> recoverBlocks =
137 new BlockQueue<BlockInfoUnderConstruction>();
138 /** A set of blocks to be invalidated by this datanode */
139 private LightWeightHashSet<Block> invalidateBlocks = new LightWeightHashSet<Block>();
140
141 /* Variables for maintaining number of blocks scheduled to be written to
142 * this datanode. This count is approximate and might be slightly bigger
143 * in case of errors (e.g. datanode does not report if an error occurs
144 * while writing the block).
145 */
146 private int currApproxBlocksScheduled = 0;
147 private int prevApproxBlocksScheduled = 0;
148 private long lastBlocksScheduledRollTime = 0;
149 private static final int BLOCKS_SCHEDULED_ROLL_INTERVAL = 600*1000; //10min
150 private int volumeFailures = 0;
151
152 /** Set to false after processing first block report */
153 private boolean firstBlockReport = true;
154
155 /**
156 * When set to true, the node is not in include list and is not allowed
157 * to communicate with the namenode
158 */
159 private boolean disallowed = false;
160
/**
 * Creates a descriptor whose usage statistics, transfer count and failed
 * volume count are all zero.
 *
 * @param nodeID id of the data node
 */
public DatanodeDescriptor(DatanodeID nodeID) {
  this(nodeID, 0L, 0L, 0L, 0L, 0, 0);
}
168
/**
 * Creates a descriptor at the given network location whose usage
 * statistics, transfer count and failed volume count are all zero.
 *
 * @param nodeID id of the data node
 * @param networkLocation location of the data node in network
 */
public DatanodeDescriptor(DatanodeID nodeID, String networkLocation) {
  this(nodeID, networkLocation, 0L, 0L, 0L, 0L, 0, 0);
}
178
/**
 * Creates a descriptor and records the given statistics through
 * {@code updateHeartbeat}.
 *
 * @param nodeID id of the data node
 * @param capacity capacity of the data node
 * @param dfsUsed space used by the data node
 * @param remaining remaining capacity of the data node
 * @param bpused space used by the block pool corresponding to this namenode
 * @param xceiverCount # of data transfers at the data node
 * @param failedVolumes number of failed volumes in the data node
 */
public DatanodeDescriptor(DatanodeID nodeID,
    long capacity, long dfsUsed, long remaining, long bpused,
    int xceiverCount, int failedVolumes) {
  super(nodeID);
  updateHeartbeat(capacity, dfsUsed, remaining, bpused,
      xceiverCount, failedVolumes);
}
199
/**
 * Creates a descriptor at the given network location and records the given
 * statistics through {@code updateHeartbeat}.
 *
 * @param nodeID id of the data node
 * @param networkLocation location of the data node in network
 * @param capacity capacity of the data node, including space used by non-dfs
 * @param dfsUsed the used space by dfs datanode
 * @param remaining remaining capacity of the data node
 * @param bpused space used by the block pool corresponding to this namenode
 * @param xceiverCount # of data transfers at the data node
 * @param failedVolumes number of failed volumes in the data node
 */
public DatanodeDescriptor(DatanodeID nodeID, String networkLocation,
    long capacity, long dfsUsed, long remaining, long bpused,
    int xceiverCount, int failedVolumes) {
  super(nodeID, networkLocation);
  updateHeartbeat(capacity, dfsUsed, remaining, bpused,
      xceiverCount, failedVolumes);
}
222
223 /**
224 * Add datanode to the block.
225 * Add block to the head of the list of blocks belonging to the data-node.
226 */
227 public boolean addBlock(BlockInfo b) {
228 if(!b.addNode(this))
229 return false;
230 // add to the head of the data-node list
231 blockList = b.listInsert(blockList, this);
232 numBlocks++;
233 return true;
234 }
235
236 /**
237 * Remove block from the list of blocks belonging to the data-node.
238 * Remove datanode from the block.
239 */
240 public boolean removeBlock(BlockInfo b) {
241 blockList = b.listRemove(blockList, this);
242 if ( b.removeNode(this) ) {
243 numBlocks--;
244 return true;
245 } else {
246 return false;
247 }
248 }
249
250 /**
251 * Move block to the head of the list of blocks belonging to the data-node.
252 * @return the index of the head of the blockList
253 */
254 int moveBlockToHead(BlockInfo b, int curIndex, int headIndex) {
255 blockList = b.moveBlockToHead(blockList, this, curIndex, headIndex);
256 return curIndex;
257 }
258
259 /**
260 * Used for testing only
261 * @return the head of the blockList
262 */
263 protected BlockInfo getHead(){
264 return blockList;
265 }
266
267 /**
268 * Replace specified old block with a new one in the DataNodeDescriptor.
269 *
270 * @param oldBlock - block to be replaced
271 * @param newBlock - a replacement block
272 * @return the new block
273 */
274 public BlockInfo replaceBlock(BlockInfo oldBlock, BlockInfo newBlock) {
275 boolean done = removeBlock(oldBlock);
276 assert done : "Old block should belong to the data-node when replacing";
277 done = addBlock(newBlock);
278 assert done : "New block should not belong to the data-node when replacing";
279 return newBlock;
280 }
281
282 public void resetBlocks() {
283 setCapacity(0);
284 setRemaining(0);
285 setBlockPoolUsed(0);
286 setDfsUsed(0);
287 setXceiverCount(0);
288 this.blockList = null;
289 this.invalidateBlocks.clear();
290 this.volumeFailures = 0;
291 }
292
293 public void clearBlockQueues() {
294 synchronized (invalidateBlocks) {
295 this.invalidateBlocks.clear();
296 this.recoverBlocks.clear();
297 this.replicateBlocks.clear();
298 }
299 }
300
301 public int numBlocks() {
302 return numBlocks;
303 }
304
305 /**
306 * Updates stats from datanode heartbeat.
307 */
308 public void updateHeartbeat(long capacity, long dfsUsed, long remaining,
309 long blockPoolUsed, int xceiverCount, int volFailures) {
310 setCapacity(capacity);
311 setRemaining(remaining);
312 setBlockPoolUsed(blockPoolUsed);
313 setDfsUsed(dfsUsed);
314 setXceiverCount(xceiverCount);
315 setLastUpdate(Time.now());
316 this.volumeFailures = volFailures;
317 this.heartbeatedSinceFailover = true;
318 rollBlocksScheduled(getLastUpdate());
319 }
320
321 /**
322 * Iterates over the list of blocks belonging to the datanode.
323 */
324 public static class BlockIterator implements Iterator<BlockInfo> {
325 private BlockInfo current;
326 private DatanodeDescriptor node;
327
328 BlockIterator(BlockInfo head, DatanodeDescriptor dn) {
329 this.current = head;
330 this.node = dn;
331 }
332
333 @Override
334 public boolean hasNext() {
335 return current != null;
336 }
337
338 @Override
339 public BlockInfo next() {
340 BlockInfo res = current;
341 current = current.getNext(current.findDatanode(node));
342 return res;
343 }
344
345 @Override
346 public void remove() {
347 throw new UnsupportedOperationException("Sorry. can't remove.");
348 }
349 }
350
351 public Iterator<BlockInfo> getBlockIterator() {
352 return new BlockIterator(this.blockList, this);
353 }
354
355 /**
356 * Store block replication work.
357 */
358 void addBlockToBeReplicated(Block block, DatanodeDescriptor[] targets) {
359 assert(block != null && targets != null && targets.length > 0);
360 replicateBlocks.offer(new BlockTargetPair(block, targets));
361 }
362
363 /**
364 * Store block recovery work.
365 */
366 void addBlockToBeRecovered(BlockInfoUnderConstruction block) {
367 if(recoverBlocks.contains(block)) {
368 // this prevents adding the same block twice to the recovery queue
369 BlockManager.LOG.info(block + " is already in the recovery queue");
370 return;
371 }
372 recoverBlocks.offer(block);
373 }
374
375 /**
376 * Store block invalidation work.
377 */
378 void addBlocksToBeInvalidated(List<Block> blocklist) {
379 assert(blocklist != null && blocklist.size() > 0);
380 synchronized (invalidateBlocks) {
381 for(Block blk : blocklist) {
382 invalidateBlocks.add(blk);
383 }
384 }
385 }
386
387 /**
388 * The number of work items that are pending to be replicated
389 */
390 int getNumberOfBlocksToBeReplicated() {
391 return replicateBlocks.size();
392 }
393
394 /**
395 * The number of block invalidation items that are pending to
396 * be sent to the datanode
397 */
398 int getNumberOfBlocksToBeInvalidated() {
399 synchronized (invalidateBlocks) {
400 return invalidateBlocks.size();
401 }
402 }
403
404 public List<BlockTargetPair> getReplicationCommand(int maxTransfers) {
405 return replicateBlocks.poll(maxTransfers);
406 }
407
408 public BlockInfoUnderConstruction[] getLeaseRecoveryCommand(int maxTransfers) {
409 List<BlockInfoUnderConstruction> blocks = recoverBlocks.poll(maxTransfers);
410 if(blocks == null)
411 return null;
412 return blocks.toArray(new BlockInfoUnderConstruction[blocks.size()]);
413 }
414
415 /**
416 * Remove the specified number of blocks to be invalidated
417 */
418 public Block[] getInvalidateBlocks(int maxblocks) {
419 synchronized (invalidateBlocks) {
420 Block[] deleteList = invalidateBlocks.pollToArray(new Block[Math.min(
421 invalidateBlocks.size(), maxblocks)]);
422 return deleteList.length == 0 ? null : deleteList;
423 }
424 }
425
426 /**
427 * @return Approximate number of blocks currently scheduled to be written
428 * to this datanode.
429 */
430 public int getBlocksScheduled() {
431 return currApproxBlocksScheduled + prevApproxBlocksScheduled;
432 }
433
434 /**
435 * Increments counter for number of blocks scheduled.
436 */
437 public void incBlocksScheduled() {
438 currApproxBlocksScheduled++;
439 }
440
441 /**
442 * Decrements counter for number of blocks scheduled.
443 */
444 void decBlocksScheduled() {
445 if (prevApproxBlocksScheduled > 0) {
446 prevApproxBlocksScheduled--;
447 } else if (currApproxBlocksScheduled > 0) {
448 currApproxBlocksScheduled--;
449 }
450 // its ok if both counters are zero.
451 }
452
453 /**
454 * Adjusts curr and prev number of blocks scheduled every few minutes.
455 */
456 private void rollBlocksScheduled(long now) {
457 if ((now - lastBlocksScheduledRollTime) >
458 BLOCKS_SCHEDULED_ROLL_INTERVAL) {
459 prevApproxBlocksScheduled = currApproxBlocksScheduled;
460 currApproxBlocksScheduled = 0;
461 lastBlocksScheduledRollTime = now;
462 }
463 }
464
465 @Override
466 public int hashCode() {
467 // Super implementation is sufficient
468 return super.hashCode();
469 }
470
471 @Override
472 public boolean equals(Object obj) {
473 // Sufficient to use super equality as datanodes are uniquely identified
474 // by DatanodeID
475 return (this == obj) || super.equals(obj);
476 }
477
478 /** Decommissioning status */
479 public class DecommissioningStatus {
480 private int underReplicatedBlocks;
481 private int decommissionOnlyReplicas;
482 private int underReplicatedInOpenFiles;
483 private long startTime;
484
485 synchronized void set(int underRep,
486 int onlyRep, int underConstruction) {
487 if (isDecommissionInProgress() == false) {
488 return;
489 }
490 underReplicatedBlocks = underRep;
491 decommissionOnlyReplicas = onlyRep;
492 underReplicatedInOpenFiles = underConstruction;
493 }
494
495 /** @return the number of under-replicated blocks */
496 public synchronized int getUnderReplicatedBlocks() {
497 if (isDecommissionInProgress() == false) {
498 return 0;
499 }
500 return underReplicatedBlocks;
501 }
502 /** @return the number of decommission-only replicas */
503 public synchronized int getDecommissionOnlyReplicas() {
504 if (isDecommissionInProgress() == false) {
505 return 0;
506 }
507 return decommissionOnlyReplicas;
508 }
509 /** @return the number of under-replicated blocks in open files */
510 public synchronized int getUnderReplicatedInOpenFiles() {
511 if (isDecommissionInProgress() == false) {
512 return 0;
513 }
514 return underReplicatedInOpenFiles;
515 }
516 /** Set start time */
517 public synchronized void setStartTime(long time) {
518 startTime = time;
519 }
520 /** @return start time */
521 public synchronized long getStartTime() {
522 if (isDecommissionInProgress() == false) {
523 return 0;
524 }
525 return startTime;
526 }
527 } // End of class DecommissioningStatus
528
529 /**
530 * Set the flag to indicate if this datanode is disallowed from communicating
531 * with the namenode.
532 */
533 public void setDisallowed(boolean flag) {
534 disallowed = flag;
535 }
536 /** Is the datanode disallowed from communicating with the namenode? */
537 public boolean isDisallowed() {
538 return disallowed;
539 }
540
541 /**
542 * @return number of failed volumes in the datanode.
543 */
544 public int getVolumeFailures() {
545 return volumeFailures;
546 }
547
548 /**
549 * @param nodeReg DatanodeID to update registration for.
550 */
551 @Override
552 public void updateRegInfo(DatanodeID nodeReg) {
553 super.updateRegInfo(nodeReg);
554 firstBlockReport = true; // must re-process IBR after re-registration
555 }
556
557 /**
558 * @return balancer bandwidth in bytes per second for this datanode
559 */
560 public long getBalancerBandwidth() {
561 return this.bandwidth;
562 }
563
564 /**
565 * @param bandwidth balancer bandwidth in bytes per second for this datanode
566 */
567 public void setBalancerBandwidth(long bandwidth) {
568 this.bandwidth = bandwidth;
569 }
570
571 public boolean areBlockContentsStale() {
572 return blockContentsStale;
573 }
574
575 public void markStaleAfterFailover() {
576 heartbeatedSinceFailover = false;
577 blockContentsStale = true;
578 }
579
580 public void receivedBlockReport() {
581 if (heartbeatedSinceFailover) {
582 blockContentsStale = false;
583 }
584 firstBlockReport = false;
585 }
586
587 boolean isFirstBlockReport() {
588 return firstBlockReport;
589 }
590
591 @Override
592 public String dumpDatanode() {
593 StringBuilder sb = new StringBuilder(super.dumpDatanode());
594 int repl = replicateBlocks.size();
595 if (repl > 0) {
596 sb.append(" ").append(repl).append(" blocks to be replicated;");
597 }
598 int inval = invalidateBlocks.size();
599 if (inval > 0) {
600 sb.append(" ").append(inval).append(" blocks to be invalidated;");
601 }
602 int recover = recoverBlocks.size();
603 if (recover > 0) {
604 sb.append(" ").append(recover).append(" blocks to be recovered;");
605 }
606 return sb.toString();
607 }
608 }