001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    package org.apache.hadoop.hdfs.server.blockmanagement;
019    
020    import java.util.ArrayList;
021    import java.util.Collection;
022    import java.util.HashMap;
023    import java.util.Iterator;
024    import java.util.List;
025    import java.util.Map;
026    
027    import org.apache.hadoop.conf.Configuration;
028    import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
029    import org.apache.hadoop.hdfs.server.namenode.FSClusterStats;
030    import org.apache.hadoop.net.NetworkTopology;
031    import org.apache.hadoop.net.NetworkTopologyWithNodeGroup;
032    import org.apache.hadoop.net.Node;
033    import org.apache.hadoop.net.NodeBase;
034    
035    /** The class is responsible for choosing the desired number of targets
036     * for placing block replicas on environment with node-group layer.
037     * The replica placement strategy is adjusted to:
038     * If the writer is on a datanode, the 1st replica is placed on the local 
039     *     node (or local node-group), otherwise a random datanode. 
040     * The 2nd replica is placed on a datanode that is on a different rack with 1st
041     *     replica node. 
042     * The 3rd replica is placed on a datanode which is on a different node-group
043     *     but the same rack as the second replica node.
044     */
045    public class BlockPlacementPolicyWithNodeGroup extends BlockPlacementPolicyDefault {
046    
047      BlockPlacementPolicyWithNodeGroup(Configuration conf,  FSClusterStats stats,
048          NetworkTopology clusterMap) {
049        initialize(conf, stats, clusterMap);
050      }
051    
052      BlockPlacementPolicyWithNodeGroup() {
053      }
054    
055      public void initialize(Configuration conf,  FSClusterStats stats,
056              NetworkTopology clusterMap) {
057        super.initialize(conf, stats, clusterMap);
058      }
059    
060      /** choose local node of localMachine as the target.
061       * if localMachine is not available, choose a node on the same nodegroup or 
062       * rack instead.
063       * @return the chosen node
064       */
065      @Override
066      protected DatanodeDescriptor chooseLocalNode(
067          DatanodeDescriptor localMachine,
068          HashMap<Node, Node> excludedNodes,
069          long blocksize,
070          int maxNodesPerRack,
071          List<DatanodeDescriptor> results,
072          boolean avoidStaleNodes)
073            throws NotEnoughReplicasException {
074        // if no local machine, randomly choose one node
075        if (localMachine == null)
076          return chooseRandom(NodeBase.ROOT, excludedNodes, 
077              blocksize, maxNodesPerRack, results, avoidStaleNodes);
078    
079        // otherwise try local machine first
080        Node oldNode = excludedNodes.put(localMachine, localMachine);
081        if (oldNode == null) { // was not in the excluded list
082          if (isGoodTarget(localMachine, blocksize,
083              maxNodesPerRack, false, results, avoidStaleNodes)) {
084            results.add(localMachine);
085            // Nodes under same nodegroup should be excluded.
086            addNodeGroupToExcludedNodes(excludedNodes,
087                localMachine.getNetworkLocation());
088            return localMachine;
089          }
090        } 
091    
092        // try a node on local node group
093        DatanodeDescriptor chosenNode = chooseLocalNodeGroup(
094            (NetworkTopologyWithNodeGroup)clusterMap, localMachine, excludedNodes, 
095            blocksize, maxNodesPerRack, results, avoidStaleNodes);
096        if (chosenNode != null) {
097          return chosenNode;
098        }
099        // try a node on local rack
100        return chooseLocalRack(localMachine, excludedNodes, 
101            blocksize, maxNodesPerRack, results, avoidStaleNodes);
102      }
103    
104      /**
105       * {@inheritDoc}
106       */
107      @Override
108      protected void adjustExcludedNodes(HashMap<Node, Node> excludedNodes,
109          Node chosenNode) {
110        // as node-group aware implementation, it should make sure no two replica
111        // are placing on the same node group.
112        addNodeGroupToExcludedNodes(excludedNodes, chosenNode.getNetworkLocation());
113      }
114      
115      // add all nodes under specific nodegroup to excludedNodes.
116      private void addNodeGroupToExcludedNodes(HashMap<Node, Node> excludedNodes,
117          String nodeGroup) {
118        List<Node> leafNodes = clusterMap.getLeaves(nodeGroup);
119        for (Node node : leafNodes) {
120          excludedNodes.put(node, node);
121        }
122      }
123    
124      /**
125       * {@inheritDoc}
126       */
127      @Override
128      protected DatanodeDescriptor chooseLocalRack(
129                                                 DatanodeDescriptor localMachine,
130                                                 HashMap<Node, Node> excludedNodes,
131                                                 long blocksize,
132                                                 int maxNodesPerRack,
133                                                 List<DatanodeDescriptor> results,
134                                                 boolean avoidStaleNodes)
135        throws NotEnoughReplicasException {
136        // no local machine, so choose a random machine
137        if (localMachine == null) {
138          return chooseRandom(NodeBase.ROOT, excludedNodes, 
139                              blocksize, maxNodesPerRack, results, avoidStaleNodes);
140        }
141    
142        // choose one from the local rack, but off-nodegroup
143        try {
144          return chooseRandom(NetworkTopology.getFirstHalf(
145                                  localMachine.getNetworkLocation()),
146                              excludedNodes, blocksize, 
147                              maxNodesPerRack, results, avoidStaleNodes);
148        } catch (NotEnoughReplicasException e1) {
149          // find the second replica
150          DatanodeDescriptor newLocal=null;
151          for(Iterator<DatanodeDescriptor> iter=results.iterator();
152              iter.hasNext();) {
153            DatanodeDescriptor nextNode = iter.next();
154            if (nextNode != localMachine) {
155              newLocal = nextNode;
156              break;
157            }
158          }
159          if (newLocal != null) {
160            try {
161              return chooseRandom(clusterMap.getRack(newLocal.getNetworkLocation()),
162                                  excludedNodes, blocksize, maxNodesPerRack, results,
163                                  avoidStaleNodes);
164            } catch(NotEnoughReplicasException e2) {
165              //otherwise randomly choose one from the network
166              return chooseRandom(NodeBase.ROOT, excludedNodes,
167                                  blocksize, maxNodesPerRack, results,
168                                  avoidStaleNodes);
169            }
170          } else {
171            //otherwise randomly choose one from the network
172            return chooseRandom(NodeBase.ROOT, excludedNodes,
173                                blocksize, maxNodesPerRack, results,
174                                avoidStaleNodes);
175          }
176        }
177      }
178    
179      /**
180       * {@inheritDoc}
181       */
182      @Override
183      protected void chooseRemoteRack(int numOfReplicas,
184              DatanodeDescriptor localMachine,
185              HashMap<Node, Node> excludedNodes,
186              long blocksize,
187              int maxReplicasPerRack,
188              List<DatanodeDescriptor> results,
189              boolean avoidStaleNodes)
190              throws NotEnoughReplicasException {
191        int oldNumOfReplicas = results.size();
192    
193        final String rackLocation = NetworkTopology.getFirstHalf(
194            localMachine.getNetworkLocation());
195        try {
196          // randomly choose from remote racks
197          chooseRandom(numOfReplicas, "~" + rackLocation, excludedNodes, blocksize,
198              maxReplicasPerRack, results, avoidStaleNodes);
199        } catch (NotEnoughReplicasException e) {
200          // fall back to the local rack
201          chooseRandom(numOfReplicas - (results.size() - oldNumOfReplicas),
202              rackLocation, excludedNodes, blocksize,
203              maxReplicasPerRack, results, avoidStaleNodes);
204        }
205      }
206    
207      /* choose one node from the nodegroup that <i>localMachine</i> is on.
208       * if no such node is available, choose one node from the nodegroup where
209       * a second replica is on.
210       * if still no such node is available, choose a random node in the cluster.
211       * @return the chosen node
212       */
213      private DatanodeDescriptor chooseLocalNodeGroup(NetworkTopologyWithNodeGroup clusterMap,
214          DatanodeDescriptor localMachine, HashMap<Node, Node> excludedNodes, long blocksize, 
215          int maxNodesPerRack, List<DatanodeDescriptor> results, boolean avoidStaleNodes)
216              throws NotEnoughReplicasException {
217        // no local machine, so choose a random machine
218        if (localMachine == null) {
219          return chooseRandom(NodeBase.ROOT, excludedNodes, 
220          blocksize, maxNodesPerRack, results, avoidStaleNodes);
221        }
222    
223        // choose one from the local node group
224        try {
225          return chooseRandom(clusterMap.getNodeGroup(localMachine.getNetworkLocation()),
226          excludedNodes, blocksize, maxNodesPerRack, results, avoidStaleNodes);
227        } catch (NotEnoughReplicasException e1) {
228          // find the second replica
229          DatanodeDescriptor newLocal=null;
230          for(Iterator<DatanodeDescriptor> iter=results.iterator();
231            iter.hasNext();) {
232            DatanodeDescriptor nextNode = iter.next();
233            if (nextNode != localMachine) {
234              newLocal = nextNode;
235              break;
236            }
237          }
238          if (newLocal != null) {
239            try {
240              return chooseRandom(clusterMap.getNodeGroup(newLocal.getNetworkLocation()),
241                excludedNodes, blocksize, maxNodesPerRack, results, avoidStaleNodes);
242            } catch(NotEnoughReplicasException e2) {
243              //otherwise randomly choose one from the network
244              return chooseRandom(NodeBase.ROOT, excludedNodes,
245                  blocksize, maxNodesPerRack, results, avoidStaleNodes);
246            }
247          } else {
248            //otherwise randomly choose one from the network
249            return chooseRandom(NodeBase.ROOT, excludedNodes,
250                blocksize, maxNodesPerRack, results, avoidStaleNodes);
251          }
252        }
253      }
254    
255      @Override
256      protected String getRack(final DatanodeInfo cur) {
257        String nodeGroupString = cur.getNetworkLocation();
258        return NetworkTopology.getFirstHalf(nodeGroupString);
259      }
260      
261      /**
262       * Find other nodes in the same nodegroup of <i>localMachine</i> and add them
263       * into <i>excludeNodes</i> as replica should not be duplicated for nodes 
264       * within the same nodegroup
265       * @return number of new excluded nodes
266       */
267      protected int addToExcludedNodes(DatanodeDescriptor localMachine,
268          HashMap<Node, Node> excludedNodes) {
269        int countOfExcludedNodes = 0;
270        String nodeGroupScope = localMachine.getNetworkLocation();
271        List<Node> leafNodes = clusterMap.getLeaves(nodeGroupScope);
272        for (Node leafNode : leafNodes) {
273          Node node = excludedNodes.put(leafNode, leafNode);
274          if (node == null) {
275            // not a existing node in excludedNodes
276            countOfExcludedNodes++;
277          }
278        }
279        return countOfExcludedNodes;
280      }
281    
282      /**
283       * Pick up replica node set for deleting replica as over-replicated. 
284       * First set contains replica nodes on rack with more than one
285       * replica while second set contains remaining replica nodes.
286       * If first is not empty, divide first set into two subsets:
287       *   moreThanOne contains nodes on nodegroup with more than one replica
288       *   exactlyOne contains the remaining nodes in first set
289       * then pickup priSet if not empty.
290       * If first is empty, then pick second.
291       */
292      @Override
293      public Iterator<DatanodeDescriptor> pickupReplicaSet(
294          Collection<DatanodeDescriptor> first,
295          Collection<DatanodeDescriptor> second) {
296        // If no replica within same rack, return directly.
297        if (first.isEmpty()) {
298          return second.iterator();
299        }
300        // Split data nodes in the first set into two sets, 
301        // moreThanOne contains nodes on nodegroup with more than one replica
302        // exactlyOne contains the remaining nodes
303        Map<String, List<DatanodeDescriptor>> nodeGroupMap = 
304            new HashMap<String, List<DatanodeDescriptor>>();
305        
306        for(DatanodeDescriptor node : first) {
307          final String nodeGroupName = 
308              NetworkTopology.getLastHalf(node.getNetworkLocation());
309          List<DatanodeDescriptor> datanodeList = 
310              nodeGroupMap.get(nodeGroupName);
311          if (datanodeList == null) {
312            datanodeList = new ArrayList<DatanodeDescriptor>();
313            nodeGroupMap.put(nodeGroupName, datanodeList);
314          }
315          datanodeList.add(node);
316        }
317        
318        final List<DatanodeDescriptor> moreThanOne = new ArrayList<DatanodeDescriptor>();
319        final List<DatanodeDescriptor> exactlyOne = new ArrayList<DatanodeDescriptor>();
320        // split nodes into two sets
321        for(List<DatanodeDescriptor> datanodeList : nodeGroupMap.values()) {
322          if (datanodeList.size() == 1 ) {
323            // exactlyOne contains nodes on nodegroup with exactly one replica
324            exactlyOne.add(datanodeList.get(0));
325          } else {
326            // moreThanOne contains nodes on nodegroup with more than one replica
327            moreThanOne.addAll(datanodeList);
328          }
329        }
330        
331        Iterator<DatanodeDescriptor> iter =
332            moreThanOne.isEmpty() ? exactlyOne.iterator() : moreThanOne.iterator();
333        return iter;
334      }
335      
336    }