001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    package org.apache.hadoop.hdfs.server.blockmanagement;
019    
020    import java.util.ArrayList;
021    import java.util.Collection;
022    import java.util.HashMap;
023    import java.util.List;
024    import java.util.Map;
025    import java.util.Set;
026    
027    import org.apache.hadoop.conf.Configuration;
028    import org.apache.hadoop.hdfs.DFSUtil;
029    import org.apache.hadoop.hdfs.StorageType;
030    import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
031    import org.apache.hadoop.hdfs.server.namenode.FSClusterStats;
032    import org.apache.hadoop.net.NetworkTopology;
033    import org.apache.hadoop.net.NetworkTopologyWithNodeGroup;
034    import org.apache.hadoop.net.Node;
035    import org.apache.hadoop.net.NodeBase;
036    
037    /** The class is responsible for choosing the desired number of targets
038     * for placing block replicas on environment with node-group layer.
039     * The replica placement strategy is adjusted to:
040     * If the writer is on a datanode, the 1st replica is placed on the local 
041     *     node (or local node-group), otherwise a random datanode. 
042     * The 2nd replica is placed on a datanode that is on a different rack with 1st
043     *     replica node. 
044     * The 3rd replica is placed on a datanode which is on a different node-group
045     *     but the same rack as the second replica node.
046     */
047    public class BlockPlacementPolicyWithNodeGroup extends BlockPlacementPolicyDefault {
048    
049      protected BlockPlacementPolicyWithNodeGroup(Configuration conf,  FSClusterStats stats,
050          NetworkTopology clusterMap) {
051        initialize(conf, stats, clusterMap);
052      }
053    
054      protected BlockPlacementPolicyWithNodeGroup() {
055      }
056    
057      public void initialize(Configuration conf,  FSClusterStats stats,
058              NetworkTopology clusterMap) {
059        super.initialize(conf, stats, clusterMap);
060      }
061    
062      /** choose local node of localMachine as the target.
063       * if localMachine is not available, choose a node on the same nodegroup or 
064       * rack instead.
065       * @return the chosen node
066       */
067      @Override
068      protected DatanodeStorageInfo chooseLocalStorage(Node localMachine,
069          Set<Node> excludedNodes, long blocksize, int maxNodesPerRack,
070          List<DatanodeStorageInfo> results, boolean avoidStaleNodes,
071          StorageType storageType) throws NotEnoughReplicasException {
072        // if no local machine, randomly choose one node
073        if (localMachine == null)
074          return chooseRandom(NodeBase.ROOT, excludedNodes, 
075              blocksize, maxNodesPerRack, results, avoidStaleNodes, storageType);
076    
077        // otherwise try local machine first
078        if (localMachine instanceof DatanodeDescriptor) {
079          DatanodeDescriptor localDataNode = (DatanodeDescriptor)localMachine;
080          if (excludedNodes.add(localMachine)) { // was not in the excluded list
081            for(DatanodeStorageInfo localStorage : DFSUtil.shuffle(
082                localDataNode.getStorageInfos())) {
083              if (addIfIsGoodTarget(localStorage, excludedNodes, blocksize,
084                  maxNodesPerRack, false, results, avoidStaleNodes, storageType) >= 0) {
085                return localStorage;
086              }
087            }
088          }
089        }
090    
091        // try a node on local node group
092        DatanodeStorageInfo chosenStorage = chooseLocalNodeGroup(
093            (NetworkTopologyWithNodeGroup)clusterMap, localMachine, excludedNodes, 
094            blocksize, maxNodesPerRack, results, avoidStaleNodes, storageType);
095        if (chosenStorage != null) {
096          return chosenStorage;
097        }
098        // try a node on local rack
099        return chooseLocalRack(localMachine, excludedNodes, 
100            blocksize, maxNodesPerRack, results, avoidStaleNodes, storageType);
101      }
102    
103      /** @return the node of the second replica */
104      private static DatanodeDescriptor secondNode(Node localMachine,
105          List<DatanodeStorageInfo> results) {
106        // find the second replica
107        for(DatanodeStorageInfo nextStorage : results) {
108          DatanodeDescriptor nextNode = nextStorage.getDatanodeDescriptor();
109          if (nextNode != localMachine) {
110            return nextNode;
111          }
112        }
113        return null;
114      }
115    
116      @Override
117      protected DatanodeStorageInfo chooseLocalRack(Node localMachine,
118          Set<Node> excludedNodes, long blocksize, int maxNodesPerRack,
119          List<DatanodeStorageInfo> results, boolean avoidStaleNodes,
120          StorageType storageType) throws NotEnoughReplicasException {
121        // no local machine, so choose a random machine
122        if (localMachine == null) {
123          return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
124              maxNodesPerRack, results, avoidStaleNodes, storageType);
125        }
126    
127        // choose one from the local rack, but off-nodegroup
128        try {
129          final String scope = NetworkTopology.getFirstHalf(localMachine.getNetworkLocation());
130          return chooseRandom(scope, excludedNodes, blocksize, maxNodesPerRack,
131              results, avoidStaleNodes, storageType);
132        } catch (NotEnoughReplicasException e1) {
133          // find the second replica
134          final DatanodeDescriptor newLocal = secondNode(localMachine, results);
135          if (newLocal != null) {
136            try {
137              return chooseRandom(
138                  clusterMap.getRack(newLocal.getNetworkLocation()), excludedNodes,
139                  blocksize, maxNodesPerRack, results, avoidStaleNodes, storageType);
140            } catch(NotEnoughReplicasException e2) {
141              //otherwise randomly choose one from the network
142              return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
143                  maxNodesPerRack, results, avoidStaleNodes, storageType);
144            }
145          } else {
146            //otherwise randomly choose one from the network
147            return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
148                maxNodesPerRack, results, avoidStaleNodes, storageType);
149          }
150        }
151      }
152    
153      /**
154       * {@inheritDoc}
155       */
156      @Override
157      protected void chooseRemoteRack(int numOfReplicas,
158          DatanodeDescriptor localMachine, Set<Node> excludedNodes,
159          long blocksize, int maxReplicasPerRack, List<DatanodeStorageInfo> results,
160          boolean avoidStaleNodes, StorageType storageType)
161              throws NotEnoughReplicasException {
162        int oldNumOfReplicas = results.size();
163    
164        final String rackLocation = NetworkTopology.getFirstHalf(
165            localMachine.getNetworkLocation());
166        try {
167          // randomly choose from remote racks
168          chooseRandom(numOfReplicas, "~" + rackLocation, excludedNodes, blocksize,
169              maxReplicasPerRack, results, avoidStaleNodes, storageType);
170        } catch (NotEnoughReplicasException e) {
171          // fall back to the local rack
172          chooseRandom(numOfReplicas - (results.size() - oldNumOfReplicas),
173              rackLocation, excludedNodes, blocksize,
174              maxReplicasPerRack, results, avoidStaleNodes, storageType);
175        }
176      }
177    
178      /* choose one node from the nodegroup that <i>localMachine</i> is on.
179       * if no such node is available, choose one node from the nodegroup where
180       * a second replica is on.
181       * if still no such node is available, choose a random node in the cluster.
182       * @return the chosen node
183       */
184      private DatanodeStorageInfo chooseLocalNodeGroup(
185          NetworkTopologyWithNodeGroup clusterMap, Node localMachine,
186          Set<Node> excludedNodes, long blocksize, int maxNodesPerRack,
187          List<DatanodeStorageInfo> results, boolean avoidStaleNodes,
188          StorageType storageType) throws NotEnoughReplicasException {
189        // no local machine, so choose a random machine
190        if (localMachine == null) {
191          return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
192              maxNodesPerRack, results, avoidStaleNodes, storageType);
193        }
194    
195        // choose one from the local node group
196        try {
197          return chooseRandom(
198              clusterMap.getNodeGroup(localMachine.getNetworkLocation()),
199              excludedNodes, blocksize, maxNodesPerRack, results, avoidStaleNodes,
200              storageType);
201        } catch (NotEnoughReplicasException e1) {
202          final DatanodeDescriptor newLocal = secondNode(localMachine, results);
203          if (newLocal != null) {
204            try {
205              return chooseRandom(
206                  clusterMap.getNodeGroup(newLocal.getNetworkLocation()),
207                  excludedNodes, blocksize, maxNodesPerRack, results,
208                  avoidStaleNodes, storageType);
209            } catch(NotEnoughReplicasException e2) {
210              //otherwise randomly choose one from the network
211              return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
212                  maxNodesPerRack, results, avoidStaleNodes, storageType);
213            }
214          } else {
215            //otherwise randomly choose one from the network
216            return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
217                maxNodesPerRack, results, avoidStaleNodes, storageType);
218          }
219        }
220      }
221    
222      @Override
223      protected String getRack(final DatanodeInfo cur) {
224        String nodeGroupString = cur.getNetworkLocation();
225        return NetworkTopology.getFirstHalf(nodeGroupString);
226      }
227      
228      /**
229       * Find other nodes in the same nodegroup of <i>localMachine</i> and add them
230       * into <i>excludeNodes</i> as replica should not be duplicated for nodes 
231       * within the same nodegroup
232       * @return number of new excluded nodes
233       */
234      @Override
235      protected int addToExcludedNodes(DatanodeDescriptor chosenNode,
236          Set<Node> excludedNodes) {
237        int countOfExcludedNodes = 0;
238        String nodeGroupScope = chosenNode.getNetworkLocation();
239        List<Node> leafNodes = clusterMap.getLeaves(nodeGroupScope);
240        for (Node leafNode : leafNodes) {
241          if (excludedNodes.add(leafNode)) {
242            // not a existing node in excludedNodes
243            countOfExcludedNodes++;
244          }
245        }
246        return countOfExcludedNodes;
247      }
248    
249      /**
250       * Pick up replica node set for deleting replica as over-replicated. 
251       * First set contains replica nodes on rack with more than one
252       * replica while second set contains remaining replica nodes.
253       * If first is not empty, divide first set into two subsets:
254       *   moreThanOne contains nodes on nodegroup with more than one replica
255       *   exactlyOne contains the remaining nodes in first set
256       * then pickup priSet if not empty.
257       * If first is empty, then pick second.
258       */
259      @Override
260      public Collection<DatanodeDescriptor> pickupReplicaSet(
261          Collection<DatanodeDescriptor> first,
262          Collection<DatanodeDescriptor> second) {
263        // If no replica within same rack, return directly.
264        if (first.isEmpty()) {
265          return second;
266        }
267        // Split data nodes in the first set into two sets, 
268        // moreThanOne contains nodes on nodegroup with more than one replica
269        // exactlyOne contains the remaining nodes
270        Map<String, List<DatanodeDescriptor>> nodeGroupMap = 
271            new HashMap<String, List<DatanodeDescriptor>>();
272        
273        for(DatanodeDescriptor node : first) {
274          final String nodeGroupName = 
275              NetworkTopology.getLastHalf(node.getNetworkLocation());
276          List<DatanodeDescriptor> datanodeList = 
277              nodeGroupMap.get(nodeGroupName);
278          if (datanodeList == null) {
279            datanodeList = new ArrayList<DatanodeDescriptor>();
280            nodeGroupMap.put(nodeGroupName, datanodeList);
281          }
282          datanodeList.add(node);
283        }
284        
285        final List<DatanodeDescriptor> moreThanOne = new ArrayList<DatanodeDescriptor>();
286        final List<DatanodeDescriptor> exactlyOne = new ArrayList<DatanodeDescriptor>();
287        // split nodes into two sets
288        for(List<DatanodeDescriptor> datanodeList : nodeGroupMap.values()) {
289          if (datanodeList.size() == 1 ) {
290            // exactlyOne contains nodes on nodegroup with exactly one replica
291            exactlyOne.add(datanodeList.get(0));
292          } else {
293            // moreThanOne contains nodes on nodegroup with more than one replica
294            moreThanOne.addAll(datanodeList);
295          }
296        }
297        
298        return moreThanOne.isEmpty()? exactlyOne : moreThanOne;
299      }
300      
301    }