001 /**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License. You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018 package org.apache.hadoop.hdfs.server.blockmanagement;
019
020 import java.util.ArrayList;
021 import java.util.Collection;
022 import java.util.HashMap;
023 import java.util.List;
024 import java.util.Map;
025 import java.util.Set;
026
027 import org.apache.hadoop.conf.Configuration;
028 import org.apache.hadoop.hdfs.DFSUtil;
029 import org.apache.hadoop.hdfs.StorageType;
030 import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
031 import org.apache.hadoop.hdfs.server.namenode.FSClusterStats;
032 import org.apache.hadoop.net.NetworkTopology;
033 import org.apache.hadoop.net.NetworkTopologyWithNodeGroup;
034 import org.apache.hadoop.net.Node;
035 import org.apache.hadoop.net.NodeBase;
036
037 /** The class is responsible for choosing the desired number of targets
038 * for placing block replicas on environment with node-group layer.
039 * The replica placement strategy is adjusted to:
040 * If the writer is on a datanode, the 1st replica is placed on the local
041 * node (or local node-group), otherwise a random datanode.
042 * The 2nd replica is placed on a datanode that is on a different rack with 1st
043 * replica node.
044 * The 3rd replica is placed on a datanode which is on a different node-group
045 * but the same rack as the second replica node.
046 */
047 public class BlockPlacementPolicyWithNodeGroup extends BlockPlacementPolicyDefault {
048
049 protected BlockPlacementPolicyWithNodeGroup(Configuration conf, FSClusterStats stats,
050 NetworkTopology clusterMap) {
051 initialize(conf, stats, clusterMap);
052 }
053
054 protected BlockPlacementPolicyWithNodeGroup() {
055 }
056
057 public void initialize(Configuration conf, FSClusterStats stats,
058 NetworkTopology clusterMap) {
059 super.initialize(conf, stats, clusterMap);
060 }
061
062 /** choose local node of localMachine as the target.
063 * if localMachine is not available, choose a node on the same nodegroup or
064 * rack instead.
065 * @return the chosen node
066 */
067 @Override
068 protected DatanodeStorageInfo chooseLocalStorage(Node localMachine,
069 Set<Node> excludedNodes, long blocksize, int maxNodesPerRack,
070 List<DatanodeStorageInfo> results, boolean avoidStaleNodes,
071 StorageType storageType) throws NotEnoughReplicasException {
072 // if no local machine, randomly choose one node
073 if (localMachine == null)
074 return chooseRandom(NodeBase.ROOT, excludedNodes,
075 blocksize, maxNodesPerRack, results, avoidStaleNodes, storageType);
076
077 // otherwise try local machine first
078 if (localMachine instanceof DatanodeDescriptor) {
079 DatanodeDescriptor localDataNode = (DatanodeDescriptor)localMachine;
080 if (excludedNodes.add(localMachine)) { // was not in the excluded list
081 for(DatanodeStorageInfo localStorage : DFSUtil.shuffle(
082 localDataNode.getStorageInfos())) {
083 if (addIfIsGoodTarget(localStorage, excludedNodes, blocksize,
084 maxNodesPerRack, false, results, avoidStaleNodes, storageType) >= 0) {
085 return localStorage;
086 }
087 }
088 }
089 }
090
091 // try a node on local node group
092 DatanodeStorageInfo chosenStorage = chooseLocalNodeGroup(
093 (NetworkTopologyWithNodeGroup)clusterMap, localMachine, excludedNodes,
094 blocksize, maxNodesPerRack, results, avoidStaleNodes, storageType);
095 if (chosenStorage != null) {
096 return chosenStorage;
097 }
098 // try a node on local rack
099 return chooseLocalRack(localMachine, excludedNodes,
100 blocksize, maxNodesPerRack, results, avoidStaleNodes, storageType);
101 }
102
103 /** @return the node of the second replica */
104 private static DatanodeDescriptor secondNode(Node localMachine,
105 List<DatanodeStorageInfo> results) {
106 // find the second replica
107 for(DatanodeStorageInfo nextStorage : results) {
108 DatanodeDescriptor nextNode = nextStorage.getDatanodeDescriptor();
109 if (nextNode != localMachine) {
110 return nextNode;
111 }
112 }
113 return null;
114 }
115
116 @Override
117 protected DatanodeStorageInfo chooseLocalRack(Node localMachine,
118 Set<Node> excludedNodes, long blocksize, int maxNodesPerRack,
119 List<DatanodeStorageInfo> results, boolean avoidStaleNodes,
120 StorageType storageType) throws NotEnoughReplicasException {
121 // no local machine, so choose a random machine
122 if (localMachine == null) {
123 return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
124 maxNodesPerRack, results, avoidStaleNodes, storageType);
125 }
126
127 // choose one from the local rack, but off-nodegroup
128 try {
129 final String scope = NetworkTopology.getFirstHalf(localMachine.getNetworkLocation());
130 return chooseRandom(scope, excludedNodes, blocksize, maxNodesPerRack,
131 results, avoidStaleNodes, storageType);
132 } catch (NotEnoughReplicasException e1) {
133 // find the second replica
134 final DatanodeDescriptor newLocal = secondNode(localMachine, results);
135 if (newLocal != null) {
136 try {
137 return chooseRandom(
138 clusterMap.getRack(newLocal.getNetworkLocation()), excludedNodes,
139 blocksize, maxNodesPerRack, results, avoidStaleNodes, storageType);
140 } catch(NotEnoughReplicasException e2) {
141 //otherwise randomly choose one from the network
142 return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
143 maxNodesPerRack, results, avoidStaleNodes, storageType);
144 }
145 } else {
146 //otherwise randomly choose one from the network
147 return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
148 maxNodesPerRack, results, avoidStaleNodes, storageType);
149 }
150 }
151 }
152
153 /**
154 * {@inheritDoc}
155 */
156 @Override
157 protected void chooseRemoteRack(int numOfReplicas,
158 DatanodeDescriptor localMachine, Set<Node> excludedNodes,
159 long blocksize, int maxReplicasPerRack, List<DatanodeStorageInfo> results,
160 boolean avoidStaleNodes, StorageType storageType)
161 throws NotEnoughReplicasException {
162 int oldNumOfReplicas = results.size();
163
164 final String rackLocation = NetworkTopology.getFirstHalf(
165 localMachine.getNetworkLocation());
166 try {
167 // randomly choose from remote racks
168 chooseRandom(numOfReplicas, "~" + rackLocation, excludedNodes, blocksize,
169 maxReplicasPerRack, results, avoidStaleNodes, storageType);
170 } catch (NotEnoughReplicasException e) {
171 // fall back to the local rack
172 chooseRandom(numOfReplicas - (results.size() - oldNumOfReplicas),
173 rackLocation, excludedNodes, blocksize,
174 maxReplicasPerRack, results, avoidStaleNodes, storageType);
175 }
176 }
177
178 /* choose one node from the nodegroup that <i>localMachine</i> is on.
179 * if no such node is available, choose one node from the nodegroup where
180 * a second replica is on.
181 * if still no such node is available, choose a random node in the cluster.
182 * @return the chosen node
183 */
184 private DatanodeStorageInfo chooseLocalNodeGroup(
185 NetworkTopologyWithNodeGroup clusterMap, Node localMachine,
186 Set<Node> excludedNodes, long blocksize, int maxNodesPerRack,
187 List<DatanodeStorageInfo> results, boolean avoidStaleNodes,
188 StorageType storageType) throws NotEnoughReplicasException {
189 // no local machine, so choose a random machine
190 if (localMachine == null) {
191 return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
192 maxNodesPerRack, results, avoidStaleNodes, storageType);
193 }
194
195 // choose one from the local node group
196 try {
197 return chooseRandom(
198 clusterMap.getNodeGroup(localMachine.getNetworkLocation()),
199 excludedNodes, blocksize, maxNodesPerRack, results, avoidStaleNodes,
200 storageType);
201 } catch (NotEnoughReplicasException e1) {
202 final DatanodeDescriptor newLocal = secondNode(localMachine, results);
203 if (newLocal != null) {
204 try {
205 return chooseRandom(
206 clusterMap.getNodeGroup(newLocal.getNetworkLocation()),
207 excludedNodes, blocksize, maxNodesPerRack, results,
208 avoidStaleNodes, storageType);
209 } catch(NotEnoughReplicasException e2) {
210 //otherwise randomly choose one from the network
211 return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
212 maxNodesPerRack, results, avoidStaleNodes, storageType);
213 }
214 } else {
215 //otherwise randomly choose one from the network
216 return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
217 maxNodesPerRack, results, avoidStaleNodes, storageType);
218 }
219 }
220 }
221
222 @Override
223 protected String getRack(final DatanodeInfo cur) {
224 String nodeGroupString = cur.getNetworkLocation();
225 return NetworkTopology.getFirstHalf(nodeGroupString);
226 }
227
228 /**
229 * Find other nodes in the same nodegroup of <i>localMachine</i> and add them
230 * into <i>excludeNodes</i> as replica should not be duplicated for nodes
231 * within the same nodegroup
232 * @return number of new excluded nodes
233 */
234 @Override
235 protected int addToExcludedNodes(DatanodeDescriptor chosenNode,
236 Set<Node> excludedNodes) {
237 int countOfExcludedNodes = 0;
238 String nodeGroupScope = chosenNode.getNetworkLocation();
239 List<Node> leafNodes = clusterMap.getLeaves(nodeGroupScope);
240 for (Node leafNode : leafNodes) {
241 if (excludedNodes.add(leafNode)) {
242 // not a existing node in excludedNodes
243 countOfExcludedNodes++;
244 }
245 }
246 return countOfExcludedNodes;
247 }
248
249 /**
250 * Pick up replica node set for deleting replica as over-replicated.
251 * First set contains replica nodes on rack with more than one
252 * replica while second set contains remaining replica nodes.
253 * If first is not empty, divide first set into two subsets:
254 * moreThanOne contains nodes on nodegroup with more than one replica
255 * exactlyOne contains the remaining nodes in first set
256 * then pickup priSet if not empty.
257 * If first is empty, then pick second.
258 */
259 @Override
260 public Collection<DatanodeDescriptor> pickupReplicaSet(
261 Collection<DatanodeDescriptor> first,
262 Collection<DatanodeDescriptor> second) {
263 // If no replica within same rack, return directly.
264 if (first.isEmpty()) {
265 return second;
266 }
267 // Split data nodes in the first set into two sets,
268 // moreThanOne contains nodes on nodegroup with more than one replica
269 // exactlyOne contains the remaining nodes
270 Map<String, List<DatanodeDescriptor>> nodeGroupMap =
271 new HashMap<String, List<DatanodeDescriptor>>();
272
273 for(DatanodeDescriptor node : first) {
274 final String nodeGroupName =
275 NetworkTopology.getLastHalf(node.getNetworkLocation());
276 List<DatanodeDescriptor> datanodeList =
277 nodeGroupMap.get(nodeGroupName);
278 if (datanodeList == null) {
279 datanodeList = new ArrayList<DatanodeDescriptor>();
280 nodeGroupMap.put(nodeGroupName, datanodeList);
281 }
282 datanodeList.add(node);
283 }
284
285 final List<DatanodeDescriptor> moreThanOne = new ArrayList<DatanodeDescriptor>();
286 final List<DatanodeDescriptor> exactlyOne = new ArrayList<DatanodeDescriptor>();
287 // split nodes into two sets
288 for(List<DatanodeDescriptor> datanodeList : nodeGroupMap.values()) {
289 if (datanodeList.size() == 1 ) {
290 // exactlyOne contains nodes on nodegroup with exactly one replica
291 exactlyOne.add(datanodeList.get(0));
292 } else {
293 // moreThanOne contains nodes on nodegroup with more than one replica
294 moreThanOne.addAll(datanodeList);
295 }
296 }
297
298 return moreThanOne.isEmpty()? exactlyOne : moreThanOne;
299 }
300
301 }