001 /**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License. You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018 package org.apache.hadoop.hdfs.server.blockmanagement;
019
020 import java.util.ArrayList;
021 import java.util.Collection;
022 import java.util.HashMap;
023 import java.util.Iterator;
024 import java.util.List;
025 import java.util.Map;
026
027 import org.apache.hadoop.conf.Configuration;
028 import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
029 import org.apache.hadoop.hdfs.server.namenode.FSClusterStats;
030 import org.apache.hadoop.net.NetworkTopology;
031 import org.apache.hadoop.net.NetworkTopologyWithNodeGroup;
032 import org.apache.hadoop.net.Node;
033 import org.apache.hadoop.net.NodeBase;
034
035 /** The class is responsible for choosing the desired number of targets
036 * for placing block replicas on environment with node-group layer.
037 * The replica placement strategy is adjusted to:
038 * If the writer is on a datanode, the 1st replica is placed on the local
039 * node (or local node-group), otherwise a random datanode.
040 * The 2nd replica is placed on a datanode that is on a different rack with 1st
041 * replica node.
042 * The 3rd replica is placed on a datanode which is on a different node-group
043 * but the same rack as the second replica node.
044 */
045 public class BlockPlacementPolicyWithNodeGroup extends BlockPlacementPolicyDefault {
046
047 BlockPlacementPolicyWithNodeGroup(Configuration conf, FSClusterStats stats,
048 NetworkTopology clusterMap) {
049 initialize(conf, stats, clusterMap);
050 }
051
052 BlockPlacementPolicyWithNodeGroup() {
053 }
054
055 public void initialize(Configuration conf, FSClusterStats stats,
056 NetworkTopology clusterMap) {
057 super.initialize(conf, stats, clusterMap);
058 }
059
060 /** choose local node of localMachine as the target.
061 * if localMachine is not available, choose a node on the same nodegroup or
062 * rack instead.
063 * @return the chosen node
064 */
065 @Override
066 protected DatanodeDescriptor chooseLocalNode(
067 DatanodeDescriptor localMachine,
068 HashMap<Node, Node> excludedNodes,
069 long blocksize,
070 int maxNodesPerRack,
071 List<DatanodeDescriptor> results,
072 boolean avoidStaleNodes)
073 throws NotEnoughReplicasException {
074 // if no local machine, randomly choose one node
075 if (localMachine == null)
076 return chooseRandom(NodeBase.ROOT, excludedNodes,
077 blocksize, maxNodesPerRack, results, avoidStaleNodes);
078
079 // otherwise try local machine first
080 Node oldNode = excludedNodes.put(localMachine, localMachine);
081 if (oldNode == null) { // was not in the excluded list
082 if (isGoodTarget(localMachine, blocksize,
083 maxNodesPerRack, false, results, avoidStaleNodes)) {
084 results.add(localMachine);
085 // Nodes under same nodegroup should be excluded.
086 addNodeGroupToExcludedNodes(excludedNodes,
087 localMachine.getNetworkLocation());
088 return localMachine;
089 }
090 }
091
092 // try a node on local node group
093 DatanodeDescriptor chosenNode = chooseLocalNodeGroup(
094 (NetworkTopologyWithNodeGroup)clusterMap, localMachine, excludedNodes,
095 blocksize, maxNodesPerRack, results, avoidStaleNodes);
096 if (chosenNode != null) {
097 return chosenNode;
098 }
099 // try a node on local rack
100 return chooseLocalRack(localMachine, excludedNodes,
101 blocksize, maxNodesPerRack, results, avoidStaleNodes);
102 }
103
104 /**
105 * {@inheritDoc}
106 */
107 @Override
108 protected void adjustExcludedNodes(HashMap<Node, Node> excludedNodes,
109 Node chosenNode) {
110 // as node-group aware implementation, it should make sure no two replica
111 // are placing on the same node group.
112 addNodeGroupToExcludedNodes(excludedNodes, chosenNode.getNetworkLocation());
113 }
114
115 // add all nodes under specific nodegroup to excludedNodes.
116 private void addNodeGroupToExcludedNodes(HashMap<Node, Node> excludedNodes,
117 String nodeGroup) {
118 List<Node> leafNodes = clusterMap.getLeaves(nodeGroup);
119 for (Node node : leafNodes) {
120 excludedNodes.put(node, node);
121 }
122 }
123
124 /**
125 * {@inheritDoc}
126 */
127 @Override
128 protected DatanodeDescriptor chooseLocalRack(
129 DatanodeDescriptor localMachine,
130 HashMap<Node, Node> excludedNodes,
131 long blocksize,
132 int maxNodesPerRack,
133 List<DatanodeDescriptor> results,
134 boolean avoidStaleNodes)
135 throws NotEnoughReplicasException {
136 // no local machine, so choose a random machine
137 if (localMachine == null) {
138 return chooseRandom(NodeBase.ROOT, excludedNodes,
139 blocksize, maxNodesPerRack, results, avoidStaleNodes);
140 }
141
142 // choose one from the local rack, but off-nodegroup
143 try {
144 return chooseRandom(NetworkTopology.getFirstHalf(
145 localMachine.getNetworkLocation()),
146 excludedNodes, blocksize,
147 maxNodesPerRack, results, avoidStaleNodes);
148 } catch (NotEnoughReplicasException e1) {
149 // find the second replica
150 DatanodeDescriptor newLocal=null;
151 for(Iterator<DatanodeDescriptor> iter=results.iterator();
152 iter.hasNext();) {
153 DatanodeDescriptor nextNode = iter.next();
154 if (nextNode != localMachine) {
155 newLocal = nextNode;
156 break;
157 }
158 }
159 if (newLocal != null) {
160 try {
161 return chooseRandom(clusterMap.getRack(newLocal.getNetworkLocation()),
162 excludedNodes, blocksize, maxNodesPerRack, results,
163 avoidStaleNodes);
164 } catch(NotEnoughReplicasException e2) {
165 //otherwise randomly choose one from the network
166 return chooseRandom(NodeBase.ROOT, excludedNodes,
167 blocksize, maxNodesPerRack, results,
168 avoidStaleNodes);
169 }
170 } else {
171 //otherwise randomly choose one from the network
172 return chooseRandom(NodeBase.ROOT, excludedNodes,
173 blocksize, maxNodesPerRack, results,
174 avoidStaleNodes);
175 }
176 }
177 }
178
179 /**
180 * {@inheritDoc}
181 */
182 @Override
183 protected void chooseRemoteRack(int numOfReplicas,
184 DatanodeDescriptor localMachine,
185 HashMap<Node, Node> excludedNodes,
186 long blocksize,
187 int maxReplicasPerRack,
188 List<DatanodeDescriptor> results,
189 boolean avoidStaleNodes)
190 throws NotEnoughReplicasException {
191 int oldNumOfReplicas = results.size();
192
193 final String rackLocation = NetworkTopology.getFirstHalf(
194 localMachine.getNetworkLocation());
195 try {
196 // randomly choose from remote racks
197 chooseRandom(numOfReplicas, "~" + rackLocation, excludedNodes, blocksize,
198 maxReplicasPerRack, results, avoidStaleNodes);
199 } catch (NotEnoughReplicasException e) {
200 // fall back to the local rack
201 chooseRandom(numOfReplicas - (results.size() - oldNumOfReplicas),
202 rackLocation, excludedNodes, blocksize,
203 maxReplicasPerRack, results, avoidStaleNodes);
204 }
205 }
206
207 /* choose one node from the nodegroup that <i>localMachine</i> is on.
208 * if no such node is available, choose one node from the nodegroup where
209 * a second replica is on.
210 * if still no such node is available, choose a random node in the cluster.
211 * @return the chosen node
212 */
213 private DatanodeDescriptor chooseLocalNodeGroup(NetworkTopologyWithNodeGroup clusterMap,
214 DatanodeDescriptor localMachine, HashMap<Node, Node> excludedNodes, long blocksize,
215 int maxNodesPerRack, List<DatanodeDescriptor> results, boolean avoidStaleNodes)
216 throws NotEnoughReplicasException {
217 // no local machine, so choose a random machine
218 if (localMachine == null) {
219 return chooseRandom(NodeBase.ROOT, excludedNodes,
220 blocksize, maxNodesPerRack, results, avoidStaleNodes);
221 }
222
223 // choose one from the local node group
224 try {
225 return chooseRandom(clusterMap.getNodeGroup(localMachine.getNetworkLocation()),
226 excludedNodes, blocksize, maxNodesPerRack, results, avoidStaleNodes);
227 } catch (NotEnoughReplicasException e1) {
228 // find the second replica
229 DatanodeDescriptor newLocal=null;
230 for(Iterator<DatanodeDescriptor> iter=results.iterator();
231 iter.hasNext();) {
232 DatanodeDescriptor nextNode = iter.next();
233 if (nextNode != localMachine) {
234 newLocal = nextNode;
235 break;
236 }
237 }
238 if (newLocal != null) {
239 try {
240 return chooseRandom(clusterMap.getNodeGroup(newLocal.getNetworkLocation()),
241 excludedNodes, blocksize, maxNodesPerRack, results, avoidStaleNodes);
242 } catch(NotEnoughReplicasException e2) {
243 //otherwise randomly choose one from the network
244 return chooseRandom(NodeBase.ROOT, excludedNodes,
245 blocksize, maxNodesPerRack, results, avoidStaleNodes);
246 }
247 } else {
248 //otherwise randomly choose one from the network
249 return chooseRandom(NodeBase.ROOT, excludedNodes,
250 blocksize, maxNodesPerRack, results, avoidStaleNodes);
251 }
252 }
253 }
254
255 @Override
256 protected String getRack(final DatanodeInfo cur) {
257 String nodeGroupString = cur.getNetworkLocation();
258 return NetworkTopology.getFirstHalf(nodeGroupString);
259 }
260
261 /**
262 * Find other nodes in the same nodegroup of <i>localMachine</i> and add them
263 * into <i>excludeNodes</i> as replica should not be duplicated for nodes
264 * within the same nodegroup
265 * @return number of new excluded nodes
266 */
267 protected int addToExcludedNodes(DatanodeDescriptor localMachine,
268 HashMap<Node, Node> excludedNodes) {
269 int countOfExcludedNodes = 0;
270 String nodeGroupScope = localMachine.getNetworkLocation();
271 List<Node> leafNodes = clusterMap.getLeaves(nodeGroupScope);
272 for (Node leafNode : leafNodes) {
273 Node node = excludedNodes.put(leafNode, leafNode);
274 if (node == null) {
275 // not a existing node in excludedNodes
276 countOfExcludedNodes++;
277 }
278 }
279 return countOfExcludedNodes;
280 }
281
282 /**
283 * Pick up replica node set for deleting replica as over-replicated.
284 * First set contains replica nodes on rack with more than one
285 * replica while second set contains remaining replica nodes.
286 * If first is not empty, divide first set into two subsets:
287 * moreThanOne contains nodes on nodegroup with more than one replica
288 * exactlyOne contains the remaining nodes in first set
289 * then pickup priSet if not empty.
290 * If first is empty, then pick second.
291 */
292 @Override
293 public Iterator<DatanodeDescriptor> pickupReplicaSet(
294 Collection<DatanodeDescriptor> first,
295 Collection<DatanodeDescriptor> second) {
296 // If no replica within same rack, return directly.
297 if (first.isEmpty()) {
298 return second.iterator();
299 }
300 // Split data nodes in the first set into two sets,
301 // moreThanOne contains nodes on nodegroup with more than one replica
302 // exactlyOne contains the remaining nodes
303 Map<String, List<DatanodeDescriptor>> nodeGroupMap =
304 new HashMap<String, List<DatanodeDescriptor>>();
305
306 for(DatanodeDescriptor node : first) {
307 final String nodeGroupName =
308 NetworkTopology.getLastHalf(node.getNetworkLocation());
309 List<DatanodeDescriptor> datanodeList =
310 nodeGroupMap.get(nodeGroupName);
311 if (datanodeList == null) {
312 datanodeList = new ArrayList<DatanodeDescriptor>();
313 nodeGroupMap.put(nodeGroupName, datanodeList);
314 }
315 datanodeList.add(node);
316 }
317
318 final List<DatanodeDescriptor> moreThanOne = new ArrayList<DatanodeDescriptor>();
319 final List<DatanodeDescriptor> exactlyOne = new ArrayList<DatanodeDescriptor>();
320 // split nodes into two sets
321 for(List<DatanodeDescriptor> datanodeList : nodeGroupMap.values()) {
322 if (datanodeList.size() == 1 ) {
323 // exactlyOne contains nodes on nodegroup with exactly one replica
324 exactlyOne.add(datanodeList.get(0));
325 } else {
326 // moreThanOne contains nodes on nodegroup with more than one replica
327 moreThanOne.addAll(datanodeList);
328 }
329 }
330
331 Iterator<DatanodeDescriptor> iter =
332 moreThanOne.isEmpty() ? exactlyOne.iterator() : moreThanOne.iterator();
333 return iter;
334 }
335
336 }