001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    package org.apache.hadoop.hdfs.server.datanode.fsdataset;
019    
020    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_THRESHOLD_DEFAULT;
021    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_THRESHOLD_KEY;
022    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_PREFERENCE_FRACTION_DEFAULT;
023    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_PREFERENCE_FRACTION_KEY;
024    
025    import java.io.IOException;
026    import java.util.ArrayList;
027    import java.util.List;
028    import java.util.Random;
029    
030    import org.apache.commons.logging.Log;
031    import org.apache.commons.logging.LogFactory;
032    import org.apache.hadoop.conf.Configurable;
033    import org.apache.hadoop.conf.Configuration;
034    import org.apache.hadoop.util.DiskChecker.DiskOutOfSpaceException;
035    
036    /**
037     * A DN volume choosing policy which takes into account the amount of free
038     * space on each of the available volumes when considering where to assign a
039     * new replica allocation. By default this policy prefers assigning replicas to
040     * those volumes with more available free space, so as to over time balance the
041     * available space of all the volumes within a DN.
042     */
043    public class AvailableSpaceVolumeChoosingPolicy<V extends FsVolumeSpi>
044        implements VolumeChoosingPolicy<V>, Configurable {
045      
046      private static final Log LOG = LogFactory.getLog(AvailableSpaceVolumeChoosingPolicy.class);
047      
048      private final Random random;
049      
050      private long balancedSpaceThreshold = DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_THRESHOLD_DEFAULT;
051      private float balancedPreferencePercent = DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_PREFERENCE_FRACTION_DEFAULT;
052    
053      AvailableSpaceVolumeChoosingPolicy(Random random) {
054        this.random = random;
055      }
056    
057      public AvailableSpaceVolumeChoosingPolicy() {
058        this(new Random());
059      }
060    
061      @Override
062      public synchronized void setConf(Configuration conf) {
063        balancedSpaceThreshold = conf.getLong(
064            DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_THRESHOLD_KEY,
065            DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_THRESHOLD_DEFAULT);
066        balancedPreferencePercent = conf.getFloat(
067            DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_PREFERENCE_FRACTION_KEY,
068            DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_PREFERENCE_FRACTION_DEFAULT);
069        
070        LOG.info("Available space volume choosing policy initialized: " +
071            DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_THRESHOLD_KEY +
072            " = " + balancedSpaceThreshold + ", " +
073            DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_PREFERENCE_FRACTION_KEY +
074            " = " + balancedPreferencePercent);
075    
076        if (balancedPreferencePercent > 1.0) {
077          LOG.warn("The value of " + DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_PREFERENCE_FRACTION_KEY +
078                   " is greater than 1.0 but should be in the range 0.0 - 1.0");
079        }
080    
081        if (balancedPreferencePercent < 0.5) {
082          LOG.warn("The value of " + DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_PREFERENCE_FRACTION_KEY +
083                   " is less than 0.5 so volumes with less available disk space will receive more block allocations");
084        }
085      }
086      
087      @Override
088      public synchronized Configuration getConf() {
089        // Nothing to do. Only added to fulfill the Configurable contract.
090        return null;
091      }
092      
093      private final VolumeChoosingPolicy<V> roundRobinPolicyBalanced =
094          new RoundRobinVolumeChoosingPolicy<V>();
095      private final VolumeChoosingPolicy<V> roundRobinPolicyHighAvailable =
096          new RoundRobinVolumeChoosingPolicy<V>();
097      private final VolumeChoosingPolicy<V> roundRobinPolicyLowAvailable =
098          new RoundRobinVolumeChoosingPolicy<V>();
099    
100      @Override
101      public synchronized V chooseVolume(List<V> volumes,
102          long replicaSize) throws IOException {
103        if (volumes.size() < 1) {
104          throw new DiskOutOfSpaceException("No more available volumes");
105        }
106        
107        AvailableSpaceVolumeList volumesWithSpaces =
108            new AvailableSpaceVolumeList(volumes);
109        
110        if (volumesWithSpaces.areAllVolumesWithinFreeSpaceThreshold()) {
111          // If they're actually not too far out of whack, fall back on pure round
112          // robin.
113          V volume = roundRobinPolicyBalanced.chooseVolume(volumes, replicaSize);
114          if (LOG.isDebugEnabled()) {
115            LOG.debug("All volumes are within the configured free space balance " +
116                "threshold. Selecting " + volume + " for write of block size " +
117                replicaSize);
118          }
119          return volume;
120        } else {
121          V volume = null;
122          // If none of the volumes with low free space have enough space for the
123          // replica, always try to choose a volume with a lot of free space.
124          long mostAvailableAmongLowVolumes = volumesWithSpaces
125              .getMostAvailableSpaceAmongVolumesWithLowAvailableSpace();
126          
127          List<V> highAvailableVolumes = extractVolumesFromPairs(
128              volumesWithSpaces.getVolumesWithHighAvailableSpace());
129          List<V> lowAvailableVolumes = extractVolumesFromPairs(
130              volumesWithSpaces.getVolumesWithLowAvailableSpace());
131          
132          float preferencePercentScaler =
133              (highAvailableVolumes.size() * balancedPreferencePercent) +
134              (lowAvailableVolumes.size() * (1 - balancedPreferencePercent));
135          float scaledPreferencePercent =
136              (highAvailableVolumes.size() * balancedPreferencePercent) /
137              preferencePercentScaler;
138          if (mostAvailableAmongLowVolumes < replicaSize ||
139              random.nextFloat() < scaledPreferencePercent) {
140            volume = roundRobinPolicyHighAvailable.chooseVolume(
141                highAvailableVolumes, replicaSize);
142            if (LOG.isDebugEnabled()) {
143              LOG.debug("Volumes are imbalanced. Selecting " + volume +
144                  " from high available space volumes for write of block size "
145                  + replicaSize);
146            }
147          } else {
148            volume = roundRobinPolicyLowAvailable.chooseVolume(
149                lowAvailableVolumes, replicaSize);
150            if (LOG.isDebugEnabled()) {
151              LOG.debug("Volumes are imbalanced. Selecting " + volume +
152                  " from low available space volumes for write of block size "
153                  + replicaSize);
154            }
155          }
156          return volume;
157        }
158      }
159      
160      /**
161       * Used to keep track of the list of volumes we're choosing from.
162       */
163      private class AvailableSpaceVolumeList {
164        private final List<AvailableSpaceVolumePair> volumes;
165        
166        public AvailableSpaceVolumeList(List<V> volumes) throws IOException {
167          this.volumes = new ArrayList<AvailableSpaceVolumePair>();
168          for (V volume : volumes) {
169            this.volumes.add(new AvailableSpaceVolumePair(volume));
170          }
171        }
172        
173        /**
174         * @return true if all volumes' free space is within the
175         *         configured threshold, false otherwise.
176         */
177        public boolean areAllVolumesWithinFreeSpaceThreshold() {
178          long leastAvailable = Long.MAX_VALUE;
179          long mostAvailable = 0;
180          for (AvailableSpaceVolumePair volume : volumes) {
181            leastAvailable = Math.min(leastAvailable, volume.getAvailable());
182            mostAvailable = Math.max(mostAvailable, volume.getAvailable());
183          }
184          return (mostAvailable - leastAvailable) < balancedSpaceThreshold;
185        }
186        
187        /**
188         * @return the minimum amount of space available on a single volume,
189         *         across all volumes.
190         */
191        private long getLeastAvailableSpace() {
192          long leastAvailable = Long.MAX_VALUE;
193          for (AvailableSpaceVolumePair volume : volumes) {
194            leastAvailable = Math.min(leastAvailable, volume.getAvailable());
195          }
196          return leastAvailable;
197        }
198        
199        /**
200         * @return the maximum amount of space available across volumes with low space.
201         */
202        public long getMostAvailableSpaceAmongVolumesWithLowAvailableSpace() {
203          long mostAvailable = Long.MIN_VALUE;
204          for (AvailableSpaceVolumePair volume : getVolumesWithLowAvailableSpace()) {
205            mostAvailable = Math.max(mostAvailable, volume.getAvailable());
206          }
207          return mostAvailable;
208        }
209        
210        /**
211         * @return the list of volumes with relatively low available space.
212         */
213        public List<AvailableSpaceVolumePair> getVolumesWithLowAvailableSpace() {
214          long leastAvailable = getLeastAvailableSpace();
215          List<AvailableSpaceVolumePair> ret = new ArrayList<AvailableSpaceVolumePair>();
216          for (AvailableSpaceVolumePair volume : volumes) {
217            if (volume.getAvailable() <= leastAvailable + balancedSpaceThreshold) {
218              ret.add(volume);
219            }
220          }
221          return ret;
222        }
223        
224        /**
225         * @return the list of volumes with a lot of available space.
226         */
227        public List<AvailableSpaceVolumePair> getVolumesWithHighAvailableSpace() {
228          long leastAvailable = getLeastAvailableSpace();
229          List<AvailableSpaceVolumePair> ret = new ArrayList<AvailableSpaceVolumePair>();
230          for (AvailableSpaceVolumePair volume : volumes) {
231            if (volume.getAvailable() > leastAvailable + balancedSpaceThreshold) {
232              ret.add(volume);
233            }
234          }
235          return ret;
236        }
237        
238      }
239      
240      /**
241       * Used so that we only check the available space on a given volume once, at
242       * the beginning of {@link AvailableSpaceVolumeChoosingPolicy#chooseVolume(List, long)}.
243       */
244      private class AvailableSpaceVolumePair {
245        private final V volume;
246        private final long availableSpace;
247        
248        public AvailableSpaceVolumePair(V volume) throws IOException {
249          this.volume = volume;
250          this.availableSpace = volume.getAvailable();
251        }
252        
253        public long getAvailable() {
254          return availableSpace;
255        }
256        
257        public V getVolume() {
258          return volume;
259        }
260      }
261      
262      private List<V> extractVolumesFromPairs(List<AvailableSpaceVolumePair> volumes) {
263        List<V> ret = new ArrayList<V>();
264        for (AvailableSpaceVolumePair volume : volumes) {
265          ret.add(volume.getVolume());
266        }
267        return ret;
268      }
269    
270    }