001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    package org.apache.hadoop.hdfs.server.datanode.fsdataset;
019    
020    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_THRESHOLD_DEFAULT;
021    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_THRESHOLD_KEY;
022    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_PREFERENCE_FRACTION_DEFAULT;
023    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_PREFERENCE_FRACTION_KEY;
024    
025    import java.io.IOException;
026    import java.util.ArrayList;
027    import java.util.List;
028    import java.util.Random;
029    
030    import org.apache.commons.logging.Log;
031    import org.apache.commons.logging.LogFactory;
032    import org.apache.hadoop.conf.Configurable;
033    import org.apache.hadoop.conf.Configuration;
034    import org.apache.hadoop.util.DiskChecker.DiskOutOfSpaceException;
035    
036    /**
037     * A DN volume choosing policy which takes into account the amount of free
038     * space on each of the available volumes when considering where to assign a
039     * new replica allocation. By default this policy prefers assigning replicas to
040     * those volumes with more available free space, so as to over time balance the
041     * available space of all the volumes within a DN.
042     */
043    public class AvailableSpaceVolumeChoosingPolicy<V extends FsVolumeSpi>
044        implements VolumeChoosingPolicy<V>, Configurable {
045      
046      private static final Log LOG = LogFactory.getLog(AvailableSpaceVolumeChoosingPolicy.class);
047      
048      private static final Random RAND = new Random();
049      
050      private long balancedSpaceThreshold = DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_THRESHOLD_DEFAULT;
051      private float balancedPreferencePercent = DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_PREFERENCE_FRACTION_DEFAULT;
052    
053      @Override
054      public synchronized void setConf(Configuration conf) {
055        balancedSpaceThreshold = conf.getLong(
056            DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_THRESHOLD_KEY,
057            DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_THRESHOLD_DEFAULT);
058        balancedPreferencePercent = conf.getFloat(
059            DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_PREFERENCE_FRACTION_KEY,
060            DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_PREFERENCE_FRACTION_DEFAULT);
061        
062        LOG.info("Available space volume choosing policy initialized: " +
063            DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_THRESHOLD_KEY +
064            " = " + balancedSpaceThreshold + ", " +
065            DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_PREFERENCE_FRACTION_KEY +
066            " = " + balancedPreferencePercent);
067    
068        if (balancedPreferencePercent > 1.0) {
069          LOG.warn("The value of " + DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_PREFERENCE_FRACTION_KEY +
070                   " is greater than 1.0 but should be in the range 0.0 - 1.0");
071        }
072    
073        if (balancedPreferencePercent < 0.5) {
074          LOG.warn("The value of " + DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_PREFERENCE_FRACTION_KEY +
075                   " is less than 0.5 so volumes with less available disk space will receive more block allocations");
076        }
077      }
078      
079      @Override
080      public synchronized Configuration getConf() {
081        // Nothing to do. Only added to fulfill the Configurable contract.
082        return null;
083      }
084      
085      private final VolumeChoosingPolicy<V> roundRobinPolicyBalanced =
086          new RoundRobinVolumeChoosingPolicy<V>();
087      private final VolumeChoosingPolicy<V> roundRobinPolicyHighAvailable =
088          new RoundRobinVolumeChoosingPolicy<V>();
089      private final VolumeChoosingPolicy<V> roundRobinPolicyLowAvailable =
090          new RoundRobinVolumeChoosingPolicy<V>();
091    
092      @Override
093      public synchronized V chooseVolume(List<V> volumes,
094          final long replicaSize) throws IOException {
095        if (volumes.size() < 1) {
096          throw new DiskOutOfSpaceException("No more available volumes");
097        }
098        
099        AvailableSpaceVolumeList volumesWithSpaces =
100            new AvailableSpaceVolumeList(volumes);
101        
102        if (volumesWithSpaces.areAllVolumesWithinFreeSpaceThreshold()) {
103          // If they're actually not too far out of whack, fall back on pure round
104          // robin.
105          V volume = roundRobinPolicyBalanced.chooseVolume(volumes, replicaSize);
106          if (LOG.isDebugEnabled()) {
107            LOG.debug("All volumes are within the configured free space balance " +
108                "threshold. Selecting " + volume + " for write of block size " +
109                replicaSize);
110          }
111          return volume;
112        } else {
113          V volume = null;
114          // If none of the volumes with low free space have enough space for the
115          // replica, always try to choose a volume with a lot of free space.
116          long mostAvailableAmongLowVolumes = volumesWithSpaces
117              .getMostAvailableSpaceAmongVolumesWithLowAvailableSpace();
118          
119          List<V> highAvailableVolumes = extractVolumesFromPairs(
120              volumesWithSpaces.getVolumesWithHighAvailableSpace());
121          List<V> lowAvailableVolumes = extractVolumesFromPairs(
122              volumesWithSpaces.getVolumesWithLowAvailableSpace());
123          
124          float preferencePercentScaler =
125              (highAvailableVolumes.size() * balancedPreferencePercent) +
126              (lowAvailableVolumes.size() * (1 - balancedPreferencePercent));
127          float scaledPreferencePercent =
128              (highAvailableVolumes.size() * balancedPreferencePercent) /
129              preferencePercentScaler;
130          if (mostAvailableAmongLowVolumes < replicaSize ||
131              RAND.nextFloat() < scaledPreferencePercent) {
132            volume = roundRobinPolicyHighAvailable.chooseVolume(
133                highAvailableVolumes,
134                replicaSize);
135            if (LOG.isDebugEnabled()) {
136              LOG.debug("Volumes are imbalanced. Selecting " + volume +
137                  " from high available space volumes for write of block size "
138                  + replicaSize);
139            }
140          } else {
141            volume = roundRobinPolicyLowAvailable.chooseVolume(
142                lowAvailableVolumes,
143                replicaSize);
144            if (LOG.isDebugEnabled()) {
145              LOG.debug("Volumes are imbalanced. Selecting " + volume +
146                  " from low available space volumes for write of block size "
147                  + replicaSize);
148            }
149          }
150          return volume;
151        }
152      }
153      
154      /**
155       * Used to keep track of the list of volumes we're choosing from.
156       */
157      private class AvailableSpaceVolumeList {
158        private final List<AvailableSpaceVolumePair> volumes;
159        
160        public AvailableSpaceVolumeList(List<V> volumes) throws IOException {
161          this.volumes = new ArrayList<AvailableSpaceVolumePair>();
162          for (V volume : volumes) {
163            this.volumes.add(new AvailableSpaceVolumePair(volume));
164          }
165        }
166        
167        /**
168         * Check if the available space on all the volumes is roughly equal.
169         * 
170         * @param volumes the volumes to check
171         * @return true if all volumes' free space is within the configured threshold,
172         *         false otherwise.
173         * @throws IOException
174         *           in the event of error checking amount of available space
175         */
176        public boolean areAllVolumesWithinFreeSpaceThreshold() {
177          long leastAvailable = Long.MAX_VALUE;
178          long mostAvailable = 0;
179          for (AvailableSpaceVolumePair volume : volumes) {
180            leastAvailable = Math.min(leastAvailable, volume.getAvailable());
181            mostAvailable = Math.max(mostAvailable, volume.getAvailable());
182          }
183          return (mostAvailable - leastAvailable) < balancedSpaceThreshold;
184        }
185        
186        /**
187         * @return the minimum amount of space available on a single volume,
188         *         across all volumes.
189         */
190        private long getLeastAvailableSpace() {
191          long leastAvailable = Long.MAX_VALUE;
192          for (AvailableSpaceVolumePair volume : volumes) {
193            leastAvailable = Math.min(leastAvailable, volume.getAvailable());
194          }
195          return leastAvailable;
196        }
197        
198        /**
199         * @return the maximum amount of space available across volumes with low space.
200         */
201        public long getMostAvailableSpaceAmongVolumesWithLowAvailableSpace() {
202          long mostAvailable = Long.MIN_VALUE;
203          for (AvailableSpaceVolumePair volume : getVolumesWithLowAvailableSpace()) {
204            mostAvailable = Math.max(mostAvailable, volume.getAvailable());
205          }
206          return mostAvailable;
207        }
208        
209        /**
210         * @return the list of volumes with relatively low available space.
211         */
212        public List<AvailableSpaceVolumePair> getVolumesWithLowAvailableSpace() {
213          long leastAvailable = getLeastAvailableSpace();
214          List<AvailableSpaceVolumePair> ret = new ArrayList<AvailableSpaceVolumePair>();
215          for (AvailableSpaceVolumePair volume : volumes) {
216            if (volume.getAvailable() <= leastAvailable + balancedSpaceThreshold) {
217              ret.add(volume);
218            }
219          }
220          return ret;
221        }
222        
223        /**
224         * @return the list of volumes with a lot of available space.
225         */
226        public List<AvailableSpaceVolumePair> getVolumesWithHighAvailableSpace() {
227          long leastAvailable = getLeastAvailableSpace();
228          List<AvailableSpaceVolumePair> ret = new ArrayList<AvailableSpaceVolumePair>();
229          for (AvailableSpaceVolumePair volume : volumes) {
230            if (volume.getAvailable() > leastAvailable + balancedSpaceThreshold) {
231              ret.add(volume);
232            }
233          }
234          return ret;
235        }
236        
237      }
238      
239      /**
240       * Used so that we only check the available space on a given volume once, at
241       * the beginning of {@link AvailableSpaceVolumeChoosingPolicy#chooseVolume(List, long)}.
242       */
243      private class AvailableSpaceVolumePair {
244        private final V volume;
245        private final long availableSpace;
246        
247        public AvailableSpaceVolumePair(V volume) throws IOException {
248          this.volume = volume;
249          this.availableSpace = volume.getAvailable();
250        }
251        
252        public long getAvailable() {
253          return availableSpace;
254        }
255        
256        public V getVolume() {
257          return volume;
258        }
259      }
260      
261      private List<V> extractVolumesFromPairs(List<AvailableSpaceVolumePair> volumes) {
262        List<V> ret = new ArrayList<V>();
263        for (AvailableSpaceVolumePair volume : volumes) {
264          ret.add(volume.getVolume());
265        }
266        return ret;
267      }
268    
269    }