001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    package org.apache.hadoop.hdfs.shortcircuit;
019    
020    import java.io.BufferedOutputStream;
021    import java.io.Closeable;
022    import java.io.DataOutputStream;
023    import java.io.EOFException;
024    import java.io.FileInputStream;
025    import java.io.IOException;
026    import java.util.HashMap;
027    import java.util.Map.Entry;
028    import java.util.TreeMap;
029    import java.util.concurrent.locks.Condition;
030    import java.util.concurrent.locks.ReentrantLock;
031    
032    import org.apache.commons.lang.mutable.MutableBoolean;
033    import org.apache.commons.logging.Log;
034    import org.apache.commons.logging.LogFactory;
035    import org.apache.hadoop.classification.InterfaceAudience;
036    import org.apache.hadoop.hdfs.ExtendedBlockId;
037    import org.apache.hadoop.hdfs.net.DomainPeer;
038    import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
039    import org.apache.hadoop.hdfs.protocol.datatransfer.DataTransferProtocol;
040    import org.apache.hadoop.hdfs.protocol.datatransfer.Sender;
041    import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.ShortCircuitShmResponseProto;
042    import org.apache.hadoop.hdfs.protocolPB.PBHelper;
043    import org.apache.hadoop.hdfs.server.datanode.ShortCircuitRegistry;
044    import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm.ShmId;
045    import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm.Slot;
046    import org.apache.hadoop.io.IOUtils;
047    import org.apache.hadoop.net.unix.DomainSocket;
048    import org.apache.hadoop.net.unix.DomainSocketWatcher;
049    
050    import com.google.common.annotations.VisibleForTesting;
051    import com.google.common.base.Preconditions;
052    
053    /**
054     * Manages short-circuit memory segments for an HDFS client.
055     * 
056     * Clients are responsible for requesting and releasing shared memory segments used
057     * for communicating with the DataNode. The client will try to allocate new slots
058     * in the set of existing segments, falling back to getting a new segment from the
059     * DataNode via {@link DataTransferProtocol#requestShortCircuitFds}.
060     * 
061     * The counterpart to this class on the DataNode is {@link ShortCircuitRegistry}.
062     * See {@link ShortCircuitRegistry} for more information on the communication protocol.
063     */
064    @InterfaceAudience.Private
065    public class DfsClientShmManager implements Closeable {
066      private static final Log LOG = LogFactory.getLog(DfsClientShmManager.class);
067    
068      /**
069       * Manages short-circuit memory segments that pertain to a given DataNode.
070       */
071      class EndpointShmManager {
072        /**
073         * The datanode we're managing.
074         */
075        private final DatanodeInfo datanode;
076    
077        /**
078         * Shared memory segments which have no empty slots.
079         *
080         * Protected by the manager lock.
081         */
082        private final TreeMap<ShmId, DfsClientShm> full =
083            new TreeMap<ShmId, DfsClientShm>();
084    
085        /**
086         * Shared memory segments which have at least one empty slot.
087         *
088         * Protected by the manager lock.
089         */
090        private final TreeMap<ShmId, DfsClientShm> notFull =
091            new TreeMap<ShmId, DfsClientShm>();
092    
093        /**
094         * True if this datanode doesn't support short-circuit shared memory
095         * segments.
096         *
097         * Protected by the manager lock.
098         */
099        private boolean disabled = false;
100    
101        /**
102         * True if we're in the process of loading a shared memory segment from
103         * this DataNode.
104         *
105         * Protected by the manager lock.
106         */
107        private boolean loading = false;
108    
    /**
     * Create a manager for the shared memory segments of one DataNode.
     *
     * @param datanode    The datanode whose segments this manager tracks.
     */
    EndpointShmManager (DatanodeInfo datanode) {
      this.datanode = datanode;
    }
112    
113        /**
114         * Pull a slot out of a preexisting shared memory segment.
115         *
116         * Must be called with the manager lock held.
117         *
118         * @param blockId     The blockId to put inside the Slot object.
119         *
120         * @return            null if none of our shared memory segments contain a
121         *                      free slot; the slot object otherwise.
122         */
123        private Slot allocSlotFromExistingShm(ExtendedBlockId blockId) {
124          if (notFull.isEmpty()) {
125            return null;
126          }
127          Entry<ShmId, DfsClientShm> entry = notFull.firstEntry();
128          DfsClientShm shm = entry.getValue();
129          ShmId shmId = shm.getShmId();
130          Slot slot = shm.allocAndRegisterSlot(blockId);
131          if (shm.isFull()) {
132            if (LOG.isTraceEnabled()) {
133              LOG.trace(this + ": pulled the last slot " + slot.getSlotIdx() +
134                  " out of " + shm);
135            }
136            DfsClientShm removedShm = notFull.remove(shmId);
137            Preconditions.checkState(removedShm == shm);
138            full.put(shmId, shm);
139          } else {
140            if (LOG.isTraceEnabled()) {
141              LOG.trace(this + ": pulled slot " + slot.getSlotIdx() +
142                  " out of " + shm);
143            }
144          }
145          return slot;
146        }
147    
148        /**
149         * Ask the DataNode for a new shared memory segment.  This function must be
150         * called with the manager lock held.  We will release the lock while
151         * communicating with the DataNode.
152         *
153         * @param clientName    The current client name.
154         * @param peer          The peer to use to talk to the DataNode.
155         *
156         * @return              Null if the DataNode does not support shared memory
157         *                        segments, or experienced an error creating the
158         *                        shm.  The shared memory segment itself on success.
159         * @throws IOException  If there was an error communicating over the socket.
160         *                        We will not throw an IOException unless the socket
161         *                        itself (or the network) is the problem.
162         */
163        private DfsClientShm requestNewShm(String clientName, DomainPeer peer)
164            throws IOException {
165          final DataOutputStream out = 
166              new DataOutputStream(
167                  new BufferedOutputStream(peer.getOutputStream()));
168          new Sender(out).requestShortCircuitShm(clientName);
169          ShortCircuitShmResponseProto resp = 
170              ShortCircuitShmResponseProto.parseFrom(
171                  PBHelper.vintPrefixed(peer.getInputStream()));
172          String error = resp.hasError() ? resp.getError() : "(unknown)";
173          switch (resp.getStatus()) {
174          case SUCCESS:
175            DomainSocket sock = peer.getDomainSocket();
176            byte buf[] = new byte[1];
177            FileInputStream fis[] = new FileInputStream[1];
178            if (sock.recvFileInputStreams(fis, buf, 0, buf.length) < 0) {
179              throw new EOFException("got EOF while trying to transfer the " +
180                  "file descriptor for the shared memory segment.");
181            }
182            if (fis[0] == null) {
183              throw new IOException("the datanode " + datanode + " failed to " +
184                  "pass a file descriptor for the shared memory segment.");
185            }
186            try {
187              DfsClientShm shm = 
188                  new DfsClientShm(PBHelper.convert(resp.getId()),
189                      fis[0], this, peer);
190              if (LOG.isTraceEnabled()) {
191                LOG.trace(this + ": createNewShm: created " + shm);
192              }
193              return shm;
194            } finally {
195              IOUtils.cleanup(LOG,  fis[0]);
196            }
197          case ERROR_UNSUPPORTED:
198            // The DataNode just does not support short-circuit shared memory
199            // access, and we should stop asking.
200            LOG.info(this + ": datanode does not support short-circuit " +
201                "shared memory access: " + error);
202            disabled = true;
203            return null;
204          default:
205            // The datanode experienced some kind of unexpected error when trying to
206            // create the short-circuit shared memory segment.
207            LOG.warn(this + ": error requesting short-circuit shared memory " +
208                "access: " + error);
209            return null;
210          }
211        }
212    
213        /**
214         * Allocate a new shared memory slot connected to this datanode.
215         *
216         * Must be called with the EndpointShmManager lock held.
217         *
218         * @param peer          The peer to use to talk to the DataNode.
219         * @param clientName    The client name.
220         * @param usedPeer      (out param) Will be set to true if we used the peer.
221         *                        When a peer is used
222         *
223         * @return              null if the DataNode does not support shared memory
224         *                        segments, or experienced an error creating the
225         *                        shm.  The shared memory segment itself on success.
226         * @throws IOException  If there was an error communicating over the socket.
227         */
228        Slot allocSlot(DomainPeer peer, MutableBoolean usedPeer,
229            String clientName, ExtendedBlockId blockId) throws IOException {
230          while (true) {
231            if (closed) {
232              if (LOG.isTraceEnabled()) {
233                LOG.trace(this + ": the DfsClientShmManager has been closed.");
234              }
235              return null;
236            }
237            if (disabled) {
238              if (LOG.isTraceEnabled()) {
239                LOG.trace(this + ": shared memory segment access is disabled.");
240              }
241              return null;
242            }
243            // Try to use an existing slot.
244            Slot slot = allocSlotFromExistingShm(blockId);
245            if (slot != null) {
246              return slot;
247            }
248            // There are no free slots.  If someone is loading more slots, wait
249            // for that to finish.
250            if (loading) {
251              if (LOG.isTraceEnabled()) {
252                LOG.trace(this + ": waiting for loading to finish...");
253              }
254              finishedLoading.awaitUninterruptibly();
255            } else {
256              // Otherwise, load the slot ourselves.
257              loading = true;
258              lock.unlock();
259              DfsClientShm shm;
260              try {
261                shm = requestNewShm(clientName, peer);
262                if (shm == null) continue;
263                // See #{DfsClientShmManager#domainSocketWatcher} for details
264                // about why we do this before retaking the manager lock.
265                domainSocketWatcher.add(peer.getDomainSocket(), shm);
266                // The DomainPeer is now our responsibility, and should not be
267                // closed by the caller.
268                usedPeer.setValue(true);
269              } finally {
270                lock.lock();
271                loading = false;
272                finishedLoading.signalAll();
273              }
274              if (shm.isDisconnected()) {
275                // If the peer closed immediately after the shared memory segment
276                // was created, the DomainSocketWatcher callback might already have
277                // fired and marked the shm as disconnected.  In this case, we
278                // obviously don't want to add the SharedMemorySegment to our list
279                // of valid not-full segments.
280                if (LOG.isDebugEnabled()) {
281                  LOG.debug(this + ": the UNIX domain socket associated with " +
282                      "this short-circuit memory closed before we could make " +
283                      "use of the shm.");
284                }
285              } else {
286                notFull.put(shm.getShmId(), shm);
287              }
288            }
289          }
290        }
291        
292        /**
293         * Stop tracking a slot.
294         *
295         * Must be called with the EndpointShmManager lock held.
296         *
297         * @param slot          The slot to release.
298         */
299        void freeSlot(Slot slot) {
300          DfsClientShm shm = (DfsClientShm)slot.getShm();
301          shm.unregisterSlot(slot.getSlotIdx());
302          if (shm.isDisconnected()) {
303            // Stale shared memory segments should not be tracked here.
304            Preconditions.checkState(!full.containsKey(shm.getShmId()));
305            Preconditions.checkState(!notFull.containsKey(shm.getShmId()));
306            if (shm.isEmpty()) {
307              if (LOG.isTraceEnabled()) {
308                LOG.trace(this + ": freeing empty stale " + shm);
309              }
310              shm.free();
311            }
312          } else {
313            ShmId shmId = shm.getShmId();
314            full.remove(shmId); // The shm can't be full if we just freed a slot.
315            if (shm.isEmpty()) {
316              notFull.remove(shmId);
317      
318              // If the shared memory segment is now empty, we call shutdown(2) on
319              // the UNIX domain socket associated with it.  The DomainSocketWatcher,
320              // which is watching this socket, will call DfsClientShm#handle,
321              // cleaning up this shared memory segment.
322              //
323              // See #{DfsClientShmManager#domainSocketWatcher} for details about why
324              // we don't want to call DomainSocketWatcher#remove directly here.
325              //
326              // Note that we could experience 'fragmentation' here, where the
327              // DFSClient allocates a bunch of slots in different shared memory
328              // segments, and then frees most of them, but never fully empties out
329              // any segment.  We make some attempt to avoid this fragmentation by
330              // always allocating new slots out of the shared memory segment with the
331              // lowest ID, but it could still occur.  In most workloads,
332              // fragmentation should not be a major concern, since it doesn't impact
333              // peak file descriptor usage or the speed of allocation.
334              if (LOG.isTraceEnabled()) {
335                LOG.trace(this + ": shutting down UNIX domain socket for " +
336                    "empty " + shm);
337              }
338              shutdown(shm);
339            } else {
340              notFull.put(shmId, shm);
341            }
342          }
343        }
344        
345        /**
346         * Unregister a shared memory segment.
347         *
348         * Once a segment is unregistered, we will not allocate any more slots
349         * inside that segment.
350         *
351         * The DomainSocketWatcher calls this while holding the DomainSocketWatcher
352         * lock.
353         *
354         * @param shmId         The ID of the shared memory segment to unregister.
355         */
356        void unregisterShm(ShmId shmId) {
357          lock.lock();
358          try {
359            full.remove(shmId);
360            notFull.remove(shmId);
361          } finally {
362            lock.unlock();
363          }
364        }
365    
366        @Override
367        public String toString() {
368          return String.format("EndpointShmManager(%s, parent=%s)",
369              datanode, DfsClientShmManager.this);
370        }
371    
    /**
     * Snapshot this endpoint's state for a {@link Visitor}.
     *
     * Note that the returned object aliases the live full/notFull maps
     * rather than copying them, so readers should hold the manager lock
     * (as {@link DfsClientShmManager#visit} does).
     */
    PerDatanodeVisitorInfo getVisitorInfo() {
      return new PerDatanodeVisitorInfo(full, notFull, disabled);
    }
375    
    /**
     * Shut down the UNIX domain socket associated with a shared memory
     * segment.  The DomainSocketWatcher watching that socket will then run
     * its callback, cleaning the segment up asynchronously.  Failures are
     * logged rather than propagated, since the caller has no way to recover
     * from a failed shutdown(2).
     *
     * @param shm           The segment whose socket should be shut down.
     */
    final void shutdown(DfsClientShm shm) {
      try {
        shm.getPeer().getDomainSocket().shutdown();
      } catch (IOException e) {
        LOG.warn(this + ": error shutting down shm: got IOException calling " +
            "shutdown(SHUT_RDWR)", e);
      }
    }
384      }
385    
  /**
   * True once {@link #close()} has been called.  Protected by the manager
   * lock.
   */
  private boolean closed = false;

  /**
   * The manager lock, protecting {@link #closed}, {@link #datanodes}, and
   * the mutable state of every {@link EndpointShmManager}.
   */
  private final ReentrantLock lock = new ReentrantLock();

  /**
   * A condition variable which is signalled when we finish loading a segment
   * from the Datanode.
   */
  private final Condition finishedLoading = lock.newCondition();

  /**
   * Information about each Datanode.  Entries are created lazily on the
   * first slot allocation for a datanode.  Protected by the manager lock.
   */
  private final HashMap<DatanodeInfo, EndpointShmManager> datanodes =
      new HashMap<DatanodeInfo, EndpointShmManager>(1);

  /**
   * The DomainSocketWatcher which keeps track of the UNIX domain socket
   * associated with each shared memory segment.
   *
   * Note: because the DomainSocketWatcher makes callbacks into this
   * DfsClientShmManager object, you must MUST NOT attempt to take the
   * DomainSocketWatcher lock while holding the DfsClientShmManager lock,
   * or else deadlock might result.   This means that most DomainSocketWatcher
   * methods are off-limits unless you release the manager lock first.
   */
  private final DomainSocketWatcher domainSocketWatcher;
413      
  /**
   * Create a DfsClientShmManager.
   *
   * @param interruptCheckPeriodMs  The interrupt check period, in
   *                                  milliseconds, passed through to the
   *                                  DomainSocketWatcher.
   * @throws IOException            If the DomainSocketWatcher could not be
   *                                  created.
   */
  DfsClientShmManager(int interruptCheckPeriodMs) throws IOException {
    this.domainSocketWatcher = new DomainSocketWatcher(interruptCheckPeriodMs);
  }
417      
418      public Slot allocSlot(DatanodeInfo datanode, DomainPeer peer,
419          MutableBoolean usedPeer, ExtendedBlockId blockId,
420          String clientName) throws IOException {
421        lock.lock();
422        try {
423          if (closed) {
424            LOG.trace(this + ": the DfsClientShmManager isclosed.");
425            return null;
426          }
427          EndpointShmManager shmManager = datanodes.get(datanode);
428          if (shmManager == null) {
429            shmManager = new EndpointShmManager(datanode);
430            datanodes.put(datanode, shmManager);
431          }
432          return shmManager.allocSlot(peer, usedPeer, clientName, blockId);
433        } finally {
434          lock.unlock();
435        }
436      }
437      
438      public void freeSlot(Slot slot) {
439        lock.lock();
440        try {
441          DfsClientShm shm = (DfsClientShm)slot.getShm();
442          shm.getEndpointShmManager().freeSlot(slot);
443        } finally {
444          lock.unlock();
445        }
446      }
447    
  /**
   * A read-only view of one EndpointShmManager's state, handed to a
   * {@link Visitor} for testing.  The full/notFull fields alias the live
   * maps rather than copies, so they should only be read while the visit
   * callback is running (i.e. under the manager lock).
   */
  @VisibleForTesting
  public static class PerDatanodeVisitorInfo {
    // Segments with no free slots.
    public final TreeMap<ShmId, DfsClientShm> full;
    // Segments with at least one free slot.
    public final TreeMap<ShmId, DfsClientShm> notFull;
    // True if the datanode does not support shared memory segments.
    public final boolean disabled;

    PerDatanodeVisitorInfo(TreeMap<ShmId, DfsClientShm> full,
        TreeMap<ShmId, DfsClientShm> notFull, boolean disabled) {
      this.full = full;
      this.notFull = notFull;
      this.disabled = disabled;
    }
  }
461    
  /**
   * Callback interface used by tests to inspect per-datanode state via
   * {@link #visit(Visitor)}.
   */
  @VisibleForTesting
  public interface Visitor {
    void visit(HashMap<DatanodeInfo, PerDatanodeVisitorInfo> info)
        throws IOException;
  }
467    
468      @VisibleForTesting
469      public void visit(Visitor visitor) throws IOException {
470        lock.lock();
471        try {
472          HashMap<DatanodeInfo, PerDatanodeVisitorInfo> info = 
473              new HashMap<DatanodeInfo, PerDatanodeVisitorInfo>();
474          for (Entry<DatanodeInfo, EndpointShmManager> entry :
475                datanodes.entrySet()) {
476            info.put(entry.getKey(), entry.getValue().getVisitorInfo());
477          }
478          visitor.visit(info);
479        } finally {
480          lock.unlock();
481        }
482      }
483    
484      /**
485       * Close the DfsClientShmManager.
486       */
487      @Override
488      public void close() throws IOException {
489        lock.lock();
490        try {
491          if (closed) return;
492          closed = true;
493        } finally {
494          lock.unlock();
495        }
496        // When closed, the domainSocketWatcher will issue callbacks that mark
497        // all the outstanding DfsClientShm segments as stale.
498        IOUtils.cleanup(LOG, domainSocketWatcher);
499      }
500    
501    
502      @Override
503      public String toString() {
504        return String.format("ShortCircuitShmManager(%08x)",
505            System.identityHashCode(this));
506      }
507    
  /**
   * Expose the DomainSocketWatcher for tests.
   */
  @VisibleForTesting
  public DomainSocketWatcher getDomainSocketWatcher() {
    return domainSocketWatcher;
  }
512    }