001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hdfs.shortcircuit;
019
020import java.io.BufferedOutputStream;
021import java.io.Closeable;
022import java.io.DataOutputStream;
023import java.io.EOFException;
024import java.io.FileInputStream;
025import java.io.IOException;
026import java.util.HashMap;
027import java.util.Map.Entry;
028import java.util.TreeMap;
029import java.util.concurrent.locks.Condition;
030import java.util.concurrent.locks.ReentrantLock;
031
032import org.apache.commons.lang.mutable.MutableBoolean;
033import org.apache.hadoop.classification.InterfaceAudience;
034import org.apache.hadoop.hdfs.ExtendedBlockId;
035import org.apache.hadoop.hdfs.net.DomainPeer;
036import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
037import org.apache.hadoop.hdfs.protocol.datatransfer.DataTransferProtocol;
038import org.apache.hadoop.hdfs.protocol.datatransfer.Sender;
039import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.ShortCircuitShmResponseProto;
040import org.apache.hadoop.hdfs.protocolPB.PBHelperClient;
041import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm.ShmId;
042import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm.Slot;
043import org.apache.hadoop.net.unix.DomainSocket;
044import org.apache.hadoop.net.unix.DomainSocketWatcher;
045
046import com.google.common.annotations.VisibleForTesting;
047import com.google.common.base.Preconditions;
048
049import org.slf4j.Logger;
050import org.slf4j.LoggerFactory;
051
052/**
053 * Manages short-circuit memory segments for an HDFS client.
054 *
055 * Clients are responsible for requesting and releasing shared memory segments
056 * used for communicating with the DataNode. The client will try to allocate new
057 * slots in the set of existing segments, falling back to getting a new segment
 * from the DataNode via {@link DataTransferProtocol#requestShortCircuitShm}.
059 *
060 * The counterpart to this class on the DataNode is
061 * {@link ShortCircuitRegistry}. See {@link ShortCircuitRegistry} for more
062 * information on the communication protocol.
063 */
064@InterfaceAudience.Private
065public class DfsClientShmManager implements Closeable {
066  private static final Logger LOG = LoggerFactory.getLogger(
067      DfsClientShmManager.class);
068
069  /**
070   * Manages short-circuit memory segments that pertain to a given DataNode.
071   */
072  class EndpointShmManager {
073    /**
074     * The datanode we're managing.
075     */
076    private final DatanodeInfo datanode;
077
078    /**
079     * Shared memory segments which have no empty slots.
080     *
081     * Protected by the manager lock.
082     */
083    private final TreeMap<ShmId, DfsClientShm> full = new TreeMap<>();
084
085    /**
086     * Shared memory segments which have at least one empty slot.
087     *
088     * Protected by the manager lock.
089     */
090    private final TreeMap<ShmId, DfsClientShm> notFull = new TreeMap<>();
091
092    /**
093     * True if this datanode doesn't support short-circuit shared memory
094     * segments.
095     *
096     * Protected by the manager lock.
097     */
098    private boolean disabled = false;
099
100    /**
101     * True if we're in the process of loading a shared memory segment from
102     * this DataNode.
103     *
104     * Protected by the manager lock.
105     */
106    private boolean loading = false;
107
108    EndpointShmManager (DatanodeInfo datanode) {
109      this.datanode = datanode;
110    }
111
112    /**
113     * Pull a slot out of a preexisting shared memory segment.
114     *
115     * Must be called with the manager lock held.
116     *
117     * @param blockId     The blockId to put inside the Slot object.
118     *
119     * @return            null if none of our shared memory segments contain a
120     *                      free slot; the slot object otherwise.
121     */
122    private Slot allocSlotFromExistingShm(ExtendedBlockId blockId) {
123      if (notFull.isEmpty()) {
124        return null;
125      }
126      Entry<ShmId, DfsClientShm> entry = notFull.firstEntry();
127      DfsClientShm shm = entry.getValue();
128      ShmId shmId = shm.getShmId();
129      Slot slot = shm.allocAndRegisterSlot(blockId);
130      if (shm.isFull()) {
131        LOG.trace("{}: pulled the last slot {} out of {}",
132            this, slot.getSlotIdx(), shm);
133        DfsClientShm removedShm = notFull.remove(shmId);
134        Preconditions.checkState(removedShm == shm);
135        full.put(shmId, shm);
136      } else {
137        LOG.trace("{}: pulled slot {} out of {}", this, slot.getSlotIdx(), shm);
138      }
139      return slot;
140    }
141
142    /**
143     * Ask the DataNode for a new shared memory segment.  This function must be
144     * called with the manager lock held.  We will release the lock while
145     * communicating with the DataNode.
146     *
147     * @param clientName    The current client name.
148     * @param peer          The peer to use to talk to the DataNode.
149     *
150     * @return              Null if the DataNode does not support shared memory
151     *                        segments, or experienced an error creating the
152     *                        shm.  The shared memory segment itself on success.
153     * @throws IOException  If there was an error communicating over the socket.
154     *                        We will not throw an IOException unless the socket
155     *                        itself (or the network) is the problem.
156     */
157    private DfsClientShm requestNewShm(String clientName, DomainPeer peer)
158        throws IOException {
159      final DataOutputStream out =
160          new DataOutputStream(
161              new BufferedOutputStream(peer.getOutputStream()));
162      new Sender(out).requestShortCircuitShm(clientName);
163      ShortCircuitShmResponseProto resp =
164          ShortCircuitShmResponseProto.parseFrom(
165            PBHelperClient.vintPrefixed(peer.getInputStream()));
166      String error = resp.hasError() ? resp.getError() : "(unknown)";
167      switch (resp.getStatus()) {
168      case SUCCESS:
169        DomainSocket sock = peer.getDomainSocket();
170        byte buf[] = new byte[1];
171        FileInputStream[] fis = new FileInputStream[1];
172        if (sock.recvFileInputStreams(fis, buf, 0, buf.length) < 0) {
173          throw new EOFException("got EOF while trying to transfer the " +
174              "file descriptor for the shared memory segment.");
175        }
176        if (fis[0] == null) {
177          throw new IOException("the datanode " + datanode + " failed to " +
178              "pass a file descriptor for the shared memory segment.");
179        }
180        try {
181          DfsClientShm shm =
182              new DfsClientShm(PBHelperClient.convert(resp.getId()),
183                  fis[0], this, peer);
184          LOG.trace("{}: createNewShm: created {}", this, shm);
185          return shm;
186        } finally {
187          try {
188            fis[0].close();
189          } catch (Throwable e) {
190            LOG.debug("Exception in closing " + fis[0], e);
191          }
192        }
193      case ERROR_UNSUPPORTED:
194        // The DataNode just does not support short-circuit shared memory
195        // access, and we should stop asking.
196        LOG.info(this + ": datanode does not support short-circuit " +
197            "shared memory access: " + error);
198        disabled = true;
199        return null;
200      default:
201        // The datanode experienced some kind of unexpected error when trying to
202        // create the short-circuit shared memory segment.
203        LOG.warn(this + ": error requesting short-circuit shared memory " +
204            "access: " + error);
205        return null;
206      }
207    }
208
209    /**
210     * Allocate a new shared memory slot connected to this datanode.
211     *
212     * Must be called with the EndpointShmManager lock held.
213     *
214     * @param peer          The peer to use to talk to the DataNode.
215     * @param usedPeer      (out param) Will be set to true if we used the peer.
216     *                        When a peer is used
217     *
218     * @param clientName    The client name.
219     * @param blockId       The block ID to use.
220     * @return              null if the DataNode does not support shared memory
221     *                        segments, or experienced an error creating the
222     *                        shm.  The shared memory segment itself on success.
223     * @throws IOException  If there was an error communicating over the socket.
224     */
225    Slot allocSlot(DomainPeer peer, MutableBoolean usedPeer,
226        String clientName, ExtendedBlockId blockId) throws IOException {
227      while (true) {
228        if (closed) {
229          LOG.trace("{}: the DfsClientShmManager has been closed.", this);
230          return null;
231        }
232        if (disabled) {
233          LOG.trace("{}: shared memory segment access is disabled.", this);
234          return null;
235        }
236        // Try to use an existing slot.
237        Slot slot = allocSlotFromExistingShm(blockId);
238        if (slot != null) {
239          return slot;
240        }
241        // There are no free slots.  If someone is loading more slots, wait
242        // for that to finish.
243        if (loading) {
244          LOG.trace("{}: waiting for loading to finish...", this);
245          finishedLoading.awaitUninterruptibly();
246        } else {
247          // Otherwise, load the slot ourselves.
248          loading = true;
249          lock.unlock();
250          DfsClientShm shm;
251          try {
252            shm = requestNewShm(clientName, peer);
253            if (shm == null) continue;
254            // See #{DfsClientShmManager#domainSocketWatcher} for details
255            // about why we do this before retaking the manager lock.
256            domainSocketWatcher.add(peer.getDomainSocket(), shm);
257            // The DomainPeer is now our responsibility, and should not be
258            // closed by the caller.
259            usedPeer.setValue(true);
260          } finally {
261            lock.lock();
262            loading = false;
263            finishedLoading.signalAll();
264          }
265          if (shm.isDisconnected()) {
266            // If the peer closed immediately after the shared memory segment
267            // was created, the DomainSocketWatcher callback might already have
268            // fired and marked the shm as disconnected.  In this case, we
269            // obviously don't want to add the SharedMemorySegment to our list
270            // of valid not-full segments.
271            LOG.debug("{}: the UNIX domain socket associated with this "
272                + "short-circuit memory closed before we could make use of "
273                + "the shm.", this);
274          } else {
275            notFull.put(shm.getShmId(), shm);
276          }
277        }
278      }
279    }
280
281    /**
282     * Stop tracking a slot.
283     *
284     * Must be called with the EndpointShmManager lock held.
285     *
286     * @param slot          The slot to release.
287     */
288    void freeSlot(Slot slot) {
289      DfsClientShm shm = (DfsClientShm)slot.getShm();
290      shm.unregisterSlot(slot.getSlotIdx());
291      if (shm.isDisconnected()) {
292        // Stale shared memory segments should not be tracked here.
293        Preconditions.checkState(!full.containsKey(shm.getShmId()));
294        Preconditions.checkState(!notFull.containsKey(shm.getShmId()));
295        if (shm.isEmpty()) {
296          LOG.trace("{}: freeing empty stale {}", this, shm);
297          shm.free();
298        }
299      } else {
300        ShmId shmId = shm.getShmId();
301        full.remove(shmId); // The shm can't be full if we just freed a slot.
302        if (shm.isEmpty()) {
303          notFull.remove(shmId);
304
305          // If the shared memory segment is now empty, we call shutdown(2) on
306          // the UNIX domain socket associated with it.  The DomainSocketWatcher,
307          // which is watching this socket, will call DfsClientShm#handle,
308          // cleaning up this shared memory segment.
309          //
310          // See #{DfsClientShmManager#domainSocketWatcher} for details about why
311          // we don't want to call DomainSocketWatcher#remove directly here.
312          //
313          // Note that we could experience 'fragmentation' here, where the
314          // DFSClient allocates a bunch of slots in different shared memory
315          // segments, and then frees most of them, but never fully empties out
316          // any segment.  We make some attempt to avoid this fragmentation by
317          // always allocating new slots out of the shared memory segment with the
318          // lowest ID, but it could still occur.  In most workloads,
319          // fragmentation should not be a major concern, since it doesn't impact
320          // peak file descriptor usage or the speed of allocation.
321          LOG.trace("{}: shutting down UNIX domain socket for empty {}",
322              this, shm);
323          shutdown(shm);
324        } else {
325          notFull.put(shmId, shm);
326        }
327      }
328    }
329
330    /**
331     * Unregister a shared memory segment.
332     *
333     * Once a segment is unregistered, we will not allocate any more slots
334     * inside that segment.
335     *
336     * The DomainSocketWatcher calls this while holding the DomainSocketWatcher
337     * lock.
338     *
339     * @param shmId         The ID of the shared memory segment to unregister.
340     */
341    void unregisterShm(ShmId shmId) {
342      lock.lock();
343      try {
344        full.remove(shmId);
345        notFull.remove(shmId);
346      } finally {
347        lock.unlock();
348      }
349    }
350
351    @Override
352    public String toString() {
353      return String.format("EndpointShmManager(%s, parent=%s)",
354          datanode, DfsClientShmManager.this);
355    }
356
357    PerDatanodeVisitorInfo getVisitorInfo() {
358      return new PerDatanodeVisitorInfo(full, notFull, disabled);
359    }
360
361    final void shutdown(DfsClientShm shm) {
362      try {
363        shm.getPeer().getDomainSocket().shutdown();
364      } catch (IOException e) {
365        LOG.warn(this + ": error shutting down shm: got IOException calling " +
366            "shutdown(SHUT_RDWR)", e);
367      }
368    }
369  }
370
371  private boolean closed = false;
372
373  private final ReentrantLock lock = new ReentrantLock();
374
375  /**
376   * A condition variable which is signalled when we finish loading a segment
377   * from the Datanode.
378   */
379  private final Condition finishedLoading = lock.newCondition();
380
381  /**
382   * Information about each Datanode.
383   */
384  private final HashMap<DatanodeInfo, EndpointShmManager> datanodes =
385      new HashMap<>(1);
386
387  /**
388   * The DomainSocketWatcher which keeps track of the UNIX domain socket
389   * associated with each shared memory segment.
390   *
391   * Note: because the DomainSocketWatcher makes callbacks into this
392   * DfsClientShmManager object, you must MUST NOT attempt to take the
393   * DomainSocketWatcher lock while holding the DfsClientShmManager lock,
394   * or else deadlock might result.   This means that most DomainSocketWatcher
395   * methods are off-limits unless you release the manager lock first.
396   */
397  private final DomainSocketWatcher domainSocketWatcher;
398
399  DfsClientShmManager(int interruptCheckPeriodMs) throws IOException {
400    this.domainSocketWatcher = new DomainSocketWatcher(interruptCheckPeriodMs,
401        "client");
402  }
403
404  public Slot allocSlot(DatanodeInfo datanode, DomainPeer peer,
405      MutableBoolean usedPeer, ExtendedBlockId blockId,
406      String clientName) throws IOException {
407    lock.lock();
408    try {
409      if (closed) {
410        LOG.trace(this + ": the DfsClientShmManager isclosed.");
411        return null;
412      }
413      EndpointShmManager shmManager = datanodes.get(datanode);
414      if (shmManager == null) {
415        shmManager = new EndpointShmManager(datanode);
416        datanodes.put(datanode, shmManager);
417      }
418      return shmManager.allocSlot(peer, usedPeer, clientName, blockId);
419    } finally {
420      lock.unlock();
421    }
422  }
423
424  public void freeSlot(Slot slot) {
425    lock.lock();
426    try {
427      DfsClientShm shm = (DfsClientShm)slot.getShm();
428      shm.getEndpointShmManager().freeSlot(slot);
429    } finally {
430      lock.unlock();
431    }
432  }
433
434  @VisibleForTesting
435  public static class PerDatanodeVisitorInfo {
436    public final TreeMap<ShmId, DfsClientShm> full;
437    public final TreeMap<ShmId, DfsClientShm> notFull;
438    public final boolean disabled;
439
440    PerDatanodeVisitorInfo(TreeMap<ShmId, DfsClientShm> full,
441        TreeMap<ShmId, DfsClientShm> notFull, boolean disabled) {
442      this.full = full;
443      this.notFull = notFull;
444      this.disabled = disabled;
445    }
446  }
447
448  @VisibleForTesting
449  public interface Visitor {
450    void visit(HashMap<DatanodeInfo, PerDatanodeVisitorInfo> info)
451        throws IOException;
452  }
453
454  @VisibleForTesting
455  public void visit(Visitor visitor) throws IOException {
456    lock.lock();
457    try {
458      HashMap<DatanodeInfo, PerDatanodeVisitorInfo> info = new HashMap<>();
459      for (Entry<DatanodeInfo, EndpointShmManager> entry :
460            datanodes.entrySet()) {
461        info.put(entry.getKey(), entry.getValue().getVisitorInfo());
462      }
463      visitor.visit(info);
464    } finally {
465      lock.unlock();
466    }
467  }
468
469  /**
470   * Close the DfsClientShmManager.
471   */
472  @Override
473  public void close() throws IOException {
474    lock.lock();
475    try {
476      if (closed) return;
477      closed = true;
478    } finally {
479      lock.unlock();
480    }
481    // When closed, the domainSocketWatcher will issue callbacks that mark
482    // all the outstanding DfsClientShm segments as stale.
483    try {
484      domainSocketWatcher.close();
485    } catch (Throwable e) {
486      LOG.debug("Exception in closing " + domainSocketWatcher, e);
487    }
488  }
489
490
491  @Override
492  public String toString() {
493    return String.format("ShortCircuitShmManager(%08x)",
494        System.identityHashCode(this));
495  }
496
497  @VisibleForTesting
498  public DomainSocketWatcher getDomainSocketWatcher() {
499    return domainSocketWatcher;
500  }
501}