001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hdfs.server.datanode;
019
020import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_SHARED_FILE_DESCRIPTOR_PATHS;
021import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_SHARED_FILE_DESCRIPTOR_PATHS_DEFAULT;
022import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS;
023import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS_DEFAULT;
024
025import java.io.Closeable;
026import java.io.FileInputStream;
027import java.io.IOException;
028import java.util.HashMap;
029import java.util.HashSet;
030import java.util.Iterator;
031import java.util.Set;
032
033import com.google.common.annotations.VisibleForTesting;
034import org.apache.commons.io.IOUtils;
035import org.apache.commons.logging.Log;
036import org.apache.commons.logging.LogFactory;
037import org.apache.hadoop.conf.Configuration;
038import org.apache.hadoop.fs.InvalidRequestException;
039import org.apache.hadoop.hdfs.ExtendedBlockId;
040import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm;
041import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm.ShmId;
042import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm.Slot;
043import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm.SlotId;
044import org.apache.hadoop.io.nativeio.SharedFileDescriptorFactory;
045import org.apache.hadoop.net.unix.DomainSocket;
046import org.apache.hadoop.net.unix.DomainSocketWatcher;
047
048import com.google.common.base.Joiner;
049import com.google.common.base.Preconditions;
050import com.google.common.collect.HashMultimap;
051
052/**
053 * Manages client short-circuit memory segments on the DataNode.
054 *
055 * DFSClients request shared memory segments from the DataNode.  The 
056 * ShortCircuitRegistry generates and manages these segments.  Each segment
057 * has a randomly generated 128-bit ID which uniquely identifies it.  The
058 * segments each contain several "slots."
059 *
060 * Before performing a short-circuit read, DFSClients must request a pair of
061 * file descriptors from the DataNode via the REQUEST_SHORT_CIRCUIT_FDS
062 * operation.  As part of this operation, DFSClients pass the ID of the shared
063 * memory segment they would like to use to communicate information about this
064 * replica, as well as the slot number within that segment they would like to
065 * use.  Slot allocation is always done by the client.
066 *
067 * Slots are used to track the state of the block on the both the client and
068 * datanode. When this DataNode mlocks a block, the corresponding slots for the
069 * replicas are marked as "anchorable".  Anchorable blocks can be safely read
070 * without verifying the checksum.  This means that BlockReaderLocal objects
071 * using these replicas can skip checksumming.  It also means that we can do
072 * zero-copy reads on these replicas (the ZCR interface has no way of
073 * verifying checksums.)
074 * 
075 * When a DN needs to munlock a block, it needs to first wait for the block to
076 * be unanchored by clients doing a no-checksum read or a zero-copy read. The 
077 * DN also marks the block's slots as "unanchorable" to prevent additional 
078 * clients from initiating these operations in the future.
079 * 
080 * The counterpart of this class on the client is {@link DfsClientShmManager}.
081 */
082public class ShortCircuitRegistry {
083  public static final Log LOG = LogFactory.getLog(ShortCircuitRegistry.class);
084
085  private static final int SHM_LENGTH = 8192;
086
087  public static class RegisteredShm extends ShortCircuitShm
088      implements DomainSocketWatcher.Handler {
089    private final String clientName;
090    private final ShortCircuitRegistry registry;
091
092    RegisteredShm(String clientName, ShmId shmId, FileInputStream stream,
093        ShortCircuitRegistry registry) throws IOException {
094      super(shmId, stream);
095      this.clientName = clientName;
096      this.registry = registry;
097    }
098
099    @Override
100    public boolean handle(DomainSocket sock) {
101      synchronized (registry) {
102        synchronized (this) {
103          registry.removeShm(this);
104        }
105      }
106      return true;
107    }
108
109    String getClientName() {
110      return clientName;
111    }
112  }
113
114  public synchronized void removeShm(ShortCircuitShm shm) {
115    if (LOG.isTraceEnabled()) {
116      LOG.debug("removing shm " + shm);
117    }
118    // Stop tracking the shmId.
119    RegisteredShm removedShm = segments.remove(shm.getShmId());
120    Preconditions.checkState(removedShm == shm,
121        "failed to remove " + shm.getShmId());
122    // Stop tracking the slots.
123    for (Iterator<Slot> iter = shm.slotIterator(); iter.hasNext(); ) {
124      Slot slot = iter.next();
125      boolean removed = slots.remove(slot.getBlockId(), slot);
126      Preconditions.checkState(removed);
127      slot.makeInvalid();
128    }
129    // De-allocate the memory map and close the shared file. 
130    shm.free();
131  }
132
133  /**
134   * Whether or not the registry is enabled.
135   */
136  private boolean enabled;
137
138  /**
139   * The factory which creates shared file descriptors.
140   */
141  private final SharedFileDescriptorFactory shmFactory;
142  
143  /**
144   * A watcher which sends out callbacks when the UNIX domain socket
145   * associated with a shared memory segment closes.
146   */
147  private final DomainSocketWatcher watcher;
148
149  private final HashMap<ShmId, RegisteredShm> segments =
150      new HashMap<ShmId, RegisteredShm>(0);
151  
152  private final HashMultimap<ExtendedBlockId, Slot> slots =
153      HashMultimap.create(0, 1);
154  
155  public ShortCircuitRegistry(Configuration conf) throws IOException {
156    boolean enabled = false;
157    SharedFileDescriptorFactory shmFactory = null;
158    DomainSocketWatcher watcher = null;
159    try {
160      int interruptCheck = conf.getInt(
161          DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS,
162          DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS_DEFAULT);
163      if (interruptCheck <= 0) {
164        throw new IOException(
165            DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS +
166            " was set to " + interruptCheck);
167      }
168      String[] shmPaths =
169          conf.getTrimmedStrings(DFS_DATANODE_SHARED_FILE_DESCRIPTOR_PATHS);
170      if (shmPaths.length == 0) {
171        shmPaths =
172            DFS_DATANODE_SHARED_FILE_DESCRIPTOR_PATHS_DEFAULT.split(",");
173      }
174      shmFactory = SharedFileDescriptorFactory.
175          create("HadoopShortCircuitShm_", shmPaths);
176      String dswLoadingFailure = DomainSocketWatcher.getLoadingFailureReason();
177      if (dswLoadingFailure != null) {
178        throw new IOException(dswLoadingFailure);
179      }
180      watcher = new DomainSocketWatcher(interruptCheck, "datanode");
181      enabled = true;
182      if (LOG.isDebugEnabled()) {
183        LOG.debug("created new ShortCircuitRegistry with interruptCheck=" +
184                  interruptCheck + ", shmPath=" + shmFactory.getPath());
185      }
186    } catch (IOException e) {
187      if (LOG.isDebugEnabled()) {
188        LOG.debug("Disabling ShortCircuitRegistry", e);
189      }
190    } finally {
191      this.enabled = enabled;
192      this.shmFactory = shmFactory;
193      this.watcher = watcher;
194    }
195  }
196
197  /**
198   * Process a block mlock event from the FsDatasetCache.
199   *
200   * @param blockId    The block that was mlocked.
201   */
202  public synchronized void processBlockMlockEvent(ExtendedBlockId blockId) {
203    if (!enabled) return;
204    Set<Slot> affectedSlots = slots.get(blockId);
205    for (Slot slot : affectedSlots) {
206      slot.makeAnchorable();
207    }
208  }
209
210  /**
211   * Mark any slots associated with this blockId as unanchorable.
212   *
213   * @param blockId        The block ID.
214   * @return               True if we should allow the munlock request.
215   */
216  public synchronized boolean processBlockMunlockRequest(
217      ExtendedBlockId blockId) {
218    if (!enabled) return true;
219    boolean allowMunlock = true;
220    Set<Slot> affectedSlots = slots.get(blockId);
221    for (Slot slot : affectedSlots) {
222      slot.makeUnanchorable();
223      if (slot.isAnchored()) {
224        allowMunlock = false;
225      }
226    }
227    return allowMunlock;
228  }
229
230  /**
231   * Invalidate any slot associated with a blockId that we are invalidating
232   * (deleting) from this DataNode.  When a slot is invalid, the DFSClient will
233   * not use the corresponding replica for new read or mmap operations (although
234   * existing, ongoing read or mmap operations will complete.)
235   *
236   * @param blockId        The block ID.
237   */
238  public synchronized void processBlockInvalidation(ExtendedBlockId blockId) {
239    if (!enabled) return;
240    final Set<Slot> affectedSlots = slots.get(blockId);
241    if (!affectedSlots.isEmpty()) {
242      final StringBuilder bld = new StringBuilder();
243      String prefix = "";
244      bld.append("Block ").append(blockId).append(" has been invalidated.  ").
245          append("Marking short-circuit slots as invalid: ");
246      for (Slot slot : affectedSlots) {
247        slot.makeInvalid();
248        bld.append(prefix).append(slot.toString());
249        prefix = ", ";
250      }
251      LOG.info(bld.toString());
252    }
253  }
254
255  public synchronized String getClientNames(ExtendedBlockId blockId) {
256    if (!enabled) return "";
257    final HashSet<String> clientNames = new HashSet<String>();
258    final Set<Slot> affectedSlots = slots.get(blockId);
259    for (Slot slot : affectedSlots) {
260      clientNames.add(((RegisteredShm)slot.getShm()).getClientName());
261    }
262    return Joiner.on(",").join(clientNames);
263  }
264
265  public static class NewShmInfo implements Closeable {
266    private final ShmId shmId;
267    private final FileInputStream stream;
268
269    NewShmInfo(ShmId shmId, FileInputStream stream) {
270      this.shmId = shmId;
271      this.stream = stream;
272    }
273
274    public ShmId getShmId() {
275      return shmId;
276    }
277
278    public FileInputStream getFileStream() {
279      return stream;
280    }
281
282    @Override
283    public void close() throws IOException {
284      stream.close();
285    }
286  }
287
288  /**
289   * Handle a DFSClient request to create a new memory segment.
290   *
291   * @param clientName    Client name as reported by the client.
292   * @param sock          The DomainSocket to associate with this memory
293   *                        segment.  When this socket is closed, or the
294   *                        other side writes anything to the socket, the
295   *                        segment will be closed.  This can happen at any
296   *                        time, including right after this function returns.
297   * @return              A NewShmInfo object.  The caller must close the
298   *                        NewShmInfo object once they are done with it.
299   * @throws IOException  If the new memory segment could not be created.
300   */
301  public NewShmInfo createNewMemorySegment(String clientName,
302      DomainSocket sock) throws IOException {
303    NewShmInfo info = null;
304    RegisteredShm shm = null;
305    ShmId shmId = null;
306    synchronized (this) {
307      if (!enabled) {
308        if (LOG.isTraceEnabled()) {
309          LOG.trace("createNewMemorySegment: ShortCircuitRegistry is " +
310              "not enabled.");
311        }
312        throw new UnsupportedOperationException();
313      }
314      FileInputStream fis = null;
315      try {
316        do {
317          shmId = ShmId.createRandom();
318        } while (segments.containsKey(shmId));
319        fis = shmFactory.createDescriptor(clientName, SHM_LENGTH);
320        shm = new RegisteredShm(clientName, shmId, fis, this);
321      } finally {
322        if (shm == null) {
323          IOUtils.closeQuietly(fis);
324        }
325      }
326      info = new NewShmInfo(shmId, fis);
327      segments.put(shmId, shm);
328    }
329    // Drop the registry lock to prevent deadlock.
330    // After this point, RegisteredShm#handle may be called at any time.
331    watcher.add(sock, shm);
332    if (LOG.isTraceEnabled()) {
333      LOG.trace("createNewMemorySegment: created " + info.shmId);
334    }
335    return info;
336  }
337  
338  public synchronized void registerSlot(ExtendedBlockId blockId, SlotId slotId,
339      boolean isCached) throws InvalidRequestException {
340    if (!enabled) {
341      if (LOG.isTraceEnabled()) {
342        LOG.trace(this + " can't register a slot because the " +
343            "ShortCircuitRegistry is not enabled.");
344      }
345      throw new UnsupportedOperationException();
346    }
347    ShmId shmId = slotId.getShmId();
348    RegisteredShm shm = segments.get(shmId);
349    if (shm == null) {
350      throw new InvalidRequestException("there is no shared memory segment " +
351          "registered with shmId " + shmId);
352    }
353    Slot slot = shm.registerSlot(slotId.getSlotIdx(), blockId);
354    if (isCached) {
355      slot.makeAnchorable();
356    } else {
357      slot.makeUnanchorable();
358    }
359    boolean added = slots.put(blockId, slot);
360    Preconditions.checkState(added);
361    if (LOG.isTraceEnabled()) {
362      LOG.trace(this + ": registered " + blockId + " with slot " +
363        slotId + " (isCached=" + isCached + ")");
364    }
365  }
366  
367  public synchronized void unregisterSlot(SlotId slotId)
368      throws InvalidRequestException {
369    if (!enabled) {
370      if (LOG.isTraceEnabled()) {
371        LOG.trace("unregisterSlot: ShortCircuitRegistry is " +
372            "not enabled.");
373      }
374      throw new UnsupportedOperationException();
375    }
376    ShmId shmId = slotId.getShmId();
377    RegisteredShm shm = segments.get(shmId);
378    if (shm == null) {
379      throw new InvalidRequestException("there is no shared memory segment " +
380          "registered with shmId " + shmId);
381    }
382    Slot slot = shm.getSlot(slotId.getSlotIdx());
383    slot.makeInvalid();
384    shm.unregisterSlot(slotId.getSlotIdx());
385    slots.remove(slot.getBlockId(), slot);
386  }
387  
388  public void shutdown() {
389    synchronized (this) {
390      if (!enabled) return;
391      enabled = false;
392    }
393    IOUtils.closeQuietly(watcher);
394  }
395
396  public static interface Visitor {
397    boolean accept(HashMap<ShmId, RegisteredShm> segments,
398                HashMultimap<ExtendedBlockId, Slot> slots);
399  }
400
401  @VisibleForTesting
402  public synchronized boolean visit(Visitor visitor) {
403    return visitor.accept(segments, slots);
404  }
405}