001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    package org.apache.hadoop.hdfs.server.datanode;
019    
020    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_SHARED_FILE_DESCRIPTOR_PATHS;
021    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_SHARED_FILE_DESCRIPTOR_PATHS_DEFAULT;
022    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS;
023    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS_DEFAULT;
024    
025    import java.io.Closeable;
026    import java.io.FileInputStream;
027    import java.io.IOException;
028    import java.util.HashMap;
029    import java.util.HashSet;
030    import java.util.Iterator;
031    import java.util.Set;
032    
033    import org.apache.commons.io.IOUtils;
034    import org.apache.commons.logging.Log;
035    import org.apache.commons.logging.LogFactory;
036    import org.apache.hadoop.conf.Configuration;
037    import org.apache.hadoop.fs.InvalidRequestException;
038    import org.apache.hadoop.hdfs.ExtendedBlockId;
039    import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm;
040    import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm.ShmId;
041    import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm.Slot;
042    import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm.SlotId;
043    import org.apache.hadoop.io.nativeio.SharedFileDescriptorFactory;
044    import org.apache.hadoop.net.unix.DomainSocket;
045    import org.apache.hadoop.net.unix.DomainSocketWatcher;
046    
047    import com.google.common.base.Joiner;
048    import com.google.common.base.Preconditions;
049    import com.google.common.collect.HashMultimap;
050    
051    /*
052     * Manages client short-circuit memory segments on the DataNode.
053     *
054     * DFSClients request shared memory segments from the DataNode.  The 
055     * ShortCircuitRegistry generates and manages these segments.  Each segment
056     * has a randomly generated 128-bit ID which uniquely identifies it.  The
057     * segments each contain several "slots."
058     *
059     * Before performing a short-circuit read, DFSClients must request a pair of
060     * file descriptors from the DataNode via the REQUEST_SHORT_CIRCUIT_FDS
061     * operation.  As part of this operation, DFSClients pass the ID of the shared
062     * memory segment they would like to use to communicate information about this
063     * replica, as well as the slot number within that segment they would like to
064     * use.  Slot allocation is always done by the client.
065     *
066     * Slots are used to track the state of the block on the both the client and
067     * datanode. When this DataNode mlocks a block, the corresponding slots for the
068     * replicas are marked as "anchorable".  Anchorable blocks can be safely read
069     * without verifying the checksum.  This means that BlockReaderLocal objects
070     * using these replicas can skip checksumming.  It also means that we can do
071     * zero-copy reads on these replicas (the ZCR interface has no way of
072     * verifying checksums.)
073     * 
074     * When a DN needs to munlock a block, it needs to first wait for the block to
075     * be unanchored by clients doing a no-checksum read or a zero-copy read. The 
076     * DN also marks the block's slots as "unanchorable" to prevent additional 
077     * clients from initiating these operations in the future.
078     * 
079     * The counterpart of this class on the client is {@link DfsClientShmManager}.
080     */
081    public class ShortCircuitRegistry {
082      public static final Log LOG = LogFactory.getLog(ShortCircuitRegistry.class);
083    
084      private static final int SHM_LENGTH = 8192;
085    
086      private static class RegisteredShm extends ShortCircuitShm
087          implements DomainSocketWatcher.Handler {
088        private final String clientName;
089        private final ShortCircuitRegistry registry;
090    
091        RegisteredShm(String clientName, ShmId shmId, FileInputStream stream,
092            ShortCircuitRegistry registry) throws IOException {
093          super(shmId, stream);
094          this.clientName = clientName;
095          this.registry = registry;
096        }
097    
098        @Override
099        public boolean handle(DomainSocket sock) {
100          synchronized (registry) {
101            synchronized (this) {
102              registry.removeShm(this);
103            }
104          }
105          return true;
106        }
107    
108        String getClientName() {
109          return clientName;
110        }
111      }
112    
113      public synchronized void removeShm(ShortCircuitShm shm) {
114        if (LOG.isTraceEnabled()) {
115          LOG.debug("removing shm " + shm);
116        }
117        // Stop tracking the shmId.
118        RegisteredShm removedShm = segments.remove(shm.getShmId());
119        Preconditions.checkState(removedShm == shm,
120            "failed to remove " + shm.getShmId());
121        // Stop tracking the slots.
122        for (Iterator<Slot> iter = shm.slotIterator(); iter.hasNext(); ) {
123          Slot slot = iter.next();
124          boolean removed = slots.remove(slot.getBlockId(), slot);
125          Preconditions.checkState(removed);
126          slot.makeInvalid();
127        }
128        // De-allocate the memory map and close the shared file. 
129        shm.free();
130      }
131    
132      /**
133       * Whether or not the registry is enabled.
134       */
135      private boolean enabled;
136    
137      /**
138       * The factory which creates shared file descriptors.
139       */
140      private final SharedFileDescriptorFactory shmFactory;
141      
142      /**
143       * A watcher which sends out callbacks when the UNIX domain socket
144       * associated with a shared memory segment closes.
145       */
146      private final DomainSocketWatcher watcher;
147    
148      private final HashMap<ShmId, RegisteredShm> segments =
149          new HashMap<ShmId, RegisteredShm>(0);
150      
151      private final HashMultimap<ExtendedBlockId, Slot> slots =
152          HashMultimap.create(0, 1);
153      
154      public ShortCircuitRegistry(Configuration conf) throws IOException {
155        boolean enabled = false;
156        SharedFileDescriptorFactory shmFactory = null;
157        DomainSocketWatcher watcher = null;
158        try {
159          int interruptCheck = conf.getInt(
160              DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS,
161              DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS_DEFAULT);
162          if (interruptCheck <= 0) {
163            throw new IOException(
164                DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS +
165                " was set to " + interruptCheck);
166          }
167          String shmPaths[] =
168              conf.getTrimmedStrings(DFS_DATANODE_SHARED_FILE_DESCRIPTOR_PATHS);
169          if (shmPaths.length == 0) {
170            shmPaths =
171                DFS_DATANODE_SHARED_FILE_DESCRIPTOR_PATHS_DEFAULT.split(",");
172          }
173          shmFactory = SharedFileDescriptorFactory.
174              create("HadoopShortCircuitShm_", shmPaths);
175          String dswLoadingFailure = DomainSocketWatcher.getLoadingFailureReason();
176          if (dswLoadingFailure != null) {
177            throw new IOException(dswLoadingFailure);
178          }
179          watcher = new DomainSocketWatcher(interruptCheck);
180          enabled = true;
181          if (LOG.isDebugEnabled()) {
182            LOG.debug("created new ShortCircuitRegistry with interruptCheck=" +
183                      interruptCheck + ", shmPath=" + shmFactory.getPath());
184          }
185        } catch (IOException e) {
186          if (LOG.isDebugEnabled()) {
187            LOG.debug("Disabling ShortCircuitRegistry", e);
188          }
189        } finally {
190          this.enabled = enabled;
191          this.shmFactory = shmFactory;
192          this.watcher = watcher;
193        }
194      }
195    
196      /**
197       * Process a block mlock event from the FsDatasetCache.
198       *
199       * @param blockId    The block that was mlocked.
200       */
201      public synchronized void processBlockMlockEvent(ExtendedBlockId blockId) {
202        if (!enabled) return;
203        Set<Slot> affectedSlots = slots.get(blockId);
204        for (Slot slot : affectedSlots) {
205          slot.makeAnchorable();
206        }
207      }
208    
209      /**
210       * Mark any slots associated with this blockId as unanchorable.
211       *
212       * @param blockId        The block ID.
213       * @return               True if we should allow the munlock request.
214       */
215      public synchronized boolean processBlockMunlockRequest(
216          ExtendedBlockId blockId) {
217        if (!enabled) return true;
218        boolean allowMunlock = true;
219        Set<Slot> affectedSlots = slots.get(blockId);
220        for (Slot slot : affectedSlots) {
221          slot.makeUnanchorable();
222          if (slot.isAnchored()) {
223            allowMunlock = false;
224          }
225        }
226        return allowMunlock;
227      }
228    
229      /**
230       * Invalidate any slot associated with a blockId that we are invalidating
231       * (deleting) from this DataNode.  When a slot is invalid, the DFSClient will
232       * not use the corresponding replica for new read or mmap operations (although
233       * existing, ongoing read or mmap operations will complete.)
234       *
235       * @param blockId        The block ID.
236       */
237      public synchronized void processBlockInvalidation(ExtendedBlockId blockId) {
238        if (!enabled) return;
239        final Set<Slot> affectedSlots = slots.get(blockId);
240        if (!affectedSlots.isEmpty()) {
241          final StringBuilder bld = new StringBuilder();
242          String prefix = "";
243          bld.append("Block ").append(blockId).append(" has been invalidated.  ").
244              append("Marking short-circuit slots as invalid: ");
245          for (Slot slot : affectedSlots) {
246            slot.makeInvalid();
247            bld.append(prefix).append(slot.toString());
248            prefix = ", ";
249          }
250          LOG.info(bld.toString());
251        }
252      }
253    
254      public synchronized String getClientNames(ExtendedBlockId blockId) {
255        if (!enabled) return "";
256        final HashSet<String> clientNames = new HashSet<String>();
257        final Set<Slot> affectedSlots = slots.get(blockId);
258        for (Slot slot : affectedSlots) {
259          clientNames.add(((RegisteredShm)slot.getShm()).getClientName());
260        }
261        return Joiner.on(",").join(clientNames);
262      }
263    
264      public static class NewShmInfo implements Closeable {
265        public final ShmId shmId;
266        public final FileInputStream stream;
267    
268        NewShmInfo(ShmId shmId, FileInputStream stream) {
269          this.shmId = shmId;
270          this.stream = stream;
271        }
272    
273        @Override
274        public void close() throws IOException {
275          stream.close();
276        }
277      }
278    
279      /**
280       * Handle a DFSClient request to create a new memory segment.
281       *
282       * @param clientName    Client name as reported by the client.
283       * @param sock          The DomainSocket to associate with this memory
284       *                        segment.  When this socket is closed, or the
285       *                        other side writes anything to the socket, the
286       *                        segment will be closed.  This can happen at any
287       *                        time, including right after this function returns.
288       * @return              A NewShmInfo object.  The caller must close the
289       *                        NewShmInfo object once they are done with it.
290       * @throws IOException  If the new memory segment could not be created.
291       */
292      public NewShmInfo createNewMemorySegment(String clientName,
293          DomainSocket sock) throws IOException {
294        NewShmInfo info = null;
295        RegisteredShm shm = null;
296        ShmId shmId = null;
297        synchronized (this) {
298          if (!enabled) {
299            if (LOG.isTraceEnabled()) {
300              LOG.trace("createNewMemorySegment: ShortCircuitRegistry is " +
301                  "not enabled.");
302            }
303            throw new UnsupportedOperationException();
304          }
305          FileInputStream fis = null;
306          try {
307            do {
308              shmId = ShmId.createRandom();
309            } while (segments.containsKey(shmId));
310            fis = shmFactory.createDescriptor(clientName, SHM_LENGTH);
311            shm = new RegisteredShm(clientName, shmId, fis, this);
312          } finally {
313            if (shm == null) {
314              IOUtils.closeQuietly(fis);
315            }
316          }
317          info = new NewShmInfo(shmId, fis);
318          segments.put(shmId, shm);
319        }
320        // Drop the registry lock to prevent deadlock.
321        // After this point, RegisteredShm#handle may be called at any time.
322        watcher.add(sock, shm);
323        if (LOG.isTraceEnabled()) {
324          LOG.trace("createNewMemorySegment: created " + info.shmId);
325        }
326        return info;
327      }
328      
329      public synchronized void registerSlot(ExtendedBlockId blockId, SlotId slotId,
330          boolean isCached) throws InvalidRequestException {
331        if (!enabled) {
332          if (LOG.isTraceEnabled()) {
333            LOG.trace(this + " can't register a slot because the " +
334                "ShortCircuitRegistry is not enabled.");
335          }
336          throw new UnsupportedOperationException();
337        }
338        ShmId shmId = slotId.getShmId();
339        RegisteredShm shm = segments.get(shmId);
340        if (shm == null) {
341          throw new InvalidRequestException("there is no shared memory segment " +
342              "registered with shmId " + shmId);
343        }
344        Slot slot = shm.registerSlot(slotId.getSlotIdx(), blockId);
345        if (isCached) {
346          slot.makeAnchorable();
347        } else {
348          slot.makeUnanchorable();
349        }
350        boolean added = slots.put(blockId, slot);
351        Preconditions.checkState(added);
352        if (LOG.isTraceEnabled()) {
353          LOG.trace(this + ": registered " + blockId + " with slot " +
354            slotId + " (isCached=" + isCached + ")");
355        }
356      }
357      
358      public synchronized void unregisterSlot(SlotId slotId)
359          throws InvalidRequestException {
360        if (!enabled) {
361          if (LOG.isTraceEnabled()) {
362            LOG.trace("unregisterSlot: ShortCircuitRegistry is " +
363                "not enabled.");
364          }
365          throw new UnsupportedOperationException();
366        }
367        ShmId shmId = slotId.getShmId();
368        RegisteredShm shm = segments.get(shmId);
369        if (shm == null) {
370          throw new InvalidRequestException("there is no shared memory segment " +
371              "registered with shmId " + shmId);
372        }
373        Slot slot = shm.getSlot(slotId.getSlotIdx());
374        slot.makeInvalid();
375        shm.unregisterSlot(slotId.getSlotIdx());
376        slots.remove(slot.getBlockId(), slot);
377      }
378      
379      public void shutdown() {
380        synchronized (this) {
381          if (!enabled) return;
382          enabled = false;
383        }
384        IOUtils.closeQuietly(watcher);
385      }
386    }