001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    package org.apache.hadoop.hdfs.server.datanode;
019    
020    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_SHARED_FILE_DESCRIPTOR_PATHS;
021    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_SHARED_FILE_DESCRIPTOR_PATHS_DEFAULT;
022    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS;
023    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS_DEFAULT;
024    
025    import java.io.Closeable;
026    import java.io.FileInputStream;
027    import java.io.IOException;
028    import java.util.Collections;
029    import java.util.HashMap;
030    import java.util.Iterator;
031    import java.util.Set;
032    
033    import org.apache.commons.io.IOUtils;
034    import org.apache.commons.logging.Log;
035    import org.apache.commons.logging.LogFactory;
036    import org.apache.hadoop.conf.Configuration;
037    import org.apache.hadoop.fs.InvalidRequestException;
038    import org.apache.hadoop.hdfs.ExtendedBlockId;
039    import org.apache.hadoop.hdfs.ShortCircuitShm;
040    import org.apache.hadoop.hdfs.ShortCircuitShm.ShmId;
041    import org.apache.hadoop.hdfs.ShortCircuitShm.Slot;
042    import org.apache.hadoop.hdfs.ShortCircuitShm.SlotId;
043    import org.apache.hadoop.io.nativeio.NativeIO;
044    import org.apache.hadoop.io.nativeio.SharedFileDescriptorFactory;
045    import org.apache.hadoop.net.unix.DomainSocket;
046    import org.apache.hadoop.net.unix.DomainSocketWatcher;
047    
048    import com.google.common.base.Preconditions;
049    import com.google.common.base.Splitter;
050    import com.google.common.collect.HashMultimap;
051    import com.google.common.collect.Iterables;
052    
053    /*
054     * Manages client short-circuit memory segments on the DataNode.
055     *
056     * DFSClients request shared memory segments from the DataNode.  The 
057     * ShortCircuitRegistry generates and manages these segments.  Each segment
058     * has a randomly generated 128-bit ID which uniquely identifies it.  The
059     * segments each contain several "slots."
060     *
061     * Before performing a short-circuit read, DFSClients must request a pair of
062     * file descriptors from the DataNode via the REQUEST_SHORT_CIRCUIT_FDS
063     * operation.  As part of this operation, DFSClients pass the ID of the shared
064     * memory segment they would like to use to communicate information about this
065     * replica, as well as the slot number within that segment they would like to
066     * use.  Slot allocation is always done by the client.
067     *
068     * Slots are used to track the state of the block on both the client and
069     * datanode. When this DataNode mlocks a block, the corresponding slots for the
070     * replicas are marked as "anchorable".  Anchorable blocks can be safely read
071     * without verifying the checksum.  This means that BlockReaderLocal objects
072     * using these replicas can skip checksumming.  It also means that we can do
073     * zero-copy reads on these replicas (the ZCR interface has no way of
074     * verifying checksums.)
075     * 
076     * When a DN needs to munlock a block, it needs to first wait for the block to
077     * be unanchored by clients doing a no-checksum read or a zero-copy read. The 
078     * DN also marks the block's slots as "unanchorable" to prevent additional 
079     * clients from initiating these operations in the future.
080     * 
081     * The counterpart of this class on the client is {@link DfsClientShmManager}.
082     */
083    public class ShortCircuitRegistry {
084      public static final Log LOG = LogFactory.getLog(ShortCircuitRegistry.class);
085    
086      private static final int SHM_LENGTH = 8192;
087    
088      private static class RegisteredShm extends ShortCircuitShm
089          implements DomainSocketWatcher.Handler {
090        private final ShortCircuitRegistry registry;
091    
092        RegisteredShm(ShmId shmId, FileInputStream stream,
093            ShortCircuitRegistry registry) throws IOException {
094          super(shmId, stream);
095          this.registry = registry;
096        }
097    
098        @Override
099        public boolean handle(DomainSocket sock) {
100          synchronized (registry) {
101            synchronized (this) {
102              registry.removeShm(this);
103            }
104          }
105          return true;
106        }
107      }
108    
109      public synchronized void removeShm(ShortCircuitShm shm) {
110        if (LOG.isTraceEnabled()) {
111          LOG.debug("removing shm " + shm);
112        }
113        // Stop tracking the shmId.
114        RegisteredShm removedShm = segments.remove(shm.getShmId());
115        Preconditions.checkState(removedShm == shm,
116            "failed to remove " + shm.getShmId());
117        // Stop tracking the slots.
118        for (Iterator<Slot> iter = shm.slotIterator(); iter.hasNext(); ) {
119          Slot slot = iter.next();
120          boolean removed = slots.remove(slot.getBlockId(), slot);
121          Preconditions.checkState(removed);
122          slot.makeInvalid();
123        }
124        // De-allocate the memory map and close the shared file. 
125        shm.free();
126      }
127    
128      /**
129       * Whether or not the registry is enabled.
130       */
131      private boolean enabled;
132    
133      /**
134       * The factory which creates shared file descriptors.
135       */
136      private final SharedFileDescriptorFactory shmFactory;
137      
138      /**
139       * A watcher which sends out callbacks when the UNIX domain socket
140       * associated with a shared memory segment closes.
141       */
142      private final DomainSocketWatcher watcher;
143    
144      private final HashMap<ShmId, RegisteredShm> segments =
145          new HashMap<ShmId, RegisteredShm>(0);
146      
147      private final HashMultimap<ExtendedBlockId, Slot> slots =
148          HashMultimap.create(0, 1);
149      
150      public ShortCircuitRegistry(Configuration conf) throws IOException {
151        boolean enabled = false;
152        SharedFileDescriptorFactory shmFactory = null;
153        DomainSocketWatcher watcher = null;
154        try {
155          int interruptCheck = conf.getInt(
156              DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS,
157              DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS_DEFAULT);
158          if (interruptCheck <= 0) {
159            throw new IOException(
160                DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS +
161                " was set to " + interruptCheck);
162          }
163          String shmPaths[] =
164              conf.getTrimmedStrings(DFS_DATANODE_SHARED_FILE_DESCRIPTOR_PATHS);
165          if (shmPaths.length == 0) {
166            shmPaths =
167                DFS_DATANODE_SHARED_FILE_DESCRIPTOR_PATHS_DEFAULT.split(",");
168          }
169          shmFactory = SharedFileDescriptorFactory.
170              create("HadoopShortCircuitShm_", shmPaths);
171          String dswLoadingFailure = DomainSocketWatcher.getLoadingFailureReason();
172          if (dswLoadingFailure != null) {
173            throw new IOException(dswLoadingFailure);
174          }
175          watcher = new DomainSocketWatcher(interruptCheck);
176          enabled = true;
177          if (LOG.isDebugEnabled()) {
178            LOG.debug("created new ShortCircuitRegistry with interruptCheck=" +
179                      interruptCheck + ", shmPath=" + shmFactory.getPath());
180          }
181        } catch (IOException e) {
182          if (LOG.isDebugEnabled()) {
183            LOG.debug("Disabling ShortCircuitRegistry", e);
184          }
185        } finally {
186          this.enabled = enabled;
187          this.shmFactory = shmFactory;
188          this.watcher = watcher;
189        }
190      }
191    
192      /**
193       * Process a block mlock event from the FsDatasetCache.
194       *
195       * @param blockId    The block that was mlocked.
196       */
197      public synchronized void processBlockMlockEvent(ExtendedBlockId blockId) {
198        if (!enabled) return;
199        Set<Slot> affectedSlots = slots.get(blockId);
200        for (Slot slot : affectedSlots) {
201          slot.makeAnchorable();
202        }
203      }
204    
205      /**
206       * Mark any slots associated with this blockId as unanchorable.
207       *
208       * @param blockId        The block ID.
209       * @return               True if we should allow the munlock request.
210       */
211      public synchronized boolean processBlockMunlockRequest(
212          ExtendedBlockId blockId) {
213        if (!enabled) return true;
214        boolean allowMunlock = true;
215        Set<Slot> affectedSlots = slots.get(blockId);
216        for (Slot slot : affectedSlots) {
217          slot.makeUnanchorable();
218          if (slot.isAnchored()) {
219            allowMunlock = false;
220          }
221        }
222        return allowMunlock;
223      }
224      
225      public static class NewShmInfo implements Closeable {
226        public final ShmId shmId;
227        public final FileInputStream stream;
228    
229        NewShmInfo(ShmId shmId, FileInputStream stream) {
230          this.shmId = shmId;
231          this.stream = stream;
232        }
233    
234        @Override
235        public void close() throws IOException {
236          stream.close();
237        }
238      }
239    
240      /**
241       * Handle a DFSClient request to create a new memory segment.
242       *
243       * @param clientName    Client name as reported by the client.
244       * @param sock          The DomainSocket to associate with this memory
245       *                        segment.  When this socket is closed, or the
246       *                        other side writes anything to the socket, the
247       *                        segment will be closed.  This can happen at any
248       *                        time, including right after this function returns.
249       * @return              A NewShmInfo object.  The caller must close the
250       *                        NewShmInfo object once they are done with it.
251       * @throws IOException  If the new memory segment could not be created.
252       */
253      public NewShmInfo createNewMemorySegment(String clientName,
254          DomainSocket sock) throws IOException {
255        NewShmInfo info = null;
256        RegisteredShm shm = null;
257        ShmId shmId = null;
258        synchronized (this) {
259          if (!enabled) {
260            if (LOG.isTraceEnabled()) {
261              LOG.trace("createNewMemorySegment: ShortCircuitRegistry is " +
262                  "not enabled.");
263            }
264            throw new UnsupportedOperationException();
265          }
266          FileInputStream fis = null;
267          try {
268            do {
269              shmId = ShmId.createRandom();
270            } while (segments.containsKey(shmId));
271            fis = shmFactory.createDescriptor(clientName, SHM_LENGTH);
272            shm = new RegisteredShm(shmId, fis, this);
273          } finally {
274            if (shm == null) {
275              IOUtils.closeQuietly(fis);
276            }
277          }
278          info = new NewShmInfo(shmId, fis);
279          segments.put(shmId, shm);
280        }
281        // Drop the registry lock to prevent deadlock.
282        // After this point, RegisteredShm#handle may be called at any time.
283        watcher.add(sock, shm);
284        if (LOG.isTraceEnabled()) {
285          LOG.trace("createNewMemorySegment: created " + info.shmId);
286        }
287        return info;
288      }
289      
290      public synchronized void registerSlot(ExtendedBlockId blockId, SlotId slotId,
291          boolean isCached) throws InvalidRequestException {
292        if (!enabled) {
293          if (LOG.isTraceEnabled()) {
294            LOG.trace(this + " can't register a slot because the " +
295                "ShortCircuitRegistry is not enabled.");
296          }
297          throw new UnsupportedOperationException();
298        }
299        ShmId shmId = slotId.getShmId();
300        RegisteredShm shm = segments.get(shmId);
301        if (shm == null) {
302          throw new InvalidRequestException("there is no shared memory segment " +
303              "registered with shmId " + shmId);
304        }
305        Slot slot = shm.registerSlot(slotId.getSlotIdx(), blockId);
306        if (isCached) {
307          slot.makeAnchorable();
308        } else {
309          slot.makeUnanchorable();
310        }
311        boolean added = slots.put(blockId, slot);
312        Preconditions.checkState(added);
313        if (LOG.isTraceEnabled()) {
314          LOG.trace(this + ": registered " + blockId + " with slot " +
315            slotId + " (isCached=" + isCached + ")");
316        }
317      }
318      
319      public synchronized void unregisterSlot(SlotId slotId)
320          throws InvalidRequestException {
321        if (!enabled) {
322          if (LOG.isTraceEnabled()) {
323            LOG.trace("unregisterSlot: ShortCircuitRegistry is " +
324                "not enabled.");
325          }
326          throw new UnsupportedOperationException();
327        }
328        ShmId shmId = slotId.getShmId();
329        RegisteredShm shm = segments.get(shmId);
330        if (shm == null) {
331          throw new InvalidRequestException("there is no shared memory segment " +
332              "registered with shmId " + shmId);
333        }
334        Slot slot = shm.getSlot(slotId.getSlotIdx());
335        slot.makeInvalid();
336        shm.unregisterSlot(slotId.getSlotIdx());
337        slots.remove(slot.getBlockId(), slot);
338      }
339      
340      public void shutdown() {
341        synchronized (this) {
342          if (!enabled) return;
343          enabled = false;
344        }
345        IOUtils.closeQuietly(watcher);
346      }
347    }