001 /**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License. You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018 package org.apache.hadoop.hdfs.server.datanode;
019
020 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_SHARED_FILE_DESCRIPTOR_PATHS;
021 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_SHARED_FILE_DESCRIPTOR_PATHS_DEFAULT;
022 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS;
023 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS_DEFAULT;
024
025 import java.io.Closeable;
026 import java.io.FileInputStream;
027 import java.io.IOException;
028 import java.util.Collections;
029 import java.util.HashMap;
030 import java.util.Iterator;
031 import java.util.Set;
032
033 import org.apache.commons.io.IOUtils;
034 import org.apache.commons.logging.Log;
035 import org.apache.commons.logging.LogFactory;
036 import org.apache.hadoop.conf.Configuration;
037 import org.apache.hadoop.fs.InvalidRequestException;
038 import org.apache.hadoop.hdfs.ExtendedBlockId;
039 import org.apache.hadoop.hdfs.ShortCircuitShm;
040 import org.apache.hadoop.hdfs.ShortCircuitShm.ShmId;
041 import org.apache.hadoop.hdfs.ShortCircuitShm.Slot;
042 import org.apache.hadoop.hdfs.ShortCircuitShm.SlotId;
043 import org.apache.hadoop.io.nativeio.NativeIO;
044 import org.apache.hadoop.io.nativeio.SharedFileDescriptorFactory;
045 import org.apache.hadoop.net.unix.DomainSocket;
046 import org.apache.hadoop.net.unix.DomainSocketWatcher;
047
048 import com.google.common.base.Preconditions;
049 import com.google.common.base.Splitter;
050 import com.google.common.collect.HashMultimap;
051 import com.google.common.collect.Iterables;
052
053 /*
054 * Manages client short-circuit memory segments on the DataNode.
055 *
056 * DFSClients request shared memory segments from the DataNode. The
057 * ShortCircuitRegistry generates and manages these segments. Each segment
058 * has a randomly generated 128-bit ID which uniquely identifies it. The
059 * segments each contain several "slots."
060 *
061 * Before performing a short-circuit read, DFSClients must request a pair of
062 * file descriptors from the DataNode via the REQUEST_SHORT_CIRCUIT_FDS
063 * operation. As part of this operation, DFSClients pass the ID of the shared
064 * memory segment they would like to use to communicate information about this
065 * replica, as well as the slot number within that segment they would like to
066 * use. Slot allocation is always done by the client.
067 *
068 * Slots are used to track the state of the block on both the client and
069 * datanode. When this DataNode mlocks a block, the corresponding slots for the
070 * replicas are marked as "anchorable". Anchorable blocks can be safely read
071 * without verifying the checksum. This means that BlockReaderLocal objects
072 * using these replicas can skip checksumming. It also means that we can do
073 * zero-copy reads on these replicas (the ZCR interface has no way of
074 * verifying checksums.)
075 *
076 * When a DN needs to munlock a block, it needs to first wait for the block to
077 * be unanchored by clients doing a no-checksum read or a zero-copy read. The
078 * DN also marks the block's slots as "unanchorable" to prevent additional
079 * clients from initiating these operations in the future.
080 *
081 * The counterpart of this class on the client is {@link DfsClientShmManager}.
082 */
083 public class ShortCircuitRegistry {
084 public static final Log LOG = LogFactory.getLog(ShortCircuitRegistry.class);
085
086 private static final int SHM_LENGTH = 8192;
087
088 private static class RegisteredShm extends ShortCircuitShm
089 implements DomainSocketWatcher.Handler {
090 private final ShortCircuitRegistry registry;
091
092 RegisteredShm(ShmId shmId, FileInputStream stream,
093 ShortCircuitRegistry registry) throws IOException {
094 super(shmId, stream);
095 this.registry = registry;
096 }
097
098 @Override
099 public boolean handle(DomainSocket sock) {
100 synchronized (registry) {
101 synchronized (this) {
102 registry.removeShm(this);
103 }
104 }
105 return true;
106 }
107 }
108
109 public synchronized void removeShm(ShortCircuitShm shm) {
110 if (LOG.isTraceEnabled()) {
111 LOG.debug("removing shm " + shm);
112 }
113 // Stop tracking the shmId.
114 RegisteredShm removedShm = segments.remove(shm.getShmId());
115 Preconditions.checkState(removedShm == shm,
116 "failed to remove " + shm.getShmId());
117 // Stop tracking the slots.
118 for (Iterator<Slot> iter = shm.slotIterator(); iter.hasNext(); ) {
119 Slot slot = iter.next();
120 boolean removed = slots.remove(slot.getBlockId(), slot);
121 Preconditions.checkState(removed);
122 slot.makeInvalid();
123 }
124 // De-allocate the memory map and close the shared file.
125 shm.free();
126 }
127
128 /**
129 * Whether or not the registry is enabled.
130 */
131 private boolean enabled;
132
133 /**
134 * The factory which creates shared file descriptors.
135 */
136 private final SharedFileDescriptorFactory shmFactory;
137
138 /**
139 * A watcher which sends out callbacks when the UNIX domain socket
140 * associated with a shared memory segment closes.
141 */
142 private final DomainSocketWatcher watcher;
143
144 private final HashMap<ShmId, RegisteredShm> segments =
145 new HashMap<ShmId, RegisteredShm>(0);
146
147 private final HashMultimap<ExtendedBlockId, Slot> slots =
148 HashMultimap.create(0, 1);
149
150 public ShortCircuitRegistry(Configuration conf) throws IOException {
151 boolean enabled = false;
152 SharedFileDescriptorFactory shmFactory = null;
153 DomainSocketWatcher watcher = null;
154 try {
155 int interruptCheck = conf.getInt(
156 DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS,
157 DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS_DEFAULT);
158 if (interruptCheck <= 0) {
159 throw new IOException(
160 DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS +
161 " was set to " + interruptCheck);
162 }
163 String shmPaths[] =
164 conf.getTrimmedStrings(DFS_DATANODE_SHARED_FILE_DESCRIPTOR_PATHS);
165 if (shmPaths.length == 0) {
166 shmPaths =
167 DFS_DATANODE_SHARED_FILE_DESCRIPTOR_PATHS_DEFAULT.split(",");
168 }
169 shmFactory = SharedFileDescriptorFactory.
170 create("HadoopShortCircuitShm_", shmPaths);
171 String dswLoadingFailure = DomainSocketWatcher.getLoadingFailureReason();
172 if (dswLoadingFailure != null) {
173 throw new IOException(dswLoadingFailure);
174 }
175 watcher = new DomainSocketWatcher(interruptCheck);
176 enabled = true;
177 if (LOG.isDebugEnabled()) {
178 LOG.debug("created new ShortCircuitRegistry with interruptCheck=" +
179 interruptCheck + ", shmPath=" + shmFactory.getPath());
180 }
181 } catch (IOException e) {
182 if (LOG.isDebugEnabled()) {
183 LOG.debug("Disabling ShortCircuitRegistry", e);
184 }
185 } finally {
186 this.enabled = enabled;
187 this.shmFactory = shmFactory;
188 this.watcher = watcher;
189 }
190 }
191
192 /**
193 * Process a block mlock event from the FsDatasetCache.
194 *
195 * @param blockId The block that was mlocked.
196 */
197 public synchronized void processBlockMlockEvent(ExtendedBlockId blockId) {
198 if (!enabled) return;
199 Set<Slot> affectedSlots = slots.get(blockId);
200 for (Slot slot : affectedSlots) {
201 slot.makeAnchorable();
202 }
203 }
204
205 /**
206 * Mark any slots associated with this blockId as unanchorable.
207 *
208 * @param blockId The block ID.
209 * @return True if we should allow the munlock request.
210 */
211 public synchronized boolean processBlockMunlockRequest(
212 ExtendedBlockId blockId) {
213 if (!enabled) return true;
214 boolean allowMunlock = true;
215 Set<Slot> affectedSlots = slots.get(blockId);
216 for (Slot slot : affectedSlots) {
217 slot.makeUnanchorable();
218 if (slot.isAnchored()) {
219 allowMunlock = false;
220 }
221 }
222 return allowMunlock;
223 }
224
225 public static class NewShmInfo implements Closeable {
226 public final ShmId shmId;
227 public final FileInputStream stream;
228
229 NewShmInfo(ShmId shmId, FileInputStream stream) {
230 this.shmId = shmId;
231 this.stream = stream;
232 }
233
234 @Override
235 public void close() throws IOException {
236 stream.close();
237 }
238 }
239
240 /**
241 * Handle a DFSClient request to create a new memory segment.
242 *
243 * @param clientName Client name as reported by the client.
244 * @param sock The DomainSocket to associate with this memory
245 * segment. When this socket is closed, or the
246 * other side writes anything to the socket, the
247 * segment will be closed. This can happen at any
248 * time, including right after this function returns.
249 * @return A NewShmInfo object. The caller must close the
250 * NewShmInfo object once they are done with it.
251 * @throws IOException If the new memory segment could not be created.
252 */
253 public NewShmInfo createNewMemorySegment(String clientName,
254 DomainSocket sock) throws IOException {
255 NewShmInfo info = null;
256 RegisteredShm shm = null;
257 ShmId shmId = null;
258 synchronized (this) {
259 if (!enabled) {
260 if (LOG.isTraceEnabled()) {
261 LOG.trace("createNewMemorySegment: ShortCircuitRegistry is " +
262 "not enabled.");
263 }
264 throw new UnsupportedOperationException();
265 }
266 FileInputStream fis = null;
267 try {
268 do {
269 shmId = ShmId.createRandom();
270 } while (segments.containsKey(shmId));
271 fis = shmFactory.createDescriptor(clientName, SHM_LENGTH);
272 shm = new RegisteredShm(shmId, fis, this);
273 } finally {
274 if (shm == null) {
275 IOUtils.closeQuietly(fis);
276 }
277 }
278 info = new NewShmInfo(shmId, fis);
279 segments.put(shmId, shm);
280 }
281 // Drop the registry lock to prevent deadlock.
282 // After this point, RegisteredShm#handle may be called at any time.
283 watcher.add(sock, shm);
284 if (LOG.isTraceEnabled()) {
285 LOG.trace("createNewMemorySegment: created " + info.shmId);
286 }
287 return info;
288 }
289
290 public synchronized void registerSlot(ExtendedBlockId blockId, SlotId slotId,
291 boolean isCached) throws InvalidRequestException {
292 if (!enabled) {
293 if (LOG.isTraceEnabled()) {
294 LOG.trace(this + " can't register a slot because the " +
295 "ShortCircuitRegistry is not enabled.");
296 }
297 throw new UnsupportedOperationException();
298 }
299 ShmId shmId = slotId.getShmId();
300 RegisteredShm shm = segments.get(shmId);
301 if (shm == null) {
302 throw new InvalidRequestException("there is no shared memory segment " +
303 "registered with shmId " + shmId);
304 }
305 Slot slot = shm.registerSlot(slotId.getSlotIdx(), blockId);
306 if (isCached) {
307 slot.makeAnchorable();
308 } else {
309 slot.makeUnanchorable();
310 }
311 boolean added = slots.put(blockId, slot);
312 Preconditions.checkState(added);
313 if (LOG.isTraceEnabled()) {
314 LOG.trace(this + ": registered " + blockId + " with slot " +
315 slotId + " (isCached=" + isCached + ")");
316 }
317 }
318
319 public synchronized void unregisterSlot(SlotId slotId)
320 throws InvalidRequestException {
321 if (!enabled) {
322 if (LOG.isTraceEnabled()) {
323 LOG.trace("unregisterSlot: ShortCircuitRegistry is " +
324 "not enabled.");
325 }
326 throw new UnsupportedOperationException();
327 }
328 ShmId shmId = slotId.getShmId();
329 RegisteredShm shm = segments.get(shmId);
330 if (shm == null) {
331 throw new InvalidRequestException("there is no shared memory segment " +
332 "registered with shmId " + shmId);
333 }
334 Slot slot = shm.getSlot(slotId.getSlotIdx());
335 slot.makeInvalid();
336 shm.unregisterSlot(slotId.getSlotIdx());
337 slots.remove(slot.getBlockId(), slot);
338 }
339
340 public void shutdown() {
341 synchronized (this) {
342 if (!enabled) return;
343 enabled = false;
344 }
345 IOUtils.closeQuietly(watcher);
346 }
347 }