001 /**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License. You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018 package org.apache.hadoop.hdfs.server.datanode;
019
020 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_SHARED_FILE_DESCRIPTOR_PATHS;
021 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_SHARED_FILE_DESCRIPTOR_PATHS_DEFAULT;
022 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS;
023 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS_DEFAULT;
024
025 import java.io.Closeable;
026 import java.io.FileInputStream;
027 import java.io.IOException;
028 import java.util.HashMap;
029 import java.util.HashSet;
030 import java.util.Iterator;
031 import java.util.Set;
032
033 import org.apache.commons.io.IOUtils;
034 import org.apache.commons.logging.Log;
035 import org.apache.commons.logging.LogFactory;
036 import org.apache.hadoop.conf.Configuration;
037 import org.apache.hadoop.fs.InvalidRequestException;
038 import org.apache.hadoop.hdfs.ExtendedBlockId;
039 import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm;
040 import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm.ShmId;
041 import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm.Slot;
042 import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm.SlotId;
043 import org.apache.hadoop.io.nativeio.SharedFileDescriptorFactory;
044 import org.apache.hadoop.net.unix.DomainSocket;
045 import org.apache.hadoop.net.unix.DomainSocketWatcher;
046
047 import com.google.common.base.Joiner;
048 import com.google.common.base.Preconditions;
049 import com.google.common.collect.HashMultimap;
050
051 /*
052 * Manages client short-circuit memory segments on the DataNode.
053 *
054 * DFSClients request shared memory segments from the DataNode. The
055 * ShortCircuitRegistry generates and manages these segments. Each segment
056 * has a randomly generated 128-bit ID which uniquely identifies it. The
057 * segments each contain several "slots."
058 *
059 * Before performing a short-circuit read, DFSClients must request a pair of
060 * file descriptors from the DataNode via the REQUEST_SHORT_CIRCUIT_FDS
061 * operation. As part of this operation, DFSClients pass the ID of the shared
062 * memory segment they would like to use to communicate information about this
063 * replica, as well as the slot number within that segment they would like to
064 * use. Slot allocation is always done by the client.
065 *
066 * Slots are used to track the state of the block on both the client and
067 * datanode. When this DataNode mlocks a block, the corresponding slots for the
068 * replicas are marked as "anchorable". Anchorable blocks can be safely read
069 * without verifying the checksum. This means that BlockReaderLocal objects
070 * using these replicas can skip checksumming. It also means that we can do
071 * zero-copy reads on these replicas (the ZCR interface has no way of
072 * verifying checksums.)
073 *
074 * When a DN needs to munlock a block, it needs to first wait for the block to
075 * be unanchored by clients doing a no-checksum read or a zero-copy read. The
076 * DN also marks the block's slots as "unanchorable" to prevent additional
077 * clients from initiating these operations in the future.
078 *
079 * The counterpart of this class on the client is {@link DfsClientShmManager}.
080 */
081 public class ShortCircuitRegistry {
/** Logger for this class. */
082 public static final Log LOG = LogFactory.getLog(ShortCircuitRegistry.class);
083
/**
 * Length passed to the shared file descriptor factory for every segment that
 * createNewMemorySegment creates.
 * NOTE(review): presumably a size in bytes -- confirm against
 * SharedFileDescriptorFactory#createDescriptor.
 */
084 private static final int SHM_LENGTH = 8192;
085
086 private static class RegisteredShm extends ShortCircuitShm
087 implements DomainSocketWatcher.Handler {
088 private final String clientName;
089 private final ShortCircuitRegistry registry;
090
091 RegisteredShm(String clientName, ShmId shmId, FileInputStream stream,
092 ShortCircuitRegistry registry) throws IOException {
093 super(shmId, stream);
094 this.clientName = clientName;
095 this.registry = registry;
096 }
097
098 @Override
099 public boolean handle(DomainSocket sock) {
100 synchronized (registry) {
101 synchronized (this) {
102 registry.removeShm(this);
103 }
104 }
105 return true;
106 }
107
108 String getClientName() {
109 return clientName;
110 }
111 }
112
113 public synchronized void removeShm(ShortCircuitShm shm) {
114 if (LOG.isTraceEnabled()) {
115 LOG.debug("removing shm " + shm);
116 }
117 // Stop tracking the shmId.
118 RegisteredShm removedShm = segments.remove(shm.getShmId());
119 Preconditions.checkState(removedShm == shm,
120 "failed to remove " + shm.getShmId());
121 // Stop tracking the slots.
122 for (Iterator<Slot> iter = shm.slotIterator(); iter.hasNext(); ) {
123 Slot slot = iter.next();
124 boolean removed = slots.remove(slot.getBlockId(), slot);
125 Preconditions.checkState(removed);
126 slot.makeInvalid();
127 }
128 // De-allocate the memory map and close the shared file.
129 shm.free();
130 }
131
132 /**
133 * Whether or not the registry is enabled.
*
* Assigned by the constructor and cleared by shutdown(). While it is false
* the processBlock* methods and getClientNames return immediately, and
* createNewMemorySegment, registerSlot and unregisterSlot throw
* UnsupportedOperationException.
134 */
135 private boolean enabled;
136
137 /**
138 * The factory which creates shared file descriptors.
*
* May be null when the constructor failed before the factory was created
* (in which case the registry is disabled).
139 */
140 private final SharedFileDescriptorFactory shmFactory;
141
142 /**
143 * A watcher which sends out callbacks when the UNIX domain socket
144 * associated with a shared memory segment closes.
*
* Null when the constructor could not create it (registry disabled).
145 */
146 private final DomainSocketWatcher watcher;
147
/**
 * Registered segments keyed by their ShmId. Entries are added by
 * createNewMemorySegment and removed by removeShm. Every access visible in
 * this class happens while holding the registry's monitor.
 */
148 private final HashMap<ShmId, RegisteredShm> segments =
149 new HashMap<ShmId, RegisteredShm>(0);
150
/**
 * Registered slots grouped by the block they were registered for. Entries
 * are added by registerSlot and removed by unregisterSlot and removeShm.
 * Every access visible in this class happens while holding the registry's
 * monitor.
 */
151 private final HashMultimap<ExtendedBlockId, Slot> slots =
152 HashMultimap.create(0, 1);
153
154 public ShortCircuitRegistry(Configuration conf) throws IOException {
155 boolean enabled = false;
156 SharedFileDescriptorFactory shmFactory = null;
157 DomainSocketWatcher watcher = null;
158 try {
159 int interruptCheck = conf.getInt(
160 DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS,
161 DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS_DEFAULT);
162 if (interruptCheck <= 0) {
163 throw new IOException(
164 DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS +
165 " was set to " + interruptCheck);
166 }
167 String shmPaths[] =
168 conf.getTrimmedStrings(DFS_DATANODE_SHARED_FILE_DESCRIPTOR_PATHS);
169 if (shmPaths.length == 0) {
170 shmPaths =
171 DFS_DATANODE_SHARED_FILE_DESCRIPTOR_PATHS_DEFAULT.split(",");
172 }
173 shmFactory = SharedFileDescriptorFactory.
174 create("HadoopShortCircuitShm_", shmPaths);
175 String dswLoadingFailure = DomainSocketWatcher.getLoadingFailureReason();
176 if (dswLoadingFailure != null) {
177 throw new IOException(dswLoadingFailure);
178 }
179 watcher = new DomainSocketWatcher(interruptCheck);
180 enabled = true;
181 if (LOG.isDebugEnabled()) {
182 LOG.debug("created new ShortCircuitRegistry with interruptCheck=" +
183 interruptCheck + ", shmPath=" + shmFactory.getPath());
184 }
185 } catch (IOException e) {
186 if (LOG.isDebugEnabled()) {
187 LOG.debug("Disabling ShortCircuitRegistry", e);
188 }
189 } finally {
190 this.enabled = enabled;
191 this.shmFactory = shmFactory;
192 this.watcher = watcher;
193 }
194 }
195
196 /**
197 * Process a block mlock event from the FsDatasetCache.
198 *
199 * @param blockId The block that was mlocked.
200 */
201 public synchronized void processBlockMlockEvent(ExtendedBlockId blockId) {
202 if (!enabled) return;
203 Set<Slot> affectedSlots = slots.get(blockId);
204 for (Slot slot : affectedSlots) {
205 slot.makeAnchorable();
206 }
207 }
208
209 /**
210 * Mark any slots associated with this blockId as unanchorable.
211 *
212 * @param blockId The block ID.
213 * @return True if we should allow the munlock request.
214 */
215 public synchronized boolean processBlockMunlockRequest(
216 ExtendedBlockId blockId) {
217 if (!enabled) return true;
218 boolean allowMunlock = true;
219 Set<Slot> affectedSlots = slots.get(blockId);
220 for (Slot slot : affectedSlots) {
221 slot.makeUnanchorable();
222 if (slot.isAnchored()) {
223 allowMunlock = false;
224 }
225 }
226 return allowMunlock;
227 }
228
229 /**
230 * Invalidate any slot associated with a blockId that we are invalidating
231 * (deleting) from this DataNode. When a slot is invalid, the DFSClient will
232 * not use the corresponding replica for new read or mmap operations (although
233 * existing, ongoing read or mmap operations will complete.)
234 *
235 * @param blockId The block ID.
236 */
237 public synchronized void processBlockInvalidation(ExtendedBlockId blockId) {
238 if (!enabled) return;
239 final Set<Slot> affectedSlots = slots.get(blockId);
240 if (!affectedSlots.isEmpty()) {
241 final StringBuilder bld = new StringBuilder();
242 String prefix = "";
243 bld.append("Block ").append(blockId).append(" has been invalidated. ").
244 append("Marking short-circuit slots as invalid: ");
245 for (Slot slot : affectedSlots) {
246 slot.makeInvalid();
247 bld.append(prefix).append(slot.toString());
248 prefix = ", ";
249 }
250 LOG.info(bld.toString());
251 }
252 }
253
254 public synchronized String getClientNames(ExtendedBlockId blockId) {
255 if (!enabled) return "";
256 final HashSet<String> clientNames = new HashSet<String>();
257 final Set<Slot> affectedSlots = slots.get(blockId);
258 for (Slot slot : affectedSlots) {
259 clientNames.add(((RegisteredShm)slot.getShm()).getClientName());
260 }
261 return Joiner.on(",").join(clientNames);
262 }
263
264 public static class NewShmInfo implements Closeable {
265 public final ShmId shmId;
266 public final FileInputStream stream;
267
268 NewShmInfo(ShmId shmId, FileInputStream stream) {
269 this.shmId = shmId;
270 this.stream = stream;
271 }
272
273 @Override
274 public void close() throws IOException {
275 stream.close();
276 }
277 }
278
279 /**
280 * Handle a DFSClient request to create a new memory segment.
281 *
282 * @param clientName Client name as reported by the client.
283 * @param sock The DomainSocket to associate with this memory
284 * segment. When this socket is closed, or the
285 * other side writes anything to the socket, the
286 * segment will be closed. This can happen at any
287 * time, including right after this function returns.
288 * @return A NewShmInfo object. The caller must close the
289 * NewShmInfo object once they are done with it.
290 * @throws IOException If the new memory segment could not be created.
291 */
292 public NewShmInfo createNewMemorySegment(String clientName,
293 DomainSocket sock) throws IOException {
294 NewShmInfo info = null;
295 RegisteredShm shm = null;
296 ShmId shmId = null;
297 synchronized (this) {
298 if (!enabled) {
299 if (LOG.isTraceEnabled()) {
300 LOG.trace("createNewMemorySegment: ShortCircuitRegistry is " +
301 "not enabled.");
302 }
303 throw new UnsupportedOperationException();
304 }
305 FileInputStream fis = null;
306 try {
307 do {
308 shmId = ShmId.createRandom();
309 } while (segments.containsKey(shmId));
310 fis = shmFactory.createDescriptor(clientName, SHM_LENGTH);
311 shm = new RegisteredShm(clientName, shmId, fis, this);
312 } finally {
313 if (shm == null) {
314 IOUtils.closeQuietly(fis);
315 }
316 }
317 info = new NewShmInfo(shmId, fis);
318 segments.put(shmId, shm);
319 }
320 // Drop the registry lock to prevent deadlock.
321 // After this point, RegisteredShm#handle may be called at any time.
322 watcher.add(sock, shm);
323 if (LOG.isTraceEnabled()) {
324 LOG.trace("createNewMemorySegment: created " + info.shmId);
325 }
326 return info;
327 }
328
329 public synchronized void registerSlot(ExtendedBlockId blockId, SlotId slotId,
330 boolean isCached) throws InvalidRequestException {
331 if (!enabled) {
332 if (LOG.isTraceEnabled()) {
333 LOG.trace(this + " can't register a slot because the " +
334 "ShortCircuitRegistry is not enabled.");
335 }
336 throw new UnsupportedOperationException();
337 }
338 ShmId shmId = slotId.getShmId();
339 RegisteredShm shm = segments.get(shmId);
340 if (shm == null) {
341 throw new InvalidRequestException("there is no shared memory segment " +
342 "registered with shmId " + shmId);
343 }
344 Slot slot = shm.registerSlot(slotId.getSlotIdx(), blockId);
345 if (isCached) {
346 slot.makeAnchorable();
347 } else {
348 slot.makeUnanchorable();
349 }
350 boolean added = slots.put(blockId, slot);
351 Preconditions.checkState(added);
352 if (LOG.isTraceEnabled()) {
353 LOG.trace(this + ": registered " + blockId + " with slot " +
354 slotId + " (isCached=" + isCached + ")");
355 }
356 }
357
358 public synchronized void unregisterSlot(SlotId slotId)
359 throws InvalidRequestException {
360 if (!enabled) {
361 if (LOG.isTraceEnabled()) {
362 LOG.trace("unregisterSlot: ShortCircuitRegistry is " +
363 "not enabled.");
364 }
365 throw new UnsupportedOperationException();
366 }
367 ShmId shmId = slotId.getShmId();
368 RegisteredShm shm = segments.get(shmId);
369 if (shm == null) {
370 throw new InvalidRequestException("there is no shared memory segment " +
371 "registered with shmId " + shmId);
372 }
373 Slot slot = shm.getSlot(slotId.getSlotIdx());
374 slot.makeInvalid();
375 shm.unregisterSlot(slotId.getSlotIdx());
376 slots.remove(slot.getBlockId(), slot);
377 }
378
379 public void shutdown() {
380 synchronized (this) {
381 if (!enabled) return;
382 enabled = false;
383 }
384 IOUtils.closeQuietly(watcher);
385 }
386 }