001/** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hdfs.server.datanode; 019 020import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_SHARED_FILE_DESCRIPTOR_PATHS; 021import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_SHARED_FILE_DESCRIPTOR_PATHS_DEFAULT; 022import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS; 023import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS_DEFAULT; 024 025import java.io.Closeable; 026import java.io.FileInputStream; 027import java.io.IOException; 028import java.util.HashMap; 029import java.util.HashSet; 030import java.util.Iterator; 031import java.util.Set; 032 033import com.google.common.annotations.VisibleForTesting; 034import org.apache.commons.io.IOUtils; 035import org.apache.commons.logging.Log; 036import org.apache.commons.logging.LogFactory; 037import org.apache.hadoop.conf.Configuration; 038import org.apache.hadoop.fs.InvalidRequestException; 039import org.apache.hadoop.hdfs.ExtendedBlockId; 040import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm; 041import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm.ShmId; 042import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm.Slot; 043import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm.SlotId; 044import org.apache.hadoop.io.nativeio.SharedFileDescriptorFactory; 045import org.apache.hadoop.net.unix.DomainSocket; 046import org.apache.hadoop.net.unix.DomainSocketWatcher; 047 048import com.google.common.base.Joiner; 049import com.google.common.base.Preconditions; 050import com.google.common.collect.HashMultimap; 051 052/** 053 * Manages client short-circuit memory segments on the DataNode. 054 * 055 * DFSClients request shared memory segments from the DataNode. The 056 * ShortCircuitRegistry generates and manages these segments. Each segment 057 * has a randomly generated 128-bit ID which uniquely identifies it. The 058 * segments each contain several "slots." 059 * 060 * Before performing a short-circuit read, DFSClients must request a pair of 061 * file descriptors from the DataNode via the REQUEST_SHORT_CIRCUIT_FDS 062 * operation. As part of this operation, DFSClients pass the ID of the shared 063 * memory segment they would like to use to communicate information about this 064 * replica, as well as the slot number within that segment they would like to 065 * use. Slot allocation is always done by the client. 066 * 067 * Slots are used to track the state of the block on the both the client and 068 * datanode. When this DataNode mlocks a block, the corresponding slots for the 069 * replicas are marked as "anchorable". Anchorable blocks can be safely read 070 * without verifying the checksum. This means that BlockReaderLocal objects 071 * using these replicas can skip checksumming. It also means that we can do 072 * zero-copy reads on these replicas (the ZCR interface has no way of 073 * verifying checksums.) 074 * 075 * When a DN needs to munlock a block, it needs to first wait for the block to 076 * be unanchored by clients doing a no-checksum read or a zero-copy read. The 077 * DN also marks the block's slots as "unanchorable" to prevent additional 078 * clients from initiating these operations in the future. 079 * 080 * The counterpart of this class on the client is {@link DfsClientShmManager}. 081 */ 082public class ShortCircuitRegistry { 083 public static final Log LOG = LogFactory.getLog(ShortCircuitRegistry.class); 084 085 private static final int SHM_LENGTH = 8192; 086 087 public static class RegisteredShm extends ShortCircuitShm 088 implements DomainSocketWatcher.Handler { 089 private final String clientName; 090 private final ShortCircuitRegistry registry; 091 092 RegisteredShm(String clientName, ShmId shmId, FileInputStream stream, 093 ShortCircuitRegistry registry) throws IOException { 094 super(shmId, stream); 095 this.clientName = clientName; 096 this.registry = registry; 097 } 098 099 @Override 100 public boolean handle(DomainSocket sock) { 101 synchronized (registry) { 102 synchronized (this) { 103 registry.removeShm(this); 104 } 105 } 106 return true; 107 } 108 109 String getClientName() { 110 return clientName; 111 } 112 } 113 114 public synchronized void removeShm(ShortCircuitShm shm) { 115 if (LOG.isTraceEnabled()) { 116 LOG.debug("removing shm " + shm); 117 } 118 // Stop tracking the shmId. 119 RegisteredShm removedShm = segments.remove(shm.getShmId()); 120 Preconditions.checkState(removedShm == shm, 121 "failed to remove " + shm.getShmId()); 122 // Stop tracking the slots. 123 for (Iterator<Slot> iter = shm.slotIterator(); iter.hasNext(); ) { 124 Slot slot = iter.next(); 125 boolean removed = slots.remove(slot.getBlockId(), slot); 126 Preconditions.checkState(removed); 127 slot.makeInvalid(); 128 } 129 // De-allocate the memory map and close the shared file. 130 shm.free(); 131 } 132 133 /** 134 * Whether or not the registry is enabled. 135 */ 136 private boolean enabled; 137 138 /** 139 * The factory which creates shared file descriptors. 140 */ 141 private final SharedFileDescriptorFactory shmFactory; 142 143 /** 144 * A watcher which sends out callbacks when the UNIX domain socket 145 * associated with a shared memory segment closes. 146 */ 147 private final DomainSocketWatcher watcher; 148 149 private final HashMap<ShmId, RegisteredShm> segments = 150 new HashMap<ShmId, RegisteredShm>(0); 151 152 private final HashMultimap<ExtendedBlockId, Slot> slots = 153 HashMultimap.create(0, 1); 154 155 public ShortCircuitRegistry(Configuration conf) throws IOException { 156 boolean enabled = false; 157 SharedFileDescriptorFactory shmFactory = null; 158 DomainSocketWatcher watcher = null; 159 try { 160 int interruptCheck = conf.getInt( 161 DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS, 162 DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS_DEFAULT); 163 if (interruptCheck <= 0) { 164 throw new IOException( 165 DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS + 166 " was set to " + interruptCheck); 167 } 168 String[] shmPaths = 169 conf.getTrimmedStrings(DFS_DATANODE_SHARED_FILE_DESCRIPTOR_PATHS); 170 if (shmPaths.length == 0) { 171 shmPaths = 172 DFS_DATANODE_SHARED_FILE_DESCRIPTOR_PATHS_DEFAULT.split(","); 173 } 174 shmFactory = SharedFileDescriptorFactory. 175 create("HadoopShortCircuitShm_", shmPaths); 176 String dswLoadingFailure = DomainSocketWatcher.getLoadingFailureReason(); 177 if (dswLoadingFailure != null) { 178 throw new IOException(dswLoadingFailure); 179 } 180 watcher = new DomainSocketWatcher(interruptCheck, "datanode"); 181 enabled = true; 182 if (LOG.isDebugEnabled()) { 183 LOG.debug("created new ShortCircuitRegistry with interruptCheck=" + 184 interruptCheck + ", shmPath=" + shmFactory.getPath()); 185 } 186 } catch (IOException e) { 187 if (LOG.isDebugEnabled()) { 188 LOG.debug("Disabling ShortCircuitRegistry", e); 189 } 190 } finally { 191 this.enabled = enabled; 192 this.shmFactory = shmFactory; 193 this.watcher = watcher; 194 } 195 } 196 197 /** 198 * Process a block mlock event from the FsDatasetCache. 199 * 200 * @param blockId The block that was mlocked. 201 */ 202 public synchronized void processBlockMlockEvent(ExtendedBlockId blockId) { 203 if (!enabled) return; 204 Set<Slot> affectedSlots = slots.get(blockId); 205 for (Slot slot : affectedSlots) { 206 slot.makeAnchorable(); 207 } 208 } 209 210 /** 211 * Mark any slots associated with this blockId as unanchorable. 212 * 213 * @param blockId The block ID. 214 * @return True if we should allow the munlock request. 215 */ 216 public synchronized boolean processBlockMunlockRequest( 217 ExtendedBlockId blockId) { 218 if (!enabled) return true; 219 boolean allowMunlock = true; 220 Set<Slot> affectedSlots = slots.get(blockId); 221 for (Slot slot : affectedSlots) { 222 slot.makeUnanchorable(); 223 if (slot.isAnchored()) { 224 allowMunlock = false; 225 } 226 } 227 return allowMunlock; 228 } 229 230 /** 231 * Invalidate any slot associated with a blockId that we are invalidating 232 * (deleting) from this DataNode. When a slot is invalid, the DFSClient will 233 * not use the corresponding replica for new read or mmap operations (although 234 * existing, ongoing read or mmap operations will complete.) 235 * 236 * @param blockId The block ID. 237 */ 238 public synchronized void processBlockInvalidation(ExtendedBlockId blockId) { 239 if (!enabled) return; 240 final Set<Slot> affectedSlots = slots.get(blockId); 241 if (!affectedSlots.isEmpty()) { 242 final StringBuilder bld = new StringBuilder(); 243 String prefix = ""; 244 bld.append("Block ").append(blockId).append(" has been invalidated. "). 245 append("Marking short-circuit slots as invalid: "); 246 for (Slot slot : affectedSlots) { 247 slot.makeInvalid(); 248 bld.append(prefix).append(slot.toString()); 249 prefix = ", "; 250 } 251 LOG.info(bld.toString()); 252 } 253 } 254 255 public synchronized String getClientNames(ExtendedBlockId blockId) { 256 if (!enabled) return ""; 257 final HashSet<String> clientNames = new HashSet<String>(); 258 final Set<Slot> affectedSlots = slots.get(blockId); 259 for (Slot slot : affectedSlots) { 260 clientNames.add(((RegisteredShm)slot.getShm()).getClientName()); 261 } 262 return Joiner.on(",").join(clientNames); 263 } 264 265 public static class NewShmInfo implements Closeable { 266 private final ShmId shmId; 267 private final FileInputStream stream; 268 269 NewShmInfo(ShmId shmId, FileInputStream stream) { 270 this.shmId = shmId; 271 this.stream = stream; 272 } 273 274 public ShmId getShmId() { 275 return shmId; 276 } 277 278 public FileInputStream getFileStream() { 279 return stream; 280 } 281 282 @Override 283 public void close() throws IOException { 284 stream.close(); 285 } 286 } 287 288 /** 289 * Handle a DFSClient request to create a new memory segment. 290 * 291 * @param clientName Client name as reported by the client. 292 * @param sock The DomainSocket to associate with this memory 293 * segment. When this socket is closed, or the 294 * other side writes anything to the socket, the 295 * segment will be closed. This can happen at any 296 * time, including right after this function returns. 297 * @return A NewShmInfo object. The caller must close the 298 * NewShmInfo object once they are done with it. 299 * @throws IOException If the new memory segment could not be created. 300 */ 301 public NewShmInfo createNewMemorySegment(String clientName, 302 DomainSocket sock) throws IOException { 303 NewShmInfo info = null; 304 RegisteredShm shm = null; 305 ShmId shmId = null; 306 synchronized (this) { 307 if (!enabled) { 308 if (LOG.isTraceEnabled()) { 309 LOG.trace("createNewMemorySegment: ShortCircuitRegistry is " + 310 "not enabled."); 311 } 312 throw new UnsupportedOperationException(); 313 } 314 FileInputStream fis = null; 315 try { 316 do { 317 shmId = ShmId.createRandom(); 318 } while (segments.containsKey(shmId)); 319 fis = shmFactory.createDescriptor(clientName, SHM_LENGTH); 320 shm = new RegisteredShm(clientName, shmId, fis, this); 321 } finally { 322 if (shm == null) { 323 IOUtils.closeQuietly(fis); 324 } 325 } 326 info = new NewShmInfo(shmId, fis); 327 segments.put(shmId, shm); 328 } 329 // Drop the registry lock to prevent deadlock. 330 // After this point, RegisteredShm#handle may be called at any time. 331 watcher.add(sock, shm); 332 if (LOG.isTraceEnabled()) { 333 LOG.trace("createNewMemorySegment: created " + info.shmId); 334 } 335 return info; 336 } 337 338 public synchronized void registerSlot(ExtendedBlockId blockId, SlotId slotId, 339 boolean isCached) throws InvalidRequestException { 340 if (!enabled) { 341 if (LOG.isTraceEnabled()) { 342 LOG.trace(this + " can't register a slot because the " + 343 "ShortCircuitRegistry is not enabled."); 344 } 345 throw new UnsupportedOperationException(); 346 } 347 ShmId shmId = slotId.getShmId(); 348 RegisteredShm shm = segments.get(shmId); 349 if (shm == null) { 350 throw new InvalidRequestException("there is no shared memory segment " + 351 "registered with shmId " + shmId); 352 } 353 Slot slot = shm.registerSlot(slotId.getSlotIdx(), blockId); 354 if (isCached) { 355 slot.makeAnchorable(); 356 } else { 357 slot.makeUnanchorable(); 358 } 359 boolean added = slots.put(blockId, slot); 360 Preconditions.checkState(added); 361 if (LOG.isTraceEnabled()) { 362 LOG.trace(this + ": registered " + blockId + " with slot " + 363 slotId + " (isCached=" + isCached + ")"); 364 } 365 } 366 367 public synchronized void unregisterSlot(SlotId slotId) 368 throws InvalidRequestException { 369 if (!enabled) { 370 if (LOG.isTraceEnabled()) { 371 LOG.trace("unregisterSlot: ShortCircuitRegistry is " + 372 "not enabled."); 373 } 374 throw new UnsupportedOperationException(); 375 } 376 ShmId shmId = slotId.getShmId(); 377 RegisteredShm shm = segments.get(shmId); 378 if (shm == null) { 379 throw new InvalidRequestException("there is no shared memory segment " + 380 "registered with shmId " + shmId); 381 } 382 Slot slot = shm.getSlot(slotId.getSlotIdx()); 383 slot.makeInvalid(); 384 shm.unregisterSlot(slotId.getSlotIdx()); 385 slots.remove(slot.getBlockId(), slot); 386 } 387 388 public void shutdown() { 389 synchronized (this) { 390 if (!enabled) return; 391 enabled = false; 392 } 393 IOUtils.closeQuietly(watcher); 394 } 395 396 public static interface Visitor { 397 boolean accept(HashMap<ShmId, RegisteredShm> segments, 398 HashMultimap<ExtendedBlockId, Slot> slots); 399 } 400 401 @VisibleForTesting 402 public synchronized boolean visit(Visitor visitor) { 403 return visitor.accept(segments, slots); 404 } 405}