001/** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hdfs.shortcircuit; 019 020import java.io.BufferedOutputStream; 021import java.io.Closeable; 022import java.io.DataOutputStream; 023import java.io.EOFException; 024import java.io.FileInputStream; 025import java.io.IOException; 026import java.util.HashMap; 027import java.util.Map.Entry; 028import java.util.TreeMap; 029import java.util.concurrent.locks.Condition; 030import java.util.concurrent.locks.ReentrantLock; 031 032import org.apache.commons.lang.mutable.MutableBoolean; 033import org.apache.hadoop.classification.InterfaceAudience; 034import org.apache.hadoop.hdfs.ExtendedBlockId; 035import org.apache.hadoop.hdfs.net.DomainPeer; 036import org.apache.hadoop.hdfs.protocol.DatanodeInfo; 037import org.apache.hadoop.hdfs.protocol.datatransfer.DataTransferProtocol; 038import org.apache.hadoop.hdfs.protocol.datatransfer.Sender; 039import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.ShortCircuitShmResponseProto; 040import org.apache.hadoop.hdfs.protocolPB.PBHelperClient; 041import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm.ShmId; 042import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm.Slot; 043import org.apache.hadoop.net.unix.DomainSocket; 044import org.apache.hadoop.net.unix.DomainSocketWatcher; 045 046import com.google.common.annotations.VisibleForTesting; 047import com.google.common.base.Preconditions; 048 049import org.slf4j.Logger; 050import org.slf4j.LoggerFactory; 051 052/** 053 * Manages short-circuit memory segments for an HDFS client. 054 * 055 * Clients are responsible for requesting and releasing shared memory segments 056 * used for communicating with the DataNode. The client will try to allocate new 057 * slots in the set of existing segments, falling back to getting a new segment 058 * from the DataNode via {@link DataTransferProtocol#requestShortCircuitFds}. 059 * 060 * The counterpart to this class on the DataNode is 061 * {@link ShortCircuitRegistry}. See {@link ShortCircuitRegistry} for more 062 * information on the communication protocol. 063 */ 064@InterfaceAudience.Private 065public class DfsClientShmManager implements Closeable { 066 private static final Logger LOG = LoggerFactory.getLogger( 067 DfsClientShmManager.class); 068 069 /** 070 * Manages short-circuit memory segments that pertain to a given DataNode. 071 */ 072 class EndpointShmManager { 073 /** 074 * The datanode we're managing. 075 */ 076 private final DatanodeInfo datanode; 077 078 /** 079 * Shared memory segments which have no empty slots. 080 * 081 * Protected by the manager lock. 082 */ 083 private final TreeMap<ShmId, DfsClientShm> full = new TreeMap<>(); 084 085 /** 086 * Shared memory segments which have at least one empty slot. 087 * 088 * Protected by the manager lock. 089 */ 090 private final TreeMap<ShmId, DfsClientShm> notFull = new TreeMap<>(); 091 092 /** 093 * True if this datanode doesn't support short-circuit shared memory 094 * segments. 095 * 096 * Protected by the manager lock. 097 */ 098 private boolean disabled = false; 099 100 /** 101 * True if we're in the process of loading a shared memory segment from 102 * this DataNode. 103 * 104 * Protected by the manager lock. 105 */ 106 private boolean loading = false; 107 108 EndpointShmManager (DatanodeInfo datanode) { 109 this.datanode = datanode; 110 } 111 112 /** 113 * Pull a slot out of a preexisting shared memory segment. 114 * 115 * Must be called with the manager lock held. 116 * 117 * @param blockId The blockId to put inside the Slot object. 118 * 119 * @return null if none of our shared memory segments contain a 120 * free slot; the slot object otherwise. 121 */ 122 private Slot allocSlotFromExistingShm(ExtendedBlockId blockId) { 123 if (notFull.isEmpty()) { 124 return null; 125 } 126 Entry<ShmId, DfsClientShm> entry = notFull.firstEntry(); 127 DfsClientShm shm = entry.getValue(); 128 ShmId shmId = shm.getShmId(); 129 Slot slot = shm.allocAndRegisterSlot(blockId); 130 if (shm.isFull()) { 131 LOG.trace("{}: pulled the last slot {} out of {}", 132 this, slot.getSlotIdx(), shm); 133 DfsClientShm removedShm = notFull.remove(shmId); 134 Preconditions.checkState(removedShm == shm); 135 full.put(shmId, shm); 136 } else { 137 LOG.trace("{}: pulled slot {} out of {}", this, slot.getSlotIdx(), shm); 138 } 139 return slot; 140 } 141 142 /** 143 * Ask the DataNode for a new shared memory segment. This function must be 144 * called with the manager lock held. We will release the lock while 145 * communicating with the DataNode. 146 * 147 * @param clientName The current client name. 148 * @param peer The peer to use to talk to the DataNode. 149 * 150 * @return Null if the DataNode does not support shared memory 151 * segments, or experienced an error creating the 152 * shm. The shared memory segment itself on success. 153 * @throws IOException If there was an error communicating over the socket. 154 * We will not throw an IOException unless the socket 155 * itself (or the network) is the problem. 156 */ 157 private DfsClientShm requestNewShm(String clientName, DomainPeer peer) 158 throws IOException { 159 final DataOutputStream out = 160 new DataOutputStream( 161 new BufferedOutputStream(peer.getOutputStream())); 162 new Sender(out).requestShortCircuitShm(clientName); 163 ShortCircuitShmResponseProto resp = 164 ShortCircuitShmResponseProto.parseFrom( 165 PBHelperClient.vintPrefixed(peer.getInputStream())); 166 String error = resp.hasError() ? resp.getError() : "(unknown)"; 167 switch (resp.getStatus()) { 168 case SUCCESS: 169 DomainSocket sock = peer.getDomainSocket(); 170 byte buf[] = new byte[1]; 171 FileInputStream[] fis = new FileInputStream[1]; 172 if (sock.recvFileInputStreams(fis, buf, 0, buf.length) < 0) { 173 throw new EOFException("got EOF while trying to transfer the " + 174 "file descriptor for the shared memory segment."); 175 } 176 if (fis[0] == null) { 177 throw new IOException("the datanode " + datanode + " failed to " + 178 "pass a file descriptor for the shared memory segment."); 179 } 180 try { 181 DfsClientShm shm = 182 new DfsClientShm(PBHelperClient.convert(resp.getId()), 183 fis[0], this, peer); 184 LOG.trace("{}: createNewShm: created {}", this, shm); 185 return shm; 186 } finally { 187 try { 188 fis[0].close(); 189 } catch (Throwable e) { 190 LOG.debug("Exception in closing " + fis[0], e); 191 } 192 } 193 case ERROR_UNSUPPORTED: 194 // The DataNode just does not support short-circuit shared memory 195 // access, and we should stop asking. 196 LOG.info(this + ": datanode does not support short-circuit " + 197 "shared memory access: " + error); 198 disabled = true; 199 return null; 200 default: 201 // The datanode experienced some kind of unexpected error when trying to 202 // create the short-circuit shared memory segment. 203 LOG.warn(this + ": error requesting short-circuit shared memory " + 204 "access: " + error); 205 return null; 206 } 207 } 208 209 /** 210 * Allocate a new shared memory slot connected to this datanode. 211 * 212 * Must be called with the EndpointShmManager lock held. 213 * 214 * @param peer The peer to use to talk to the DataNode. 215 * @param usedPeer (out param) Will be set to true if we used the peer. 216 * When a peer is used 217 * 218 * @param clientName The client name. 219 * @param blockId The block ID to use. 220 * @return null if the DataNode does not support shared memory 221 * segments, or experienced an error creating the 222 * shm. The shared memory segment itself on success. 223 * @throws IOException If there was an error communicating over the socket. 224 */ 225 Slot allocSlot(DomainPeer peer, MutableBoolean usedPeer, 226 String clientName, ExtendedBlockId blockId) throws IOException { 227 while (true) { 228 if (closed) { 229 LOG.trace("{}: the DfsClientShmManager has been closed.", this); 230 return null; 231 } 232 if (disabled) { 233 LOG.trace("{}: shared memory segment access is disabled.", this); 234 return null; 235 } 236 // Try to use an existing slot. 237 Slot slot = allocSlotFromExistingShm(blockId); 238 if (slot != null) { 239 return slot; 240 } 241 // There are no free slots. If someone is loading more slots, wait 242 // for that to finish. 243 if (loading) { 244 LOG.trace("{}: waiting for loading to finish...", this); 245 finishedLoading.awaitUninterruptibly(); 246 } else { 247 // Otherwise, load the slot ourselves. 248 loading = true; 249 lock.unlock(); 250 DfsClientShm shm; 251 try { 252 shm = requestNewShm(clientName, peer); 253 if (shm == null) continue; 254 // See #{DfsClientShmManager#domainSocketWatcher} for details 255 // about why we do this before retaking the manager lock. 256 domainSocketWatcher.add(peer.getDomainSocket(), shm); 257 // The DomainPeer is now our responsibility, and should not be 258 // closed by the caller. 259 usedPeer.setValue(true); 260 } finally { 261 lock.lock(); 262 loading = false; 263 finishedLoading.signalAll(); 264 } 265 if (shm.isDisconnected()) { 266 // If the peer closed immediately after the shared memory segment 267 // was created, the DomainSocketWatcher callback might already have 268 // fired and marked the shm as disconnected. In this case, we 269 // obviously don't want to add the SharedMemorySegment to our list 270 // of valid not-full segments. 271 LOG.debug("{}: the UNIX domain socket associated with this " 272 + "short-circuit memory closed before we could make use of " 273 + "the shm.", this); 274 } else { 275 notFull.put(shm.getShmId(), shm); 276 } 277 } 278 } 279 } 280 281 /** 282 * Stop tracking a slot. 283 * 284 * Must be called with the EndpointShmManager lock held. 285 * 286 * @param slot The slot to release. 287 */ 288 void freeSlot(Slot slot) { 289 DfsClientShm shm = (DfsClientShm)slot.getShm(); 290 shm.unregisterSlot(slot.getSlotIdx()); 291 if (shm.isDisconnected()) { 292 // Stale shared memory segments should not be tracked here. 293 Preconditions.checkState(!full.containsKey(shm.getShmId())); 294 Preconditions.checkState(!notFull.containsKey(shm.getShmId())); 295 if (shm.isEmpty()) { 296 LOG.trace("{}: freeing empty stale {}", this, shm); 297 shm.free(); 298 } 299 } else { 300 ShmId shmId = shm.getShmId(); 301 full.remove(shmId); // The shm can't be full if we just freed a slot. 302 if (shm.isEmpty()) { 303 notFull.remove(shmId); 304 305 // If the shared memory segment is now empty, we call shutdown(2) on 306 // the UNIX domain socket associated with it. The DomainSocketWatcher, 307 // which is watching this socket, will call DfsClientShm#handle, 308 // cleaning up this shared memory segment. 309 // 310 // See #{DfsClientShmManager#domainSocketWatcher} for details about why 311 // we don't want to call DomainSocketWatcher#remove directly here. 312 // 313 // Note that we could experience 'fragmentation' here, where the 314 // DFSClient allocates a bunch of slots in different shared memory 315 // segments, and then frees most of them, but never fully empties out 316 // any segment. We make some attempt to avoid this fragmentation by 317 // always allocating new slots out of the shared memory segment with the 318 // lowest ID, but it could still occur. In most workloads, 319 // fragmentation should not be a major concern, since it doesn't impact 320 // peak file descriptor usage or the speed of allocation. 321 LOG.trace("{}: shutting down UNIX domain socket for empty {}", 322 this, shm); 323 shutdown(shm); 324 } else { 325 notFull.put(shmId, shm); 326 } 327 } 328 } 329 330 /** 331 * Unregister a shared memory segment. 332 * 333 * Once a segment is unregistered, we will not allocate any more slots 334 * inside that segment. 335 * 336 * The DomainSocketWatcher calls this while holding the DomainSocketWatcher 337 * lock. 338 * 339 * @param shmId The ID of the shared memory segment to unregister. 340 */ 341 void unregisterShm(ShmId shmId) { 342 lock.lock(); 343 try { 344 full.remove(shmId); 345 notFull.remove(shmId); 346 } finally { 347 lock.unlock(); 348 } 349 } 350 351 @Override 352 public String toString() { 353 return String.format("EndpointShmManager(%s, parent=%s)", 354 datanode, DfsClientShmManager.this); 355 } 356 357 PerDatanodeVisitorInfo getVisitorInfo() { 358 return new PerDatanodeVisitorInfo(full, notFull, disabled); 359 } 360 361 final void shutdown(DfsClientShm shm) { 362 try { 363 shm.getPeer().getDomainSocket().shutdown(); 364 } catch (IOException e) { 365 LOG.warn(this + ": error shutting down shm: got IOException calling " + 366 "shutdown(SHUT_RDWR)", e); 367 } 368 } 369 } 370 371 private boolean closed = false; 372 373 private final ReentrantLock lock = new ReentrantLock(); 374 375 /** 376 * A condition variable which is signalled when we finish loading a segment 377 * from the Datanode. 378 */ 379 private final Condition finishedLoading = lock.newCondition(); 380 381 /** 382 * Information about each Datanode. 383 */ 384 private final HashMap<DatanodeInfo, EndpointShmManager> datanodes = 385 new HashMap<>(1); 386 387 /** 388 * The DomainSocketWatcher which keeps track of the UNIX domain socket 389 * associated with each shared memory segment. 390 * 391 * Note: because the DomainSocketWatcher makes callbacks into this 392 * DfsClientShmManager object, you must MUST NOT attempt to take the 393 * DomainSocketWatcher lock while holding the DfsClientShmManager lock, 394 * or else deadlock might result. This means that most DomainSocketWatcher 395 * methods are off-limits unless you release the manager lock first. 396 */ 397 private final DomainSocketWatcher domainSocketWatcher; 398 399 DfsClientShmManager(int interruptCheckPeriodMs) throws IOException { 400 this.domainSocketWatcher = new DomainSocketWatcher(interruptCheckPeriodMs, 401 "client"); 402 } 403 404 public Slot allocSlot(DatanodeInfo datanode, DomainPeer peer, 405 MutableBoolean usedPeer, ExtendedBlockId blockId, 406 String clientName) throws IOException { 407 lock.lock(); 408 try { 409 if (closed) { 410 LOG.trace(this + ": the DfsClientShmManager isclosed."); 411 return null; 412 } 413 EndpointShmManager shmManager = datanodes.get(datanode); 414 if (shmManager == null) { 415 shmManager = new EndpointShmManager(datanode); 416 datanodes.put(datanode, shmManager); 417 } 418 return shmManager.allocSlot(peer, usedPeer, clientName, blockId); 419 } finally { 420 lock.unlock(); 421 } 422 } 423 424 public void freeSlot(Slot slot) { 425 lock.lock(); 426 try { 427 DfsClientShm shm = (DfsClientShm)slot.getShm(); 428 shm.getEndpointShmManager().freeSlot(slot); 429 } finally { 430 lock.unlock(); 431 } 432 } 433 434 @VisibleForTesting 435 public static class PerDatanodeVisitorInfo { 436 public final TreeMap<ShmId, DfsClientShm> full; 437 public final TreeMap<ShmId, DfsClientShm> notFull; 438 public final boolean disabled; 439 440 PerDatanodeVisitorInfo(TreeMap<ShmId, DfsClientShm> full, 441 TreeMap<ShmId, DfsClientShm> notFull, boolean disabled) { 442 this.full = full; 443 this.notFull = notFull; 444 this.disabled = disabled; 445 } 446 } 447 448 @VisibleForTesting 449 public interface Visitor { 450 void visit(HashMap<DatanodeInfo, PerDatanodeVisitorInfo> info) 451 throws IOException; 452 } 453 454 @VisibleForTesting 455 public void visit(Visitor visitor) throws IOException { 456 lock.lock(); 457 try { 458 HashMap<DatanodeInfo, PerDatanodeVisitorInfo> info = new HashMap<>(); 459 for (Entry<DatanodeInfo, EndpointShmManager> entry : 460 datanodes.entrySet()) { 461 info.put(entry.getKey(), entry.getValue().getVisitorInfo()); 462 } 463 visitor.visit(info); 464 } finally { 465 lock.unlock(); 466 } 467 } 468 469 /** 470 * Close the DfsClientShmManager. 471 */ 472 @Override 473 public void close() throws IOException { 474 lock.lock(); 475 try { 476 if (closed) return; 477 closed = true; 478 } finally { 479 lock.unlock(); 480 } 481 // When closed, the domainSocketWatcher will issue callbacks that mark 482 // all the outstanding DfsClientShm segments as stale. 483 try { 484 domainSocketWatcher.close(); 485 } catch (Throwable e) { 486 LOG.debug("Exception in closing " + domainSocketWatcher, e); 487 } 488 } 489 490 491 @Override 492 public String toString() { 493 return String.format("ShortCircuitShmManager(%08x)", 494 System.identityHashCode(this)); 495 } 496 497 @VisibleForTesting 498 public DomainSocketWatcher getDomainSocketWatcher() { 499 return domainSocketWatcher; 500 } 501}