001 /**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License. You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018 package org.apache.hadoop.hdfs.client;
019
020 import com.google.common.annotations.VisibleForTesting;
021 import com.google.common.base.Preconditions;
022
023 import java.io.BufferedOutputStream;
024 import java.io.Closeable;
025 import java.io.DataOutputStream;
026 import java.io.EOFException;
027 import java.io.FileInputStream;
028 import java.io.IOException;
029 import java.util.HashMap;
030 import java.util.TreeMap;
031 import java.util.Map.Entry;
032 import java.util.concurrent.locks.Condition;
033 import java.util.concurrent.locks.ReentrantLock;
034
035 import org.apache.commons.lang.mutable.MutableBoolean;
036 import org.apache.commons.logging.Log;
037 import org.apache.commons.logging.LogFactory;
038 import org.apache.hadoop.hdfs.ExtendedBlockId;
039 import org.apache.hadoop.hdfs.ShortCircuitShm.ShmId;
040 import org.apache.hadoop.hdfs.ShortCircuitShm.Slot;
041 import org.apache.hadoop.hdfs.net.DomainPeer;
042 import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
043 import org.apache.hadoop.hdfs.protocol.datatransfer.Sender;
044 import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.ShortCircuitShmResponseProto;
045 import org.apache.hadoop.hdfs.protocolPB.PBHelper;
046 import org.apache.hadoop.io.IOUtils;
047 import org.apache.hadoop.net.unix.DomainSocket;
048 import org.apache.hadoop.net.unix.DomainSocketWatcher;
049 import org.apache.hadoop.classification.InterfaceAudience;
050
051 /**
052 * Manages short-circuit memory segments for an HDFS client.
053 *
054 * Clients are responsible for requesting and releasing shared memory segments used
055 * for communicating with the DataNode. The client will try to allocate new slots
056 * in the set of existing segments, falling back to getting a new segment from the
057 * DataNode via {@link DataTransferProtocol#requestShortCircuitShm}.
058 *
059 * The counterpart to this class on the DataNode is {@link ShortCircuitRegistry}.
060 * See {@link ShortCircuitRegistry} for more information on the communication protocol.
061 */
062 @InterfaceAudience.Private
063 public class DfsClientShmManager implements Closeable {
064 private static final Log LOG = LogFactory.getLog(DfsClientShmManager.class);
065
066 /**
067 * Manages short-circuit memory segments that pertain to a given DataNode.
068 */
069 class EndpointShmManager {
070 /**
071 * The datanode we're managing.
072 */
073 private final DatanodeInfo datanode;
074
075 /**
076 * Shared memory segments which have no empty slots.
077 *
078 * Protected by the manager lock.
079 */
080 private final TreeMap<ShmId, DfsClientShm> full =
081 new TreeMap<ShmId, DfsClientShm>();
082
083 /**
084 * Shared memory segments which have at least one empty slot.
085 *
086 * Protected by the manager lock.
087 */
088 private final TreeMap<ShmId, DfsClientShm> notFull =
089 new TreeMap<ShmId, DfsClientShm>();
090
091 /**
092 * True if this datanode doesn't support short-circuit shared memory
093 * segments.
094 *
095 * Protected by the manager lock.
096 */
097 private boolean disabled = false;
098
099 /**
100 * True if we're in the process of loading a shared memory segment from
101 * this DataNode.
102 *
103 * Protected by the manager lock.
104 */
105 private boolean loading = false;
106
/**
 * Create a manager for the shared memory segments of one DataNode.
 *
 * @param datanode  The DataNode whose segments this object tracks.
 */
EndpointShmManager(DatanodeInfo datanode) {
  this.datanode = datanode;
}
110
111 /**
112 * Pull a slot out of a preexisting shared memory segment.
113 *
114 * Must be called with the manager lock held.
115 *
116 * @param blockId The blockId to put inside the Slot object.
117 *
118 * @return null if none of our shared memory segments contain a
119 * free slot; the slot object otherwise.
120 */
121 private Slot allocSlotFromExistingShm(ExtendedBlockId blockId) {
122 if (notFull.isEmpty()) {
123 return null;
124 }
125 Entry<ShmId, DfsClientShm> entry = notFull.firstEntry();
126 DfsClientShm shm = entry.getValue();
127 ShmId shmId = shm.getShmId();
128 Slot slot = shm.allocAndRegisterSlot(blockId);
129 if (shm.isFull()) {
130 if (LOG.isTraceEnabled()) {
131 LOG.trace(this + ": pulled the last slot " + slot.getSlotIdx() +
132 " out of " + shm);
133 }
134 DfsClientShm removedShm = notFull.remove(shmId);
135 Preconditions.checkState(removedShm == shm);
136 full.put(shmId, shm);
137 } else {
138 if (LOG.isTraceEnabled()) {
139 LOG.trace(this + ": pulled slot " + slot.getSlotIdx() +
140 " out of " + shm);
141 }
142 }
143 return slot;
144 }
145
146 /**
147 * Ask the DataNode for a new shared memory segment. This function must be
148 * called with the manager lock held. We will release the lock while
149 * communicating with the DataNode.
150 *
151 * @param clientName The current client name.
152 * @param peer The peer to use to talk to the DataNode.
153 *
154 * @return Null if the DataNode does not support shared memory
155 * segments, or experienced an error creating the
156 * shm. The shared memory segment itself on success.
157 * @throws IOException If there was an error communicating over the socket.
158 * We will not throw an IOException unless the socket
159 * itself (or the network) is the problem.
160 */
161 private DfsClientShm requestNewShm(String clientName, DomainPeer peer)
162 throws IOException {
163 final DataOutputStream out =
164 new DataOutputStream(
165 new BufferedOutputStream(peer.getOutputStream()));
166 new Sender(out).requestShortCircuitShm(clientName);
167 ShortCircuitShmResponseProto resp =
168 ShortCircuitShmResponseProto.parseFrom(
169 PBHelper.vintPrefixed(peer.getInputStream()));
170 String error = resp.hasError() ? resp.getError() : "(unknown)";
171 switch (resp.getStatus()) {
172 case SUCCESS:
173 DomainSocket sock = peer.getDomainSocket();
174 byte buf[] = new byte[1];
175 FileInputStream fis[] = new FileInputStream[1];
176 if (sock.recvFileInputStreams(fis, buf, 0, buf.length) < 0) {
177 throw new EOFException("got EOF while trying to transfer the " +
178 "file descriptor for the shared memory segment.");
179 }
180 if (fis[0] == null) {
181 throw new IOException("the datanode " + datanode + " failed to " +
182 "pass a file descriptor for the shared memory segment.");
183 }
184 try {
185 DfsClientShm shm =
186 new DfsClientShm(PBHelper.convert(resp.getId()),
187 fis[0], this, peer);
188 if (LOG.isTraceEnabled()) {
189 LOG.trace(this + ": createNewShm: created " + shm);
190 }
191 return shm;
192 } finally {
193 IOUtils.cleanup(LOG, fis[0]);
194 }
195 case ERROR_UNSUPPORTED:
196 // The DataNode just does not support short-circuit shared memory
197 // access, and we should stop asking.
198 LOG.info(this + ": datanode does not support short-circuit " +
199 "shared memory access: " + error);
200 disabled = true;
201 return null;
202 default:
203 // The datanode experienced some kind of unexpected error when trying to
204 // create the short-circuit shared memory segment.
205 LOG.warn(this + ": error requesting short-circuit shared memory " +
206 "access: " + error);
207 return null;
208 }
209 }
210
211 /**
212 * Allocate a new shared memory slot connected to this datanode.
213 *
214 * Must be called with the EndpointShmManager lock held.
215 *
216 * @param peer The peer to use to talk to the DataNode.
217 * @param clientName The client name.
218 * @param usedPeer (out param) Will be set to true if we used the peer.
219 * When a peer is used
220 *
221 * @return null if the DataNode does not support shared memory
222 * segments, or experienced an error creating the
223 * shm. The shared memory segment itself on success.
224 * @throws IOException If there was an error communicating over the socket.
225 */
226 Slot allocSlot(DomainPeer peer, MutableBoolean usedPeer,
227 String clientName, ExtendedBlockId blockId) throws IOException {
228 while (true) {
229 if (closed) {
230 if (LOG.isTraceEnabled()) {
231 LOG.trace(this + ": the DfsClientShmManager has been closed.");
232 }
233 return null;
234 }
235 if (disabled) {
236 if (LOG.isTraceEnabled()) {
237 LOG.trace(this + ": shared memory segment access is disabled.");
238 }
239 return null;
240 }
241 // Try to use an existing slot.
242 Slot slot = allocSlotFromExistingShm(blockId);
243 if (slot != null) {
244 return slot;
245 }
246 // There are no free slots. If someone is loading more slots, wait
247 // for that to finish.
248 if (loading) {
249 if (LOG.isTraceEnabled()) {
250 LOG.trace(this + ": waiting for loading to finish...");
251 }
252 finishedLoading.awaitUninterruptibly();
253 } else {
254 // Otherwise, load the slot ourselves.
255 loading = true;
256 lock.unlock();
257 DfsClientShm shm;
258 try {
259 shm = requestNewShm(clientName, peer);
260 if (shm == null) continue;
261 // See #{DfsClientShmManager#domainSocketWatcher} for details
262 // about why we do this before retaking the manager lock.
263 domainSocketWatcher.add(peer.getDomainSocket(), shm);
264 // The DomainPeer is now our responsibility, and should not be
265 // closed by the caller.
266 usedPeer.setValue(true);
267 } finally {
268 lock.lock();
269 loading = false;
270 finishedLoading.signalAll();
271 }
272 if (shm.isStale()) {
273 // If the peer closed immediately after the shared memory segment
274 // was created, the DomainSocketWatcher callback might already have
275 // fired and marked the shm as stale. In this case, we obviously
276 // don't want to add the SharedMemorySegment to our list of valid
277 // not-full segments.
278 if (LOG.isDebugEnabled()) {
279 LOG.debug(this + ": the UNIX domain socket associated with " +
280 "this short-circuit memory closed before we could make " +
281 "use of the shm.");
282 }
283 } else {
284 notFull.put(shm.getShmId(), shm);
285 }
286 }
287 }
288 }
289
290 /**
291 * Stop tracking a slot.
292 *
293 * Must be called with the EndpointShmManager lock held.
294 *
295 * @param slot The slot to release.
296 */
297 void freeSlot(Slot slot) {
298 DfsClientShm shm = (DfsClientShm)slot.getShm();
299 shm.unregisterSlot(slot.getSlotIdx());
300 if (shm.isStale()) {
301 // Stale shared memory segments should not be tracked here.
302 Preconditions.checkState(!full.containsKey(shm.getShmId()));
303 Preconditions.checkState(!notFull.containsKey(shm.getShmId()));
304 if (shm.isEmpty()) {
305 if (LOG.isTraceEnabled()) {
306 LOG.trace(this + ": freeing empty stale " + shm);
307 }
308 shm.free();
309 }
310 } else {
311 ShmId shmId = shm.getShmId();
312 full.remove(shmId); // The shm can't be full if we just freed a slot.
313 if (shm.isEmpty()) {
314 notFull.remove(shmId);
315
316 // If the shared memory segment is now empty, we call shutdown(2) on
317 // the UNIX domain socket associated with it. The DomainSocketWatcher,
318 // which is watching this socket, will call DfsClientShm#handle,
319 // cleaning up this shared memory segment.
320 //
321 // See #{DfsClientShmManager#domainSocketWatcher} for details about why
322 // we don't want to call DomainSocketWatcher#remove directly here.
323 //
324 // Note that we could experience 'fragmentation' here, where the
325 // DFSClient allocates a bunch of slots in different shared memory
326 // segments, and then frees most of them, but never fully empties out
327 // any segment. We make some attempt to avoid this fragmentation by
328 // always allocating new slots out of the shared memory segment with the
329 // lowest ID, but it could still occur. In most workloads,
330 // fragmentation should not be a major concern, since it doesn't impact
331 // peak file descriptor usage or the speed of allocation.
332 if (LOG.isTraceEnabled()) {
333 LOG.trace(this + ": shutting down UNIX domain socket for " +
334 "empty " + shm);
335 }
336 shutdown(shm);
337 } else {
338 notFull.put(shmId, shm);
339 }
340 }
341 }
342
343 /**
344 * Unregister a shared memory segment.
345 *
346 * Once a segment is unregistered, we will not allocate any more slots
347 * inside that segment.
348 *
349 * The DomainSocketWatcher calls this while holding the DomainSocketWatcher
350 * lock.
351 *
352 * @param shmId The ID of the shared memory segment to unregister.
353 */
354 void unregisterShm(ShmId shmId) {
355 lock.lock();
356 try {
357 full.remove(shmId);
358 notFull.remove(shmId);
359 } finally {
360 lock.unlock();
361 }
362 }
363
364 @Override
365 public String toString() {
366 return String.format("EndpointShmManager(%s, parent=%s)",
367 datanode, DfsClientShmManager.this);
368 }
369
370 PerDatanodeVisitorInfo getVisitorInfo() {
371 return new PerDatanodeVisitorInfo(full, notFull, disabled);
372 }
373
374 final void shutdown(DfsClientShm shm) {
375 try {
376 shm.getPeer().getDomainSocket().shutdown();
377 } catch (IOException e) {
378 LOG.warn(this + ": error shutting down shm: got IOException calling " +
379 "shutdown(SHUT_RDWR)", e);
380 }
381 }
382 }
383
384 private boolean closed = false;
385
386 private final ReentrantLock lock = new ReentrantLock();
387
388 /**
389 * A condition variable which is signalled when we finish loading a segment
390 * from the Datanode.
391 */
392 private final Condition finishedLoading = lock.newCondition();
393
394 /**
395 * Information about each Datanode.
396 */
397 private final HashMap<DatanodeInfo, EndpointShmManager> datanodes =
398 new HashMap<DatanodeInfo, EndpointShmManager>(1);
399
400 /**
401 * The DomainSocketWatcher which keeps track of the UNIX domain socket
402 * associated with each shared memory segment.
403 *
404 * Note: because the DomainSocketWatcher makes callbacks into this
405 * DfsClientShmManager object, you MUST NOT attempt to take the
406 * DomainSocketWatcher lock while holding the DfsClientShmManager lock,
407 * or else deadlock might result. This means that most DomainSocketWatcher
408 * methods are off-limits unless you release the manager lock first.
409 */
410 private final DomainSocketWatcher domainSocketWatcher;
411
/**
 * Create a DfsClientShmManager together with its DomainSocketWatcher.
 *
 * @param interruptCheckPeriodMs  Passed through to the DomainSocketWatcher
 *                                  constructor.
 * @throws IOException            If the DomainSocketWatcher could not be
 *                                  created.
 */
DfsClientShmManager(int interruptCheckPeriodMs) throws IOException {
  domainSocketWatcher = new DomainSocketWatcher(interruptCheckPeriodMs);
}
415
416 public Slot allocSlot(DatanodeInfo datanode, DomainPeer peer,
417 MutableBoolean usedPeer, ExtendedBlockId blockId,
418 String clientName) throws IOException {
419 lock.lock();
420 try {
421 if (closed) {
422 LOG.trace(this + ": the DfsClientShmManager isclosed.");
423 return null;
424 }
425 EndpointShmManager shmManager = datanodes.get(datanode);
426 if (shmManager == null) {
427 shmManager = new EndpointShmManager(datanode);
428 datanodes.put(datanode, shmManager);
429 }
430 return shmManager.allocSlot(peer, usedPeer, clientName, blockId);
431 } finally {
432 lock.unlock();
433 }
434 }
435
436 public void freeSlot(Slot slot) {
437 lock.lock();
438 try {
439 DfsClientShm shm = (DfsClientShm)slot.getShm();
440 shm.getEndpointShmManager().freeSlot(slot);
441 } finally {
442 lock.unlock();
443 }
444 }
445
446 @VisibleForTesting
447 public static class PerDatanodeVisitorInfo {
448 public final TreeMap<ShmId, DfsClientShm> full;
449 public final TreeMap<ShmId, DfsClientShm> notFull;
450 public final boolean disabled;
451
452 PerDatanodeVisitorInfo(TreeMap<ShmId, DfsClientShm> full,
453 TreeMap<ShmId, DfsClientShm> notFull, boolean disabled) {
454 this.full = full;
455 this.notFull = notFull;
456 this.disabled = disabled;
457 }
458 }
459
460 @VisibleForTesting
461 public interface Visitor {
462 void visit(HashMap<DatanodeInfo, PerDatanodeVisitorInfo> info)
463 throws IOException;
464 }
465
466 @VisibleForTesting
467 public void visit(Visitor visitor) throws IOException {
468 lock.lock();
469 try {
470 HashMap<DatanodeInfo, PerDatanodeVisitorInfo> info =
471 new HashMap<DatanodeInfo, PerDatanodeVisitorInfo>();
472 for (Entry<DatanodeInfo, EndpointShmManager> entry :
473 datanodes.entrySet()) {
474 info.put(entry.getKey(), entry.getValue().getVisitorInfo());
475 }
476 visitor.visit(info);
477 } finally {
478 lock.unlock();
479 }
480 }
481
482 /**
483 * Close the DfsClientShmManager.
484 */
485 @Override
486 public void close() throws IOException {
487 lock.lock();
488 try {
489 if (closed) return;
490 closed = true;
491 } finally {
492 lock.unlock();
493 }
494 // When closed, the domainSocketWatcher will issue callbacks that mark
495 // all the outstanding DfsClientShm segments as stale.
496 IOUtils.cleanup(LOG, domainSocketWatcher);
497 }
498
499
500 @Override
501 public String toString() {
502 return String.format("ShortCircuitShmManager(%08x)",
503 System.identityHashCode(this));
504 }
505
506 @VisibleForTesting
507 public DomainSocketWatcher getDomainSocketWatcher() {
508 return domainSocketWatcher;
509 }
510 }