001 /**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License. You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018 package org.apache.hadoop.hdfs.client;
019
020 import java.io.BufferedOutputStream;
021 import java.io.Closeable;
022 import java.io.DataInputStream;
023 import java.io.DataOutputStream;
024
025 import org.apache.hadoop.classification.InterfaceAudience;
026
027 import java.io.IOException;
028 import java.nio.MappedByteBuffer;
029 import java.util.HashMap;
030 import java.util.Map;
031 import java.util.Map.Entry;
032 import java.util.TreeMap;
033 import java.util.concurrent.ScheduledFuture;
034 import java.util.concurrent.ScheduledThreadPoolExecutor;
035 import java.util.concurrent.TimeUnit;
036 import java.util.concurrent.locks.Condition;
037 import java.util.concurrent.locks.ReentrantLock;
038
039 import org.apache.commons.lang.mutable.MutableBoolean;
040 import org.apache.commons.logging.Log;
041 import org.apache.commons.logging.LogFactory;
042 import org.apache.hadoop.conf.Configuration;
043 import org.apache.hadoop.hdfs.ExtendedBlockId;
044 import org.apache.hadoop.hdfs.DFSConfigKeys;
045 import org.apache.hadoop.hdfs.client.ShortCircuitReplica;
046 import org.apache.hadoop.hdfs.ShortCircuitShm.Slot;
047 import org.apache.hadoop.hdfs.net.DomainPeer;
048 import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
049 import org.apache.hadoop.hdfs.protocol.datatransfer.Sender;
050 import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.ReleaseShortCircuitAccessResponseProto;
051 import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.Status;
052 import org.apache.hadoop.hdfs.protocolPB.PBHelper;
053 import org.apache.hadoop.io.IOUtils;
054 import org.apache.hadoop.ipc.RetriableException;
055 import org.apache.hadoop.net.unix.DomainSocket;
056 import org.apache.hadoop.net.unix.DomainSocketWatcher;
057 import org.apache.hadoop.security.token.SecretManager.InvalidToken;
058 import org.apache.hadoop.util.StringUtils;
059 import org.apache.hadoop.util.Time;
060 import org.apache.hadoop.util.Waitable;
061
062 import com.google.common.annotations.VisibleForTesting;
063 import com.google.common.base.Preconditions;
064 import com.google.common.util.concurrent.ThreadFactoryBuilder;
065
066 /**
067 * The ShortCircuitCache tracks things which the client needs to access
068 * HDFS block files via short-circuit.
069 *
070 * These things include: memory-mapped regions, file descriptors, and shared
071 * memory areas for communicating with the DataNode.
072 */
073 @InterfaceAudience.Private
074 public class ShortCircuitCache implements Closeable {
075 public static final Log LOG = LogFactory.getLog(ShortCircuitCache.class);
076
077 /**
078 * Expiry thread which makes sure that the file descriptors get closed
079 * after a while.
080 */
081 private class CacheCleaner implements Runnable, Closeable {
082 private ScheduledFuture<?> future;
083
084 /**
085 * Run the CacheCleaner thread.
086 *
087 * Whenever a thread requests a ShortCircuitReplica object, we will make
088 * sure it gets one. That ShortCircuitReplica object can then be re-used
089 * when another thread requests a ShortCircuitReplica object for the same
090 * block. So in that sense, there is no maximum size to the cache.
091 *
092 * However, when a ShortCircuitReplica object is unreferenced by the
093 * thread(s) that are using it, it becomes evictable. There are two
094 * separate eviction lists-- one for mmaped objects, and another for
095 * non-mmaped objects. We do this in order to avoid having the regular
096 * files kick the mmaped files out of the cache too quickly. Reusing
097 * an already-existing mmap gives a huge performance boost, since the
098 * page table entries don't have to be re-populated. Both the mmap
099 * and non-mmap evictable lists have maximum sizes and maximum lifespans.
100 */
101 @Override
102 public void run() {
103 ShortCircuitCache.this.lock.lock();
104 try {
105 if (ShortCircuitCache.this.closed) return;
106 long curMs = Time.monotonicNow();
107
108 if (LOG.isDebugEnabled()) {
109 LOG.debug(this + ": cache cleaner running at " + curMs);
110 }
111
112 int numDemoted = demoteOldEvictableMmaped(curMs);
113 int numPurged = 0;
114 Long evictionTimeNs = Long.valueOf(0);
115 while (true) {
116 Entry<Long, ShortCircuitReplica> entry =
117 evictableMmapped.ceilingEntry(evictionTimeNs);
118 if (entry == null) break;
119 evictionTimeNs = entry.getKey();
120 long evictionTimeMs =
121 TimeUnit.MILLISECONDS.convert(evictionTimeNs, TimeUnit.NANOSECONDS);
122 if (evictionTimeMs + maxNonMmappedEvictableLifespanMs >= curMs) break;
123 ShortCircuitReplica replica = entry.getValue();
124 if (LOG.isTraceEnabled()) {
125 LOG.trace("CacheCleaner: purging " + replica + ": " +
126 StringUtils.getStackTrace(Thread.currentThread()));
127 }
128 purge(replica);
129 numPurged++;
130 }
131
132 if (LOG.isDebugEnabled()) {
133 LOG.debug(this + ": finishing cache cleaner run started at " +
134 curMs + ". Demoted " + numDemoted + " mmapped replicas; " +
135 "purged " + numPurged + " replicas.");
136 }
137 } finally {
138 ShortCircuitCache.this.lock.unlock();
139 }
140 }
141
142 @Override
143 public void close() throws IOException {
144 if (future != null) {
145 future.cancel(false);
146 }
147 }
148
149 public void setFuture(ScheduledFuture<?> future) {
150 this.future = future;
151 }
152
153 /**
154 * Get the rate at which this cleaner thread should be scheduled.
155 *
156 * We do this by taking the minimum expiration time and dividing by 4.
157 *
158 * @return the rate in milliseconds at which this thread should be
159 * scheduled.
160 */
161 public long getRateInMs() {
162 long minLifespanMs =
163 Math.min(maxNonMmappedEvictableLifespanMs,
164 maxEvictableMmapedLifespanMs);
165 long sampleTimeMs = minLifespanMs / 4;
166 return (sampleTimeMs < 1) ? 1 : sampleTimeMs;
167 }
168 }
169
170 /**
171 * A task which asks the DataNode to release a short-circuit shared memory
172 * slot. If successful, this will tell the DataNode to stop monitoring
173 * changes to the mlock status of the replica associated with the slot.
174 * It will also allow us (the client) to re-use this slot for another
175 * replica. If we can't communicate with the DataNode for some reason,
176 * we tear down the shared memory segment to avoid being in an inconsistent
177 * state.
178 */
179 private class SlotReleaser implements Runnable {
180 /**
181 * The slot that we need to release.
182 */
183 private final Slot slot;
184
185 SlotReleaser(Slot slot) {
186 this.slot = slot;
187 }
188
189 @Override
190 public void run() {
191 if (LOG.isTraceEnabled()) {
192 LOG.trace(ShortCircuitCache.this + ": about to release " + slot);
193 }
194 final DfsClientShm shm = (DfsClientShm)slot.getShm();
195 final DomainSocket shmSock = shm.getPeer().getDomainSocket();
196 DomainSocket sock = null;
197 DataOutputStream out = null;
198 final String path = shmSock.getPath();
199 boolean success = false;
200 try {
201 sock = DomainSocket.connect(path);
202 out = new DataOutputStream(
203 new BufferedOutputStream(sock.getOutputStream()));
204 new Sender(out).releaseShortCircuitFds(slot.getSlotId());
205 DataInputStream in = new DataInputStream(sock.getInputStream());
206 ReleaseShortCircuitAccessResponseProto resp =
207 ReleaseShortCircuitAccessResponseProto.parseFrom(
208 PBHelper.vintPrefixed(in));
209 if (resp.getStatus() != Status.SUCCESS) {
210 String error = resp.hasError() ? resp.getError() : "(unknown)";
211 throw new IOException(resp.getStatus().toString() + ": " + error);
212 }
213 if (LOG.isTraceEnabled()) {
214 LOG.trace(ShortCircuitCache.this + ": released " + slot);
215 }
216 success = true;
217 } catch (IOException e) {
218 LOG.error(ShortCircuitCache.this + ": failed to release " +
219 "short-circuit shared memory slot " + slot + " by sending " +
220 "ReleaseShortCircuitAccessRequestProto to " + path +
221 ". Closing shared memory segment.", e);
222 } finally {
223 if (success) {
224 shmManager.freeSlot(slot);
225 } else {
226 shm.getEndpointShmManager().shutdown(shm);
227 }
228 IOUtils.cleanup(LOG, sock, out);
229 }
230 }
231 }
232
233 public interface ShortCircuitReplicaCreator {
234 /**
235 * Attempt to create a ShortCircuitReplica object.
236 *
237 * This callback will be made without holding any locks.
238 *
239 * @return a non-null ShortCircuitReplicaInfo object.
240 */
241 ShortCircuitReplicaInfo createShortCircuitReplicaInfo();
242 }
243
244 /**
245 * Lock protecting the cache.
246 */
247 private final ReentrantLock lock = new ReentrantLock();
248
249 /**
250 * The executor service that runs the cacheCleaner.
251 */
252 private final ScheduledThreadPoolExecutor cleanerExecutor
253 = new ScheduledThreadPoolExecutor(1, new ThreadFactoryBuilder().
254 setDaemon(true).setNameFormat("ShortCircuitCache_Cleaner").
255 build());
256
257 /**
258 * The executor service that runs the cacheCleaner.
259 */
260 private final ScheduledThreadPoolExecutor releaserExecutor
261 = new ScheduledThreadPoolExecutor(1, new ThreadFactoryBuilder().
262 setDaemon(true).setNameFormat("ShortCircuitCache_SlotReleaser").
263 build());
264
265 /**
266 * A map containing all ShortCircuitReplicaInfo objects, organized by Key.
267 * ShortCircuitReplicaInfo objects may contain a replica, or an InvalidToken
268 * exception.
269 */
270 private final HashMap<ExtendedBlockId, Waitable<ShortCircuitReplicaInfo>>
271 replicaInfoMap = new HashMap<ExtendedBlockId,
272 Waitable<ShortCircuitReplicaInfo>>();
273
274 /**
275 * The CacheCleaner. We don't create this and schedule it until it becomes
276 * necessary.
277 */
278 private CacheCleaner cacheCleaner;
279
280 /**
281 * Tree of evictable elements.
282 *
283 * Maps (unique) insertion time in nanoseconds to the element.
284 */
285 private final TreeMap<Long, ShortCircuitReplica> evictable =
286 new TreeMap<Long, ShortCircuitReplica>();
287
288 /**
289 * Maximum total size of the cache, including both mmapped and
290 * no$-mmapped elements.
291 */
292 private final int maxTotalSize;
293
294 /**
295 * Non-mmaped elements older than this will be closed.
296 */
297 private long maxNonMmappedEvictableLifespanMs;
298
299 /**
300 * Tree of mmaped evictable elements.
301 *
302 * Maps (unique) insertion time in nanoseconds to the element.
303 */
304 private final TreeMap<Long, ShortCircuitReplica> evictableMmapped =
305 new TreeMap<Long, ShortCircuitReplica>();
306
307 /**
308 * Maximum number of mmaped evictable elements.
309 */
310 private int maxEvictableMmapedSize;
311
312 /**
313 * Mmaped elements older than this will be closed.
314 */
315 private final long maxEvictableMmapedLifespanMs;
316
317 /**
318 * The minimum number of milliseconds we'll wait after an unsuccessful
319 * mmap attempt before trying again.
320 */
321 private final long mmapRetryTimeoutMs;
322
323 /**
324 * How long we will keep replicas in the cache before declaring them
325 * to be stale.
326 */
327 private final long staleThresholdMs;
328
329 /**
330 * True if the ShortCircuitCache is closed.
331 */
332 private boolean closed = false;
333
334 /**
335 * Number of existing mmaps associated with this cache.
336 */
337 private int outstandingMmapCount = 0;
338
339 /**
340 * Manages short-circuit shared memory segments for the client.
341 */
342 private final DfsClientShmManager shmManager;
343
344 /**
345 * Create a {@link ShortCircuitCache} object from a {@link Configuration}
346 */
347 public static ShortCircuitCache fromConf(Configuration conf) {
348 return new ShortCircuitCache(
349 conf.getInt(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_KEY,
350 DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_DEFAULT),
351 conf.getLong(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS_KEY,
352 DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS_DEFAULT),
353 conf.getInt(DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_SIZE,
354 DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_SIZE_DEFAULT),
355 conf.getLong(DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS,
356 DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS_DEFAULT),
357 conf.getLong(DFSConfigKeys.DFS_CLIENT_MMAP_RETRY_TIMEOUT_MS,
358 DFSConfigKeys.DFS_CLIENT_MMAP_RETRY_TIMEOUT_MS_DEFAULT),
359 conf.getLong(DFSConfigKeys.DFS_CLIENT_SHORT_CIRCUIT_REPLICA_STALE_THRESHOLD_MS,
360 DFSConfigKeys.DFS_CLIENT_SHORT_CIRCUIT_REPLICA_STALE_THRESHOLD_MS_DEFAULT),
361 conf.getInt(DFSConfigKeys.DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS,
362 DFSConfigKeys.DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS_DEFAULT));
363 }
364
365 public ShortCircuitCache(int maxTotalSize, long maxNonMmappedEvictableLifespanMs,
366 int maxEvictableMmapedSize, long maxEvictableMmapedLifespanMs,
367 long mmapRetryTimeoutMs, long staleThresholdMs, int shmInterruptCheckMs) {
368 Preconditions.checkArgument(maxTotalSize >= 0);
369 this.maxTotalSize = maxTotalSize;
370 Preconditions.checkArgument(maxNonMmappedEvictableLifespanMs >= 0);
371 this.maxNonMmappedEvictableLifespanMs = maxNonMmappedEvictableLifespanMs;
372 Preconditions.checkArgument(maxEvictableMmapedSize >= 0);
373 this.maxEvictableMmapedSize = maxEvictableMmapedSize;
374 Preconditions.checkArgument(maxEvictableMmapedLifespanMs >= 0);
375 this.maxEvictableMmapedLifespanMs = maxEvictableMmapedLifespanMs;
376 this.mmapRetryTimeoutMs = mmapRetryTimeoutMs;
377 this.staleThresholdMs = staleThresholdMs;
378 DfsClientShmManager shmManager = null;
379 if ((shmInterruptCheckMs > 0) &&
380 (DomainSocketWatcher.getLoadingFailureReason() == null)) {
381 try {
382 shmManager = new DfsClientShmManager(shmInterruptCheckMs);
383 } catch (IOException e) {
384 LOG.error("failed to create ShortCircuitShmManager", e);
385 }
386 }
387 this.shmManager = shmManager;
388 }
389
390 public long getMmapRetryTimeoutMs() {
391 return mmapRetryTimeoutMs;
392 }
393
394 public long getStaleThresholdMs() {
395 return staleThresholdMs;
396 }
397
398 /**
399 * Increment the reference count of a replica, and remove it from any free
400 * list it may be in.
401 *
402 * You must hold the cache lock while calling this function.
403 *
404 * @param replica The replica we're removing.
405 */
406 private void ref(ShortCircuitReplica replica) {
407 lock.lock();
408 try {
409 Preconditions.checkArgument(replica.refCount > 0,
410 "can't ref " + replica + " because its refCount reached " +
411 replica.refCount);
412 Long evictableTimeNs = replica.getEvictableTimeNs();
413 replica.refCount++;
414 if (evictableTimeNs != null) {
415 String removedFrom = removeEvictable(replica);
416 if (LOG.isTraceEnabled()) {
417 LOG.trace(this + ": " + removedFrom +
418 " no longer contains " + replica + ". refCount " +
419 (replica.refCount - 1) + " -> " + replica.refCount +
420 StringUtils.getStackTrace(Thread.currentThread()));
421
422 }
423 } else if (LOG.isTraceEnabled()) {
424 LOG.trace(this + ": replica refCount " +
425 (replica.refCount - 1) + " -> " + replica.refCount +
426 StringUtils.getStackTrace(Thread.currentThread()));
427 }
428 } finally {
429 lock.unlock();
430 }
431 }
432
433 /**
434 * Unreference a replica.
435 *
436 * You must hold the cache lock while calling this function.
437 *
438 * @param replica The replica being unreferenced.
439 */
440 void unref(ShortCircuitReplica replica) {
441 lock.lock();
442 try {
443 // If the replica is stale, but we haven't purged it yet, let's do that.
444 // It would be a shame to evict a non-stale replica so that we could put
445 // a stale one into the cache.
446 if ((!replica.purged) && replica.isStale()) {
447 purge(replica);
448 }
449 String addedString = "";
450 boolean shouldTrimEvictionMaps = false;
451 int newRefCount = --replica.refCount;
452 if (newRefCount == 0) {
453 // Close replica, since there are no remaining references to it.
454 Preconditions.checkArgument(replica.purged,
455 "Replica " + replica + " reached a refCount of 0 without " +
456 "being purged");
457 replica.close();
458 } else if (newRefCount == 1) {
459 Preconditions.checkState(null == replica.getEvictableTimeNs(),
460 "Replica " + replica + " had a refCount higher than 1, " +
461 "but was still evictable (evictableTimeNs = " +
462 replica.getEvictableTimeNs() + ")");
463 if (!replica.purged) {
464 // Add the replica to the end of an eviction list.
465 // Eviction lists are sorted by time.
466 if (replica.hasMmap()) {
467 insertEvictable(System.nanoTime(), replica, evictableMmapped);
468 addedString = "added to evictableMmapped, ";
469 } else {
470 insertEvictable(System.nanoTime(), replica, evictable);
471 addedString = "added to evictable, ";
472 }
473 shouldTrimEvictionMaps = true;
474 }
475 } else {
476 Preconditions.checkArgument(replica.refCount >= 0,
477 "replica's refCount went negative (refCount = " +
478 replica.refCount + " for " + replica + ")");
479 }
480 if (LOG.isTraceEnabled()) {
481 LOG.trace(this + ": unref replica " + replica +
482 ": " + addedString + " refCount " +
483 (newRefCount + 1) + " -> " + newRefCount +
484 StringUtils.getStackTrace(Thread.currentThread()));
485 }
486 if (shouldTrimEvictionMaps) {
487 trimEvictionMaps();
488 }
489 } finally {
490 lock.unlock();
491 }
492 }
493
494 /**
495 * Demote old evictable mmaps into the regular eviction map.
496 *
497 * You must hold the cache lock while calling this function.
498 *
499 * @param now Current time in monotonic milliseconds.
500 * @return Number of replicas demoted.
501 */
502 private int demoteOldEvictableMmaped(long now) {
503 int numDemoted = 0;
504 boolean needMoreSpace = false;
505 Long evictionTimeNs = Long.valueOf(0);
506
507 while (true) {
508 Entry<Long, ShortCircuitReplica> entry =
509 evictableMmapped.ceilingEntry(evictionTimeNs);
510 if (entry == null) break;
511 evictionTimeNs = entry.getKey();
512 long evictionTimeMs =
513 TimeUnit.MILLISECONDS.convert(evictionTimeNs, TimeUnit.NANOSECONDS);
514 if (evictionTimeMs + maxEvictableMmapedLifespanMs >= now) {
515 if (evictableMmapped.size() < maxEvictableMmapedSize) {
516 break;
517 }
518 needMoreSpace = true;
519 }
520 ShortCircuitReplica replica = entry.getValue();
521 if (LOG.isTraceEnabled()) {
522 String rationale = needMoreSpace ? "because we need more space" :
523 "because it's too old";
524 LOG.trace("demoteOldEvictable: demoting " + replica + ": " +
525 rationale + ": " +
526 StringUtils.getStackTrace(Thread.currentThread()));
527 }
528 removeEvictable(replica, evictableMmapped);
529 munmap(replica);
530 insertEvictable(evictionTimeNs, replica, evictable);
531 numDemoted++;
532 }
533 return numDemoted;
534 }
535
536 /**
537 * Trim the eviction lists.
538 */
539 private void trimEvictionMaps() {
540 long now = Time.monotonicNow();
541 demoteOldEvictableMmaped(now);
542
543 while (true) {
544 long evictableSize = evictable.size();
545 long evictableMmappedSize = evictableMmapped.size();
546 if (evictableSize + evictableMmappedSize <= maxTotalSize) {
547 return;
548 }
549 ShortCircuitReplica replica;
550 if (evictableSize == 0) {
551 replica = evictableMmapped.firstEntry().getValue();
552 } else {
553 replica = evictable.firstEntry().getValue();
554 }
555 if (LOG.isTraceEnabled()) {
556 LOG.trace(this + ": trimEvictionMaps is purging " + replica +
557 StringUtils.getStackTrace(Thread.currentThread()));
558 }
559 purge(replica);
560 }
561 }
562
563 /**
564 * Munmap a replica, updating outstandingMmapCount.
565 *
566 * @param replica The replica to munmap.
567 */
568 private void munmap(ShortCircuitReplica replica) {
569 replica.munmap();
570 outstandingMmapCount--;
571 }
572
573 /**
574 * Remove a replica from an evictable map.
575 *
576 * @param replica The replica to remove.
577 * @return The map it was removed from.
578 */
579 private String removeEvictable(ShortCircuitReplica replica) {
580 if (replica.hasMmap()) {
581 removeEvictable(replica, evictableMmapped);
582 return "evictableMmapped";
583 } else {
584 removeEvictable(replica, evictable);
585 return "evictable";
586 }
587 }
588
589 /**
590 * Remove a replica from an evictable map.
591 *
592 * @param replica The replica to remove.
593 * @param map The map to remove it from.
594 */
595 private void removeEvictable(ShortCircuitReplica replica,
596 TreeMap<Long, ShortCircuitReplica> map) {
597 Long evictableTimeNs = replica.getEvictableTimeNs();
598 Preconditions.checkNotNull(evictableTimeNs);
599 ShortCircuitReplica removed = map.remove(evictableTimeNs);
600 Preconditions.checkState(removed == replica,
601 "failed to make " + replica + " unevictable");
602 replica.setEvictableTimeNs(null);
603 }
604
605 /**
606 * Insert a replica into an evictable map.
607 *
608 * If an element already exists with this eviction time, we add a nanosecond
609 * to it until we find an unused key.
610 *
611 * @param evictionTimeNs The eviction time in absolute nanoseconds.
612 * @param replica The replica to insert.
613 * @param map The map to insert it into.
614 */
615 private void insertEvictable(Long evictionTimeNs,
616 ShortCircuitReplica replica, TreeMap<Long, ShortCircuitReplica> map) {
617 while (map.containsKey(evictionTimeNs)) {
618 evictionTimeNs++;
619 }
620 Preconditions.checkState(null == replica.getEvictableTimeNs());
621 Long time = Long.valueOf(evictionTimeNs);
622 replica.setEvictableTimeNs(time);
623 map.put(time, replica);
624 }
625
626 /**
627 * Purge a replica from the cache.
628 *
629 * This doesn't necessarily close the replica, since there may be
630 * outstanding references to it. However, it does mean the cache won't
631 * hand it out to anyone after this.
632 *
633 * You must hold the cache lock while calling this function.
634 *
635 * @param replica The replica being removed.
636 */
637 private void purge(ShortCircuitReplica replica) {
638 boolean removedFromInfoMap = false;
639 String evictionMapName = null;
640 Preconditions.checkArgument(!replica.purged);
641 replica.purged = true;
642 Waitable<ShortCircuitReplicaInfo> val = replicaInfoMap.get(replica.key);
643 if (val != null) {
644 ShortCircuitReplicaInfo info = val.getVal();
645 if ((info != null) && (info.getReplica() == replica)) {
646 replicaInfoMap.remove(replica.key);
647 removedFromInfoMap = true;
648 }
649 }
650 Long evictableTimeNs = replica.getEvictableTimeNs();
651 if (evictableTimeNs != null) {
652 evictionMapName = removeEvictable(replica);
653 }
654 if (LOG.isTraceEnabled()) {
655 StringBuilder builder = new StringBuilder();
656 builder.append(this).append(": ").append(": purged ").
657 append(replica).append(" from the cache.");
658 if (removedFromInfoMap) {
659 builder.append(" Removed from the replicaInfoMap.");
660 }
661 if (evictionMapName != null) {
662 builder.append(" Removed from ").append(evictionMapName);
663 }
664 LOG.trace(builder.toString());
665 }
666 unref(replica);
667 }
668
669 /**
670 * Fetch or create a replica.
671 *
672 * You must hold the cache lock while calling this function.
673 *
674 * @param key Key to use for lookup.
675 * @param creator Replica creator callback. Will be called without
676 * the cache lock being held.
677 *
678 * @return Null if no replica could be found or created.
679 * The replica, otherwise.
680 */
681 public ShortCircuitReplicaInfo fetchOrCreate(ExtendedBlockId key,
682 ShortCircuitReplicaCreator creator) {
683 Waitable<ShortCircuitReplicaInfo> newWaitable = null;
684 lock.lock();
685 try {
686 ShortCircuitReplicaInfo info = null;
687 do {
688 if (closed) {
689 if (LOG.isTraceEnabled()) {
690 LOG.trace(this + ": can't fetchOrCreate " + key +
691 " because the cache is closed.");
692 }
693 return null;
694 }
695 Waitable<ShortCircuitReplicaInfo> waitable = replicaInfoMap.get(key);
696 if (waitable != null) {
697 try {
698 info = fetch(key, waitable);
699 } catch (RetriableException e) {
700 if (LOG.isDebugEnabled()) {
701 LOG.debug(this + ": retrying " + e.getMessage());
702 }
703 continue;
704 }
705 }
706 } while (false);
707 if (info != null) return info;
708 // We need to load the replica ourselves.
709 newWaitable = new Waitable<ShortCircuitReplicaInfo>(lock.newCondition());
710 replicaInfoMap.put(key, newWaitable);
711 } finally {
712 lock.unlock();
713 }
714 return create(key, creator, newWaitable);
715 }
716
717 /**
718 * Fetch an existing ReplicaInfo object.
719 *
720 * @param key The key that we're using.
721 * @param waitable The waitable object to wait on.
722 * @return The existing ReplicaInfo object, or null if there is
723 * none.
724 *
725 * @throws RetriableException If the caller needs to retry.
726 */
727 private ShortCircuitReplicaInfo fetch(ExtendedBlockId key,
728 Waitable<ShortCircuitReplicaInfo> waitable) throws RetriableException {
729 // Another thread is already in the process of loading this
730 // ShortCircuitReplica. So we simply wait for it to complete.
731 ShortCircuitReplicaInfo info;
732 try {
733 if (LOG.isTraceEnabled()) {
734 LOG.trace(this + ": found waitable for " + key);
735 }
736 info = waitable.await();
737 } catch (InterruptedException e) {
738 LOG.info(this + ": interrupted while waiting for " + key);
739 Thread.currentThread().interrupt();
740 throw new RetriableException("interrupted");
741 }
742 if (info.getInvalidTokenException() != null) {
743 LOG.warn(this + ": could not get " + key + " due to InvalidToken " +
744 "exception.", info.getInvalidTokenException());
745 return info;
746 }
747 ShortCircuitReplica replica = info.getReplica();
748 if (replica == null) {
749 LOG.warn(this + ": failed to get " + key);
750 return info;
751 }
752 if (replica.purged) {
753 // Ignore replicas that have already been purged from the cache.
754 throw new RetriableException("Ignoring purged replica " +
755 replica + ". Retrying.");
756 }
757 // Check if the replica is stale before using it.
758 // If it is, purge it and retry.
759 if (replica.isStale()) {
760 LOG.info(this + ": got stale replica " + replica + ". Removing " +
761 "this replica from the replicaInfoMap and retrying.");
762 // Remove the cache's reference to the replica. This may or may not
763 // trigger a close.
764 purge(replica);
765 throw new RetriableException("ignoring stale replica " + replica);
766 }
767 ref(replica);
768 return info;
769 }
770
771 private ShortCircuitReplicaInfo create(ExtendedBlockId key,
772 ShortCircuitReplicaCreator creator,
773 Waitable<ShortCircuitReplicaInfo> newWaitable) {
774 // Handle loading a new replica.
775 ShortCircuitReplicaInfo info = null;
776 try {
777 if (LOG.isTraceEnabled()) {
778 LOG.trace(this + ": loading " + key);
779 }
780 info = creator.createShortCircuitReplicaInfo();
781 } catch (RuntimeException e) {
782 LOG.warn(this + ": failed to load " + key, e);
783 }
784 if (info == null) info = new ShortCircuitReplicaInfo();
785 lock.lock();
786 try {
787 if (info.getReplica() != null) {
788 // On success, make sure the cache cleaner thread is running.
789 if (LOG.isTraceEnabled()) {
790 LOG.trace(this + ": successfully loaded " + info.getReplica());
791 }
792 startCacheCleanerThreadIfNeeded();
793 // Note: new ShortCircuitReplicas start with a refCount of 2,
794 // indicating that both this cache and whoever requested the
795 // creation of the replica hold a reference. So we don't need
796 // to increment the reference count here.
797 } else {
798 // On failure, remove the waitable from the replicaInfoMap.
799 Waitable<ShortCircuitReplicaInfo> waitableInMap = replicaInfoMap.get(key);
800 if (waitableInMap == newWaitable) replicaInfoMap.remove(key);
801 if (info.getInvalidTokenException() != null) {
802 LOG.warn(this + ": could not load " + key + " due to InvalidToken " +
803 "exception.", info.getInvalidTokenException());
804 } else {
805 LOG.warn(this + ": failed to load " + key);
806 }
807 }
808 newWaitable.provide(info);
809 } finally {
810 lock.unlock();
811 }
812 return info;
813 }
814
815 private void startCacheCleanerThreadIfNeeded() {
816 if (cacheCleaner == null) {
817 cacheCleaner = new CacheCleaner();
818 long rateMs = cacheCleaner.getRateInMs();
819 ScheduledFuture<?> future =
820 cleanerExecutor.scheduleAtFixedRate(cacheCleaner, rateMs, rateMs,
821 TimeUnit.MILLISECONDS);
822 cacheCleaner.setFuture(future);
823 if (LOG.isDebugEnabled()) {
824 LOG.debug(this + ": starting cache cleaner thread which will run " +
825 "every " + rateMs + " ms");
826 }
827 }
828 }
829
830 ClientMmap getOrCreateClientMmap(ShortCircuitReplica replica,
831 boolean anchored) {
832 Condition newCond;
833 lock.lock();
834 try {
835 while (replica.mmapData != null) {
836 if (replica.mmapData instanceof MappedByteBuffer) {
837 ref(replica);
838 MappedByteBuffer mmap = (MappedByteBuffer)replica.mmapData;
839 return new ClientMmap(replica, mmap, anchored);
840 } else if (replica.mmapData instanceof Long) {
841 long lastAttemptTimeMs = (Long)replica.mmapData;
842 long delta = Time.monotonicNow() - lastAttemptTimeMs;
843 if (delta < staleThresholdMs) {
844 if (LOG.isTraceEnabled()) {
845 LOG.trace(this + ": can't create client mmap for " +
846 replica + " because we failed to " +
847 "create one just " + delta + "ms ago.");
848 }
849 return null;
850 }
851 if (LOG.isTraceEnabled()) {
852 LOG.trace(this + ": retrying client mmap for " + replica +
853 ", " + delta + " ms after the previous failure.");
854 }
855 } else if (replica.mmapData instanceof Condition) {
856 Condition cond = (Condition)replica.mmapData;
857 cond.awaitUninterruptibly();
858 } else {
859 Preconditions.checkState(false, "invalid mmapData type " +
860 replica.mmapData.getClass().getName());
861 }
862 }
863 newCond = lock.newCondition();
864 replica.mmapData = newCond;
865 } finally {
866 lock.unlock();
867 }
868 MappedByteBuffer map = replica.loadMmapInternal();
869 lock.lock();
870 try {
871 if (map == null) {
872 replica.mmapData = Long.valueOf(Time.monotonicNow());
873 newCond.signalAll();
874 return null;
875 } else {
876 outstandingMmapCount++;
877 replica.mmapData = map;
878 ref(replica);
879 newCond.signalAll();
880 return new ClientMmap(replica, map, anchored);
881 }
882 } finally {
883 lock.unlock();
884 }
885 }
886
887 /**
888 * Close the cache and free all associated resources.
889 */
890 @Override
891 public void close() {
892 try {
893 lock.lock();
894 if (closed) return;
895 closed = true;
896 LOG.info(this + ": closing");
897 maxNonMmappedEvictableLifespanMs = 0;
898 maxEvictableMmapedSize = 0;
899 // Close and join cacheCleaner thread.
900 IOUtils.cleanup(LOG, cacheCleaner);
901 // Purge all replicas.
902 while (true) {
903 Entry<Long, ShortCircuitReplica> entry = evictable.firstEntry();
904 if (entry == null) break;
905 purge(entry.getValue());
906 }
907 while (true) {
908 Entry<Long, ShortCircuitReplica> entry = evictableMmapped.firstEntry();
909 if (entry == null) break;
910 purge(entry.getValue());
911 }
912 } finally {
913 lock.unlock();
914 }
915 IOUtils.cleanup(LOG, shmManager);
916 }
917
918 @VisibleForTesting // ONLY for testing
919 public interface CacheVisitor {
920 void visit(int numOutstandingMmaps,
921 Map<ExtendedBlockId, ShortCircuitReplica> replicas,
922 Map<ExtendedBlockId, InvalidToken> failedLoads,
923 Map<Long, ShortCircuitReplica> evictable,
924 Map<Long, ShortCircuitReplica> evictableMmapped);
925 }
926
927 @VisibleForTesting // ONLY for testing
928 public void accept(CacheVisitor visitor) {
929 lock.lock();
930 try {
931 Map<ExtendedBlockId, ShortCircuitReplica> replicas =
932 new HashMap<ExtendedBlockId, ShortCircuitReplica>();
933 Map<ExtendedBlockId, InvalidToken> failedLoads =
934 new HashMap<ExtendedBlockId, InvalidToken>();
935 for (Entry<ExtendedBlockId, Waitable<ShortCircuitReplicaInfo>> entry :
936 replicaInfoMap.entrySet()) {
937 Waitable<ShortCircuitReplicaInfo> waitable = entry.getValue();
938 if (waitable.hasVal()) {
939 if (waitable.getVal().getReplica() != null) {
940 replicas.put(entry.getKey(), waitable.getVal().getReplica());
941 } else {
942 // The exception may be null here, indicating a failed load that
943 // isn't the result of an invalid block token.
944 failedLoads.put(entry.getKey(),
945 waitable.getVal().getInvalidTokenException());
946 }
947 }
948 }
949 if (LOG.isDebugEnabled()) {
950 StringBuilder builder = new StringBuilder();
951 builder.append("visiting ").append(visitor.getClass().getName()).
952 append("with outstandingMmapCount=").append(outstandingMmapCount).
953 append(", replicas=");
954 String prefix = "";
955 for (Entry<ExtendedBlockId, ShortCircuitReplica> entry : replicas.entrySet()) {
956 builder.append(prefix).append(entry.getValue());
957 prefix = ",";
958 }
959 prefix = "";
960 builder.append(", failedLoads=");
961 for (Entry<ExtendedBlockId, InvalidToken> entry : failedLoads.entrySet()) {
962 builder.append(prefix).append(entry.getValue());
963 prefix = ",";
964 }
965 prefix = "";
966 builder.append(", evictable=");
967 for (Entry<Long, ShortCircuitReplica> entry : evictable.entrySet()) {
968 builder.append(prefix).append(entry.getKey()).
969 append(":").append(entry.getValue());
970 prefix = ",";
971 }
972 prefix = "";
973 builder.append(", evictableMmapped=");
974 for (Entry<Long, ShortCircuitReplica> entry : evictableMmapped.entrySet()) {
975 builder.append(prefix).append(entry.getKey()).
976 append(":").append(entry.getValue());
977 prefix = ",";
978 }
979 LOG.debug(builder.toString());
980 }
981 visitor.visit(outstandingMmapCount, replicas, failedLoads,
982 evictable, evictableMmapped);
983 } finally {
984 lock.unlock();
985 }
986 }
987
988 @Override
989 public String toString() {
990 return "ShortCircuitCache(0x" +
991 Integer.toHexString(System.identityHashCode(this)) + ")";
992 }
993
994 /**
995 * Allocate a new shared memory slot.
996 *
997 * @param datanode The datanode to allocate a shm slot with.
998 * @param peer A peer connected to the datanode.
999 * @param usedPeer Will be set to true if we use up the provided peer.
1000 * @param blockId The block id and block pool id of the block we're
1001 * allocating this slot for.
1002 * @param clientName The name of the DFSClient allocating the shared
1003 * memory.
1004 * @return Null if short-circuit shared memory is disabled;
1005 * a short-circuit memory slot otherwise.
1006 * @throws IOException An exception if there was an error talking to
1007 * the datanode.
1008 */
1009 public Slot allocShmSlot(DatanodeInfo datanode,
1010 DomainPeer peer, MutableBoolean usedPeer,
1011 ExtendedBlockId blockId, String clientName) throws IOException {
1012 if (shmManager != null) {
1013 return shmManager.allocSlot(datanode, peer, usedPeer,
1014 blockId, clientName);
1015 } else {
1016 return null;
1017 }
1018 }
1019
1020 /**
1021 * Free a slot immediately.
1022 *
1023 * ONLY use this if the DataNode is not yet aware of the slot.
1024 *
1025 * @param slot The slot to free.
1026 */
1027 public void freeSlot(Slot slot) {
1028 Preconditions.checkState(shmManager != null);
1029 slot.makeInvalid();
1030 shmManager.freeSlot(slot);
1031 }
1032
1033 /**
1034 * Schedule a shared memory slot to be released.
1035 *
1036 * @param slot The slot to release.
1037 */
1038 public void scheduleSlotReleaser(Slot slot) {
1039 Preconditions.checkState(shmManager != null);
1040 releaserExecutor.execute(new SlotReleaser(slot));
1041 }
1042
1043 @VisibleForTesting
1044 public DfsClientShmManager getDfsClientShmManager() {
1045 return shmManager;
1046 }
1047 }