001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hdfs.shortcircuit;
019
020import java.io.BufferedOutputStream;
021import java.io.Closeable;
022import java.io.DataInputStream;
023import java.io.DataOutputStream;
024import java.io.IOException;
025import java.nio.MappedByteBuffer;
026import java.util.HashMap;
027import java.util.Map;
028import java.util.Map.Entry;
029import java.util.NoSuchElementException;
030import java.util.concurrent.ScheduledFuture;
031import java.util.concurrent.ScheduledThreadPoolExecutor;
032import java.util.concurrent.TimeUnit;
033import java.util.concurrent.locks.Condition;
034import java.util.concurrent.locks.ReentrantLock;
035
036import org.apache.commons.collections.map.LinkedMap;
037import org.apache.commons.lang.mutable.MutableBoolean;
038import org.apache.hadoop.classification.InterfaceAudience;
039import org.apache.hadoop.hdfs.ExtendedBlockId;
040import org.apache.hadoop.hdfs.client.impl.DfsClientConf.ShortCircuitConf;
041import org.apache.hadoop.hdfs.net.DomainPeer;
042import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
043import org.apache.hadoop.hdfs.protocol.datatransfer.Sender;
044import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.ReleaseShortCircuitAccessResponseProto;
045import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.Status;
046import org.apache.hadoop.hdfs.protocolPB.PBHelperClient;
047import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm.Slot;
048import org.apache.hadoop.hdfs.util.IOUtilsClient;
049import org.apache.hadoop.ipc.RetriableException;
050import org.apache.hadoop.net.unix.DomainSocket;
051import org.apache.hadoop.net.unix.DomainSocketWatcher;
052import org.apache.hadoop.security.token.SecretManager.InvalidToken;
053import org.apache.hadoop.util.StringUtils;
054import org.apache.hadoop.util.Time;
055import org.apache.hadoop.util.Waitable;
056
057import com.google.common.annotations.VisibleForTesting;
058import com.google.common.base.Preconditions;
059import com.google.common.util.concurrent.ThreadFactoryBuilder;
060
061import org.slf4j.Logger;
062import org.slf4j.LoggerFactory;
063
064/**
065 * The ShortCircuitCache tracks things which the client needs to access
066 * HDFS block files via short-circuit.
067 *
068 * These things include: memory-mapped regions, file descriptors, and shared
069 * memory areas for communicating with the DataNode.
070 */
071@InterfaceAudience.Private
072public class ShortCircuitCache implements Closeable {
  /**
   * Logger for this class.  It is also used by the nested CacheCleaner and
   * SlotReleaser classes.
   */
  public static final Logger LOG = LoggerFactory.getLogger(
      ShortCircuitCache.class);
075
076  /**
077   * Expiry thread which makes sure that the file descriptors get closed
078   * after a while.
079   */
080  private class CacheCleaner implements Runnable, Closeable {
081    private ScheduledFuture<?> future;
082
083    /**
084     * Run the CacheCleaner thread.
085     *
086     * Whenever a thread requests a ShortCircuitReplica object, we will make
087     * sure it gets one.  That ShortCircuitReplica object can then be re-used
088     * when another thread requests a ShortCircuitReplica object for the same
089     * block.  So in that sense, there is no maximum size to the cache.
090     *
091     * However, when a ShortCircuitReplica object is unreferenced by the
092     * thread(s) that are using it, it becomes evictable.  There are two
093     * separate eviction lists-- one for mmaped objects, and another for
094     * non-mmaped objects.  We do this in order to avoid having the regular
095     * files kick the mmaped files out of the cache too quickly.  Reusing
096     * an already-existing mmap gives a huge performance boost, since the
097     * page table entries don't have to be re-populated.  Both the mmap
098     * and non-mmap evictable lists have maximum sizes and maximum lifespans.
099     */
100    @Override
101    public void run() {
102      ShortCircuitCache.this.lock.lock();
103      try {
104        if (ShortCircuitCache.this.closed) return;
105        long curMs = Time.monotonicNow();
106
107        LOG.debug("{}: cache cleaner running at {}", this, curMs);
108
109        int numDemoted = demoteOldEvictableMmaped(curMs);
110        int numPurged = 0;
111        Long evictionTimeNs;
112        while (true) {
113          Object eldestKey;
114          try {
115            eldestKey = evictable.firstKey();
116          } catch (NoSuchElementException e) {
117            break;
118          }
119          evictionTimeNs = (Long)eldestKey;
120          long evictionTimeMs =
121              TimeUnit.MILLISECONDS.convert(evictionTimeNs, TimeUnit.NANOSECONDS);
122          if (evictionTimeMs + maxNonMmappedEvictableLifespanMs >= curMs) break;
123          ShortCircuitReplica replica = (ShortCircuitReplica)evictable.get(
124              eldestKey);
125          if (LOG.isTraceEnabled()) {
126            LOG.trace("CacheCleaner: purging " + replica + ": " +
127                StringUtils.getStackTrace(Thread.currentThread()));
128          }
129          purge(replica);
130          numPurged++;
131        }
132
133        LOG.debug("{}: finishing cache cleaner run started at {}. Demoted {} "
134                + "mmapped replicas; purged {} replicas.",
135            this, curMs, numDemoted, numPurged);
136      } finally {
137        ShortCircuitCache.this.lock.unlock();
138      }
139    }
140
141    @Override
142    public void close() throws IOException {
143      if (future != null) {
144        future.cancel(false);
145      }
146    }
147
148    public void setFuture(ScheduledFuture<?> future) {
149      this.future = future;
150    }
151
152    /**
153     * Get the rate at which this cleaner thread should be scheduled.
154     *
155     * We do this by taking the minimum expiration time and dividing by 4.
156     *
157     * @return the rate in milliseconds at which this thread should be
158     *         scheduled.
159     */
160    public long getRateInMs() {
161      long minLifespanMs =
162          Math.min(maxNonMmappedEvictableLifespanMs,
163              maxEvictableMmapedLifespanMs);
164      long sampleTimeMs = minLifespanMs / 4;
165      return (sampleTimeMs < 1) ? 1 : sampleTimeMs;
166    }
167  }
168
169  /**
170   * A task which asks the DataNode to release a short-circuit shared memory
171   * slot.  If successful, this will tell the DataNode to stop monitoring
172   * changes to the mlock status of the replica associated with the slot.
173   * It will also allow us (the client) to re-use this slot for another
174   * replica.  If we can't communicate with the DataNode for some reason,
175   * we tear down the shared memory segment to avoid being in an inconsistent
176   * state.
177   */
178  private class SlotReleaser implements Runnable {
179    /**
180     * The slot that we need to release.
181     */
182    private final Slot slot;
183
184    SlotReleaser(Slot slot) {
185      this.slot = slot;
186    }
187
188    @Override
189    public void run() {
190      LOG.trace("{}: about to release {}", ShortCircuitCache.this, slot);
191      final DfsClientShm shm = (DfsClientShm)slot.getShm();
192      final DomainSocket shmSock = shm.getPeer().getDomainSocket();
193      final String path = shmSock.getPath();
194      boolean success = false;
195      try (DomainSocket sock = DomainSocket.connect(path);
196           DataOutputStream out = new DataOutputStream(
197               new BufferedOutputStream(sock.getOutputStream()))) {
198        new Sender(out).releaseShortCircuitFds(slot.getSlotId());
199        DataInputStream in = new DataInputStream(sock.getInputStream());
200        ReleaseShortCircuitAccessResponseProto resp =
201            ReleaseShortCircuitAccessResponseProto.parseFrom(
202                PBHelperClient.vintPrefixed(in));
203        if (resp.getStatus() != Status.SUCCESS) {
204          String error = resp.hasError() ? resp.getError() : "(unknown)";
205          throw new IOException(resp.getStatus().toString() + ": " + error);
206        }
207        LOG.trace("{}: released {}", this, slot);
208        success = true;
209      } catch (IOException e) {
210        LOG.error(ShortCircuitCache.this + ": failed to release " +
211            "short-circuit shared memory slot " + slot + " by sending " +
212            "ReleaseShortCircuitAccessRequestProto to " + path +
213            ".  Closing shared memory segment.", e);
214      } finally {
215        if (success) {
216          shmManager.freeSlot(slot);
217        } else {
218          shm.getEndpointShmManager().shutdown(shm);
219        }
220      }
221    }
222  }
223
  /**
   * Callback supplied to fetchOrCreate which loads a replica that the cache
   * cannot serve from an existing entry.
   */
  public interface ShortCircuitReplicaCreator {
    /**
     * Attempt to create a ShortCircuitReplica object.
     *
     * This callback will be made without holding any locks.
     *
     * @return a non-null ShortCircuitReplicaInfo object.  (The cache's
     *         create() method substitutes an empty info object if null is
     *         returned or a RuntimeException is thrown.)
     */
    ShortCircuitReplicaInfo createShortCircuitReplicaInfo();
  }
234
  /**
   * Lock protecting the cache.
   */
  private final ReentrantLock lock = new ReentrantLock();

  /**
   * The executor service that runs the cacheCleaner.
   */
  private final ScheduledThreadPoolExecutor cleanerExecutor
      = new ScheduledThreadPoolExecutor(1, new ThreadFactoryBuilder().
      setDaemon(true).setNameFormat("ShortCircuitCache_Cleaner").
      build());

  /**
   * The executor service for slot-release work.  Its single daemon thread is
   * named ShortCircuitCache_SlotReleaser.
   * NOTE(review): presumably this runs SlotReleaser tasks; the code that
   * schedules them is not in this part of the file -- confirm.
   */
  private final ScheduledThreadPoolExecutor releaserExecutor
      = new ScheduledThreadPoolExecutor(1, new ThreadFactoryBuilder().
      setDaemon(true).setNameFormat("ShortCircuitCache_SlotReleaser").
      build());

  /**
   * A map containing all ShortCircuitReplicaInfo objects, organized by Key.
   * ShortCircuitReplicaInfo objects may contain a replica, or an InvalidToken
   * exception.
   */
  private final HashMap<ExtendedBlockId, Waitable<ShortCircuitReplicaInfo>>
      replicaInfoMap = new HashMap<>();

  /**
   * The CacheCleaner.  We don't create this and schedule it until it becomes
   * necessary.
   */
  private CacheCleaner cacheCleaner;

  /**
   * LinkedMap of evictable elements.
   *
   * Maps (unique) insertion time in nanoseconds to the element.
   */
  private final LinkedMap evictable = new LinkedMap();

  /**
   * Maximum combined number of entries in the two evictable maps, counting
   * both mmapped and non-mmapped elements.  Enforced by trimEvictionMaps.
   */
  private final int maxTotalSize;

  /**
   * Non-mmaped elements older than this will be closed.
   * Not final: close() sets it to 0.
   */
  private long maxNonMmappedEvictableLifespanMs;

  /**
   * LinkedMap of mmaped evictable elements.
   *
   * Maps (unique) insertion time in nanoseconds to the element.
   */
  private final LinkedMap evictableMmapped = new LinkedMap();

  /**
   * Maximum number of mmaped evictable elements.
   * Not final: close() sets it to 0.
   */
  private int maxEvictableMmapedSize;

  /**
   * Mmaped elements older than this will be closed.
   */
  private final long maxEvictableMmapedLifespanMs;

  /**
   * The minimum number of milliseconds we'll wait after an unsuccessful
   * mmap attempt before trying again.
   */
  private final long mmapRetryTimeoutMs;

  /**
   * How long we will keep replicas in the cache before declaring them
   * to be stale.
   */
  private final long staleThresholdMs;

  /**
   * True if the ShortCircuitCache is closed.
   */
  private boolean closed = false;

  /**
   * Number of existing mmaps associated with this cache.
   */
  private int outstandingMmapCount = 0;

  /**
   * Manages short-circuit shared memory segments for the client.
   * May be null: the constructor leaves it unset when shared memory is
   * disabled, unavailable, or the manager could not be created.
   */
  private final DfsClientShmManager shmManager;
331
332  public static ShortCircuitCache fromConf(ShortCircuitConf conf) {
333    return new ShortCircuitCache(
334        conf.getShortCircuitStreamsCacheSize(),
335        conf.getShortCircuitStreamsCacheExpiryMs(),
336        conf.getShortCircuitMmapCacheSize(),
337        conf.getShortCircuitMmapCacheExpiryMs(),
338        conf.getShortCircuitMmapCacheRetryTimeout(),
339        conf.getShortCircuitCacheStaleThresholdMs(),
340        conf.getShortCircuitSharedMemoryWatcherInterruptCheckMs());
341  }
342
343  public ShortCircuitCache(int maxTotalSize, long maxNonMmappedEvictableLifespanMs,
344      int maxEvictableMmapedSize, long maxEvictableMmapedLifespanMs,
345      long mmapRetryTimeoutMs, long staleThresholdMs, int shmInterruptCheckMs) {
346    Preconditions.checkArgument(maxTotalSize >= 0);
347    this.maxTotalSize = maxTotalSize;
348    Preconditions.checkArgument(maxNonMmappedEvictableLifespanMs >= 0);
349    this.maxNonMmappedEvictableLifespanMs = maxNonMmappedEvictableLifespanMs;
350    Preconditions.checkArgument(maxEvictableMmapedSize >= 0);
351    this.maxEvictableMmapedSize = maxEvictableMmapedSize;
352    Preconditions.checkArgument(maxEvictableMmapedLifespanMs >= 0);
353    this.maxEvictableMmapedLifespanMs = maxEvictableMmapedLifespanMs;
354    this.mmapRetryTimeoutMs = mmapRetryTimeoutMs;
355    this.staleThresholdMs = staleThresholdMs;
356    DfsClientShmManager shmManager = null;
357    if ((shmInterruptCheckMs > 0) &&
358        (DomainSocketWatcher.getLoadingFailureReason() == null)) {
359      try {
360        shmManager = new DfsClientShmManager(shmInterruptCheckMs);
361      } catch (IOException e) {
362        LOG.error("failed to create ShortCircuitShmManager", e);
363      }
364    }
365    this.shmManager = shmManager;
366  }
367
368  public long getStaleThresholdMs() {
369    return staleThresholdMs;
370  }
371
372  /**
373   * Increment the reference count of a replica, and remove it from any free
374   * list it may be in.
375   *
376   * You must hold the cache lock while calling this function.
377   *
378   * @param replica      The replica we're removing.
379   */
380  private void ref(ShortCircuitReplica replica) {
381    lock.lock();
382    try {
383      Preconditions.checkArgument(replica.refCount > 0,
384          "can't ref %s because its refCount reached %d", replica,
385          replica.refCount);
386      Long evictableTimeNs = replica.getEvictableTimeNs();
387      replica.refCount++;
388      if (evictableTimeNs != null) {
389        String removedFrom = removeEvictable(replica);
390        if (LOG.isTraceEnabled()) {
391          LOG.trace(this + ": " + removedFrom +
392              " no longer contains " + replica + ".  refCount " +
393              (replica.refCount - 1) + " -> " + replica.refCount +
394              StringUtils.getStackTrace(Thread.currentThread()));
395
396        }
397      } else if (LOG.isTraceEnabled()) {
398        LOG.trace(this + ": replica  refCount " +
399            (replica.refCount - 1) + " -> " + replica.refCount +
400            StringUtils.getStackTrace(Thread.currentThread()));
401      }
402    } finally {
403      lock.unlock();
404    }
405  }
406
407  /**
408   * Unreference a replica.
409   *
410   * You must hold the cache lock while calling this function.
411   *
412   * @param replica   The replica being unreferenced.
413   */
414  void unref(ShortCircuitReplica replica) {
415    lock.lock();
416    try {
417      // If the replica is stale or unusable, but we haven't purged it yet,
418      // let's do that.  It would be a shame to evict a non-stale replica so
419      // that we could put a stale or unusable one into the cache.
420      if (!replica.purged) {
421        String purgeReason = null;
422        if (!replica.getDataStream().getChannel().isOpen()) {
423          purgeReason = "purging replica because its data channel is closed.";
424        } else if (!replica.getMetaStream().getChannel().isOpen()) {
425          purgeReason = "purging replica because its meta channel is closed.";
426        } else if (replica.isStale()) {
427          purgeReason = "purging replica because it is stale.";
428        }
429        if (purgeReason != null) {
430          LOG.debug("{}: {}", this, purgeReason);
431          purge(replica);
432        }
433      }
434      String addedString = "";
435      boolean shouldTrimEvictionMaps = false;
436      int newRefCount = --replica.refCount;
437      if (newRefCount == 0) {
438        // Close replica, since there are no remaining references to it.
439        Preconditions.checkArgument(replica.purged,
440            "Replica %s reached a refCount of 0 without being purged", replica);
441        replica.close();
442      } else if (newRefCount == 1) {
443        Preconditions.checkState(null == replica.getEvictableTimeNs(),
444            "Replica %s had a refCount higher than 1, " +
445                "but was still evictable (evictableTimeNs = %d)",
446            replica, replica.getEvictableTimeNs());
447        if (!replica.purged) {
448          // Add the replica to the end of an eviction list.
449          // Eviction lists are sorted by time.
450          if (replica.hasMmap()) {
451            insertEvictable(System.nanoTime(), replica, evictableMmapped);
452            addedString = "added to evictableMmapped, ";
453          } else {
454            insertEvictable(System.nanoTime(), replica, evictable);
455            addedString = "added to evictable, ";
456          }
457          shouldTrimEvictionMaps = true;
458        }
459      } else {
460        Preconditions.checkArgument(replica.refCount >= 0,
461            "replica's refCount went negative (refCount = %d" +
462                " for %s)", replica.refCount, replica);
463      }
464      if (LOG.isTraceEnabled()) {
465        LOG.trace(this + ": unref replica " + replica +
466            ": " + addedString + " refCount " +
467            (newRefCount + 1) + " -> " + newRefCount +
468            StringUtils.getStackTrace(Thread.currentThread()));
469      }
470      if (shouldTrimEvictionMaps) {
471        trimEvictionMaps();
472      }
473    } finally {
474      lock.unlock();
475    }
476  }
477
478  /**
479   * Demote old evictable mmaps into the regular eviction map.
480   *
481   * You must hold the cache lock while calling this function.
482   *
483   * @param now   Current time in monotonic milliseconds.
484   * @return      Number of replicas demoted.
485   */
486  private int demoteOldEvictableMmaped(long now) {
487    int numDemoted = 0;
488    boolean needMoreSpace = false;
489    Long evictionTimeNs;
490
491    while (true) {
492      Object eldestKey;
493      try {
494        eldestKey = evictableMmapped.firstKey();
495      } catch (NoSuchElementException e) {
496        break;
497      }
498      evictionTimeNs = (Long)eldestKey;
499      long evictionTimeMs =
500          TimeUnit.MILLISECONDS.convert(evictionTimeNs, TimeUnit.NANOSECONDS);
501      if (evictionTimeMs + maxEvictableMmapedLifespanMs >= now) {
502        if (evictableMmapped.size() < maxEvictableMmapedSize) {
503          break;
504        }
505        needMoreSpace = true;
506      }
507      ShortCircuitReplica replica = (ShortCircuitReplica)evictableMmapped.get(
508          eldestKey);
509      if (LOG.isTraceEnabled()) {
510        String rationale = needMoreSpace ? "because we need more space" :
511            "because it's too old";
512        LOG.trace("demoteOldEvictable: demoting " + replica + ": " +
513            rationale + ": " +
514            StringUtils.getStackTrace(Thread.currentThread()));
515      }
516      removeEvictable(replica, evictableMmapped);
517      munmap(replica);
518      insertEvictable(evictionTimeNs, replica, evictable);
519      numDemoted++;
520    }
521    return numDemoted;
522  }
523
524  /**
525   * Trim the eviction lists.
526   */
527  private void trimEvictionMaps() {
528    long now = Time.monotonicNow();
529    demoteOldEvictableMmaped(now);
530
531    while (true) {
532      long evictableSize = evictable.size();
533      long evictableMmappedSize = evictableMmapped.size();
534      if (evictableSize + evictableMmappedSize <= maxTotalSize) {
535        return;
536      }
537      ShortCircuitReplica replica;
538      try {
539        if (evictableSize == 0) {
540          replica = (ShortCircuitReplica)evictableMmapped.get(evictableMmapped
541              .firstKey());
542        } else {
543          replica = (ShortCircuitReplica)evictable.get(evictable.firstKey());
544        }
545      } catch (NoSuchElementException e) {
546        break;
547      }
548      if (LOG.isTraceEnabled()) {
549        LOG.trace(this + ": trimEvictionMaps is purging " + replica +
550            StringUtils.getStackTrace(Thread.currentThread()));
551      }
552      purge(replica);
553    }
554  }
555
556  /**
557   * Munmap a replica, updating outstandingMmapCount.
558   *
559   * @param replica  The replica to munmap.
560   */
561  private void munmap(ShortCircuitReplica replica) {
562    replica.munmap();
563    outstandingMmapCount--;
564  }
565
566  /**
567   * Remove a replica from an evictable map.
568   *
569   * @param replica   The replica to remove.
570   * @return          The map it was removed from.
571   */
572  private String removeEvictable(ShortCircuitReplica replica) {
573    if (replica.hasMmap()) {
574      removeEvictable(replica, evictableMmapped);
575      return "evictableMmapped";
576    } else {
577      removeEvictable(replica, evictable);
578      return "evictable";
579    }
580  }
581
582  /**
583   * Remove a replica from an evictable map.
584   *
585   * @param replica   The replica to remove.
586   * @param map       The map to remove it from.
587   */
588  private void removeEvictable(ShortCircuitReplica replica,
589      LinkedMap map) {
590    Long evictableTimeNs = replica.getEvictableTimeNs();
591    Preconditions.checkNotNull(evictableTimeNs);
592    ShortCircuitReplica removed = (ShortCircuitReplica)map.remove(
593        evictableTimeNs);
594    Preconditions.checkState(removed == replica,
595        "failed to make %s unevictable", replica);
596    replica.setEvictableTimeNs(null);
597  }
598
599  /**
600   * Insert a replica into an evictable map.
601   *
602   * If an element already exists with this eviction time, we add a nanosecond
603   * to it until we find an unused key.
604   *
605   * @param evictionTimeNs   The eviction time in absolute nanoseconds.
606   * @param replica          The replica to insert.
607   * @param map              The map to insert it into.
608   */
609  private void insertEvictable(Long evictionTimeNs,
610      ShortCircuitReplica replica, LinkedMap map) {
611    while (map.containsKey(evictionTimeNs)) {
612      evictionTimeNs++;
613    }
614    Preconditions.checkState(null == replica.getEvictableTimeNs());
615    replica.setEvictableTimeNs(evictionTimeNs);
616    map.put(evictionTimeNs, replica);
617  }
618
619  /**
620   * Purge a replica from the cache.
621   *
622   * This doesn't necessarily close the replica, since there may be
623   * outstanding references to it.  However, it does mean the cache won't
624   * hand it out to anyone after this.
625   *
626   * You must hold the cache lock while calling this function.
627   *
628   * @param replica   The replica being removed.
629   */
630  private void purge(ShortCircuitReplica replica) {
631    boolean removedFromInfoMap = false;
632    String evictionMapName = null;
633    Preconditions.checkArgument(!replica.purged);
634    replica.purged = true;
635    Waitable<ShortCircuitReplicaInfo> val = replicaInfoMap.get(replica.key);
636    if (val != null) {
637      ShortCircuitReplicaInfo info = val.getVal();
638      if ((info != null) && (info.getReplica() == replica)) {
639        replicaInfoMap.remove(replica.key);
640        removedFromInfoMap = true;
641      }
642    }
643    Long evictableTimeNs = replica.getEvictableTimeNs();
644    if (evictableTimeNs != null) {
645      evictionMapName = removeEvictable(replica);
646    }
647    if (LOG.isTraceEnabled()) {
648      StringBuilder builder = new StringBuilder();
649      builder.append(this).append(": ").append(": purged ").
650          append(replica).append(" from the cache.");
651      if (removedFromInfoMap) {
652        builder.append("  Removed from the replicaInfoMap.");
653      }
654      if (evictionMapName != null) {
655        builder.append("  Removed from ").append(evictionMapName);
656      }
657      LOG.trace(builder.toString());
658    }
659    unref(replica);
660  }
661
662  /**
663   * Fetch or create a replica.
664   *
665   * You must hold the cache lock while calling this function.
666   *
667   * @param key          Key to use for lookup.
668   * @param creator      Replica creator callback.  Will be called without
669   *                     the cache lock being held.
670   *
671   * @return             Null if no replica could be found or created.
672   *                     The replica, otherwise.
673   */
674  public ShortCircuitReplicaInfo fetchOrCreate(ExtendedBlockId key,
675      ShortCircuitReplicaCreator creator) {
676    Waitable<ShortCircuitReplicaInfo> newWaitable = null;
677    lock.lock();
678    try {
679      ShortCircuitReplicaInfo info = null;
680      do {
681        if (closed) {
682          LOG.trace("{}: can't fethchOrCreate {} because the cache is closed.",
683              this, key);
684          return null;
685        }
686        Waitable<ShortCircuitReplicaInfo> waitable = replicaInfoMap.get(key);
687        if (waitable != null) {
688          try {
689            info = fetch(key, waitable);
690          } catch (RetriableException e) {
691            LOG.debug("{}: retrying {}", this, e.getMessage());
692          }
693        }
694      } while (false);
695      if (info != null) return info;
696      // We need to load the replica ourselves.
697      newWaitable = new Waitable<>(lock.newCondition());
698      replicaInfoMap.put(key, newWaitable);
699    } finally {
700      lock.unlock();
701    }
702    return create(key, creator, newWaitable);
703  }
704
705  /**
706   * Fetch an existing ReplicaInfo object.
707   *
708   * @param key       The key that we're using.
709   * @param waitable  The waitable object to wait on.
710   * @return          The existing ReplicaInfo object, or null if there is
711   *                  none.
712   *
713   * @throws RetriableException   If the caller needs to retry.
714   */
715  private ShortCircuitReplicaInfo fetch(ExtendedBlockId key,
716      Waitable<ShortCircuitReplicaInfo> waitable) throws RetriableException {
717    // Another thread is already in the process of loading this
718    // ShortCircuitReplica.  So we simply wait for it to complete.
719    ShortCircuitReplicaInfo info;
720    try {
721      LOG.trace("{}: found waitable for {}", this, key);
722      info = waitable.await();
723    } catch (InterruptedException e) {
724      LOG.info(this + ": interrupted while waiting for " + key);
725      Thread.currentThread().interrupt();
726      throw new RetriableException("interrupted");
727    }
728    if (info.getInvalidTokenException() != null) {
729      LOG.info(this + ": could not get " + key + " due to InvalidToken " +
730          "exception.", info.getInvalidTokenException());
731      return info;
732    }
733    ShortCircuitReplica replica = info.getReplica();
734    if (replica == null) {
735      LOG.warn(this + ": failed to get " + key);
736      return info;
737    }
738    if (replica.purged) {
739      // Ignore replicas that have already been purged from the cache.
740      throw new RetriableException("Ignoring purged replica " +
741          replica + ".  Retrying.");
742    }
743    // Check if the replica is stale before using it.
744    // If it is, purge it and retry.
745    if (replica.isStale()) {
746      LOG.info(this + ": got stale replica " + replica + ".  Removing " +
747          "this replica from the replicaInfoMap and retrying.");
748      // Remove the cache's reference to the replica.  This may or may not
749      // trigger a close.
750      purge(replica);
751      throw new RetriableException("ignoring stale replica " + replica);
752    }
753    ref(replica);
754    return info;
755  }
756
757  private ShortCircuitReplicaInfo create(ExtendedBlockId key,
758      ShortCircuitReplicaCreator creator,
759      Waitable<ShortCircuitReplicaInfo> newWaitable) {
760    // Handle loading a new replica.
761    ShortCircuitReplicaInfo info = null;
762    try {
763      LOG.trace("{}: loading {}", this, key);
764      info = creator.createShortCircuitReplicaInfo();
765    } catch (RuntimeException e) {
766      LOG.warn(this + ": failed to load " + key, e);
767    }
768    if (info == null) info = new ShortCircuitReplicaInfo();
769    lock.lock();
770    try {
771      if (info.getReplica() != null) {
772        // On success, make sure the cache cleaner thread is running.
773        LOG.trace("{}: successfully loaded {}", this, info.getReplica());
774        startCacheCleanerThreadIfNeeded();
775        // Note: new ShortCircuitReplicas start with a refCount of 2,
776        // indicating that both this cache and whoever requested the
777        // creation of the replica hold a reference.  So we don't need
778        // to increment the reference count here.
779      } else {
780        // On failure, remove the waitable from the replicaInfoMap.
781        Waitable<ShortCircuitReplicaInfo> waitableInMap = replicaInfoMap.get(key);
782        if (waitableInMap == newWaitable) replicaInfoMap.remove(key);
783        if (info.getInvalidTokenException() != null) {
784          LOG.info(this + ": could not load " + key + " due to InvalidToken " +
785              "exception.", info.getInvalidTokenException());
786        } else {
787          LOG.warn(this + ": failed to load " + key);
788        }
789      }
790      newWaitable.provide(info);
791    } finally {
792      lock.unlock();
793    }
794    return info;
795  }
796
797  private void startCacheCleanerThreadIfNeeded() {
798    if (cacheCleaner == null) {
799      cacheCleaner = new CacheCleaner();
800      long rateMs = cacheCleaner.getRateInMs();
801      ScheduledFuture<?> future =
802          cleanerExecutor.scheduleAtFixedRate(cacheCleaner, rateMs, rateMs,
803              TimeUnit.MILLISECONDS);
804      cacheCleaner.setFuture(future);
805      LOG.debug("{}: starting cache cleaner thread which will run every {} ms",
806          this, rateMs);
807    }
808  }
809
810  ClientMmap getOrCreateClientMmap(ShortCircuitReplica replica,
811      boolean anchored) {
812    Condition newCond;
813    lock.lock();
814    try {
815      while (replica.mmapData != null) {
816        if (replica.mmapData instanceof MappedByteBuffer) {
817          ref(replica);
818          MappedByteBuffer mmap = (MappedByteBuffer)replica.mmapData;
819          return new ClientMmap(replica, mmap, anchored);
820        } else if (replica.mmapData instanceof Long) {
821          long lastAttemptTimeMs = (Long)replica.mmapData;
822          long delta = Time.monotonicNow() - lastAttemptTimeMs;
823          if (delta < mmapRetryTimeoutMs) {
824            LOG.trace("{}: can't create client mmap for {} because we failed to"
825                + " create one just {}ms ago.", this, replica, delta);
826            return null;
827          }
828          LOG.trace("{}: retrying client mmap for {}, {} ms after the previous "
829              + "failure.", this, replica, delta);
830        } else if (replica.mmapData instanceof Condition) {
831          Condition cond = (Condition)replica.mmapData;
832          cond.awaitUninterruptibly();
833        } else {
834          Preconditions.checkState(false, "invalid mmapData type %s",
835              replica.mmapData.getClass().getName());
836        }
837      }
838      newCond = lock.newCondition();
839      replica.mmapData = newCond;
840    } finally {
841      lock.unlock();
842    }
843    MappedByteBuffer map = replica.loadMmapInternal();
844    lock.lock();
845    try {
846      if (map == null) {
847        replica.mmapData = Time.monotonicNow();
848        newCond.signalAll();
849        return null;
850      } else {
851        outstandingMmapCount++;
852        replica.mmapData = map;
853        ref(replica);
854        newCond.signalAll();
855        return new ClientMmap(replica, map, anchored);
856      }
857    } finally {
858      lock.unlock();
859    }
860  }
861
862  /**
863   * Close the cache and free all associated resources.
864   */
865  @Override
866  public void close() {
867    try {
868      lock.lock();
869      if (closed) return;
870      closed = true;
871      LOG.info(this + ": closing");
872      maxNonMmappedEvictableLifespanMs = 0;
873      maxEvictableMmapedSize = 0;
874      // Close and join cacheCleaner thread.
875      IOUtilsClient.cleanup(LOG, cacheCleaner);
876      // Purge all replicas.
877      while (true) {
878        Object eldestKey;
879        try {
880          eldestKey = evictable.firstKey();
881        } catch (NoSuchElementException e) {
882          break;
883        }
884        purge((ShortCircuitReplica)evictable.get(eldestKey));
885      }
886      while (true) {
887        Object eldestKey;
888        try {
889          eldestKey = evictableMmapped.firstKey();
890        } catch (NoSuchElementException e) {
891          break;
892        }
893        purge((ShortCircuitReplica)evictableMmapped.get(eldestKey));
894      }
895    } finally {
896      lock.unlock();
897    }
898
899    releaserExecutor.shutdown();
900    cleanerExecutor.shutdown();
901    // wait for existing tasks to terminate
902    try {
903      if (!releaserExecutor.awaitTermination(30, TimeUnit.SECONDS)) {
904        LOG.error("Forcing SlotReleaserThreadPool to shutdown!");
905        releaserExecutor.shutdownNow();
906      }
907    } catch (InterruptedException e) {
908      releaserExecutor.shutdownNow();
909      Thread.currentThread().interrupt();
910      LOG.error("Interrupted while waiting for SlotReleaserThreadPool "
911          + "to terminate", e);
912    }
913
914    // wait for existing tasks to terminate
915    try {
916      if (!cleanerExecutor.awaitTermination(30, TimeUnit.SECONDS)) {
917        LOG.error("Forcing CleanerThreadPool to shutdown!");
918        cleanerExecutor.shutdownNow();
919      }
920    } catch (InterruptedException e) {
921      cleanerExecutor.shutdownNow();
922      Thread.currentThread().interrupt();
923      LOG.error("Interrupted while waiting for CleanerThreadPool "
924          + "to terminate", e);
925    }
926    IOUtilsClient.cleanup(LOG, shmManager);
927  }
928
929  @VisibleForTesting // ONLY for testing
930  public interface CacheVisitor {
931    void visit(int numOutstandingMmaps,
932        Map<ExtendedBlockId, ShortCircuitReplica> replicas,
933        Map<ExtendedBlockId, InvalidToken> failedLoads,
934        LinkedMap evictable,
935        LinkedMap evictableMmapped);
936  }
937
938  @VisibleForTesting // ONLY for testing
939  public void accept(CacheVisitor visitor) {
940    lock.lock();
941    try {
942      Map<ExtendedBlockId, ShortCircuitReplica> replicas = new HashMap<>();
943      Map<ExtendedBlockId, InvalidToken> failedLoads = new HashMap<>();
944      for (Entry<ExtendedBlockId, Waitable<ShortCircuitReplicaInfo>> entry :
945          replicaInfoMap.entrySet()) {
946        Waitable<ShortCircuitReplicaInfo> waitable = entry.getValue();
947        if (waitable.hasVal()) {
948          if (waitable.getVal().getReplica() != null) {
949            replicas.put(entry.getKey(), waitable.getVal().getReplica());
950          } else {
951            // The exception may be null here, indicating a failed load that
952            // isn't the result of an invalid block token.
953            failedLoads.put(entry.getKey(),
954                waitable.getVal().getInvalidTokenException());
955          }
956        }
957      }
958      LOG.debug("visiting {} with outstandingMmapCount={}, replicas={}, "
959              + "failedLoads={}, evictable={}, evictableMmapped={}",
960          visitor.getClass().getName(), outstandingMmapCount, replicas,
961          failedLoads, evictable, evictableMmapped);
962      visitor.visit(outstandingMmapCount, replicas, failedLoads,
963          evictable, evictableMmapped);
964    } finally {
965      lock.unlock();
966    }
967  }
968
969  @Override
970  public String toString() {
971    return "ShortCircuitCache(0x" +
972        Integer.toHexString(System.identityHashCode(this)) + ")";
973  }
974
975  /**
976   * Allocate a new shared memory slot.
977   *
978   * @param datanode       The datanode to allocate a shm slot with.
979   * @param peer           A peer connected to the datanode.
980   * @param usedPeer       Will be set to true if we use up the provided peer.
981   * @param blockId        The block id and block pool id of the block we're
982   *                         allocating this slot for.
983   * @param clientName     The name of the DFSClient allocating the shared
984   *                         memory.
985   * @return               Null if short-circuit shared memory is disabled;
986   *                         a short-circuit memory slot otherwise.
987   * @throws IOException   An exception if there was an error talking to
988   *                         the datanode.
989   */
990  public Slot allocShmSlot(DatanodeInfo datanode,
991      DomainPeer peer, MutableBoolean usedPeer,
992      ExtendedBlockId blockId, String clientName) throws IOException {
993    if (shmManager != null) {
994      return shmManager.allocSlot(datanode, peer, usedPeer,
995          blockId, clientName);
996    } else {
997      return null;
998    }
999  }
1000
1001  /**
1002   * Free a slot immediately.
1003   *
1004   * ONLY use this if the DataNode is not yet aware of the slot.
1005   *
1006   * @param slot           The slot to free.
1007   */
1008  public void freeSlot(Slot slot) {
1009    Preconditions.checkState(shmManager != null);
1010    slot.makeInvalid();
1011    shmManager.freeSlot(slot);
1012  }
1013
1014  /**
1015   * Schedule a shared memory slot to be released.
1016   *
1017   * @param slot           The slot to release.
1018   */
1019  public void scheduleSlotReleaser(Slot slot) {
1020    Preconditions.checkState(shmManager != null);
1021    releaserExecutor.execute(new SlotReleaser(slot));
1022  }
1023
1024  @VisibleForTesting
1025  public DfsClientShmManager getDfsClientShmManager() {
1026    return shmManager;
1027  }
1028}