001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    package org.apache.hadoop.hdfs.client;
019    
020    import java.io.BufferedOutputStream;
021    import java.io.Closeable;
022    import java.io.DataInputStream;
023    import java.io.DataOutputStream;
024    
025    import org.apache.hadoop.classification.InterfaceAudience;
026    
027    import java.io.IOException;
028    import java.nio.MappedByteBuffer;
029    import java.util.HashMap;
030    import java.util.Map;
031    import java.util.Map.Entry;
032    import java.util.TreeMap;
033    import java.util.concurrent.ScheduledFuture;
034    import java.util.concurrent.ScheduledThreadPoolExecutor;
035    import java.util.concurrent.TimeUnit;
036    import java.util.concurrent.locks.Condition;
037    import java.util.concurrent.locks.ReentrantLock;
038    
039    import org.apache.commons.lang.mutable.MutableBoolean;
040    import org.apache.commons.logging.Log;
041    import org.apache.commons.logging.LogFactory;
042    import org.apache.hadoop.conf.Configuration;
043    import org.apache.hadoop.hdfs.ExtendedBlockId;
044    import org.apache.hadoop.hdfs.DFSConfigKeys;
045    import org.apache.hadoop.hdfs.client.ShortCircuitReplica;
046    import org.apache.hadoop.hdfs.ShortCircuitShm.Slot;
047    import org.apache.hadoop.hdfs.net.DomainPeer;
048    import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
049    import org.apache.hadoop.hdfs.protocol.datatransfer.Sender;
050    import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.ReleaseShortCircuitAccessResponseProto;
051    import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.Status;
052    import org.apache.hadoop.hdfs.protocolPB.PBHelper;
053    import org.apache.hadoop.io.IOUtils;
054    import org.apache.hadoop.ipc.RetriableException;
055    import org.apache.hadoop.net.unix.DomainSocket;
056    import org.apache.hadoop.net.unix.DomainSocketWatcher;
057    import org.apache.hadoop.security.token.SecretManager.InvalidToken;
058    import org.apache.hadoop.util.StringUtils;
059    import org.apache.hadoop.util.Time;
060    import org.apache.hadoop.util.Waitable;
061    
062    import com.google.common.annotations.VisibleForTesting;
063    import com.google.common.base.Preconditions;
064    import com.google.common.util.concurrent.ThreadFactoryBuilder;
065    
066    /**
067     * The ShortCircuitCache tracks things which the client needs to access
068     * HDFS block files via short-circuit.
069     *
070     * These things include: memory-mapped regions, file descriptors, and shared
071     * memory areas for communicating with the DataNode.
072     */
073    @InterfaceAudience.Private
074    public class ShortCircuitCache implements Closeable {
075      public static final Log LOG = LogFactory.getLog(ShortCircuitCache.class);
076    
077      /**
078       * Expiry thread which makes sure that the file descriptors get closed
079       * after a while.
080       */
081      private class CacheCleaner implements Runnable, Closeable {
082        private ScheduledFuture<?> future;
083    
084        /**
085         * Run the CacheCleaner thread.
086         *
087         * Whenever a thread requests a ShortCircuitReplica object, we will make
088         * sure it gets one.  That ShortCircuitReplica object can then be re-used
089         * when another thread requests a ShortCircuitReplica object for the same
090         * block.  So in that sense, there is no maximum size to the cache.
091         *
092         * However, when a ShortCircuitReplica object is unreferenced by the
093         * thread(s) that are using it, it becomes evictable.  There are two
094         * separate eviction lists-- one for mmaped objects, and another for
095         * non-mmaped objects.  We do this in order to avoid having the regular
096         * files kick the mmaped files out of the cache too quickly.  Reusing
097         * an already-existing mmap gives a huge performance boost, since the
098         * page table entries don't have to be re-populated.  Both the mmap
099         * and non-mmap evictable lists have maximum sizes and maximum lifespans.
100         */
101        @Override
102        public void run() {
103          ShortCircuitCache.this.lock.lock();
104          try {
105            if (ShortCircuitCache.this.closed) return;
106            long curMs = Time.monotonicNow();
107    
108            if (LOG.isDebugEnabled()) {
109              LOG.debug(this + ": cache cleaner running at " + curMs);
110            }
111    
112            int numDemoted = demoteOldEvictableMmaped(curMs);
113            int numPurged = 0;
114            Long evictionTimeNs = Long.valueOf(0);
115            while (true) {
116              Entry<Long, ShortCircuitReplica> entry = 
117                  evictableMmapped.ceilingEntry(evictionTimeNs);
118              if (entry == null) break;
119              evictionTimeNs = entry.getKey();
120              long evictionTimeMs = 
121                  TimeUnit.MILLISECONDS.convert(evictionTimeNs, TimeUnit.NANOSECONDS);
122              if (evictionTimeMs + maxNonMmappedEvictableLifespanMs >= curMs) break;
123              ShortCircuitReplica replica = entry.getValue();
124              if (LOG.isTraceEnabled()) {
125                LOG.trace("CacheCleaner: purging " + replica + ": " + 
126                      StringUtils.getStackTrace(Thread.currentThread()));
127              }
128              purge(replica);
129              numPurged++;
130            }
131    
132            if (LOG.isDebugEnabled()) {
133              LOG.debug(this + ": finishing cache cleaner run started at " +
134                curMs + ".  Demoted " + numDemoted + " mmapped replicas; " +
135                "purged " + numPurged + " replicas.");
136            }
137          } finally {
138            ShortCircuitCache.this.lock.unlock();
139          }
140        }
141    
142        @Override
143        public void close() throws IOException {
144          if (future != null) {
145            future.cancel(false);
146          }
147        }
148    
149        public void setFuture(ScheduledFuture<?> future) {
150          this.future = future;
151        }
152    
153        /**
154         * Get the rate at which this cleaner thread should be scheduled.
155         *
156         * We do this by taking the minimum expiration time and dividing by 4.
157         *
158         * @return the rate in milliseconds at which this thread should be
159         *         scheduled.
160         */
161        public long getRateInMs() {
162          long minLifespanMs =
163              Math.min(maxNonMmappedEvictableLifespanMs,
164                  maxEvictableMmapedLifespanMs);
165          long sampleTimeMs = minLifespanMs / 4;
166          return (sampleTimeMs < 1) ? 1 : sampleTimeMs;
167        }
168      }
169    
170      /**
171       * A task which asks the DataNode to release a short-circuit shared memory
172       * slot.  If successful, this will tell the DataNode to stop monitoring
173       * changes to the mlock status of the replica associated with the slot.
174       * It will also allow us (the client) to re-use this slot for another
175       * replica.  If we can't communicate with the DataNode for some reason,
176       * we tear down the shared memory segment to avoid being in an inconsistent
177       * state.
178       */
179      private class SlotReleaser implements Runnable {
180        /**
181         * The slot that we need to release.
182         */
183        private final Slot slot;
184    
185        SlotReleaser(Slot slot) {
186          this.slot = slot;
187        }
188    
189        @Override
190        public void run() {
191          if (LOG.isTraceEnabled()) {
192            LOG.trace(ShortCircuitCache.this + ": about to release " + slot);
193          }
194          final DfsClientShm shm = (DfsClientShm)slot.getShm();
195          final DomainSocket shmSock = shm.getPeer().getDomainSocket();
196          DomainSocket sock = null;
197          DataOutputStream out = null;
198          final String path = shmSock.getPath();
199          boolean success = false;
200          try {
201            sock = DomainSocket.connect(path);
202            out = new DataOutputStream(
203                new BufferedOutputStream(sock.getOutputStream()));
204            new Sender(out).releaseShortCircuitFds(slot.getSlotId());
205            DataInputStream in = new DataInputStream(sock.getInputStream());
206            ReleaseShortCircuitAccessResponseProto resp =
207                ReleaseShortCircuitAccessResponseProto.parseFrom(
208                    PBHelper.vintPrefixed(in));
209            if (resp.getStatus() != Status.SUCCESS) {
210              String error = resp.hasError() ? resp.getError() : "(unknown)";
211              throw new IOException(resp.getStatus().toString() + ": " + error);
212            }
213            if (LOG.isTraceEnabled()) {
214              LOG.trace(ShortCircuitCache.this + ": released " + slot);
215            }
216            success = true;
217          } catch (IOException e) {
218            LOG.error(ShortCircuitCache.this + ": failed to release " +
219                "short-circuit shared memory slot " + slot + " by sending " +
220                "ReleaseShortCircuitAccessRequestProto to " + path +
221                ".  Closing shared memory segment.", e);
222          } finally {
223            if (success) {
224              shmManager.freeSlot(slot);
225            } else {
226              shm.getEndpointShmManager().shutdown(shm);
227            }
228            IOUtils.cleanup(LOG, sock, out);
229          }
230        }
231      }
232    
233      public interface ShortCircuitReplicaCreator {
234        /**
235         * Attempt to create a ShortCircuitReplica object.
236         *
237         * This callback will be made without holding any locks.
238         *
239         * @return a non-null ShortCircuitReplicaInfo object.
240         */
241        ShortCircuitReplicaInfo createShortCircuitReplicaInfo();
242      }
243    
  /**
   * Lock protecting the cache.
   */
  private final ReentrantLock lock = new ReentrantLock();

  /**
   * The executor service that runs the cacheCleaner.
   */
  private final ScheduledThreadPoolExecutor cleanerExecutor
  = new ScheduledThreadPoolExecutor(1, new ThreadFactoryBuilder().
          setDaemon(true).setNameFormat("ShortCircuitCache_Cleaner").
          build());

  /**
   * The executor service for slot release work; its single daemon thread is
   * named "ShortCircuitCache_SlotReleaser".  Presumably this is where
   * SlotReleaser tasks are run (the scheduling code is not in this part of
   * the file).
   */
  private final ScheduledThreadPoolExecutor releaserExecutor
      = new ScheduledThreadPoolExecutor(1, new ThreadFactoryBuilder().
          setDaemon(true).setNameFormat("ShortCircuitCache_SlotReleaser").
          build());

  /**
   * A map containing all ShortCircuitReplicaInfo objects, organized by Key.
   * ShortCircuitReplicaInfo objects may contain a replica, or an InvalidToken
   * exception.
   */
  private final HashMap<ExtendedBlockId, Waitable<ShortCircuitReplicaInfo>> 
      replicaInfoMap = new HashMap<ExtendedBlockId,
          Waitable<ShortCircuitReplicaInfo>>();

  /**
   * The CacheCleaner.  We don't create this and schedule it until it becomes
   * necessary.
   */
  private CacheCleaner cacheCleaner;

  /**
   * Tree of evictable elements which have no mmap.
   *
   * Maps (unique) insertion time in nanoseconds to the element.
   */
  private final TreeMap<Long, ShortCircuitReplica> evictable =
      new TreeMap<Long, ShortCircuitReplica>();

  /**
   * Maximum total size of the cache, including both mmapped and
   * non-mmapped elements.
   */
  private final int maxTotalSize;

  /**
   * Non-mmaped elements older than this will be closed.
   */
  private long maxNonMmappedEvictableLifespanMs;

  /**
   * Tree of mmaped evictable elements.
   *
   * Maps (unique) insertion time in nanoseconds to the element.
   */
  private final TreeMap<Long, ShortCircuitReplica> evictableMmapped =
      new TreeMap<Long, ShortCircuitReplica>();

  /**
   * Maximum number of mmaped evictable elements.
   */
  private int maxEvictableMmapedSize;

  /**
   * Mmaped elements older than this will be closed.
   */
  private final long maxEvictableMmapedLifespanMs;

  /**
   * The minimum number of milliseconds we'll wait after an unsuccessful
   * mmap attempt before trying again.
   */
  private final long mmapRetryTimeoutMs;

  /**
   * How long we will keep replicas in the cache before declaring them
   * to be stale.
   */
  private final long staleThresholdMs;

  /**
   * True if the ShortCircuitCache is closed.
   */
  private boolean closed = false;

  /**
   * Number of existing mmaps associated with this cache.
   */
  private int outstandingMmapCount = 0;

  /**
   * Manages short-circuit shared memory segments for the client.
   * May be null: the constructor leaves it unset when shared memory is
   * disabled or the manager could not be created.
   */
  private final DfsClientShmManager shmManager;
343    
344      /**
345       * Create a {@link ShortCircuitCache} object from a {@link Configuration}
346       */
347      public static ShortCircuitCache fromConf(Configuration conf) {
348        return new ShortCircuitCache(
349            conf.getInt(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_KEY,
350                DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_DEFAULT),
351            conf.getLong(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS_KEY,
352                DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS_DEFAULT),
353            conf.getInt(DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_SIZE,
354                DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_SIZE_DEFAULT),
355            conf.getLong(DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS,
356                DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS_DEFAULT),
357            conf.getLong(DFSConfigKeys.DFS_CLIENT_MMAP_RETRY_TIMEOUT_MS,
358                DFSConfigKeys.DFS_CLIENT_MMAP_RETRY_TIMEOUT_MS_DEFAULT),
359            conf.getLong(DFSConfigKeys.DFS_CLIENT_SHORT_CIRCUIT_REPLICA_STALE_THRESHOLD_MS,
360                DFSConfigKeys.DFS_CLIENT_SHORT_CIRCUIT_REPLICA_STALE_THRESHOLD_MS_DEFAULT),
361            conf.getInt(DFSConfigKeys.DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS,
362                DFSConfigKeys.DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS_DEFAULT));
363      }
364    
365      public ShortCircuitCache(int maxTotalSize, long maxNonMmappedEvictableLifespanMs,
366          int maxEvictableMmapedSize, long maxEvictableMmapedLifespanMs,
367          long mmapRetryTimeoutMs, long staleThresholdMs, int shmInterruptCheckMs) {
368        Preconditions.checkArgument(maxTotalSize >= 0);
369        this.maxTotalSize = maxTotalSize;
370        Preconditions.checkArgument(maxNonMmappedEvictableLifespanMs >= 0);
371        this.maxNonMmappedEvictableLifespanMs = maxNonMmappedEvictableLifespanMs;
372        Preconditions.checkArgument(maxEvictableMmapedSize >= 0);
373        this.maxEvictableMmapedSize = maxEvictableMmapedSize;
374        Preconditions.checkArgument(maxEvictableMmapedLifespanMs >= 0);
375        this.maxEvictableMmapedLifespanMs = maxEvictableMmapedLifespanMs;
376        this.mmapRetryTimeoutMs = mmapRetryTimeoutMs;
377        this.staleThresholdMs = staleThresholdMs;
378        DfsClientShmManager shmManager = null;
379        if ((shmInterruptCheckMs > 0) &&
380            (DomainSocketWatcher.getLoadingFailureReason() == null)) {
381          try {
382            shmManager = new DfsClientShmManager(shmInterruptCheckMs);
383          } catch (IOException e) {
384            LOG.error("failed to create ShortCircuitShmManager", e);
385          }
386        }
387        this.shmManager = shmManager;
388      }
389    
390      public long getMmapRetryTimeoutMs() {
391        return mmapRetryTimeoutMs;
392      }
393    
394      public long getStaleThresholdMs() {
395        return staleThresholdMs;
396      }
397    
398      /**
399       * Increment the reference count of a replica, and remove it from any free
400       * list it may be in.
401       *
402       * You must hold the cache lock while calling this function.
403       *
404       * @param replica      The replica we're removing.
405       */
406      private void ref(ShortCircuitReplica replica) {
407        lock.lock();
408        try {
409          Preconditions.checkArgument(replica.refCount > 0,
410              "can't ref " + replica + " because its refCount reached " +
411              replica.refCount);
412          Long evictableTimeNs = replica.getEvictableTimeNs();
413          replica.refCount++;
414          if (evictableTimeNs != null) {
415            String removedFrom = removeEvictable(replica);
416            if (LOG.isTraceEnabled()) {
417              LOG.trace(this + ": " + removedFrom +
418                  " no longer contains " + replica + ".  refCount " +
419                  (replica.refCount - 1) + " -> " + replica.refCount +
420                  StringUtils.getStackTrace(Thread.currentThread()));
421    
422            }
423          } else if (LOG.isTraceEnabled()) {
424            LOG.trace(this + ": replica  refCount " +
425                (replica.refCount - 1) + " -> " + replica.refCount +
426                StringUtils.getStackTrace(Thread.currentThread()));
427          }
428        } finally {
429          lock.unlock();
430        }
431      }
432    
433      /**
434       * Unreference a replica.
435       *
436       * You must hold the cache lock while calling this function.
437       *
438       * @param replica   The replica being unreferenced.
439       */
440      void unref(ShortCircuitReplica replica) {
441        lock.lock();
442        try {
443          // If the replica is stale, but we haven't purged it yet, let's do that.
444          // It would be a shame to evict a non-stale replica so that we could put
445          // a stale one into the cache.
446          if ((!replica.purged) && replica.isStale()) {
447            purge(replica);
448          }
449          String addedString = "";
450          boolean shouldTrimEvictionMaps = false;
451          int newRefCount = --replica.refCount;
452          if (newRefCount == 0) {
453            // Close replica, since there are no remaining references to it.
454            Preconditions.checkArgument(replica.purged,
455                "Replica " + replica + " reached a refCount of 0 without " +
456                "being purged");
457            replica.close();
458          } else if (newRefCount == 1) {
459            Preconditions.checkState(null == replica.getEvictableTimeNs(),
460                "Replica " + replica + " had a refCount higher than 1, " +
461                  "but was still evictable (evictableTimeNs = " +
462                    replica.getEvictableTimeNs() + ")");
463            if (!replica.purged) {
464              // Add the replica to the end of an eviction list.
465              // Eviction lists are sorted by time.
466              if (replica.hasMmap()) {
467                insertEvictable(System.nanoTime(), replica, evictableMmapped);
468                addedString = "added to evictableMmapped, ";
469              } else {
470                insertEvictable(System.nanoTime(), replica, evictable);
471                addedString = "added to evictable, ";
472              }
473              shouldTrimEvictionMaps = true;
474            }
475          } else {
476            Preconditions.checkArgument(replica.refCount >= 0,
477                "replica's refCount went negative (refCount = " +
478                replica.refCount + " for " + replica + ")");
479          }
480          if (LOG.isTraceEnabled()) {
481            LOG.trace(this + ": unref replica " + replica +
482                ": " + addedString + " refCount " +
483                (newRefCount + 1) + " -> " + newRefCount +
484                StringUtils.getStackTrace(Thread.currentThread()));
485          }
486          if (shouldTrimEvictionMaps) {
487            trimEvictionMaps();
488          }
489        } finally {
490          lock.unlock();
491        }
492      }
493    
494      /**
495       * Demote old evictable mmaps into the regular eviction map.
496       *
497       * You must hold the cache lock while calling this function.
498       *
499       * @param now   Current time in monotonic milliseconds.
500       * @return      Number of replicas demoted.
501       */
502      private int demoteOldEvictableMmaped(long now) {
503        int numDemoted = 0;
504        boolean needMoreSpace = false;
505        Long evictionTimeNs = Long.valueOf(0);
506    
507        while (true) {
508          Entry<Long, ShortCircuitReplica> entry = 
509              evictableMmapped.ceilingEntry(evictionTimeNs);
510          if (entry == null) break;
511          evictionTimeNs = entry.getKey();
512          long evictionTimeMs = 
513              TimeUnit.MILLISECONDS.convert(evictionTimeNs, TimeUnit.NANOSECONDS);
514          if (evictionTimeMs + maxEvictableMmapedLifespanMs >= now) {
515            if (evictableMmapped.size() < maxEvictableMmapedSize) {
516              break;
517            }
518            needMoreSpace = true;
519          }
520          ShortCircuitReplica replica = entry.getValue();
521          if (LOG.isTraceEnabled()) {
522            String rationale = needMoreSpace ? "because we need more space" : 
523                "because it's too old";
524            LOG.trace("demoteOldEvictable: demoting " + replica + ": " +
525                rationale + ": " +
526                StringUtils.getStackTrace(Thread.currentThread()));
527          }
528          removeEvictable(replica, evictableMmapped);
529          munmap(replica);
530          insertEvictable(evictionTimeNs, replica, evictable);
531          numDemoted++;
532        }
533        return numDemoted;
534      }
535    
536      /**
537       * Trim the eviction lists.
538       */
539      private void trimEvictionMaps() {
540        long now = Time.monotonicNow();
541        demoteOldEvictableMmaped(now);
542    
543        while (true) {
544          long evictableSize = evictable.size();
545          long evictableMmappedSize = evictableMmapped.size();
546          if (evictableSize + evictableMmappedSize <= maxTotalSize) {
547            return;
548          }
549          ShortCircuitReplica replica;
550          if (evictableSize == 0) {
551           replica = evictableMmapped.firstEntry().getValue();
552          } else {
553           replica = evictable.firstEntry().getValue();
554          }
555          if (LOG.isTraceEnabled()) {
556            LOG.trace(this + ": trimEvictionMaps is purging " + replica +
557              StringUtils.getStackTrace(Thread.currentThread()));
558          }
559          purge(replica);
560        }
561      }
562    
563      /**
564       * Munmap a replica, updating outstandingMmapCount.
565       *
566       * @param replica  The replica to munmap.
567       */
568      private void munmap(ShortCircuitReplica replica) {
569        replica.munmap();
570        outstandingMmapCount--;
571      }
572    
573      /**
574       * Remove a replica from an evictable map.
575       *
576       * @param replica   The replica to remove.
577       * @return          The map it was removed from.
578       */
579      private String removeEvictable(ShortCircuitReplica replica) {
580        if (replica.hasMmap()) {
581          removeEvictable(replica, evictableMmapped);
582          return "evictableMmapped";
583        } else {
584          removeEvictable(replica, evictable);
585          return "evictable";
586        }
587      }
588    
589      /**
590       * Remove a replica from an evictable map.
591       *
592       * @param replica   The replica to remove.
593       * @param map       The map to remove it from.
594       */
595      private void removeEvictable(ShortCircuitReplica replica,
596          TreeMap<Long, ShortCircuitReplica> map) {
597        Long evictableTimeNs = replica.getEvictableTimeNs();
598        Preconditions.checkNotNull(evictableTimeNs);
599        ShortCircuitReplica removed = map.remove(evictableTimeNs);
600        Preconditions.checkState(removed == replica,
601            "failed to make " + replica + " unevictable");
602        replica.setEvictableTimeNs(null);
603      }
604    
605      /**
606       * Insert a replica into an evictable map.
607       *
608       * If an element already exists with this eviction time, we add a nanosecond
609       * to it until we find an unused key.
610       *
611       * @param evictionTimeNs   The eviction time in absolute nanoseconds.
612       * @param replica          The replica to insert.
613       * @param map              The map to insert it into.
614       */
615      private void insertEvictable(Long evictionTimeNs,
616          ShortCircuitReplica replica, TreeMap<Long, ShortCircuitReplica> map) {
617        while (map.containsKey(evictionTimeNs)) {
618          evictionTimeNs++;
619        }
620        Preconditions.checkState(null == replica.getEvictableTimeNs());
621        Long time = Long.valueOf(evictionTimeNs);
622        replica.setEvictableTimeNs(time);
623        map.put(time, replica);
624      }
625    
626      /**
627       * Purge a replica from the cache.
628       *
629       * This doesn't necessarily close the replica, since there may be
630       * outstanding references to it.  However, it does mean the cache won't
631       * hand it out to anyone after this.
632       *
633       * You must hold the cache lock while calling this function.
634       *
635       * @param replica   The replica being removed.
636       */
637      private void purge(ShortCircuitReplica replica) {
638        boolean removedFromInfoMap = false;
639        String evictionMapName = null;
640        Preconditions.checkArgument(!replica.purged);
641        replica.purged = true;
642        Waitable<ShortCircuitReplicaInfo> val = replicaInfoMap.get(replica.key);
643        if (val != null) {
644          ShortCircuitReplicaInfo info = val.getVal();
645          if ((info != null) && (info.getReplica() == replica)) {
646            replicaInfoMap.remove(replica.key);
647            removedFromInfoMap = true;
648          }
649        }
650        Long evictableTimeNs = replica.getEvictableTimeNs();
651        if (evictableTimeNs != null) {
652          evictionMapName = removeEvictable(replica);
653        }
654        if (LOG.isTraceEnabled()) {
655          StringBuilder builder = new StringBuilder();
656          builder.append(this).append(": ").append(": purged ").
657              append(replica).append(" from the cache.");
658          if (removedFromInfoMap) {
659            builder.append("  Removed from the replicaInfoMap.");
660          }
661          if (evictionMapName != null) {
662            builder.append("  Removed from ").append(evictionMapName);
663          }
664          LOG.trace(builder.toString());
665        }
666        unref(replica);
667      }
668    
669      /**
670       * Fetch or create a replica.
671       *
672       * You must hold the cache lock while calling this function.
673       *
674       * @param key          Key to use for lookup.
675       * @param creator      Replica creator callback.  Will be called without
676       *                     the cache lock being held.
677       *
678       * @return             Null if no replica could be found or created.
679       *                     The replica, otherwise.
680       */
681      public ShortCircuitReplicaInfo fetchOrCreate(ExtendedBlockId key,
682          ShortCircuitReplicaCreator creator) {
683        Waitable<ShortCircuitReplicaInfo> newWaitable = null;
684        lock.lock();
685        try {
686          ShortCircuitReplicaInfo info = null;
687          do {
688            if (closed) {
689              if (LOG.isTraceEnabled()) {
690                LOG.trace(this + ": can't fetchOrCreate " + key +
691                    " because the cache is closed.");
692              }
693              return null;
694            }
695            Waitable<ShortCircuitReplicaInfo> waitable = replicaInfoMap.get(key);
696            if (waitable != null) {
697              try {
698                info = fetch(key, waitable);
699              } catch (RetriableException e) {
700                if (LOG.isDebugEnabled()) {
701                  LOG.debug(this + ": retrying " + e.getMessage());
702                }
703                continue;
704              }
705            }
706          } while (false);
707          if (info != null) return info;
708          // We need to load the replica ourselves.
709          newWaitable = new Waitable<ShortCircuitReplicaInfo>(lock.newCondition());
710          replicaInfoMap.put(key, newWaitable);
711        } finally {
712          lock.unlock();
713        }
714        return create(key, creator, newWaitable);
715      }
716    
717      /**
718       * Fetch an existing ReplicaInfo object.
719       *
720       * @param key       The key that we're using.
721       * @param waitable  The waitable object to wait on.
722       * @return          The existing ReplicaInfo object, or null if there is
723       *                  none.
724       *
725       * @throws RetriableException   If the caller needs to retry.
726       */
727      private ShortCircuitReplicaInfo fetch(ExtendedBlockId key,
728          Waitable<ShortCircuitReplicaInfo> waitable) throws RetriableException {
729        // Another thread is already in the process of loading this
730        // ShortCircuitReplica.  So we simply wait for it to complete.
731        ShortCircuitReplicaInfo info;
732        try {
733          if (LOG.isTraceEnabled()) {
734            LOG.trace(this + ": found waitable for " + key);
735          }
736          info = waitable.await();
737        } catch (InterruptedException e) {
738          LOG.info(this + ": interrupted while waiting for " + key);
739          Thread.currentThread().interrupt();
740          throw new RetriableException("interrupted");
741        }
742        if (info.getInvalidTokenException() != null) {
743          LOG.warn(this + ": could not get " + key + " due to InvalidToken " +
744                "exception.", info.getInvalidTokenException());
745          return info;
746        }
747        ShortCircuitReplica replica = info.getReplica();
748        if (replica == null) {
749          LOG.warn(this + ": failed to get " + key);
750          return info;
751        }
752        if (replica.purged) {
753          // Ignore replicas that have already been purged from the cache.
754          throw new RetriableException("Ignoring purged replica " +
755              replica + ".  Retrying.");
756        }
757        // Check if the replica is stale before using it.
758        // If it is, purge it and retry.
759        if (replica.isStale()) {
760          LOG.info(this + ": got stale replica " + replica + ".  Removing " +
761              "this replica from the replicaInfoMap and retrying.");
762          // Remove the cache's reference to the replica.  This may or may not
763          // trigger a close.
764          purge(replica);
765          throw new RetriableException("ignoring stale replica " + replica);
766        }
767        ref(replica);
768        return info;
769      }
770    
771      private ShortCircuitReplicaInfo create(ExtendedBlockId key,
772          ShortCircuitReplicaCreator creator,
773          Waitable<ShortCircuitReplicaInfo> newWaitable) {
774        // Handle loading a new replica.
775        ShortCircuitReplicaInfo info = null;
776        try {
777          if (LOG.isTraceEnabled()) {
778            LOG.trace(this + ": loading " + key);
779          }
780          info = creator.createShortCircuitReplicaInfo();
781        } catch (RuntimeException e) {
782          LOG.warn(this + ": failed to load " + key, e);
783        }
784        if (info == null) info = new ShortCircuitReplicaInfo();
785        lock.lock();
786        try {
787          if (info.getReplica() != null) {
788            // On success, make sure the cache cleaner thread is running.
789            if (LOG.isTraceEnabled()) {
790              LOG.trace(this + ": successfully loaded " + info.getReplica());
791            }
792            startCacheCleanerThreadIfNeeded();
793            // Note: new ShortCircuitReplicas start with a refCount of 2,
794            // indicating that both this cache and whoever requested the 
795            // creation of the replica hold a reference.  So we don't need
796            // to increment the reference count here.
797          } else {
798            // On failure, remove the waitable from the replicaInfoMap.
799            Waitable<ShortCircuitReplicaInfo> waitableInMap = replicaInfoMap.get(key);
800            if (waitableInMap == newWaitable) replicaInfoMap.remove(key);
801            if (info.getInvalidTokenException() != null) {
802              LOG.warn(this + ": could not load " + key + " due to InvalidToken " +
803                  "exception.", info.getInvalidTokenException());
804            } else {
805              LOG.warn(this + ": failed to load " + key);
806            }
807          }
808          newWaitable.provide(info);
809        } finally {
810          lock.unlock();
811        }
812        return info;
813      }
814    
815      private void startCacheCleanerThreadIfNeeded() {
816        if (cacheCleaner == null) {
817          cacheCleaner = new CacheCleaner();
818          long rateMs = cacheCleaner.getRateInMs();
819          ScheduledFuture<?> future =
820              cleanerExecutor.scheduleAtFixedRate(cacheCleaner, rateMs, rateMs,
821                  TimeUnit.MILLISECONDS);
822          cacheCleaner.setFuture(future);
823          if (LOG.isDebugEnabled()) {
824            LOG.debug(this + ": starting cache cleaner thread which will run " +
825              "every " + rateMs + " ms");
826          }
827        }
828      }
829    
830      ClientMmap getOrCreateClientMmap(ShortCircuitReplica replica,
831          boolean anchored) {
832        Condition newCond;
833        lock.lock();
834        try {
835          while (replica.mmapData != null) {
836            if (replica.mmapData instanceof MappedByteBuffer) {
837              ref(replica);
838              MappedByteBuffer mmap = (MappedByteBuffer)replica.mmapData;
839              return new ClientMmap(replica, mmap, anchored);
840            } else if (replica.mmapData instanceof Long) {
841              long lastAttemptTimeMs = (Long)replica.mmapData;
842              long delta = Time.monotonicNow() - lastAttemptTimeMs;
843              if (delta < staleThresholdMs) {
844                if (LOG.isTraceEnabled()) {
845                  LOG.trace(this + ": can't create client mmap for " +
846                      replica + " because we failed to " +
847                      "create one just " + delta + "ms ago.");
848                }
849                return null;
850              }
851              if (LOG.isTraceEnabled()) {
852                LOG.trace(this + ": retrying client mmap for " + replica +
853                    ", " + delta + " ms after the previous failure.");
854              }
855            } else if (replica.mmapData instanceof Condition) {
856              Condition cond = (Condition)replica.mmapData;
857              cond.awaitUninterruptibly();
858            } else {
859              Preconditions.checkState(false, "invalid mmapData type " +
860                  replica.mmapData.getClass().getName());
861            }
862          }
863          newCond = lock.newCondition();
864          replica.mmapData = newCond;
865        } finally {
866          lock.unlock();
867        }
868        MappedByteBuffer map = replica.loadMmapInternal();
869        lock.lock();
870        try {
871          if (map == null) {
872            replica.mmapData = Long.valueOf(Time.monotonicNow());
873            newCond.signalAll();
874            return null;
875          } else {
876            outstandingMmapCount++;
877            replica.mmapData = map;
878            ref(replica);
879            newCond.signalAll();
880            return new ClientMmap(replica, map, anchored);
881          }
882        } finally {
883          lock.unlock();
884        }
885      }
886    
887      /**
888       * Close the cache and free all associated resources.
889       */
890      @Override
891      public void close() {
892        try {
893          lock.lock();
894          if (closed) return;
895          closed = true;
896          LOG.info(this + ": closing");
897          maxNonMmappedEvictableLifespanMs = 0;
898          maxEvictableMmapedSize = 0;
899          // Close and join cacheCleaner thread.
900          IOUtils.cleanup(LOG, cacheCleaner);
901          // Purge all replicas.
902          while (true) {
903            Entry<Long, ShortCircuitReplica> entry = evictable.firstEntry();
904            if (entry == null) break;
905            purge(entry.getValue());
906          }
907          while (true) {
908            Entry<Long, ShortCircuitReplica> entry = evictableMmapped.firstEntry();
909            if (entry == null) break;
910            purge(entry.getValue());
911          }
912        } finally {
913          lock.unlock();
914        }
915        IOUtils.cleanup(LOG, shmManager);
916      }
917    
918      @VisibleForTesting // ONLY for testing
919      public interface CacheVisitor {
920        void visit(int numOutstandingMmaps,
921            Map<ExtendedBlockId, ShortCircuitReplica> replicas,
922            Map<ExtendedBlockId, InvalidToken> failedLoads,
923            Map<Long, ShortCircuitReplica> evictable,
924            Map<Long, ShortCircuitReplica> evictableMmapped);
925      }
926    
927      @VisibleForTesting // ONLY for testing
928      public void accept(CacheVisitor visitor) {
929        lock.lock();
930        try {
931          Map<ExtendedBlockId, ShortCircuitReplica> replicas =
932              new HashMap<ExtendedBlockId, ShortCircuitReplica>();
933          Map<ExtendedBlockId, InvalidToken> failedLoads =
934              new HashMap<ExtendedBlockId, InvalidToken>();
935          for (Entry<ExtendedBlockId, Waitable<ShortCircuitReplicaInfo>> entry :
936                replicaInfoMap.entrySet()) {
937            Waitable<ShortCircuitReplicaInfo> waitable = entry.getValue();
938            if (waitable.hasVal()) {
939              if (waitable.getVal().getReplica() != null) {
940                replicas.put(entry.getKey(), waitable.getVal().getReplica());
941              } else {
942                // The exception may be null here, indicating a failed load that
943                // isn't the result of an invalid block token.
944                failedLoads.put(entry.getKey(),
945                    waitable.getVal().getInvalidTokenException());
946              }
947            }
948          }
949          if (LOG.isDebugEnabled()) {
950            StringBuilder builder = new StringBuilder();
951            builder.append("visiting ").append(visitor.getClass().getName()).
952                append("with outstandingMmapCount=").append(outstandingMmapCount).
953                append(", replicas=");
954            String prefix = "";
955            for (Entry<ExtendedBlockId, ShortCircuitReplica> entry : replicas.entrySet()) {
956              builder.append(prefix).append(entry.getValue());
957              prefix = ",";
958            }
959            prefix = "";
960            builder.append(", failedLoads=");
961            for (Entry<ExtendedBlockId, InvalidToken> entry : failedLoads.entrySet()) {
962              builder.append(prefix).append(entry.getValue());
963              prefix = ",";
964            }
965            prefix = "";
966            builder.append(", evictable=");
967            for (Entry<Long, ShortCircuitReplica> entry : evictable.entrySet()) {
968              builder.append(prefix).append(entry.getKey()).
969                  append(":").append(entry.getValue());
970              prefix = ",";
971            }
972            prefix = "";
973            builder.append(", evictableMmapped=");
974            for (Entry<Long, ShortCircuitReplica> entry : evictableMmapped.entrySet()) {
975              builder.append(prefix).append(entry.getKey()).
976                  append(":").append(entry.getValue());
977              prefix = ",";
978            }
979            LOG.debug(builder.toString());
980          }
981          visitor.visit(outstandingMmapCount, replicas, failedLoads,
982                evictable, evictableMmapped);
983        } finally {
984          lock.unlock();
985        }
986      }
987    
988      @Override
989      public String toString() {
990        return "ShortCircuitCache(0x" +
991            Integer.toHexString(System.identityHashCode(this)) + ")";
992      }
993    
994      /**
995       * Allocate a new shared memory slot.
996       *
997       * @param datanode       The datanode to allocate a shm slot with.
998       * @param peer           A peer connected to the datanode.
999       * @param usedPeer       Will be set to true if we use up the provided peer.
1000       * @param blockId        The block id and block pool id of the block we're 
1001       *                         allocating this slot for.
1002       * @param clientName     The name of the DFSClient allocating the shared
1003       *                         memory.
1004       * @return               Null if short-circuit shared memory is disabled;
1005       *                         a short-circuit memory slot otherwise.
1006       * @throws IOException   An exception if there was an error talking to 
1007       *                         the datanode.
1008       */
1009      public Slot allocShmSlot(DatanodeInfo datanode,
1010            DomainPeer peer, MutableBoolean usedPeer,
1011            ExtendedBlockId blockId, String clientName) throws IOException {
1012        if (shmManager != null) {
1013          return shmManager.allocSlot(datanode, peer, usedPeer,
1014              blockId, clientName);
1015        } else {
1016          return null;
1017        }
1018      }
1019    
1020      /**
1021       * Free a slot immediately.
1022       *
1023       * ONLY use this if the DataNode is not yet aware of the slot.
1024       * 
1025       * @param slot           The slot to free.
1026       */
1027      public void freeSlot(Slot slot) {
1028        Preconditions.checkState(shmManager != null);
1029        slot.makeInvalid();
1030        shmManager.freeSlot(slot);
1031      }
1032      
1033      /**
1034       * Schedule a shared memory slot to be released.
1035       *
1036       * @param slot           The slot to release.
1037       */
1038      public void scheduleSlotReleaser(Slot slot) {
1039        Preconditions.checkState(shmManager != null);
1040        releaserExecutor.execute(new SlotReleaser(slot));
1041      }
1042    
1043      @VisibleForTesting
1044      public DfsClientShmManager getDfsClientShmManager() {
1045        return shmManager;
1046      }
1047    }