001/** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hdfs.shortcircuit; 019 020import java.io.BufferedOutputStream; 021import java.io.Closeable; 022import java.io.DataInputStream; 023import java.io.DataOutputStream; 024import java.io.IOException; 025import java.nio.MappedByteBuffer; 026import java.util.HashMap; 027import java.util.Map; 028import java.util.Map.Entry; 029import java.util.NoSuchElementException; 030import java.util.concurrent.ScheduledFuture; 031import java.util.concurrent.ScheduledThreadPoolExecutor; 032import java.util.concurrent.TimeUnit; 033import java.util.concurrent.locks.Condition; 034import java.util.concurrent.locks.ReentrantLock; 035 036import org.apache.commons.collections.map.LinkedMap; 037import org.apache.commons.lang.mutable.MutableBoolean; 038import org.apache.hadoop.classification.InterfaceAudience; 039import org.apache.hadoop.hdfs.ExtendedBlockId; 040import org.apache.hadoop.hdfs.client.impl.DfsClientConf.ShortCircuitConf; 041import org.apache.hadoop.hdfs.net.DomainPeer; 042import org.apache.hadoop.hdfs.protocol.DatanodeInfo; 043import org.apache.hadoop.hdfs.protocol.datatransfer.Sender; 044import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.ReleaseShortCircuitAccessResponseProto; 045import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.Status; 046import org.apache.hadoop.hdfs.protocolPB.PBHelperClient; 047import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm.Slot; 048import org.apache.hadoop.hdfs.util.IOUtilsClient; 049import org.apache.hadoop.ipc.RetriableException; 050import org.apache.hadoop.net.unix.DomainSocket; 051import org.apache.hadoop.net.unix.DomainSocketWatcher; 052import org.apache.hadoop.security.token.SecretManager.InvalidToken; 053import org.apache.hadoop.util.StringUtils; 054import org.apache.hadoop.util.Time; 055import org.apache.hadoop.util.Waitable; 056 057import com.google.common.annotations.VisibleForTesting; 058import com.google.common.base.Preconditions; 059import com.google.common.util.concurrent.ThreadFactoryBuilder; 060 061import org.slf4j.Logger; 062import org.slf4j.LoggerFactory; 063 064/** 065 * The ShortCircuitCache tracks things which the client needs to access 066 * HDFS block files via short-circuit. 067 * 068 * These things include: memory-mapped regions, file descriptors, and shared 069 * memory areas for communicating with the DataNode. 070 */ 071@InterfaceAudience.Private 072public class ShortCircuitCache implements Closeable { 073 public static final Logger LOG = LoggerFactory.getLogger( 074 ShortCircuitCache.class); 075 076 /** 077 * Expiry thread which makes sure that the file descriptors get closed 078 * after a while. 079 */ 080 private class CacheCleaner implements Runnable, Closeable { 081 private ScheduledFuture<?> future; 082 083 /** 084 * Run the CacheCleaner thread. 085 * 086 * Whenever a thread requests a ShortCircuitReplica object, we will make 087 * sure it gets one. That ShortCircuitReplica object can then be re-used 088 * when another thread requests a ShortCircuitReplica object for the same 089 * block. So in that sense, there is no maximum size to the cache. 090 * 091 * However, when a ShortCircuitReplica object is unreferenced by the 092 * thread(s) that are using it, it becomes evictable. There are two 093 * separate eviction lists-- one for mmaped objects, and another for 094 * non-mmaped objects. We do this in order to avoid having the regular 095 * files kick the mmaped files out of the cache too quickly. Reusing 096 * an already-existing mmap gives a huge performance boost, since the 097 * page table entries don't have to be re-populated. Both the mmap 098 * and non-mmap evictable lists have maximum sizes and maximum lifespans. 099 */ 100 @Override 101 public void run() { 102 ShortCircuitCache.this.lock.lock(); 103 try { 104 if (ShortCircuitCache.this.closed) return; 105 long curMs = Time.monotonicNow(); 106 107 LOG.debug("{}: cache cleaner running at {}", this, curMs); 108 109 int numDemoted = demoteOldEvictableMmaped(curMs); 110 int numPurged = 0; 111 Long evictionTimeNs; 112 while (true) { 113 Object eldestKey; 114 try { 115 eldestKey = evictable.firstKey(); 116 } catch (NoSuchElementException e) { 117 break; 118 } 119 evictionTimeNs = (Long)eldestKey; 120 long evictionTimeMs = 121 TimeUnit.MILLISECONDS.convert(evictionTimeNs, TimeUnit.NANOSECONDS); 122 if (evictionTimeMs + maxNonMmappedEvictableLifespanMs >= curMs) break; 123 ShortCircuitReplica replica = (ShortCircuitReplica)evictable.get( 124 eldestKey); 125 if (LOG.isTraceEnabled()) { 126 LOG.trace("CacheCleaner: purging " + replica + ": " + 127 StringUtils.getStackTrace(Thread.currentThread())); 128 } 129 purge(replica); 130 numPurged++; 131 } 132 133 LOG.debug("{}: finishing cache cleaner run started at {}. Demoted {} " 134 + "mmapped replicas; purged {} replicas.", 135 this, curMs, numDemoted, numPurged); 136 } finally { 137 ShortCircuitCache.this.lock.unlock(); 138 } 139 } 140 141 @Override 142 public void close() throws IOException { 143 if (future != null) { 144 future.cancel(false); 145 } 146 } 147 148 public void setFuture(ScheduledFuture<?> future) { 149 this.future = future; 150 } 151 152 /** 153 * Get the rate at which this cleaner thread should be scheduled. 154 * 155 * We do this by taking the minimum expiration time and dividing by 4. 156 * 157 * @return the rate in milliseconds at which this thread should be 158 * scheduled. 159 */ 160 public long getRateInMs() { 161 long minLifespanMs = 162 Math.min(maxNonMmappedEvictableLifespanMs, 163 maxEvictableMmapedLifespanMs); 164 long sampleTimeMs = minLifespanMs / 4; 165 return (sampleTimeMs < 1) ? 1 : sampleTimeMs; 166 } 167 } 168 169 /** 170 * A task which asks the DataNode to release a short-circuit shared memory 171 * slot. If successful, this will tell the DataNode to stop monitoring 172 * changes to the mlock status of the replica associated with the slot. 173 * It will also allow us (the client) to re-use this slot for another 174 * replica. If we can't communicate with the DataNode for some reason, 175 * we tear down the shared memory segment to avoid being in an inconsistent 176 * state. 177 */ 178 private class SlotReleaser implements Runnable { 179 /** 180 * The slot that we need to release. 181 */ 182 private final Slot slot; 183 184 SlotReleaser(Slot slot) { 185 this.slot = slot; 186 } 187 188 @Override 189 public void run() { 190 LOG.trace("{}: about to release {}", ShortCircuitCache.this, slot); 191 final DfsClientShm shm = (DfsClientShm)slot.getShm(); 192 final DomainSocket shmSock = shm.getPeer().getDomainSocket(); 193 final String path = shmSock.getPath(); 194 boolean success = false; 195 try (DomainSocket sock = DomainSocket.connect(path); 196 DataOutputStream out = new DataOutputStream( 197 new BufferedOutputStream(sock.getOutputStream()))) { 198 new Sender(out).releaseShortCircuitFds(slot.getSlotId()); 199 DataInputStream in = new DataInputStream(sock.getInputStream()); 200 ReleaseShortCircuitAccessResponseProto resp = 201 ReleaseShortCircuitAccessResponseProto.parseFrom( 202 PBHelperClient.vintPrefixed(in)); 203 if (resp.getStatus() != Status.SUCCESS) { 204 String error = resp.hasError() ? resp.getError() : "(unknown)"; 205 throw new IOException(resp.getStatus().toString() + ": " + error); 206 } 207 LOG.trace("{}: released {}", this, slot); 208 success = true; 209 } catch (IOException e) { 210 LOG.error(ShortCircuitCache.this + ": failed to release " + 211 "short-circuit shared memory slot " + slot + " by sending " + 212 "ReleaseShortCircuitAccessRequestProto to " + path + 213 ". Closing shared memory segment.", e); 214 } finally { 215 if (success) { 216 shmManager.freeSlot(slot); 217 } else { 218 shm.getEndpointShmManager().shutdown(shm); 219 } 220 } 221 } 222 } 223 224 public interface ShortCircuitReplicaCreator { 225 /** 226 * Attempt to create a ShortCircuitReplica object. 227 * 228 * This callback will be made without holding any locks. 229 * 230 * @return a non-null ShortCircuitReplicaInfo object. 231 */ 232 ShortCircuitReplicaInfo createShortCircuitReplicaInfo(); 233 } 234 235 /** 236 * Lock protecting the cache. 237 */ 238 private final ReentrantLock lock = new ReentrantLock(); 239 240 /** 241 * The executor service that runs the cacheCleaner. 242 */ 243 private final ScheduledThreadPoolExecutor cleanerExecutor 244 = new ScheduledThreadPoolExecutor(1, new ThreadFactoryBuilder(). 245 setDaemon(true).setNameFormat("ShortCircuitCache_Cleaner"). 246 build()); 247 248 /** 249 * The executor service that runs the cacheCleaner. 250 */ 251 private final ScheduledThreadPoolExecutor releaserExecutor 252 = new ScheduledThreadPoolExecutor(1, new ThreadFactoryBuilder(). 253 setDaemon(true).setNameFormat("ShortCircuitCache_SlotReleaser"). 254 build()); 255 256 /** 257 * A map containing all ShortCircuitReplicaInfo objects, organized by Key. 258 * ShortCircuitReplicaInfo objects may contain a replica, or an InvalidToken 259 * exception. 260 */ 261 private final HashMap<ExtendedBlockId, Waitable<ShortCircuitReplicaInfo>> 262 replicaInfoMap = new HashMap<>(); 263 264 /** 265 * The CacheCleaner. We don't create this and schedule it until it becomes 266 * necessary. 267 */ 268 private CacheCleaner cacheCleaner; 269 270 /** 271 * LinkedMap of evictable elements. 272 * 273 * Maps (unique) insertion time in nanoseconds to the element. 274 */ 275 private final LinkedMap evictable = new LinkedMap(); 276 277 /** 278 * Maximum total size of the cache, including both mmapped and 279 * no$-mmapped elements. 280 */ 281 private final int maxTotalSize; 282 283 /** 284 * Non-mmaped elements older than this will be closed. 285 */ 286 private long maxNonMmappedEvictableLifespanMs; 287 288 /** 289 * LinkedMap of mmaped evictable elements. 290 * 291 * Maps (unique) insertion time in nanoseconds to the element. 292 */ 293 private final LinkedMap evictableMmapped = new LinkedMap(); 294 295 /** 296 * Maximum number of mmaped evictable elements. 297 */ 298 private int maxEvictableMmapedSize; 299 300 /** 301 * Mmaped elements older than this will be closed. 302 */ 303 private final long maxEvictableMmapedLifespanMs; 304 305 /** 306 * The minimum number of milliseconds we'll wait after an unsuccessful 307 * mmap attempt before trying again. 308 */ 309 private final long mmapRetryTimeoutMs; 310 311 /** 312 * How long we will keep replicas in the cache before declaring them 313 * to be stale. 314 */ 315 private final long staleThresholdMs; 316 317 /** 318 * True if the ShortCircuitCache is closed. 319 */ 320 private boolean closed = false; 321 322 /** 323 * Number of existing mmaps associated with this cache. 324 */ 325 private int outstandingMmapCount = 0; 326 327 /** 328 * Manages short-circuit shared memory segments for the client. 329 */ 330 private final DfsClientShmManager shmManager; 331 332 public static ShortCircuitCache fromConf(ShortCircuitConf conf) { 333 return new ShortCircuitCache( 334 conf.getShortCircuitStreamsCacheSize(), 335 conf.getShortCircuitStreamsCacheExpiryMs(), 336 conf.getShortCircuitMmapCacheSize(), 337 conf.getShortCircuitMmapCacheExpiryMs(), 338 conf.getShortCircuitMmapCacheRetryTimeout(), 339 conf.getShortCircuitCacheStaleThresholdMs(), 340 conf.getShortCircuitSharedMemoryWatcherInterruptCheckMs()); 341 } 342 343 public ShortCircuitCache(int maxTotalSize, long maxNonMmappedEvictableLifespanMs, 344 int maxEvictableMmapedSize, long maxEvictableMmapedLifespanMs, 345 long mmapRetryTimeoutMs, long staleThresholdMs, int shmInterruptCheckMs) { 346 Preconditions.checkArgument(maxTotalSize >= 0); 347 this.maxTotalSize = maxTotalSize; 348 Preconditions.checkArgument(maxNonMmappedEvictableLifespanMs >= 0); 349 this.maxNonMmappedEvictableLifespanMs = maxNonMmappedEvictableLifespanMs; 350 Preconditions.checkArgument(maxEvictableMmapedSize >= 0); 351 this.maxEvictableMmapedSize = maxEvictableMmapedSize; 352 Preconditions.checkArgument(maxEvictableMmapedLifespanMs >= 0); 353 this.maxEvictableMmapedLifespanMs = maxEvictableMmapedLifespanMs; 354 this.mmapRetryTimeoutMs = mmapRetryTimeoutMs; 355 this.staleThresholdMs = staleThresholdMs; 356 DfsClientShmManager shmManager = null; 357 if ((shmInterruptCheckMs > 0) && 358 (DomainSocketWatcher.getLoadingFailureReason() == null)) { 359 try { 360 shmManager = new DfsClientShmManager(shmInterruptCheckMs); 361 } catch (IOException e) { 362 LOG.error("failed to create ShortCircuitShmManager", e); 363 } 364 } 365 this.shmManager = shmManager; 366 } 367 368 public long getStaleThresholdMs() { 369 return staleThresholdMs; 370 } 371 372 /** 373 * Increment the reference count of a replica, and remove it from any free 374 * list it may be in. 375 * 376 * You must hold the cache lock while calling this function. 377 * 378 * @param replica The replica we're removing. 379 */ 380 private void ref(ShortCircuitReplica replica) { 381 lock.lock(); 382 try { 383 Preconditions.checkArgument(replica.refCount > 0, 384 "can't ref %s because its refCount reached %d", replica, 385 replica.refCount); 386 Long evictableTimeNs = replica.getEvictableTimeNs(); 387 replica.refCount++; 388 if (evictableTimeNs != null) { 389 String removedFrom = removeEvictable(replica); 390 if (LOG.isTraceEnabled()) { 391 LOG.trace(this + ": " + removedFrom + 392 " no longer contains " + replica + ". refCount " + 393 (replica.refCount - 1) + " -> " + replica.refCount + 394 StringUtils.getStackTrace(Thread.currentThread())); 395 396 } 397 } else if (LOG.isTraceEnabled()) { 398 LOG.trace(this + ": replica refCount " + 399 (replica.refCount - 1) + " -> " + replica.refCount + 400 StringUtils.getStackTrace(Thread.currentThread())); 401 } 402 } finally { 403 lock.unlock(); 404 } 405 } 406 407 /** 408 * Unreference a replica. 409 * 410 * You must hold the cache lock while calling this function. 411 * 412 * @param replica The replica being unreferenced. 413 */ 414 void unref(ShortCircuitReplica replica) { 415 lock.lock(); 416 try { 417 // If the replica is stale or unusable, but we haven't purged it yet, 418 // let's do that. It would be a shame to evict a non-stale replica so 419 // that we could put a stale or unusable one into the cache. 420 if (!replica.purged) { 421 String purgeReason = null; 422 if (!replica.getDataStream().getChannel().isOpen()) { 423 purgeReason = "purging replica because its data channel is closed."; 424 } else if (!replica.getMetaStream().getChannel().isOpen()) { 425 purgeReason = "purging replica because its meta channel is closed."; 426 } else if (replica.isStale()) { 427 purgeReason = "purging replica because it is stale."; 428 } 429 if (purgeReason != null) { 430 LOG.debug("{}: {}", this, purgeReason); 431 purge(replica); 432 } 433 } 434 String addedString = ""; 435 boolean shouldTrimEvictionMaps = false; 436 int newRefCount = --replica.refCount; 437 if (newRefCount == 0) { 438 // Close replica, since there are no remaining references to it. 439 Preconditions.checkArgument(replica.purged, 440 "Replica %s reached a refCount of 0 without being purged", replica); 441 replica.close(); 442 } else if (newRefCount == 1) { 443 Preconditions.checkState(null == replica.getEvictableTimeNs(), 444 "Replica %s had a refCount higher than 1, " + 445 "but was still evictable (evictableTimeNs = %d)", 446 replica, replica.getEvictableTimeNs()); 447 if (!replica.purged) { 448 // Add the replica to the end of an eviction list. 449 // Eviction lists are sorted by time. 450 if (replica.hasMmap()) { 451 insertEvictable(System.nanoTime(), replica, evictableMmapped); 452 addedString = "added to evictableMmapped, "; 453 } else { 454 insertEvictable(System.nanoTime(), replica, evictable); 455 addedString = "added to evictable, "; 456 } 457 shouldTrimEvictionMaps = true; 458 } 459 } else { 460 Preconditions.checkArgument(replica.refCount >= 0, 461 "replica's refCount went negative (refCount = %d" + 462 " for %s)", replica.refCount, replica); 463 } 464 if (LOG.isTraceEnabled()) { 465 LOG.trace(this + ": unref replica " + replica + 466 ": " + addedString + " refCount " + 467 (newRefCount + 1) + " -> " + newRefCount + 468 StringUtils.getStackTrace(Thread.currentThread())); 469 } 470 if (shouldTrimEvictionMaps) { 471 trimEvictionMaps(); 472 } 473 } finally { 474 lock.unlock(); 475 } 476 } 477 478 /** 479 * Demote old evictable mmaps into the regular eviction map. 480 * 481 * You must hold the cache lock while calling this function. 482 * 483 * @param now Current time in monotonic milliseconds. 484 * @return Number of replicas demoted. 485 */ 486 private int demoteOldEvictableMmaped(long now) { 487 int numDemoted = 0; 488 boolean needMoreSpace = false; 489 Long evictionTimeNs; 490 491 while (true) { 492 Object eldestKey; 493 try { 494 eldestKey = evictableMmapped.firstKey(); 495 } catch (NoSuchElementException e) { 496 break; 497 } 498 evictionTimeNs = (Long)eldestKey; 499 long evictionTimeMs = 500 TimeUnit.MILLISECONDS.convert(evictionTimeNs, TimeUnit.NANOSECONDS); 501 if (evictionTimeMs + maxEvictableMmapedLifespanMs >= now) { 502 if (evictableMmapped.size() < maxEvictableMmapedSize) { 503 break; 504 } 505 needMoreSpace = true; 506 } 507 ShortCircuitReplica replica = (ShortCircuitReplica)evictableMmapped.get( 508 eldestKey); 509 if (LOG.isTraceEnabled()) { 510 String rationale = needMoreSpace ? "because we need more space" : 511 "because it's too old"; 512 LOG.trace("demoteOldEvictable: demoting " + replica + ": " + 513 rationale + ": " + 514 StringUtils.getStackTrace(Thread.currentThread())); 515 } 516 removeEvictable(replica, evictableMmapped); 517 munmap(replica); 518 insertEvictable(evictionTimeNs, replica, evictable); 519 numDemoted++; 520 } 521 return numDemoted; 522 } 523 524 /** 525 * Trim the eviction lists. 526 */ 527 private void trimEvictionMaps() { 528 long now = Time.monotonicNow(); 529 demoteOldEvictableMmaped(now); 530 531 while (true) { 532 long evictableSize = evictable.size(); 533 long evictableMmappedSize = evictableMmapped.size(); 534 if (evictableSize + evictableMmappedSize <= maxTotalSize) { 535 return; 536 } 537 ShortCircuitReplica replica; 538 try { 539 if (evictableSize == 0) { 540 replica = (ShortCircuitReplica)evictableMmapped.get(evictableMmapped 541 .firstKey()); 542 } else { 543 replica = (ShortCircuitReplica)evictable.get(evictable.firstKey()); 544 } 545 } catch (NoSuchElementException e) { 546 break; 547 } 548 if (LOG.isTraceEnabled()) { 549 LOG.trace(this + ": trimEvictionMaps is purging " + replica + 550 StringUtils.getStackTrace(Thread.currentThread())); 551 } 552 purge(replica); 553 } 554 } 555 556 /** 557 * Munmap a replica, updating outstandingMmapCount. 558 * 559 * @param replica The replica to munmap. 560 */ 561 private void munmap(ShortCircuitReplica replica) { 562 replica.munmap(); 563 outstandingMmapCount--; 564 } 565 566 /** 567 * Remove a replica from an evictable map. 568 * 569 * @param replica The replica to remove. 570 * @return The map it was removed from. 571 */ 572 private String removeEvictable(ShortCircuitReplica replica) { 573 if (replica.hasMmap()) { 574 removeEvictable(replica, evictableMmapped); 575 return "evictableMmapped"; 576 } else { 577 removeEvictable(replica, evictable); 578 return "evictable"; 579 } 580 } 581 582 /** 583 * Remove a replica from an evictable map. 584 * 585 * @param replica The replica to remove. 586 * @param map The map to remove it from. 587 */ 588 private void removeEvictable(ShortCircuitReplica replica, 589 LinkedMap map) { 590 Long evictableTimeNs = replica.getEvictableTimeNs(); 591 Preconditions.checkNotNull(evictableTimeNs); 592 ShortCircuitReplica removed = (ShortCircuitReplica)map.remove( 593 evictableTimeNs); 594 Preconditions.checkState(removed == replica, 595 "failed to make %s unevictable", replica); 596 replica.setEvictableTimeNs(null); 597 } 598 599 /** 600 * Insert a replica into an evictable map. 601 * 602 * If an element already exists with this eviction time, we add a nanosecond 603 * to it until we find an unused key. 604 * 605 * @param evictionTimeNs The eviction time in absolute nanoseconds. 606 * @param replica The replica to insert. 607 * @param map The map to insert it into. 608 */ 609 private void insertEvictable(Long evictionTimeNs, 610 ShortCircuitReplica replica, LinkedMap map) { 611 while (map.containsKey(evictionTimeNs)) { 612 evictionTimeNs++; 613 } 614 Preconditions.checkState(null == replica.getEvictableTimeNs()); 615 replica.setEvictableTimeNs(evictionTimeNs); 616 map.put(evictionTimeNs, replica); 617 } 618 619 /** 620 * Purge a replica from the cache. 621 * 622 * This doesn't necessarily close the replica, since there may be 623 * outstanding references to it. However, it does mean the cache won't 624 * hand it out to anyone after this. 625 * 626 * You must hold the cache lock while calling this function. 627 * 628 * @param replica The replica being removed. 629 */ 630 private void purge(ShortCircuitReplica replica) { 631 boolean removedFromInfoMap = false; 632 String evictionMapName = null; 633 Preconditions.checkArgument(!replica.purged); 634 replica.purged = true; 635 Waitable<ShortCircuitReplicaInfo> val = replicaInfoMap.get(replica.key); 636 if (val != null) { 637 ShortCircuitReplicaInfo info = val.getVal(); 638 if ((info != null) && (info.getReplica() == replica)) { 639 replicaInfoMap.remove(replica.key); 640 removedFromInfoMap = true; 641 } 642 } 643 Long evictableTimeNs = replica.getEvictableTimeNs(); 644 if (evictableTimeNs != null) { 645 evictionMapName = removeEvictable(replica); 646 } 647 if (LOG.isTraceEnabled()) { 648 StringBuilder builder = new StringBuilder(); 649 builder.append(this).append(": ").append(": purged "). 650 append(replica).append(" from the cache."); 651 if (removedFromInfoMap) { 652 builder.append(" Removed from the replicaInfoMap."); 653 } 654 if (evictionMapName != null) { 655 builder.append(" Removed from ").append(evictionMapName); 656 } 657 LOG.trace(builder.toString()); 658 } 659 unref(replica); 660 } 661 662 /** 663 * Fetch or create a replica. 664 * 665 * You must hold the cache lock while calling this function. 666 * 667 * @param key Key to use for lookup. 668 * @param creator Replica creator callback. Will be called without 669 * the cache lock being held. 670 * 671 * @return Null if no replica could be found or created. 672 * The replica, otherwise. 673 */ 674 public ShortCircuitReplicaInfo fetchOrCreate(ExtendedBlockId key, 675 ShortCircuitReplicaCreator creator) { 676 Waitable<ShortCircuitReplicaInfo> newWaitable = null; 677 lock.lock(); 678 try { 679 ShortCircuitReplicaInfo info = null; 680 do { 681 if (closed) { 682 LOG.trace("{}: can't fethchOrCreate {} because the cache is closed.", 683 this, key); 684 return null; 685 } 686 Waitable<ShortCircuitReplicaInfo> waitable = replicaInfoMap.get(key); 687 if (waitable != null) { 688 try { 689 info = fetch(key, waitable); 690 } catch (RetriableException e) { 691 LOG.debug("{}: retrying {}", this, e.getMessage()); 692 } 693 } 694 } while (false); 695 if (info != null) return info; 696 // We need to load the replica ourselves. 697 newWaitable = new Waitable<>(lock.newCondition()); 698 replicaInfoMap.put(key, newWaitable); 699 } finally { 700 lock.unlock(); 701 } 702 return create(key, creator, newWaitable); 703 } 704 705 /** 706 * Fetch an existing ReplicaInfo object. 707 * 708 * @param key The key that we're using. 709 * @param waitable The waitable object to wait on. 710 * @return The existing ReplicaInfo object, or null if there is 711 * none. 712 * 713 * @throws RetriableException If the caller needs to retry. 714 */ 715 private ShortCircuitReplicaInfo fetch(ExtendedBlockId key, 716 Waitable<ShortCircuitReplicaInfo> waitable) throws RetriableException { 717 // Another thread is already in the process of loading this 718 // ShortCircuitReplica. So we simply wait for it to complete. 719 ShortCircuitReplicaInfo info; 720 try { 721 LOG.trace("{}: found waitable for {}", this, key); 722 info = waitable.await(); 723 } catch (InterruptedException e) { 724 LOG.info(this + ": interrupted while waiting for " + key); 725 Thread.currentThread().interrupt(); 726 throw new RetriableException("interrupted"); 727 } 728 if (info.getInvalidTokenException() != null) { 729 LOG.info(this + ": could not get " + key + " due to InvalidToken " + 730 "exception.", info.getInvalidTokenException()); 731 return info; 732 } 733 ShortCircuitReplica replica = info.getReplica(); 734 if (replica == null) { 735 LOG.warn(this + ": failed to get " + key); 736 return info; 737 } 738 if (replica.purged) { 739 // Ignore replicas that have already been purged from the cache. 740 throw new RetriableException("Ignoring purged replica " + 741 replica + ". Retrying."); 742 } 743 // Check if the replica is stale before using it. 744 // If it is, purge it and retry. 745 if (replica.isStale()) { 746 LOG.info(this + ": got stale replica " + replica + ". Removing " + 747 "this replica from the replicaInfoMap and retrying."); 748 // Remove the cache's reference to the replica. This may or may not 749 // trigger a close. 750 purge(replica); 751 throw new RetriableException("ignoring stale replica " + replica); 752 } 753 ref(replica); 754 return info; 755 } 756 757 private ShortCircuitReplicaInfo create(ExtendedBlockId key, 758 ShortCircuitReplicaCreator creator, 759 Waitable<ShortCircuitReplicaInfo> newWaitable) { 760 // Handle loading a new replica. 761 ShortCircuitReplicaInfo info = null; 762 try { 763 LOG.trace("{}: loading {}", this, key); 764 info = creator.createShortCircuitReplicaInfo(); 765 } catch (RuntimeException e) { 766 LOG.warn(this + ": failed to load " + key, e); 767 } 768 if (info == null) info = new ShortCircuitReplicaInfo(); 769 lock.lock(); 770 try { 771 if (info.getReplica() != null) { 772 // On success, make sure the cache cleaner thread is running. 773 LOG.trace("{}: successfully loaded {}", this, info.getReplica()); 774 startCacheCleanerThreadIfNeeded(); 775 // Note: new ShortCircuitReplicas start with a refCount of 2, 776 // indicating that both this cache and whoever requested the 777 // creation of the replica hold a reference. So we don't need 778 // to increment the reference count here. 779 } else { 780 // On failure, remove the waitable from the replicaInfoMap. 781 Waitable<ShortCircuitReplicaInfo> waitableInMap = replicaInfoMap.get(key); 782 if (waitableInMap == newWaitable) replicaInfoMap.remove(key); 783 if (info.getInvalidTokenException() != null) { 784 LOG.info(this + ": could not load " + key + " due to InvalidToken " + 785 "exception.", info.getInvalidTokenException()); 786 } else { 787 LOG.warn(this + ": failed to load " + key); 788 } 789 } 790 newWaitable.provide(info); 791 } finally { 792 lock.unlock(); 793 } 794 return info; 795 } 796 797 private void startCacheCleanerThreadIfNeeded() { 798 if (cacheCleaner == null) { 799 cacheCleaner = new CacheCleaner(); 800 long rateMs = cacheCleaner.getRateInMs(); 801 ScheduledFuture<?> future = 802 cleanerExecutor.scheduleAtFixedRate(cacheCleaner, rateMs, rateMs, 803 TimeUnit.MILLISECONDS); 804 cacheCleaner.setFuture(future); 805 LOG.debug("{}: starting cache cleaner thread which will run every {} ms", 806 this, rateMs); 807 } 808 } 809 810 ClientMmap getOrCreateClientMmap(ShortCircuitReplica replica, 811 boolean anchored) { 812 Condition newCond; 813 lock.lock(); 814 try { 815 while (replica.mmapData != null) { 816 if (replica.mmapData instanceof MappedByteBuffer) { 817 ref(replica); 818 MappedByteBuffer mmap = (MappedByteBuffer)replica.mmapData; 819 return new ClientMmap(replica, mmap, anchored); 820 } else if (replica.mmapData instanceof Long) { 821 long lastAttemptTimeMs = (Long)replica.mmapData; 822 long delta = Time.monotonicNow() - lastAttemptTimeMs; 823 if (delta < mmapRetryTimeoutMs) { 824 LOG.trace("{}: can't create client mmap for {} because we failed to" 825 + " create one just {}ms ago.", this, replica, delta); 826 return null; 827 } 828 LOG.trace("{}: retrying client mmap for {}, {} ms after the previous " 829 + "failure.", this, replica, delta); 830 } else if (replica.mmapData instanceof Condition) { 831 Condition cond = (Condition)replica.mmapData; 832 cond.awaitUninterruptibly(); 833 } else { 834 Preconditions.checkState(false, "invalid mmapData type %s", 835 replica.mmapData.getClass().getName()); 836 } 837 } 838 newCond = lock.newCondition(); 839 replica.mmapData = newCond; 840 } finally { 841 lock.unlock(); 842 } 843 MappedByteBuffer map = replica.loadMmapInternal(); 844 lock.lock(); 845 try { 846 if (map == null) { 847 replica.mmapData = Time.monotonicNow(); 848 newCond.signalAll(); 849 return null; 850 } else { 851 outstandingMmapCount++; 852 replica.mmapData = map; 853 ref(replica); 854 newCond.signalAll(); 855 return new ClientMmap(replica, map, anchored); 856 } 857 } finally { 858 lock.unlock(); 859 } 860 } 861 862 /** 863 * Close the cache and free all associated resources. 864 */ 865 @Override 866 public void close() { 867 try { 868 lock.lock(); 869 if (closed) return; 870 closed = true; 871 LOG.info(this + ": closing"); 872 maxNonMmappedEvictableLifespanMs = 0; 873 maxEvictableMmapedSize = 0; 874 // Close and join cacheCleaner thread. 875 IOUtilsClient.cleanup(LOG, cacheCleaner); 876 // Purge all replicas. 877 while (true) { 878 Object eldestKey; 879 try { 880 eldestKey = evictable.firstKey(); 881 } catch (NoSuchElementException e) { 882 break; 883 } 884 purge((ShortCircuitReplica)evictable.get(eldestKey)); 885 } 886 while (true) { 887 Object eldestKey; 888 try { 889 eldestKey = evictableMmapped.firstKey(); 890 } catch (NoSuchElementException e) { 891 break; 892 } 893 purge((ShortCircuitReplica)evictableMmapped.get(eldestKey)); 894 } 895 } finally { 896 lock.unlock(); 897 } 898 899 releaserExecutor.shutdown(); 900 cleanerExecutor.shutdown(); 901 // wait for existing tasks to terminate 902 try { 903 if (!releaserExecutor.awaitTermination(30, TimeUnit.SECONDS)) { 904 LOG.error("Forcing SlotReleaserThreadPool to shutdown!"); 905 releaserExecutor.shutdownNow(); 906 } 907 } catch (InterruptedException e) { 908 releaserExecutor.shutdownNow(); 909 Thread.currentThread().interrupt(); 910 LOG.error("Interrupted while waiting for SlotReleaserThreadPool " 911 + "to terminate", e); 912 } 913 914 // wait for existing tasks to terminate 915 try { 916 if (!cleanerExecutor.awaitTermination(30, TimeUnit.SECONDS)) { 917 LOG.error("Forcing CleanerThreadPool to shutdown!"); 918 cleanerExecutor.shutdownNow(); 919 } 920 } catch (InterruptedException e) { 921 cleanerExecutor.shutdownNow(); 922 Thread.currentThread().interrupt(); 923 LOG.error("Interrupted while waiting for CleanerThreadPool " 924 + "to terminate", e); 925 } 926 IOUtilsClient.cleanup(LOG, shmManager); 927 } 928 929 @VisibleForTesting // ONLY for testing 930 public interface CacheVisitor { 931 void visit(int numOutstandingMmaps, 932 Map<ExtendedBlockId, ShortCircuitReplica> replicas, 933 Map<ExtendedBlockId, InvalidToken> failedLoads, 934 LinkedMap evictable, 935 LinkedMap evictableMmapped); 936 } 937 938 @VisibleForTesting // ONLY for testing 939 public void accept(CacheVisitor visitor) { 940 lock.lock(); 941 try { 942 Map<ExtendedBlockId, ShortCircuitReplica> replicas = new HashMap<>(); 943 Map<ExtendedBlockId, InvalidToken> failedLoads = new HashMap<>(); 944 for (Entry<ExtendedBlockId, Waitable<ShortCircuitReplicaInfo>> entry : 945 replicaInfoMap.entrySet()) { 946 Waitable<ShortCircuitReplicaInfo> waitable = entry.getValue(); 947 if (waitable.hasVal()) { 948 if (waitable.getVal().getReplica() != null) { 949 replicas.put(entry.getKey(), waitable.getVal().getReplica()); 950 } else { 951 // The exception may be null here, indicating a failed load that 952 // isn't the result of an invalid block token. 953 failedLoads.put(entry.getKey(), 954 waitable.getVal().getInvalidTokenException()); 955 } 956 } 957 } 958 LOG.debug("visiting {} with outstandingMmapCount={}, replicas={}, " 959 + "failedLoads={}, evictable={}, evictableMmapped={}", 960 visitor.getClass().getName(), outstandingMmapCount, replicas, 961 failedLoads, evictable, evictableMmapped); 962 visitor.visit(outstandingMmapCount, replicas, failedLoads, 963 evictable, evictableMmapped); 964 } finally { 965 lock.unlock(); 966 } 967 } 968 969 @Override 970 public String toString() { 971 return "ShortCircuitCache(0x" + 972 Integer.toHexString(System.identityHashCode(this)) + ")"; 973 } 974 975 /** 976 * Allocate a new shared memory slot. 977 * 978 * @param datanode The datanode to allocate a shm slot with. 979 * @param peer A peer connected to the datanode. 980 * @param usedPeer Will be set to true if we use up the provided peer. 981 * @param blockId The block id and block pool id of the block we're 982 * allocating this slot for. 983 * @param clientName The name of the DFSClient allocating the shared 984 * memory. 985 * @return Null if short-circuit shared memory is disabled; 986 * a short-circuit memory slot otherwise. 987 * @throws IOException An exception if there was an error talking to 988 * the datanode. 989 */ 990 public Slot allocShmSlot(DatanodeInfo datanode, 991 DomainPeer peer, MutableBoolean usedPeer, 992 ExtendedBlockId blockId, String clientName) throws IOException { 993 if (shmManager != null) { 994 return shmManager.allocSlot(datanode, peer, usedPeer, 995 blockId, clientName); 996 } else { 997 return null; 998 } 999 } 1000 1001 /** 1002 * Free a slot immediately. 1003 * 1004 * ONLY use this if the DataNode is not yet aware of the slot. 1005 * 1006 * @param slot The slot to free. 1007 */ 1008 public void freeSlot(Slot slot) { 1009 Preconditions.checkState(shmManager != null); 1010 slot.makeInvalid(); 1011 shmManager.freeSlot(slot); 1012 } 1013 1014 /** 1015 * Schedule a shared memory slot to be released. 1016 * 1017 * @param slot The slot to release. 1018 */ 1019 public void scheduleSlotReleaser(Slot slot) { 1020 Preconditions.checkState(shmManager != null); 1021 releaserExecutor.execute(new SlotReleaser(slot)); 1022 } 1023 1024 @VisibleForTesting 1025 public DfsClientShmManager getDfsClientShmManager() { 1026 return shmManager; 1027 } 1028}