001/** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hdfs.shortcircuit; 019 020import java.io.FileInputStream; 021import java.io.IOException; 022import java.lang.reflect.Field; 023import java.util.BitSet; 024import java.util.Iterator; 025import java.util.NoSuchElementException; 026import java.util.Random; 027 028import org.apache.commons.lang.builder.EqualsBuilder; 029import org.apache.commons.lang.builder.HashCodeBuilder; 030import org.apache.hadoop.fs.InvalidRequestException; 031import org.apache.hadoop.hdfs.ExtendedBlockId; 032import org.apache.hadoop.io.nativeio.NativeIO; 033import org.apache.hadoop.io.nativeio.NativeIO.POSIX; 034import org.apache.hadoop.util.Shell; 035import org.apache.hadoop.util.StringUtils; 036 037import org.slf4j.Logger; 038import org.slf4j.LoggerFactory; 039 040import sun.misc.Unsafe; 041 042import com.google.common.base.Preconditions; 043import com.google.common.collect.ComparisonChain; 044import com.google.common.primitives.Ints; 045 046import javax.annotation.Nonnull; 047 048/** 049 * A shared memory segment used to implement short-circuit reads. 050 */ 051public class ShortCircuitShm { 052 private static final Logger LOG = LoggerFactory.getLogger( 053 ShortCircuitShm.class); 054 055 protected static final int BYTES_PER_SLOT = 64; 056 057 private static final Unsafe unsafe = safetyDance(); 058 059 private static Unsafe safetyDance() { 060 try { 061 Field f = Unsafe.class.getDeclaredField("theUnsafe"); 062 f.setAccessible(true); 063 return (Unsafe)f.get(null); 064 } catch (Throwable e) { 065 LOG.error("failed to load misc.Unsafe", e); 066 } 067 return null; 068 } 069 070 /** 071 * Calculate the usable size of a shared memory segment. 072 * We round down to a multiple of the slot size and do some validation. 073 * 074 * @param stream The stream we're using. 075 * @return The usable size of the shared memory segment. 076 */ 077 private static int getUsableLength(FileInputStream stream) 078 throws IOException { 079 int intSize = Ints.checkedCast(stream.getChannel().size()); 080 int slots = intSize / BYTES_PER_SLOT; 081 if (slots == 0) { 082 throw new IOException("size of shared memory segment was " + 083 intSize + ", but that is not enough to hold even one slot."); 084 } 085 return slots * BYTES_PER_SLOT; 086 } 087 088 /** 089 * Identifies a DfsClientShm. 090 */ 091 public static class ShmId implements Comparable<ShmId> { 092 private static final Random random = new Random(); 093 private final long hi; 094 private final long lo; 095 096 /** 097 * Generate a random ShmId. 098 * 099 * We generate ShmIds randomly to prevent a malicious client from 100 * successfully guessing one and using that to interfere with another 101 * client. 102 */ 103 public static ShmId createRandom() { 104 return new ShmId(random.nextLong(), random.nextLong()); 105 } 106 107 public ShmId(long hi, long lo) { 108 this.hi = hi; 109 this.lo = lo; 110 } 111 112 public long getHi() { 113 return hi; 114 } 115 116 public long getLo() { 117 return lo; 118 } 119 120 @Override 121 public boolean equals(Object o) { 122 if ((o == null) || (o.getClass() != this.getClass())) { 123 return false; 124 } 125 ShmId other = (ShmId)o; 126 return new EqualsBuilder(). 127 append(hi, other.hi). 128 append(lo, other.lo). 129 isEquals(); 130 } 131 132 @Override 133 public int hashCode() { 134 return new HashCodeBuilder(). 135 append(this.hi). 136 append(this.lo). 137 toHashCode(); 138 } 139 140 @Override 141 public String toString() { 142 return String.format("%016x%016x", hi, lo); 143 } 144 145 @Override 146 public int compareTo(@Nonnull ShmId other) { 147 return ComparisonChain.start(). 148 compare(hi, other.hi). 149 compare(lo, other.lo). 150 result(); 151 } 152 } 153 154 /** 155 * Uniquely identifies a slot. 156 */ 157 public static class SlotId { 158 private final ShmId shmId; 159 private final int slotIdx; 160 161 public SlotId(ShmId shmId, int slotIdx) { 162 this.shmId = shmId; 163 this.slotIdx = slotIdx; 164 } 165 166 public ShmId getShmId() { 167 return shmId; 168 } 169 170 public int getSlotIdx() { 171 return slotIdx; 172 } 173 174 @Override 175 public boolean equals(Object o) { 176 if ((o == null) || (o.getClass() != this.getClass())) { 177 return false; 178 } 179 SlotId other = (SlotId)o; 180 return new EqualsBuilder(). 181 append(shmId, other.shmId). 182 append(slotIdx, other.slotIdx). 183 isEquals(); 184 } 185 186 @Override 187 public int hashCode() { 188 return new HashCodeBuilder(). 189 append(this.shmId). 190 append(this.slotIdx). 191 toHashCode(); 192 } 193 194 @Override 195 public String toString() { 196 return String.format("SlotId(%s:%d)", shmId.toString(), slotIdx); 197 } 198 } 199 200 public class SlotIterator implements Iterator<Slot> { 201 int slotIdx = -1; 202 203 @Override 204 public boolean hasNext() { 205 synchronized (ShortCircuitShm.this) { 206 return allocatedSlots.nextSetBit(slotIdx + 1) != -1; 207 } 208 } 209 210 @Override 211 public Slot next() { 212 synchronized (ShortCircuitShm.this) { 213 int nextSlotIdx = allocatedSlots.nextSetBit(slotIdx + 1); 214 if (nextSlotIdx == -1) { 215 throw new NoSuchElementException(); 216 } 217 slotIdx = nextSlotIdx; 218 return slots[nextSlotIdx]; 219 } 220 } 221 222 @Override 223 public void remove() { 224 throw new UnsupportedOperationException("SlotIterator " + 225 "doesn't support removal"); 226 } 227 } 228 229 /** 230 * A slot containing information about a replica. 231 * 232 * The format is: 233 * word 0 234 * bit 0:32 Slot flags (see below). 235 * bit 33:63 Anchor count. 236 * word 1:7 237 * Reserved for future use, such as statistics. 238 * Padding is also useful for avoiding false sharing. 239 * 240 * Little-endian versus big-endian is not relevant here since both the client 241 * and the server reside on the same computer and use the same orientation. 242 */ 243 public class Slot { 244 /** 245 * Flag indicating that the slot is valid. 246 * 247 * The DFSClient sets this flag when it allocates a new slot within one of 248 * its shared memory regions. 249 * 250 * The DataNode clears this flag when the replica associated with this slot 251 * is no longer valid. The client itself also clears this flag when it 252 * believes that the DataNode is no longer using this slot to communicate. 253 */ 254 private static final long VALID_FLAG = 1L<<63; 255 256 /** 257 * Flag indicating that the slot can be anchored. 258 */ 259 private static final long ANCHORABLE_FLAG = 1L<<62; 260 261 /** 262 * The slot address in memory. 263 */ 264 private final long slotAddress; 265 266 /** 267 * BlockId of the block this slot is used for. 268 */ 269 private final ExtendedBlockId blockId; 270 271 Slot(long slotAddress, ExtendedBlockId blockId) { 272 this.slotAddress = slotAddress; 273 this.blockId = blockId; 274 } 275 276 /** 277 * Get the short-circuit memory segment associated with this Slot. 278 * 279 * @return The enclosing short-circuit memory segment. 280 */ 281 public ShortCircuitShm getShm() { 282 return ShortCircuitShm.this; 283 } 284 285 /** 286 * Get the ExtendedBlockId associated with this slot. 287 * 288 * @return The ExtendedBlockId of this slot. 289 */ 290 public ExtendedBlockId getBlockId() { 291 return blockId; 292 } 293 294 /** 295 * Get the SlotId of this slot, containing both shmId and slotIdx. 296 * 297 * @return The SlotId of this slot. 298 */ 299 public SlotId getSlotId() { 300 return new SlotId(getShmId(), getSlotIdx()); 301 } 302 303 /** 304 * Get the Slot index. 305 * 306 * @return The index of this slot. 307 */ 308 public int getSlotIdx() { 309 return Ints.checkedCast( 310 (slotAddress - baseAddress) / BYTES_PER_SLOT); 311 } 312 313 /** 314 * Clear the slot. 315 */ 316 void clear() { 317 unsafe.putLongVolatile(null, this.slotAddress, 0); 318 } 319 320 private boolean isSet(long flag) { 321 long prev = unsafe.getLongVolatile(null, this.slotAddress); 322 return (prev & flag) != 0; 323 } 324 325 private void setFlag(long flag) { 326 long prev; 327 do { 328 prev = unsafe.getLongVolatile(null, this.slotAddress); 329 if ((prev & flag) != 0) { 330 return; 331 } 332 } while (!unsafe.compareAndSwapLong(null, this.slotAddress, 333 prev, prev | flag)); 334 } 335 336 private void clearFlag(long flag) { 337 long prev; 338 do { 339 prev = unsafe.getLongVolatile(null, this.slotAddress); 340 if ((prev & flag) == 0) { 341 return; 342 } 343 } while (!unsafe.compareAndSwapLong(null, this.slotAddress, 344 prev, prev & (~flag))); 345 } 346 347 public boolean isValid() { 348 return isSet(VALID_FLAG); 349 } 350 351 public void makeValid() { 352 setFlag(VALID_FLAG); 353 } 354 355 public void makeInvalid() { 356 clearFlag(VALID_FLAG); 357 } 358 359 public boolean isAnchorable() { 360 return isSet(ANCHORABLE_FLAG); 361 } 362 363 public void makeAnchorable() { 364 setFlag(ANCHORABLE_FLAG); 365 } 366 367 public void makeUnanchorable() { 368 clearFlag(ANCHORABLE_FLAG); 369 } 370 371 public boolean isAnchored() { 372 long prev = unsafe.getLongVolatile(null, this.slotAddress); 373 // Slot is no longer valid. 374 return (prev & VALID_FLAG) != 0 && ((prev & 0x7fffffff) != 0); 375 } 376 377 /** 378 * Try to add an anchor for a given slot. 379 * 380 * When a slot is anchored, we know that the block it refers to is resident 381 * in memory. 382 * 383 * @return True if the slot is anchored. 384 */ 385 public boolean addAnchor() { 386 long prev; 387 do { 388 prev = unsafe.getLongVolatile(null, this.slotAddress); 389 if ((prev & VALID_FLAG) == 0) { 390 // Slot is no longer valid. 391 return false; 392 } 393 if ((prev & ANCHORABLE_FLAG) == 0) { 394 // Slot can't be anchored right now. 395 return false; 396 } 397 if ((prev & 0x7fffffff) == 0x7fffffff) { 398 // Too many other threads have anchored the slot (2 billion?) 399 return false; 400 } 401 } while (!unsafe.compareAndSwapLong(null, this.slotAddress, 402 prev, prev + 1)); 403 return true; 404 } 405 406 /** 407 * Remove an anchor for a given slot. 408 */ 409 public void removeAnchor() { 410 long prev; 411 do { 412 prev = unsafe.getLongVolatile(null, this.slotAddress); 413 Preconditions.checkState((prev & 0x7fffffff) != 0, 414 "Tried to remove anchor for slot " + slotAddress +", which was " + 415 "not anchored."); 416 } while (!unsafe.compareAndSwapLong(null, this.slotAddress, 417 prev, prev - 1)); 418 } 419 420 @Override 421 public String toString() { 422 return "Slot(slotIdx=" + getSlotIdx() + ", shm=" + getShm() + ")"; 423 } 424 } 425 426 /** 427 * ID for this SharedMemorySegment. 428 */ 429 private final ShmId shmId; 430 431 /** 432 * The base address of the memory-mapped file. 433 */ 434 private final long baseAddress; 435 436 /** 437 * The mmapped length of the shared memory segment 438 */ 439 private final int mmappedLength; 440 441 /** 442 * The slots associated with this shared memory segment. 443 * slot[i] contains the slot at offset i * BYTES_PER_SLOT, 444 * or null if that slot is not allocated. 445 */ 446 private final Slot slots[]; 447 448 /** 449 * A bitset where each bit represents a slot which is in use. 450 */ 451 private final BitSet allocatedSlots; 452 453 /** 454 * Create the ShortCircuitShm. 455 * 456 * @param shmId The ID to use. 457 * @param stream The stream that we're going to use to create this 458 * shared memory segment. 459 * 460 * Although this is a FileInputStream, we are going to 461 * assume that the underlying file descriptor is writable 462 * as well as readable. It would be more appropriate to use 463 * a RandomAccessFile here, but that class does not have 464 * any public accessor which returns a FileDescriptor, 465 * unlike FileInputStream. 466 */ 467 public ShortCircuitShm(ShmId shmId, FileInputStream stream) 468 throws IOException { 469 if (!NativeIO.isAvailable()) { 470 throw new UnsupportedOperationException("NativeIO is not available."); 471 } 472 if (Shell.WINDOWS) { 473 throw new UnsupportedOperationException( 474 "DfsClientShm is not yet implemented for Windows."); 475 } 476 if (unsafe == null) { 477 throw new UnsupportedOperationException( 478 "can't use DfsClientShm because we failed to " + 479 "load misc.Unsafe."); 480 } 481 this.shmId = shmId; 482 this.mmappedLength = getUsableLength(stream); 483 this.baseAddress = POSIX.mmap(stream.getFD(), 484 POSIX.MMAP_PROT_READ | POSIX.MMAP_PROT_WRITE, true, mmappedLength); 485 this.slots = new Slot[mmappedLength / BYTES_PER_SLOT]; 486 this.allocatedSlots = new BitSet(slots.length); 487 LOG.trace("creating {}(shmId={}, mmappedLength={}, baseAddress={}, " 488 + "slots.length={})", this.getClass().getSimpleName(), shmId, 489 mmappedLength, String.format("%x", baseAddress), slots.length); 490 } 491 492 public final ShmId getShmId() { 493 return shmId; 494 } 495 496 /** 497 * Determine if this shared memory object is empty. 498 * 499 * @return True if the shared memory object is empty. 500 */ 501 synchronized final public boolean isEmpty() { 502 return allocatedSlots.nextSetBit(0) == -1; 503 } 504 505 /** 506 * Determine if this shared memory object is full. 507 * 508 * @return True if the shared memory object is full. 509 */ 510 synchronized final public boolean isFull() { 511 return allocatedSlots.nextClearBit(0) >= slots.length; 512 } 513 514 /** 515 * Calculate the base address of a slot. 516 * 517 * @param slotIdx Index of the slot. 518 * @return The base address of the slot. 519 */ 520 private long calculateSlotAddress(int slotIdx) { 521 long offset = slotIdx; 522 offset *= BYTES_PER_SLOT; 523 return this.baseAddress + offset; 524 } 525 526 /** 527 * Allocate a new slot and register it. 528 * 529 * This function chooses an empty slot, initializes it, and then returns 530 * the relevant Slot object. 531 * 532 * @return The new slot. 533 */ 534 synchronized public final Slot allocAndRegisterSlot( 535 ExtendedBlockId blockId) { 536 int idx = allocatedSlots.nextClearBit(0); 537 if (idx >= slots.length) { 538 throw new RuntimeException(this + ": no more slots are available."); 539 } 540 allocatedSlots.set(idx, true); 541 Slot slot = new Slot(calculateSlotAddress(idx), blockId); 542 slot.clear(); 543 slot.makeValid(); 544 slots[idx] = slot; 545 if (LOG.isTraceEnabled()) { 546 LOG.trace(this + ": allocAndRegisterSlot " + idx + ": allocatedSlots=" + allocatedSlots + 547 StringUtils.getStackTrace(Thread.currentThread())); 548 } 549 return slot; 550 } 551 552 synchronized public final Slot getSlot(int slotIdx) 553 throws InvalidRequestException { 554 if (!allocatedSlots.get(slotIdx)) { 555 throw new InvalidRequestException(this + ": slot " + slotIdx + 556 " does not exist."); 557 } 558 return slots[slotIdx]; 559 } 560 561 /** 562 * Register a slot. 563 * 564 * This function looks at a slot which has already been initialized (by 565 * another process), and registers it with us. Then, it returns the 566 * relevant Slot object. 567 * 568 * @return The slot. 569 * 570 * @throws InvalidRequestException 571 * If the slot index we're trying to allocate has not been 572 * initialized, or is already in use. 573 */ 574 synchronized public final Slot registerSlot(int slotIdx, 575 ExtendedBlockId blockId) throws InvalidRequestException { 576 if (slotIdx < 0) { 577 throw new InvalidRequestException(this + ": invalid negative slot " + 578 "index " + slotIdx); 579 } 580 if (slotIdx >= slots.length) { 581 throw new InvalidRequestException(this + ": invalid slot " + 582 "index " + slotIdx); 583 } 584 if (allocatedSlots.get(slotIdx)) { 585 throw new InvalidRequestException(this + ": slot " + slotIdx + 586 " is already in use."); 587 } 588 Slot slot = new Slot(calculateSlotAddress(slotIdx), blockId); 589 if (!slot.isValid()) { 590 throw new InvalidRequestException(this + ": slot " + slotIdx + 591 " is not marked as valid."); 592 } 593 slots[slotIdx] = slot; 594 allocatedSlots.set(slotIdx, true); 595 if (LOG.isTraceEnabled()) { 596 LOG.trace(this + ": registerSlot " + slotIdx + ": allocatedSlots=" + allocatedSlots + 597 StringUtils.getStackTrace(Thread.currentThread())); 598 } 599 return slot; 600 } 601 602 /** 603 * Unregisters a slot. 604 * 605 * This doesn't alter the contents of the slot. It just means 606 * 607 * @param slotIdx Index of the slot to unregister. 608 */ 609 synchronized public final void unregisterSlot(int slotIdx) { 610 Preconditions.checkState(allocatedSlots.get(slotIdx), 611 "tried to unregister slot " + slotIdx + ", which was not registered."); 612 allocatedSlots.set(slotIdx, false); 613 slots[slotIdx] = null; 614 LOG.trace("{}: unregisterSlot {}", this, slotIdx); 615 } 616 617 /** 618 * Iterate over all allocated slots. 619 * 620 * Note that this method isn't safe if 621 * 622 * @return The slot iterator. 623 */ 624 public SlotIterator slotIterator() { 625 return new SlotIterator(); 626 } 627 628 public void free() { 629 try { 630 POSIX.munmap(baseAddress, mmappedLength); 631 } catch (IOException e) { 632 LOG.warn(this + ": failed to munmap", e); 633 } 634 LOG.trace(this + ": freed"); 635 } 636 637 @Override 638 public String toString() { 639 return this.getClass().getSimpleName() + "(" + shmId + ")"; 640 } 641}