001 /**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License. You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018 package org.apache.hadoop.hdfs.shortcircuit;
019
020 import java.io.FileInputStream;
021 import java.io.IOException;
022 import java.lang.reflect.Field;
023 import java.util.BitSet;
024 import java.util.Iterator;
025 import java.util.NoSuchElementException;
026 import java.util.Random;
027
028 import org.apache.commons.lang.builder.EqualsBuilder;
029 import org.apache.commons.lang.builder.HashCodeBuilder;
030 import org.apache.commons.logging.Log;
031 import org.apache.commons.logging.LogFactory;
032 import org.apache.hadoop.fs.InvalidRequestException;
033 import org.apache.hadoop.hdfs.ExtendedBlockId;
034 import org.apache.hadoop.io.nativeio.NativeIO;
035 import org.apache.hadoop.io.nativeio.NativeIO.POSIX;
036 import org.apache.hadoop.util.Shell;
037 import org.apache.hadoop.util.StringUtils;
038
039 import sun.misc.Unsafe;
040
041 import com.google.common.base.Preconditions;
042 import com.google.common.collect.ComparisonChain;
043 import com.google.common.primitives.Ints;
044
045 /**
046 * A shared memory segment used to implement short-circuit reads.
047 */
048 public class ShortCircuitShm {
049 private static final Log LOG = LogFactory.getLog(ShortCircuitShm.class);
050
051 protected static final int BYTES_PER_SLOT = 64;
052
053 private static final Unsafe unsafe = safetyDance();
054
055 private static Unsafe safetyDance() {
056 try {
057 Field f = Unsafe.class.getDeclaredField("theUnsafe");
058 f.setAccessible(true);
059 return (Unsafe)f.get(null);
060 } catch (Throwable e) {
061 LOG.error("failed to load misc.Unsafe", e);
062 }
063 return null;
064 }
065
066 /**
067 * Calculate the usable size of a shared memory segment.
068 * We round down to a multiple of the slot size and do some validation.
069 *
070 * @param stream The stream we're using.
071 * @return The usable size of the shared memory segment.
072 */
073 private static int getUsableLength(FileInputStream stream)
074 throws IOException {
075 int intSize = Ints.checkedCast(stream.getChannel().size());
076 int slots = intSize / BYTES_PER_SLOT;
077 if (slots == 0) {
078 throw new IOException("size of shared memory segment was " +
079 intSize + ", but that is not enough to hold even one slot.");
080 }
081 return slots * BYTES_PER_SLOT;
082 }
083
084 /**
085 * Identifies a DfsClientShm.
086 */
087 public static class ShmId implements Comparable<ShmId> {
088 private static final Random random = new Random();
089 private final long hi;
090 private final long lo;
091
092 /**
093 * Generate a random ShmId.
094 *
095 * We generate ShmIds randomly to prevent a malicious client from
096 * successfully guessing one and using that to interfere with another
097 * client.
098 */
099 public static ShmId createRandom() {
100 return new ShmId(random.nextLong(), random.nextLong());
101 }
102
103 public ShmId(long hi, long lo) {
104 this.hi = hi;
105 this.lo = lo;
106 }
107
108 public long getHi() {
109 return hi;
110 }
111
112 public long getLo() {
113 return lo;
114 }
115
116 @Override
117 public boolean equals(Object o) {
118 if ((o == null) || (o.getClass() != this.getClass())) {
119 return false;
120 }
121 ShmId other = (ShmId)o;
122 return new EqualsBuilder().
123 append(hi, other.hi).
124 append(lo, other.lo).
125 isEquals();
126 }
127
128 @Override
129 public int hashCode() {
130 return new HashCodeBuilder().
131 append(this.hi).
132 append(this.lo).
133 toHashCode();
134 }
135
136 @Override
137 public String toString() {
138 return String.format("%016x%016x", hi, lo);
139 }
140
141 @Override
142 public int compareTo(ShmId other) {
143 return ComparisonChain.start().
144 compare(hi, other.hi).
145 compare(lo, other.lo).
146 result();
147 }
148 };
149
150 /**
151 * Uniquely identifies a slot.
152 */
153 public static class SlotId {
154 private final ShmId shmId;
155 private final int slotIdx;
156
157 public SlotId(ShmId shmId, int slotIdx) {
158 this.shmId = shmId;
159 this.slotIdx = slotIdx;
160 }
161
162 public ShmId getShmId() {
163 return shmId;
164 }
165
166 public int getSlotIdx() {
167 return slotIdx;
168 }
169
170 @Override
171 public boolean equals(Object o) {
172 if ((o == null) || (o.getClass() != this.getClass())) {
173 return false;
174 }
175 SlotId other = (SlotId)o;
176 return new EqualsBuilder().
177 append(shmId, other.shmId).
178 append(slotIdx, other.slotIdx).
179 isEquals();
180 }
181
182 @Override
183 public int hashCode() {
184 return new HashCodeBuilder().
185 append(this.shmId).
186 append(this.slotIdx).
187 toHashCode();
188 }
189
190 @Override
191 public String toString() {
192 return String.format("SlotId(%s:%d)", shmId.toString(), slotIdx);
193 }
194 }
195
196 public class SlotIterator implements Iterator<Slot> {
197 int slotIdx = -1;
198
199 @Override
200 public boolean hasNext() {
201 synchronized (ShortCircuitShm.this) {
202 return allocatedSlots.nextSetBit(slotIdx + 1) != -1;
203 }
204 }
205
206 @Override
207 public Slot next() {
208 synchronized (ShortCircuitShm.this) {
209 int nextSlotIdx = allocatedSlots.nextSetBit(slotIdx + 1);
210 if (nextSlotIdx == -1) {
211 throw new NoSuchElementException();
212 }
213 slotIdx = nextSlotIdx;
214 return slots[nextSlotIdx];
215 }
216 }
217
218 @Override
219 public void remove() {
220 throw new UnsupportedOperationException("SlotIterator " +
221 "doesn't support removal");
222 }
223 }
224
225 /**
226 * A slot containing information about a replica.
227 *
228 * The format is:
229 * word 0
230 * bit 0:32 Slot flags (see below).
231 * bit 33:63 Anchor count.
232 * word 1:7
233 * Reserved for future use, such as statistics.
234 * Padding is also useful for avoiding false sharing.
235 *
236 * Little-endian versus big-endian is not relevant here since both the client
237 * and the server reside on the same computer and use the same orientation.
238 */
239 public class Slot {
240 /**
241 * Flag indicating that the slot is valid.
242 *
243 * The DFSClient sets this flag when it allocates a new slot within one of
244 * its shared memory regions.
245 *
246 * The DataNode clears this flag when the replica associated with this slot
247 * is no longer valid. The client itself also clears this flag when it
248 * believes that the DataNode is no longer using this slot to communicate.
249 */
250 private static final long VALID_FLAG = 1L<<63;
251
252 /**
253 * Flag indicating that the slot can be anchored.
254 */
255 private static final long ANCHORABLE_FLAG = 1L<<62;
256
257 /**
258 * The slot address in memory.
259 */
260 private final long slotAddress;
261
262 /**
263 * BlockId of the block this slot is used for.
264 */
265 private final ExtendedBlockId blockId;
266
267 Slot(long slotAddress, ExtendedBlockId blockId) {
268 this.slotAddress = slotAddress;
269 this.blockId = blockId;
270 }
271
272 /**
273 * Get the short-circuit memory segment associated with this Slot.
274 *
275 * @return The enclosing short-circuit memory segment.
276 */
277 public ShortCircuitShm getShm() {
278 return ShortCircuitShm.this;
279 }
280
281 /**
282 * Get the ExtendedBlockId associated with this slot.
283 *
284 * @return The ExtendedBlockId of this slot.
285 */
286 public ExtendedBlockId getBlockId() {
287 return blockId;
288 }
289
290 /**
291 * Get the SlotId of this slot, containing both shmId and slotIdx.
292 *
293 * @return The SlotId of this slot.
294 */
295 public SlotId getSlotId() {
296 return new SlotId(getShmId(), getSlotIdx());
297 }
298
299 /**
300 * Get the Slot index.
301 *
302 * @return The index of this slot.
303 */
304 public int getSlotIdx() {
305 return Ints.checkedCast(
306 (slotAddress - baseAddress) / BYTES_PER_SLOT);
307 }
308
309 /**
310 * Clear the slot.
311 */
312 void clear() {
313 unsafe.putLongVolatile(null, this.slotAddress, 0);
314 }
315
316 private boolean isSet(long flag) {
317 long prev = unsafe.getLongVolatile(null, this.slotAddress);
318 return (prev & flag) != 0;
319 }
320
321 private void setFlag(long flag) {
322 long prev;
323 do {
324 prev = unsafe.getLongVolatile(null, this.slotAddress);
325 if ((prev & flag) != 0) {
326 return;
327 }
328 } while (!unsafe.compareAndSwapLong(null, this.slotAddress,
329 prev, prev | flag));
330 }
331
332 private void clearFlag(long flag) {
333 long prev;
334 do {
335 prev = unsafe.getLongVolatile(null, this.slotAddress);
336 if ((prev & flag) == 0) {
337 return;
338 }
339 } while (!unsafe.compareAndSwapLong(null, this.slotAddress,
340 prev, prev & (~flag)));
341 }
342
343 public boolean isValid() {
344 return isSet(VALID_FLAG);
345 }
346
347 public void makeValid() {
348 setFlag(VALID_FLAG);
349 }
350
351 public void makeInvalid() {
352 clearFlag(VALID_FLAG);
353 }
354
355 public boolean isAnchorable() {
356 return isSet(ANCHORABLE_FLAG);
357 }
358
359 public void makeAnchorable() {
360 setFlag(ANCHORABLE_FLAG);
361 }
362
363 public void makeUnanchorable() {
364 clearFlag(ANCHORABLE_FLAG);
365 }
366
367 public boolean isAnchored() {
368 long prev = unsafe.getLongVolatile(null, this.slotAddress);
369 if ((prev & VALID_FLAG) == 0) {
370 // Slot is no longer valid.
371 return false;
372 }
373 return ((prev & 0x7fffffff) != 0);
374 }
375
376 /**
377 * Try to add an anchor for a given slot.
378 *
379 * When a slot is anchored, we know that the block it refers to is resident
380 * in memory.
381 *
382 * @return True if the slot is anchored.
383 */
384 public boolean addAnchor() {
385 long prev;
386 do {
387 prev = unsafe.getLongVolatile(null, this.slotAddress);
388 if ((prev & VALID_FLAG) == 0) {
389 // Slot is no longer valid.
390 return false;
391 }
392 if ((prev & ANCHORABLE_FLAG) == 0) {
393 // Slot can't be anchored right now.
394 return false;
395 }
396 if ((prev & 0x7fffffff) == 0x7fffffff) {
397 // Too many other threads have anchored the slot (2 billion?)
398 return false;
399 }
400 } while (!unsafe.compareAndSwapLong(null, this.slotAddress,
401 prev, prev + 1));
402 return true;
403 }
404
405 /**
406 * Remove an anchor for a given slot.
407 */
408 public void removeAnchor() {
409 long prev;
410 do {
411 prev = unsafe.getLongVolatile(null, this.slotAddress);
412 Preconditions.checkState((prev & 0x7fffffff) != 0,
413 "Tried to remove anchor for slot " + slotAddress +", which was " +
414 "not anchored.");
415 } while (!unsafe.compareAndSwapLong(null, this.slotAddress,
416 prev, prev - 1));
417 }
418
419 @Override
420 public String toString() {
421 return "Slot(slotIdx=" + getSlotIdx() + ", shm=" + getShm() + ")";
422 }
423 }
424
425 /**
426 * ID for this SharedMemorySegment.
427 */
428 private final ShmId shmId;
429
430 /**
431 * The base address of the memory-mapped file.
432 */
433 private final long baseAddress;
434
435 /**
436 * The mmapped length of the shared memory segment
437 */
438 private final int mmappedLength;
439
/**
 * The slots associated with this shared memory segment.
 * slots[i] contains the slot at offset i * BYTES_PER_SLOT,
 * or null if that slot is not allocated.
 */
445 private final Slot slots[];
446
447 /**
448 * A bitset where each bit represents a slot which is in use.
449 */
450 private final BitSet allocatedSlots;
451
452 /**
453 * Create the ShortCircuitShm.
454 *
455 * @param shmId The ID to use.
456 * @param stream The stream that we're going to use to create this
457 * shared memory segment.
458 *
459 * Although this is a FileInputStream, we are going to
460 * assume that the underlying file descriptor is writable
461 * as well as readable. It would be more appropriate to use
462 * a RandomAccessFile here, but that class does not have
463 * any public accessor which returns a FileDescriptor,
464 * unlike FileInputStream.
465 */
466 public ShortCircuitShm(ShmId shmId, FileInputStream stream)
467 throws IOException {
468 if (!NativeIO.isAvailable()) {
469 throw new UnsupportedOperationException("NativeIO is not available.");
470 }
471 if (Shell.WINDOWS) {
472 throw new UnsupportedOperationException(
473 "DfsClientShm is not yet implemented for Windows.");
474 }
475 if (unsafe == null) {
476 throw new UnsupportedOperationException(
477 "can't use DfsClientShm because we failed to " +
478 "load misc.Unsafe.");
479 }
480 this.shmId = shmId;
481 this.mmappedLength = getUsableLength(stream);
482 this.baseAddress = POSIX.mmap(stream.getFD(),
483 POSIX.MMAP_PROT_READ | POSIX.MMAP_PROT_WRITE, true, mmappedLength);
484 this.slots = new Slot[mmappedLength / BYTES_PER_SLOT];
485 this.allocatedSlots = new BitSet(slots.length);
486 if (LOG.isTraceEnabled()) {
487 LOG.trace("creating " + this.getClass().getSimpleName() +
488 "(shmId=" + shmId +
489 ", mmappedLength=" + mmappedLength +
490 ", baseAddress=" + String.format("%x", baseAddress) +
491 ", slots.length=" + slots.length + ")");
492 }
493 }
494
495 public final ShmId getShmId() {
496 return shmId;
497 }
498
499 /**
500 * Determine if this shared memory object is empty.
501 *
502 * @return True if the shared memory object is empty.
503 */
504 synchronized final public boolean isEmpty() {
505 return allocatedSlots.nextSetBit(0) == -1;
506 }
507
508 /**
509 * Determine if this shared memory object is full.
510 *
511 * @return True if the shared memory object is full.
512 */
513 synchronized final public boolean isFull() {
514 return allocatedSlots.nextClearBit(0) >= slots.length;
515 }
516
517 /**
518 * Calculate the base address of a slot.
519 *
520 * @param slotIdx Index of the slot.
521 * @return The base address of the slot.
522 */
523 private final long calculateSlotAddress(int slotIdx) {
524 long offset = slotIdx;
525 offset *= BYTES_PER_SLOT;
526 return this.baseAddress + offset;
527 }
528
529 /**
530 * Allocate a new slot and register it.
531 *
532 * This function chooses an empty slot, initializes it, and then returns
533 * the relevant Slot object.
534 *
535 * @return The new slot.
536 */
537 synchronized public final Slot allocAndRegisterSlot(
538 ExtendedBlockId blockId) {
539 int idx = allocatedSlots.nextClearBit(0);
540 if (idx >= slots.length) {
541 throw new RuntimeException(this + ": no more slots are available.");
542 }
543 allocatedSlots.set(idx, true);
544 Slot slot = new Slot(calculateSlotAddress(idx), blockId);
545 slot.clear();
546 slot.makeValid();
547 slots[idx] = slot;
548 if (LOG.isTraceEnabled()) {
549 LOG.trace(this + ": allocAndRegisterSlot " + idx + ": allocatedSlots=" + allocatedSlots +
550 StringUtils.getStackTrace(Thread.currentThread()));
551 }
552 return slot;
553 }
554
555 synchronized public final Slot getSlot(int slotIdx)
556 throws InvalidRequestException {
557 if (!allocatedSlots.get(slotIdx)) {
558 throw new InvalidRequestException(this + ": slot " + slotIdx +
559 " does not exist.");
560 }
561 return slots[slotIdx];
562 }
563
564 /**
565 * Register a slot.
566 *
567 * This function looks at a slot which has already been initialized (by
568 * another process), and registers it with us. Then, it returns the
569 * relevant Slot object.
570 *
571 * @return The slot.
572 *
573 * @throws InvalidRequestException
574 * If the slot index we're trying to allocate has not been
575 * initialized, or is already in use.
576 */
577 synchronized public final Slot registerSlot(int slotIdx,
578 ExtendedBlockId blockId) throws InvalidRequestException {
579 if (slotIdx < 0) {
580 throw new InvalidRequestException(this + ": invalid negative slot " +
581 "index " + slotIdx);
582 }
583 if (slotIdx >= slots.length) {
584 throw new InvalidRequestException(this + ": invalid slot " +
585 "index " + slotIdx);
586 }
587 if (allocatedSlots.get(slotIdx)) {
588 throw new InvalidRequestException(this + ": slot " + slotIdx +
589 " is already in use.");
590 }
591 Slot slot = new Slot(calculateSlotAddress(slotIdx), blockId);
592 if (!slot.isValid()) {
593 throw new InvalidRequestException(this + ": slot " + slotIdx +
594 " is not marked as valid.");
595 }
596 slots[slotIdx] = slot;
597 allocatedSlots.set(slotIdx, true);
598 if (LOG.isTraceEnabled()) {
599 LOG.trace(this + ": registerSlot " + slotIdx + ": allocatedSlots=" + allocatedSlots +
600 StringUtils.getStackTrace(Thread.currentThread()));
601 }
602 return slot;
603 }
604
605 /**
606 * Unregisters a slot.
607 *
608 * This doesn't alter the contents of the slot. It just means
609 *
610 * @param slotIdx Index of the slot to unregister.
611 */
612 synchronized public final void unregisterSlot(int slotIdx) {
613 Preconditions.checkState(allocatedSlots.get(slotIdx),
614 "tried to unregister slot " + slotIdx + ", which was not registered.");
615 allocatedSlots.set(slotIdx, false);
616 slots[slotIdx] = null;
617 if (LOG.isTraceEnabled()) {
618 LOG.trace(this + ": unregisterSlot " + slotIdx);
619 }
620 }
621
622 /**
623 * Iterate over all allocated slots.
624 *
625 * Note that this method isn't safe if
626 *
627 * @return The slot iterator.
628 */
629 public SlotIterator slotIterator() {
630 return new SlotIterator();
631 }
632
633 public void free() {
634 try {
635 POSIX.munmap(baseAddress, mmappedLength);
636 } catch (IOException e) {
637 LOG.warn(this + ": failed to munmap", e);
638 }
639 LOG.trace(this + ": freed");
640 }
641
642 @Override
643 public String toString() {
644 return this.getClass().getSimpleName() + "(" + shmId + ")";
645 }
646 }