001 /**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License. You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018 package org.apache.hadoop.hdfs;
019
020 import java.io.FileInputStream;
021 import java.io.IOException;
022 import java.lang.reflect.Field;
023 import java.util.BitSet;
024 import java.util.Iterator;
025 import java.util.NoSuchElementException;
026 import java.util.Random;
027
028 import org.apache.commons.lang.builder.EqualsBuilder;
029 import org.apache.commons.lang.builder.HashCodeBuilder;
030 import org.apache.commons.logging.Log;
031 import org.apache.commons.logging.LogFactory;
032 import org.apache.hadoop.fs.InvalidRequestException;
033 import org.apache.hadoop.io.nativeio.NativeIO;
034 import org.apache.hadoop.io.nativeio.NativeIO.POSIX;
035 import org.apache.hadoop.util.Shell;
036 import org.apache.hadoop.util.StringUtils;
037
038 import com.google.common.base.Preconditions;
039 import com.google.common.collect.ComparisonChain;
040 import com.google.common.primitives.Ints;
041
042 import sun.misc.Unsafe;
043
044 /**
045 * A shared memory segment used to implement short-circuit reads.
046 */
047 public class ShortCircuitShm {
048 private static final Log LOG = LogFactory.getLog(ShortCircuitShm.class);
049
050 protected static final int BYTES_PER_SLOT = 64;
051
052 private static final Unsafe unsafe = safetyDance();
053
054 private static Unsafe safetyDance() {
055 try {
056 Field f = Unsafe.class.getDeclaredField("theUnsafe");
057 f.setAccessible(true);
058 return (Unsafe)f.get(null);
059 } catch (Throwable e) {
060 LOG.error("failed to load misc.Unsafe", e);
061 }
062 return null;
063 }
064
065 /**
066 * Calculate the usable size of a shared memory segment.
067 * We round down to a multiple of the slot size and do some validation.
068 *
069 * @param stream The stream we're using.
070 * @return The usable size of the shared memory segment.
071 */
072 private static int getUsableLength(FileInputStream stream)
073 throws IOException {
074 int intSize = Ints.checkedCast(stream.getChannel().size());
075 int slots = intSize / BYTES_PER_SLOT;
076 if (slots == 0) {
077 throw new IOException("size of shared memory segment was " +
078 intSize + ", but that is not enough to hold even one slot.");
079 }
080 return slots * BYTES_PER_SLOT;
081 }
082
083 /**
084 * Identifies a DfsClientShm.
085 */
086 public static class ShmId implements Comparable<ShmId> {
087 private static final Random random = new Random();
088 private final long hi;
089 private final long lo;
090
091 /**
092 * Generate a random ShmId.
093 *
094 * We generate ShmIds randomly to prevent a malicious client from
095 * successfully guessing one and using that to interfere with another
096 * client.
097 */
098 public static ShmId createRandom() {
099 return new ShmId(random.nextLong(), random.nextLong());
100 }
101
102 public ShmId(long hi, long lo) {
103 this.hi = hi;
104 this.lo = lo;
105 }
106
107 public long getHi() {
108 return hi;
109 }
110
111 public long getLo() {
112 return lo;
113 }
114
115 @Override
116 public boolean equals(Object o) {
117 if ((o == null) || (o.getClass() != this.getClass())) {
118 return false;
119 }
120 ShmId other = (ShmId)o;
121 return new EqualsBuilder().
122 append(hi, other.hi).
123 append(lo, other.lo).
124 isEquals();
125 }
126
127 @Override
128 public int hashCode() {
129 return new HashCodeBuilder().
130 append(this.hi).
131 append(this.lo).
132 toHashCode();
133 }
134
135 @Override
136 public String toString() {
137 return String.format("%016x%016x", hi, lo);
138 }
139
140 @Override
141 public int compareTo(ShmId other) {
142 return ComparisonChain.start().
143 compare(hi, other.hi).
144 compare(lo, other.lo).
145 result();
146 }
147 };
148
149 /**
150 * Uniquely identifies a slot.
151 */
152 public static class SlotId {
153 private final ShmId shmId;
154 private final int slotIdx;
155
156 public SlotId(ShmId shmId, int slotIdx) {
157 this.shmId = shmId;
158 this.slotIdx = slotIdx;
159 }
160
161 public ShmId getShmId() {
162 return shmId;
163 }
164
165 public int getSlotIdx() {
166 return slotIdx;
167 }
168
169 @Override
170 public boolean equals(Object o) {
171 if ((o == null) || (o.getClass() != this.getClass())) {
172 return false;
173 }
174 SlotId other = (SlotId)o;
175 return new EqualsBuilder().
176 append(shmId, other.shmId).
177 append(slotIdx, other.slotIdx).
178 isEquals();
179 }
180
181 @Override
182 public int hashCode() {
183 return new HashCodeBuilder().
184 append(this.shmId).
185 append(this.slotIdx).
186 toHashCode();
187 }
188
189 @Override
190 public String toString() {
191 return String.format("SlotId(%s:%d)", shmId.toString(), slotIdx);
192 }
193 }
194
195 public class SlotIterator implements Iterator<Slot> {
196 int slotIdx = -1;
197
198 @Override
199 public boolean hasNext() {
200 synchronized (ShortCircuitShm.this) {
201 return allocatedSlots.nextSetBit(slotIdx + 1) != -1;
202 }
203 }
204
205 @Override
206 public Slot next() {
207 synchronized (ShortCircuitShm.this) {
208 int nextSlotIdx = allocatedSlots.nextSetBit(slotIdx + 1);
209 if (nextSlotIdx == -1) {
210 throw new NoSuchElementException();
211 }
212 slotIdx = nextSlotIdx;
213 return slots[nextSlotIdx];
214 }
215 }
216
217 @Override
218 public void remove() {
219 throw new UnsupportedOperationException("SlotIterator " +
220 "doesn't support removal");
221 }
222 }
223
224 /**
225 * A slot containing information about a replica.
226 *
227 * The format is:
228 * word 0
229 * bit 0:32 Slot flags (see below).
230 * bit 33:63 Anchor count.
231 * word 1:7
232 * Reserved for future use, such as statistics.
233 * Padding is also useful for avoiding false sharing.
234 *
235 * Little-endian versus big-endian is not relevant here since both the client
236 * and the server reside on the same computer and use the same orientation.
237 */
238 public class Slot {
239 /**
240 * Flag indicating that the slot is valid.
241 *
242 * The DFSClient sets this flag when it allocates a new slot within one of
243 * its shared memory regions.
244 *
245 * The DataNode clears this flag when the replica associated with this slot
246 * is no longer valid. The client itself also clears this flag when it
247 * believes that the DataNode is no longer using this slot to communicate.
248 */
249 private static final long VALID_FLAG = 1L<<63;
250
251 /**
252 * Flag indicating that the slot can be anchored.
253 */
254 private static final long ANCHORABLE_FLAG = 1L<<62;
255
256 /**
257 * The slot address in memory.
258 */
259 private final long slotAddress;
260
261 /**
262 * BlockId of the block this slot is used for.
263 */
264 private final ExtendedBlockId blockId;
265
266 Slot(long slotAddress, ExtendedBlockId blockId) {
267 this.slotAddress = slotAddress;
268 this.blockId = blockId;
269 }
270
271 /**
272 * Get the short-circuit memory segment associated with this Slot.
273 *
274 * @return The enclosing short-circuit memory segment.
275 */
276 public ShortCircuitShm getShm() {
277 return ShortCircuitShm.this;
278 }
279
280 /**
281 * Get the ExtendedBlockId associated with this slot.
282 *
283 * @return The ExtendedBlockId of this slot.
284 */
285 public ExtendedBlockId getBlockId() {
286 return blockId;
287 }
288
289 /**
290 * Get the SlotId of this slot, containing both shmId and slotIdx.
291 *
292 * @return The SlotId of this slot.
293 */
294 public SlotId getSlotId() {
295 return new SlotId(getShmId(), getSlotIdx());
296 }
297
298 /**
299 * Get the Slot index.
300 *
301 * @return The index of this slot.
302 */
303 public int getSlotIdx() {
304 return Ints.checkedCast(
305 (slotAddress - baseAddress) / BYTES_PER_SLOT);
306 }
307
308 private boolean isSet(long flag) {
309 long prev = unsafe.getLongVolatile(null, this.slotAddress);
310 return (prev & flag) != 0;
311 }
312
313 private void setFlag(long flag) {
314 long prev;
315 do {
316 prev = unsafe.getLongVolatile(null, this.slotAddress);
317 if ((prev & flag) != 0) {
318 return;
319 }
320 } while (!unsafe.compareAndSwapLong(null, this.slotAddress,
321 prev, prev | flag));
322 }
323
324 private void clearFlag(long flag) {
325 long prev;
326 do {
327 prev = unsafe.getLongVolatile(null, this.slotAddress);
328 if ((prev & flag) == 0) {
329 return;
330 }
331 } while (!unsafe.compareAndSwapLong(null, this.slotAddress,
332 prev, prev & (~flag)));
333 }
334
335 public boolean isValid() {
336 return isSet(VALID_FLAG);
337 }
338
339 public void makeValid() {
340 setFlag(VALID_FLAG);
341 }
342
343 public void makeInvalid() {
344 clearFlag(VALID_FLAG);
345 }
346
347 public boolean isAnchorable() {
348 return isSet(ANCHORABLE_FLAG);
349 }
350
351 public void makeAnchorable() {
352 setFlag(ANCHORABLE_FLAG);
353 }
354
355 public void makeUnanchorable() {
356 clearFlag(ANCHORABLE_FLAG);
357 }
358
359 public boolean isAnchored() {
360 long prev = unsafe.getLongVolatile(null, this.slotAddress);
361 if ((prev & VALID_FLAG) == 0) {
362 // Slot is no longer valid.
363 return false;
364 }
365 return ((prev & 0x7fffffff) != 0);
366 }
367
368 /**
369 * Try to add an anchor for a given slot.
370 *
371 * When a slot is anchored, we know that the block it refers to is resident
372 * in memory.
373 *
374 * @return True if the slot is anchored.
375 */
376 public boolean addAnchor() {
377 long prev;
378 do {
379 prev = unsafe.getLongVolatile(null, this.slotAddress);
380 if ((prev & VALID_FLAG) == 0) {
381 // Slot is no longer valid.
382 return false;
383 }
384 if ((prev & ANCHORABLE_FLAG) == 0) {
385 // Slot can't be anchored right now.
386 return false;
387 }
388 if ((prev & 0x7fffffff) == 0x7fffffff) {
389 // Too many other threads have anchored the slot (2 billion?)
390 return false;
391 }
392 } while (!unsafe.compareAndSwapLong(null, this.slotAddress,
393 prev, prev + 1));
394 return true;
395 }
396
397 /**
398 * Remove an anchor for a given slot.
399 */
400 public void removeAnchor() {
401 long prev;
402 do {
403 prev = unsafe.getLongVolatile(null, this.slotAddress);
404 Preconditions.checkState((prev & 0x7fffffff) != 0,
405 "Tried to remove anchor for slot " + slotAddress +", which was " +
406 "not anchored.");
407 } while (!unsafe.compareAndSwapLong(null, this.slotAddress,
408 prev, prev - 1));
409 }
410
411 @Override
412 public String toString() {
413 return "Slot(slotIdx=" + getSlotIdx() + ", shm=" + getShm() + ")";
414 }
415 }
416
417 /**
418 * ID for this SharedMemorySegment.
419 */
420 private final ShmId shmId;
421
422 /**
423 * The base address of the memory-mapped file.
424 */
425 private final long baseAddress;
426
427 /**
428 * The mmapped length of the shared memory segment
429 */
430 private final int mmappedLength;
431
432 /**
433 * The slots associated with this shared memory segment.
434 * slot[i] contains the slot at offset i * BYTES_PER_SLOT,
435 * or null if that slot is not allocated.
436 */
437 private final Slot slots[];
438
439 /**
440 * A bitset where each bit represents a slot which is in use.
441 */
442 private final BitSet allocatedSlots;
443
444 /**
445 * Create the ShortCircuitShm.
446 *
447 * @param shmId The ID to use.
448 * @param stream The stream that we're going to use to create this
449 * shared memory segment.
450 *
451 * Although this is a FileInputStream, we are going to
452 * assume that the underlying file descriptor is writable
453 * as well as readable. It would be more appropriate to use
454 * a RandomAccessFile here, but that class does not have
455 * any public accessor which returns a FileDescriptor,
456 * unlike FileInputStream.
457 */
458 public ShortCircuitShm(ShmId shmId, FileInputStream stream)
459 throws IOException {
460 if (!NativeIO.isAvailable()) {
461 throw new UnsupportedOperationException("NativeIO is not available.");
462 }
463 if (Shell.WINDOWS) {
464 throw new UnsupportedOperationException(
465 "DfsClientShm is not yet implemented for Windows.");
466 }
467 if (unsafe == null) {
468 throw new UnsupportedOperationException(
469 "can't use DfsClientShm because we failed to " +
470 "load misc.Unsafe.");
471 }
472 this.shmId = shmId;
473 this.mmappedLength = getUsableLength(stream);
474 this.baseAddress = POSIX.mmap(stream.getFD(),
475 POSIX.MMAP_PROT_READ | POSIX.MMAP_PROT_WRITE, true, mmappedLength);
476 this.slots = new Slot[mmappedLength / BYTES_PER_SLOT];
477 this.allocatedSlots = new BitSet(slots.length);
478 if (LOG.isTraceEnabled()) {
479 LOG.trace("creating " + this.getClass().getSimpleName() +
480 "(shmId=" + shmId +
481 ", mmappedLength=" + mmappedLength +
482 ", baseAddress=" + String.format("%x", baseAddress) +
483 ", slots.length=" + slots.length + ")");
484 }
485 }
486
487 public final ShmId getShmId() {
488 return shmId;
489 }
490
491 /**
492 * Determine if this shared memory object is empty.
493 *
494 * @return True if the shared memory object is empty.
495 */
496 synchronized final public boolean isEmpty() {
497 return allocatedSlots.nextSetBit(0) == -1;
498 }
499
500 /**
501 * Determine if this shared memory object is full.
502 *
503 * @return True if the shared memory object is full.
504 */
505 synchronized final public boolean isFull() {
506 return allocatedSlots.nextClearBit(0) >= slots.length;
507 }
508
509 /**
510 * Calculate the base address of a slot.
511 *
512 * @param slotIdx Index of the slot.
513 * @return The base address of the slot.
514 */
515 private final long calculateSlotAddress(int slotIdx) {
516 long offset = slotIdx;
517 offset *= BYTES_PER_SLOT;
518 return this.baseAddress + offset;
519 }
520
521 /**
522 * Allocate a new slot and register it.
523 *
524 * This function chooses an empty slot, initializes it, and then returns
525 * the relevant Slot object.
526 *
527 * @return The new slot.
528 */
529 synchronized public final Slot allocAndRegisterSlot(
530 ExtendedBlockId blockId) {
531 int idx = allocatedSlots.nextClearBit(0);
532 if (idx >= slots.length) {
533 throw new RuntimeException(this + ": no more slots are available.");
534 }
535 allocatedSlots.set(idx, true);
536 Slot slot = new Slot(calculateSlotAddress(idx), blockId);
537 slot.makeValid();
538 slots[idx] = slot;
539 if (LOG.isTraceEnabled()) {
540 LOG.trace(this + ": allocAndRegisterSlot " + idx + ": allocatedSlots=" + allocatedSlots +
541 StringUtils.getStackTrace(Thread.currentThread()));
542 }
543 return slot;
544 }
545
546 synchronized public final Slot getSlot(int slotIdx)
547 throws InvalidRequestException {
548 if (!allocatedSlots.get(slotIdx)) {
549 throw new InvalidRequestException(this + ": slot " + slotIdx +
550 " does not exist.");
551 }
552 return slots[slotIdx];
553 }
554
555 /**
556 * Register a slot.
557 *
558 * This function looks at a slot which has already been initialized (by
559 * another process), and registers it with us. Then, it returns the
560 * relevant Slot object.
561 *
562 * @return The slot.
563 *
564 * @throws InvalidRequestException
565 * If the slot index we're trying to allocate has not been
566 * initialized, or is already in use.
567 */
568 synchronized public final Slot registerSlot(int slotIdx,
569 ExtendedBlockId blockId) throws InvalidRequestException {
570 if (slotIdx < 0) {
571 throw new InvalidRequestException(this + ": invalid negative slot " +
572 "index " + slotIdx);
573 }
574 if (slotIdx >= slots.length) {
575 throw new InvalidRequestException(this + ": invalid slot " +
576 "index " + slotIdx);
577 }
578 if (allocatedSlots.get(slotIdx)) {
579 throw new InvalidRequestException(this + ": slot " + slotIdx +
580 " is already in use.");
581 }
582 Slot slot = new Slot(calculateSlotAddress(slotIdx), blockId);
583 if (!slot.isValid()) {
584 throw new InvalidRequestException(this + ": slot " + slotIdx +
585 " has not been allocated.");
586 }
587 slots[slotIdx] = slot;
588 allocatedSlots.set(slotIdx, true);
589 if (LOG.isTraceEnabled()) {
590 LOG.trace(this + ": registerSlot " + slotIdx + ": allocatedSlots=" + allocatedSlots +
591 StringUtils.getStackTrace(Thread.currentThread()));
592 }
593 return slot;
594 }
595
596 /**
597 * Unregisters a slot.
598 *
599 * This doesn't alter the contents of the slot. It just means
600 *
601 * @param slotIdx Index of the slot to unregister.
602 */
603 synchronized public final void unregisterSlot(int slotIdx) {
604 Preconditions.checkState(allocatedSlots.get(slotIdx),
605 "tried to unregister slot " + slotIdx + ", which was not registered.");
606 allocatedSlots.set(slotIdx, false);
607 slots[slotIdx] = null;
608 if (LOG.isTraceEnabled()) {
609 LOG.trace(this + ": unregisterSlot " + slotIdx);
610 }
611 }
612
613 /**
614 * Iterate over all allocated slots.
615 *
616 * Note that this method isn't safe if
617 *
618 * @return The slot iterator.
619 */
620 public SlotIterator slotIterator() {
621 return new SlotIterator();
622 }
623
624 public void free() {
625 try {
626 POSIX.munmap(baseAddress, mmappedLength);
627 } catch (IOException e) {
628 LOG.warn(this + ": failed to munmap", e);
629 }
630 LOG.trace(this + ": freed");
631 }
632
633 @Override
634 public String toString() {
635 return this.getClass().getSimpleName() + "(" + shmId + ")";
636 }
637 }