001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hdfs.shortcircuit;
019
020import java.io.FileInputStream;
021import java.io.IOException;
022import java.lang.reflect.Field;
023import java.util.BitSet;
024import java.util.Iterator;
025import java.util.NoSuchElementException;
026import java.util.Random;
027
028import org.apache.commons.lang.builder.EqualsBuilder;
029import org.apache.commons.lang.builder.HashCodeBuilder;
030import org.apache.hadoop.fs.InvalidRequestException;
031import org.apache.hadoop.hdfs.ExtendedBlockId;
032import org.apache.hadoop.io.nativeio.NativeIO;
033import org.apache.hadoop.io.nativeio.NativeIO.POSIX;
034import org.apache.hadoop.util.Shell;
035import org.apache.hadoop.util.StringUtils;
036
037import org.slf4j.Logger;
038import org.slf4j.LoggerFactory;
039
040import sun.misc.Unsafe;
041
042import com.google.common.base.Preconditions;
043import com.google.common.collect.ComparisonChain;
044import com.google.common.primitives.Ints;
045
046import javax.annotation.Nonnull;
047
048/**
049 * A shared memory segment used to implement short-circuit reads.
050 */
051public class ShortCircuitShm {
052  private static final Logger LOG = LoggerFactory.getLogger(
053      ShortCircuitShm.class);
054
055  protected static final int BYTES_PER_SLOT = 64;
056
057  private static final Unsafe unsafe = safetyDance();
058
059  private static Unsafe safetyDance() {
060    try {
061      Field f = Unsafe.class.getDeclaredField("theUnsafe");
062      f.setAccessible(true);
063      return (Unsafe)f.get(null);
064    } catch (Throwable e) {
065      LOG.error("failed to load misc.Unsafe", e);
066    }
067    return null;
068  }
069
070  /**
071   * Calculate the usable size of a shared memory segment.
072   * We round down to a multiple of the slot size and do some validation.
073   *
074   * @param stream The stream we're using.
075   * @return       The usable size of the shared memory segment.
076   */
077  private static int getUsableLength(FileInputStream stream)
078      throws IOException {
079    int intSize = Ints.checkedCast(stream.getChannel().size());
080    int slots = intSize / BYTES_PER_SLOT;
081    if (slots == 0) {
082      throw new IOException("size of shared memory segment was " +
083          intSize + ", but that is not enough to hold even one slot.");
084    }
085    return slots * BYTES_PER_SLOT;
086  }
087
088  /**
089   * Identifies a DfsClientShm.
090   */
091  public static class ShmId implements Comparable<ShmId> {
092    private static final Random random = new Random();
093    private final long hi;
094    private final long lo;
095
096    /**
097     * Generate a random ShmId.
098     *
099     * We generate ShmIds randomly to prevent a malicious client from
100     * successfully guessing one and using that to interfere with another
101     * client.
102     */
103    public static ShmId createRandom() {
104      return new ShmId(random.nextLong(), random.nextLong());
105    }
106
107    public ShmId(long hi, long lo) {
108      this.hi = hi;
109      this.lo = lo;
110    }
111
112    public long getHi() {
113      return hi;
114    }
115
116    public long getLo() {
117      return lo;
118    }
119
120    @Override
121    public boolean equals(Object o) {
122      if ((o == null) || (o.getClass() != this.getClass())) {
123        return false;
124      }
125      ShmId other = (ShmId)o;
126      return new EqualsBuilder().
127          append(hi, other.hi).
128          append(lo, other.lo).
129          isEquals();
130    }
131
132    @Override
133    public int hashCode() {
134      return new HashCodeBuilder().
135          append(this.hi).
136          append(this.lo).
137          toHashCode();
138    }
139
140    @Override
141    public String toString() {
142      return String.format("%016x%016x", hi, lo);
143    }
144
145    @Override
146    public int compareTo(@Nonnull ShmId other) {
147      return ComparisonChain.start().
148          compare(hi, other.hi).
149          compare(lo, other.lo).
150          result();
151    }
152  }
153
154  /**
155   * Uniquely identifies a slot.
156   */
157  public static class SlotId {
158    private final ShmId shmId;
159    private final int slotIdx;
160
161    public SlotId(ShmId shmId, int slotIdx) {
162      this.shmId = shmId;
163      this.slotIdx = slotIdx;
164    }
165
166    public ShmId getShmId() {
167      return shmId;
168    }
169
170    public int getSlotIdx() {
171      return slotIdx;
172    }
173
174    @Override
175    public boolean equals(Object o) {
176      if ((o == null) || (o.getClass() != this.getClass())) {
177        return false;
178      }
179      SlotId other = (SlotId)o;
180      return new EqualsBuilder().
181          append(shmId, other.shmId).
182          append(slotIdx, other.slotIdx).
183          isEquals();
184    }
185
186    @Override
187    public int hashCode() {
188      return new HashCodeBuilder().
189          append(this.shmId).
190          append(this.slotIdx).
191          toHashCode();
192    }
193
194    @Override
195    public String toString() {
196      return String.format("SlotId(%s:%d)", shmId.toString(), slotIdx);
197    }
198  }
199
200  public class SlotIterator implements Iterator<Slot> {
201    int slotIdx = -1;
202
203    @Override
204    public boolean hasNext() {
205      synchronized (ShortCircuitShm.this) {
206        return allocatedSlots.nextSetBit(slotIdx + 1) != -1;
207      }
208    }
209
210    @Override
211    public Slot next() {
212      synchronized (ShortCircuitShm.this) {
213        int nextSlotIdx = allocatedSlots.nextSetBit(slotIdx + 1);
214        if (nextSlotIdx == -1) {
215          throw new NoSuchElementException();
216        }
217        slotIdx = nextSlotIdx;
218        return slots[nextSlotIdx];
219      }
220    }
221
222    @Override
223    public void remove() {
224      throw new UnsupportedOperationException("SlotIterator " +
225          "doesn't support removal");
226    }
227  }
228
229  /**
230   * A slot containing information about a replica.
231   *
232   * The format is:
233   * word 0
234   *   bit 0:32   Slot flags (see below).
235   *   bit 33:63  Anchor count.
236   * word 1:7
237   *   Reserved for future use, such as statistics.
238   *   Padding is also useful for avoiding false sharing.
239   *
240   * Little-endian versus big-endian is not relevant here since both the client
241   * and the server reside on the same computer and use the same orientation.
242   */
243  public class Slot {
244    /**
245     * Flag indicating that the slot is valid.
246     *
247     * The DFSClient sets this flag when it allocates a new slot within one of
248     * its shared memory regions.
249     *
250     * The DataNode clears this flag when the replica associated with this slot
251     * is no longer valid.  The client itself also clears this flag when it
252     * believes that the DataNode is no longer using this slot to communicate.
253     */
254    private static final long VALID_FLAG =          1L<<63;
255
256    /**
257     * Flag indicating that the slot can be anchored.
258     */
259    private static final long ANCHORABLE_FLAG =     1L<<62;
260
261    /**
262     * The slot address in memory.
263     */
264    private final long slotAddress;
265
266    /**
267     * BlockId of the block this slot is used for.
268     */
269    private final ExtendedBlockId blockId;
270
271    Slot(long slotAddress, ExtendedBlockId blockId) {
272      this.slotAddress = slotAddress;
273      this.blockId = blockId;
274    }
275
276    /**
277     * Get the short-circuit memory segment associated with this Slot.
278     *
279     * @return      The enclosing short-circuit memory segment.
280     */
281    public ShortCircuitShm getShm() {
282      return ShortCircuitShm.this;
283    }
284
285    /**
286     * Get the ExtendedBlockId associated with this slot.
287     *
288     * @return      The ExtendedBlockId of this slot.
289     */
290    public ExtendedBlockId getBlockId() {
291      return blockId;
292    }
293
294    /**
295     * Get the SlotId of this slot, containing both shmId and slotIdx.
296     *
297     * @return      The SlotId of this slot.
298     */
299    public SlotId getSlotId() {
300      return new SlotId(getShmId(), getSlotIdx());
301    }
302
303    /**
304     * Get the Slot index.
305     *
306     * @return      The index of this slot.
307     */
308    public int getSlotIdx() {
309      return Ints.checkedCast(
310          (slotAddress - baseAddress) / BYTES_PER_SLOT);
311    }
312
313    /**
314     * Clear the slot.
315     */
316    void clear() {
317      unsafe.putLongVolatile(null, this.slotAddress, 0);
318    }
319
320    private boolean isSet(long flag) {
321      long prev = unsafe.getLongVolatile(null, this.slotAddress);
322      return (prev & flag) != 0;
323    }
324
325    private void setFlag(long flag) {
326      long prev;
327      do {
328        prev = unsafe.getLongVolatile(null, this.slotAddress);
329        if ((prev & flag) != 0) {
330          return;
331        }
332      } while (!unsafe.compareAndSwapLong(null, this.slotAddress,
333                  prev, prev | flag));
334    }
335
336    private void clearFlag(long flag) {
337      long prev;
338      do {
339        prev = unsafe.getLongVolatile(null, this.slotAddress);
340        if ((prev & flag) == 0) {
341          return;
342        }
343      } while (!unsafe.compareAndSwapLong(null, this.slotAddress,
344                  prev, prev & (~flag)));
345    }
346
347    public boolean isValid() {
348      return isSet(VALID_FLAG);
349    }
350
351    public void makeValid() {
352      setFlag(VALID_FLAG);
353    }
354
355    public void makeInvalid() {
356      clearFlag(VALID_FLAG);
357    }
358
359    public boolean isAnchorable() {
360      return isSet(ANCHORABLE_FLAG);
361    }
362
363    public void makeAnchorable() {
364      setFlag(ANCHORABLE_FLAG);
365    }
366
367    public void makeUnanchorable() {
368      clearFlag(ANCHORABLE_FLAG);
369    }
370
371    public boolean isAnchored() {
372      long prev = unsafe.getLongVolatile(null, this.slotAddress);
373      // Slot is no longer valid.
374      return (prev & VALID_FLAG) != 0 && ((prev & 0x7fffffff) != 0);
375    }
376
377    /**
378     * Try to add an anchor for a given slot.
379     *
380     * When a slot is anchored, we know that the block it refers to is resident
381     * in memory.
382     *
383     * @return          True if the slot is anchored.
384     */
385    public boolean addAnchor() {
386      long prev;
387      do {
388        prev = unsafe.getLongVolatile(null, this.slotAddress);
389        if ((prev & VALID_FLAG) == 0) {
390          // Slot is no longer valid.
391          return false;
392        }
393        if ((prev & ANCHORABLE_FLAG) == 0) {
394          // Slot can't be anchored right now.
395          return false;
396        }
397        if ((prev & 0x7fffffff) == 0x7fffffff) {
398          // Too many other threads have anchored the slot (2 billion?)
399          return false;
400        }
401      } while (!unsafe.compareAndSwapLong(null, this.slotAddress,
402                  prev, prev + 1));
403      return true;
404    }
405
406    /**
407     * Remove an anchor for a given slot.
408     */
409    public void removeAnchor() {
410      long prev;
411      do {
412        prev = unsafe.getLongVolatile(null, this.slotAddress);
413        Preconditions.checkState((prev & 0x7fffffff) != 0,
414            "Tried to remove anchor for slot " + slotAddress +", which was " +
415            "not anchored.");
416      } while (!unsafe.compareAndSwapLong(null, this.slotAddress,
417                  prev, prev - 1));
418    }
419
420    @Override
421    public String toString() {
422      return "Slot(slotIdx=" + getSlotIdx() + ", shm=" + getShm() + ")";
423    }
424  }
425
426  /**
427   * ID for this SharedMemorySegment.
428   */
429  private final ShmId shmId;
430
431  /**
432   * The base address of the memory-mapped file.
433   */
434  private final long baseAddress;
435
436  /**
437   * The mmapped length of the shared memory segment
438   */
439  private final int mmappedLength;
440
  /**
   * The slots associated with this shared memory segment.
   * slots[i] contains the slot at offset i * BYTES_PER_SLOT,
   * or null if that slot is not allocated.
   */
446  private final Slot slots[];
447
448  /**
449   * A bitset where each bit represents a slot which is in use.
450   */
451  private final BitSet allocatedSlots;
452
453  /**
454   * Create the ShortCircuitShm.
455   *
456   * @param shmId       The ID to use.
457   * @param stream      The stream that we're going to use to create this
458   *                    shared memory segment.
459   *
460   *                    Although this is a FileInputStream, we are going to
461   *                    assume that the underlying file descriptor is writable
462   *                    as well as readable. It would be more appropriate to use
463   *                    a RandomAccessFile here, but that class does not have
464   *                    any public accessor which returns a FileDescriptor,
465   *                    unlike FileInputStream.
466   */
467  public ShortCircuitShm(ShmId shmId, FileInputStream stream)
468        throws IOException {
469    if (!NativeIO.isAvailable()) {
470      throw new UnsupportedOperationException("NativeIO is not available.");
471    }
472    if (Shell.WINDOWS) {
473      throw new UnsupportedOperationException(
474          "DfsClientShm is not yet implemented for Windows.");
475    }
476    if (unsafe == null) {
477      throw new UnsupportedOperationException(
478          "can't use DfsClientShm because we failed to " +
479          "load misc.Unsafe.");
480    }
481    this.shmId = shmId;
482    this.mmappedLength = getUsableLength(stream);
483    this.baseAddress = POSIX.mmap(stream.getFD(),
484        POSIX.MMAP_PROT_READ | POSIX.MMAP_PROT_WRITE, true, mmappedLength);
485    this.slots = new Slot[mmappedLength / BYTES_PER_SLOT];
486    this.allocatedSlots = new BitSet(slots.length);
487    LOG.trace("creating {}(shmId={}, mmappedLength={}, baseAddress={}, "
488        + "slots.length={})", this.getClass().getSimpleName(), shmId,
489        mmappedLength, String.format("%x", baseAddress), slots.length);
490  }
491
492  public final ShmId getShmId() {
493    return shmId;
494  }
495
496  /**
497   * Determine if this shared memory object is empty.
498   *
499   * @return    True if the shared memory object is empty.
500   */
501  synchronized final public boolean isEmpty() {
502    return allocatedSlots.nextSetBit(0) == -1;
503  }
504
505  /**
506   * Determine if this shared memory object is full.
507   *
508   * @return    True if the shared memory object is full.
509   */
510  synchronized final public boolean isFull() {
511    return allocatedSlots.nextClearBit(0) >= slots.length;
512  }
513
514  /**
515   * Calculate the base address of a slot.
516   *
517   * @param slotIdx   Index of the slot.
518   * @return          The base address of the slot.
519   */
520  private long calculateSlotAddress(int slotIdx) {
521    long offset = slotIdx;
522    offset *= BYTES_PER_SLOT;
523    return this.baseAddress + offset;
524  }
525
526  /**
527   * Allocate a new slot and register it.
528   *
529   * This function chooses an empty slot, initializes it, and then returns
530   * the relevant Slot object.
531   *
532   * @return    The new slot.
533   */
534  synchronized public final Slot allocAndRegisterSlot(
535      ExtendedBlockId blockId) {
536    int idx = allocatedSlots.nextClearBit(0);
537    if (idx >= slots.length) {
538      throw new RuntimeException(this + ": no more slots are available.");
539    }
540    allocatedSlots.set(idx, true);
541    Slot slot = new Slot(calculateSlotAddress(idx), blockId);
542    slot.clear();
543    slot.makeValid();
544    slots[idx] = slot;
545    if (LOG.isTraceEnabled()) {
546      LOG.trace(this + ": allocAndRegisterSlot " + idx + ": allocatedSlots=" + allocatedSlots +
547                  StringUtils.getStackTrace(Thread.currentThread()));
548    }
549    return slot;
550  }
551
552  synchronized public final Slot getSlot(int slotIdx)
553      throws InvalidRequestException {
554    if (!allocatedSlots.get(slotIdx)) {
555      throw new InvalidRequestException(this + ": slot " + slotIdx +
556          " does not exist.");
557    }
558    return slots[slotIdx];
559  }
560
561  /**
562   * Register a slot.
563   *
564   * This function looks at a slot which has already been initialized (by
565   * another process), and registers it with us.  Then, it returns the
566   * relevant Slot object.
567   *
568   * @return    The slot.
569   *
570   * @throws InvalidRequestException
571   *            If the slot index we're trying to allocate has not been
572   *            initialized, or is already in use.
573   */
574  synchronized public final Slot registerSlot(int slotIdx,
575      ExtendedBlockId blockId) throws InvalidRequestException {
576    if (slotIdx < 0) {
577      throw new InvalidRequestException(this + ": invalid negative slot " +
578          "index " + slotIdx);
579    }
580    if (slotIdx >= slots.length) {
581      throw new InvalidRequestException(this + ": invalid slot " +
582          "index " + slotIdx);
583    }
584    if (allocatedSlots.get(slotIdx)) {
585      throw new InvalidRequestException(this + ": slot " + slotIdx +
586          " is already in use.");
587    }
588    Slot slot = new Slot(calculateSlotAddress(slotIdx), blockId);
589    if (!slot.isValid()) {
590      throw new InvalidRequestException(this + ": slot " + slotIdx +
591          " is not marked as valid.");
592    }
593    slots[slotIdx] = slot;
594    allocatedSlots.set(slotIdx, true);
595    if (LOG.isTraceEnabled()) {
596      LOG.trace(this + ": registerSlot " + slotIdx + ": allocatedSlots=" + allocatedSlots +
597                  StringUtils.getStackTrace(Thread.currentThread()));
598    }
599    return slot;
600  }
601
602  /**
603   * Unregisters a slot.
604   *
605   * This doesn't alter the contents of the slot.  It just means
606   *
607   * @param slotIdx  Index of the slot to unregister.
608   */
609  synchronized public final void unregisterSlot(int slotIdx) {
610    Preconditions.checkState(allocatedSlots.get(slotIdx),
611        "tried to unregister slot " + slotIdx + ", which was not registered.");
612    allocatedSlots.set(slotIdx, false);
613    slots[slotIdx] = null;
614    LOG.trace("{}: unregisterSlot {}", this, slotIdx);
615  }
616
617  /**
618   * Iterate over all allocated slots.
619   *
620   * Note that this method isn't safe if
621   *
622   * @return        The slot iterator.
623   */
624  public SlotIterator slotIterator() {
625    return new SlotIterator();
626  }
627
628  public void free() {
629    try {
630      POSIX.munmap(baseAddress, mmappedLength);
631    } catch (IOException e) {
632      LOG.warn(this + ": failed to munmap", e);
633    }
634    LOG.trace(this + ": freed");
635  }
636
637  @Override
638  public String toString() {
639    return this.getClass().getSimpleName() + "(" + shmId + ")";
640  }
641}