001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hdfs.protocol;
019
020import static org.apache.hadoop.fs.CommonConfigurationKeys.IPC_MAXIMUM_DATA_LENGTH_DEFAULT;
021
022import java.io.IOException;
023import java.io.InputStream;
024import java.io.OutputStream;
025import java.util.ArrayList;
026import java.util.Collection;
027import java.util.Collections;
028import java.util.Iterator;
029import java.util.List;
030
031import org.apache.hadoop.classification.InterfaceAudience;
032import org.apache.hadoop.classification.InterfaceStability;
033import org.apache.hadoop.hdfs.protocol.BlockListAsLongs.BlockReportReplica;
034import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState;
035import org.apache.hadoop.hdfs.server.datanode.Replica;
036import com.google.common.annotations.VisibleForTesting;
037import com.google.common.base.Preconditions;
038import com.google.protobuf.ByteString;
039import com.google.protobuf.CodedInputStream;
040import com.google.protobuf.CodedOutputStream;
041import com.google.protobuf.WireFormat;
042
043@InterfaceAudience.Private
044@InterfaceStability.Evolving
045public abstract class BlockListAsLongs implements Iterable<BlockReportReplica> {
046  private final static int CHUNK_SIZE = 64*1024; // 64K
047  private static long[] EMPTY_LONGS = new long[]{0, 0};
048
049  public static BlockListAsLongs EMPTY = new BlockListAsLongs() {
050    @Override
051    public int getNumberOfBlocks() {
052      return 0;
053    }
054    @Override
055    public ByteString getBlocksBuffer() {
056      return ByteString.EMPTY;
057    }
058    @Override
059    public long[] getBlockListAsLongs() {
060      return EMPTY_LONGS;
061    }
062    @Override
063    public Iterator<BlockReportReplica> iterator() {
064      return Collections.emptyIterator();
065    }
066  };
067
068  /**
069   * Prepare an instance to in-place decode the given ByteString buffer.
070   * @param numBlocks - blocks in the buffer
071   * @param blocksBuf - ByteString encoded varints
072   * @param maxDataLength - maximum allowable data size in protobuf message
073   * @return BlockListAsLongs
074   */
075  public static BlockListAsLongs decodeBuffer(final int numBlocks,
076      final ByteString blocksBuf, final int maxDataLength) {
077    return new BufferDecoder(numBlocks, blocksBuf, maxDataLength);
078  }
079
080  /**
081   * Prepare an instance to in-place decode the given ByteString buffers.
082   * @param numBlocks - blocks in the buffers
083   * @param blocksBufs - list of ByteString encoded varints
084   * @return BlockListAsLongs
085   */
086  @VisibleForTesting
087  public static BlockListAsLongs decodeBuffers(final int numBlocks,
088      final List<ByteString> blocksBufs) {
089    return decodeBuffers(numBlocks, blocksBufs,
090        IPC_MAXIMUM_DATA_LENGTH_DEFAULT);
091  }
092
093  /**
094   * Prepare an instance to in-place decode the given ByteString buffers.
095   * @param numBlocks - blocks in the buffers
096   * @param blocksBufs - list of ByteString encoded varints
097   * @param maxDataLength - maximum allowable data size in protobuf message
098   * @return BlockListAsLongs
099   */
100  public static BlockListAsLongs decodeBuffers(final int numBlocks,
101      final List<ByteString> blocksBufs, final int maxDataLength) {
102    // this doesn't actually copy the data
103    return decodeBuffer(numBlocks, ByteString.copyFrom(blocksBufs),
104        maxDataLength);
105  }
106
107  /**
108   * Prepare an instance to in-place decode the given list of Longs.  Note
109   * it's much more efficient to decode ByteString buffers and only exists
110   * for compatibility.
111   * @param blocksList - list of longs
112   * @return BlockListAsLongs
113   */
114  public static BlockListAsLongs decodeLongs(List<Long> blocksList) {
115    return decodeLongs(blocksList, IPC_MAXIMUM_DATA_LENGTH_DEFAULT);
116  }
117
118  /**
119   * Prepare an instance to in-place decode the given list of Longs.  Note
120   * it's much more efficient to decode ByteString buffers and only exists
121   * for compatibility.
122   * @param blocksList - list of longs
123   * @param maxDataLength - maximum allowable data size in protobuf message
124   * @return BlockListAsLongs
125   */
126  public static BlockListAsLongs decodeLongs(List<Long> blocksList,
127      int maxDataLength) {
128    return blocksList.isEmpty() ? EMPTY :
129        new LongsDecoder(blocksList, maxDataLength);
130  }
131
132  /**
133   * Prepare an instance to encode the collection of replicas into an
134   * efficient ByteString.
135   * @param replicas - replicas to encode
136   * @return BlockListAsLongs
137   */
138  @VisibleForTesting
139  public static BlockListAsLongs encode(
140      final Collection<? extends Replica> replicas) {
141    BlockListAsLongs.Builder builder = builder(IPC_MAXIMUM_DATA_LENGTH_DEFAULT);
142    for (Replica replica : replicas) {
143      builder.add(replica);
144    }
145    return builder.build();
146  }
147
148  public static BlockListAsLongs readFrom(InputStream is, int maxDataLength)
149      throws IOException {
150    CodedInputStream cis = CodedInputStream.newInstance(is);
151    if (maxDataLength != IPC_MAXIMUM_DATA_LENGTH_DEFAULT) {
152      cis.setSizeLimit(maxDataLength);
153    }
154    int numBlocks = -1;
155    ByteString blocksBuf = null;
156    while (!cis.isAtEnd()) {
157      int tag = cis.readTag();
158      int field = WireFormat.getTagFieldNumber(tag);
159      switch(field) {
160        case 0:
161          break;
162        case 1:
163          numBlocks = (int)cis.readInt32();
164          break;
165        case 2:
166          blocksBuf = cis.readBytes();
167          break;
168        default:
169          cis.skipField(tag);
170          break;
171      }
172    }
173    if (numBlocks != -1 && blocksBuf != null) {
174      return decodeBuffer(numBlocks, blocksBuf, maxDataLength);
175    }
176    return null;
177  }
178
179  public void writeTo(OutputStream os) throws IOException {
180    CodedOutputStream cos = CodedOutputStream.newInstance(os);
181    cos.writeInt32(1, getNumberOfBlocks());
182    cos.writeBytes(2, getBlocksBuffer());
183    cos.flush();
184  }
185
186  @VisibleForTesting
187  public static Builder builder() {
188    return builder(IPC_MAXIMUM_DATA_LENGTH_DEFAULT);
189  }
190
191  public static Builder builder(int maxDataLength) {
192    return new BlockListAsLongs.Builder(maxDataLength);
193  }
194
  /**
   * Returns the number of blocks (replicas) contained in this block report.
   * @return the number of blocks
   */
  abstract public int getNumberOfBlocks();
200
  /**
   * Very efficient encoding of the block report into a ByteString to avoid
   * the overhead of protobuf repeating fields.  Primitive repeating fields
   * require re-allocs of an {@code ArrayList<Long>} and the associated
   * (un)boxing overhead which puts pressure on GC.
   *
   * The structure of the buffer is as follows:
   * - each replica is represented by 4 longs, each written as a varint:
   *   blockId (zig-zag encoded), block length, genstamp, replica state
   *
   * @return ByteString encoded block report
   */
  abstract public ByteString getBlocksBuffer();
214
215  /**
216   * List of ByteStrings that encode this block report
217   *
218   * @return ByteStrings
219   */
220  public List<ByteString> getBlocksBuffers() {
221    final ByteString blocksBuf = getBlocksBuffer();
222    final List<ByteString> buffers;
223    final int size = blocksBuf.size();
224    if (size <= CHUNK_SIZE) {
225      buffers = Collections.singletonList(blocksBuf);
226    } else {
227      buffers = new ArrayList<ByteString>();
228      for (int pos=0; pos < size; pos += CHUNK_SIZE) {
229        // this doesn't actually copy the data
230        buffers.add(blocksBuf.substring(pos, Math.min(pos+CHUNK_SIZE, size)));
231      }
232    }
233    return buffers;
234  }
235
  /**
   * Convert block report to old-style list of longs.  Only used to
   * re-encode the block report when the DN detects an older NN.  This is
   * inefficient, but in practice a DN is unlikely to be upgraded first.
   *
   * The structure of the array is as follows:
   * 0: the length of the finalized replica list;
   * 1: the length of the under-construction replica list;
   * - followed by finalized replica list where each replica is represented by
   *   3 longs: one for the blockId, one for the block length, and one for
   *   the generation stamp;
   * - followed by the invalid replica represented with three -1s;
   * - followed by the under-construction replica list where each replica is
   *   represented by 4 longs: three for the block id, length, generation
   *   stamp, and the fourth for the replica state.
   * @return array of longs in the layout described above
   */
  abstract public long[] getBlockListAsLongs();
254
  /**
   * Returns an iterator over the blocks in the block report.  The decoding
   * iterators reuse a single BlockReportReplica instance and overwrite it on
   * every call to next(), so do not add the returned blocks to a collection;
   * copy a block first if it must be retained.
   * @return Iterator
   */
  abstract public Iterator<BlockReportReplica> iterator();
261
262  public static class Builder {
263    private final ByteString.Output out;
264    private final CodedOutputStream cos;
265    private int numBlocks = 0;
266    private int numFinalized = 0;
267    private final int maxDataLength;
268
269    Builder(int maxDataLength) {
270      out = ByteString.newOutput(64*1024);
271      cos = CodedOutputStream.newInstance(out);
272      this.maxDataLength = maxDataLength;
273    }
274
275    public void add(Replica replica) {
276      try {
277        // zig-zag to reduce size of legacy blocks
278        cos.writeSInt64NoTag(replica.getBlockId());
279        cos.writeRawVarint64(replica.getBytesOnDisk());
280        cos.writeRawVarint64(replica.getGenerationStamp());
281        ReplicaState state = replica.getState();
282        // although state is not a 64-bit value, using a long varint to
283        // allow for future use of the upper bits
284        cos.writeRawVarint64(state.getValue());
285        if (state == ReplicaState.FINALIZED) {
286          numFinalized++;
287        }
288        numBlocks++;
289      } catch (IOException ioe) {
290        // shouldn't happen, ByteString.Output doesn't throw IOE
291        throw new IllegalStateException(ioe);
292      }
293    }
294
295    public int getNumberOfBlocks() {
296      return numBlocks;
297    }
298    
299    public BlockListAsLongs build() {
300      try {
301        cos.flush();
302      } catch (IOException ioe) {
303        // shouldn't happen, ByteString.Output doesn't throw IOE
304        throw new IllegalStateException(ioe);
305      }
306      return new BufferDecoder(numBlocks, numFinalized, out.toByteString(),
307          maxDataLength);
308    }
309  }
310
311  // decode new-style ByteString buffer based block report
312  private static class BufferDecoder extends BlockListAsLongs {
313    // reserve upper bits for future use.  decoding masks off these bits to
314    // allow compatibility for the current through future release that may
315    // start using the bits
316    private static long NUM_BYTES_MASK = (-1L) >>> (64 - 48);
317    private static long REPLICA_STATE_MASK = (-1L) >>> (64 - 4);
318
319    private final ByteString buffer;
320    private final int numBlocks;
321    private int numFinalized;
322    private final int maxDataLength;
323
324    BufferDecoder(final int numBlocks, final ByteString buf,
325        final int maxDataLength) {
326      this(numBlocks, -1, buf, maxDataLength);
327    }
328
329    BufferDecoder(final int numBlocks, final int numFinalized,
330        final ByteString buf, final int maxDataLength) {
331      this.numBlocks = numBlocks;
332      this.numFinalized = numFinalized;
333      this.buffer = buf;
334      this.maxDataLength = maxDataLength;
335    }
336
337    @Override
338    public int getNumberOfBlocks() {
339      return numBlocks;
340    }
341
342    @Override
343    public ByteString getBlocksBuffer() {
344      return buffer;
345    }
346
347    @Override
348    public long[] getBlockListAsLongs() {
349      // terribly inefficient but only occurs if server tries to transcode
350      // an undecoded buffer into longs - ie. it will never happen but let's
351      // handle it anyway
352      if (numFinalized == -1) {
353        int n = 0;
354        for (Replica replica : this) {
355          if (replica.getState() == ReplicaState.FINALIZED) {
356            n++;
357          }
358        }
359        numFinalized = n;
360      }
361      int numUc = numBlocks - numFinalized;
362      int size = 2 + 3*(numFinalized+1) + 4*(numUc);
363      long[] longs = new long[size];
364      longs[0] = numFinalized;
365      longs[1] = numUc;
366
367      int idx = 2;
368      int ucIdx = idx + 3*numFinalized;
369      // delimiter block
370      longs[ucIdx++] = -1;
371      longs[ucIdx++] = -1;
372      longs[ucIdx++] = -1;
373
374      for (BlockReportReplica block : this) {
375        switch (block.getState()) {
376          case FINALIZED: {
377            longs[idx++] = block.getBlockId();
378            longs[idx++] = block.getNumBytes();
379            longs[idx++] = block.getGenerationStamp();
380            break;
381          }
382          default: {
383            longs[ucIdx++] = block.getBlockId();
384            longs[ucIdx++] = block.getNumBytes();
385            longs[ucIdx++] = block.getGenerationStamp();
386            longs[ucIdx++] = block.getState().getValue();
387            break;
388          }
389        }
390      }
391      return longs;
392    }
393
394    @Override
395    public Iterator<BlockReportReplica> iterator() {
396      return new Iterator<BlockReportReplica>() {
397        final BlockReportReplica block = new BlockReportReplica();
398        final CodedInputStream cis = buffer.newCodedInput();
399        private int currentBlockIndex = 0;
400
401        {
402          if (maxDataLength != IPC_MAXIMUM_DATA_LENGTH_DEFAULT) {
403            cis.setSizeLimit(maxDataLength);
404          }
405        }
406
407        @Override
408        public boolean hasNext() {
409          return currentBlockIndex < numBlocks;
410        }
411
412        @Override
413        public BlockReportReplica next() {
414          currentBlockIndex++;
415          try {
416            // zig-zag to reduce size of legacy blocks and mask off bits
417            // we don't (yet) understand
418            block.setBlockId(cis.readSInt64());
419            block.setNumBytes(cis.readRawVarint64() & NUM_BYTES_MASK);
420            block.setGenerationStamp(cis.readRawVarint64());
421            long state = cis.readRawVarint64() & REPLICA_STATE_MASK;
422            block.setState(ReplicaState.getState((int)state));
423          } catch (IOException e) {
424            throw new IllegalStateException(e);
425          }
426          return block;
427        }
428
429        @Override
430        public void remove() {
431          throw new UnsupportedOperationException();
432        }
433      };
434    }
435  }
436
437  // decode old style block report of longs
438  private static class LongsDecoder extends BlockListAsLongs {
439    private final List<Long> values;
440    private final int finalizedBlocks;
441    private final int numBlocks;
442    private final int maxDataLength;
443
444    // set the header
445    LongsDecoder(List<Long> values, int maxDataLength) {
446      this.values = values.subList(2, values.size());
447      this.finalizedBlocks = values.get(0).intValue();
448      this.numBlocks = finalizedBlocks + values.get(1).intValue();
449      this.maxDataLength = maxDataLength;
450    }
451
452    @Override
453    public int getNumberOfBlocks() {
454      return numBlocks;
455    }
456
457    @Override
458    public ByteString getBlocksBuffer() {
459      Builder builder = builder(maxDataLength);
460      for (Replica replica : this) {
461        builder.add(replica);
462      }
463      return builder.build().getBlocksBuffer();
464    }
465
466    @Override
467    public long[] getBlockListAsLongs() {
468      long[] longs = new long[2+values.size()];
469      longs[0] = finalizedBlocks;
470      longs[1] = numBlocks - finalizedBlocks;
471      for(int i=0; i<values.size(); i++) {
472        longs[2+i] = values.get(i);
473      }
474      return longs;
475    }
476
477    @Override
478    public Iterator<BlockReportReplica> iterator() {
479      return new Iterator<BlockReportReplica>() {
480        private final BlockReportReplica block = new BlockReportReplica();
481        final Iterator<Long> iter = values.iterator();
482        private int currentBlockIndex = 0;
483
484        @Override
485        public boolean hasNext() {
486          return currentBlockIndex < numBlocks;
487        }
488
489        @Override
490        public BlockReportReplica next() {
491          if (currentBlockIndex == finalizedBlocks) {
492            // verify the presence of the delimiter block
493            readBlock();
494            Preconditions.checkArgument(block.getBlockId() == -1 &&
495                                        block.getNumBytes() == -1 &&
496                                        block.getGenerationStamp() == -1,
497                                        "Invalid delimiter block");
498          }
499
500          readBlock();
501          if (currentBlockIndex++ < finalizedBlocks) {
502            block.setState(ReplicaState.FINALIZED);
503          } else {
504            block.setState(ReplicaState.getState(iter.next().intValue()));
505          }
506          return block;
507        }
508
509        private void readBlock() {
510          block.setBlockId(iter.next());
511          block.setNumBytes(iter.next());
512          block.setGenerationStamp(iter.next());
513        }
514
515        @Override
516        public void remove() {
517          throw new UnsupportedOperationException();
518        }
519      };
520    }
521  }
522  
523  @InterfaceAudience.Private
524  public static class BlockReportReplica extends Block implements Replica {
525    private ReplicaState state;
526    private BlockReportReplica() {
527    }
528    public BlockReportReplica(Block block) {
529      super(block);
530      if (block instanceof BlockReportReplica) {
531        this.state = ((BlockReportReplica)block).getState();
532      } else {
533        this.state = ReplicaState.FINALIZED;
534      }
535    }
536    public void setState(ReplicaState state) {
537      this.state = state;
538    }
539    @Override
540    public ReplicaState getState() {
541      return state;
542    }
543    @Override
544    public long getBytesOnDisk() {
545      return getNumBytes();
546    }
547    @Override
548    public long getVisibleLength() {
549      throw new UnsupportedOperationException();
550    }
551    @Override
552    public String getStorageUuid() {
553      throw new UnsupportedOperationException();
554    }
555    @Override
556    public boolean isOnTransientStorage() {
557      throw new UnsupportedOperationException();
558    }
559    @Override
560    public boolean equals(Object o) {
561      return super.equals(o);
562    }
563    @Override
564    public int hashCode() {
565      return super.hashCode();
566    }
567  }
568}