/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.protocol;

import static org.apache.hadoop.fs.CommonConfigurationKeys.IPC_MAXIMUM_DATA_LENGTH_DEFAULT;

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.hdfs.protocol.BlockListAsLongs.BlockReportReplica;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState;
import org.apache.hadoop.hdfs.server.datanode.Replica;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.protobuf.ByteString;
import com.google.protobuf.CodedInputStream;
import com.google.protobuf.CodedOutputStream;
import com.google.protobuf.WireFormat;

/**
 * A block report that can be represented either as a varint-encoded
 * {@link ByteString} buffer or as the old-style array of longs, and that can
 * be iterated as a sequence of {@link BlockReportReplica}s.
 */
@InterfaceAudience.Private
@InterfaceStability.Evolving
public abstract class BlockListAsLongs implements
    Iterable<BlockReportReplica> {
  private final static int CHUNK_SIZE = 64*1024; // 64K
  private static final long[] EMPTY_LONGS = new long[]{0, 0};

  /** A block report that contains no blocks. */
  public static final BlockListAsLongs EMPTY = new BlockListAsLongs() {
    @Override
    public int getNumberOfBlocks() {
      return 0;
    }
    @Override
    public ByteString getBlocksBuffer() {
      return ByteString.EMPTY;
    }
    @Override
    public long[] getBlockListAsLongs() {
      // hand out a copy so a caller cannot corrupt the shared constant
      return EMPTY_LONGS.clone();
    }
    @Override
    public Iterator<BlockReportReplica> iterator() {
      return Collections.emptyIterator();
    }
  };

  /**
   * Prepare an instance to in-place decode the given ByteString buffer.
   * @param numBlocks - blocks in the buffer
   * @param blocksBuf - ByteString encoded varints
   * @param maxDataLength - maximum allowable data size in protobuf message
   * @return BlockListAsLongs
   */
  public static BlockListAsLongs decodeBuffer(final int numBlocks,
      final ByteString blocksBuf, final int maxDataLength) {
    return new BufferDecoder(numBlocks, blocksBuf, maxDataLength);
  }

  /**
   * Prepare an instance to in-place decode the given ByteString buffers.
   * @param numBlocks - blocks in the buffers
   * @param blocksBufs - list of ByteString encoded varints
   * @return BlockListAsLongs
   */
  @VisibleForTesting
  public static BlockListAsLongs decodeBuffers(final int numBlocks,
      final List<ByteString> blocksBufs) {
    return decodeBuffers(numBlocks, blocksBufs,
        IPC_MAXIMUM_DATA_LENGTH_DEFAULT);
  }

  /**
   * Prepare an instance to in-place decode the given ByteString buffers.
   * @param numBlocks - blocks in the buffers
   * @param blocksBufs - list of ByteString encoded varints
   * @param maxDataLength - maximum allowable data size in protobuf message
   * @return BlockListAsLongs
   */
  public static BlockListAsLongs decodeBuffers(final int numBlocks,
      final List<ByteString> blocksBufs, final int maxDataLength) {
    // this doesn't actually copy the data
    return decodeBuffer(numBlocks, ByteString.copyFrom(blocksBufs),
        maxDataLength);
  }

  /**
   * Prepare an instance to in-place decode the given list of Longs.  Note
   * it's much more efficient to decode ByteString buffers and only exists
   * for compatibility.
   * @param blocksList - list of longs
   * @return BlockListAsLongs
   */
  public static BlockListAsLongs decodeLongs(List<Long> blocksList) {
    return decodeLongs(blocksList, IPC_MAXIMUM_DATA_LENGTH_DEFAULT);
  }

  /**
   * Prepare an instance to in-place decode the given list of Longs.  Note
   * it's much more efficient to decode ByteString buffers and only exists
   * for compatibility.
   * @param blocksList - list of longs; either empty or starting with the
   *                     two-element header described in
   *                     {@link #getBlockListAsLongs()}
   * @param maxDataLength - maximum allowable data size in protobuf message
   * @return BlockListAsLongs
   * @throws IllegalArgumentException if the list is non-empty but shorter
   *         than the two-element header
   */
  public static BlockListAsLongs decodeLongs(List<Long> blocksList,
      int maxDataLength) {
    return blocksList.isEmpty() ? EMPTY :
        new LongsDecoder(blocksList, maxDataLength);
  }

  /**
   * Prepare an instance to encode the collection of replicas into an
   * efficient ByteString.
   * @param replicas - replicas to encode
   * @return BlockListAsLongs
   */
  @VisibleForTesting
  public static BlockListAsLongs encode(
      final Collection<? extends Replica> replicas) {
    BlockListAsLongs.Builder builder = builder(IPC_MAXIMUM_DATA_LENGTH_DEFAULT);
    for (Replica replica : replicas) {
      builder.add(replica);
    }
    return builder.build();
  }

  /**
   * Read a block report previously written by {@link #writeTo(OutputStream)}:
   * field 1 carries the number of blocks and field 2 the encoded buffer.
   * Unknown fields are skipped.
   * @param is - stream to read from
   * @param maxDataLength - maximum allowable data size in protobuf message;
   *                        only applied when it differs from the default
   * @return the decoded report, or {@code null} if the stream did not
   *         contain both the block count and the block buffer
   * @throws IOException if the stream cannot be read or parsed
   */
  public static BlockListAsLongs readFrom(InputStream is, int maxDataLength)
      throws IOException {
    CodedInputStream cis = CodedInputStream.newInstance(is);
    if (maxDataLength != IPC_MAXIMUM_DATA_LENGTH_DEFAULT) {
      cis.setSizeLimit(maxDataLength);
    }
    int numBlocks = -1;
    ByteString blocksBuf = null;
    while (!cis.isAtEnd()) {
      int tag = cis.readTag();
      int field = WireFormat.getTagFieldNumber(tag);
      switch(field) {
      case 0:
        break;
      case 1:
        numBlocks = cis.readInt32();
        break;
      case 2:
        blocksBuf = cis.readBytes();
        break;
      default:
        cis.skipField(tag);
        break;
      }
    }
    if (numBlocks != -1 && blocksBuf != null) {
      return decodeBuffer(numBlocks, blocksBuf, maxDataLength);
    }
    return null;
  }

  /**
   * Write this block report as two tagged fields: the number of blocks
   * (field 1) followed by the encoded blocks buffer (field 2).
   * @param os - stream to write to
   * @throws IOException if the stream cannot be written
   */
  public void writeTo(OutputStream os) throws IOException {
    CodedOutputStream cos = CodedOutputStream.newInstance(os);
    cos.writeInt32(1, getNumberOfBlocks());
    cos.writeBytes(2, getBlocksBuffer());
    cos.flush();
  }

  /**
   * Create a builder that uses the default maximum data length.
   * @return Builder
   */
  @VisibleForTesting
  public static Builder builder() {
    return builder(IPC_MAXIMUM_DATA_LENGTH_DEFAULT);
  }

  /**
   * Create a builder.
   * @param maxDataLength - maximum allowable data size in protobuf message
   * @return Builder
   */
  public static Builder builder(int maxDataLength) {
    return new BlockListAsLongs.Builder(maxDataLength);
  }

  /**
   * The number of blocks
   * @return - the number of blocks
   */
  public abstract int getNumberOfBlocks();

  /**
   * Very efficient encoding of the block report into a ByteString to avoid
   * the overhead of protobuf repeating fields.  Primitive repeating fields
   * require re-allocs of an {@code ArrayList<Long>} and the associated
   * (un)boxing overhead which puts pressure on GC.
   *
   * The structure of the buffer is as follows:
   * - each replica is represented by 4 longs:
   *   blockId, block length, genstamp, replica state
   *
   * @return ByteString encoded block report
   */
  public abstract ByteString getBlocksBuffer();

  /**
   * List of ByteStrings that encode this block report.  The buffer is split
   * into chunks of at most 64K each.
   *
   * @return ByteStrings
   */
  public List<ByteString> getBlocksBuffers() {
    final ByteString blocksBuf = getBlocksBuffer();
    final List<ByteString> buffers;
    final int size = blocksBuf.size();
    if (size <= CHUNK_SIZE) {
      buffers = Collections.singletonList(blocksBuf);
    } else {
      // presize for the number of chunks (rounded up)
      buffers = new ArrayList<ByteString>(size / CHUNK_SIZE + 1);
      for (int pos=0; pos < size; pos += CHUNK_SIZE) {
        // this doesn't actually copy the data
        buffers.add(blocksBuf.substring(pos, Math.min(pos+CHUNK_SIZE, size)));
      }
    }
    return buffers;
  }

  /**
   * Convert block report to old-style list of longs.  Only used to
   * re-encode the block report when the DN detects an older NN. This is
   * inefficient, but in practice a DN is unlikely to be upgraded first
   *
   * The structure of the array is as follows:
   * 0: the length of the finalized replica list;
   * 1: the length of the under-construction replica list;
   * - followed by finalized replica list where each replica is represented by
   *   3 longs: one for the blockId, one for the block length, and one for
   *   the generation stamp;
   * - followed by the invalid replica represented with three -1s;
   * - followed by the under-construction replica list where each replica is
   *   represented by 4 longs: three for the block id, length, generation
   *   stamp, and the fourth for the replica state.
   * @return list of longs
   */
  public abstract long[] getBlockListAsLongs();

  /**
   * Returns a singleton iterator over blocks in the block report.  The
   * decoders reuse a single mutable {@link BlockReportReplica} instance for
   * every call to {@code next()}, so do not add the returned blocks to a
   * collection.
   * @return Iterator
   */
  @Override
  public abstract Iterator<BlockReportReplica> iterator();

  /**
   * Accumulates replicas into the varint-encoded buffer format described in
   * {@link BlockListAsLongs#getBlocksBuffer()}.
   */
  public static class Builder {
    private final ByteString.Output out;
    private final CodedOutputStream cos;
    private int numBlocks = 0;
    private int numFinalized = 0;
    private final int maxDataLength;

    Builder(int maxDataLength) {
      out = ByteString.newOutput(64*1024);
      cos = CodedOutputStream.newInstance(out);
      this.maxDataLength = maxDataLength;
    }

    /**
     * Append one replica to the report.
     * @param replica - replica to encode
     */
    public void add(Replica replica) {
      try {
        // zig-zag to reduce size of legacy blocks
        cos.writeSInt64NoTag(replica.getBlockId());
        cos.writeRawVarint64(replica.getBytesOnDisk());
        cos.writeRawVarint64(replica.getGenerationStamp());
        ReplicaState state = replica.getState();
        // although state is not a 64-bit value, using a long varint to
        // allow for future use of the upper bits
        cos.writeRawVarint64(state.getValue());
        if (state == ReplicaState.FINALIZED) {
          numFinalized++;
        }
        numBlocks++;
      } catch (IOException ioe) {
        // shouldn't happen, ByteString.Output doesn't throw IOE
        throw new IllegalStateException(ioe);
      }
    }

    /**
     * @return the number of replicas added so far
     */
    public int getNumberOfBlocks() {
      return numBlocks;
    }

    /**
     * Flush the encoder and wrap the accumulated buffer in a decoder.
     * @return BlockListAsLongs backed by the encoded buffer
     */
    public BlockListAsLongs build() {
      try {
        cos.flush();
      } catch (IOException ioe) {
        // shouldn't happen, ByteString.Output doesn't throw IOE
        throw new IllegalStateException(ioe);
      }
      return new BufferDecoder(numBlocks, numFinalized, out.toByteString(),
          maxDataLength);
    }
  }

  // decode new-style ByteString buffer based block report
  private static class BufferDecoder extends BlockListAsLongs {
    // reserve upper bits for future use.  decoding masks off these bits to
    // allow compatibility for the current through future release that may
    // start using the bits
    private static final long NUM_BYTES_MASK = (-1L) >>> (64 - 48);
    private static final long REPLICA_STATE_MASK = (-1L) >>> (64 - 4);

    private final ByteString buffer;
    private final int numBlocks;
    // -1 until known; computed lazily by getBlockListAsLongs()
    private int numFinalized;
    private final int maxDataLength;

    BufferDecoder(final int numBlocks, final ByteString buf,
        final int maxDataLength) {
      this(numBlocks, -1, buf, maxDataLength);
    }

    BufferDecoder(final int numBlocks, final int numFinalized,
        final ByteString buf, final int maxDataLength) {
      this.numBlocks = numBlocks;
      this.numFinalized = numFinalized;
      this.buffer = buf;
      this.maxDataLength = maxDataLength;
    }

    @Override
    public int getNumberOfBlocks() {
      return numBlocks;
    }

    @Override
    public ByteString getBlocksBuffer() {
      return buffer;
    }

    @Override
    public long[] getBlockListAsLongs() {
      // terribly inefficient but only occurs if server tries to transcode
      // an undecoded buffer into longs - ie. it will never happen but let's
      // handle it anyway
      if (numFinalized == -1) {
        int n = 0;
        for (Replica replica : this) {
          if (replica.getState() == ReplicaState.FINALIZED) {
            n++;
          }
        }
        numFinalized = n;
      }
      int numUc = numBlocks - numFinalized;
      // header + finalized triples + delimiter triple + uc quadruples
      int size = 2 + 3*(numFinalized+1) + 4*(numUc);
      long[] longs = new long[size];
      longs[0] = numFinalized;
      longs[1] = numUc;

      int idx = 2;
      int ucIdx = idx + 3*numFinalized;
      // delimiter block
      longs[ucIdx++] = -1;
      longs[ucIdx++] = -1;
      longs[ucIdx++] = -1;

      for (BlockReportReplica block : this) {
        switch (block.getState()) {
          case FINALIZED: {
            longs[idx++] = block.getBlockId();
            longs[idx++] = block.getNumBytes();
            longs[idx++] = block.getGenerationStamp();
            break;
          }
          default: {
            longs[ucIdx++] = block.getBlockId();
            longs[ucIdx++] = block.getNumBytes();
            longs[ucIdx++] = block.getGenerationStamp();
            longs[ucIdx++] = block.getState().getValue();
            break;
          }
        }
      }
      return longs;
    }

    @Override
    public Iterator<BlockReportReplica> iterator() {
      return new Iterator<BlockReportReplica>() {
        // single instance that is overwritten and returned by every next()
        final BlockReportReplica block = new BlockReportReplica();
        final CodedInputStream cis = buffer.newCodedInput();
        private int currentBlockIndex = 0;

        {
          if (maxDataLength != IPC_MAXIMUM_DATA_LENGTH_DEFAULT) {
            cis.setSizeLimit(maxDataLength);
          }
        }

        @Override
        public boolean hasNext() {
          return currentBlockIndex < numBlocks;
        }

        @Override
        public BlockReportReplica next() {
          // honor the Iterator contract instead of reading past the
          // declared number of blocks
          if (!hasNext()) {
            throw new NoSuchElementException();
          }
          currentBlockIndex++;
          try {
            // zig-zag to reduce size of legacy blocks and mask off bits
            // we don't (yet) understand
            block.setBlockId(cis.readSInt64());
            block.setNumBytes(cis.readRawVarint64() & NUM_BYTES_MASK);
            block.setGenerationStamp(cis.readRawVarint64());
            long state = cis.readRawVarint64() & REPLICA_STATE_MASK;
            block.setState(ReplicaState.getState((int)state));
          } catch (IOException e) {
            throw new IllegalStateException(e);
          }
          return block;
        }

        @Override
        public void remove() {
          throw new UnsupportedOperationException();
        }
      };
    }
  }

  // decode old style block report of longs
  private static class LongsDecoder extends BlockListAsLongs {
    // the report body, i.e. everything after the two-element header
    private final List<Long> values;
    private final int finalizedBlocks;
    private final int numBlocks;
    private final int maxDataLength;

    // set the header
    LongsDecoder(List<Long> values, int maxDataLength) {
      // fail with a descriptive message rather than an opaque subList error
      Preconditions.checkArgument(values.size() >= 2,
          "Block report of longs is missing its two-element header");
      this.values = values.subList(2, values.size());
      this.finalizedBlocks = values.get(0).intValue();
      this.numBlocks = finalizedBlocks + values.get(1).intValue();
      this.maxDataLength = maxDataLength;
    }

    @Override
    public int getNumberOfBlocks() {
      return numBlocks;
    }

    @Override
    public ByteString getBlocksBuffer() {
      // re-encode every replica into the new-style buffer
      Builder builder = builder(maxDataLength);
      for (Replica replica : this) {
        builder.add(replica);
      }
      return builder.build().getBlocksBuffer();
    }

    @Override
    public long[] getBlockListAsLongs() {
      long[] longs = new long[2+values.size()];
      longs[0] = finalizedBlocks;
      longs[1] = numBlocks - finalizedBlocks;
      for(int i=0; i<values.size(); i++) {
        longs[2+i] = values.get(i);
      }
      return longs;
    }

    @Override
    public Iterator<BlockReportReplica> iterator() {
      return new Iterator<BlockReportReplica>() {
        // single instance that is overwritten and returned by every next()
        private final BlockReportReplica block = new BlockReportReplica();
        final Iterator<Long> iter = values.iterator();
        private int currentBlockIndex = 0;

        @Override
        public boolean hasNext() {
          return currentBlockIndex < numBlocks;
        }

        @Override
        public BlockReportReplica next() {
          // honor the Iterator contract instead of consuming trailing longs
          if (!hasNext()) {
            throw new NoSuchElementException();
          }
          if (currentBlockIndex == finalizedBlocks) {
            // verify the presence of the delimiter block
            readBlock();
            Preconditions.checkArgument(block.getBlockId() == -1 &&
                                        block.getNumBytes() == -1 &&
                                        block.getGenerationStamp() == -1,
                                        "Invalid delimiter block");
          }

          readBlock();
          if (currentBlockIndex++ < finalizedBlocks) {
            block.setState(ReplicaState.FINALIZED);
          } else {
            // under-construction replicas carry a fourth long: the state
            block.setState(ReplicaState.getState(iter.next().intValue()));
          }
          return block;
        }

        // read the id, length and generation stamp of the next block
        private void readBlock() {
          block.setBlockId(iter.next());
          block.setNumBytes(iter.next());
          block.setGenerationStamp(iter.next());
        }

        @Override
        public void remove() {
          throw new UnsupportedOperationException();
        }
      };
    }
  }

  /**
   * A {@link Block} paired with the {@link ReplicaState} reported for it.
   * Only the block fields and the state are available; the remaining
   * {@link Replica} accessors are unsupported.
   */
  @InterfaceAudience.Private
  public static class BlockReportReplica extends Block implements Replica {
    private ReplicaState state;
    private BlockReportReplica() {
    }
    /**
     * Copy the given block.  The state is taken from the block when it is
     * itself a BlockReportReplica and is FINALIZED otherwise.
     * @param block - block to copy
     */
    public BlockReportReplica(Block block) {
      super(block);
      if (block instanceof BlockReportReplica) {
        this.state = ((BlockReportReplica)block).getState();
      } else {
        this.state = ReplicaState.FINALIZED;
      }
    }
    public void setState(ReplicaState state) {
      this.state = state;
    }
    @Override
    public ReplicaState getState() {
      return state;
    }
    @Override
    public long getBytesOnDisk() {
      return getNumBytes();
    }
    @Override
    public long getVisibleLength() {
      throw new UnsupportedOperationException();
    }
    @Override
    public String getStorageUuid() {
      throw new UnsupportedOperationException();
    }
    @Override
    public boolean isOnTransientStorage() {
      throw new UnsupportedOperationException();
    }
    // equality and hashing deliberately ignore the state and defer to Block
    @Override
    public boolean equals(Object o) {
      return super.equals(o);
    }
    @Override
    public int hashCode() {
      return super.hashCode();
    }
  }
}