001/** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hdfs.server.namenode; 019 020import static org.apache.hadoop.util.Time.monotonicNow; 021 022import java.io.DataInput; 023import java.io.DataInputStream; 024import java.io.DataOutputStream; 025import java.io.File; 026import java.io.FileInputStream; 027import java.io.FileOutputStream; 028import java.io.IOException; 029import java.security.DigestInputStream; 030import java.security.DigestOutputStream; 031import java.security.MessageDigest; 032import java.util.ArrayList; 033import java.util.Arrays; 034import java.util.Collection; 035import java.util.HashMap; 036import java.util.List; 037import java.util.Map; 038import java.util.TreeMap; 039 040import org.apache.commons.logging.Log; 041import org.apache.hadoop.classification.InterfaceAudience; 042import org.apache.hadoop.classification.InterfaceStability; 043import org.apache.hadoop.conf.Configuration; 044import org.apache.hadoop.fs.FileSystem; 045import org.apache.hadoop.fs.Path; 046import org.apache.hadoop.fs.permission.PermissionStatus; 047import org.apache.hadoop.hdfs.DFSUtil; 048import org.apache.hadoop.hdfs.protocol.HdfsConstants; 049import 
org.apache.hadoop.hdfs.protocol.LayoutFlags; 050import org.apache.hadoop.hdfs.protocol.LayoutVersion; 051import org.apache.hadoop.hdfs.protocol.LayoutVersion.Feature; 052import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfo; 053import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoContiguous; 054import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager; 055import org.apache.hadoop.hdfs.server.common.HdfsServerConstants; 056import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption; 057import org.apache.hadoop.hdfs.server.common.InconsistentFSStateException; 058import org.apache.hadoop.hdfs.server.namenode.FSDirectory.DirOp; 059import org.apache.hadoop.hdfs.server.namenode.snapshot.DirectoryWithSnapshotFeature; 060import org.apache.hadoop.hdfs.server.namenode.snapshot.FileDiffList; 061import org.apache.hadoop.hdfs.server.namenode.snapshot.Snapshot; 062import org.apache.hadoop.hdfs.server.namenode.snapshot.SnapshotFSImageFormat; 063import org.apache.hadoop.hdfs.server.namenode.snapshot.SnapshotFSImageFormat.ReferenceMap; 064import org.apache.hadoop.hdfs.server.namenode.startupprogress.Phase; 065import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress; 066import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress.Counter; 067import org.apache.hadoop.hdfs.server.namenode.startupprogress.Step; 068import org.apache.hadoop.hdfs.server.namenode.startupprogress.StepType; 069import org.apache.hadoop.hdfs.util.ReadOnlyList; 070import org.apache.hadoop.io.IOUtils; 071import org.apache.hadoop.io.MD5Hash; 072import org.apache.hadoop.io.Text; 073import org.apache.hadoop.util.StringUtils; 074 075import com.google.common.annotations.VisibleForTesting; 076import com.google.common.base.Preconditions; 077 078/** 079 * Contains inner classes for reading or writing the on-disk format for 080 * FSImages. 
081 * 082 * In particular, the format of the FSImage looks like: 083 * <pre> 084 * FSImage { 085 * layoutVersion: int, namespaceID: int, numberItemsInFSDirectoryTree: long, 086 * namesystemGenerationStampV1: long, namesystemGenerationStampV2: long, 087 * generationStampAtBlockIdSwitch:long, lastAllocatedBlockId: 088 * long transactionID: long, snapshotCounter: int, numberOfSnapshots: int, 089 * numOfSnapshottableDirs: int, 090 * {FSDirectoryTree, FilesUnderConstruction, SecretManagerState} (can be compressed) 091 * } 092 * 093 * FSDirectoryTree (if {@link Feature#FSIMAGE_NAME_OPTIMIZATION} is supported) { 094 * INodeInfo of root, numberOfChildren of root: int 095 * [list of INodeInfo of root's children], 096 * [list of INodeDirectoryInfo of root's directory children] 097 * } 098 * 099 * FSDirectoryTree (if {@link Feature#FSIMAGE_NAME_OPTIMIZATION} not supported){ 100 * [list of INodeInfo of INodes in topological order] 101 * } 102 * 103 * INodeInfo { 104 * { 105 * localName: short + byte[] 106 * } when {@link Feature#FSIMAGE_NAME_OPTIMIZATION} is supported 107 * or 108 * { 109 * fullPath: byte[] 110 * } when {@link Feature#FSIMAGE_NAME_OPTIMIZATION} is not supported 111 * replicationFactor: short, modificationTime: long, 112 * accessTime: long, preferredBlockSize: long, 113 * numberOfBlocks: int (-1 for INodeDirectory, -2 for INodeSymLink), 114 * { 115 * nsQuota: long, dsQuota: long, 116 * { 117 * isINodeSnapshottable: byte, 118 * isINodeWithSnapshot: byte (if isINodeSnapshottable is false) 119 * } (when {@link Feature#SNAPSHOT} is supported), 120 * fsPermission: short, PermissionStatus 121 * } for INodeDirectory 122 * or 123 * { 124 * symlinkString, fsPermission: short, PermissionStatus 125 * } for INodeSymlink 126 * or 127 * { 128 * [list of BlockInfo] 129 * [list of FileDiff] 130 * { 131 * isINodeFileUnderConstructionSnapshot: byte, 132 * {clientName: short + byte[], clientMachine: short + byte[]} (when 133 * isINodeFileUnderConstructionSnapshot is true), 134 * 
} (when {@link Feature#SNAPSHOT} is supported and writing snapshotINode),
 *     fsPermission: short, PermissionStatus
 *   } for INodeFile
 * }
 *
 * INodeDirectoryInfo {
 *   fullPath of the directory: short + byte[],
 *   numberOfChildren: int, [list of INodeInfo of children INode],
 *   {
 *     numberOfSnapshots: int,
 *     [list of Snapshot] (when NumberOfSnapshots is positive),
 *     numberOfDirectoryDiffs: int,
 *     [list of DirectoryDiff] (NumberOfDirectoryDiffs is positive),
 *     number of children that are directories,
 *     [list of INodeDirectoryInfo of the directory children] (includes
 *     snapshot copies of deleted sub-directories)
 *   } (when {@link Feature#SNAPSHOT} is supported),
 * }
 *
 * Snapshot {
 *   snapshotID: int, root of Snapshot: INodeDirectoryInfo (its local name is
 *   the name of the snapshot)
 * }
 *
 * DirectoryDiff {
 *   full path of the root of the associated Snapshot: short + byte[],
 *   childrenSize: int,
 *   isSnapshotRoot: byte,
 *   snapshotINodeIsNotNull: byte (when isSnapshotRoot is false),
 *   snapshotINode: INodeDirectory (when SnapshotINodeIsNotNull is true), Diff
 * }
 *
 * Diff {
 *   createdListSize: int, [Local name of INode in created list],
 *   deletedListSize: int, [INode in deleted list: INodeInfo]
 * }
 *
 * FileDiff {
 *   full path of the root of the associated Snapshot: short + byte[],
 *   fileSize: long,
 *   snapshotINodeIsNotNull: byte,
 *   snapshotINode: INodeFile (when SnapshotINodeIsNotNull is true), Diff
 * }
 * </pre>
 */
@InterfaceAudience.Private
@InterfaceStability.Evolving
public class FSImageFormat {
  private static final Log LOG = FSImage.LOG;

  // Static-only class: all functionality lives in the nested loader types.
  private FSImageFormat() {}

  /**
   * Read-side accessors implemented by both the legacy {@link Loader} and
   * the protobuf-based {@link FSImageFormatProtobuf.Loader}.
   */
  interface AbstractLoader {
    /** @return MD5 digest of the image file that was loaded. */
    MD5Hash getLoadedImageMd5();
    /** @return transaction ID of the last edit covered by the loaded image. */
    long getLoadedImageTxId();
  }

  /**
   * Dispatches image loading to the right concrete loader by sniffing the
   * file's magic header: files starting with
   * {@link FSImageUtil#MAGIC_HEADER} use the protobuf loader, anything else
   * falls back to the legacy {@link Loader}.
   */
  static class LoaderDelegator implements AbstractLoader {
    // The concrete loader chosen in load(); null until load() is called.
    private AbstractLoader impl;
    private final Configuration conf;
    private final FSNamesystem fsn;

    LoaderDelegator(Configuration conf, FSNamesystem fsn) {
      this.conf = conf;
      this.fsn = fsn;
    }

    @Override
    public MD5Hash getLoadedImageMd5() {
      return impl.getLoadedImageMd5();
    }

    @Override
    public long getLoadedImageTxId() {
      return impl.getLoadedImageTxId();
    }

    /**
     * Load the given image file; may be called at most once per instance.
     *
     * @param file image file on local disk
     * @param requireSameLayoutVersion forwarded to the protobuf loader
     * @throws IOException if the image cannot be read
     */
    public void load(File file, boolean requireSameLayoutVersion)
        throws IOException {
      Preconditions.checkState(impl == null, "Image already loaded!");

      FileInputStream is = null;
      try {
        // Peek at the magic header to decide which loader understands
        // this file; each loader re-opens the file itself.
        is = new FileInputStream(file);
        byte[] magic = new byte[FSImageUtil.MAGIC_HEADER.length];
        IOUtils.readFully(is, magic, 0, magic.length);
        if (Arrays.equals(magic, FSImageUtil.MAGIC_HEADER)) {
          FSImageFormatProtobuf.Loader loader = new FSImageFormatProtobuf.Loader(
              conf, fsn, requireSameLayoutVersion);
          impl = loader;
          loader.load(file);
        } else {
          Loader loader = new Loader(conf, fsn);
          impl = loader;
          loader.load(file);
        }
      } finally {
        IOUtils.cleanup(LOG, is);
      }
    }
  }

  /**
   * Construct a loader class to load the image. It chooses the loader based on
   * the layout version.
   */
  public static LoaderDelegator newLoader(Configuration conf, FSNamesystem fsn) {
    return new LoaderDelegator(conf, fsn);
  }

  /**
   * A one-shot class responsible for loading an image. The load() function
   * should be called once, after which the getter methods may be used to retrieve
   * information about the image that was loaded, if loading was successful.
   */
  public static class Loader implements AbstractLoader {
    private final Configuration conf;
    /** which namesystem this loader is working for */
    private final FSNamesystem namesystem;

    /** Set to true once a file has been loaded using this loader.
 */
    private boolean loaded = false;

    /** The transaction ID of the last edit represented by the loaded file */
    private long imgTxId;
    /** The MD5 sum of the loaded file */
    private MD5Hash imgDigest;

    // Snapshot-id -> Snapshot map; populated in load() only when the image's
    // layout version supports snapshots, otherwise remains null.
    private Map<Integer, Snapshot> snapshotMap = null;
    // Tracks INodeReference instances so a referenced subtree is loaded
    // only once even when referred to multiple times.
    private final ReferenceMap referenceMap = new ReferenceMap();

    Loader(Configuration conf, FSNamesystem namesystem) {
      this.conf = conf;
      this.namesystem = namesystem;
    }

    /**
     * Return the MD5 checksum of the image that has been loaded.
     * @throws IllegalStateException if load() has not yet been called.
     */
    @Override
    public MD5Hash getLoadedImageMd5() {
      checkLoaded();
      return imgDigest;
    }

    /**
     * Return the transaction ID of the last edit covered by the loaded image.
     * @throws IllegalStateException if load() has not yet been called.
     */
    @Override
    public long getLoadedImageTxId() {
      checkLoaded();
      return imgTxId;
    }

    /**
     * Throw IllegalStateException if load() has not yet been called.
     */
    private void checkLoaded() {
      if (!loaded) {
        throw new IllegalStateException("Image not yet loaded!");
      }
    }

    /**
     * Throw IllegalStateException if load() has already been called.
     */
    private void checkNotLoaded() {
      if (loaded) {
        throw new IllegalStateException("Image already loaded!");
      }
    }

    /**
     * Load a legacy (pre-protobuf) fsimage file. One-shot: may only be
     * called once per Loader. The reads below must follow the exact on-disk
     * field order documented in the class javadoc; optional fields are
     * gated on the image's layout version.
     *
     * @param curFile the image file to read
     * @throws IOException on any read or consistency failure
     */
    public void load(File curFile) throws IOException {
      checkNotLoaded();
      assert curFile != null : "curFile is null";

      StartupProgress prog = NameNode.getStartupProgress();
      Step step = new Step(StepType.INODES);
      prog.beginStep(Phase.LOADING_FSIMAGE, step);
      long startTime = monotonicNow();

      //
      // Load in bits
      //
      // Every byte read flows through the digester so the whole-file MD5
      // can be computed after loading completes.
      MessageDigest digester = MD5Hash.getDigester();
      DigestInputStream fin = new DigestInputStream(
          new FileInputStream(curFile), digester);

      DataInputStream in = new DataInputStream(fin);
      try {
        // read image version: first appeared in version -1
        int imgVersion = in.readInt();
        if (getLayoutVersion() != imgVersion) {
          throw new InconsistentFSStateException(curFile,
              "imgVersion " + imgVersion +
              " expected to be " + getLayoutVersion());
        }
        boolean supportSnapshot = NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.SNAPSHOT, imgVersion);
        if (NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.ADD_LAYOUT_FLAGS, imgVersion)) {
          LayoutFlags.read(in);
        }

        // read namespaceID: first appeared in version -2
        in.readInt();

        long numFiles = in.readLong();

        // read in the last generation stamp for legacy blocks.
        long genstamp = in.readLong();
        namesystem.getBlockIdManager().setGenerationStampV1(genstamp);

        if (NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.SEQUENTIAL_BLOCK_ID, imgVersion)) {
          // read the starting generation stamp for sequential block IDs
          genstamp = in.readLong();
          namesystem.getBlockIdManager().setGenerationStampV2(genstamp);

          // read the last generation stamp for blocks created after
          // the switch to sequential block IDs.
          long stampAtIdSwitch = in.readLong();
          namesystem.getBlockIdManager().setGenerationStampV1Limit(stampAtIdSwitch);

          // read the max sequential block ID.
          long maxSequentialBlockId = in.readLong();
          namesystem.getBlockIdManager().setLastAllocatedBlockId(maxSequentialBlockId);
        } else {

          long startingGenStamp = namesystem.getBlockIdManager()
              .upgradeGenerationStampToV2();
          // This is an upgrade.
          LOG.info("Upgrading to sequential block IDs. Generation stamp " +
              "for new blocks set to " + startingGenStamp);
        }

        // read the transaction ID of the last edit represented by
        // this image
        if (NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.STORED_TXIDS, imgVersion)) {
          imgTxId = in.readLong();
        } else {
          imgTxId = 0;
        }

        // read the last allocated inode id in the fsimage
        if (NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.ADD_INODE_ID, imgVersion)) {
          long lastInodeId = in.readLong();
          namesystem.dir.resetLastInodeId(lastInodeId);
          if (LOG.isDebugEnabled()) {
            LOG.debug("load last allocated InodeId from fsimage:" + lastInodeId);
          }
        } else {
          if (LOG.isDebugEnabled()) {
            LOG.debug("Old layout version doesn't have inode id."
                + " Will assign new id for each inode.");
          }
        }

        if (supportSnapshot) {
          snapshotMap = namesystem.getSnapshotManager().read(in, this);
        }

        // read compression related info
        FSImageCompression compression;
        if (NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.FSIMAGE_COMPRESSION, imgVersion)) {
          compression = FSImageCompression.readCompressionHeader(conf, in);
        } else {
          compression = FSImageCompression.createNoopCompression();
        }
        // Re-wrap the digest stream, so the remaining (possibly compressed)
        // bytes still count toward the image MD5.
        in = compression.unwrapInputStream(fin);

        LOG.info("Loading image file " + curFile + " using " + compression);

        // load all inodes
        LOG.info("Number of files = " + numFiles);
        prog.setTotal(Phase.LOADING_FSIMAGE, step, numFiles);
        Counter counter = prog.getCounter(Phase.LOADING_FSIMAGE, step);
        if (NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.FSIMAGE_NAME_OPTIMIZATION, imgVersion)) {
          if (supportSnapshot) {
            loadLocalNameINodesWithSnapshot(numFiles, in, counter);
          } else {
            loadLocalNameINodes(numFiles, in, counter);
          }
        } else {
          loadFullNameINodes(numFiles, in, counter);
        }

        loadFilesUnderConstruction(in, supportSnapshot, counter);
        prog.endStep(Phase.LOADING_FSIMAGE, step);
        // Now that the step is finished, set counter equal to total to adjust
        // for possible under-counting due to reference inodes.
        prog.setCount(Phase.LOADING_FSIMAGE, step, numFiles);

        loadSecretManagerState(in);

        loadCacheManagerState(in);

        // make sure to read to the end of file
        boolean eof = (in.read() == -1);
        assert eof : "Should have reached the end of image file " + curFile;
      } finally {
        in.close();
      }

      imgDigest = new MD5Hash(digester.digest());
      loaded = true;

      LOG.info("Image file " + curFile + " of size " + curFile.length()
          + " bytes loaded in " + (monotonicNow() - startTime) / 1000
          + " seconds.");
    }

    /** Update the root node's attributes from the root inode read out of
     *  the image. */
    private void updateRootAttr(INodeWithAdditionalFields root) {
      final QuotaCounts q = root.getQuotaCounts();
      final long nsQuota = q.getNameSpace();
      final long dsQuota = q.getStorageSpace();
      FSDirectory fsDir = namesystem.dir;
      // -1 means "no quota set"; only install quotas when at least one is set.
      if (nsQuota != -1 || dsQuota != -1) {
        fsDir.rootDir.getDirectoryWithQuotaFeature().setQuota(nsQuota, dsQuota);
      }
      fsDir.rootDir.cloneModificationTime(root);
      fsDir.rootDir.clonePermissionStatus(root);
    }

    /**
     * Load fsimage files when 1) only local names are stored,
     * and 2) snapshot is supported.
     *
     * @param numFiles number of files expected to be read
     * @param in Image input stream
     * @param counter Counter to increment for namenode startup progress
     */
    private void loadLocalNameINodesWithSnapshot(long numFiles, DataInput in,
        Counter counter) throws IOException {
      assert NameNodeLayoutVersion.supports(
          LayoutVersion.Feature.FSIMAGE_NAME_OPTIMIZATION, getLayoutVersion());
      assert NameNodeLayoutVersion.supports(
          LayoutVersion.Feature.SNAPSHOT, getLayoutVersion());

      // load root
      loadRoot(in, counter);
      // load rest of the nodes recursively
      loadDirectoryWithSnapshot(in, counter);
    }

    /**
     * load fsimage files assuming only local names are stored. Used when
     * snapshots are not supported by the layout version.
     *
     * @param numFiles number of files expected to be read
     * @param in image input stream
     * @param counter Counter to increment for namenode startup progress
     * @throws IOException
     */
    private void loadLocalNameINodes(long numFiles, DataInput in, Counter counter)
        throws IOException {
      assert NameNodeLayoutVersion.supports(
          LayoutVersion.Feature.FSIMAGE_NAME_OPTIMIZATION, getLayoutVersion());
      assert numFiles > 0;

      // load root
      loadRoot(in, counter);
      // have loaded the first file (the root)
      numFiles--;

      // load rest of the nodes directory by directory
      while (numFiles > 0) {
        numFiles -= loadDirectory(in, counter);
      }
      if (numFiles != 0) {
        // NOTE(review): "unexpect" typo kept as-is; the exact message may be
        // matched by existing tests or operator tooling.
        throw new IOException("Read unexpect number of files: " + -numFiles);
      }
    }

    /**
     * Load information about root, and use the information to update the root
     * directory of NameSystem.
     * @param in The {@link DataInput} instance to read.
     * @param counter Counter to increment for namenode startup progress
     */
    private void loadRoot(DataInput in, Counter counter)
        throws IOException {
      // load root: the root inode's name is the empty string, so its
      // serialized name length must be 0.
      if (in.readShort() != 0) {
        throw new IOException("First node is not root");
      }
      final INodeDirectory root = loadINode(null, false, in, counter)
          .asDirectory();
      // update the root's attributes
      updateRootAttr(root);
    }

    /** Load children nodes for the parent directory. */
    private int loadChildren(INodeDirectory parent, DataInput in,
        Counter counter) throws IOException {
      int numChildren = in.readInt();
      for (int i = 0; i < numChildren; i++) {
        // load single inode
        INode newNode = loadINodeWithLocalName(false, in, true, counter);
        addToParent(parent, newNode);
      }
      return numChildren;
    }

    /**
     * Load a directory when snapshot is supported.
     * @param in The {@link DataInput} instance to read.
     * @param counter Counter to increment for namenode startup progress
     */
    private void loadDirectoryWithSnapshot(DataInput in, Counter counter)
        throws IOException {
      // Step 1. Identify the parent INode
      long inodeId = in.readLong();
      final INodeDirectory parent = this.namesystem.dir.getInode(inodeId)
          .asDirectory();

      // Check if the whole subtree has been saved (for reference nodes)
      boolean toLoadSubtree = referenceMap.toProcessSubtree(parent.getId());
      if (!toLoadSubtree) {
        return;
      }

      // Step 2. Load snapshots if parent is snapshottable
      // (a negative count means the directory is not snapshottable)
      int numSnapshots = in.readInt();
      if (numSnapshots >= 0) {
        // load snapshots and snapshotQuota
        SnapshotFSImageFormat.loadSnapshotList(parent, numSnapshots, in, this);
        if (parent.getDirectorySnapshottableFeature().getSnapshotQuota() > 0) {
          // add the directory to the snapshottable directory list in
          // SnapshotManager. Note that we only add root when its snapshot quota
          // is positive.
          this.namesystem.getSnapshotManager().addSnapshottable(parent);
        }
      }

      // Step 3. Load children nodes under parent
      loadChildren(parent, in, counter);

      // Step 4. load Directory Diff List
      SnapshotFSImageFormat.loadDirectoryDiffList(parent, in, this);

      // Recursively load sub-directories, including snapshot copies of deleted
      // directories
      int numSubTree = in.readInt();
      for (int i = 0; i < numSubTree; i++) {
        loadDirectoryWithSnapshot(in, counter);
      }
    }

    /**
     * Load all children of a directory
     *
     * @param in input to load from
     * @param counter Counter to increment for namenode startup progress
     * @return number of child inodes read
     * @throws IOException
     */
    private int loadDirectory(DataInput in, Counter counter) throws IOException {
      String parentPath = FSImageSerialization.readString(in);
      // Rename .snapshot paths if we're doing an upgrade
      parentPath = renameReservedPathsOnUpgrade(parentPath, getLayoutVersion());
      final INodeDirectory parent = INodeDirectory.valueOf(
          namesystem.dir.getINode(parentPath, DirOp.READ), parentPath);
      return loadChildren(parent, in, counter);
    }

    /**
     * load fsimage files assuming full path names are stored
     *
     * @param numFiles total number of files to load
     * @param in data input stream
     * @param counter Counter to increment for namenode startup progress
     * @throws IOException if any error occurs
     */
    private void loadFullNameINodes(long numFiles, DataInput in, Counter counter)
        throws IOException {
      byte[][] pathComponents;
      byte[][] parentPath = {{}};
      FSDirectory fsDir = namesystem.dir;
      INodeDirectory parentINode = fsDir.rootDir;
      for (long i = 0; i < numFiles; i++) {
        pathComponents = FSImageSerialization.readPathComponents(in);
        for (int j=0; j < pathComponents.length; j++) {
          byte[] newComponent = renameReservedComponentOnUpgrade
              (pathComponents[j], getLayoutVersion());
          if (!Arrays.equals(newComponent, pathComponents[j])) {
            // Capture the old path for logging before mutating the component.
            String oldPath = DFSUtil.byteArray2PathString(pathComponents);
            pathComponents[j] = newComponent;
            String newPath = DFSUtil.byteArray2PathString(pathComponents);
            LOG.info("Renaming reserved path " + oldPath + " to " + newPath);
          }
        }
        final INode newNode = loadINode(
            pathComponents[pathComponents.length-1], false, in, counter);

        if (isRoot(pathComponents)) { // it is the root
          // update the root's attributes
          updateRootAttr(newNode.asDirectory());
          continue;
        }

        namesystem.dir.addToInodeMap(newNode);
        // check if the new inode belongs to the same parent
        if(!isParent(pathComponents, parentPath)) {
          parentINode = getParentINodeDirectory(pathComponents);
          parentPath = getParent(pathComponents);
        }

        // add new inode
        addToParent(parentINode, newNode);
      }
    }

    /**
     * Resolve the parent directory of the given path components.
     * @return the parent INodeDirectory, or null when the path is the root.
     */
    private INodeDirectory getParentINodeDirectory(byte[][] pathComponents)
        throws IOException {
      if (pathComponents.length < 2) { // root
        return null;
      }
      // Gets the parent INode
      final INodesInPath inodes =
          namesystem.dir.getINodesInPath(pathComponents, DirOp.WRITE);
      return INodeDirectory.valueOf(inodes.getINode(-2), pathComponents);
    }

    /**
     * Add the child node to parent and, if child is a file, update block map.
     * This method is only used for image loading so that synchronization,
     * modification time update and space count update are not needed.
     */
    private void addToParent(INodeDirectory parent, INode child)
        throws IllegalReservedPathException {
      FSDirectory fsDir = namesystem.dir;
      if (parent == fsDir.rootDir) {
        child.setLocalName(renameReservedRootComponentOnUpgrade(
            child.getLocalNameBytes(), getLayoutVersion()));
      }
      // NOTE: This does not update space counts for parents
      if (!parent.addChild(child)) {
        // addChild returned false (presumably a name clash) — skip quietly.
        return;
      }
      namesystem.dir.cacheName(child);

      if (child.isFile()) {
        updateBlocksMap(child.asFile());
      }
    }

    /** Register every block of the file with the BlockManager's block map. */
    public void updateBlocksMap(INodeFile file) {
      // Add file->block mapping
      final BlockInfo[] blocks = file.getBlocks();
      if (blocks != null) {
        final BlockManager bm = namesystem.getBlockManager();
        for (int i = 0; i < blocks.length; i++) {
          file.setBlock(i, bm.addBlockCollection(blocks[i], file));
        }
      }
    }

    /** @return The FSDirectory of the namesystem where the fsimage is loaded */
    public FSDirectory getFSDirectoryInLoading() {
      return namesystem.dir;
    }

    /** Read an inode's local name and then the inode itself. */
    public INode loadINodeWithLocalName(boolean isSnapshotINode, DataInput in,
        boolean updateINodeMap) throws IOException {
      return loadINodeWithLocalName(isSnapshotINode, in, updateINodeMap, null);
    }

    /**
     * Read an inode's local name (renaming reserved components on upgrade)
     * followed by the inode itself, optionally adding it to the inode map.
     */
    public INode loadINodeWithLocalName(boolean isSnapshotINode,
        DataInput in, boolean updateINodeMap, Counter counter)
        throws IOException {
      byte[] localName = FSImageSerialization.readLocalName(in);
      localName =
          renameReservedComponentOnUpgrade(localName, getLayoutVersion());
      INode inode = loadINode(localName, isSnapshotINode, in, counter);
      if (updateINodeMap) {
        namesystem.dir.addToInodeMap(inode);
      }
      return inode;
    }

    /**
     * load an inode from fsimage except for its name
     *
     * @param in data input stream from which image is read
     * @param counter Counter to increment for namenode startup progress
     * @return an inode
     */
    @SuppressWarnings("deprecation")
    INode loadINode(final byte[] localName, boolean isSnapshotINode,
        DataInput in, Counter counter) throws IOException {
      final int imgVersion = getLayoutVersion();
      if (NameNodeLayoutVersion.supports(
          LayoutVersion.Feature.SNAPSHOT, imgVersion)) {
        namesystem.getFSDirectory().verifyINodeName(localName);
      }

      // The inode id is stored in the image only for layouts with
      // ADD_INODE_ID; otherwise allocate a fresh id now.
      long inodeId = NameNodeLayoutVersion.supports(
          LayoutVersion.Feature.ADD_INODE_ID, imgVersion) ? in.readLong()
          : namesystem.dir.allocateNewInodeId();

      final short replication = namesystem.getBlockManager().adjustReplication(
          in.readShort());
      final long modificationTime = in.readLong();
      long atime = 0;
      if (NameNodeLayoutVersion.supports(
          LayoutVersion.Feature.FILE_ACCESS_TIME, imgVersion)) {
        atime = in.readLong();
      }
      final long blockSize = in.readLong();
      // numBlocks doubles as a type tag: >= 0 file, -1 directory,
      // -2 symlink, -3 reference.
      final int numBlocks = in.readInt();

      if (numBlocks >= 0) {
        // file

        // read blocks
        BlockInfo[] blocks = new BlockInfo[numBlocks];
        for (int j = 0; j < numBlocks; j++) {
          blocks[j] = new BlockInfoContiguous(replication);
          blocks[j].readFields(in);
        }

        String clientName = "";
        String clientMachine = "";
        boolean underConstruction = false;
        FileDiffList fileDiffs = null;
        if (NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.SNAPSHOT, imgVersion)) {
          // read diffs
          fileDiffs = SnapshotFSImageFormat.loadFileDiffList(in, this);

          if (isSnapshotINode) {
            underConstruction = in.readBoolean();
            if (underConstruction) {
              clientName = FSImageSerialization.readString(in);
              clientMachine = FSImageSerialization.readString(in);
              // convert the last block to BlockUC
              if (blocks.length > 0) {
                BlockInfo lastBlk = blocks[blocks.length - 1];
                lastBlk.convertToBlockUnderConstruction(
                    HdfsServerConstants.BlockUCState.UNDER_CONSTRUCTION, null);
              }
            }
          }
        }

        final PermissionStatus permissions = PermissionStatus.read(in);

        // return
        if (counter != null) {
          counter.increment();
        }

        final INodeFile file = new INodeFile(inodeId, localName, permissions,
            modificationTime, atime, blocks, replication, blockSize, (byte)0);
        if (underConstruction) {
          file.toUnderConstruction(clientName, clientMachine);
        }
        return fileDiffs == null ? file : new INodeFile(file, fileDiffs);
      } else if (numBlocks == -1) {
        //directory

        //read quotas
        final long nsQuota = in.readLong();
        long dsQuota = -1L;
        if (NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.DISKSPACE_QUOTA, imgVersion)) {
          dsQuota = in.readLong();
        }

        //read snapshot info
        boolean snapshottable = false;
        boolean withSnapshot = false;
        if (NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.SNAPSHOT, imgVersion)) {
          snapshottable = in.readBoolean();
          if (!snapshottable) {
            withSnapshot = in.readBoolean();
          }
        }

        final PermissionStatus permissions = PermissionStatus.read(in);

        //return
        if (counter != null) {
          counter.increment();
        }
        final INodeDirectory dir = new INodeDirectory(inodeId, localName,
            permissions, modificationTime);
        if (nsQuota >= 0 || dsQuota >= 0) {
          dir.addDirectoryWithQuotaFeature(new DirectoryWithQuotaFeature.Builder().
              nameSpaceQuota(nsQuota).storageSpaceQuota(dsQuota).build());
        }
        if (withSnapshot) {
          dir.addSnapshotFeature(null);
        }
        if (snapshottable) {
          dir.addSnapshottableFeature();
        }
        return dir;
      } else if (numBlocks == -2) {
        //symlink
        if (!FileSystem.areSymlinksEnabled()) {
          throw new IOException("Symlinks not supported - please remove symlink before upgrading to this version of HDFS");
        }

        final String symlink = Text.readString(in);
        final PermissionStatus permissions = PermissionStatus.read(in);
        if (counter != null) {
          counter.increment();
        }
        return new INodeSymlink(inodeId, localName, permissions,
            modificationTime, atime, symlink);
      } else if (numBlocks == -3) {
        //reference
        // Intentionally do not increment counter, because it is too difficult at
        // this point to assess whether or not this is a reference that counts
        // toward quota.

        final boolean isWithName = in.readBoolean();
        // lastSnapshotId for WithName node, dstSnapshotId for DstReference node
        int snapshotId = in.readInt();

        final INodeReference.WithCount withCount
            = referenceMap.loadINodeReferenceWithCount(isSnapshotINode, in, this);

        if (isWithName) {
          return new INodeReference.WithName(null, withCount, localName,
              snapshotId);
        } else {
          final INodeReference ref = new INodeReference.DstReference(null,
              withCount, snapshotId);
          return ref;
        }
      }

      throw new IOException("Unknown inode type: numBlocks=" + numBlocks);
    }

    /** Load {@link INodeFileAttributes}.
 */
    public INodeFileAttributes loadINodeFileAttributes(DataInput in)
        throws IOException {
      final int layoutVersion = getLayoutVersion();

      if (!NameNodeLayoutVersion.supports(
          LayoutVersion.Feature.OPTIMIZE_SNAPSHOT_INODES, layoutVersion)) {
        // Older layouts store a full inode here rather than a slim
        // attribute copy.
        return loadINodeWithLocalName(true, in, false).asFile();
      }

      final byte[] name = FSImageSerialization.readLocalName(in);
      final PermissionStatus permissions = PermissionStatus.read(in);
      final long modificationTime = in.readLong();
      final long accessTime = in.readLong();

      final short replication = namesystem.getBlockManager().adjustReplication(
          in.readShort());
      final long preferredBlockSize = in.readLong();

      return new INodeFileAttributes.SnapshotCopy(name, permissions, null, modificationTime,
          accessTime, replication, preferredBlockSize, (byte) 0, null);
    }

    /** Load {@link INodeDirectoryAttributes} for a snapshot copy. */
    public INodeDirectoryAttributes loadINodeDirectoryAttributes(DataInput in)
        throws IOException {
      final int layoutVersion = getLayoutVersion();

      if (!NameNodeLayoutVersion.supports(
          LayoutVersion.Feature.OPTIMIZE_SNAPSHOT_INODES, layoutVersion)) {
        return loadINodeWithLocalName(true, in, false).asDirectory();
      }

      final byte[] name = FSImageSerialization.readLocalName(in);
      final PermissionStatus permissions = PermissionStatus.read(in);
      final long modificationTime = in.readLong();

      // Read quotas: quota by storage type does not need to be processed below.
      // It is handled only in protobuf based FsImagePBINode class for newer
      // fsImages. Tools using this class such as legacy-mode of offline image viewer
      // should only load legacy FSImages without newer features.
      final long nsQuota = in.readLong();
      final long dsQuota = in.readLong();

      return nsQuota == -1L && dsQuota == -1L ?
          new INodeDirectoryAttributes.SnapshotCopy(
              name, permissions, null, modificationTime, null)
          : new INodeDirectoryAttributes.CopyWithQuota(name, permissions,
              null, modificationTime, nsQuota, dsQuota, null, null);
    }

    /**
     * Load the "files under construction" section and re-establish the
     * under-construction state (and, outside snapshots, the lease) on the
     * corresponding already-loaded inodes.
     */
    private void loadFilesUnderConstruction(DataInput in,
        boolean supportSnapshot, Counter counter) throws IOException {
      FSDirectory fsDir = namesystem.dir;
      int size = in.readInt();

      LOG.info("Number of files under construction = " + size);

      for (int i = 0; i < size; i++) {
        INodeFile cons = FSImageSerialization.readINodeUnderConstruction(in,
            namesystem, getLayoutVersion());
        counter.increment();

        // verify that file exists in namespace
        String path = cons.getLocalName();
        INodeFile oldnode = null;
        boolean inSnapshot = false;
        if (path != null && FSDirectory.isReservedName(path) &&
            NameNodeLayoutVersion.supports(
                LayoutVersion.Feature.ADD_INODE_ID, getLayoutVersion())) {
          // TODO: for HDFS-5428, we use reserved path for those INodeFileUC in
          // snapshot. If we support INode ID in the layout version, we can use
          // the inode id to find the oldnode.
948 oldnode = namesystem.dir.getInode(cons.getId()).asFile(); 949 inSnapshot = true; 950 } else { 951 path = renameReservedPathsOnUpgrade(path, getLayoutVersion()); 952 final INodesInPath iip = fsDir.getINodesInPath(path, DirOp.WRITE); 953 oldnode = INodeFile.valueOf(iip.getLastINode(), path); 954 } 955 956 FileUnderConstructionFeature uc = cons.getFileUnderConstructionFeature(); 957 oldnode.toUnderConstruction(uc.getClientName(), uc.getClientMachine()); 958 if (oldnode.numBlocks() > 0) { 959 BlockInfo ucBlock = cons.getLastBlock(); 960 // we do not replace the inode, just replace the last block of oldnode 961 BlockInfo info = namesystem.getBlockManager().addBlockCollection( 962 ucBlock, oldnode); 963 oldnode.setBlock(oldnode.numBlocks() - 1, info); 964 } 965 966 if (!inSnapshot) { 967 namesystem.leaseManager.addLease(uc.getClientName(), oldnode.getId()); 968 } 969 } 970 } 971 972 private void loadSecretManagerState(DataInput in) 973 throws IOException { 974 int imgVersion = getLayoutVersion(); 975 976 if (!NameNodeLayoutVersion.supports( 977 LayoutVersion.Feature.DELEGATION_TOKEN, imgVersion)) { 978 //SecretManagerState is not available. 979 //This must not happen if security is turned on. 
980 return; 981 } 982 namesystem.loadSecretManagerStateCompat(in); 983 } 984 985 private void loadCacheManagerState(DataInput in) throws IOException { 986 int imgVersion = getLayoutVersion(); 987 if (!NameNodeLayoutVersion.supports( 988 LayoutVersion.Feature.CACHING, imgVersion)) { 989 return; 990 } 991 namesystem.getCacheManager().loadStateCompat(in); 992 } 993 994 private int getLayoutVersion() { 995 return namesystem.getFSImage().getStorage().getLayoutVersion(); 996 } 997 998 private boolean isRoot(byte[][] path) { 999 return path.length == 1 && 1000 path[0] == null; 1001 } 1002 1003 private boolean isParent(byte[][] path, byte[][] parent) { 1004 if (path == null || parent == null) 1005 return false; 1006 if (parent.length == 0 || path.length != parent.length + 1) 1007 return false; 1008 boolean isParent = true; 1009 for (int i = 0; i < parent.length; i++) { 1010 isParent = isParent && Arrays.equals(path[i], parent[i]); 1011 } 1012 return isParent; 1013 } 1014 1015 /** 1016 * Return string representing the parent of the given path. 1017 */ 1018 String getParent(String path) { 1019 return path.substring(0, path.lastIndexOf(Path.SEPARATOR)); 1020 } 1021 1022 byte[][] getParent(byte[][] path) { 1023 byte[][] result = new byte[path.length - 1][]; 1024 for (int i = 0; i < result.length; i++) { 1025 result[i] = new byte[path[i].length]; 1026 System.arraycopy(path[i], 0, result[i], 0, path[i].length); 1027 } 1028 return result; 1029 } 1030 1031 public Snapshot getSnapshot(DataInput in) throws IOException { 1032 return snapshotMap.get(in.readInt()); 1033 } 1034 } 1035 1036 @VisibleForTesting 1037 public static final TreeMap<String, String> renameReservedMap = 1038 new TreeMap<String, String>(); 1039 1040 /** 1041 * Use the default key-value pairs that will be used to determine how to 1042 * rename reserved paths on upgrade. 
1043 */ 1044 @VisibleForTesting 1045 public static void useDefaultRenameReservedPairs() { 1046 renameReservedMap.clear(); 1047 for (String key: HdfsServerConstants.RESERVED_PATH_COMPONENTS) { 1048 renameReservedMap.put( 1049 key, 1050 key + "." + HdfsServerConstants.NAMENODE_LAYOUT_VERSION + "." 1051 + "UPGRADE_RENAMED"); 1052 } 1053 } 1054 1055 /** 1056 * Set the key-value pairs that will be used to determine how to rename 1057 * reserved paths on upgrade. 1058 */ 1059 @VisibleForTesting 1060 public static void setRenameReservedPairs(String renameReserved) { 1061 // Clear and set the default values 1062 useDefaultRenameReservedPairs(); 1063 // Overwrite with provided values 1064 setRenameReservedMapInternal(renameReserved); 1065 } 1066 1067 private static void setRenameReservedMapInternal(String renameReserved) { 1068 Collection<String> pairs = 1069 StringUtils.getTrimmedStringCollection(renameReserved); 1070 for (String p : pairs) { 1071 String[] pair = StringUtils.split(p, '/', '='); 1072 Preconditions.checkArgument(pair.length == 2, 1073 "Could not parse key-value pair " + p); 1074 String key = pair[0]; 1075 String value = pair[1]; 1076 Preconditions.checkArgument(DFSUtil.isReservedPathComponent(key), 1077 "Unknown reserved path " + key); 1078 Preconditions.checkArgument(DFSUtil.isValidNameForComponent(value), 1079 "Invalid rename path for " + key + ": " + value); 1080 LOG.info("Will rename reserved path " + key + " to " + value); 1081 renameReservedMap.put(key, value); 1082 } 1083 } 1084 1085 /** 1086 * When upgrading from an old version, the filesystem could contain paths 1087 * that are now reserved in the new version (e.g. .snapshot). This renames 1088 * these new reserved paths to a user-specified value to avoid collisions 1089 * with the reserved name. 
   *
   * @param path Old path potentially containing a reserved path
   * @param layoutVersion layout version of the image being loaded; decides
   *        which reserved names can collide
   * @return New path with reserved path components renamed to user value
   * @throws IllegalReservedPathException if a reserved path is present but no
   *         rename rule was configured for it
   */
  static String renameReservedPathsOnUpgrade(String path,
      final int layoutVersion) throws IllegalReservedPathException {
    final String oldPath = path;
    // If any known LVs aren't supported, we're doing an upgrade
    if (!NameNodeLayoutVersion.supports(Feature.ADD_INODE_ID, layoutVersion)) {
      String[] components = INode.getPathNames(path);
      // Only need to worry about the root directory
      // ("/.reserved" can only collide as the first component).
      if (components.length > 1) {
        components[1] = DFSUtil.bytes2String(
            renameReservedRootComponentOnUpgrade(
                DFSUtil.string2Bytes(components[1]),
                layoutVersion));
        path = DFSUtil.strings2PathString(components);
      }
    }
    if (!NameNodeLayoutVersion.supports(Feature.SNAPSHOT, layoutVersion)) {
      String[] components = INode.getPathNames(path);
      // Special case the root path
      if (components.length == 0) {
        return path;
      }
      // ".snapshot" may occur at any depth, so every component is checked.
      for (int i = 0; i < components.length; i++) {
        components[i] = DFSUtil.bytes2String(
            renameReservedComponentOnUpgrade(
                DFSUtil.string2Bytes(components[i]),
                layoutVersion));
      }
      path = DFSUtil.strings2PathString(components);
    }

    if (!path.equals(oldPath)) {
      LOG.info("Upgrade process renamed reserved path " + oldPath + " to "
          + path);
    }
    return path;
  }

  // Error shown when a reserved path is encountered during upgrade but no
  // rename rule was configured for it.
  private final static String RESERVED_ERROR_MSG =
      FSDirectory.DOT_RESERVED_PATH_PREFIX + " is a reserved path and "
      + HdfsConstants.DOT_SNAPSHOT_DIR + " is a reserved path component in"
      + " this version of HDFS. Please rollback and delete or rename"
      + " this path, or upgrade with the "
      + StartupOption.RENAMERESERVED.getName()
      + " [key-value pairs]"
      + " option to automatically rename these paths during upgrade.";

  /**
   * Same as {@link #renameReservedPathsOnUpgrade(String, int)}, but for a
   * single byte array path component.
   *
   * @param component path component that may equal ".snapshot"
   * @param layoutVersion layout version of the image being loaded
   * @return the (possibly renamed) component
   * @throws IllegalReservedPathException if ".snapshot" is present but no
   *         rename rule was configured
   */
  private static byte[] renameReservedComponentOnUpgrade(byte[] component,
      final int layoutVersion) throws IllegalReservedPathException {
    // If the LV doesn't support snapshots, we're doing an upgrade
    if (!NameNodeLayoutVersion.supports(Feature.SNAPSHOT, layoutVersion)) {
      if (Arrays.equals(component, HdfsServerConstants.DOT_SNAPSHOT_DIR_BYTES)) {
        if (!renameReservedMap.containsKey(HdfsConstants.DOT_SNAPSHOT_DIR)) {
          throw new IllegalReservedPathException(RESERVED_ERROR_MSG);
        }
        component =
            DFSUtil.string2Bytes(renameReservedMap
                .get(HdfsConstants.DOT_SNAPSHOT_DIR));
      }
    }
    return component;
  }

  /**
   * Same as {@link #renameReservedPathsOnUpgrade(String, int)}, but for a
   * single byte array path component.
1163 */ 1164 private static byte[] renameReservedRootComponentOnUpgrade(byte[] component, 1165 final int layoutVersion) throws IllegalReservedPathException { 1166 // If the LV doesn't support inode IDs, we're doing an upgrade 1167 if (!NameNodeLayoutVersion.supports(Feature.ADD_INODE_ID, layoutVersion)) { 1168 if (Arrays.equals(component, FSDirectory.DOT_RESERVED)) { 1169 if (!renameReservedMap.containsKey(HdfsConstants.DOT_SNAPSHOT_DIR)) { 1170 throw new IllegalReservedPathException(RESERVED_ERROR_MSG); 1171 } 1172 final String renameString = renameReservedMap 1173 .get(FSDirectory.DOT_RESERVED_STRING); 1174 component = 1175 DFSUtil.string2Bytes(renameString); 1176 LOG.info("Renamed root path " + FSDirectory.DOT_RESERVED_STRING 1177 + " to " + renameString); 1178 } 1179 } 1180 return component; 1181 } 1182 1183 /** 1184 * A one-shot class responsible for writing an image file. 1185 * The write() function should be called once, after which the getter 1186 * functions may be used to retrieve information about the file that was written. 1187 * 1188 * This is replaced by the PB-based FSImage. The class is to maintain 1189 * compatibility for the external fsimage tool. 
   */
  @Deprecated
  static class Saver {
    /** Layout version written by this legacy (pre-protobuf) saver. */
    private static final int LAYOUT_VERSION = -51;
    /** Number of inodes saved between cancellation checks. */
    public static final int CHECK_CANCEL_INTERVAL = 4096;
    private final SaveNamespaceContext context;
    /** Set to true once an image has been written */
    private boolean saved = false;
    /** Running count of saved children; paces the cancellation checks. */
    private long checkCancelCounter = 0;

    /** The MD5 checksum of the file that was written */
    private MD5Hash savedDigest;
    private final ReferenceMap referenceMap = new ReferenceMap();

    /** Under-construction files found only inside snapshots, by inode id. */
    private final Map<Long, INodeFile> snapshotUCMap =
        new HashMap<Long, INodeFile>();

    /** @throws IllegalStateException if the instance has not yet saved an image */
    private void checkSaved() {
      if (!saved) {
        throw new IllegalStateException("FSImageSaver has not saved an image");
      }
    }

    /** @throws IllegalStateException if the instance has already saved an image */
    private void checkNotSaved() {
      if (saved) {
        throw new IllegalStateException("FSImageSaver has already saved an image");
      }
    }


    Saver(SaveNamespaceContext context) {
      this.context = context;
    }

    /**
     * Return the MD5 checksum of the image file that was saved.
     */
    MD5Hash getSavedDigest() {
      checkSaved();
      return savedDigest;
    }

    /**
     * Writes the namespace to {@code newFile} in the legacy image format.
     * May be called at most once per Saver instance.
     *
     * @param newFile destination image file
     * @param compression codec used to wrap the inode section
     * @throws IOException on write error or if the save is cancelled
     */
    void save(File newFile, FSImageCompression compression) throws IOException {
      checkNotSaved();

      final FSNamesystem sourceNamesystem = context.getSourceNamesystem();
      final INodeDirectory rootDir = sourceNamesystem.dir.rootDir;
      // Total inode count, taken from the root's quota feature accounting.
      final long numINodes = rootDir.getDirectoryWithQuotaFeature()
          .getSpaceConsumed().getNameSpace();
      String sdPath = newFile.getParentFile().getParentFile().getAbsolutePath();
      Step step = new Step(StepType.INODES, sdPath);
      StartupProgress prog = NameNode.getStartupProgress();
      prog.beginStep(Phase.SAVING_CHECKPOINT, step);
      prog.setTotal(Phase.SAVING_CHECKPOINT, step, numINodes);
      Counter counter = prog.getCounter(Phase.SAVING_CHECKPOINT, step);
      long startTime = monotonicNow();
      //
      // Write out data
      //
      MessageDigest digester = MD5Hash.getDigester();
      FileOutputStream fout = new FileOutputStream(newFile);
      DigestOutputStream fos = new DigestOutputStream(fout, digester);
      DataOutputStream out = new DataOutputStream(fos);
      try {
        // Uncompressed header: layout version, flags, namespace id, counts,
        // generation stamps, txid, last inode id, snapshot manager state.
        out.writeInt(LAYOUT_VERSION);
        LayoutFlags.write(out);
        // We use the non-locked version of getNamespaceInfo here since
        // the coordinating thread of saveNamespace already has read-locked
        // the namespace for us. If we attempt to take another readlock
        // from the actual saver thread, there's a potential of a
        // fairness-related deadlock. See the comments on HDFS-2223.
        out.writeInt(sourceNamesystem.unprotectedGetNamespaceInfo()
            .getNamespaceID());
        out.writeLong(numINodes);
        out.writeLong(sourceNamesystem.getBlockIdManager().getGenerationStampV1());
        out.writeLong(sourceNamesystem.getBlockIdManager().getGenerationStampV2());
        out.writeLong(sourceNamesystem.getBlockIdManager().getGenerationStampAtblockIdSwitch());
        out.writeLong(sourceNamesystem.getBlockIdManager().getLastAllocatedBlockId());
        out.writeLong(context.getTxId());
        out.writeLong(sourceNamesystem.dir.getLastInodeId());


        sourceNamesystem.getSnapshotManager().write(out);

        // write compression info and set up compressed stream
        out = compression.writeHeaderAndWrapStream(fos);
        LOG.info("Saving image file " + newFile +
                 " using " + compression);

        // save the root
        saveINode2Image(rootDir, out, false, referenceMap, counter);
        // save the rest of the nodes
        saveImage(rootDir, out, true, false, counter);
        prog.endStep(Phase.SAVING_CHECKPOINT, step);
        // Now that the step is finished, set counter equal to total to adjust
        // for possible under-counting due to reference inodes.
        prog.setCount(Phase.SAVING_CHECKPOINT, step, numINodes);
        // save files under construction
        // TODO: for HDFS-5428, since we cannot break the compatibility of
        // fsimage, we store part of the under-construction files that are only
        // in snapshots in this "under-construction-file" section. As a
        // temporary solution, we use "/.reserved/.inodes/<inodeid>" as their
        // paths, so that when loading fsimage we do not put them into the lease
        // map. In the future, we can remove this hack when we can bump the
        // layout version.
        saveFilesUnderConstruction(sourceNamesystem, out, snapshotUCMap);

        context.checkCancelled();
        sourceNamesystem.saveSecretManagerStateCompat(out, sdPath);
        context.checkCancelled();
        sourceNamesystem.getCacheManager().saveStateCompat(out, sdPath);
        context.checkCancelled();
        out.flush();
        context.checkCancelled();
        // Force file contents to disk before recording success.
        fout.getChannel().force(true);
      } finally {
        out.close();
      }

      saved = true;
      // set md5 of the saved image
      savedDigest = new MD5Hash(digester.digest());

      LOG.info("Image file " + newFile + " of size " + newFile.length()
          + " bytes saved in " + (monotonicNow() - startTime) / 1000
          + " seconds.");
    }

    /**
     * Save children INodes.
     * @param children The list of children INodes
     * @param out The DataOutputStream to write
     * @param inSnapshot Whether the parent directory or its ancestor is in
     *                   the deleted list of some snapshot (caused by rename or
     *                   deletion)
     * @param counter Counter to increment for namenode startup progress
     * @return Number of children that are directory
     */
    private int saveChildren(ReadOnlyList<INode> children,
        DataOutputStream out, boolean inSnapshot, Counter counter)
        throws IOException {
      // Write normal children INode.
      out.writeInt(children.size());
      int dirNum = 0;
      for(INode child : children) {
        // print all children first
        // TODO: for HDFS-5428, we cannot change the format/content of fsimage
        // here, thus even if the parent directory is in snapshot, we still
        // do not handle INodeUC as those stored in deleted list
        saveINode2Image(child, out, false, referenceMap, counter);
        if (child.isDirectory()) {
          dirNum++;
        } else if (inSnapshot && child.isFile()
            && child.asFile().isUnderConstruction()) {
          // Remember snapshot-only UC files; they are written later in the
          // under-construction section by saveFilesUnderConstruction().
          this.snapshotUCMap.put(child.getId(), child.asFile());
        }
        if (checkCancelCounter++ % CHECK_CANCEL_INTERVAL == 0) {
          context.checkCancelled();
        }
      }
      return dirNum;
    }

    /**
     * Save file tree image starting from the given root.
     * This is a recursive procedure, which first saves all children and
     * snapshot diffs of a current directory and then moves inside the
     * sub-directories.
     *
     * @param current The current node
     * @param out The DataoutputStream to write the image
     * @param toSaveSubtree Whether or not to save the subtree to fsimage. For
     *                      reference node, its subtree may already have been
     *                      saved before.
     * @param inSnapshot Whether the current directory is in snapshot
     * @param counter Counter to increment for namenode startup progress
     */
    private void saveImage(INodeDirectory current, DataOutputStream out,
        boolean toSaveSubtree, boolean inSnapshot, Counter counter)
        throws IOException {
      // 1. Write the inode id of the directory.
      out.writeLong(current.getId());

      if (!toSaveSubtree) {
        return;
      }

      final ReadOnlyList<INode> children = current
          .getChildrenList(Snapshot.CURRENT_STATE_ID);
      int dirNum = 0;
      List<INodeDirectory> snapshotDirs = null;
      DirectoryWithSnapshotFeature sf = current.getDirectoryWithSnapshotFeature();
      if (sf != null) {
        snapshotDirs = new ArrayList<INodeDirectory>();
        sf.getSnapshotDirectory(snapshotDirs);
        dirNum += snapshotDirs.size();
      }

      // 2. Write INodeDirectorySnapshottable#snapshotsByNames to record all
      // Snapshots
      if (current.isDirectory() && current.asDirectory().isSnapshottable()) {
        SnapshotFSImageFormat.saveSnapshots(current.asDirectory(), out);
      } else {
        out.writeInt(-1); // # of snapshots
      }

      // 3. Write children INode
      dirNum += saveChildren(children, out, inSnapshot, counter);

      // 4. Write DirectoryDiff lists, if there is any.
      SnapshotFSImageFormat.saveDirectoryDiffList(current, out, referenceMap);

      // Write sub-tree of sub-directories, including possible snapshots of
      // deleted sub-directories
      out.writeInt(dirNum); // the number of sub-directories
      for(INode child : children) {
        if(!child.isDirectory()) {
          continue;
        }
        // make sure we only save the subtree under a reference node once
        boolean toSave = child.isReference() ?
            referenceMap.toProcessSubtree(child.getId()) : true;
        saveImage(child.asDirectory(), out, toSave, inSnapshot, counter);
      }
      if (snapshotDirs != null) {
        for (INodeDirectory subDir : snapshotDirs) {
          // make sure we only save the subtree under a reference node once
          boolean toSave = subDir.getParentReference() != null ?
              referenceMap.toProcessSubtree(subDir.getId()) : true;
          saveImage(subDir, out, toSave, true, counter);
        }
      }
    }

    /**
     * Saves inode and increments progress counter.
     *
     * @param inode INode to save
     * @param out DataOutputStream to receive inode
     * @param writeUnderConstruction boolean true if this is under construction
     * @param referenceMap ReferenceMap containing reference inodes
     * @param counter Counter to increment for namenode startup progress
     * @throws IOException thrown if there is an I/O error
     */
    private void saveINode2Image(INode inode, DataOutputStream out,
        boolean writeUnderConstruction, ReferenceMap referenceMap,
        Counter counter) throws IOException {
      FSImageSerialization.saveINode2Image(inode, out, writeUnderConstruction,
          referenceMap);
      // Intentionally do not increment counter for reference inodes, because it
      // is too difficult at this point to assess whether or not this is a
      // reference that counts toward quota.
      if (!(inode instanceof INodeReference)) {
        counter.increment();
      }
    }

    /**
     * Serializes leases.
     *
     * @param fsn namesystem whose lease manager supplies the live UC files
     * @param out stream to write the section to
     * @param snapshotUCMap UC files that exist only in snapshots; entries that
     *                      are also live in the namespace are removed here
     * @throws IOException on write error
     */
    void saveFilesUnderConstruction(FSNamesystem fsn, DataOutputStream out,
        Map<Long, INodeFile> snapshotUCMap) throws IOException {
      // This is run by an inferior thread of saveNamespace, which holds a read
      // lock on our behalf. If we took the read lock here, we could block
      // for fairness if a writer is waiting on the lock.
      final LeaseManager leaseManager = fsn.getLeaseManager();
      final FSDirectory dir = fsn.getFSDirectory();
      synchronized (leaseManager) {
        Collection<Long> filesWithUC = leaseManager.getINodeIdWithLeases();
        for (Long id : filesWithUC) {
          // TODO: for HDFS-5428, because of rename operations, some
          // under-construction files that are
          // in the current fs directory can also be captured in the
          // snapshotUCMap. We should remove them from the snapshotUCMap.
          snapshotUCMap.remove(id);
        }
        out.writeInt(filesWithUC.size() + snapshotUCMap.size()); // write the size

        for (Long id : filesWithUC) {
          INodeFile file = dir.getInode(id).asFile();
          String path = file.getFullPathName();
          FSImageSerialization.writeINodeUnderConstruction(
              out, file, path);
        }

        for (Map.Entry<Long, INodeFile> entry : snapshotUCMap.entrySet()) {
          // for those snapshot INodeFileUC, we use "/.reserved/.inodes/<inodeid>"
          // as their paths
          StringBuilder b = new StringBuilder();
          b.append(FSDirectory.DOT_RESERVED_PATH_PREFIX)
              .append(Path.SEPARATOR).append(FSDirectory.DOT_INODES_STRING)
              .append(Path.SEPARATOR).append(entry.getValue().getId());
          FSImageSerialization.writeINodeUnderConstruction(
              out, entry.getValue(), b.toString());
        }
      }
    }
  }
}