001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hdfs.server.namenode;
019
020import static org.apache.hadoop.util.Time.monotonicNow;
021
022import java.io.DataInput;
023import java.io.DataInputStream;
024import java.io.DataOutputStream;
025import java.io.File;
026import java.io.FileInputStream;
027import java.io.FileOutputStream;
028import java.io.IOException;
029import java.security.DigestInputStream;
030import java.security.DigestOutputStream;
031import java.security.MessageDigest;
032import java.util.ArrayList;
033import java.util.Arrays;
034import java.util.Collection;
035import java.util.HashMap;
036import java.util.List;
037import java.util.Map;
038import java.util.TreeMap;
039
040import org.apache.commons.logging.Log;
041import org.apache.hadoop.classification.InterfaceAudience;
042import org.apache.hadoop.classification.InterfaceStability;
043import org.apache.hadoop.conf.Configuration;
044import org.apache.hadoop.fs.FileSystem;
045import org.apache.hadoop.fs.Path;
046import org.apache.hadoop.fs.permission.PermissionStatus;
047import org.apache.hadoop.hdfs.DFSUtil;
048import org.apache.hadoop.hdfs.protocol.HdfsConstants;
049import org.apache.hadoop.hdfs.protocol.LayoutFlags;
050import org.apache.hadoop.hdfs.protocol.LayoutVersion;
051import org.apache.hadoop.hdfs.protocol.LayoutVersion.Feature;
052import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfo;
053import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoContiguous;
054import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;
055import org.apache.hadoop.hdfs.server.common.HdfsServerConstants;
056import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption;
057import org.apache.hadoop.hdfs.server.common.InconsistentFSStateException;
058import org.apache.hadoop.hdfs.server.namenode.FSDirectory.DirOp;
059import org.apache.hadoop.hdfs.server.namenode.snapshot.DirectoryWithSnapshotFeature;
060import org.apache.hadoop.hdfs.server.namenode.snapshot.FileDiffList;
061import org.apache.hadoop.hdfs.server.namenode.snapshot.Snapshot;
062import org.apache.hadoop.hdfs.server.namenode.snapshot.SnapshotFSImageFormat;
063import org.apache.hadoop.hdfs.server.namenode.snapshot.SnapshotFSImageFormat.ReferenceMap;
064import org.apache.hadoop.hdfs.server.namenode.startupprogress.Phase;
065import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress;
066import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress.Counter;
067import org.apache.hadoop.hdfs.server.namenode.startupprogress.Step;
068import org.apache.hadoop.hdfs.server.namenode.startupprogress.StepType;
069import org.apache.hadoop.hdfs.util.ReadOnlyList;
070import org.apache.hadoop.io.IOUtils;
071import org.apache.hadoop.io.MD5Hash;
072import org.apache.hadoop.io.Text;
073import org.apache.hadoop.util.StringUtils;
074
075import com.google.common.annotations.VisibleForTesting;
076import com.google.common.base.Preconditions;
077
078/**
079 * Contains inner classes for reading or writing the on-disk format for
080 * FSImages.
081 *
082 * In particular, the format of the FSImage looks like:
083 * <pre>
084 * FSImage {
085 *   layoutVersion: int, namespaceID: int, numberItemsInFSDirectoryTree: long,
086 *   namesystemGenerationStampV1: long, namesystemGenerationStampV2: long,
 *   generationStampAtBlockIdSwitch: long, lastAllocatedBlockId: long,
 *   transactionID: long, snapshotCounter: int, numberOfSnapshots: int,
089 *   numOfSnapshottableDirs: int,
090 *   {FSDirectoryTree, FilesUnderConstruction, SecretManagerState} (can be compressed)
091 * }
092 *
093 * FSDirectoryTree (if {@link Feature#FSIMAGE_NAME_OPTIMIZATION} is supported) {
094 *   INodeInfo of root, numberOfChildren of root: int
095 *   [list of INodeInfo of root's children],
096 *   [list of INodeDirectoryInfo of root's directory children]
097 * }
098 *
099 * FSDirectoryTree (if {@link Feature#FSIMAGE_NAME_OPTIMIZATION} not supported){
100 *   [list of INodeInfo of INodes in topological order]
101 * }
102 *
103 * INodeInfo {
104 *   {
105 *     localName: short + byte[]
106 *   } when {@link Feature#FSIMAGE_NAME_OPTIMIZATION} is supported
107 *   or
108 *   {
109 *     fullPath: byte[]
110 *   } when {@link Feature#FSIMAGE_NAME_OPTIMIZATION} is not supported
111 *   replicationFactor: short, modificationTime: long,
112 *   accessTime: long, preferredBlockSize: long,
113 *   numberOfBlocks: int (-1 for INodeDirectory, -2 for INodeSymLink),
114 *   {
115 *     nsQuota: long, dsQuota: long,
116 *     {
117 *       isINodeSnapshottable: byte,
118 *       isINodeWithSnapshot: byte (if isINodeSnapshottable is false)
119 *     } (when {@link Feature#SNAPSHOT} is supported),
120 *     fsPermission: short, PermissionStatus
121 *   } for INodeDirectory
122 *   or
123 *   {
124 *     symlinkString, fsPermission: short, PermissionStatus
125 *   } for INodeSymlink
126 *   or
127 *   {
128 *     [list of BlockInfo]
129 *     [list of FileDiff]
130 *     {
131 *       isINodeFileUnderConstructionSnapshot: byte,
132 *       {clientName: short + byte[], clientMachine: short + byte[]} (when
133 *       isINodeFileUnderConstructionSnapshot is true),
134 *     } (when {@link Feature#SNAPSHOT} is supported and writing snapshotINode),
135 *     fsPermission: short, PermissionStatus
136 *   } for INodeFile
137 * }
138 *
139 * INodeDirectoryInfo {
140 *   fullPath of the directory: short + byte[],
141 *   numberOfChildren: int, [list of INodeInfo of children INode],
142 *   {
143 *     numberOfSnapshots: int,
144 *     [list of Snapshot] (when NumberOfSnapshots is positive),
145 *     numberOfDirectoryDiffs: int,
 *     [list of DirectoryDiff] (when NumberOfDirectoryDiffs is positive),
147 *     number of children that are directories,
148 *     [list of INodeDirectoryInfo of the directory children] (includes
149 *     snapshot copies of deleted sub-directories)
150 *   } (when {@link Feature#SNAPSHOT} is supported),
151 * }
152 *
153 * Snapshot {
154 *   snapshotID: int, root of Snapshot: INodeDirectoryInfo (its local name is
155 *   the name of the snapshot)
156 * }
157 *
158 * DirectoryDiff {
159 *   full path of the root of the associated Snapshot: short + byte[],
160 *   childrenSize: int,
161 *   isSnapshotRoot: byte,
162 *   snapshotINodeIsNotNull: byte (when isSnapshotRoot is false),
163 *   snapshotINode: INodeDirectory (when SnapshotINodeIsNotNull is true), Diff
164 * }
165 *
166 * Diff {
167 *   createdListSize: int, [Local name of INode in created list],
168 *   deletedListSize: int, [INode in deleted list: INodeInfo]
169 * }
170 *
171 * FileDiff {
172 *   full path of the root of the associated Snapshot: short + byte[],
173 *   fileSize: long,
174 *   snapshotINodeIsNotNull: byte,
175 *   snapshotINode: INodeFile (when SnapshotINodeIsNotNull is true), Diff
176 * }
177 * </pre>
178 */
179@InterfaceAudience.Private
180@InterfaceStability.Evolving
181public class FSImageFormat {
182  private static final Log LOG = FSImage.LOG;
183
184  // Static-only class
185  private FSImageFormat() {}
186
  /**
   * Common interface implemented by both the legacy and the protobuf-based
   * image loaders, exposing metadata about a successfully loaded image.
   */
  interface AbstractLoader {
    /** @return the MD5 checksum of the image file that was loaded. */
    MD5Hash getLoadedImageMd5();
    /** @return the transaction ID of the last edit covered by the loaded image. */
    long getLoadedImageTxId();
  }
191
192  static class LoaderDelegator implements AbstractLoader {
193    private AbstractLoader impl;
194    private final Configuration conf;
195    private final FSNamesystem fsn;
196
197    LoaderDelegator(Configuration conf, FSNamesystem fsn) {
198      this.conf = conf;
199      this.fsn = fsn;
200    }
201
202    @Override
203    public MD5Hash getLoadedImageMd5() {
204      return impl.getLoadedImageMd5();
205    }
206
207    @Override
208    public long getLoadedImageTxId() {
209      return impl.getLoadedImageTxId();
210    }
211
212    public void load(File file, boolean requireSameLayoutVersion)
213        throws IOException {
214      Preconditions.checkState(impl == null, "Image already loaded!");
215
216      FileInputStream is = null;
217      try {
218        is = new FileInputStream(file);
219        byte[] magic = new byte[FSImageUtil.MAGIC_HEADER.length];
220        IOUtils.readFully(is, magic, 0, magic.length);
221        if (Arrays.equals(magic, FSImageUtil.MAGIC_HEADER)) {
222          FSImageFormatProtobuf.Loader loader = new FSImageFormatProtobuf.Loader(
223              conf, fsn, requireSameLayoutVersion);
224          impl = loader;
225          loader.load(file);
226        } else {
227          Loader loader = new Loader(conf, fsn);
228          impl = loader;
229          loader.load(file);
230        }
231      } finally {
232        IOUtils.cleanup(LOG, is);
233      }
234    }
235  }
236
237  /**
238   * Construct a loader class to load the image. It chooses the loader based on
239   * the layout version.
240   */
241  public static LoaderDelegator newLoader(Configuration conf, FSNamesystem fsn) {
242    return new LoaderDelegator(conf, fsn);
243  }
244
245  /**
246   * A one-shot class responsible for loading an image. The load() function
247   * should be called once, after which the getter methods may be used to retrieve
248   * information about the image that was loaded, if loading was successful.
249   */
250  public static class Loader implements AbstractLoader {
    private final Configuration conf;
    /** which namesystem this loader is working for */
    private final FSNamesystem namesystem;

    /** Set to true once a file has been loaded using this loader. */
    private boolean loaded = false;

    /** The transaction ID of the last edit represented by the loaded file */
    private long imgTxId;
    /** The MD5 sum of the loaded file */
    private MD5Hash imgDigest;

    // Maps snapshot id -> Snapshot; populated only when the image layout
    // supports snapshots (see load()).
    private Map<Integer, Snapshot> snapshotMap = null;
    // Tracks INodeReference instances so shared subtrees are processed once.
    private final ReferenceMap referenceMap = new ReferenceMap();

    Loader(Configuration conf, FSNamesystem namesystem) {
      this.conf = conf;
      this.namesystem = namesystem;
    }
270
271    /**
272     * Return the MD5 checksum of the image that has been loaded.
273     * @throws IllegalStateException if load() has not yet been called.
274     */
275    @Override
276    public MD5Hash getLoadedImageMd5() {
277      checkLoaded();
278      return imgDigest;
279    }
280
281    @Override
282    public long getLoadedImageTxId() {
283      checkLoaded();
284      return imgTxId;
285    }
286
287    /**
288     * Throw IllegalStateException if load() has not yet been called.
289     */
290    private void checkLoaded() {
291      if (!loaded) {
292        throw new IllegalStateException("Image not yet loaded!");
293      }
294    }
295
296    /**
297     * Throw IllegalStateException if load() has already been called.
298     */
299    private void checkNotLoaded() {
300      if (loaded) {
301        throw new IllegalStateException("Image already loaded!");
302      }
303    }
304
    /**
     * Load the legacy (pre-protobuf) fsimage from {@code curFile} into the
     * namesystem. The on-disk fields must be consumed in exactly the order
     * they were written: layout version, optional layout flags, namespaceID,
     * file count, generation stamps / block-id metadata, txid, last inode id,
     * snapshot state, compression header, the inode tree, files under
     * construction, secret-manager state and finally cache-manager state.
     *
     * @param curFile the image file to read
     * @throws IOException if the file is inconsistent or cannot be read
     * @throws IllegalStateException if an image has already been loaded
     */
    public void load(File curFile) throws IOException {
      checkNotLoaded();
      assert curFile != null : "curFile is null";

      StartupProgress prog = NameNode.getStartupProgress();
      Step step = new Step(StepType.INODES);
      prog.beginStep(Phase.LOADING_FSIMAGE, step);
      long startTime = monotonicNow();

      //
      // Load in bits
      //
      // Digest the raw byte stream so the MD5 of the whole file is
      // available once loading completes.
      MessageDigest digester = MD5Hash.getDigester();
      DigestInputStream fin = new DigestInputStream(
           new FileInputStream(curFile), digester);

      DataInputStream in = new DataInputStream(fin);
      try {
        // read image version: first appeared in version -1
        int imgVersion = in.readInt();
        if (getLayoutVersion() != imgVersion) {
          throw new InconsistentFSStateException(curFile, 
              "imgVersion " + imgVersion +
              " expected to be " + getLayoutVersion());
        }
        boolean supportSnapshot = NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.SNAPSHOT, imgVersion);
        if (NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.ADD_LAYOUT_FLAGS, imgVersion)) {
          LayoutFlags.read(in);
        }

        // read namespaceID: first appeared in version -2
        in.readInt();

        long numFiles = in.readLong();

        // read in the last generation stamp for legacy blocks.
        long genstamp = in.readLong();
        namesystem.getBlockIdManager().setGenerationStampV1(genstamp);

        if (NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.SEQUENTIAL_BLOCK_ID, imgVersion)) {
          // read the starting generation stamp for sequential block IDs
          genstamp = in.readLong();
          namesystem.getBlockIdManager().setGenerationStampV2(genstamp);

          // read the last generation stamp for blocks created after
          // the switch to sequential block IDs.
          long stampAtIdSwitch = in.readLong();
          namesystem.getBlockIdManager().setGenerationStampV1Limit(stampAtIdSwitch);

          // read the max sequential block ID.
          long maxSequentialBlockId = in.readLong();
          namesystem.getBlockIdManager().setLastAllocatedBlockId(maxSequentialBlockId);
        } else {
          // Older image: derive the V2 generation stamp instead of reading it.
          long startingGenStamp = namesystem.getBlockIdManager()
            .upgradeGenerationStampToV2();
          // This is an upgrade.
          LOG.info("Upgrading to sequential block IDs. Generation stamp " +
                   "for new blocks set to " + startingGenStamp);
        }

        // read the transaction ID of the last edit represented by
        // this image
        if (NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.STORED_TXIDS, imgVersion)) {
          imgTxId = in.readLong();
        } else {
          imgTxId = 0;
        }

        // read the last allocated inode id in the fsimage
        if (NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.ADD_INODE_ID, imgVersion)) {
          long lastInodeId = in.readLong();
          namesystem.dir.resetLastInodeId(lastInodeId);
          if (LOG.isDebugEnabled()) {
            LOG.debug("load last allocated InodeId from fsimage:" + lastInodeId);
          }
        } else {
          if (LOG.isDebugEnabled()) {
            LOG.debug("Old layout version doesn't have inode id."
                + " Will assign new id for each inode.");
          }
        }
        
        if (supportSnapshot) {
          snapshotMap = namesystem.getSnapshotManager().read(in, this);
        }

        // read compression related info
        FSImageCompression compression;
        if (NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.FSIMAGE_COMPRESSION, imgVersion)) {
          compression = FSImageCompression.readCompressionHeader(conf, in);
        } else {
          compression = FSImageCompression.createNoopCompression();
        }
        // Everything after the compression header may be compressed; re-wrap
        // the digest stream accordingly. Closing 'in' below also closes fin.
        in = compression.unwrapInputStream(fin);

        LOG.info("Loading image file " + curFile + " using " + compression);
        
        // load all inodes
        LOG.info("Number of files = " + numFiles);
        prog.setTotal(Phase.LOADING_FSIMAGE, step, numFiles);
        Counter counter = prog.getCounter(Phase.LOADING_FSIMAGE, step);
        if (NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.FSIMAGE_NAME_OPTIMIZATION, imgVersion)) {
          if (supportSnapshot) {
            loadLocalNameINodesWithSnapshot(numFiles, in, counter);
          } else {
            loadLocalNameINodes(numFiles, in, counter);
          }
        } else {
          loadFullNameINodes(numFiles, in, counter);
        }

        loadFilesUnderConstruction(in, supportSnapshot, counter);
        prog.endStep(Phase.LOADING_FSIMAGE, step);
        // Now that the step is finished, set counter equal to total to adjust
        // for possible under-counting due to reference inodes.
        prog.setCount(Phase.LOADING_FSIMAGE, step, numFiles);

        loadSecretManagerState(in);

        loadCacheManagerState(in);

        // make sure to read to the end of file
        boolean eof = (in.read() == -1);
        assert eof : "Should have reached the end of image file " + curFile;
      } finally {
        in.close();
      }

      imgDigest = new MD5Hash(digester.digest());
      loaded = true;
      
      LOG.info("Image file " + curFile + " of size " + curFile.length()
          + " bytes loaded in " + (monotonicNow() - startTime) / 1000
          + " seconds.");
    }
448
449  /** Update the root node's attributes */
450  private void updateRootAttr(INodeWithAdditionalFields root) {                                                           
451    final QuotaCounts q = root.getQuotaCounts();
452    final long nsQuota = q.getNameSpace();
453    final long dsQuota = q.getStorageSpace();
454    FSDirectory fsDir = namesystem.dir;
455    if (nsQuota != -1 || dsQuota != -1) {
456      fsDir.rootDir.getDirectoryWithQuotaFeature().setQuota(nsQuota, dsQuota);
457    }
458    fsDir.rootDir.cloneModificationTime(root);
459    fsDir.rootDir.clonePermissionStatus(root);    
460  }
461  
462    /**
463     * Load fsimage files when 1) only local names are stored, 
464     * and 2) snapshot is supported.
465     * 
466     * @param numFiles number of files expected to be read
467     * @param in Image input stream
468     * @param counter Counter to increment for namenode startup progress
469     */
470    private void loadLocalNameINodesWithSnapshot(long numFiles, DataInput in,
471        Counter counter) throws IOException {
472      assert NameNodeLayoutVersion.supports(
473          LayoutVersion.Feature.FSIMAGE_NAME_OPTIMIZATION, getLayoutVersion());
474      assert NameNodeLayoutVersion.supports(
475          LayoutVersion.Feature.SNAPSHOT, getLayoutVersion());
476      
477      // load root
478      loadRoot(in, counter);
479      // load rest of the nodes recursively
480      loadDirectoryWithSnapshot(in, counter);
481    }
482    
483  /** 
484   * load fsimage files assuming only local names are stored. Used when
485   * snapshots are not supported by the layout version.
486   *   
487   * @param numFiles number of files expected to be read
488   * @param in image input stream
489   * @param counter Counter to increment for namenode startup progress
490   * @throws IOException
491   */  
492   private void loadLocalNameINodes(long numFiles, DataInput in, Counter counter)
493       throws IOException {
494     assert NameNodeLayoutVersion.supports(
495         LayoutVersion.Feature.FSIMAGE_NAME_OPTIMIZATION, getLayoutVersion());
496     assert numFiles > 0;
497
498     // load root
499     loadRoot(in, counter);
500     // have loaded the first file (the root)
501     numFiles--; 
502
503     // load rest of the nodes directory by directory
504     while (numFiles > 0) {
505       numFiles -= loadDirectory(in, counter);
506     }
507     if (numFiles != 0) {
508       throw new IOException("Read unexpect number of files: " + -numFiles);
509     }
510   }
511   
512    /**
513     * Load information about root, and use the information to update the root
514     * directory of NameSystem.
515     * @param in The {@link DataInput} instance to read.
516     * @param counter Counter to increment for namenode startup progress
517     */
518    private void loadRoot(DataInput in, Counter counter)
519        throws IOException {
520      // load root
521      if (in.readShort() != 0) {
522        throw new IOException("First node is not root");
523      }
524      final INodeDirectory root = loadINode(null, false, in, counter)
525        .asDirectory();
526      // update the root's attributes
527      updateRootAttr(root);
528    }
529   
530    /** Load children nodes for the parent directory. */
531    private int loadChildren(INodeDirectory parent, DataInput in,
532        Counter counter) throws IOException {
533      int numChildren = in.readInt();
534      for (int i = 0; i < numChildren; i++) {
535        // load single inode
536        INode newNode = loadINodeWithLocalName(false, in, true, counter);
537        addToParent(parent, newNode);
538      }
539      return numChildren;
540    }
541    
542    /**
543     * Load a directory when snapshot is supported.
544     * @param in The {@link DataInput} instance to read.
545     * @param counter Counter to increment for namenode startup progress
546     */
547    private void loadDirectoryWithSnapshot(DataInput in, Counter counter)
548        throws IOException {
549      // Step 1. Identify the parent INode
550      long inodeId = in.readLong();
551      final INodeDirectory parent = this.namesystem.dir.getInode(inodeId)
552          .asDirectory();
553      
554      // Check if the whole subtree has been saved (for reference nodes)
555      boolean toLoadSubtree = referenceMap.toProcessSubtree(parent.getId());
556      if (!toLoadSubtree) {
557        return;
558      }
559
560      // Step 2. Load snapshots if parent is snapshottable
561      int numSnapshots = in.readInt();
562      if (numSnapshots >= 0) {
563        // load snapshots and snapshotQuota
564        SnapshotFSImageFormat.loadSnapshotList(parent, numSnapshots, in, this);
565        if (parent.getDirectorySnapshottableFeature().getSnapshotQuota() > 0) {
566          // add the directory to the snapshottable directory list in 
567          // SnapshotManager. Note that we only add root when its snapshot quota
568          // is positive.
569          this.namesystem.getSnapshotManager().addSnapshottable(parent);
570        }
571      }
572
573      // Step 3. Load children nodes under parent
574      loadChildren(parent, in, counter);
575      
576      // Step 4. load Directory Diff List
577      SnapshotFSImageFormat.loadDirectoryDiffList(parent, in, this);
578      
579      // Recursively load sub-directories, including snapshot copies of deleted
580      // directories
581      int numSubTree = in.readInt();
582      for (int i = 0; i < numSubTree; i++) {
583        loadDirectoryWithSnapshot(in, counter);
584      }
585    }
586    
587   /**
588    * Load all children of a directory
589    * 
590    * @param in input to load from
591    * @param counter Counter to increment for namenode startup progress
592    * @return number of child inodes read
593    * @throws IOException
594    */
595   private int loadDirectory(DataInput in, Counter counter) throws IOException {
596     String parentPath = FSImageSerialization.readString(in);
597     // Rename .snapshot paths if we're doing an upgrade
598     parentPath = renameReservedPathsOnUpgrade(parentPath, getLayoutVersion());
599     final INodeDirectory parent = INodeDirectory.valueOf(
600         namesystem.dir.getINode(parentPath, DirOp.READ), parentPath);
601     return loadChildren(parent, in, counter);
602   }
603
604  /**
605   * load fsimage files assuming full path names are stored
606   * 
607   * @param numFiles total number of files to load
608   * @param in data input stream
609   * @param counter Counter to increment for namenode startup progress
610   * @throws IOException if any error occurs
611   */
612  private void loadFullNameINodes(long numFiles, DataInput in, Counter counter)
613      throws IOException {
614    byte[][] pathComponents;
615    byte[][] parentPath = {{}};      
616    FSDirectory fsDir = namesystem.dir;
617    INodeDirectory parentINode = fsDir.rootDir;
618    for (long i = 0; i < numFiles; i++) {
619      pathComponents = FSImageSerialization.readPathComponents(in);
620      for (int j=0; j < pathComponents.length; j++) {
621        byte[] newComponent = renameReservedComponentOnUpgrade
622            (pathComponents[j], getLayoutVersion());
623        if (!Arrays.equals(newComponent, pathComponents[j])) {
624          String oldPath = DFSUtil.byteArray2PathString(pathComponents);
625          pathComponents[j] = newComponent;
626          String newPath = DFSUtil.byteArray2PathString(pathComponents);
627          LOG.info("Renaming reserved path " + oldPath + " to " + newPath);
628        }
629      }
630      final INode newNode = loadINode(
631          pathComponents[pathComponents.length-1], false, in, counter);
632
633      if (isRoot(pathComponents)) { // it is the root
634        // update the root's attributes
635        updateRootAttr(newNode.asDirectory());
636        continue;
637      }
638
639      namesystem.dir.addToInodeMap(newNode);
640      // check if the new inode belongs to the same parent
641      if(!isParent(pathComponents, parentPath)) {
642        parentINode = getParentINodeDirectory(pathComponents);
643        parentPath = getParent(pathComponents);
644      }
645
646      // add new inode
647      addToParent(parentINode, newNode);
648    }
649  }
650
651  private INodeDirectory getParentINodeDirectory(byte[][] pathComponents)
652      throws IOException {
653    if (pathComponents.length < 2) { // root
654      return null;
655    }
656    // Gets the parent INode
657    final INodesInPath inodes =
658        namesystem.dir.getINodesInPath(pathComponents, DirOp.WRITE);
659    return INodeDirectory.valueOf(inodes.getINode(-2), pathComponents);
660  }
661
662  /**
663   * Add the child node to parent and, if child is a file, update block map.
664   * This method is only used for image loading so that synchronization,
665   * modification time update and space count update are not needed.
666   */
667  private void addToParent(INodeDirectory parent, INode child)
668      throws IllegalReservedPathException {
669    FSDirectory fsDir = namesystem.dir;
670    if (parent == fsDir.rootDir) {
671        child.setLocalName(renameReservedRootComponentOnUpgrade(
672            child.getLocalNameBytes(), getLayoutVersion()));
673    }
674    // NOTE: This does not update space counts for parents
675    if (!parent.addChild(child)) {
676      return;
677    }
678    namesystem.dir.cacheName(child);
679
680    if (child.isFile()) {
681      updateBlocksMap(child.asFile());
682    }
683  }
684
685    public void updateBlocksMap(INodeFile file) {
686      // Add file->block mapping
687      final BlockInfo[] blocks = file.getBlocks();
688      if (blocks != null) {
689        final BlockManager bm = namesystem.getBlockManager();
690        for (int i = 0; i < blocks.length; i++) {
691          file.setBlock(i, bm.addBlockCollection(blocks[i], file));
692        } 
693      }
694    }
695
696    /** @return The FSDirectory of the namesystem where the fsimage is loaded */
697    public FSDirectory getFSDirectoryInLoading() {
698      return namesystem.dir;
699    }
700
701    public INode loadINodeWithLocalName(boolean isSnapshotINode, DataInput in,
702        boolean updateINodeMap) throws IOException {
703      return loadINodeWithLocalName(isSnapshotINode, in, updateINodeMap, null);
704    }
705
706    public INode loadINodeWithLocalName(boolean isSnapshotINode,
707        DataInput in, boolean updateINodeMap, Counter counter)
708        throws IOException {
709      byte[] localName = FSImageSerialization.readLocalName(in);
710      localName =
711          renameReservedComponentOnUpgrade(localName, getLayoutVersion());
712      INode inode = loadINode(localName, isSnapshotINode, in, counter);
713      if (updateINodeMap) {
714        namesystem.dir.addToInodeMap(inode);
715      }
716      return inode;
717    }
718  
719  /**
720   * load an inode from fsimage except for its name
721   * 
722   * @param in data input stream from which image is read
723   * @param counter Counter to increment for namenode startup progress
724   * @return an inode
725   */
726  @SuppressWarnings("deprecation")
727  INode loadINode(final byte[] localName, boolean isSnapshotINode,
728      DataInput in, Counter counter) throws IOException {
729    final int imgVersion = getLayoutVersion();
730    if (NameNodeLayoutVersion.supports(
731        LayoutVersion.Feature.SNAPSHOT, imgVersion)) {
732      namesystem.getFSDirectory().verifyINodeName(localName);
733    }
734
735    long inodeId = NameNodeLayoutVersion.supports(
736        LayoutVersion.Feature.ADD_INODE_ID, imgVersion) ? in.readLong()
737        : namesystem.dir.allocateNewInodeId();
738    
739    final short replication = namesystem.getBlockManager().adjustReplication(
740        in.readShort());
741    final long modificationTime = in.readLong();
742    long atime = 0;
743    if (NameNodeLayoutVersion.supports(
744        LayoutVersion.Feature.FILE_ACCESS_TIME, imgVersion)) {
745      atime = in.readLong();
746    }
747    final long blockSize = in.readLong();
748    final int numBlocks = in.readInt();
749
750    if (numBlocks >= 0) {
751      // file
752      
753      // read blocks
754      BlockInfo[] blocks = new BlockInfo[numBlocks];
755      for (int j = 0; j < numBlocks; j++) {
756        blocks[j] = new BlockInfoContiguous(replication);
757        blocks[j].readFields(in);
758      }
759
760      String clientName = "";
761      String clientMachine = "";
762      boolean underConstruction = false;
763      FileDiffList fileDiffs = null;
764      if (NameNodeLayoutVersion.supports(
765          LayoutVersion.Feature.SNAPSHOT, imgVersion)) {
766        // read diffs
767        fileDiffs = SnapshotFSImageFormat.loadFileDiffList(in, this);
768
769        if (isSnapshotINode) {
770          underConstruction = in.readBoolean();
771          if (underConstruction) {
772            clientName = FSImageSerialization.readString(in);
773            clientMachine = FSImageSerialization.readString(in);
774            // convert the last block to BlockUC
775            if (blocks.length > 0) {
776              BlockInfo lastBlk = blocks[blocks.length - 1];
777              lastBlk.convertToBlockUnderConstruction(
778                  HdfsServerConstants.BlockUCState.UNDER_CONSTRUCTION, null);
779            }
780          }
781        }
782      }
783
784      final PermissionStatus permissions = PermissionStatus.read(in);
785
786      // return
787      if (counter != null) {
788        counter.increment();
789      }
790
791      final INodeFile file = new INodeFile(inodeId, localName, permissions,
792          modificationTime, atime, blocks, replication, blockSize, (byte)0);
793      if (underConstruction) {
794        file.toUnderConstruction(clientName, clientMachine);
795      }
796        return fileDiffs == null ? file : new INodeFile(file, fileDiffs);
797      } else if (numBlocks == -1) {
798        //directory
799      
800      //read quotas
801      final long nsQuota = in.readLong();
802      long dsQuota = -1L;
803      if (NameNodeLayoutVersion.supports(
804          LayoutVersion.Feature.DISKSPACE_QUOTA, imgVersion)) {
805        dsQuota = in.readLong();
806      }
807
808      //read snapshot info
809      boolean snapshottable = false;
810      boolean withSnapshot = false;
811      if (NameNodeLayoutVersion.supports(
812          LayoutVersion.Feature.SNAPSHOT, imgVersion)) {
813        snapshottable = in.readBoolean();
814        if (!snapshottable) {
815          withSnapshot = in.readBoolean();
816        }
817      }
818
819      final PermissionStatus permissions = PermissionStatus.read(in);
820
821      //return
822      if (counter != null) {
823        counter.increment();
824      }
825      final INodeDirectory dir = new INodeDirectory(inodeId, localName,
826          permissions, modificationTime);
827      if (nsQuota >= 0 || dsQuota >= 0) {
828        dir.addDirectoryWithQuotaFeature(new DirectoryWithQuotaFeature.Builder().
829            nameSpaceQuota(nsQuota).storageSpaceQuota(dsQuota).build());
830      }
831      if (withSnapshot) {
832        dir.addSnapshotFeature(null);
833      }
834      if (snapshottable) {
835        dir.addSnapshottableFeature();
836      }
837      return dir;
838    } else if (numBlocks == -2) {
839      //symlink
840      if (!FileSystem.areSymlinksEnabled()) {
841        throw new IOException("Symlinks not supported - please remove symlink before upgrading to this version of HDFS");
842      }
843
844      final String symlink = Text.readString(in);
845      final PermissionStatus permissions = PermissionStatus.read(in);
846      if (counter != null) {
847        counter.increment();
848      }
849      return new INodeSymlink(inodeId, localName, permissions,
850          modificationTime, atime, symlink);
851    } else if (numBlocks == -3) {
852      //reference
853      // Intentionally do not increment counter, because it is too difficult at
854      // this point to assess whether or not this is a reference that counts
855      // toward quota.
856      
857      final boolean isWithName = in.readBoolean();
858      // lastSnapshotId for WithName node, dstSnapshotId for DstReference node
859      int snapshotId = in.readInt();
860      
861      final INodeReference.WithCount withCount
862          = referenceMap.loadINodeReferenceWithCount(isSnapshotINode, in, this);
863
864      if (isWithName) {
865          return new INodeReference.WithName(null, withCount, localName,
866              snapshotId);
867      } else {
868        final INodeReference ref = new INodeReference.DstReference(null,
869            withCount, snapshotId);
870        return ref;
871      }
872    }
873    
874    throw new IOException("Unknown inode type: numBlocks=" + numBlocks);
875  }
876
877    /** Load {@link INodeFileAttributes}. */
878    public INodeFileAttributes loadINodeFileAttributes(DataInput in)
879        throws IOException {
880      final int layoutVersion = getLayoutVersion();
881      
882      if (!NameNodeLayoutVersion.supports(
883          LayoutVersion.Feature.OPTIMIZE_SNAPSHOT_INODES, layoutVersion)) {
884        return loadINodeWithLocalName(true, in, false).asFile();
885      }
886  
887      final byte[] name = FSImageSerialization.readLocalName(in);
888      final PermissionStatus permissions = PermissionStatus.read(in);
889      final long modificationTime = in.readLong();
890      final long accessTime = in.readLong();
891  
892      final short replication = namesystem.getBlockManager().adjustReplication(
893          in.readShort());
894      final long preferredBlockSize = in.readLong();
895
896      return new INodeFileAttributes.SnapshotCopy(name, permissions, null, modificationTime,
897          accessTime, replication, preferredBlockSize, (byte) 0, null);
898    }
899
900    public INodeDirectoryAttributes loadINodeDirectoryAttributes(DataInput in)
901        throws IOException {
902      final int layoutVersion = getLayoutVersion();
903      
904      if (!NameNodeLayoutVersion.supports(
905          LayoutVersion.Feature.OPTIMIZE_SNAPSHOT_INODES, layoutVersion)) {
906        return loadINodeWithLocalName(true, in, false).asDirectory();
907      }
908  
909      final byte[] name = FSImageSerialization.readLocalName(in);
910      final PermissionStatus permissions = PermissionStatus.read(in);
911      final long modificationTime = in.readLong();
912      
913      // Read quotas: quota by storage type does not need to be processed below.
914      // It is handled only in protobuf based FsImagePBINode class for newer
915      // fsImages. Tools using this class such as legacy-mode of offline image viewer
916      // should only load legacy FSImages without newer features.
917      final long nsQuota = in.readLong();
918      final long dsQuota = in.readLong();
919
920      return nsQuota == -1L && dsQuota == -1L ? new INodeDirectoryAttributes.SnapshotCopy(
921          name, permissions, null, modificationTime, null)
922        : new INodeDirectoryAttributes.CopyWithQuota(name, permissions,
923            null, modificationTime, nsQuota, dsQuota, null, null);
924    }
925  
    /**
     * Load the "files under construction" section of the image and re-attach
     * under-construction state to the already-loaded inodes, adding leases
     * for files that live in the current namespace (not only in snapshots).
     *
     * @param in image stream positioned at the under-construction section
     * @param supportSnapshot not referenced in this method body; presumably
     *        part of the section's historical contract — NOTE(review): confirm
     *        against callers
     * @param counter startup-progress counter, incremented once per file read
     */
    private void loadFilesUnderConstruction(DataInput in,
        boolean supportSnapshot, Counter counter) throws IOException {
      FSDirectory fsDir = namesystem.dir;
      // Number of under-construction entries that follow.
      int size = in.readInt();

      LOG.info("Number of files under construction = " + size);

      for (int i = 0; i < size; i++) {
        INodeFile cons = FSImageSerialization.readINodeUnderConstruction(in,
            namesystem, getLayoutVersion());
        counter.increment();

        // verify that file exists in namespace
        String path = cons.getLocalName();
        INodeFile oldnode = null;
        boolean inSnapshot = false;
        if (path != null && FSDirectory.isReservedName(path) && 
            NameNodeLayoutVersion.supports(
                LayoutVersion.Feature.ADD_INODE_ID, getLayoutVersion())) {
          // TODO: for HDFS-5428, we use reserved path for those INodeFileUC in
          // snapshot. If we support INode ID in the layout version, we can use
          // the inode id to find the oldnode.
          oldnode = namesystem.dir.getInode(cons.getId()).asFile();
          inSnapshot = true;
        } else {
          // Normal file: resolve it by (possibly upgrade-renamed) path.
          path = renameReservedPathsOnUpgrade(path, getLayoutVersion());
          final INodesInPath iip = fsDir.getINodesInPath(path, DirOp.WRITE);
          oldnode = INodeFile.valueOf(iip.getLastINode(), path);
        }

        // Transfer the client name/machine from the deserialized copy onto
        // the inode already present in the namespace.
        FileUnderConstructionFeature uc = cons.getFileUnderConstructionFeature();
        oldnode.toUnderConstruction(uc.getClientName(), uc.getClientMachine());
        if (oldnode.numBlocks() > 0) {
          BlockInfo ucBlock = cons.getLastBlock();
          // we do not replace the inode, just replace the last block of oldnode
          BlockInfo info = namesystem.getBlockManager().addBlockCollection(
              ucBlock, oldnode);
          oldnode.setBlock(oldnode.numBlocks() - 1, info);
        }

        // Snapshot-only UC files never enter the lease map (see HDFS-5428).
        if (!inSnapshot) {
          namesystem.leaseManager.addLease(uc.getClientName(), oldnode.getId());
        }
      }
    }
971
972    private void loadSecretManagerState(DataInput in)
973        throws IOException {
974      int imgVersion = getLayoutVersion();
975
976      if (!NameNodeLayoutVersion.supports(
977          LayoutVersion.Feature.DELEGATION_TOKEN, imgVersion)) {
978        //SecretManagerState is not available.
979        //This must not happen if security is turned on.
980        return; 
981      }
982      namesystem.loadSecretManagerStateCompat(in);
983    }
984
985    private void loadCacheManagerState(DataInput in) throws IOException {
986      int imgVersion = getLayoutVersion();
987      if (!NameNodeLayoutVersion.supports(
988          LayoutVersion.Feature.CACHING, imgVersion)) {
989        return;
990      }
991      namesystem.getCacheManager().loadStateCompat(in);
992    }
993
    /** @return the layout version recorded in the storage being loaded. */
    private int getLayoutVersion() {
      return namesystem.getFSImage().getStorage().getLayoutVersion();
    }
997
998    private boolean isRoot(byte[][] path) {
999      return path.length == 1 &&
1000        path[0] == null;    
1001    }
1002
1003    private boolean isParent(byte[][] path, byte[][] parent) {
1004      if (path == null || parent == null)
1005        return false;
1006      if (parent.length == 0 || path.length != parent.length + 1)
1007        return false;
1008      boolean isParent = true;
1009      for (int i = 0; i < parent.length; i++) {
1010        isParent = isParent && Arrays.equals(path[i], parent[i]); 
1011      }
1012      return isParent;
1013    }
1014
1015    /**
1016     * Return string representing the parent of the given path.
1017     */
1018    String getParent(String path) {
1019      return path.substring(0, path.lastIndexOf(Path.SEPARATOR));
1020    }
1021    
1022    byte[][] getParent(byte[][] path) {
1023      byte[][] result = new byte[path.length - 1][];
1024      for (int i = 0; i < result.length; i++) {
1025        result[i] = new byte[path[i].length];
1026        System.arraycopy(path[i], 0, result[i], 0, path[i].length);
1027      }
1028      return result;
1029    }
1030    
1031    public Snapshot getSnapshot(DataInput in) throws IOException {
1032      return snapshotMap.get(in.readInt());
1033    }
1034  }
1035
  /**
   * Mapping from a reserved path component (e.g. ".snapshot") to the
   * replacement name to use when renaming colliding paths during upgrade.
   */
  @VisibleForTesting
  public static final TreeMap<String, String> renameReservedMap =
      new TreeMap<String, String>();
1039
1040  /**
1041   * Use the default key-value pairs that will be used to determine how to
1042   * rename reserved paths on upgrade.
1043   */
1044  @VisibleForTesting
1045  public static void useDefaultRenameReservedPairs() {
1046    renameReservedMap.clear();
1047    for (String key: HdfsServerConstants.RESERVED_PATH_COMPONENTS) {
1048      renameReservedMap.put(
1049          key,
1050          key + "." + HdfsServerConstants.NAMENODE_LAYOUT_VERSION + "."
1051              + "UPGRADE_RENAMED");
1052    }
1053  }
1054
1055  /**
1056   * Set the key-value pairs that will be used to determine how to rename
1057   * reserved paths on upgrade.
1058   */
1059  @VisibleForTesting
1060  public static void setRenameReservedPairs(String renameReserved) {
1061    // Clear and set the default values
1062    useDefaultRenameReservedPairs();
1063    // Overwrite with provided values
1064    setRenameReservedMapInternal(renameReserved);
1065  }
1066
1067  private static void setRenameReservedMapInternal(String renameReserved) {
1068    Collection<String> pairs =
1069        StringUtils.getTrimmedStringCollection(renameReserved);
1070    for (String p : pairs) {
1071      String[] pair = StringUtils.split(p, '/', '=');
1072      Preconditions.checkArgument(pair.length == 2,
1073          "Could not parse key-value pair " + p);
1074      String key = pair[0];
1075      String value = pair[1];
1076      Preconditions.checkArgument(DFSUtil.isReservedPathComponent(key),
1077          "Unknown reserved path " + key);
1078      Preconditions.checkArgument(DFSUtil.isValidNameForComponent(value),
1079          "Invalid rename path for " + key + ": " + value);
1080      LOG.info("Will rename reserved path " + key + " to " + value);
1081      renameReservedMap.put(key, value);
1082    }
1083  }
1084
1085  /**
1086   * When upgrading from an old version, the filesystem could contain paths
1087   * that are now reserved in the new version (e.g. .snapshot). This renames
1088   * these new reserved paths to a user-specified value to avoid collisions
1089   * with the reserved name.
1090   * 
1091   * @param path Old path potentially containing a reserved path
1092   * @return New path with reserved path components renamed to user value
1093   */
1094  static String renameReservedPathsOnUpgrade(String path,
1095      final int layoutVersion) throws IllegalReservedPathException {
1096    final String oldPath = path;
1097    // If any known LVs aren't supported, we're doing an upgrade
1098    if (!NameNodeLayoutVersion.supports(Feature.ADD_INODE_ID, layoutVersion)) {
1099      String[] components = INode.getPathNames(path);
1100      // Only need to worry about the root directory
1101      if (components.length > 1) {
1102        components[1] = DFSUtil.bytes2String(
1103            renameReservedRootComponentOnUpgrade(
1104                DFSUtil.string2Bytes(components[1]),
1105                layoutVersion));
1106        path = DFSUtil.strings2PathString(components);
1107      }
1108    }
1109    if (!NameNodeLayoutVersion.supports(Feature.SNAPSHOT, layoutVersion)) {
1110      String[] components = INode.getPathNames(path);
1111      // Special case the root path
1112      if (components.length == 0) {
1113        return path;
1114      }
1115      for (int i=0; i<components.length; i++) {
1116        components[i] = DFSUtil.bytes2String(
1117            renameReservedComponentOnUpgrade(
1118                DFSUtil.string2Bytes(components[i]),
1119                layoutVersion));
1120      }
1121      path = DFSUtil.strings2PathString(components);
1122    }
1123
1124    if (!path.equals(oldPath)) {
1125      LOG.info("Upgrade process renamed reserved path " + oldPath + " to "
1126          + path);
1127    }
1128    return path;
1129  }
1130
  /**
   * Error text thrown when an image being upgraded contains a path that is
   * reserved in the new layout and no rename mapping was supplied via the
   * -renameReserved startup option.
   */
  private final static String RESERVED_ERROR_MSG = 
      FSDirectory.DOT_RESERVED_PATH_PREFIX + " is a reserved path and "
      + HdfsConstants.DOT_SNAPSHOT_DIR + " is a reserved path component in"
      + " this version of HDFS. Please rollback and delete or rename"
      + " this path, or upgrade with the "
      + StartupOption.RENAMERESERVED.getName()
      + " [key-value pairs]"
      + " option to automatically rename these paths during upgrade.";
1139
1140  /**
1141   * Same as {@link #renameReservedPathsOnUpgrade(String)}, but for a single
1142   * byte array path component.
1143   */
1144  private static byte[] renameReservedComponentOnUpgrade(byte[] component,
1145      final int layoutVersion) throws IllegalReservedPathException {
1146    // If the LV doesn't support snapshots, we're doing an upgrade
1147    if (!NameNodeLayoutVersion.supports(Feature.SNAPSHOT, layoutVersion)) {
1148      if (Arrays.equals(component, HdfsServerConstants.DOT_SNAPSHOT_DIR_BYTES)) {
1149        if (!renameReservedMap.containsKey(HdfsConstants.DOT_SNAPSHOT_DIR)) {
1150          throw new IllegalReservedPathException(RESERVED_ERROR_MSG);
1151        }
1152        component =
1153            DFSUtil.string2Bytes(renameReservedMap
1154                .get(HdfsConstants.DOT_SNAPSHOT_DIR));
1155      }
1156    }
1157    return component;
1158  }
1159
1160  /**
1161   * Same as {@link #renameReservedPathsOnUpgrade(String)}, but for a single
1162   * byte array path component.
1163   */
1164  private static byte[] renameReservedRootComponentOnUpgrade(byte[] component,
1165      final int layoutVersion) throws IllegalReservedPathException {
1166    // If the LV doesn't support inode IDs, we're doing an upgrade
1167    if (!NameNodeLayoutVersion.supports(Feature.ADD_INODE_ID, layoutVersion)) {
1168      if (Arrays.equals(component, FSDirectory.DOT_RESERVED)) {
1169        if (!renameReservedMap.containsKey(HdfsConstants.DOT_SNAPSHOT_DIR)) {
1170          throw new IllegalReservedPathException(RESERVED_ERROR_MSG);
1171        }
1172        final String renameString = renameReservedMap
1173            .get(FSDirectory.DOT_RESERVED_STRING);
1174        component =
1175            DFSUtil.string2Bytes(renameString);
1176        LOG.info("Renamed root path " + FSDirectory.DOT_RESERVED_STRING
1177            + " to " + renameString);
1178      }
1179    }
1180    return component;
1181  }
1182
1183  /**
1184   * A one-shot class responsible for writing an image file.
1185   * The write() function should be called once, after which the getter
1186   * functions may be used to retrieve information about the file that was written.
1187   *
1188   * This is replaced by the PB-based FSImage. The class is to maintain
1189   * compatibility for the external fsimage tool.
1190   */
1191  @Deprecated
1192  static class Saver {
1193    private static final int LAYOUT_VERSION = -51;
1194    public static final int CHECK_CANCEL_INTERVAL = 4096;
1195    private final SaveNamespaceContext context;
1196    /** Set to true once an image has been written */
1197    private boolean saved = false;
1198    private long checkCancelCounter = 0;
1199
1200    /** The MD5 checksum of the file that was written */
1201    private MD5Hash savedDigest;
1202    private final ReferenceMap referenceMap = new ReferenceMap();
1203
1204    private final Map<Long, INodeFile> snapshotUCMap =
1205        new HashMap<Long, INodeFile>();
1206
1207    /** @throws IllegalStateException if the instance has not yet saved an image */
1208    private void checkSaved() {
1209      if (!saved) {
1210        throw new IllegalStateException("FSImageSaver has not saved an image");
1211      }
1212    }
1213
1214    /** @throws IllegalStateException if the instance has already saved an image */
1215    private void checkNotSaved() {
1216      if (saved) {
1217        throw new IllegalStateException("FSImageSaver has already saved an image");
1218      }
1219    }
1220
1221
1222    Saver(SaveNamespaceContext context) {
1223      this.context = context;
1224    }
1225
1226    /**
1227     * Return the MD5 checksum of the image file that was saved.
1228     */
1229    MD5Hash getSavedDigest() {
1230      checkSaved();
1231      return savedDigest;
1232    }
1233
1234    void save(File newFile, FSImageCompression compression) throws IOException {
1235      checkNotSaved();
1236
1237      final FSNamesystem sourceNamesystem = context.getSourceNamesystem();
1238      final INodeDirectory rootDir = sourceNamesystem.dir.rootDir;
1239      final long numINodes = rootDir.getDirectoryWithQuotaFeature()
1240          .getSpaceConsumed().getNameSpace();
1241      String sdPath = newFile.getParentFile().getParentFile().getAbsolutePath();
1242      Step step = new Step(StepType.INODES, sdPath);
1243      StartupProgress prog = NameNode.getStartupProgress();
1244      prog.beginStep(Phase.SAVING_CHECKPOINT, step);
1245      prog.setTotal(Phase.SAVING_CHECKPOINT, step, numINodes);
1246      Counter counter = prog.getCounter(Phase.SAVING_CHECKPOINT, step);
1247      long startTime = monotonicNow();
1248      //
1249      // Write out data
1250      //
1251      MessageDigest digester = MD5Hash.getDigester();
1252      FileOutputStream fout = new FileOutputStream(newFile);
1253      DigestOutputStream fos = new DigestOutputStream(fout, digester);
1254      DataOutputStream out = new DataOutputStream(fos);
1255      try {
1256        out.writeInt(LAYOUT_VERSION);
1257        LayoutFlags.write(out);
1258        // We use the non-locked version of getNamespaceInfo here since
1259        // the coordinating thread of saveNamespace already has read-locked
1260        // the namespace for us. If we attempt to take another readlock
1261        // from the actual saver thread, there's a potential of a
1262        // fairness-related deadlock. See the comments on HDFS-2223.
1263        out.writeInt(sourceNamesystem.unprotectedGetNamespaceInfo()
1264            .getNamespaceID());
1265        out.writeLong(numINodes);
1266        out.writeLong(sourceNamesystem.getBlockIdManager().getGenerationStampV1());
1267        out.writeLong(sourceNamesystem.getBlockIdManager().getGenerationStampV2());
1268        out.writeLong(sourceNamesystem.getBlockIdManager().getGenerationStampAtblockIdSwitch());
1269        out.writeLong(sourceNamesystem.getBlockIdManager().getLastAllocatedBlockId());
1270        out.writeLong(context.getTxId());
1271        out.writeLong(sourceNamesystem.dir.getLastInodeId());
1272
1273
1274        sourceNamesystem.getSnapshotManager().write(out);
1275
1276        // write compression info and set up compressed stream
1277        out = compression.writeHeaderAndWrapStream(fos);
1278        LOG.info("Saving image file " + newFile +
1279                 " using " + compression);
1280
1281        // save the root
1282        saveINode2Image(rootDir, out, false, referenceMap, counter);
1283        // save the rest of the nodes
1284        saveImage(rootDir, out, true, false, counter);
1285        prog.endStep(Phase.SAVING_CHECKPOINT, step);
1286        // Now that the step is finished, set counter equal to total to adjust
1287        // for possible under-counting due to reference inodes.
1288        prog.setCount(Phase.SAVING_CHECKPOINT, step, numINodes);
1289        // save files under construction
1290        // TODO: for HDFS-5428, since we cannot break the compatibility of
1291        // fsimage, we store part of the under-construction files that are only
1292        // in snapshots in this "under-construction-file" section. As a
1293        // temporary solution, we use "/.reserved/.inodes/<inodeid>" as their
1294        // paths, so that when loading fsimage we do not put them into the lease
1295        // map. In the future, we can remove this hack when we can bump the
1296        // layout version.
1297        saveFilesUnderConstruction(sourceNamesystem, out, snapshotUCMap);
1298
1299        context.checkCancelled();
1300        sourceNamesystem.saveSecretManagerStateCompat(out, sdPath);
1301        context.checkCancelled();
1302        sourceNamesystem.getCacheManager().saveStateCompat(out, sdPath);
1303        context.checkCancelled();
1304        out.flush();
1305        context.checkCancelled();
1306        fout.getChannel().force(true);
1307      } finally {
1308        out.close();
1309      }
1310
1311      saved = true;
1312      // set md5 of the saved image
1313      savedDigest = new MD5Hash(digester.digest());
1314
1315      LOG.info("Image file " + newFile + " of size " + newFile.length()
1316          + " bytes saved in " + (monotonicNow() - startTime) / 1000
1317          + " seconds.");
1318    }
1319
1320    /**
1321     * Save children INodes.
1322     * @param children The list of children INodes
1323     * @param out The DataOutputStream to write
1324     * @param inSnapshot Whether the parent directory or its ancestor is in
1325     *                   the deleted list of some snapshot (caused by rename or
1326     *                   deletion)
1327     * @param counter Counter to increment for namenode startup progress
1328     * @return Number of children that are directory
1329     */
1330    private int saveChildren(ReadOnlyList<INode> children,
1331        DataOutputStream out, boolean inSnapshot, Counter counter)
1332        throws IOException {
1333      // Write normal children INode.
1334      out.writeInt(children.size());
1335      int dirNum = 0;
1336      for(INode child : children) {
1337        // print all children first
1338        // TODO: for HDFS-5428, we cannot change the format/content of fsimage
1339        // here, thus even if the parent directory is in snapshot, we still
1340        // do not handle INodeUC as those stored in deleted list
1341        saveINode2Image(child, out, false, referenceMap, counter);
1342        if (child.isDirectory()) {
1343          dirNum++;
1344        } else if (inSnapshot && child.isFile()
1345            && child.asFile().isUnderConstruction()) {
1346          this.snapshotUCMap.put(child.getId(), child.asFile());
1347        }
1348        if (checkCancelCounter++ % CHECK_CANCEL_INTERVAL == 0) {
1349          context.checkCancelled();
1350        }
1351      }
1352      return dirNum;
1353    }
1354
1355    /**
1356     * Save file tree image starting from the given root.
1357     * This is a recursive procedure, which first saves all children and
1358     * snapshot diffs of a current directory and then moves inside the
1359     * sub-directories.
1360     *
1361     * @param current The current node
1362     * @param out The DataoutputStream to write the image
1363     * @param toSaveSubtree Whether or not to save the subtree to fsimage. For
1364     *                      reference node, its subtree may already have been
1365     *                      saved before.
1366     * @param inSnapshot Whether the current directory is in snapshot
1367     * @param counter Counter to increment for namenode startup progress
1368     */
1369    private void saveImage(INodeDirectory current, DataOutputStream out,
1370        boolean toSaveSubtree, boolean inSnapshot, Counter counter)
1371        throws IOException {
1372      // write the inode id of the directory
1373      out.writeLong(current.getId());
1374
1375      if (!toSaveSubtree) {
1376        return;
1377      }
1378
1379      final ReadOnlyList<INode> children = current
1380          .getChildrenList(Snapshot.CURRENT_STATE_ID);
1381      int dirNum = 0;
1382      List<INodeDirectory> snapshotDirs = null;
1383      DirectoryWithSnapshotFeature sf = current.getDirectoryWithSnapshotFeature();
1384      if (sf != null) {
1385        snapshotDirs = new ArrayList<INodeDirectory>();
1386        sf.getSnapshotDirectory(snapshotDirs);
1387        dirNum += snapshotDirs.size();
1388      }
1389
1390      // 2. Write INodeDirectorySnapshottable#snapshotsByNames to record all
1391      // Snapshots
1392      if (current.isDirectory() && current.asDirectory().isSnapshottable()) {
1393        SnapshotFSImageFormat.saveSnapshots(current.asDirectory(), out);
1394      } else {
1395        out.writeInt(-1); // # of snapshots
1396      }
1397
1398      // 3. Write children INode
1399      dirNum += saveChildren(children, out, inSnapshot, counter);
1400
1401      // 4. Write DirectoryDiff lists, if there is any.
1402      SnapshotFSImageFormat.saveDirectoryDiffList(current, out, referenceMap);
1403
1404      // Write sub-tree of sub-directories, including possible snapshots of
1405      // deleted sub-directories
1406      out.writeInt(dirNum); // the number of sub-directories
1407      for(INode child : children) {
1408        if(!child.isDirectory()) {
1409          continue;
1410        }
1411        // make sure we only save the subtree under a reference node once
1412        boolean toSave = child.isReference() ?
1413            referenceMap.toProcessSubtree(child.getId()) : true;
1414        saveImage(child.asDirectory(), out, toSave, inSnapshot, counter);
1415      }
1416      if (snapshotDirs != null) {
1417        for (INodeDirectory subDir : snapshotDirs) {
1418          // make sure we only save the subtree under a reference node once
1419          boolean toSave = subDir.getParentReference() != null ?
1420              referenceMap.toProcessSubtree(subDir.getId()) : true;
1421          saveImage(subDir, out, toSave, true, counter);
1422        }
1423      }
1424    }
1425
1426    /**
1427     * Saves inode and increments progress counter.
1428     *
1429     * @param inode INode to save
1430     * @param out DataOutputStream to receive inode
1431     * @param writeUnderConstruction boolean true if this is under construction
1432     * @param referenceMap ReferenceMap containing reference inodes
1433     * @param counter Counter to increment for namenode startup progress
1434     * @throws IOException thrown if there is an I/O error
1435     */
1436    private void saveINode2Image(INode inode, DataOutputStream out,
1437        boolean writeUnderConstruction, ReferenceMap referenceMap,
1438        Counter counter) throws IOException {
1439      FSImageSerialization.saveINode2Image(inode, out, writeUnderConstruction,
1440        referenceMap);
1441      // Intentionally do not increment counter for reference inodes, because it
1442      // is too difficult at this point to assess whether or not this is a
1443      // reference that counts toward quota.
1444      if (!(inode instanceof INodeReference)) {
1445        counter.increment();
1446      }
1447    }
1448
1449    /**
1450     * Serializes leases.
1451     */
1452    void saveFilesUnderConstruction(FSNamesystem fsn, DataOutputStream out,
1453                                    Map<Long, INodeFile> snapshotUCMap) throws IOException {
1454      // This is run by an inferior thread of saveNamespace, which holds a read
1455      // lock on our behalf. If we took the read lock here, we could block
1456      // for fairness if a writer is waiting on the lock.
1457      final LeaseManager leaseManager = fsn.getLeaseManager();
1458      final FSDirectory dir = fsn.getFSDirectory();
1459      synchronized (leaseManager) {
1460        Collection<Long> filesWithUC = leaseManager.getINodeIdWithLeases();
1461        for (Long id : filesWithUC) {
1462          // TODO: for HDFS-5428, because of rename operations, some
1463          // under-construction files that are
1464          // in the current fs directory can also be captured in the
1465          // snapshotUCMap. We should remove them from the snapshotUCMap.
1466          snapshotUCMap.remove(id);
1467        }
1468        out.writeInt(filesWithUC.size() + snapshotUCMap.size()); // write the size
1469
1470        for (Long id : filesWithUC) {
1471          INodeFile file = dir.getInode(id).asFile();
1472          String path = file.getFullPathName();
1473          FSImageSerialization.writeINodeUnderConstruction(
1474                  out, file, path);
1475        }
1476
1477        for (Map.Entry<Long, INodeFile> entry : snapshotUCMap.entrySet()) {
1478          // for those snapshot INodeFileUC, we use "/.reserved/.inodes/<inodeid>"
1479          // as their paths
1480          StringBuilder b = new StringBuilder();
1481          b.append(FSDirectory.DOT_RESERVED_PATH_PREFIX)
1482                  .append(Path.SEPARATOR).append(FSDirectory.DOT_INODES_STRING)
1483                  .append(Path.SEPARATOR).append(entry.getValue().getId());
1484          FSImageSerialization.writeINodeUnderConstruction(
1485                  out, entry.getValue(), b.toString());
1486        }
1487      }
1488    }
1489  }
1490}