001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    package org.apache.hadoop.hdfs.server.namenode;
019    
020    import static org.apache.hadoop.util.Time.now;
021    
022    import java.io.DataInput;
023    import java.io.DataInputStream;
024    import java.io.File;
025    import java.io.FileInputStream;
026    import java.io.FileNotFoundException;
027    import java.io.IOException;
028    import java.security.DigestInputStream;
029    import java.security.MessageDigest;
030    import java.util.Arrays;
031    import java.util.Collection;
032    import java.util.Map;
033    import java.util.TreeMap;
034    
035    import org.apache.commons.logging.Log;
036    import org.apache.hadoop.classification.InterfaceAudience;
037    import org.apache.hadoop.classification.InterfaceStability;
038    import org.apache.hadoop.conf.Configuration;
039    import org.apache.hadoop.fs.FileSystem;
040    import org.apache.hadoop.fs.Path;
041    import org.apache.hadoop.fs.PathIsNotDirectoryException;
042    import org.apache.hadoop.fs.UnresolvedLinkException;
043    import org.apache.hadoop.fs.permission.PermissionStatus;
044    import org.apache.hadoop.hdfs.DFSUtil;
045    import org.apache.hadoop.hdfs.protocol.HdfsConstants;
046    import org.apache.hadoop.hdfs.protocol.LayoutFlags;
047    import org.apache.hadoop.hdfs.protocol.LayoutVersion;
048    import org.apache.hadoop.hdfs.protocol.LayoutVersion.Feature;
049    import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfo;
050    import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoUnderConstruction;
051    import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;
052    import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption;
053    import org.apache.hadoop.hdfs.server.common.InconsistentFSStateException;
054    import org.apache.hadoop.hdfs.server.namenode.snapshot.FileDiffList;
055    import org.apache.hadoop.hdfs.server.namenode.snapshot.INodeDirectorySnapshottable;
056    import org.apache.hadoop.hdfs.server.namenode.snapshot.Snapshot;
057    import org.apache.hadoop.hdfs.server.namenode.snapshot.SnapshotFSImageFormat;
058    import org.apache.hadoop.hdfs.server.namenode.snapshot.SnapshotFSImageFormat.ReferenceMap;
059    import org.apache.hadoop.hdfs.server.namenode.startupprogress.Phase;
060    import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress;
061    import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress.Counter;
062    import org.apache.hadoop.hdfs.server.namenode.startupprogress.Step;
063    import org.apache.hadoop.hdfs.server.namenode.startupprogress.StepType;
064    import org.apache.hadoop.io.IOUtils;
065    import org.apache.hadoop.io.MD5Hash;
066    import org.apache.hadoop.io.Text;
067    import org.apache.hadoop.util.StringUtils;
068    
069    import com.google.common.base.Preconditions;
070    import com.google.common.annotations.VisibleForTesting;
071    
072    /**
073     * This class loads and stores the FSImage of the NameNode. The file
074     * src/main/proto/fsimage.proto describes the on-disk layout of the FSImage.
075     */
076    @InterfaceAudience.Private
077    @InterfaceStability.Evolving
078    public class FSImageFormat {
079      private static final Log LOG = FSImage.LOG;
080    
/** Private constructor: this class is static-only and is never instantiated. */
private FSImageFormat() {
}
083    
084      interface AbstractLoader {
085        MD5Hash getLoadedImageMd5();
086        long getLoadedImageTxId();
087      }
088    
089      static class LoaderDelegator implements AbstractLoader {
090        private AbstractLoader impl;
091        private final Configuration conf;
092        private final FSNamesystem fsn;
093    
094        LoaderDelegator(Configuration conf, FSNamesystem fsn) {
095          this.conf = conf;
096          this.fsn = fsn;
097        }
098    
099        @Override
100        public MD5Hash getLoadedImageMd5() {
101          return impl.getLoadedImageMd5();
102        }
103    
104        @Override
105        public long getLoadedImageTxId() {
106          return impl.getLoadedImageTxId();
107        }
108    
109        public void load(File file) throws IOException {
110          Preconditions.checkState(impl == null, "Image already loaded!");
111    
112          FileInputStream is = null;
113          try {
114            is = new FileInputStream(file);
115            byte[] magic = new byte[FSImageUtil.MAGIC_HEADER.length];
116            IOUtils.readFully(is, magic, 0, magic.length);
117            if (Arrays.equals(magic, FSImageUtil.MAGIC_HEADER)) {
118              FSImageFormatProtobuf.Loader loader = new FSImageFormatProtobuf.Loader(
119                  conf, fsn);
120              impl = loader;
121              loader.load(file);
122            } else {
123              Loader loader = new Loader(conf, fsn);
124              impl = loader;
125              loader.load(file);
126            }
127    
128          } finally {
129            IOUtils.cleanup(LOG, is);
130          }
131        }
132      }
133    
134      /**
135       * Construct a loader class to load the image. It chooses the loader based on
136       * the layout version.
137       */
138      public static LoaderDelegator newLoader(Configuration conf, FSNamesystem fsn) {
139        return new LoaderDelegator(conf, fsn);
140      }
141    
142      /**
143       * A one-shot class responsible for loading an image. The load() function
144       * should be called once, after which the getter methods may be used to retrieve
145       * information about the image that was loaded, if loading was successful.
146       */
147      public static class Loader implements AbstractLoader {
148        private final Configuration conf;
149        /** which namesystem this loader is working for */
150        private final FSNamesystem namesystem;
151    
152        /** Set to true once a file has been loaded using this loader. */
153        private boolean loaded = false;
154    
155        /** The transaction ID of the last edit represented by the loaded file */
156        private long imgTxId;
157        /** The MD5 sum of the loaded file */
158        private MD5Hash imgDigest;
159        
160        private Map<Integer, Snapshot> snapshotMap = null;
161        private final ReferenceMap referenceMap = new ReferenceMap();
162    
/**
 * @param conf configuration kept for use during loading
 * @param namesystem the namesystem this loader is working for
 */
Loader(Configuration conf, FSNamesystem namesystem) {
  this.namesystem = namesystem;
  this.conf = conf;
}
167    
168        /**
169         * Return the MD5 checksum of the image that has been loaded.
170         * @throws IllegalStateException if load() has not yet been called.
171         */
172        @Override
173        public MD5Hash getLoadedImageMd5() {
174          checkLoaded();
175          return imgDigest;
176        }
177    
178        @Override
179        public long getLoadedImageTxId() {
180          checkLoaded();
181          return imgTxId;
182        }
183    
184        /**
185         * Throw IllegalStateException if load() has not yet been called.
186         */
187        private void checkLoaded() {
188          if (!loaded) {
189            throw new IllegalStateException("Image not yet loaded!");
190          }
191        }
192    
193        /**
194         * Throw IllegalStateException if load() has already been called.
195         */
196        private void checkNotLoaded() {
197          if (loaded) {
198            throw new IllegalStateException("Image already loaded!");
199          }
200        }
201    
202        public void load(File curFile) throws IOException {
203          checkNotLoaded();
204          assert curFile != null : "curFile is null";
205    
206          StartupProgress prog = NameNode.getStartupProgress();
207          Step step = new Step(StepType.INODES);
208          prog.beginStep(Phase.LOADING_FSIMAGE, step);
209          long startTime = now();
210    
211          //
212          // Load in bits
213          //
214          MessageDigest digester = MD5Hash.getDigester();
215          DigestInputStream fin = new DigestInputStream(
216               new FileInputStream(curFile), digester);
217    
218          DataInputStream in = new DataInputStream(fin);
219          try {
220            // read image version: first appeared in version -1
221            int imgVersion = in.readInt();
222            if (getLayoutVersion() != imgVersion) {
223              throw new InconsistentFSStateException(curFile, 
224                  "imgVersion " + imgVersion +
225                  " expected to be " + getLayoutVersion());
226            }
227            boolean supportSnapshot = NameNodeLayoutVersion.supports(
228                LayoutVersion.Feature.SNAPSHOT, imgVersion);
229            if (NameNodeLayoutVersion.supports(
230                LayoutVersion.Feature.ADD_LAYOUT_FLAGS, imgVersion)) {
231              LayoutFlags.read(in);
232            }
233    
234            // read namespaceID: first appeared in version -2
235            in.readInt();
236    
237            long numFiles = in.readLong();
238    
239            // read in the last generation stamp for legacy blocks.
240            long genstamp = in.readLong();
241            namesystem.setGenerationStampV1(genstamp);
242            
243            if (NameNodeLayoutVersion.supports(
244                LayoutVersion.Feature.SEQUENTIAL_BLOCK_ID, imgVersion)) {
245              // read the starting generation stamp for sequential block IDs
246              genstamp = in.readLong();
247              namesystem.setGenerationStampV2(genstamp);
248    
249              // read the last generation stamp for blocks created after
250              // the switch to sequential block IDs.
251              long stampAtIdSwitch = in.readLong();
252              namesystem.setGenerationStampV1Limit(stampAtIdSwitch);
253    
254              // read the max sequential block ID.
255              long maxSequentialBlockId = in.readLong();
256              namesystem.setLastAllocatedBlockId(maxSequentialBlockId);
257            } else {
258              long startingGenStamp = namesystem.upgradeGenerationStampToV2();
259              // This is an upgrade.
260              LOG.info("Upgrading to sequential block IDs. Generation stamp " +
261                       "for new blocks set to " + startingGenStamp);
262            }
263    
264            // read the transaction ID of the last edit represented by
265            // this image
266            if (NameNodeLayoutVersion.supports(
267                LayoutVersion.Feature.STORED_TXIDS, imgVersion)) {
268              imgTxId = in.readLong();
269            } else {
270              imgTxId = 0;
271            }
272    
273            // read the last allocated inode id in the fsimage
274            if (NameNodeLayoutVersion.supports(
275                LayoutVersion.Feature.ADD_INODE_ID, imgVersion)) {
276              long lastInodeId = in.readLong();
277              namesystem.resetLastInodeId(lastInodeId);
278              if (LOG.isDebugEnabled()) {
279                LOG.debug("load last allocated InodeId from fsimage:" + lastInodeId);
280              }
281            } else {
282              if (LOG.isDebugEnabled()) {
283                LOG.debug("Old layout version doesn't have inode id."
284                    + " Will assign new id for each inode.");
285              }
286            }
287            
288            if (supportSnapshot) {
289              snapshotMap = namesystem.getSnapshotManager().read(in, this);
290            }
291    
292            // read compression related info
293            FSImageCompression compression;
294            if (NameNodeLayoutVersion.supports(
295                LayoutVersion.Feature.FSIMAGE_COMPRESSION, imgVersion)) {
296              compression = FSImageCompression.readCompressionHeader(conf, in);
297            } else {
298              compression = FSImageCompression.createNoopCompression();
299            }
300            in = compression.unwrapInputStream(fin);
301    
302            LOG.info("Loading image file " + curFile + " using " + compression);
303            
304            // load all inodes
305            LOG.info("Number of files = " + numFiles);
306            prog.setTotal(Phase.LOADING_FSIMAGE, step, numFiles);
307            Counter counter = prog.getCounter(Phase.LOADING_FSIMAGE, step);
308            if (NameNodeLayoutVersion.supports(
309                LayoutVersion.Feature.FSIMAGE_NAME_OPTIMIZATION, imgVersion)) {
310              if (supportSnapshot) {
311                loadLocalNameINodesWithSnapshot(numFiles, in, counter);
312              } else {
313                loadLocalNameINodes(numFiles, in, counter);
314              }
315            } else {
316              loadFullNameINodes(numFiles, in, counter);
317            }
318    
319            loadFilesUnderConstruction(in, supportSnapshot, counter);
320            prog.endStep(Phase.LOADING_FSIMAGE, step);
321            // Now that the step is finished, set counter equal to total to adjust
322            // for possible under-counting due to reference inodes.
323            prog.setCount(Phase.LOADING_FSIMAGE, step, numFiles);
324    
325            loadSecretManagerState(in);
326    
327            loadCacheManagerState(in);
328    
329            // make sure to read to the end of file
330            boolean eof = (in.read() == -1);
331            assert eof : "Should have reached the end of image file " + curFile;
332          } finally {
333            in.close();
334          }
335    
336          imgDigest = new MD5Hash(digester.digest());
337          loaded = true;
338          
339          LOG.info("Image file " + curFile + " of size " + curFile.length() +
340              " bytes loaded in " + (now() - startTime)/1000 + " seconds.");
341        }
342    
343      /** Update the root node's attributes */
344      private void updateRootAttr(INodeWithAdditionalFields root) {                                                           
345        final Quota.Counts q = root.getQuotaCounts();
346        final long nsQuota = q.get(Quota.NAMESPACE);
347        final long dsQuota = q.get(Quota.DISKSPACE);
348        FSDirectory fsDir = namesystem.dir;
349        if (nsQuota != -1 || dsQuota != -1) {
350          fsDir.rootDir.getDirectoryWithQuotaFeature().setQuota(nsQuota, dsQuota);
351        }
352        fsDir.rootDir.cloneModificationTime(root);
353        fsDir.rootDir.clonePermissionStatus(root);    
354      }
355      
356        /**
357         * Load fsimage files when 1) only local names are stored, 
358         * and 2) snapshot is supported.
359         * 
360         * @param numFiles number of files expected to be read
361         * @param in Image input stream
362         * @param counter Counter to increment for namenode startup progress
363         */
364        private void loadLocalNameINodesWithSnapshot(long numFiles, DataInput in,
365            Counter counter) throws IOException {
366          assert NameNodeLayoutVersion.supports(
367              LayoutVersion.Feature.FSIMAGE_NAME_OPTIMIZATION, getLayoutVersion());
368          assert NameNodeLayoutVersion.supports(
369              LayoutVersion.Feature.SNAPSHOT, getLayoutVersion());
370          
371          // load root
372          loadRoot(in, counter);
373          // load rest of the nodes recursively
374          loadDirectoryWithSnapshot(in, counter);
375        }
376        
377      /** 
378       * load fsimage files assuming only local names are stored. Used when
379       * snapshots are not supported by the layout version.
380       *   
381       * @param numFiles number of files expected to be read
382       * @param in image input stream
383       * @param counter Counter to increment for namenode startup progress
384       * @throws IOException
385       */  
386       private void loadLocalNameINodes(long numFiles, DataInput in, Counter counter)
387           throws IOException {
388         assert NameNodeLayoutVersion.supports(
389             LayoutVersion.Feature.FSIMAGE_NAME_OPTIMIZATION, getLayoutVersion());
390         assert numFiles > 0;
391    
392         // load root
393         loadRoot(in, counter);
394         // have loaded the first file (the root)
395         numFiles--; 
396    
397         // load rest of the nodes directory by directory
398         while (numFiles > 0) {
399           numFiles -= loadDirectory(in, counter);
400         }
401         if (numFiles != 0) {
402           throw new IOException("Read unexpect number of files: " + -numFiles);
403         }
404       }
405       
406        /**
407         * Load information about root, and use the information to update the root
408         * directory of NameSystem.
409         * @param in The {@link DataInput} instance to read.
410         * @param counter Counter to increment for namenode startup progress
411         */
412        private void loadRoot(DataInput in, Counter counter)
413            throws IOException {
414          // load root
415          if (in.readShort() != 0) {
416            throw new IOException("First node is not root");
417          }
418          final INodeDirectory root = loadINode(null, false, in, counter)
419            .asDirectory();
420          // update the root's attributes
421          updateRootAttr(root);
422        }
423       
424        /** Load children nodes for the parent directory. */
425        private int loadChildren(INodeDirectory parent, DataInput in,
426            Counter counter) throws IOException {
427          int numChildren = in.readInt();
428          for (int i = 0; i < numChildren; i++) {
429            // load single inode
430            INode newNode = loadINodeWithLocalName(false, in, true, counter);
431            addToParent(parent, newNode);
432          }
433          return numChildren;
434        }
435        
436        /**
437         * Load a directory when snapshot is supported.
438         * @param in The {@link DataInput} instance to read.
439         * @param counter Counter to increment for namenode startup progress
440         */
441        private void loadDirectoryWithSnapshot(DataInput in, Counter counter)
442            throws IOException {
443          // Step 1. Identify the parent INode
444          long inodeId = in.readLong();
445          final INodeDirectory parent = this.namesystem.dir.getInode(inodeId)
446              .asDirectory();
447          
448          // Check if the whole subtree has been saved (for reference nodes)
449          boolean toLoadSubtree = referenceMap.toProcessSubtree(parent.getId());
450          if (!toLoadSubtree) {
451            return;
452          }
453          
454          // Step 2. Load snapshots if parent is snapshottable
455          int numSnapshots = in.readInt();
456          if (numSnapshots >= 0) {
457            final INodeDirectorySnapshottable snapshottableParent
458                = INodeDirectorySnapshottable.valueOf(parent, parent.getLocalName());
459            // load snapshots and snapshotQuota
460            SnapshotFSImageFormat.loadSnapshotList(snapshottableParent,
461                numSnapshots, in, this);
462            if (snapshottableParent.getSnapshotQuota() > 0) {
463              // add the directory to the snapshottable directory list in 
464              // SnapshotManager. Note that we only add root when its snapshot quota
465              // is positive.
466              this.namesystem.getSnapshotManager().addSnapshottable(
467                  snapshottableParent);
468            }
469          }
470    
471          // Step 3. Load children nodes under parent
472          loadChildren(parent, in, counter);
473          
474          // Step 4. load Directory Diff List
475          SnapshotFSImageFormat.loadDirectoryDiffList(parent, in, this);
476          
477          // Recursively load sub-directories, including snapshot copies of deleted
478          // directories
479          int numSubTree = in.readInt();
480          for (int i = 0; i < numSubTree; i++) {
481            loadDirectoryWithSnapshot(in, counter);
482          }
483        }
484        
485       /**
486        * Load all children of a directory
487        * 
488        * @param in
489        * @param counter Counter to increment for namenode startup progress
490        * @return number of child inodes read
491        * @throws IOException
492        */
493       private int loadDirectory(DataInput in, Counter counter) throws IOException {
494         String parentPath = FSImageSerialization.readString(in);
495         // Rename .snapshot paths if we're doing an upgrade
496         parentPath = renameReservedPathsOnUpgrade(parentPath, getLayoutVersion());
497         final INodeDirectory parent = INodeDirectory.valueOf(
498             namesystem.dir.rootDir.getNode(parentPath, true), parentPath);
499         return loadChildren(parent, in, counter);
500       }
501    
502      /**
503       * load fsimage files assuming full path names are stored
504       * 
505       * @param numFiles total number of files to load
506       * @param in data input stream
507       * @param counter Counter to increment for namenode startup progress
508       * @throws IOException if any error occurs
509       */
510      private void loadFullNameINodes(long numFiles, DataInput in, Counter counter)
511          throws IOException {
512        byte[][] pathComponents;
513        byte[][] parentPath = {{}};      
514        FSDirectory fsDir = namesystem.dir;
515        INodeDirectory parentINode = fsDir.rootDir;
516        for (long i = 0; i < numFiles; i++) {
517          pathComponents = FSImageSerialization.readPathComponents(in);
518          final INode newNode = loadINode(
519              pathComponents[pathComponents.length-1], false, in, counter);
520    
521          if (isRoot(pathComponents)) { // it is the root
522            // update the root's attributes
523            updateRootAttr(newNode.asDirectory());
524            continue;
525          }
526    
527          namesystem.dir.addToInodeMap(newNode);
528          // check if the new inode belongs to the same parent
529          if(!isParent(pathComponents, parentPath)) {
530            parentINode = getParentINodeDirectory(pathComponents);
531            parentPath = getParent(pathComponents);
532          }
533    
534          // add new inode
535          addToParent(parentINode, newNode);
536        }
537      }
538    
539      private INodeDirectory getParentINodeDirectory(byte[][] pathComponents
540          ) throws FileNotFoundException, PathIsNotDirectoryException,
541          UnresolvedLinkException {
542        if (pathComponents.length < 2) { // root
543          return null;
544        }
545        // Gets the parent INode
546        final INodesInPath inodes = namesystem.dir.getExistingPathINodes(
547            pathComponents);
548        return INodeDirectory.valueOf(inodes.getINode(-2), pathComponents);
549      }
550    
551      /**
552       * Add the child node to parent and, if child is a file, update block map.
553       * This method is only used for image loading so that synchronization,
554       * modification time update and space count update are not needed.
555       */
556      private void addToParent(INodeDirectory parent, INode child) {
557        FSDirectory fsDir = namesystem.dir;
558        if (parent == fsDir.rootDir) {
559            child.setLocalName(renameReservedRootComponentOnUpgrade(
560                child.getLocalNameBytes(), getLayoutVersion()));
561        }
562        // NOTE: This does not update space counts for parents
563        if (!parent.addChild(child)) {
564          return;
565        }
566        namesystem.dir.cacheName(child);
567    
568        if (child.isFile()) {
569          updateBlocksMap(child.asFile());
570        }
571      }
572    
573        public void updateBlocksMap(INodeFile file) {
574          // Add file->block mapping
575          final BlockInfo[] blocks = file.getBlocks();
576          if (blocks != null) {
577            final BlockManager bm = namesystem.getBlockManager();
578            for (int i = 0; i < blocks.length; i++) {
579              file.setBlock(i, bm.addBlockCollection(blocks[i], file));
580            } 
581          }
582        }
583    
584        public INode loadINodeWithLocalName(boolean isSnapshotINode, DataInput in,
585            boolean updateINodeMap) throws IOException {
586          return loadINodeWithLocalName(isSnapshotINode, in, updateINodeMap, null);
587        }
588    
589        public INode loadINodeWithLocalName(boolean isSnapshotINode,
590            DataInput in, boolean updateINodeMap, Counter counter)
591            throws IOException {
592          byte[] localName = FSImageSerialization.readLocalName(in);
593          localName =
594              renameReservedComponentOnUpgrade(localName, getLayoutVersion());
595          INode inode = loadINode(localName, isSnapshotINode, in, counter);
596          if (updateINodeMap) {
597            namesystem.dir.addToInodeMap(inode);
598          }
599          return inode;
600        }
601      
602      /**
603       * load an inode from fsimage except for its name
604       * 
605       * @param in data input stream from which image is read
606       * @param counter Counter to increment for namenode startup progress
607       * @return an inode
608       */
609      @SuppressWarnings("deprecation")
610      INode loadINode(final byte[] localName, boolean isSnapshotINode,
611          DataInput in, Counter counter) throws IOException {
612        final int imgVersion = getLayoutVersion();
613        if (NameNodeLayoutVersion.supports(
614            LayoutVersion.Feature.SNAPSHOT, imgVersion)) {
615          namesystem.getFSDirectory().verifyINodeName(localName);
616        }
617    
618        long inodeId = NameNodeLayoutVersion.supports(
619            LayoutVersion.Feature.ADD_INODE_ID, imgVersion) ? in.readLong()
620            : namesystem.allocateNewInodeId();
621        
622        final short replication = namesystem.getBlockManager().adjustReplication(
623            in.readShort());
624        final long modificationTime = in.readLong();
625        long atime = 0;
626        if (NameNodeLayoutVersion.supports(
627            LayoutVersion.Feature.FILE_ACCESS_TIME, imgVersion)) {
628          atime = in.readLong();
629        }
630        final long blockSize = in.readLong();
631        final int numBlocks = in.readInt();
632    
633        if (numBlocks >= 0) {
634          // file
635          
636          // read blocks
637          BlockInfo[] blocks = new BlockInfo[numBlocks];
638          for (int j = 0; j < numBlocks; j++) {
639            blocks[j] = new BlockInfo(replication);
640            blocks[j].readFields(in);
641          }
642    
643          String clientName = "";
644          String clientMachine = "";
645          boolean underConstruction = false;
646          FileDiffList fileDiffs = null;
647          if (NameNodeLayoutVersion.supports(
648              LayoutVersion.Feature.SNAPSHOT, imgVersion)) {
649            // read diffs
650            fileDiffs = SnapshotFSImageFormat.loadFileDiffList(in, this);
651    
652            if (isSnapshotINode) {
653              underConstruction = in.readBoolean();
654              if (underConstruction) {
655                clientName = FSImageSerialization.readString(in);
656                clientMachine = FSImageSerialization.readString(in);
657                // convert the last block to BlockUC
658                if (blocks != null && blocks.length > 0) {
659                  BlockInfo lastBlk = blocks[blocks.length - 1]; 
660                  blocks[blocks.length - 1] = new BlockInfoUnderConstruction(
661                      lastBlk, replication);
662                }
663              }
664            }
665          }
666    
667          final PermissionStatus permissions = PermissionStatus.read(in);
668    
669          // return
670          if (counter != null) {
671            counter.increment();
672          }
673          final INodeFile file = new INodeFile(inodeId, localName, permissions,
674              modificationTime, atime, blocks, replication, blockSize);
675          if (underConstruction) {
676            file.toUnderConstruction(clientName, clientMachine, null);
677          }
678            return fileDiffs == null ? file : new INodeFile(file, fileDiffs);
679          } else if (numBlocks == -1) {
680            //directory
681          
682          //read quotas
683          final long nsQuota = in.readLong();
684          long dsQuota = -1L;
685          if (NameNodeLayoutVersion.supports(
686              LayoutVersion.Feature.DISKSPACE_QUOTA, imgVersion)) {
687            dsQuota = in.readLong();
688          }
689    
690          //read snapshot info
691          boolean snapshottable = false;
692          boolean withSnapshot = false;
693          if (NameNodeLayoutVersion.supports(
694              LayoutVersion.Feature.SNAPSHOT, imgVersion)) {
695            snapshottable = in.readBoolean();
696            if (!snapshottable) {
697              withSnapshot = in.readBoolean();
698            }
699          }
700    
701          final PermissionStatus permissions = PermissionStatus.read(in);
702    
703          //return
704          if (counter != null) {
705            counter.increment();
706          }
707          final INodeDirectory dir = new INodeDirectory(inodeId, localName,
708              permissions, modificationTime);
709          if (nsQuota >= 0 || dsQuota >= 0) {
710            dir.addDirectoryWithQuotaFeature(nsQuota, dsQuota);
711          }
712          if (withSnapshot) {
713            dir.addSnapshotFeature(null);
714          }
715          return snapshottable ? new INodeDirectorySnapshottable(dir) : dir;
716        } else if (numBlocks == -2) {
717          //symlink
718          if (!FileSystem.areSymlinksEnabled()) {
719            throw new IOException("Symlinks not supported - please remove symlink before upgrading to this version of HDFS");
720          }
721    
722          final String symlink = Text.readString(in);
723          final PermissionStatus permissions = PermissionStatus.read(in);
724          if (counter != null) {
725            counter.increment();
726          }
727          return new INodeSymlink(inodeId, localName, permissions,
728              modificationTime, atime, symlink);
729        } else if (numBlocks == -3) {
730          //reference
731          // Intentionally do not increment counter, because it is too difficult at
732          // this point to assess whether or not this is a reference that counts
733          // toward quota.
734          
735          final boolean isWithName = in.readBoolean();
736          // lastSnapshotId for WithName node, dstSnapshotId for DstReference node
737          int snapshotId = in.readInt();
738          
739          final INodeReference.WithCount withCount
740              = referenceMap.loadINodeReferenceWithCount(isSnapshotINode, in, this);
741    
742          if (isWithName) {
743              return new INodeReference.WithName(null, withCount, localName,
744                  snapshotId);
745          } else {
746            final INodeReference ref = new INodeReference.DstReference(null,
747                withCount, snapshotId);
748            return ref;
749          }
750        }
751        
752        throw new IOException("Unknown inode type: numBlocks=" + numBlocks);
753      }
754    
755        /** Load {@link INodeFileAttributes}. */
756        public INodeFileAttributes loadINodeFileAttributes(DataInput in)
757            throws IOException {
758          final int layoutVersion = getLayoutVersion();
759          
760          if (!NameNodeLayoutVersion.supports(
761              LayoutVersion.Feature.OPTIMIZE_SNAPSHOT_INODES, layoutVersion)) {
762            return loadINodeWithLocalName(true, in, false).asFile();
763          }
764      
765          final byte[] name = FSImageSerialization.readLocalName(in);
766          final PermissionStatus permissions = PermissionStatus.read(in);
767          final long modificationTime = in.readLong();
768          final long accessTime = in.readLong();
769      
770          final short replication = namesystem.getBlockManager().adjustReplication(
771              in.readShort());
772          final long preferredBlockSize = in.readLong();
773    
774          return new INodeFileAttributes.SnapshotCopy(name, permissions, null, modificationTime,
775              accessTime, replication, preferredBlockSize);
776        }
777    
778        public INodeDirectoryAttributes loadINodeDirectoryAttributes(DataInput in)
779            throws IOException {
780          final int layoutVersion = getLayoutVersion();
781          
782          if (!NameNodeLayoutVersion.supports(
783              LayoutVersion.Feature.OPTIMIZE_SNAPSHOT_INODES, layoutVersion)) {
784            return loadINodeWithLocalName(true, in, false).asDirectory();
785          }
786      
787          final byte[] name = FSImageSerialization.readLocalName(in);
788          final PermissionStatus permissions = PermissionStatus.read(in);
789          final long modificationTime = in.readLong();
790          
791          //read quotas
792          final long nsQuota = in.readLong();
793          final long dsQuota = in.readLong();
794      
795          return nsQuota == -1L && dsQuota == -1L?
796              new INodeDirectoryAttributes.SnapshotCopy(name, permissions, null, modificationTime)
797            : new INodeDirectoryAttributes.CopyWithQuota(name, permissions,
798                null, modificationTime, nsQuota, dsQuota);
799        }
800      
801        private void loadFilesUnderConstruction(DataInput in,
802            boolean supportSnapshot, Counter counter) throws IOException {
803          FSDirectory fsDir = namesystem.dir;
804          int size = in.readInt();
805    
806          LOG.info("Number of files under construction = " + size);
807    
808          for (int i = 0; i < size; i++) {
809            INodeFile cons = FSImageSerialization.readINodeUnderConstruction(in,
810                namesystem, getLayoutVersion());
811            counter.increment();
812    
813            // verify that file exists in namespace
814            String path = cons.getLocalName();
815            INodeFile oldnode = null;
816            boolean inSnapshot = false;
817            if (path != null && FSDirectory.isReservedName(path) && 
818                NameNodeLayoutVersion.supports(
819                    LayoutVersion.Feature.ADD_INODE_ID, getLayoutVersion())) {
820              // TODO: for HDFS-5428, we use reserved path for those INodeFileUC in
821              // snapshot. If we support INode ID in the layout version, we can use
822              // the inode id to find the oldnode.
823              oldnode = namesystem.dir.getInode(cons.getId()).asFile();
824              inSnapshot = true;
825            } else {
826              final INodesInPath iip = fsDir.getLastINodeInPath(path);
827              oldnode = INodeFile.valueOf(iip.getINode(0), path);
828            }
829    
830            FileUnderConstructionFeature uc = cons.getFileUnderConstructionFeature();
831            oldnode.toUnderConstruction(uc.getClientName(), uc.getClientMachine(),
832                uc.getClientNode());
833            if (oldnode.numBlocks() > 0) {
834              BlockInfo ucBlock = cons.getLastBlock();
835              // we do not replace the inode, just replace the last block of oldnode
836              BlockInfo info = namesystem.getBlockManager().addBlockCollection(
837                  ucBlock, oldnode);
838              oldnode.setBlock(oldnode.numBlocks() - 1, info);
839            }
840    
841            if (!inSnapshot) {
842              namesystem.leaseManager.addLease(cons
843                  .getFileUnderConstructionFeature().getClientName(), path);
844            }
845          }
846        }
847    
848        private void loadSecretManagerState(DataInput in)
849            throws IOException {
850          int imgVersion = getLayoutVersion();
851    
852          if (!NameNodeLayoutVersion.supports(
853              LayoutVersion.Feature.DELEGATION_TOKEN, imgVersion)) {
854            //SecretManagerState is not available.
855            //This must not happen if security is turned on.
856            return; 
857          }
858          namesystem.loadSecretManagerStateCompat(in);
859        }
860    
861        private void loadCacheManagerState(DataInput in) throws IOException {
862          int imgVersion = getLayoutVersion();
863          if (!NameNodeLayoutVersion.supports(
864              LayoutVersion.Feature.CACHING, imgVersion)) {
865            return;
866          }
867          namesystem.getCacheManager().loadStateCompat(in);
868        }
869    
870        private int getLayoutVersion() {
871          return namesystem.getFSImage().getStorage().getLayoutVersion();
872        }
873    
874        private boolean isRoot(byte[][] path) {
875          return path.length == 1 &&
876            path[0] == null;    
877        }
878    
879        private boolean isParent(byte[][] path, byte[][] parent) {
880          if (path == null || parent == null)
881            return false;
882          if (parent.length == 0 || path.length != parent.length + 1)
883            return false;
884          boolean isParent = true;
885          for (int i = 0; i < parent.length; i++) {
886            isParent = isParent && Arrays.equals(path[i], parent[i]); 
887          }
888          return isParent;
889        }
890    
891        /**
892         * Return string representing the parent of the given path.
893         */
894        String getParent(String path) {
895          return path.substring(0, path.lastIndexOf(Path.SEPARATOR));
896        }
897        
898        byte[][] getParent(byte[][] path) {
899          byte[][] result = new byte[path.length - 1][];
900          for (int i = 0; i < result.length; i++) {
901            result[i] = new byte[path[i].length];
902            System.arraycopy(path[i], 0, result[i], 0, path[i].length);
903          }
904          return result;
905        }
906        
907        public Snapshot getSnapshot(DataInput in) throws IOException {
908          return snapshotMap.get(in.readInt());
909        }
910      }
911    
912      @VisibleForTesting
913      public static final TreeMap<String, String> renameReservedMap =
914          new TreeMap<String, String>();
915    
916      /**
917       * Use the default key-value pairs that will be used to determine how to
918       * rename reserved paths on upgrade.
919       */
920      @VisibleForTesting
921      public static void useDefaultRenameReservedPairs() {
922        renameReservedMap.clear();
923        for (String key: HdfsConstants.RESERVED_PATH_COMPONENTS) {
924          renameReservedMap.put(
925              key,
926              key + "." + HdfsConstants.NAMENODE_LAYOUT_VERSION + "."
927                  + "UPGRADE_RENAMED");
928        }
929      }
930    
931      /**
932       * Set the key-value pairs that will be used to determine how to rename
933       * reserved paths on upgrade.
934       */
935      @VisibleForTesting
936      public static void setRenameReservedPairs(String renameReserved) {
937        // Clear and set the default values
938        useDefaultRenameReservedPairs();
939        // Overwrite with provided values
940        setRenameReservedMapInternal(renameReserved);
941      }
942    
943      private static void setRenameReservedMapInternal(String renameReserved) {
944        Collection<String> pairs =
945            StringUtils.getTrimmedStringCollection(renameReserved);
946        for (String p : pairs) {
947          String[] pair = StringUtils.split(p, '/', '=');
948          Preconditions.checkArgument(pair.length == 2,
949              "Could not parse key-value pair " + p);
950          String key = pair[0];
951          String value = pair[1];
952          Preconditions.checkArgument(DFSUtil.isReservedPathComponent(key),
953              "Unknown reserved path " + key);
954          Preconditions.checkArgument(DFSUtil.isValidNameForComponent(value),
955              "Invalid rename path for " + key + ": " + value);
956          LOG.info("Will rename reserved path " + key + " to " + value);
957          renameReservedMap.put(key, value);
958        }
959      }
960    
961      /**
962       * When upgrading from an old version, the filesystem could contain paths
963       * that are now reserved in the new version (e.g. .snapshot). This renames
964       * these new reserved paths to a user-specified value to avoid collisions
965       * with the reserved name.
966       * 
967       * @param path Old path potentially containing a reserved path
968       * @return New path with reserved path components renamed to user value
969       */
970      static String renameReservedPathsOnUpgrade(String path,
971          final int layoutVersion) {
972        final String oldPath = path;
973        // If any known LVs aren't supported, we're doing an upgrade
974        if (!NameNodeLayoutVersion.supports(Feature.ADD_INODE_ID, layoutVersion)) {
975          String[] components = INode.getPathNames(path);
976          // Only need to worry about the root directory
977          if (components.length > 1) {
978            components[1] = DFSUtil.bytes2String(
979                renameReservedRootComponentOnUpgrade(
980                    DFSUtil.string2Bytes(components[1]),
981                    layoutVersion));
982            path = DFSUtil.strings2PathString(components);
983          }
984        }
985        if (!NameNodeLayoutVersion.supports(Feature.SNAPSHOT, layoutVersion)) {
986          String[] components = INode.getPathNames(path);
987          // Special case the root path
988          if (components.length == 0) {
989            return path;
990          }
991          for (int i=0; i<components.length; i++) {
992            components[i] = DFSUtil.bytes2String(
993                renameReservedComponentOnUpgrade(
994                    DFSUtil.string2Bytes(components[i]),
995                    layoutVersion));
996          }
997          path = DFSUtil.strings2PathString(components);
998        }
999    
1000        if (!path.equals(oldPath)) {
1001          LOG.info("Upgrade process renamed reserved path " + oldPath + " to "
1002              + path);
1003        }
1004        return path;
1005      }
1006    
1007      private final static String RESERVED_ERROR_MSG = 
1008          FSDirectory.DOT_RESERVED_PATH_PREFIX + " is a reserved path and "
1009          + HdfsConstants.DOT_SNAPSHOT_DIR + " is a reserved path component in"
1010          + " this version of HDFS. Please rollback and delete or rename"
1011          + " this path, or upgrade with the "
1012          + StartupOption.RENAMERESERVED.getName()
1013          + " [key-value pairs]"
1014          + " option to automatically rename these paths during upgrade.";
1015    
1016      /**
1017       * Same as {@link #renameReservedPathsOnUpgrade}, but for a single
1018       * byte array path component.
1019       */
1020      private static byte[] renameReservedComponentOnUpgrade(byte[] component,
1021          final int layoutVersion) {
1022        // If the LV doesn't support snapshots, we're doing an upgrade
1023        if (!NameNodeLayoutVersion.supports(Feature.SNAPSHOT, layoutVersion)) {
1024          if (Arrays.equals(component, HdfsConstants.DOT_SNAPSHOT_DIR_BYTES)) {
1025            Preconditions.checkArgument(
1026                renameReservedMap != null &&
1027                renameReservedMap.containsKey(HdfsConstants.DOT_SNAPSHOT_DIR),
1028                RESERVED_ERROR_MSG);
1029            component =
1030                DFSUtil.string2Bytes(renameReservedMap
1031                    .get(HdfsConstants.DOT_SNAPSHOT_DIR));
1032          }
1033        }
1034        return component;
1035      }
1036    
1037      /**
1038       * Same as {@link #renameReservedPathsOnUpgrade}, but for a single
1039       * byte array path component.
1040       */
1041      private static byte[] renameReservedRootComponentOnUpgrade(byte[] component,
1042          final int layoutVersion) {
1043        // If the LV doesn't support inode IDs, we're doing an upgrade
1044        if (!NameNodeLayoutVersion.supports(Feature.ADD_INODE_ID, layoutVersion)) {
1045          if (Arrays.equals(component, FSDirectory.DOT_RESERVED)) {
1046            Preconditions.checkArgument(
1047                renameReservedMap != null &&
1048                renameReservedMap.containsKey(FSDirectory.DOT_RESERVED_STRING),
1049                RESERVED_ERROR_MSG);
1050            final String renameString = renameReservedMap
1051                .get(FSDirectory.DOT_RESERVED_STRING);
1052            component =
1053                DFSUtil.string2Bytes(renameString);
1054            LOG.info("Renamed root path " + FSDirectory.DOT_RESERVED_STRING
1055                + " to " + renameString);
1056          }
1057        }
1058        return component;
1059      }
1060    }