001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    package org.apache.hadoop.hdfs.server.namenode;
019    
020    import static org.apache.hadoop.util.Time.now;
021    
022    import java.io.DataInput;
023    import java.io.DataInputStream;
024    import java.io.DataOutputStream;
025    import java.io.File;
026    import java.io.FileInputStream;
027    import java.io.FileNotFoundException;
028    import java.io.FileOutputStream;
029    import java.io.IOException;
030    import java.security.DigestInputStream;
031    import java.security.DigestOutputStream;
032    import java.security.MessageDigest;
033    import java.util.ArrayList;
034    import java.util.Arrays;
035    import java.util.List;
036    import java.util.Map;
037    
038    import org.apache.commons.logging.Log;
039    import org.apache.hadoop.HadoopIllegalArgumentException;
040    import org.apache.hadoop.classification.InterfaceAudience;
041    import org.apache.hadoop.classification.InterfaceStability;
042    import org.apache.hadoop.conf.Configuration;
043    import org.apache.hadoop.fs.FileSystem;
044    import org.apache.hadoop.fs.Path;
045    import org.apache.hadoop.fs.PathIsNotDirectoryException;
046    import org.apache.hadoop.fs.UnresolvedLinkException;
047    import org.apache.hadoop.fs.permission.PermissionStatus;
048    import org.apache.hadoop.hdfs.protocol.HdfsConstants;
049    import org.apache.hadoop.hdfs.protocol.LayoutVersion;
050    import org.apache.hadoop.hdfs.protocol.LayoutVersion.Feature;
051    import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfo;
052    import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;
053    import org.apache.hadoop.hdfs.server.common.InconsistentFSStateException;
054    import org.apache.hadoop.hdfs.server.namenode.snapshot.FileWithSnapshot.FileDiffList;
055    import org.apache.hadoop.hdfs.server.namenode.snapshot.INodeDirectorySnapshottable;
056    import org.apache.hadoop.hdfs.server.namenode.snapshot.INodeDirectoryWithSnapshot;
057    import org.apache.hadoop.hdfs.server.namenode.snapshot.INodeFileUnderConstructionWithSnapshot;
058    import org.apache.hadoop.hdfs.server.namenode.snapshot.INodeFileWithSnapshot;
059    import org.apache.hadoop.hdfs.server.namenode.snapshot.Snapshot;
060    import org.apache.hadoop.hdfs.server.namenode.snapshot.SnapshotFSImageFormat;
061    import org.apache.hadoop.hdfs.server.namenode.snapshot.SnapshotFSImageFormat.ReferenceMap;
062    import org.apache.hadoop.hdfs.server.namenode.startupprogress.Phase;
063    import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress;
064    import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress.Counter;
065    import org.apache.hadoop.hdfs.server.namenode.startupprogress.Step;
066    import org.apache.hadoop.hdfs.server.namenode.startupprogress.StepType;
067    import org.apache.hadoop.hdfs.util.ReadOnlyList;
068    import org.apache.hadoop.io.MD5Hash;
069    import org.apache.hadoop.io.Text;
070    
071    /**
072     * Contains inner classes for reading or writing the on-disk format for
073     * FSImages.
074     * 
075     * In particular, the format of the FSImage looks like:
076     * <pre>
077     * FSImage {
078     *   layoutVersion: int, namespaceID: int, numberItemsInFSDirectoryTree: long,
079     *   namesystemGenerationStampV1: long, namesystemGenerationStampV2: long,
 *   generationStampAtBlockIdSwitch: long, lastAllocatedBlockId: long,
 *   transactionID: long, snapshotCounter: int, numberOfSnapshots: int,
082     *   numOfSnapshottableDirs: int,
083     *   {FSDirectoryTree, FilesUnderConstruction, SecretManagerState} (can be compressed)
084     * }
085     * 
086     * FSDirectoryTree (if {@link Feature#FSIMAGE_NAME_OPTIMIZATION} is supported) {
087     *   INodeInfo of root, numberOfChildren of root: int
088     *   [list of INodeInfo of root's children],
089     *   [list of INodeDirectoryInfo of root's directory children]
090     * }
091     * 
092     * FSDirectoryTree (if {@link Feature#FSIMAGE_NAME_OPTIMIZATION} not supported){
093     *   [list of INodeInfo of INodes in topological order]
094     * }
095     * 
096     * INodeInfo {
097     *   {
098     *     localName: short + byte[]
099     *   } when {@link Feature#FSIMAGE_NAME_OPTIMIZATION} is supported
100     *   or 
101     *   {
102     *     fullPath: byte[]
103     *   } when {@link Feature#FSIMAGE_NAME_OPTIMIZATION} is not supported
104     *   replicationFactor: short, modificationTime: long,
105     *   accessTime: long, preferredBlockSize: long,
106     *   numberOfBlocks: int (-1 for INodeDirectory, -2 for INodeSymLink),
107     *   { 
108     *     nsQuota: long, dsQuota: long, 
109     *     {
110     *       isINodeSnapshottable: byte,
111     *       isINodeWithSnapshot: byte (if isINodeSnapshottable is false)
112     *     } (when {@link Feature#SNAPSHOT} is supported), 
113     *     fsPermission: short, PermissionStatus
114     *   } for INodeDirectory
115     *   or 
116     *   {
117     *     symlinkString, fsPermission: short, PermissionStatus
118     *   } for INodeSymlink
119     *   or
120     *   {
121     *     [list of BlockInfo]
122     *     [list of FileDiff]
123     *     {
124     *       isINodeFileUnderConstructionSnapshot: byte, 
125     *       {clientName: short + byte[], clientMachine: short + byte[]} (when 
126     *       isINodeFileUnderConstructionSnapshot is true),
127     *     } (when {@link Feature#SNAPSHOT} is supported and writing snapshotINode), 
128     *     fsPermission: short, PermissionStatus
129     *   } for INodeFile
130     * }
131     * 
132     * INodeDirectoryInfo {
133     *   fullPath of the directory: short + byte[],
134     *   numberOfChildren: int, [list of INodeInfo of children INode],
135     *   {
136     *     numberOfSnapshots: int,
137     *     [list of Snapshot] (when NumberOfSnapshots is positive),
138     *     numberOfDirectoryDiffs: int,
 *     [list of DirectoryDiff] (when numberOfDirectoryDiffs is positive),
140     *     number of children that are directories,
141     *     [list of INodeDirectoryInfo of the directory children] (includes
142     *     snapshot copies of deleted sub-directories)
143     *   } (when {@link Feature#SNAPSHOT} is supported), 
144     * }
145     * 
146     * Snapshot {
147     *   snapshotID: int, root of Snapshot: INodeDirectoryInfo (its local name is 
148     *   the name of the snapshot)
149     * }
150     * 
151     * DirectoryDiff {
152     *   full path of the root of the associated Snapshot: short + byte[], 
153     *   childrenSize: int, 
154     *   isSnapshotRoot: byte, 
155     *   snapshotINodeIsNotNull: byte (when isSnapshotRoot is false),
156     *   snapshotINode: INodeDirectory (when SnapshotINodeIsNotNull is true), Diff 
157     * }
158     * 
159     * Diff {
160     *   createdListSize: int, [Local name of INode in created list],
161     *   deletedListSize: int, [INode in deleted list: INodeInfo]
162     * }
163     *
164     * FileDiff {
165     *   full path of the root of the associated Snapshot: short + byte[], 
166     *   fileSize: long, 
167     *   snapshotINodeIsNotNull: byte,
168     *   snapshotINode: INodeFile (when SnapshotINodeIsNotNull is true), Diff 
169     * }
170     * </pre>
171     */
172    @InterfaceAudience.Private
173    @InterfaceStability.Evolving
174    public class FSImageFormat {
  /** Shared logger; reuses FSImage's Log so load/save messages appear together. */
  private static final Log LOG = FSImage.LOG;
  
  // Static-only class: all functionality lives in the nested Loader (and, in
  // the rest of this file, Saver) classes, so instantiation is forbidden.
  private FSImageFormat() {}
179      
180      /**
181       * A one-shot class responsible for loading an image. The load() function
182       * should be called once, after which the getter methods may be used to retrieve
183       * information about the image that was loaded, if loading was successful.
184       */
185      public static class Loader {
    /** Configuration, consulted e.g. when reading the compression header. */
    private final Configuration conf;
    /** which namesystem this loader is working for */
    private final FSNamesystem namesystem;

    /** Set to true once a file has been loaded using this loader. */
    private boolean loaded = false;

    /** The transaction ID of the last edit represented by the loaded file */
    private long imgTxId;
    /** The MD5 sum of the loaded file */
    private MD5Hash imgDigest;
    
    /** Snapshots by id; populated only when the image supports snapshots. */
    private Map<Integer, Snapshot> snapshotMap = null;
    /** Tracks INodeReference nodes so shared subtrees are loaded only once. */
    private final ReferenceMap referenceMap = new ReferenceMap();
200    
    /**
     * Create a loader bound to the given namesystem.
     *
     * @param conf configuration used while reading the image (e.g. to
     *             resolve the compression codec)
     * @param namesystem the namesystem to populate from the image
     */
    Loader(Configuration conf, FSNamesystem namesystem) {
      this.conf = conf;
      this.namesystem = namesystem;
    }
205    
206        /**
207         * Return the MD5 checksum of the image that has been loaded.
208         * @throws IllegalStateException if load() has not yet been called.
209         */
210        MD5Hash getLoadedImageMd5() {
211          checkLoaded();
212          return imgDigest;
213        }
214    
215        long getLoadedImageTxId() {
216          checkLoaded();
217          return imgTxId;
218        }
219    
220        /**
221         * Throw IllegalStateException if load() has not yet been called.
222         */
223        private void checkLoaded() {
224          if (!loaded) {
225            throw new IllegalStateException("Image not yet loaded!");
226          }
227        }
228    
229        /**
230         * Throw IllegalStateException if load() has already been called.
231         */
232        private void checkNotLoaded() {
233          if (loaded) {
234            throw new IllegalStateException("Image already loaded!");
235          }
236        }
237    
    /**
     * Read an fsimage file from disk and populate the namesystem from it.
     * May be called at most once per Loader; afterwards the getters expose
     * the loaded image's transaction id and MD5 digest.
     *
     * The read order below must match the on-disk layout described in the
     * class javadoc exactly; every readInt/readLong consumes a field.
     *
     * @param curFile the fsimage file to load
     * @throws IOException if the file is corrupt or has an unexpected
     *         layout version
     */
    void load(File curFile) throws IOException {
      checkNotLoaded();
      assert curFile != null : "curFile is null";

      StartupProgress prog = NameNode.getStartupProgress();
      Step step = new Step(StepType.INODES);
      prog.beginStep(Phase.LOADING_FSIMAGE, step);
      long startTime = now();

      //
      // Load in bits
      //
      // Every byte of the raw file flows through the digester so the MD5
      // can be compared against the stored image checksum afterwards.
      MessageDigest digester = MD5Hash.getDigester();
      DigestInputStream fin = new DigestInputStream(
           new FileInputStream(curFile), digester);

      DataInputStream in = new DataInputStream(fin);
      try {
        // read image version: first appeared in version -1
        int imgVersion = in.readInt();
        if (getLayoutVersion() != imgVersion) {
          throw new InconsistentFSStateException(curFile, 
              "imgVersion " + imgVersion +
              " expected to be " + getLayoutVersion());
        }
        boolean supportSnapshot = LayoutVersion.supports(Feature.SNAPSHOT,
            imgVersion);

        // read namespaceID: first appeared in version -2
        // (value is discarded here; only the field must be consumed)
        in.readInt();

        long numFiles = in.readLong();

        // read in the last generation stamp for legacy blocks.
        long genstamp = in.readLong();
        namesystem.setGenerationStampV1(genstamp);
        
        if (LayoutVersion.supports(Feature.SEQUENTIAL_BLOCK_ID, imgVersion)) {
          // read the starting generation stamp for sequential block IDs
          genstamp = in.readLong();
          namesystem.setGenerationStampV2(genstamp);

          // read the last generation stamp for blocks created after
          // the switch to sequential block IDs.
          long stampAtIdSwitch = in.readLong();
          namesystem.setGenerationStampV1Limit(stampAtIdSwitch);

          // read the max sequential block ID.
          long maxSequentialBlockId = in.readLong();
          namesystem.setLastAllocatedBlockId(maxSequentialBlockId);
        } else {
          long startingGenStamp = namesystem.upgradeGenerationStampToV2();
          // This is an upgrade.
          LOG.info("Upgrading to sequential block IDs. Generation stamp " +
                   "for new blocks set to " + startingGenStamp);
        }

        // read the transaction ID of the last edit represented by
        // this image
        if (LayoutVersion.supports(Feature.STORED_TXIDS, imgVersion)) {
          imgTxId = in.readLong();
        } else {
          imgTxId = 0;
        }

        // read the last allocated inode id in the fsimage
        if (LayoutVersion.supports(Feature.ADD_INODE_ID, imgVersion)) {
          long lastInodeId = in.readLong();
          namesystem.resetLastInodeId(lastInodeId);
          if (LOG.isDebugEnabled()) {
            LOG.debug("load last allocated InodeId from fsimage:" + lastInodeId);
          }
        } else {
          if (LOG.isDebugEnabled()) {
            LOG.debug("Old layout version doesn't have inode id."
                + " Will assign new id for each inode.");
          }
        }
        
        if (supportSnapshot) {
          snapshotMap = namesystem.getSnapshotManager().read(in, this);
        }

        // read compression related info
        FSImageCompression compression;
        if (LayoutVersion.supports(Feature.FSIMAGE_COMPRESSION, imgVersion)) {
          compression = FSImageCompression.readCompressionHeader(conf, in);
        } else {
          compression = FSImageCompression.createNoopCompression();
        }
        // Re-wrap the digest stream: everything from here on is read through
        // the (possibly decompressing) codec, while the digest still covers
        // the raw file bytes.
        in = compression.unwrapInputStream(fin);

        LOG.info("Loading image file " + curFile + " using " + compression);
        
        // load all inodes
        LOG.info("Number of files = " + numFiles);
        prog.setTotal(Phase.LOADING_FSIMAGE, step, numFiles);
        Counter counter = prog.getCounter(Phase.LOADING_FSIMAGE, step);
        // Dispatch on serialization style: local names (with or without
        // snapshots) vs. legacy full-path names.
        if (LayoutVersion.supports(Feature.FSIMAGE_NAME_OPTIMIZATION,
            imgVersion)) {
          if (supportSnapshot) {
            loadLocalNameINodesWithSnapshot(numFiles, in, counter);
          } else {
            loadLocalNameINodes(numFiles, in, counter);
          }
        } else {
          loadFullNameINodes(numFiles, in, counter);
        }

        loadFilesUnderConstruction(in, supportSnapshot, counter);
        prog.endStep(Phase.LOADING_FSIMAGE, step);
        // Now that the step is finished, set counter equal to total to adjust
        // for possible under-counting due to reference inodes.
        prog.setCount(Phase.LOADING_FSIMAGE, step, numFiles);

        loadSecretManagerState(in);

        // make sure to read to the end of file
        boolean eof = (in.read() == -1);
        assert eof : "Should have reached the end of image file " + curFile;
      } finally {
        // Closing the outermost stream also closes fin/FileInputStream.
        in.close();
      }

      // The digest is finalized only after the whole file has been consumed.
      imgDigest = new MD5Hash(digester.digest());
      loaded = true;
      
      LOG.info("Image file " + curFile + " of size " + curFile.length() +
          " bytes loaded in " + (now() - startTime)/1000 + " seconds.");
    }
368    
369      /** Update the root node's attributes */
370      private void updateRootAttr(INodeWithAdditionalFields root) {                                                           
371        long nsQuota = root.getNsQuota();
372        long dsQuota = root.getDsQuota();
373        FSDirectory fsDir = namesystem.dir;
374        if (nsQuota != -1 || dsQuota != -1) {
375          fsDir.rootDir.setQuota(nsQuota, dsQuota);
376        }
377        fsDir.rootDir.cloneModificationTime(root);
378        fsDir.rootDir.clonePermissionStatus(root);    
379      }
380      
381        /**
382         * Load fsimage files when 1) only local names are stored, 
383         * and 2) snapshot is supported.
384         * 
385         * @param numFiles number of files expected to be read
386         * @param in Image input stream
387         * @param counter Counter to increment for namenode startup progress
388         */
389        private void loadLocalNameINodesWithSnapshot(long numFiles, DataInput in,
390            Counter counter) throws IOException {
391          assert LayoutVersion.supports(Feature.FSIMAGE_NAME_OPTIMIZATION,
392              getLayoutVersion());
393          assert LayoutVersion.supports(Feature.SNAPSHOT, getLayoutVersion());
394          
395          // load root
396          loadRoot(in, counter);
397          // load rest of the nodes recursively
398          loadDirectoryWithSnapshot(in, counter);
399        }
400        
401      /** 
402       * load fsimage files assuming only local names are stored
403       *   
404       * @param numFiles number of files expected to be read
405       * @param in image input stream
406       * @param counter Counter to increment for namenode startup progress
407       * @throws IOException
408       */  
409       private void loadLocalNameINodes(long numFiles, DataInput in, Counter counter)
410           throws IOException {
411         assert LayoutVersion.supports(Feature.FSIMAGE_NAME_OPTIMIZATION,
412             getLayoutVersion());
413         assert numFiles > 0;
414    
415         // load root
416         loadRoot(in, counter);
417         // have loaded the first file (the root)
418         numFiles--; 
419    
420         // load rest of the nodes directory by directory
421         while (numFiles > 0) {
422           numFiles -= loadDirectory(in, counter);
423         }
424         if (numFiles != 0) {
425           throw new IOException("Read unexpect number of files: " + -numFiles);
426         }
427       }
428       
429        /**
430         * Load information about root, and use the information to update the root
431         * directory of NameSystem.
432         * @param in The {@link DataInput} instance to read.
433         * @param counter Counter to increment for namenode startup progress
434         */
435        private void loadRoot(DataInput in, Counter counter)
436            throws IOException {
437          // load root
438          if (in.readShort() != 0) {
439            throw new IOException("First node is not root");
440          }
441          final INodeDirectory root = loadINode(null, false, in, counter)
442            .asDirectory();
443          // update the root's attributes
444          updateRootAttr(root);
445        }
446       
447        /** Load children nodes for the parent directory. */
448        private int loadChildren(INodeDirectory parent, DataInput in,
449            Counter counter) throws IOException {
450          int numChildren = in.readInt();
451          for (int i = 0; i < numChildren; i++) {
452            // load single inode
453            INode newNode = loadINodeWithLocalName(false, in, true, counter);
454            addToParent(parent, newNode);
455          }
456          return numChildren;
457        }
458        
    /**
     * Load one directory subtree when snapshot is supported.  The per-directory
     * stream layout is: inode id, snapshot count (and snapshot list when the
     * directory is snapshottable), children, directory diff list, then each
     * sub-directory recursively — matching INodeDirectoryInfo in the class
     * javadoc.
     *
     * @param in The {@link DataInput} instance to read.
     * @param counter Counter to increment for namenode startup progress
     */
    private void loadDirectoryWithSnapshot(DataInput in, Counter counter)
        throws IOException {
      // Step 1. Identify the parent INode (already created by an earlier
      // loadChildren call, so it can be looked up by id).
      long inodeId = in.readLong();
      final INodeDirectory parent = this.namesystem.dir.getInode(inodeId)
          .asDirectory();
      
      // Check if the whole subtree has been saved (for reference nodes)
      // NOTE(review): presumably referenced subtrees are serialized only
      // once and skipped on later encounters — confirm against ReferenceMap.
      boolean toLoadSubtree = referenceMap.toProcessSubtree(parent.getId());
      if (!toLoadSubtree) {
        return;
      }
      
      // Step 2. Load snapshots if parent is snapshottable; a negative count
      // means no snapshot section was written for this directory.
      int numSnapshots = in.readInt();
      if (numSnapshots >= 0) {
        final INodeDirectorySnapshottable snapshottableParent
            = INodeDirectorySnapshottable.valueOf(parent, parent.getLocalName());
        // load snapshots and snapshotQuota
        SnapshotFSImageFormat.loadSnapshotList(snapshottableParent,
            numSnapshots, in, this);
        if (snapshottableParent.getSnapshotQuota() > 0) {
          // add the directory to the snapshottable directory list in 
          // SnapshotManager. Note that we only add root when its snapshot quota
          // is positive.
          this.namesystem.getSnapshotManager().addSnapshottable(
              snapshottableParent);
        }
      }

      // Step 3. Load children nodes under parent
      loadChildren(parent, in, counter);
      
      // Step 4. load Directory Diff List
      SnapshotFSImageFormat.loadDirectoryDiffList(parent, in, this);
      
      // Recursively load sub-directories, including snapshot copies of deleted
      // directories
      int numSubTree = in.readInt();
      for (int i = 0; i < numSubTree; i++) {
        loadDirectoryWithSnapshot(in, counter);
      }
    }
507        
508       /**
509        * Load all children of a directory
510        * 
511        * @param in
512        * @param counter Counter to increment for namenode startup progress
513        * @return number of child inodes read
514        * @throws IOException
515        */
516       private int loadDirectory(DataInput in, Counter counter) throws IOException {
517         String parentPath = FSImageSerialization.readString(in);
518         final INodeDirectory parent = INodeDirectory.valueOf(
519             namesystem.dir.rootDir.getNode(parentPath, true), parentPath);
520         return loadChildren(parent, in, counter);
521       }
522    
  /**
   * load fsimage files assuming full path names are stored
   * 
   * Inodes are written in topological order (see the class javadoc), so
   * consecutive entries usually share the same parent; the most recent
   * parent is cached to avoid re-resolving it for every inode.
   *
   * @param numFiles total number of files to load
   * @param in data input stream
   * @param counter Counter to increment for namenode startup progress
   * @throws IOException if any error occurs
   */
  private void loadFullNameINodes(long numFiles, DataInput in, Counter counter)
      throws IOException {
    byte[][] pathComponents;
    byte[][] parentPath = {{}};      
    FSDirectory fsDir = namesystem.dir;
    INodeDirectory parentINode = fsDir.rootDir;
    for (long i = 0; i < numFiles; i++) {
      pathComponents = FSImageSerialization.readPathComponents(in);
      // The inode's local name is the last component of its full path.
      final INode newNode = loadINode(
          pathComponents[pathComponents.length-1], false, in, counter);

      if (isRoot(pathComponents)) { // it is the root
        // update the root's attributes
        updateRootAttr(newNode.asDirectory());
        continue;
      }
      // check if the new inode belongs to the same parent
      if(!isParent(pathComponents, parentPath)) {
        parentINode = getParentINodeDirectory(pathComponents);
        parentPath = getParent(pathComponents);
      }

      // add new inode
      addToParent(parentINode, newNode);
    }
  }
557    
558      private INodeDirectory getParentINodeDirectory(byte[][] pathComponents
559          ) throws FileNotFoundException, PathIsNotDirectoryException,
560          UnresolvedLinkException {
561        if (pathComponents.length < 2) { // root
562          return null;
563        }
564        // Gets the parent INode
565        final INodesInPath inodes = namesystem.dir.getExistingPathINodes(
566            pathComponents);
567        return INodeDirectory.valueOf(inodes.getINode(-2), pathComponents);
568      }
569    
570      /**
571       * Add the child node to parent and, if child is a file, update block map.
572       * This method is only used for image loading so that synchronization,
573       * modification time update and space count update are not needed.
574       */
575      private void addToParent(INodeDirectory parent, INode child) {
576        FSDirectory fsDir = namesystem.dir;
577        if (parent == fsDir.rootDir && FSDirectory.isReservedName(child)) {
578            throw new HadoopIllegalArgumentException("File name \""
579                + child.getLocalName() + "\" is reserved. Please "
580                + " change the name of the existing file or directory to another "
581                + "name before upgrading to this release.");
582        }
583        // NOTE: This does not update space counts for parents
584        if (!parent.addChild(child)) {
585          return;
586        }
587        namesystem.dir.cacheName(child);
588    
589        if (child.isFile()) {
590          // Add file->block mapping
591          final INodeFile file = child.asFile();
592          final BlockInfo[] blocks = file.getBlocks();
593          if (blocks != null) {
594            final BlockManager bm = namesystem.getBlockManager();
595            for (int i = 0; i < blocks.length; i++) {
596              file.setBlock(i, bm.addBlockCollection(blocks[i], file));
597            } 
598          }
599        }
600      }
601    
    /**
     * Accessor used while an image load is in progress.
     *
     * @return The FSDirectory of the namesystem where the fsimage is loaded
     */
    public FSDirectory getFSDirectoryInLoading() {
      return namesystem.dir;
    }
606    
    /**
     * Read an inode preceded by its local name, optionally updating the
     * inode map.  Convenience overload of
     * {@link #loadINodeWithLocalName(boolean, DataInput, boolean, Counter)}
     * with no startup-progress counter.
     */
    public INode loadINodeWithLocalName(boolean isSnapshotINode, DataInput in,
        boolean updateINodeMap) throws IOException {
      return loadINodeWithLocalName(isSnapshotINode, in, updateINodeMap, null);
    }
611    
612        public INode loadINodeWithLocalName(boolean isSnapshotINode,
613            DataInput in, boolean updateINodeMap, Counter counter)
614            throws IOException {
615          final byte[] localName = FSImageSerialization.readLocalName(in);
616          INode inode = loadINode(localName, isSnapshotINode, in, counter);
617          if (updateINodeMap
618              && LayoutVersion.supports(Feature.ADD_INODE_ID, getLayoutVersion())) {
619            namesystem.dir.addToInodeMap(inode);
620          }
621          return inode;
622        }
623      
  /**
   * load an inode from fsimage except for its name
   * 
   * After the common fields (id, replication, times, block size), the next
   * int encodes the inode type: a value >= 0 is a file with that many
   * blocks, -1 a directory, -2 a symlink, and -3 an inode reference.
   *
   * @param localName the inode's local name, already read by the caller
   *        (null for the root)
   * @param isSnapshotINode whether this is a snapshot copy of an inode
   * @param in data input stream from which image is read
   * @param counter Counter to increment for namenode startup progress;
   *        may be null
   * @return an inode
   */
  @SuppressWarnings("deprecation")
  INode loadINode(final byte[] localName, boolean isSnapshotINode,
      DataInput in, Counter counter) throws IOException {
    final int imgVersion = getLayoutVersion();
    if (LayoutVersion.supports(Feature.SNAPSHOT, imgVersion)) {
      namesystem.getFSDirectory().verifyINodeName(localName);
    }

    // Images without ADD_INODE_ID carry no ids; allocate fresh ones instead.
    long inodeId = LayoutVersion.supports(Feature.ADD_INODE_ID, imgVersion) ? 
           in.readLong() : namesystem.allocateNewInodeId();
    
    final short replication = namesystem.getBlockManager().adjustReplication(
        in.readShort());
    final long modificationTime = in.readLong();
    long atime = 0;
    if (LayoutVersion.supports(Feature.FILE_ACCESS_TIME, imgVersion)) {
      atime = in.readLong();
    }
    final long blockSize = in.readLong();
    final int numBlocks = in.readInt();

    if (numBlocks >= 0) {
      // file
      
      // read blocks
      BlockInfo[] blocks = null;
      // NOTE(review): this inner check is redundant — numBlocks >= 0 is
      // already guaranteed by the enclosing branch.
      if (numBlocks >= 0) {
        blocks = new BlockInfo[numBlocks];
        for (int j = 0; j < numBlocks; j++) {
          blocks[j] = new BlockInfo(replication);
          blocks[j].readFields(in);
        }
      }

      String clientName = "";
      String clientMachine = "";
      boolean underConstruction = false;
      FileDiffList fileDiffs = null;
      if (LayoutVersion.supports(Feature.SNAPSHOT, imgVersion)) {
        // read diffs
        fileDiffs = SnapshotFSImageFormat.loadFileDiffList(in, this);

        // Only snapshot copies serialize under-construction state inline
        // (see the INodeInfo description in the class javadoc).
        if (isSnapshotINode) {
          underConstruction = in.readBoolean();
          if (underConstruction) {
            clientName = FSImageSerialization.readString(in);
            clientMachine = FSImageSerialization.readString(in);
          }
        }
      }

      final PermissionStatus permissions = PermissionStatus.read(in);

      // return
      if (counter != null) {
        counter.increment();
      }
      final INodeFile file = new INodeFile(inodeId, localName, permissions,
          modificationTime, atime, blocks, replication, blockSize);
      // Wrap in the snapshot or under-construction variant when applicable.
      return fileDiffs != null? new INodeFileWithSnapshot(file, fileDiffs)
          : underConstruction? new INodeFileUnderConstruction(
              file, clientName, clientMachine, null)
          : file;
    } else if (numBlocks == -1) {
      //directory
      
      //read quotas
      final long nsQuota = in.readLong();
      long dsQuota = -1L;
      if (LayoutVersion.supports(Feature.DISKSPACE_QUOTA, imgVersion)) {
        dsQuota = in.readLong();
      }

      //read snapshot info
      boolean snapshottable = false;
      boolean withSnapshot = false;
      if (LayoutVersion.supports(Feature.SNAPSHOT, imgVersion)) {
        snapshottable = in.readBoolean();
        if (!snapshottable) {
          withSnapshot = in.readBoolean();
        }
      }

      final PermissionStatus permissions = PermissionStatus.read(in);

      //return
      if (counter != null) {
        counter.increment();
      }
      // A non-negative quota selects the quota-tracking subclass.
      final INodeDirectory dir = nsQuota >= 0 || dsQuota >= 0?
          new INodeDirectoryWithQuota(inodeId, localName, permissions,
              modificationTime, nsQuota, dsQuota)
          : new INodeDirectory(inodeId, localName, permissions, modificationTime);
      return snapshottable ? new INodeDirectorySnapshottable(dir)
          : withSnapshot ? new INodeDirectoryWithSnapshot(dir)
          : dir;
    } else if (numBlocks == -2) {
      //symlink
      if (!FileSystem.isSymlinksEnabled()) {
        throw new IOException("Symlinks not supported - please remove symlink before upgrading to this version of HDFS");
      }

      final String symlink = Text.readString(in);
      final PermissionStatus permissions = PermissionStatus.read(in);
      if (counter != null) {
        counter.increment();
      }
      return new INodeSymlink(inodeId, localName, permissions,
          modificationTime, atime, symlink);
    } else if (numBlocks == -3) {
      //reference
      // Intentionally do not increment counter, because it is too difficult at
      // this point to assess whether or not this is a reference that counts
      // toward quota.
      
      final boolean isWithName = in.readBoolean();
      // lastSnapshotId for WithName node, dstSnapshotId for DstReference node
      int snapshotId = in.readInt();
      
      final INodeReference.WithCount withCount
          = referenceMap.loadINodeReferenceWithCount(isSnapshotINode, in, this);

      if (isWithName) {
          return new INodeReference.WithName(null, withCount, localName,
              snapshotId);
      } else {
        final INodeReference ref = new INodeReference.DstReference(null,
            withCount, snapshotId);
        return ref;
      }
    }
    
    throw new IOException("Unknown inode type: numBlocks=" + numBlocks);
  }
765    
766        /** Load {@link INodeFileAttributes}. */
767        public INodeFileAttributes loadINodeFileAttributes(DataInput in)
768            throws IOException {
769          final int layoutVersion = getLayoutVersion();
770          
771          if (!LayoutVersion.supports(Feature.OPTIMIZE_SNAPSHOT_INODES, layoutVersion)) {
772            return loadINodeWithLocalName(true, in, false).asFile();
773          }
774      
775          final byte[] name = FSImageSerialization.readLocalName(in);
776          final PermissionStatus permissions = PermissionStatus.read(in);
777          final long modificationTime = in.readLong();
778          final long accessTime = in.readLong();
779      
780          final short replication = namesystem.getBlockManager().adjustReplication(
781              in.readShort());
782          final long preferredBlockSize = in.readLong();
783          
784          return new INodeFileAttributes.SnapshotCopy(name, permissions, modificationTime,
785              accessTime, replication, preferredBlockSize);
786        }
787    
788        public INodeDirectoryAttributes loadINodeDirectoryAttributes(DataInput in)
789            throws IOException {
790          final int layoutVersion = getLayoutVersion();
791          
792          if (!LayoutVersion.supports(Feature.OPTIMIZE_SNAPSHOT_INODES, layoutVersion)) {
793            return loadINodeWithLocalName(true, in, false).asDirectory();
794          }
795      
796          final byte[] name = FSImageSerialization.readLocalName(in);
797          final PermissionStatus permissions = PermissionStatus.read(in);
798          final long modificationTime = in.readLong();
799          
800          //read quotas
801          final long nsQuota = in.readLong();
802          final long dsQuota = in.readLong();
803      
804          return nsQuota == -1L && dsQuota == -1L?
805              new INodeDirectoryAttributes.SnapshotCopy(name, permissions, modificationTime)
806            : new INodeDirectoryAttributes.CopyWithQuota(name, permissions,
807                modificationTime, nsQuota, dsQuota);
808        }
809      
    /**
     * Load the files-under-construction section of the image and splice each
     * under-construction inode back into the already-loaded namespace,
     * re-registering its lease.
     *
     * @param in image input stream positioned at the section
     * @param supportSnapshot whether the layout supports snapshots
     *        (NOTE(review): parameter is unused in this method — confirm intent)
     * @param counter startup-progress counter, incremented once per inode read
     * @throws IOException if the section is malformed or a referenced file is
     *         missing from the namespace
     */
    private void loadFilesUnderConstruction(DataInput in,
        boolean supportSnapshot, Counter counter) throws IOException {
      FSDirectory fsDir = namesystem.dir;
      // number of under-construction entries in this section
      int size = in.readInt();

      LOG.info("Number of files under construction = " + size);

      for (int i = 0; i < size; i++) {
        INodeFileUnderConstruction cons = FSImageSerialization
            .readINodeUnderConstruction(in, namesystem, getLayoutVersion());
        counter.increment();

        // verify that file exists in namespace
        String path = cons.getLocalName();
        final INodesInPath iip = fsDir.getLastINodeInPath(path);
        INodeFile oldnode = INodeFile.valueOf(iip.getINode(0), path);
        // carry over the name/parent from the inode already in the tree
        cons.setLocalName(oldnode.getLocalNameBytes());
        cons.setParent(oldnode.getParent());

        // preserve existing snapshot diffs when replacing a snapshotted file
        if (oldnode instanceof INodeFileWithSnapshot) {
          cons = new INodeFileUnderConstructionWithSnapshot(cons,
              ((INodeFileWithSnapshot)oldnode).getDiffs());
        }

        // swap the loaded inode in for the existing one and restore its lease
        fsDir.replaceINodeFile(path, oldnode, cons);
        namesystem.leaseManager.addLease(cons.getClientName(), path);
      }
    }
838    
839        private void loadSecretManagerState(DataInput in)
840            throws IOException {
841          int imgVersion = getLayoutVersion();
842    
843          if (!LayoutVersion.supports(Feature.DELEGATION_TOKEN, imgVersion)) {
844            //SecretManagerState is not available.
845            //This must not happen if security is turned on.
846            return; 
847          }
848          namesystem.loadSecretManagerState(in);
849        }
850    
851        private int getLayoutVersion() {
852          return namesystem.getFSImage().getStorage().getLayoutVersion();
853        }
854    
855        private boolean isRoot(byte[][] path) {
856          return path.length == 1 &&
857            path[0] == null;    
858        }
859    
860        private boolean isParent(byte[][] path, byte[][] parent) {
861          if (path == null || parent == null)
862            return false;
863          if (parent.length == 0 || path.length != parent.length + 1)
864            return false;
865          boolean isParent = true;
866          for (int i = 0; i < parent.length; i++) {
867            isParent = isParent && Arrays.equals(path[i], parent[i]); 
868          }
869          return isParent;
870        }
871    
872        /**
873         * Return string representing the parent of the given path.
874         */
875        String getParent(String path) {
876          return path.substring(0, path.lastIndexOf(Path.SEPARATOR));
877        }
878        
879        byte[][] getParent(byte[][] path) {
880          byte[][] result = new byte[path.length - 1][];
881          for (int i = 0; i < result.length; i++) {
882            result[i] = new byte[path[i].length];
883            System.arraycopy(path[i], 0, result[i], 0, path[i].length);
884          }
885          return result;
886        }
887        
888        public Snapshot getSnapshot(DataInput in) throws IOException {
889          return snapshotMap.get(in.readInt());
890        }
891      }
892      
893      /**
894       * A one-shot class responsible for writing an image file.
895       * The write() function should be called once, after which the getter
896       * functions may be used to retrieve information about the file that was written.
897       */
898      static class Saver {
899        private final SaveNamespaceContext context;
900        /** Set to true once an image has been written */
901        private boolean saved = false;
902        
903        /** The MD5 checksum of the file that was written */
904        private MD5Hash savedDigest;
905        private final ReferenceMap referenceMap = new ReferenceMap();
906    
907        /** @throws IllegalStateException if the instance has not yet saved an image */
908        private void checkSaved() {
909          if (!saved) {
910            throw new IllegalStateException("FSImageSaver has not saved an image");
911          }
912        }
913        
914        /** @throws IllegalStateException if the instance has already saved an image */
915        private void checkNotSaved() {
916          if (saved) {
917            throw new IllegalStateException("FSImageSaver has already saved an image");
918          }
919        }
920        
921    
922        Saver(SaveNamespaceContext context) {
923          this.context = context;
924        }
925    
926        /**
927         * Return the MD5 checksum of the image file that was saved.
928         */
929        MD5Hash getSavedDigest() {
930          checkSaved();
931          return savedDigest;
932        }
933    
934        void save(File newFile, FSImageCompression compression) throws IOException {
935          checkNotSaved();
936    
937          final FSNamesystem sourceNamesystem = context.getSourceNamesystem();
938          FSDirectory fsDir = sourceNamesystem.dir;
939          String sdPath = newFile.getParentFile().getParentFile().getAbsolutePath();
940          Step step = new Step(StepType.INODES, sdPath);
941          StartupProgress prog = NameNode.getStartupProgress();
942          prog.beginStep(Phase.SAVING_CHECKPOINT, step);
943          prog.setTotal(Phase.SAVING_CHECKPOINT, step,
944            fsDir.rootDir.numItemsInTree());
945          Counter counter = prog.getCounter(Phase.SAVING_CHECKPOINT, step);
946          long startTime = now();
947          //
948          // Write out data
949          //
950          MessageDigest digester = MD5Hash.getDigester();
951          FileOutputStream fout = new FileOutputStream(newFile);
952          DigestOutputStream fos = new DigestOutputStream(fout, digester);
953          DataOutputStream out = new DataOutputStream(fos);
954          try {
955            out.writeInt(HdfsConstants.LAYOUT_VERSION);
956            // We use the non-locked version of getNamespaceInfo here since
957            // the coordinating thread of saveNamespace already has read-locked
958            // the namespace for us. If we attempt to take another readlock
959            // from the actual saver thread, there's a potential of a
960            // fairness-related deadlock. See the comments on HDFS-2223.
961            out.writeInt(sourceNamesystem.unprotectedGetNamespaceInfo()
962                .getNamespaceID());
963            out.writeLong(fsDir.rootDir.numItemsInTree());
964            out.writeLong(sourceNamesystem.getGenerationStampV1());
965            out.writeLong(sourceNamesystem.getGenerationStampV2());
966            out.writeLong(sourceNamesystem.getGenerationStampAtblockIdSwitch());
967            out.writeLong(sourceNamesystem.getLastAllocatedBlockId());
968            out.writeLong(context.getTxId());
969            out.writeLong(sourceNamesystem.getLastInodeId());
970    
971            
972            sourceNamesystem.getSnapshotManager().write(out);
973            
974            // write compression info and set up compressed stream
975            out = compression.writeHeaderAndWrapStream(fos);
976            LOG.info("Saving image file " + newFile +
977                     " using " + compression);
978    
979            // save the root
980            saveINode2Image(fsDir.rootDir, out, false, referenceMap, counter);
981            // save the rest of the nodes
982            saveImage(fsDir.rootDir, out, true, counter);
983            prog.endStep(Phase.SAVING_CHECKPOINT, step);
984            // Now that the step is finished, set counter equal to total to adjust
985            // for possible under-counting due to reference inodes.
986            prog.setCount(Phase.SAVING_CHECKPOINT, step,
987              fsDir.rootDir.numItemsInTree());
988            // save files under construction
989            sourceNamesystem.saveFilesUnderConstruction(out);
990            context.checkCancelled();
991            sourceNamesystem.saveSecretManagerState(out, sdPath);
992            context.checkCancelled();
993            out.flush();
994            context.checkCancelled();
995            fout.getChannel().force(true);
996          } finally {
997            out.close();
998          }
999    
1000          saved = true;
1001          // set md5 of the saved image
1002          savedDigest = new MD5Hash(digester.digest());
1003    
1004          LOG.info("Image file " + newFile + " of size " + newFile.length() +
1005              " bytes saved in " + (now() - startTime)/1000 + " seconds.");
1006        }
1007    
1008        /**
1009         * Save children INodes.
1010         * @param children The list of children INodes
1011         * @param out The DataOutputStream to write
1012         * @param counter Counter to increment for namenode startup progress
1013         * @return Number of children that are directory
1014         */
1015        private int saveChildren(ReadOnlyList<INode> children, DataOutputStream out,
1016            Counter counter) throws IOException {
1017          // Write normal children INode. 
1018          out.writeInt(children.size());
1019          int dirNum = 0;
1020          int i = 0;
1021          for(INode child : children) {
1022            // print all children first
1023            saveINode2Image(child, out, false, referenceMap, counter);
1024            if (child.isDirectory()) {
1025              dirNum++;
1026            }
1027            if (i++ % 50 == 0) {
1028              context.checkCancelled();
1029            }
1030          }
1031          return dirNum;
1032        }
1033        
1034        /**
1035         * Save file tree image starting from the given root.
1036         * This is a recursive procedure, which first saves all children and 
1037         * snapshot diffs of a current directory and then moves inside the 
1038         * sub-directories.
1039         * 
1040         * @param current The current node
1041         * @param out The DataoutputStream to write the image
1042         * @param snapshot The possible snapshot associated with the current node
1043         * @param toSaveSubtree Whether or not to save the subtree to fsimage. For
1044         *                      reference node, its subtree may already have been
1045         *                      saved before.
1046         * @param counter Counter to increment for namenode startup progress
1047         */
1048        private void saveImage(INodeDirectory current, DataOutputStream out,
1049            boolean toSaveSubtree, Counter counter) throws IOException {
1050          // write the inode id of the directory
1051          out.writeLong(current.getId());
1052          
1053          if (!toSaveSubtree) {
1054            return;
1055          }
1056          
1057          final ReadOnlyList<INode> children = current.getChildrenList(null);
1058          int dirNum = 0;
1059          List<INodeDirectory> snapshotDirs = null;
1060          if (current instanceof INodeDirectoryWithSnapshot) {
1061            snapshotDirs = new ArrayList<INodeDirectory>();
1062            ((INodeDirectoryWithSnapshot) current).getSnapshotDirectory(
1063                snapshotDirs);
1064            dirNum += snapshotDirs.size();
1065          }
1066          
1067          // 2. Write INodeDirectorySnapshottable#snapshotsByNames to record all
1068          // Snapshots
1069          if (current instanceof INodeDirectorySnapshottable) {
1070            INodeDirectorySnapshottable snapshottableNode = 
1071                (INodeDirectorySnapshottable) current;
1072            SnapshotFSImageFormat.saveSnapshots(snapshottableNode, out);
1073          } else {
1074            out.writeInt(-1); // # of snapshots
1075          }
1076    
1077          // 3. Write children INode 
1078          dirNum += saveChildren(children, out, counter);
1079          
1080          // 4. Write DirectoryDiff lists, if there is any.
1081          SnapshotFSImageFormat.saveDirectoryDiffList(current, out, referenceMap);
1082          
1083          // Write sub-tree of sub-directories, including possible snapshots of 
1084          // deleted sub-directories
1085          out.writeInt(dirNum); // the number of sub-directories
1086          for(INode child : children) {
1087            if(!child.isDirectory()) {
1088              continue;
1089            }
1090            // make sure we only save the subtree under a reference node once
1091            boolean toSave = child.isReference() ? 
1092                referenceMap.toProcessSubtree(child.getId()) : true;
1093            saveImage(child.asDirectory(), out, toSave, counter);
1094          }
1095          if (snapshotDirs != null) {
1096            for (INodeDirectory subDir : snapshotDirs) {
1097              // make sure we only save the subtree under a reference node once
1098              boolean toSave = subDir.getParentReference() != null ? 
1099                  referenceMap.toProcessSubtree(subDir.getId()) : true;
1100              saveImage(subDir, out, toSave, counter);
1101            }
1102          }
1103        }
1104    
1105        /**
1106         * Saves inode and increments progress counter.
1107         * 
1108         * @param inode INode to save
1109         * @param out DataOutputStream to receive inode
1110         * @param writeUnderConstruction boolean true if this is under construction
1111         * @param referenceMap ReferenceMap containing reference inodes
1112         * @param counter Counter to increment for namenode startup progress
1113         * @throws IOException thrown if there is an I/O error
1114         */
1115        private void saveINode2Image(INode inode, DataOutputStream out,
1116            boolean writeUnderConstruction, ReferenceMap referenceMap,
1117            Counter counter) throws IOException {
1118          FSImageSerialization.saveINode2Image(inode, out, writeUnderConstruction,
1119            referenceMap);
1120          // Intentionally do not increment counter for reference inodes, because it
1121          // is too difficult at this point to assess whether or not this is a
1122          // reference that counts toward quota.
1123          if (!(inode instanceof INodeReference)) {
1124            counter.increment();
1125          }
1126        }
1127      }
1128    }