001 /**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License. You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018 package org.apache.hadoop.hdfs.server.namenode;
019
020 import static org.apache.hadoop.util.Time.now;
021
022 import java.io.DataInput;
023 import java.io.DataInputStream;
024 import java.io.DataOutputStream;
025 import java.io.File;
026 import java.io.FileInputStream;
027 import java.io.FileNotFoundException;
028 import java.io.FileOutputStream;
029 import java.io.IOException;
030 import java.security.DigestInputStream;
031 import java.security.DigestOutputStream;
032 import java.security.MessageDigest;
033 import java.util.ArrayList;
034 import java.util.Arrays;
035 import java.util.List;
036 import java.util.Map;
037
038 import org.apache.commons.logging.Log;
039 import org.apache.hadoop.HadoopIllegalArgumentException;
040 import org.apache.hadoop.classification.InterfaceAudience;
041 import org.apache.hadoop.classification.InterfaceStability;
042 import org.apache.hadoop.conf.Configuration;
043 import org.apache.hadoop.fs.FileSystem;
044 import org.apache.hadoop.fs.Path;
045 import org.apache.hadoop.fs.PathIsNotDirectoryException;
046 import org.apache.hadoop.fs.UnresolvedLinkException;
047 import org.apache.hadoop.fs.permission.PermissionStatus;
048 import org.apache.hadoop.hdfs.protocol.HdfsConstants;
049 import org.apache.hadoop.hdfs.protocol.LayoutVersion;
050 import org.apache.hadoop.hdfs.protocol.LayoutVersion.Feature;
051 import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfo;
052 import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;
053 import org.apache.hadoop.hdfs.server.common.InconsistentFSStateException;
054 import org.apache.hadoop.hdfs.server.namenode.snapshot.FileWithSnapshot.FileDiffList;
055 import org.apache.hadoop.hdfs.server.namenode.snapshot.INodeDirectorySnapshottable;
056 import org.apache.hadoop.hdfs.server.namenode.snapshot.INodeDirectoryWithSnapshot;
057 import org.apache.hadoop.hdfs.server.namenode.snapshot.INodeFileUnderConstructionWithSnapshot;
058 import org.apache.hadoop.hdfs.server.namenode.snapshot.INodeFileWithSnapshot;
059 import org.apache.hadoop.hdfs.server.namenode.snapshot.Snapshot;
060 import org.apache.hadoop.hdfs.server.namenode.snapshot.SnapshotFSImageFormat;
061 import org.apache.hadoop.hdfs.server.namenode.snapshot.SnapshotFSImageFormat.ReferenceMap;
062 import org.apache.hadoop.hdfs.server.namenode.startupprogress.Phase;
063 import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress;
064 import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress.Counter;
065 import org.apache.hadoop.hdfs.server.namenode.startupprogress.Step;
066 import org.apache.hadoop.hdfs.server.namenode.startupprogress.StepType;
067 import org.apache.hadoop.hdfs.util.ReadOnlyList;
068 import org.apache.hadoop.io.MD5Hash;
069 import org.apache.hadoop.io.Text;
070
071 /**
072 * Contains inner classes for reading or writing the on-disk format for
073 * FSImages.
074 *
075 * In particular, the format of the FSImage looks like:
076 * <pre>
077 * FSImage {
078 * layoutVersion: int, namespaceID: int, numberItemsInFSDirectoryTree: long,
079 * namesystemGenerationStampV1: long, namesystemGenerationStampV2: long,
080 * generationStampAtBlockIdSwitch:long, lastAllocatedBlockId:
081 * long transactionID: long, snapshotCounter: int, numberOfSnapshots: int,
082 * numOfSnapshottableDirs: int,
083 * {FSDirectoryTree, FilesUnderConstruction, SecretManagerState} (can be compressed)
084 * }
085 *
086 * FSDirectoryTree (if {@link Feature#FSIMAGE_NAME_OPTIMIZATION} is supported) {
087 * INodeInfo of root, numberOfChildren of root: int
088 * [list of INodeInfo of root's children],
089 * [list of INodeDirectoryInfo of root's directory children]
090 * }
091 *
092 * FSDirectoryTree (if {@link Feature#FSIMAGE_NAME_OPTIMIZATION} not supported){
093 * [list of INodeInfo of INodes in topological order]
094 * }
095 *
096 * INodeInfo {
097 * {
098 * localName: short + byte[]
099 * } when {@link Feature#FSIMAGE_NAME_OPTIMIZATION} is supported
100 * or
101 * {
102 * fullPath: byte[]
103 * } when {@link Feature#FSIMAGE_NAME_OPTIMIZATION} is not supported
104 * replicationFactor: short, modificationTime: long,
105 * accessTime: long, preferredBlockSize: long,
106 * numberOfBlocks: int (-1 for INodeDirectory, -2 for INodeSymLink),
107 * {
108 * nsQuota: long, dsQuota: long,
109 * {
110 * isINodeSnapshottable: byte,
111 * isINodeWithSnapshot: byte (if isINodeSnapshottable is false)
112 * } (when {@link Feature#SNAPSHOT} is supported),
113 * fsPermission: short, PermissionStatus
114 * } for INodeDirectory
115 * or
116 * {
117 * symlinkString, fsPermission: short, PermissionStatus
118 * } for INodeSymlink
119 * or
120 * {
121 * [list of BlockInfo]
122 * [list of FileDiff]
123 * {
124 * isINodeFileUnderConstructionSnapshot: byte,
125 * {clientName: short + byte[], clientMachine: short + byte[]} (when
126 * isINodeFileUnderConstructionSnapshot is true),
127 * } (when {@link Feature#SNAPSHOT} is supported and writing snapshotINode),
128 * fsPermission: short, PermissionStatus
129 * } for INodeFile
130 * }
131 *
132 * INodeDirectoryInfo {
133 * fullPath of the directory: short + byte[],
134 * numberOfChildren: int, [list of INodeInfo of children INode],
135 * {
136 * numberOfSnapshots: int,
137 * [list of Snapshot] (when NumberOfSnapshots is positive),
138 * numberOfDirectoryDiffs: int,
139 * [list of DirectoryDiff] (NumberOfDirectoryDiffs is positive),
140 * number of children that are directories,
141 * [list of INodeDirectoryInfo of the directory children] (includes
142 * snapshot copies of deleted sub-directories)
143 * } (when {@link Feature#SNAPSHOT} is supported),
144 * }
145 *
146 * Snapshot {
147 * snapshotID: int, root of Snapshot: INodeDirectoryInfo (its local name is
148 * the name of the snapshot)
149 * }
150 *
151 * DirectoryDiff {
152 * full path of the root of the associated Snapshot: short + byte[],
153 * childrenSize: int,
154 * isSnapshotRoot: byte,
155 * snapshotINodeIsNotNull: byte (when isSnapshotRoot is false),
156 * snapshotINode: INodeDirectory (when SnapshotINodeIsNotNull is true), Diff
157 * }
158 *
159 * Diff {
160 * createdListSize: int, [Local name of INode in created list],
161 * deletedListSize: int, [INode in deleted list: INodeInfo]
162 * }
163 *
164 * FileDiff {
165 * full path of the root of the associated Snapshot: short + byte[],
166 * fileSize: long,
167 * snapshotINodeIsNotNull: byte,
168 * snapshotINode: INodeFile (when SnapshotINodeIsNotNull is true), Diff
169 * }
170 * </pre>
171 */
172 @InterfaceAudience.Private
173 @InterfaceStability.Evolving
174 public class FSImageFormat {
175 private static final Log LOG = FSImage.LOG;
176
  // Static-only class: only hosts the nested Loader/Saver classes and is
  // never instantiated itself.
  private FSImageFormat() {}
179
180 /**
181 * A one-shot class responsible for loading an image. The load() function
182 * should be called once, after which the getter methods may be used to retrieve
183 * information about the image that was loaded, if loading was successful.
184 */
185 public static class Loader {
186 private final Configuration conf;
187 /** which namesystem this loader is working for */
188 private final FSNamesystem namesystem;
189
190 /** Set to true once a file has been loaded using this loader. */
191 private boolean loaded = false;
192
193 /** The transaction ID of the last edit represented by the loaded file */
194 private long imgTxId;
195 /** The MD5 sum of the loaded file */
196 private MD5Hash imgDigest;
197
198 private Map<Integer, Snapshot> snapshotMap = null;
199 private final ReferenceMap referenceMap = new ReferenceMap();
200
    /**
     * @param conf configuration used to resolve image compression settings
     * @param namesystem the namesystem to populate while reading the image
     */
    Loader(Configuration conf, FSNamesystem namesystem) {
      this.conf = conf;
      this.namesystem = namesystem;
    }
205
206 /**
207 * Return the MD5 checksum of the image that has been loaded.
208 * @throws IllegalStateException if load() has not yet been called.
209 */
210 MD5Hash getLoadedImageMd5() {
211 checkLoaded();
212 return imgDigest;
213 }
214
215 long getLoadedImageTxId() {
216 checkLoaded();
217 return imgTxId;
218 }
219
220 /**
221 * Throw IllegalStateException if load() has not yet been called.
222 */
223 private void checkLoaded() {
224 if (!loaded) {
225 throw new IllegalStateException("Image not yet loaded!");
226 }
227 }
228
229 /**
230 * Throw IllegalStateException if load() has already been called.
231 */
232 private void checkNotLoaded() {
233 if (loaded) {
234 throw new IllegalStateException("Image already loaded!");
235 }
236 }
237
    /**
     * One-shot entry point: read an entire fsimage file into the namesystem.
     * The on-disk order is: header (layout version, namespace id, file
     * count, generation stamps, txid, last inode id, snapshot ids), an
     * optional compression header, the inode tree, files under
     * construction, and finally the secret manager state.  The read order
     * is dictated by the image format and must not be changed.
     *
     * @param curFile the fsimage file on local disk to load
     * @throws IOException if the image is corrupt or its layout version
     *         does not match the storage's expected version
     */
    void load(File curFile) throws IOException {
      checkNotLoaded();
      assert curFile != null : "curFile is null";

      StartupProgress prog = NameNode.getStartupProgress();
      Step step = new Step(StepType.INODES);
      prog.beginStep(Phase.LOADING_FSIMAGE, step);
      long startTime = now();

      //
      // Load in bits
      //
      // Digest the raw file bytes while reading so the MD5 of the image can
      // be recorded once loading completes.
      MessageDigest digester = MD5Hash.getDigester();
      DigestInputStream fin = new DigestInputStream(
          new FileInputStream(curFile), digester);

      DataInputStream in = new DataInputStream(fin);
      try {
        // read image version: first appeared in version -1
        int imgVersion = in.readInt();
        if (getLayoutVersion() != imgVersion) {
          throw new InconsistentFSStateException(curFile,
              "imgVersion " + imgVersion +
              " expected to be " + getLayoutVersion());
        }
        boolean supportSnapshot = LayoutVersion.supports(Feature.SNAPSHOT,
            imgVersion);

        // read namespaceID: first appeared in version -2; value is unused here
        in.readInt();

        long numFiles = in.readLong();

        // read in the last generation stamp for legacy blocks.
        long genstamp = in.readLong();
        namesystem.setGenerationStampV1(genstamp);

        if (LayoutVersion.supports(Feature.SEQUENTIAL_BLOCK_ID, imgVersion)) {
          // read the starting generation stamp for sequential block IDs
          genstamp = in.readLong();
          namesystem.setGenerationStampV2(genstamp);

          // read the last generation stamp for blocks created after
          // the switch to sequential block IDs.
          long stampAtIdSwitch = in.readLong();
          namesystem.setGenerationStampV1Limit(stampAtIdSwitch);

          // read the max sequential block ID.
          long maxSequentialBlockId = in.readLong();
          namesystem.setLastAllocatedBlockId(maxSequentialBlockId);
        } else {
          // Older image without sequential block IDs: switch the namesystem
          // over to V2 generation stamps as part of loading.
          long startingGenStamp = namesystem.upgradeGenerationStampToV2();
          // This is an upgrade.
          LOG.info("Upgrading to sequential block IDs. Generation stamp " +
              "for new blocks set to " + startingGenStamp);
        }

        // read the transaction ID of the last edit represented by
        // this image
        if (LayoutVersion.supports(Feature.STORED_TXIDS, imgVersion)) {
          imgTxId = in.readLong();
        } else {
          imgTxId = 0;
        }

        // read the last allocated inode id in the fsimage
        if (LayoutVersion.supports(Feature.ADD_INODE_ID, imgVersion)) {
          long lastInodeId = in.readLong();
          namesystem.resetLastInodeId(lastInodeId);
          if (LOG.isDebugEnabled()) {
            LOG.debug("load last allocated InodeId from fsimage:" + lastInodeId);
          }
        } else {
          if (LOG.isDebugEnabled()) {
            LOG.debug("Old layout version doesn't have inode id."
                + " Will assign new id for each inode.");
          }
        }

        // snapshot ids must be read before the inode tree, which refers to
        // snapshots by id (see getSnapshot)
        if (supportSnapshot) {
          snapshotMap = namesystem.getSnapshotManager().read(in, this);
        }

        // read compression related info
        FSImageCompression compression;
        if (LayoutVersion.supports(Feature.FSIMAGE_COMPRESSION, imgVersion)) {
          compression = FSImageCompression.readCompressionHeader(conf, in);
        } else {
          compression = FSImageCompression.createNoopCompression();
        }
        // everything after the header may be compressed; re-wrap the
        // digesting stream accordingly
        in = compression.unwrapInputStream(fin);

        LOG.info("Loading image file " + curFile + " using " + compression);

        // load all inodes
        LOG.info("Number of files = " + numFiles);
        prog.setTotal(Phase.LOADING_FSIMAGE, step, numFiles);
        Counter counter = prog.getCounter(Phase.LOADING_FSIMAGE, step);
        if (LayoutVersion.supports(Feature.FSIMAGE_NAME_OPTIMIZATION,
            imgVersion)) {
          if (supportSnapshot) {
            loadLocalNameINodesWithSnapshot(numFiles, in, counter);
          } else {
            loadLocalNameINodes(numFiles, in, counter);
          }
        } else {
          loadFullNameINodes(numFiles, in, counter);
        }

        loadFilesUnderConstruction(in, supportSnapshot, counter);
        prog.endStep(Phase.LOADING_FSIMAGE, step);
        // Now that the step is finished, set counter equal to total to adjust
        // for possible under-counting due to reference inodes.
        prog.setCount(Phase.LOADING_FSIMAGE, step, numFiles);

        loadSecretManagerState(in);

        // make sure to read to the end of file
        boolean eof = (in.read() == -1);
        assert eof : "Should have reached the end of image file " + curFile;
      } finally {
        in.close();
      }

      imgDigest = new MD5Hash(digester.digest());
      loaded = true;

      LOG.info("Image file " + curFile + " of size " + curFile.length() +
          " bytes loaded in " + (now() - startTime)/1000 + " seconds.");
    }
368
    /**
     * Update the namesystem root directory's attributes from the root inode
     * read out of the image: quota (only when at least one quota is set),
     * modification time, and permission status.
     */
    private void updateRootAttr(INodeWithAdditionalFields root) {
      long nsQuota = root.getNsQuota();
      long dsQuota = root.getDsQuota();
      FSDirectory fsDir = namesystem.dir;
      // -1 means "no quota"; only install quotas when either one is present
      if (nsQuota != -1 || dsQuota != -1) {
        fsDir.rootDir.setQuota(nsQuota, dsQuota);
      }
      fsDir.rootDir.cloneModificationTime(root);
      fsDir.rootDir.clonePermissionStatus(root);
    }
380
    /**
     * Load fsimage files when 1) only local names are stored,
     * and 2) snapshot is supported.
     *
     * @param numFiles number of files expected to be read; unused here —
     *        with snapshots the recursion is driven by per-directory child
     *        counts embedded in the stream, not by this total
     * @param in Image input stream
     * @param counter Counter to increment for namenode startup progress
     */
    private void loadLocalNameINodesWithSnapshot(long numFiles, DataInput in,
        Counter counter) throws IOException {
      assert LayoutVersion.supports(Feature.FSIMAGE_NAME_OPTIMIZATION,
          getLayoutVersion());
      assert LayoutVersion.supports(Feature.SNAPSHOT, getLayoutVersion());

      // load root
      loadRoot(in, counter);
      // load rest of the nodes recursively
      loadDirectoryWithSnapshot(in, counter);
    }
400
401 /**
402 * load fsimage files assuming only local names are stored
403 *
404 * @param numFiles number of files expected to be read
405 * @param in image input stream
406 * @param counter Counter to increment for namenode startup progress
407 * @throws IOException
408 */
409 private void loadLocalNameINodes(long numFiles, DataInput in, Counter counter)
410 throws IOException {
411 assert LayoutVersion.supports(Feature.FSIMAGE_NAME_OPTIMIZATION,
412 getLayoutVersion());
413 assert numFiles > 0;
414
415 // load root
416 loadRoot(in, counter);
417 // have loaded the first file (the root)
418 numFiles--;
419
420 // load rest of the nodes directory by directory
421 while (numFiles > 0) {
422 numFiles -= loadDirectory(in, counter);
423 }
424 if (numFiles != 0) {
425 throw new IOException("Read unexpect number of files: " + -numFiles);
426 }
427 }
428
    /**
     * Load information about root, and use the information to update the root
     * directory of NameSystem.
     * @param in The {@link DataInput} instance to read.
     * @param counter Counter to increment for namenode startup progress
     * @throws IOException if the first record is not the root; the root is
     *         serialized with a zero-length (short == 0) name
     */
    private void loadRoot(DataInput in, Counter counter)
        throws IOException {
      // load root
      if (in.readShort() != 0) {
        throw new IOException("First node is not root");
      }
      // root has no name, hence the null localName
      final INodeDirectory root = loadINode(null, false, in, counter)
          .asDirectory();
      // update the root's attributes
      updateRootAttr(root);
    }
446
447 /** Load children nodes for the parent directory. */
448 private int loadChildren(INodeDirectory parent, DataInput in,
449 Counter counter) throws IOException {
450 int numChildren = in.readInt();
451 for (int i = 0; i < numChildren; i++) {
452 // load single inode
453 INode newNode = loadINodeWithLocalName(false, in, true, counter);
454 addToParent(parent, newNode);
455 }
456 return numChildren;
457 }
458
    /**
     * Load a directory when snapshot is supported.  The stream encodes, in
     * order: the parent inode id, the snapshot list (count may be -1 when
     * the directory is not snapshottable), the children, the directory diff
     * list, and finally the sub-directory records, loaded recursively.
     * @param in The {@link DataInput} instance to read.
     * @param counter Counter to increment for namenode startup progress
     */
    private void loadDirectoryWithSnapshot(DataInput in, Counter counter)
        throws IOException {
      // Step 1. Identify the parent INode
      long inodeId = in.readLong();
      final INodeDirectory parent = this.namesystem.dir.getInode(inodeId)
          .asDirectory();

      // Check if the whole subtree has been saved (for reference nodes)
      boolean toLoadSubtree = referenceMap.toProcessSubtree(parent.getId());
      if (!toLoadSubtree) {
        return;
      }

      // Step 2. Load snapshots if parent is snapshottable
      int numSnapshots = in.readInt();
      if (numSnapshots >= 0) {
        final INodeDirectorySnapshottable snapshottableParent
            = INodeDirectorySnapshottable.valueOf(parent, parent.getLocalName());
        // load snapshots and snapshotQuota
        SnapshotFSImageFormat.loadSnapshotList(snapshottableParent,
            numSnapshots, in, this);
        if (snapshottableParent.getSnapshotQuota() > 0) {
          // add the directory to the snapshottable directory list in
          // SnapshotManager. Note that we only add root when its snapshot quota
          // is positive.
          this.namesystem.getSnapshotManager().addSnapshottable(
              snapshottableParent);
        }
      }

      // Step 3. Load children nodes under parent
      loadChildren(parent, in, counter);

      // Step 4. load Directory Diff List
      SnapshotFSImageFormat.loadDirectoryDiffList(parent, in, this);

      // Recursively load sub-directories, including snapshot copies of deleted
      // directories
      int numSubTree = in.readInt();
      for (int i = 0; i < numSubTree; i++) {
        loadDirectoryWithSnapshot(in, counter);
      }
    }
507
    /**
     * Load all children of one directory: read the directory's full path,
     * resolve it in the namespace built so far, then read its children.
     *
     * @param in image input stream positioned at a directory record
     * @param counter Counter to increment for namenode startup progress
     * @return number of child inodes read
     * @throws IOException if the parent path cannot be resolved to a directory
     */
    private int loadDirectory(DataInput in, Counter counter) throws IOException {
      String parentPath = FSImageSerialization.readString(in);
      // the parent must already exist: directories are written before their
      // children's directory records
      final INodeDirectory parent = INodeDirectory.valueOf(
          namesystem.dir.rootDir.getNode(parentPath, true), parentPath);
      return loadChildren(parent, in, counter);
    }
522
    /**
     * Load fsimage files assuming full path names are stored.  Inodes are
     * stored in an order where children of the same directory appear
     * consecutively, so the resolved parent directory is cached between
     * iterations and only re-resolved when the parent path changes.
     *
     * @param numFiles total number of files to load
     * @param in data input stream
     * @param counter Counter to increment for namenode startup progress
     * @throws IOException if any error occurs
     */
    private void loadFullNameINodes(long numFiles, DataInput in, Counter counter)
        throws IOException {
      byte[][] pathComponents;
      byte[][] parentPath = {{}};
      FSDirectory fsDir = namesystem.dir;
      INodeDirectory parentINode = fsDir.rootDir;
      for (long i = 0; i < numFiles; i++) {
        pathComponents = FSImageSerialization.readPathComponents(in);
        // the last path component is the inode's local name
        final INode newNode = loadINode(
            pathComponents[pathComponents.length-1], false, in, counter);

        if (isRoot(pathComponents)) { // it is the root
          // update the root's attributes
          updateRootAttr(newNode.asDirectory());
          continue;
        }
        // check if the new inode belongs to the same parent
        if(!isParent(pathComponents, parentPath)) {
          parentINode = getParentINodeDirectory(pathComponents);
          parentPath = getParent(pathComponents);
        }

        // add new inode
        addToParent(parentINode, newNode);
      }
    }
557
    /**
     * Resolve the parent directory of the given path components against the
     * namespace built so far.
     *
     * @return the parent {@link INodeDirectory}, or null when the path is
     *         the root itself
     * @throws FileNotFoundException if the parent does not exist
     * @throws PathIsNotDirectoryException if the parent is not a directory
     */
    private INodeDirectory getParentINodeDirectory(byte[][] pathComponents
        ) throws FileNotFoundException, PathIsNotDirectoryException,
        UnresolvedLinkException {
      if (pathComponents.length < 2) { // root
        return null;
      }
      // Gets the parent INode: index -2 is the second-to-last inode on the
      // resolved path, i.e. the parent of the last component
      final INodesInPath inodes = namesystem.dir.getExistingPathINodes(
          pathComponents);
      return INodeDirectory.valueOf(inodes.getINode(-2), pathComponents);
    }
569
    /**
     * Add the child node to parent and, if child is a file, update block map.
     * This method is only used for image loading so that synchronization,
     * modification time update and space count update are not needed.
     * Rejects reserved names directly under the root (e.g. paths reserved
     * by newer releases) by failing the upgrade with an explicit message.
     */
    private void addToParent(INodeDirectory parent, INode child) {
      FSDirectory fsDir = namesystem.dir;
      if (parent == fsDir.rootDir && FSDirectory.isReservedName(child)) {
        throw new HadoopIllegalArgumentException("File name \""
            + child.getLocalName() + "\" is reserved. Please "
            + " change the name of the existing file or directory to another "
            + "name before upgrading to this release.");
      }
      // NOTE: This does not update space counts for parents
      if (!parent.addChild(child)) {
        // duplicate child: skip silently, nothing was attached
        return;
      }
      namesystem.dir.cacheName(child);

      if (child.isFile()) {
        // Add file->block mapping
        final INodeFile file = child.asFile();
        final BlockInfo[] blocks = file.getBlocks();
        if (blocks != null) {
          final BlockManager bm = namesystem.getBlockManager();
          for (int i = 0; i < blocks.length; i++) {
            // addBlockCollection may return a canonical block instance;
            // store whatever the block manager hands back
            file.setBlock(i, bm.addBlockCollection(blocks[i], file));
          }
        }
      }
    }
601
602 /** @return The FSDirectory of the namesystem where the fsimage is loaded */
603 public FSDirectory getFSDirectoryInLoading() {
604 return namesystem.dir;
605 }
606
607 public INode loadINodeWithLocalName(boolean isSnapshotINode, DataInput in,
608 boolean updateINodeMap) throws IOException {
609 return loadINodeWithLocalName(isSnapshotINode, in, updateINodeMap, null);
610 }
611
612 public INode loadINodeWithLocalName(boolean isSnapshotINode,
613 DataInput in, boolean updateINodeMap, Counter counter)
614 throws IOException {
615 final byte[] localName = FSImageSerialization.readLocalName(in);
616 INode inode = loadINode(localName, isSnapshotINode, in, counter);
617 if (updateINodeMap
618 && LayoutVersion.supports(Feature.ADD_INODE_ID, getLayoutVersion())) {
619 namesystem.dir.addToInodeMap(inode);
620 }
621 return inode;
622 }
623
624 /**
625 * load an inode from fsimage except for its name
626 *
627 * @param in data input stream from which image is read
628 * @param counter Counter to increment for namenode startup progress
629 * @return an inode
630 */
631 @SuppressWarnings("deprecation")
632 INode loadINode(final byte[] localName, boolean isSnapshotINode,
633 DataInput in, Counter counter) throws IOException {
634 final int imgVersion = getLayoutVersion();
635 if (LayoutVersion.supports(Feature.SNAPSHOT, imgVersion)) {
636 namesystem.getFSDirectory().verifyINodeName(localName);
637 }
638
639 long inodeId = LayoutVersion.supports(Feature.ADD_INODE_ID, imgVersion) ?
640 in.readLong() : namesystem.allocateNewInodeId();
641
642 final short replication = namesystem.getBlockManager().adjustReplication(
643 in.readShort());
644 final long modificationTime = in.readLong();
645 long atime = 0;
646 if (LayoutVersion.supports(Feature.FILE_ACCESS_TIME, imgVersion)) {
647 atime = in.readLong();
648 }
649 final long blockSize = in.readLong();
650 final int numBlocks = in.readInt();
651
652 if (numBlocks >= 0) {
653 // file
654
655 // read blocks
656 BlockInfo[] blocks = null;
657 if (numBlocks >= 0) {
658 blocks = new BlockInfo[numBlocks];
659 for (int j = 0; j < numBlocks; j++) {
660 blocks[j] = new BlockInfo(replication);
661 blocks[j].readFields(in);
662 }
663 }
664
665 String clientName = "";
666 String clientMachine = "";
667 boolean underConstruction = false;
668 FileDiffList fileDiffs = null;
669 if (LayoutVersion.supports(Feature.SNAPSHOT, imgVersion)) {
670 // read diffs
671 fileDiffs = SnapshotFSImageFormat.loadFileDiffList(in, this);
672
673 if (isSnapshotINode) {
674 underConstruction = in.readBoolean();
675 if (underConstruction) {
676 clientName = FSImageSerialization.readString(in);
677 clientMachine = FSImageSerialization.readString(in);
678 }
679 }
680 }
681
682 final PermissionStatus permissions = PermissionStatus.read(in);
683
684 // return
685 if (counter != null) {
686 counter.increment();
687 }
688 final INodeFile file = new INodeFile(inodeId, localName, permissions,
689 modificationTime, atime, blocks, replication, blockSize);
690 return fileDiffs != null? new INodeFileWithSnapshot(file, fileDiffs)
691 : underConstruction? new INodeFileUnderConstruction(
692 file, clientName, clientMachine, null)
693 : file;
694 } else if (numBlocks == -1) {
695 //directory
696
697 //read quotas
698 final long nsQuota = in.readLong();
699 long dsQuota = -1L;
700 if (LayoutVersion.supports(Feature.DISKSPACE_QUOTA, imgVersion)) {
701 dsQuota = in.readLong();
702 }
703
704 //read snapshot info
705 boolean snapshottable = false;
706 boolean withSnapshot = false;
707 if (LayoutVersion.supports(Feature.SNAPSHOT, imgVersion)) {
708 snapshottable = in.readBoolean();
709 if (!snapshottable) {
710 withSnapshot = in.readBoolean();
711 }
712 }
713
714 final PermissionStatus permissions = PermissionStatus.read(in);
715
716 //return
717 if (counter != null) {
718 counter.increment();
719 }
720 final INodeDirectory dir = nsQuota >= 0 || dsQuota >= 0?
721 new INodeDirectoryWithQuota(inodeId, localName, permissions,
722 modificationTime, nsQuota, dsQuota)
723 : new INodeDirectory(inodeId, localName, permissions, modificationTime);
724 return snapshottable ? new INodeDirectorySnapshottable(dir)
725 : withSnapshot ? new INodeDirectoryWithSnapshot(dir)
726 : dir;
727 } else if (numBlocks == -2) {
728 //symlink
729 if (!FileSystem.isSymlinksEnabled()) {
730 throw new IOException("Symlinks not supported - please remove symlink before upgrading to this version of HDFS");
731 }
732
733 final String symlink = Text.readString(in);
734 final PermissionStatus permissions = PermissionStatus.read(in);
735 if (counter != null) {
736 counter.increment();
737 }
738 return new INodeSymlink(inodeId, localName, permissions,
739 modificationTime, atime, symlink);
740 } else if (numBlocks == -3) {
741 //reference
742 // Intentionally do not increment counter, because it is too difficult at
743 // this point to assess whether or not this is a reference that counts
744 // toward quota.
745
746 final boolean isWithName = in.readBoolean();
747 // lastSnapshotId for WithName node, dstSnapshotId for DstReference node
748 int snapshotId = in.readInt();
749
750 final INodeReference.WithCount withCount
751 = referenceMap.loadINodeReferenceWithCount(isSnapshotINode, in, this);
752
753 if (isWithName) {
754 return new INodeReference.WithName(null, withCount, localName,
755 snapshotId);
756 } else {
757 final INodeReference ref = new INodeReference.DstReference(null,
758 withCount, snapshotId);
759 return ref;
760 }
761 }
762
763 throw new IOException("Unknown inode type: numBlocks=" + numBlocks);
764 }
765
    /**
     * Load {@link INodeFileAttributes} (the attribute-only snapshot copy of
     * a file).  Older layouts store a full inode; newer ones
     * ({@link Feature#OPTIMIZE_SNAPSHOT_INODES}) store just the attributes.
     */
    public INodeFileAttributes loadINodeFileAttributes(DataInput in)
        throws IOException {
      final int layoutVersion = getLayoutVersion();

      if (!LayoutVersion.supports(Feature.OPTIMIZE_SNAPSHOT_INODES, layoutVersion)) {
        // pre-optimization layout: a complete inode record is stored
        return loadINodeWithLocalName(true, in, false).asFile();
      }

      final byte[] name = FSImageSerialization.readLocalName(in);
      final PermissionStatus permissions = PermissionStatus.read(in);
      final long modificationTime = in.readLong();
      final long accessTime = in.readLong();

      final short replication = namesystem.getBlockManager().adjustReplication(
          in.readShort());
      final long preferredBlockSize = in.readLong();

      return new INodeFileAttributes.SnapshotCopy(name, permissions, modificationTime,
          accessTime, replication, preferredBlockSize);
    }
787
    /**
     * Load {@link INodeDirectoryAttributes} (the attribute-only snapshot
     * copy of a directory).  Older layouts store a full inode; newer ones
     * ({@link Feature#OPTIMIZE_SNAPSHOT_INODES}) store just the attributes.
     */
    public INodeDirectoryAttributes loadINodeDirectoryAttributes(DataInput in)
        throws IOException {
      final int layoutVersion = getLayoutVersion();

      if (!LayoutVersion.supports(Feature.OPTIMIZE_SNAPSHOT_INODES, layoutVersion)) {
        // pre-optimization layout: a complete inode record is stored
        return loadINodeWithLocalName(true, in, false).asDirectory();
      }

      final byte[] name = FSImageSerialization.readLocalName(in);
      final PermissionStatus permissions = PermissionStatus.read(in);
      final long modificationTime = in.readLong();

      //read quotas
      final long nsQuota = in.readLong();
      final long dsQuota = in.readLong();

      // -1/-1 means no quota was set; otherwise keep the quota in the copy
      return nsQuota == -1L && dsQuota == -1L?
          new INodeDirectoryAttributes.SnapshotCopy(name, permissions, modificationTime)
          : new INodeDirectoryAttributes.CopyWithQuota(name, permissions,
              modificationTime, nsQuota, dsQuota);
    }
809
    /**
     * Load the lease/under-construction section of the image: for each
     * record, re-read the under-construction inode, swap it in for the
     * already-loaded file inode at the same path, and re-register its lease.
     *
     * @param in image input stream
     * @param supportSnapshot whether the layout supports snapshots
     * @param counter Counter to increment for namenode startup progress
     */
    private void loadFilesUnderConstruction(DataInput in,
        boolean supportSnapshot, Counter counter) throws IOException {
      FSDirectory fsDir = namesystem.dir;
      int size = in.readInt();

      LOG.info("Number of files under construction = " + size);

      for (int i = 0; i < size; i++) {
        INodeFileUnderConstruction cons = FSImageSerialization
            .readINodeUnderConstruction(in, namesystem, getLayoutVersion());
        counter.increment();

        // verify that file exists in namespace
        String path = cons.getLocalName();
        final INodesInPath iip = fsDir.getLastINodeInPath(path);
        INodeFile oldnode = INodeFile.valueOf(iip.getINode(0), path);
        // carry over name and parent from the inode loaded earlier
        cons.setLocalName(oldnode.getLocalNameBytes());
        cons.setParent(oldnode.getParent());

        if (oldnode instanceof INodeFileWithSnapshot) {
          // preserve the file's diff list across the replacement
          cons = new INodeFileUnderConstructionWithSnapshot(cons,
              ((INodeFileWithSnapshot)oldnode).getDiffs());
        }

        fsDir.replaceINodeFile(path, oldnode, cons);
        namesystem.leaseManager.addLease(cons.getClientName(), path);
      }
    }
838
839 private void loadSecretManagerState(DataInput in)
840 throws IOException {
841 int imgVersion = getLayoutVersion();
842
843 if (!LayoutVersion.supports(Feature.DELEGATION_TOKEN, imgVersion)) {
844 //SecretManagerState is not available.
845 //This must not happen if security is turned on.
846 return;
847 }
848 namesystem.loadSecretManagerState(in);
849 }
850
    /** @return the layout version recorded in the namesystem's storage,
     *          i.e. the version this image is expected to have */
    private int getLayoutVersion() {
      return namesystem.getFSImage().getStorage().getLayoutVersion();
    }
854
855 private boolean isRoot(byte[][] path) {
856 return path.length == 1 &&
857 path[0] == null;
858 }
859
860 private boolean isParent(byte[][] path, byte[][] parent) {
861 if (path == null || parent == null)
862 return false;
863 if (parent.length == 0 || path.length != parent.length + 1)
864 return false;
865 boolean isParent = true;
866 for (int i = 0; i < parent.length; i++) {
867 isParent = isParent && Arrays.equals(path[i], parent[i]);
868 }
869 return isParent;
870 }
871
    /**
     * Return string representing the parent of the given path.
     */
    // NOTE(review): if path contains no separator, lastIndexOf returns -1 and
    // substring(0, -1) throws StringIndexOutOfBoundsException; callers appear
    // to pass absolute paths only — confirm before hardening.
    String getParent(String path) {
      return path.substring(0, path.lastIndexOf(Path.SEPARATOR));
    }
878
879 byte[][] getParent(byte[][] path) {
880 byte[][] result = new byte[path.length - 1][];
881 for (int i = 0; i < result.length; i++) {
882 result[i] = new byte[path[i].length];
883 System.arraycopy(path[i], 0, result[i], 0, path[i].length);
884 }
885 return result;
886 }
887
888 public Snapshot getSnapshot(DataInput in) throws IOException {
889 return snapshotMap.get(in.readInt());
890 }
891 }
892
893 /**
894 * A one-shot class responsible for writing an image file.
895 * The write() function should be called once, after which the getter
896 * functions may be used to retrieve information about the file that was written.
897 */
898 static class Saver {
899 private final SaveNamespaceContext context;
900 /** Set to true once an image has been written */
901 private boolean saved = false;
902
903 /** The MD5 checksum of the file that was written */
904 private MD5Hash savedDigest;
905 private final ReferenceMap referenceMap = new ReferenceMap();
906
907 /** @throws IllegalStateException if the instance has not yet saved an image */
908 private void checkSaved() {
909 if (!saved) {
910 throw new IllegalStateException("FSImageSaver has not saved an image");
911 }
912 }
913
914 /** @throws IllegalStateException if the instance has already saved an image */
915 private void checkNotSaved() {
916 if (saved) {
917 throw new IllegalStateException("FSImageSaver has already saved an image");
918 }
919 }
920
921
922 Saver(SaveNamespaceContext context) {
923 this.context = context;
924 }
925
926 /**
927 * Return the MD5 checksum of the image file that was saved.
928 */
929 MD5Hash getSavedDigest() {
930 checkSaved();
931 return savedDigest;
932 }
933
934 void save(File newFile, FSImageCompression compression) throws IOException {
935 checkNotSaved();
936
937 final FSNamesystem sourceNamesystem = context.getSourceNamesystem();
938 FSDirectory fsDir = sourceNamesystem.dir;
939 String sdPath = newFile.getParentFile().getParentFile().getAbsolutePath();
940 Step step = new Step(StepType.INODES, sdPath);
941 StartupProgress prog = NameNode.getStartupProgress();
942 prog.beginStep(Phase.SAVING_CHECKPOINT, step);
943 prog.setTotal(Phase.SAVING_CHECKPOINT, step,
944 fsDir.rootDir.numItemsInTree());
945 Counter counter = prog.getCounter(Phase.SAVING_CHECKPOINT, step);
946 long startTime = now();
947 //
948 // Write out data
949 //
950 MessageDigest digester = MD5Hash.getDigester();
951 FileOutputStream fout = new FileOutputStream(newFile);
952 DigestOutputStream fos = new DigestOutputStream(fout, digester);
953 DataOutputStream out = new DataOutputStream(fos);
954 try {
955 out.writeInt(HdfsConstants.LAYOUT_VERSION);
956 // We use the non-locked version of getNamespaceInfo here since
957 // the coordinating thread of saveNamespace already has read-locked
958 // the namespace for us. If we attempt to take another readlock
959 // from the actual saver thread, there's a potential of a
960 // fairness-related deadlock. See the comments on HDFS-2223.
961 out.writeInt(sourceNamesystem.unprotectedGetNamespaceInfo()
962 .getNamespaceID());
963 out.writeLong(fsDir.rootDir.numItemsInTree());
964 out.writeLong(sourceNamesystem.getGenerationStampV1());
965 out.writeLong(sourceNamesystem.getGenerationStampV2());
966 out.writeLong(sourceNamesystem.getGenerationStampAtblockIdSwitch());
967 out.writeLong(sourceNamesystem.getLastAllocatedBlockId());
968 out.writeLong(context.getTxId());
969 out.writeLong(sourceNamesystem.getLastInodeId());
970
971
972 sourceNamesystem.getSnapshotManager().write(out);
973
974 // write compression info and set up compressed stream
975 out = compression.writeHeaderAndWrapStream(fos);
976 LOG.info("Saving image file " + newFile +
977 " using " + compression);
978
979 // save the root
980 saveINode2Image(fsDir.rootDir, out, false, referenceMap, counter);
981 // save the rest of the nodes
982 saveImage(fsDir.rootDir, out, true, counter);
983 prog.endStep(Phase.SAVING_CHECKPOINT, step);
984 // Now that the step is finished, set counter equal to total to adjust
985 // for possible under-counting due to reference inodes.
986 prog.setCount(Phase.SAVING_CHECKPOINT, step,
987 fsDir.rootDir.numItemsInTree());
988 // save files under construction
989 sourceNamesystem.saveFilesUnderConstruction(out);
990 context.checkCancelled();
991 sourceNamesystem.saveSecretManagerState(out, sdPath);
992 context.checkCancelled();
993 out.flush();
994 context.checkCancelled();
995 fout.getChannel().force(true);
996 } finally {
997 out.close();
998 }
999
1000 saved = true;
1001 // set md5 of the saved image
1002 savedDigest = new MD5Hash(digester.digest());
1003
1004 LOG.info("Image file " + newFile + " of size " + newFile.length() +
1005 " bytes saved in " + (now() - startTime)/1000 + " seconds.");
1006 }
1007
1008 /**
1009 * Save children INodes.
1010 * @param children The list of children INodes
1011 * @param out The DataOutputStream to write
1012 * @param counter Counter to increment for namenode startup progress
1013 * @return Number of children that are directory
1014 */
1015 private int saveChildren(ReadOnlyList<INode> children, DataOutputStream out,
1016 Counter counter) throws IOException {
1017 // Write normal children INode.
1018 out.writeInt(children.size());
1019 int dirNum = 0;
1020 int i = 0;
1021 for(INode child : children) {
1022 // print all children first
1023 saveINode2Image(child, out, false, referenceMap, counter);
1024 if (child.isDirectory()) {
1025 dirNum++;
1026 }
1027 if (i++ % 50 == 0) {
1028 context.checkCancelled();
1029 }
1030 }
1031 return dirNum;
1032 }
1033
1034 /**
1035 * Save file tree image starting from the given root.
1036 * This is a recursive procedure, which first saves all children and
1037 * snapshot diffs of a current directory and then moves inside the
1038 * sub-directories.
1039 *
1040 * @param current The current node
1041 * @param out The DataoutputStream to write the image
1042 * @param snapshot The possible snapshot associated with the current node
1043 * @param toSaveSubtree Whether or not to save the subtree to fsimage. For
1044 * reference node, its subtree may already have been
1045 * saved before.
1046 * @param counter Counter to increment for namenode startup progress
1047 */
1048 private void saveImage(INodeDirectory current, DataOutputStream out,
1049 boolean toSaveSubtree, Counter counter) throws IOException {
1050 // write the inode id of the directory
1051 out.writeLong(current.getId());
1052
1053 if (!toSaveSubtree) {
1054 return;
1055 }
1056
1057 final ReadOnlyList<INode> children = current.getChildrenList(null);
1058 int dirNum = 0;
1059 List<INodeDirectory> snapshotDirs = null;
1060 if (current instanceof INodeDirectoryWithSnapshot) {
1061 snapshotDirs = new ArrayList<INodeDirectory>();
1062 ((INodeDirectoryWithSnapshot) current).getSnapshotDirectory(
1063 snapshotDirs);
1064 dirNum += snapshotDirs.size();
1065 }
1066
1067 // 2. Write INodeDirectorySnapshottable#snapshotsByNames to record all
1068 // Snapshots
1069 if (current instanceof INodeDirectorySnapshottable) {
1070 INodeDirectorySnapshottable snapshottableNode =
1071 (INodeDirectorySnapshottable) current;
1072 SnapshotFSImageFormat.saveSnapshots(snapshottableNode, out);
1073 } else {
1074 out.writeInt(-1); // # of snapshots
1075 }
1076
1077 // 3. Write children INode
1078 dirNum += saveChildren(children, out, counter);
1079
1080 // 4. Write DirectoryDiff lists, if there is any.
1081 SnapshotFSImageFormat.saveDirectoryDiffList(current, out, referenceMap);
1082
1083 // Write sub-tree of sub-directories, including possible snapshots of
1084 // deleted sub-directories
1085 out.writeInt(dirNum); // the number of sub-directories
1086 for(INode child : children) {
1087 if(!child.isDirectory()) {
1088 continue;
1089 }
1090 // make sure we only save the subtree under a reference node once
1091 boolean toSave = child.isReference() ?
1092 referenceMap.toProcessSubtree(child.getId()) : true;
1093 saveImage(child.asDirectory(), out, toSave, counter);
1094 }
1095 if (snapshotDirs != null) {
1096 for (INodeDirectory subDir : snapshotDirs) {
1097 // make sure we only save the subtree under a reference node once
1098 boolean toSave = subDir.getParentReference() != null ?
1099 referenceMap.toProcessSubtree(subDir.getId()) : true;
1100 saveImage(subDir, out, toSave, counter);
1101 }
1102 }
1103 }
1104
1105 /**
1106 * Saves inode and increments progress counter.
1107 *
1108 * @param inode INode to save
1109 * @param out DataOutputStream to receive inode
1110 * @param writeUnderConstruction boolean true if this is under construction
1111 * @param referenceMap ReferenceMap containing reference inodes
1112 * @param counter Counter to increment for namenode startup progress
1113 * @throws IOException thrown if there is an I/O error
1114 */
1115 private void saveINode2Image(INode inode, DataOutputStream out,
1116 boolean writeUnderConstruction, ReferenceMap referenceMap,
1117 Counter counter) throws IOException {
1118 FSImageSerialization.saveINode2Image(inode, out, writeUnderConstruction,
1119 referenceMap);
1120 // Intentionally do not increment counter for reference inodes, because it
1121 // is too difficult at this point to assess whether or not this is a
1122 // reference that counts toward quota.
1123 if (!(inode instanceof INodeReference)) {
1124 counter.increment();
1125 }
1126 }
1127 }
1128 }