001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hdfs.server.namenode;
019
020import static org.apache.hadoop.crypto.key.KeyProviderCryptoExtension.EncryptedKeyVersion;
021import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_TRASH_INTERVAL_DEFAULT;
022import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_TRASH_INTERVAL_KEY;
023import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_FILE_BUFFER_SIZE_DEFAULT;
024import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_FILE_BUFFER_SIZE_KEY;
025import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BLOCK_SIZE_DEFAULT;
026import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BLOCK_SIZE_KEY;
027import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BYTES_PER_CHECKSUM_DEFAULT;
028import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BYTES_PER_CHECKSUM_KEY;
029import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CHECKSUM_TYPE_DEFAULT;
030import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CHECKSUM_TYPE_KEY;
031import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_WRITE_PACKET_SIZE_DEFAULT;
032import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_WRITE_PACKET_SIZE_KEY;
033import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_ENCRYPT_DATA_TRANSFER_DEFAULT;
034import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_ENCRYPT_DATA_TRANSFER_KEY;
035import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_STANDBY_CHECKPOINTS_DEFAULT;
036import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_STANDBY_CHECKPOINTS_KEY;
037import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ACCESSTIME_PRECISION_DEFAULT;
038import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ACCESSTIME_PRECISION_KEY;
039import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOGGERS_KEY;
040import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_ASYNC_DEFAULT;
041import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_ASYNC_KEY;
042import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_DEFAULT;
043import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_KEY;
044import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT;
045import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_KEY;
046import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DEFAULT_AUDIT_LOGGER_NAME;
047import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_DEFAULT;
048import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_KEY;
049import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_DEFAULT;
050import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY;
051import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_DEFAULT;
052import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_KEY;
053import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_DEFAULT;
054import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_KEY;
055import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY;
056import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY;
057import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDIT_LOG_AUTOROLL_CHECK_INTERVAL_MS;
058import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDIT_LOG_AUTOROLL_CHECK_INTERVAL_MS_DEFAULT;
059import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDIT_LOG_AUTOROLL_MULTIPLIER_THRESHOLD;
060import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDIT_LOG_AUTOROLL_MULTIPLIER_THRESHOLD_DEFAULT;
061import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ENABLE_RETRY_CACHE_DEFAULT;
062import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ENABLE_RETRY_CACHE_KEY;
063import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_LAZY_PERSIST_FILE_SCRUB_INTERVAL_SEC;
064import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_LAZY_PERSIST_FILE_SCRUB_INTERVAL_SEC_DEFAULT;
065import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_INODE_ATTRIBUTES_PROVIDER_KEY;
066import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_MAX_OBJECTS_DEFAULT;
067import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_MAX_OBJECTS_KEY;
068import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_NAME_DIR_KEY;
069import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_REPLICATION_MIN_DEFAULT;
070import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_REPLICATION_MIN_KEY;
071import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_REPL_QUEUE_THRESHOLD_PCT_KEY;
072import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_DEFAULT;
073import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_KEY;
074import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_DEFAULT;
075import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_KEY;
076import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_HEAP_PERCENT_DEFAULT;
077import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_HEAP_PERCENT_KEY;
078import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_EXTENSION_KEY;
079import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_MIN_DATANODES_DEFAULT;
080import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY;
081import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_DEFAULT;
082import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_KEY;
083import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SHARED_EDITS_DIR_KEY;
084import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_ENABLED_DEFAULT;
085import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_ENABLED_KEY;
086import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_SUPERUSERGROUP_DEFAULT;
087import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_SUPERUSERGROUP_KEY;
088import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_REPLICATION_DEFAULT;
089import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_REPLICATION_KEY;
090import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_SUPPORT_APPEND_DEFAULT;
091import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_SUPPORT_APPEND_KEY;
092import static org.apache.hadoop.hdfs.server.common.HdfsServerConstants.SECURITY_XATTR_UNREADABLE_BY_SUPERUSER;
093import static org.apache.hadoop.util.Time.now;
094import static org.apache.hadoop.util.Time.monotonicNow;
095
096import java.io.BufferedWriter;
097import java.io.ByteArrayInputStream;
098import java.io.DataInput;
099import java.io.DataInputStream;
100import java.io.DataOutputStream;
101import java.io.File;
102import java.io.FileNotFoundException;
103import java.io.FileOutputStream;
104import java.io.IOException;
105import java.io.OutputStreamWriter;
106import java.io.PrintWriter;
107import java.io.StringWriter;
108import java.lang.management.ManagementFactory;
109import java.net.InetAddress;
110import java.net.URI;
111import java.security.GeneralSecurityException;
112import java.util.ArrayList;
113import java.util.Arrays;
114import java.util.Collection;
115import java.util.Collections;
116import java.util.Date;
117import java.util.EnumSet;
118import java.util.HashMap;
119import java.util.HashSet;
120import java.util.Iterator;
121import java.util.LinkedHashSet;
122import java.util.List;
123import java.util.Map;
124import java.util.Set;
125import java.util.TreeMap;
126import java.util.concurrent.TimeUnit;
127import java.util.concurrent.locks.Condition;
128import java.util.concurrent.locks.ReentrantLock;
129import java.util.concurrent.locks.ReentrantReadWriteLock;
130
131import javax.management.NotCompliantMBeanException;
132import javax.management.ObjectName;
133import javax.management.StandardMBean;
134
135import org.apache.commons.logging.Log;
136import org.apache.commons.logging.LogFactory;
137import org.apache.commons.logging.impl.Log4JLogger;
138import org.apache.hadoop.HadoopIllegalArgumentException;
139import org.apache.hadoop.classification.InterfaceAudience;
140import org.apache.hadoop.conf.Configuration;
141import org.apache.hadoop.crypto.CipherSuite;
142import org.apache.hadoop.crypto.CryptoProtocolVersion;
143import org.apache.hadoop.crypto.key.KeyProvider;
144import org.apache.hadoop.crypto.CryptoCodec;
145import org.apache.hadoop.crypto.key.KeyProviderCryptoExtension;
146import org.apache.hadoop.fs.BatchedRemoteIterator.BatchedListEntries;
147import org.apache.hadoop.fs.CacheFlag;
148import org.apache.hadoop.fs.ContentSummary;
149import org.apache.hadoop.fs.CreateFlag;
150import org.apache.hadoop.fs.FileAlreadyExistsException;
151import org.apache.hadoop.fs.FileEncryptionInfo;
152import org.apache.hadoop.fs.FileStatus;
153import org.apache.hadoop.fs.FileSystem;
154import org.apache.hadoop.fs.FsServerDefaults;
155import org.apache.hadoop.fs.InvalidPathException;
156import org.apache.hadoop.fs.Options;
157import org.apache.hadoop.fs.ParentNotDirectoryException;
158import org.apache.hadoop.fs.Path;
159import org.apache.hadoop.fs.UnresolvedLinkException;
160import org.apache.hadoop.fs.XAttr;
161import org.apache.hadoop.fs.XAttrSetFlag;
162import org.apache.hadoop.fs.permission.AclEntry;
163import org.apache.hadoop.fs.permission.AclStatus;
164import org.apache.hadoop.fs.permission.FsAction;
165import org.apache.hadoop.fs.permission.FsPermission;
166import org.apache.hadoop.fs.permission.PermissionStatus;
167import org.apache.hadoop.fs.StorageType;
168import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
169import org.apache.hadoop.ha.ServiceFailedException;
170import org.apache.hadoop.hdfs.protocol.BlockStoragePolicy;
171import org.apache.hadoop.hdfs.DFSConfigKeys;
172import org.apache.hadoop.hdfs.DFSUtil;
173import org.apache.hadoop.hdfs.HAUtil;
174import org.apache.hadoop.hdfs.HdfsConfiguration;
175import org.apache.hadoop.hdfs.UnknownCryptoProtocolVersionException;
176import org.apache.hadoop.hdfs.XAttrHelper;
177import org.apache.hadoop.hdfs.protocol.AlreadyBeingCreatedException;
178import org.apache.hadoop.hdfs.protocol.Block;
179import org.apache.hadoop.hdfs.protocol.CacheDirectiveEntry;
180import org.apache.hadoop.hdfs.protocol.CacheDirectiveInfo;
181import org.apache.hadoop.hdfs.protocol.CachePoolEntry;
182import org.apache.hadoop.hdfs.protocol.CachePoolInfo;
183import org.apache.hadoop.hdfs.protocol.ClientProtocol;
184import org.apache.hadoop.hdfs.protocol.DatanodeID;
185import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
186import org.apache.hadoop.hdfs.protocol.DirectoryListing;
187import org.apache.hadoop.hdfs.protocol.EncryptionZone;
188import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
189import org.apache.hadoop.hdfs.protocol.HdfsConstants;
190import org.apache.hadoop.hdfs.protocol.LastBlockWithStatus;
191import org.apache.hadoop.hdfs.protocol.HdfsConstants.DatanodeReportType;
192import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction;
193import org.apache.hadoop.hdfs.protocol.HdfsFileStatus;
194import org.apache.hadoop.hdfs.protocol.LocatedBlock;
195import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
196import org.apache.hadoop.hdfs.protocol.QuotaExceededException;
197import org.apache.hadoop.hdfs.protocol.RecoveryInProgressException;
198import org.apache.hadoop.hdfs.protocol.RollingUpgradeException;
199import org.apache.hadoop.hdfs.protocol.RollingUpgradeInfo;
200import org.apache.hadoop.hdfs.protocol.SnapshotAccessControlException;
201import org.apache.hadoop.hdfs.protocol.SnapshotDiffReport;
202import org.apache.hadoop.hdfs.protocol.SnapshottableDirectoryStatus;
203import org.apache.hadoop.hdfs.protocol.datatransfer.ReplaceDatanodeOnFailure;
204import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager;
205import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager.AccessMode;
206import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier;
207import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenSecretManager;
208import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenSecretManager.SecretManagerState;
209import org.apache.hadoop.hdfs.server.blockmanagement.BlockCollection;
210import org.apache.hadoop.hdfs.server.blockmanagement.BlockIdManager;
211import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoContiguous;
212import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoContiguousUnderConstruction;
213import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;
214import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor;
215import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeManager;
216import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeStatistics;
217import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeStorageInfo;
218import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.BlockUCState;
219import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.NamenodeRole;
220import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.RollingUpgradeStartupOption;
221import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption;
222import org.apache.hadoop.hdfs.server.common.Storage;
223import org.apache.hadoop.hdfs.server.common.Storage.StorageDirType;
224import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
225import org.apache.hadoop.hdfs.server.common.Util;
226import org.apache.hadoop.hdfs.server.namenode.FsImageProto.SecretManagerSection;
227import org.apache.hadoop.hdfs.server.namenode.INode.BlocksMapUpdateInfo;
228import org.apache.hadoop.hdfs.server.namenode.JournalSet.JournalAndStream;
229import org.apache.hadoop.hdfs.server.namenode.LeaseManager.Lease;
230import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeFile;
231import org.apache.hadoop.hdfs.server.namenode.NameNode.OperationCategory;
232import org.apache.hadoop.hdfs.server.namenode.ha.EditLogTailer;
233import org.apache.hadoop.hdfs.server.namenode.ha.HAContext;
234import org.apache.hadoop.hdfs.server.namenode.ha.StandbyCheckpointer;
235import org.apache.hadoop.hdfs.server.namenode.metrics.FSNamesystemMBean;
236import org.apache.hadoop.hdfs.server.namenode.metrics.NameNodeMetrics;
237import org.apache.hadoop.hdfs.server.namenode.snapshot.Snapshot;
238import org.apache.hadoop.hdfs.server.namenode.snapshot.SnapshotManager;
239import org.apache.hadoop.hdfs.server.namenode.startupprogress.Phase;
240import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress;
241import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress.Counter;
242import org.apache.hadoop.hdfs.server.namenode.startupprogress.Status;
243import org.apache.hadoop.hdfs.server.namenode.startupprogress.Step;
244import org.apache.hadoop.hdfs.server.namenode.startupprogress.StepType;
245import org.apache.hadoop.hdfs.server.namenode.top.TopAuditLogger;
246import org.apache.hadoop.hdfs.server.namenode.top.TopConf;
247import org.apache.hadoop.hdfs.server.namenode.top.metrics.TopMetrics;
248import org.apache.hadoop.hdfs.server.namenode.top.window.RollingWindowManager;
249import org.apache.hadoop.hdfs.server.namenode.web.resources.NamenodeWebHdfsMethods;
250import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand;
251import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
252import org.apache.hadoop.hdfs.server.protocol.DatanodeStorageReport;
253import org.apache.hadoop.hdfs.server.protocol.HeartbeatResponse;
254import org.apache.hadoop.hdfs.server.protocol.NNHAStatusHeartbeat;
255import org.apache.hadoop.hdfs.server.protocol.NamenodeCommand;
256import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration;
257import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
258import org.apache.hadoop.hdfs.server.protocol.StorageReceivedDeletedBlocks;
259import org.apache.hadoop.hdfs.server.protocol.StorageReport;
260import org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary;
261import org.apache.hadoop.io.EnumSetWritable;
262import org.apache.hadoop.io.IOUtils;
263import org.apache.hadoop.io.Text;
264import org.apache.hadoop.ipc.RetriableException;
265import org.apache.hadoop.ipc.RetryCache;
266import org.apache.hadoop.ipc.Server;
267import org.apache.hadoop.ipc.StandbyException;
268import org.apache.hadoop.metrics2.annotation.Metric;
269import org.apache.hadoop.metrics2.annotation.Metrics;
270import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
271import org.apache.hadoop.metrics2.util.MBeans;
272import org.apache.hadoop.net.NetworkTopology;
273import org.apache.hadoop.net.Node;
274import org.apache.hadoop.net.NodeBase;
275import org.apache.hadoop.security.AccessControlException;
276import org.apache.hadoop.security.UserGroupInformation;
277import org.apache.hadoop.security.UserGroupInformation.AuthenticationMethod;
278import org.apache.hadoop.security.token.SecretManager.InvalidToken;
279import org.apache.hadoop.security.token.Token;
280import org.apache.hadoop.security.token.TokenIdentifier;
281import org.apache.hadoop.security.token.delegation.DelegationKey;
282import org.apache.hadoop.util.ChunkedArrayList;
283import org.apache.hadoop.util.Daemon;
284import org.apache.hadoop.util.DataChecksum;
285import org.apache.hadoop.util.ReflectionUtils;
286import org.apache.hadoop.util.StringUtils;
287import org.apache.hadoop.util.VersionInfo;
288import org.apache.log4j.Appender;
289import org.apache.log4j.AsyncAppender;
290import org.apache.log4j.Logger;
291import org.codehaus.jackson.map.ObjectMapper;
292import org.mortbay.util.ajax.JSON;
293
294import com.google.common.annotations.VisibleForTesting;
295import com.google.common.base.Charsets;
296import com.google.common.base.Preconditions;
297import com.google.common.collect.ImmutableMap;
298import com.google.common.collect.Lists;
299
300/***************************************************
301 * FSNamesystem does the actual bookkeeping work for the
302 * DataNode.
303 *
304 * It tracks several important tables.
305 *
306 * 1)  valid fsname --> blocklist  (kept on disk, logged)
307 * 2)  Set of all valid blocks (inverted #1)
308 * 3)  block --> machinelist (kept in memory, rebuilt dynamically from reports)
309 * 4)  machine --> blocklist (inverted #2)
310 * 5)  LRU cache of updated-heartbeat machines
311 ***************************************************/
312@InterfaceAudience.Private
313@Metrics(context="dfs")
314public class FSNamesystem implements Namesystem, FSNamesystemMBean,
315  NameNodeMXBean {
316  public static final Log LOG = LogFactory.getLog(FSNamesystem.class);
317
  // Per-thread scratch buffer, presumably used to assemble audit log lines
  // without per-call allocation -- not referenced in this chunk; TODO confirm.
  private static final ThreadLocal<StringBuilder> auditBuffer =
    new ThreadLocal<StringBuilder>() {
      @Override
      protected StringBuilder initialValue() {
        return new StringBuilder();
      }
  };

  // Allocates/tracks block and generation-stamp IDs; cleared in clear().
  private final BlockIdManager blockIdManager;
327
328  @VisibleForTesting
329  public boolean isAuditEnabled() {
330    return !isDefaultAuditLogger || auditLog.isInfoEnabled();
331  }
332
  /**
   * Convenience overload: logs an audit event with no destination path and
   * no resulting file status.
   *
   * @param succeeded whether the command completed successfully
   * @param cmd the command name
   * @param src the source path the command operated on
   * @throws IOException propagated from resolving the remote caller
   */
  private void logAuditEvent(boolean succeeded, String cmd, String src)
      throws IOException {
    logAuditEvent(succeeded, cmd, src, null, null);
  }
337  
  /**
   * Logs an audit event for an external (RPC) invocation, if audit logging
   * is enabled. Internal invocations are deliberately not audited.
   *
   * @param succeeded whether the command completed successfully
   * @param cmd the command name
   * @param src the source path the command operated on
   * @param dst the destination path, or null if not applicable
   * @param stat the resulting file status, or null if not applicable
   * @throws IOException propagated from resolving the remote user/address
   */
  private void logAuditEvent(boolean succeeded, String cmd, String src,
      String dst, HdfsFileStatus stat) throws IOException {
    if (isAuditEnabled() && isExternalInvocation()) {
      logAuditEvent(succeeded, getRemoteUser(), getRemoteIp(),
                    cmd, src, dst, stat);
    }
  }
345
346  private void logAuditEvent(boolean succeeded,
347      UserGroupInformation ugi, InetAddress addr, String cmd, String src,
348      String dst, HdfsFileStatus stat) {
349    FileStatus status = null;
350    if (stat != null) {
351      Path symlink = stat.isSymlink() ? new Path(stat.getSymlink()) : null;
352      Path path = dst != null ? new Path(dst) : new Path(src);
353      status = new FileStatus(stat.getLen(), stat.isDir(),
354          stat.getReplication(), stat.getBlockSize(), stat.getModificationTime(),
355          stat.getAccessTime(), stat.getPermission(), stat.getOwner(),
356          stat.getGroup(), symlink, path);
357    }
358    for (AuditLogger logger : auditLoggers) {
359      if (logger instanceof HdfsAuditLogger) {
360        HdfsAuditLogger hdfsLogger = (HdfsAuditLogger) logger;
361        hdfsLogger.logAuditEvent(succeeded, ugi.toString(), addr, cmd, src, dst,
362            status, ugi, dtSecretManager);
363      } else {
364        logger.logAuditEvent(succeeded, ugi.toString(), addr,
365            cmd, src, dst, status);
366      }
367    }
368  }
369
370  /**
371   * Logger for audit events, noting successful FSNamesystem operations. Emits
372   * to FSNamesystem.audit at INFO. Each event causes a set of tab-separated
373   * <code>key=value</code> pairs to be written for the following properties:
374   * <code>
375   * ugi=&lt;ugi in RPC&gt;
376   * ip=&lt;remote IP&gt;
377   * cmd=&lt;command&gt;
378   * src=&lt;src path&gt;
379   * dst=&lt;dst path (optional)&gt;
380   * perm=&lt;permissions (optional)&gt;
381   * </code>
382   */
383  public static final Log auditLog = LogFactory.getLog(
384      FSNamesystem.class.getName() + ".audit");
385
386  static final int DEFAULT_MAX_CORRUPT_FILEBLOCKS_RETURNED = 100;
387  static int BLOCK_DELETION_INCREMENT = 1000;
388  private final boolean isPermissionEnabled;
389  private final UserGroupInformation fsOwner;
390  private final String supergroup;
391  private final boolean standbyShouldCheckpoint;
392  
393  // Scan interval is not configurable.
394  private static final long DELEGATION_TOKEN_REMOVER_SCAN_INTERVAL =
395    TimeUnit.MILLISECONDS.convert(1, TimeUnit.HOURS);
396  final DelegationTokenSecretManager dtSecretManager;
397  private final boolean alwaysUseDelegationTokensForTests;
398
399  private static final Step STEP_AWAITING_REPORTED_BLOCKS =
400    new Step(StepType.AWAITING_REPORTED_BLOCKS);
401
402  // Tracks whether the default audit logger is the only configured audit
403  // logger; this allows isAuditEnabled() to return false in case the
404  // underlying logger is disabled, and avoid some unnecessary work.
405  private final boolean isDefaultAuditLogger;
406  private final List<AuditLogger> auditLoggers;
407
  /** The namespace tree. */
  FSDirectory dir;
  // Block-level state (block -> datanode mappings etc.); cleared last in
  // clear().
  private final BlockManager blockManager;
  // Snapshot subsystem; snapshottable dirs cleared in clear().
  private final SnapshotManager snapshotManager;
  // Centralized cache management subsystem; cleared in clear().
  private final CacheManager cacheManager;
  // Aggregate datanode statistics.
  private final DatanodeStatistics datanodeStatistics;

  // ID of the nameservice this NN serves; may be null -- TODO confirm.
  private String nameserviceId;

  // Non-null while a rolling upgrade is in progress -- TODO confirm lifecycle.
  private volatile RollingUpgradeInfo rollingUpgradeInfo = null;
  /**
   * A flag that indicates whether the checkpointer should checkpoint a rollback
   * fsimage. The edit log tailer sets this flag. The checkpoint will create a
   * rollback fsimage if the flag is true, and then change the flag to false.
   */
  private volatile boolean needRollbackFsImage;

  // Block pool ID used by this namenode
  private String blockPoolId;

  // Tracks file leases (write locks on paths); all leases removed in clear().
  final LeaseManager leaseManager = new LeaseManager(this); 

  volatile Daemon smmthread = null;  // SafeModeMonitor thread
  
  Daemon nnrmthread = null; // NamenodeResourceMonitor thread

  Daemon nnEditLogRoller = null; // NameNodeEditLogRoller thread

  // A daemon to periodically clean up corrupt lazyPersist files
  // from the name space.
  Daemon lazyPersistFileScrubber = null;
  /**
   * When an active namenode will roll its own edit log, in # edits
   */
  private final long editLogRollerThreshold;
  /**
   * Check interval of an active namenode's edit log roller thread 
   */
  private final int editLogRollerInterval;

  /**
   * How frequently we scan and unlink corrupt lazyPersist files.
   * (In seconds)
   */
  private final int lazyPersistFileScrubIntervalSec;

  // Set by the resource monitor; false when NN disk resources are low --
  // TODO confirm writer.
  private volatile boolean hasResourcesAvailable = false;
  // Cleared on shutdown to stop daemon threads -- TODO confirm.
  private volatile boolean fsRunning = true;
  
  /** The start time of the namesystem. */
  private final long startTime = now();

  /** The interval of namenode checking for the disk space availability */
  private final long resourceRecheckInterval;

  // The actual resource checker instance.
  NameNodeResourceChecker nnResourceChecker;

  // Server defaults (block size, replication, etc.) reported to clients.
  private final FsServerDefaults serverDefaults;
  // Whether append is supported (dfs.support.append).
  private final boolean supportAppends;
  // Policy for replacing a datanode on write-pipeline failure.
  private final ReplaceDatanodeOnFailure dtpReplaceDatanodeOnFailure;

  private volatile SafeModeInfo safeMode;  // safe mode information

  private final long maxFsObjects;          // maximum number of fs objects

  private final long minBlockSize;         // minimum block size
  private final long maxBlocksPerFile;     // maximum # of blocks per file

  // precision of access times.
  private final long accessTimePrecision;
479
  /** Lock to protect FSNamesystem. */
  private final FSNamesystemLock fsLock;

  /** 
   * Checkpoint lock to protect FSNamesystem modification on standby NNs.
   * Unlike fsLock, it does not affect block updates. On active NNs, this lock
   * does not provide proper protection, because there are operations that
   * modify both block and name system state.  Even on standby, fsLock is 
   * used when block state changes need to be blocked.
   */
  private final ReentrantLock cpLock;

  /**
   * Used when this NN is in standby state to read from the shared edit log.
   */
  private EditLogTailer editLogTailer = null;

  /**
   * Used when this NN is in standby state to perform checkpoints.
   */
  private StandbyCheckpointer standbyCheckpointer;

  /**
   * Reference to the NN's HAContext object. This is only set once
   * {@link #startCommonServices(Configuration, HAContext)} is called. 
   */
  private HAContext haContext;

  /** Whether HA is enabled for this namesystem. */
  private final boolean haEnabled;

  /** flag indicating whether replication queues have been initialized */
  boolean initializedReplQueues = false;

  /**
   * Whether the namenode is in the middle of starting the active service
   */
  private volatile boolean startingActiveService = false;

  // Cache of recent non-idempotent RPC results; may be null when disabled --
  // TODO confirm.
  private final RetryCache retryCache;

  // Key provider for encryption-related operations; null when none is
  // configured -- TODO confirm.
  private KeyProviderCryptoExtension provider = null;

  // True once the FSImage has been loaded; waiters block on 'cond' (see
  // waitForLoadingFSImage) and are woken by setImageLoaded().
  private volatile boolean imageLoaded = false;
  private final Condition cond;

  /** The FSImage this namesystem is associated with. */
  private final FSImage fsImage;

  // "Top users" metrics configuration and collector -- TODO confirm usage.
  private final TopConf topConf;
  private TopMetrics topMetrics;

  // Optional pluggable provider of inode attributes (permissions/ACLs) --
  // TODO confirm.
  private INodeAttributeProvider inodeAttributeProvider;
531
532  /**
533   * Notify that loading of this FSDirectory is complete, and
534   * it is imageLoaded for use
535   */
536  void imageLoadComplete() {
537    Preconditions.checkState(!imageLoaded, "FSDirectory already loaded");
538    setImageLoaded();
539  }
540
541  void setImageLoaded() {
542    if(imageLoaded) return;
543    writeLock();
544    try {
545      setImageLoaded(true);
546      dir.markNameCacheInitialized();
547      cond.signalAll();
548    } finally {
549      writeUnlock();
550    }
551  }
552
  // This is for testing purposes only.
  /** @return true once the FSImage has been loaded. */
  @VisibleForTesting
  boolean isImageLoaded() {
    return imageLoaded;
  }
558
  // Exposed for unit tests. Within this chunk, all writes of the
  // imageLoaded flag funnel through here (setImageLoaded() and clear()).
  protected void setImageLoaded(boolean flag) {
    imageLoaded = flag;
  }
563
564  /**
565   * Block until the object is imageLoaded to be used.
566   */
567  void waitForLoadingFSImage() {
568    if (!imageLoaded) {
569      writeLock();
570      try {
571        while (!imageLoaded) {
572          try {
573            cond.await(5000, TimeUnit.MILLISECONDS);
574          } catch (InterruptedException ignored) {
575          }
576        }
577      } finally {
578        writeUnlock();
579      }
580    }
581  }
582
583  /**
584   * Clear all loaded data
585   */
586  void clear() {
587    dir.reset();
588    dtSecretManager.reset();
589    blockIdManager.clear();
590    leaseManager.removeAllLeases();
591    snapshotManager.clearSnapshottableDirs();
592    cacheManager.clear();
593    setImageLoaded(false);
594    blockManager.clear();
595  }
596
  /** @return the lease manager; exposed for tests. */
  @VisibleForTesting
  LeaseManager getLeaseManager() {
    return leaseManager;
  }
601  
  /** @return true if high availability (HA) is enabled. */
  boolean isHaEnabled() {
    return haEnabled;
  }
605  
606  /**
607   * Check the supplied configuration for correctness.
608   * @param conf Supplies the configuration to validate.
609   * @throws IOException if the configuration could not be queried.
610   * @throws IllegalArgumentException if the configuration is invalid.
611   */
612  private static void checkConfiguration(Configuration conf)
613      throws IOException {
614
615    final Collection<URI> namespaceDirs =
616        FSNamesystem.getNamespaceDirs(conf);
617    final Collection<URI> editsDirs =
618        FSNamesystem.getNamespaceEditsDirs(conf);
619    final Collection<URI> requiredEditsDirs =
620        FSNamesystem.getRequiredNamespaceEditsDirs(conf);
621    final Collection<URI> sharedEditsDirs =
622        FSNamesystem.getSharedEditsDirs(conf);
623
624    for (URI u : requiredEditsDirs) {
625      if (u.toString().compareTo(
626              DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_DEFAULT) == 0) {
627        continue;
628      }
629
630      // Each required directory must also be in editsDirs or in
631      // sharedEditsDirs.
632      if (!editsDirs.contains(u) &&
633          !sharedEditsDirs.contains(u)) {
634        throw new IllegalArgumentException(
635            "Required edits directory " + u.toString() + " not present in " +
636            DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY + ". " +
637            DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY + "=" +
638            editsDirs.toString() + "; " +
639            DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY + "=" +
640            requiredEditsDirs.toString() + ". " +
641            DFSConfigKeys.DFS_NAMENODE_SHARED_EDITS_DIR_KEY + "=" +
642            sharedEditsDirs.toString() + ".");
643      }
644    }
645
646    if (namespaceDirs.size() == 1) {
647      LOG.warn("Only one image storage directory ("
648          + DFS_NAMENODE_NAME_DIR_KEY + ") configured. Beware of data loss"
649          + " due to lack of redundant storage directories!");
650    }
651    if (editsDirs.size() == 1) {
652      LOG.warn("Only one namespace edits storage directory ("
653          + DFS_NAMENODE_EDITS_DIR_KEY + ") configured. Beware of data loss"
654          + " due to lack of redundant storage directories!");
655    }
656  }
657
658  /**
659   * Instantiates an FSNamesystem loaded from the image and edits
660   * directories specified in the passed Configuration.
661   *
662   * @param conf the Configuration which specifies the storage directories
663   *             from which to load
664   * @return an FSNamesystem which contains the loaded namespace
665   * @throws IOException if loading fails
666   */
667  static FSNamesystem loadFromDisk(Configuration conf) throws IOException {
668
669    checkConfiguration(conf);
670    FSImage fsImage = new FSImage(conf,
671        FSNamesystem.getNamespaceDirs(conf),
672        FSNamesystem.getNamespaceEditsDirs(conf));
673    FSNamesystem namesystem = new FSNamesystem(conf, fsImage, false);
674    StartupOption startOpt = NameNode.getStartupOption(conf);
675    if (startOpt == StartupOption.RECOVER) {
676      namesystem.setSafeMode(SafeModeAction.SAFEMODE_ENTER);
677    }
678
679    long loadStart = monotonicNow();
680    try {
681      namesystem.loadFSImage(startOpt);
682    } catch (IOException ioe) {
683      LOG.warn("Encountered exception loading fsimage", ioe);
684      fsImage.close();
685      throw ioe;
686    }
687    long timeTakenToLoadFSImage = monotonicNow() - loadStart;
688    LOG.info("Finished loading FSImage in " + timeTakenToLoadFSImage + " msecs");
689    NameNodeMetrics nnMetrics = NameNode.getNameNodeMetrics();
690    if (nnMetrics != null) {
691      nnMetrics.setFsImageLoadTime((int) timeTakenToLoadFSImage);
692    }
693    return namesystem;
694  }
695  
  /**
   * Convenience constructor: delegates to
   * {@link #FSNamesystem(Configuration, FSImage, boolean)} with the retry
   * cache setup enabled (ignoreRetryCache == false).
   */
  FSNamesystem(Configuration conf, FSImage fsImage) throws IOException {
    this(conf, fsImage, false);
  }
699  
700  /**
701   * Create an FSNamesystem associated with the specified image.
702   * 
703   * Note that this does not load any data off of disk -- if you would
704   * like that behavior, use {@link #loadFromDisk(Configuration)}
705   *
706   * @param conf configuration
707   * @param fsImage The FSImage to associate with
708   * @param ignoreRetryCache Whether or not should ignore the retry cache setup
709   *                         step. For Secondary NN this should be set to true.
710   * @throws IOException on bad configuration
711   */
712  FSNamesystem(Configuration conf, FSImage fsImage, boolean ignoreRetryCache)
713      throws IOException {
714    provider = DFSUtil.createKeyProviderCryptoExtension(conf);
715    if (provider == null) {
716      LOG.info("No KeyProvider found.");
717    } else {
718      LOG.info("Found KeyProvider: " + provider.toString());
719    }
720    if (conf.getBoolean(DFS_NAMENODE_AUDIT_LOG_ASYNC_KEY,
721                        DFS_NAMENODE_AUDIT_LOG_ASYNC_DEFAULT)) {
722      LOG.info("Enabling async auditlog");
723      enableAsyncAuditLog();
724    }
725    boolean fair = conf.getBoolean("dfs.namenode.fslock.fair", true);
726    LOG.info("fsLock is fair:" + fair);
727    fsLock = new FSNamesystemLock(fair);
728    cond = fsLock.writeLock().newCondition();
729    cpLock = new ReentrantLock();
730
731    this.fsImage = fsImage;
732    try {
733      resourceRecheckInterval = conf.getLong(
734          DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_KEY,
735          DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_DEFAULT);
736
737      this.blockManager = new BlockManager(this, conf);
738      this.datanodeStatistics = blockManager.getDatanodeManager().getDatanodeStatistics();
739      this.blockIdManager = new BlockIdManager(blockManager);
740
741      this.fsOwner = UserGroupInformation.getCurrentUser();
742      this.supergroup = conf.get(DFS_PERMISSIONS_SUPERUSERGROUP_KEY, 
743                                 DFS_PERMISSIONS_SUPERUSERGROUP_DEFAULT);
744      this.isPermissionEnabled = conf.getBoolean(DFS_PERMISSIONS_ENABLED_KEY,
745                                                 DFS_PERMISSIONS_ENABLED_DEFAULT);
746      LOG.info("fsOwner             = " + fsOwner);
747      LOG.info("supergroup          = " + supergroup);
748      LOG.info("isPermissionEnabled = " + isPermissionEnabled);
749
750      // block allocation has to be persisted in HA using a shared edits directory
751      // so that the standby has up-to-date namespace information
752      nameserviceId = DFSUtil.getNamenodeNameServiceId(conf);
753      this.haEnabled = HAUtil.isHAEnabled(conf, nameserviceId);  
754      
755      // Sanity check the HA-related config.
756      if (nameserviceId != null) {
757        LOG.info("Determined nameservice ID: " + nameserviceId);
758      }
759      LOG.info("HA Enabled: " + haEnabled);
760      if (!haEnabled && HAUtil.usesSharedEditsDir(conf)) {
761        LOG.warn("Configured NNs:\n" + DFSUtil.nnAddressesAsString(conf));
762        throw new IOException("Invalid configuration: a shared edits dir " +
763            "must not be specified if HA is not enabled.");
764      }
765
766      // Get the checksum type from config
767      String checksumTypeStr = conf.get(DFS_CHECKSUM_TYPE_KEY, DFS_CHECKSUM_TYPE_DEFAULT);
768      DataChecksum.Type checksumType;
769      try {
770         checksumType = DataChecksum.Type.valueOf(checksumTypeStr);
771      } catch (IllegalArgumentException iae) {
772         throw new IOException("Invalid checksum type in "
773            + DFS_CHECKSUM_TYPE_KEY + ": " + checksumTypeStr);
774      }
775
776      this.serverDefaults = new FsServerDefaults(
777          conf.getLongBytes(DFS_BLOCK_SIZE_KEY, DFS_BLOCK_SIZE_DEFAULT),
778          conf.getInt(DFS_BYTES_PER_CHECKSUM_KEY, DFS_BYTES_PER_CHECKSUM_DEFAULT),
779          conf.getInt(DFS_CLIENT_WRITE_PACKET_SIZE_KEY, DFS_CLIENT_WRITE_PACKET_SIZE_DEFAULT),
780          (short) conf.getInt(DFS_REPLICATION_KEY, DFS_REPLICATION_DEFAULT),
781          conf.getInt(IO_FILE_BUFFER_SIZE_KEY, IO_FILE_BUFFER_SIZE_DEFAULT),
782          conf.getBoolean(DFS_ENCRYPT_DATA_TRANSFER_KEY, DFS_ENCRYPT_DATA_TRANSFER_DEFAULT),
783          conf.getLong(FS_TRASH_INTERVAL_KEY, FS_TRASH_INTERVAL_DEFAULT),
784          checksumType);
785      
786      this.maxFsObjects = conf.getLong(DFS_NAMENODE_MAX_OBJECTS_KEY, 
787                                       DFS_NAMENODE_MAX_OBJECTS_DEFAULT);
788
789      this.minBlockSize = conf.getLong(DFSConfigKeys.DFS_NAMENODE_MIN_BLOCK_SIZE_KEY,
790          DFSConfigKeys.DFS_NAMENODE_MIN_BLOCK_SIZE_DEFAULT);
791      this.maxBlocksPerFile = conf.getLong(DFSConfigKeys.DFS_NAMENODE_MAX_BLOCKS_PER_FILE_KEY,
792          DFSConfigKeys.DFS_NAMENODE_MAX_BLOCKS_PER_FILE_DEFAULT);
793      this.accessTimePrecision = conf.getLong(DFS_NAMENODE_ACCESSTIME_PRECISION_KEY,
794          DFS_NAMENODE_ACCESSTIME_PRECISION_DEFAULT);
795      this.supportAppends = conf.getBoolean(DFS_SUPPORT_APPEND_KEY, DFS_SUPPORT_APPEND_DEFAULT);
796      LOG.info("Append Enabled: " + supportAppends);
797
798      this.dtpReplaceDatanodeOnFailure = ReplaceDatanodeOnFailure.get(conf);
799      
800      this.standbyShouldCheckpoint = conf.getBoolean(
801          DFS_HA_STANDBY_CHECKPOINTS_KEY, DFS_HA_STANDBY_CHECKPOINTS_DEFAULT);
802      // # edit autoroll threshold is a multiple of the checkpoint threshold 
803      this.editLogRollerThreshold = (long)
804          (conf.getFloat(
805              DFS_NAMENODE_EDIT_LOG_AUTOROLL_MULTIPLIER_THRESHOLD,
806              DFS_NAMENODE_EDIT_LOG_AUTOROLL_MULTIPLIER_THRESHOLD_DEFAULT) *
807          conf.getLong(
808              DFS_NAMENODE_CHECKPOINT_TXNS_KEY,
809              DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT));
810      this.editLogRollerInterval = conf.getInt(
811          DFS_NAMENODE_EDIT_LOG_AUTOROLL_CHECK_INTERVAL_MS,
812          DFS_NAMENODE_EDIT_LOG_AUTOROLL_CHECK_INTERVAL_MS_DEFAULT);
813
814      this.lazyPersistFileScrubIntervalSec = conf.getInt(
815          DFS_NAMENODE_LAZY_PERSIST_FILE_SCRUB_INTERVAL_SEC,
816          DFS_NAMENODE_LAZY_PERSIST_FILE_SCRUB_INTERVAL_SEC_DEFAULT);
817
818      if (this.lazyPersistFileScrubIntervalSec == 0) {
819        throw new IllegalArgumentException(
820            DFS_NAMENODE_LAZY_PERSIST_FILE_SCRUB_INTERVAL_SEC + " must be non-zero.");
821      }
822
823      // For testing purposes, allow the DT secret manager to be started regardless
824      // of whether security is enabled.
825      alwaysUseDelegationTokensForTests = conf.getBoolean(
826          DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY,
827          DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_DEFAULT);
828      
829      this.dtSecretManager = createDelegationTokenSecretManager(conf);
830      this.dir = new FSDirectory(this, conf);
831      this.snapshotManager = new SnapshotManager(dir);
832      this.cacheManager = new CacheManager(this, conf, blockManager);
833      this.safeMode = new SafeModeInfo(conf);
834      this.topConf = new TopConf(conf);
835      this.auditLoggers = initAuditLoggers(conf);
836      this.isDefaultAuditLogger = auditLoggers.size() == 1 &&
837        auditLoggers.get(0) instanceof DefaultAuditLogger;
838      this.retryCache = ignoreRetryCache ? null : initRetryCache(conf);
839      Class<? extends INodeAttributeProvider> klass = conf.getClass(
840          DFS_NAMENODE_INODE_ATTRIBUTES_PROVIDER_KEY,
841          null, INodeAttributeProvider.class);
842      if (klass != null) {
843        inodeAttributeProvider = ReflectionUtils.newInstance(klass, conf);
844        LOG.info("Using INode attribute provider: " + klass.getName());
845      }
846    } catch(IOException e) {
847      LOG.error(getClass().getSimpleName() + " initialization failed.", e);
848      close();
849      throw e;
850    } catch (RuntimeException re) {
851      LOG.error(getClass().getSimpleName() + " initialization failed.", re);
852      close();
853      throw re;
854    }
855  }
856
  /** @return the installed audit loggers (unmodifiable list). */
  @VisibleForTesting
  public List<AuditLogger> getAuditLoggers() {
    return auditLoggers;
  }
861
  /** @return the retry cache, or null if it is disabled. */
  @VisibleForTesting
  public RetryCache getRetryCache() {
    return retryCache;
  }
866
867  void lockRetryCache() {
868    if (retryCache != null) {
869      retryCache.lock();
870    }
871  }
872
873  void unlockRetryCache() {
874    if (retryCache != null) {
875      retryCache.unlock();
876    }
877  }
878
  /** Whether or not retry cache is enabled */
  boolean hasRetryCache() {
    return retryCache != null;
  }
883  
884  void addCacheEntryWithPayload(byte[] clientId, int callId, Object payload) {
885    if (retryCache != null) {
886      retryCache.addCacheEntryWithPayload(clientId, callId, payload);
887    }
888  }
889  
890  void addCacheEntry(byte[] clientId, int callId) {
891    if (retryCache != null) {
892      retryCache.addCacheEntry(clientId, callId);
893    }
894  }
895
  /** @return the encryption key provider, or null if none is configured. */
  @VisibleForTesting
  public KeyProviderCryptoExtension getProvider() {
    return provider;
  }
900
901  @VisibleForTesting
902  static RetryCache initRetryCache(Configuration conf) {
903    boolean enable = conf.getBoolean(DFS_NAMENODE_ENABLE_RETRY_CACHE_KEY,
904                                     DFS_NAMENODE_ENABLE_RETRY_CACHE_DEFAULT);
905    LOG.info("Retry cache on namenode is " + (enable ? "enabled" : "disabled"));
906    if (enable) {
907      float heapPercent = conf.getFloat(
908          DFS_NAMENODE_RETRY_CACHE_HEAP_PERCENT_KEY,
909          DFS_NAMENODE_RETRY_CACHE_HEAP_PERCENT_DEFAULT);
910      long entryExpiryMillis = conf.getLong(
911          DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_KEY,
912          DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_DEFAULT);
913      LOG.info("Retry cache will use " + heapPercent
914          + " of total heap and retry cache entry expiry time is "
915          + entryExpiryMillis + " millis");
916      long entryExpiryNanos = entryExpiryMillis * 1000 * 1000;
917      return new RetryCache("NameNodeRetryCache", heapPercent,
918          entryExpiryNanos);
919    }
920    return null;
921  }
922
923  private List<AuditLogger> initAuditLoggers(Configuration conf) {
924    // Initialize the custom access loggers if configured.
925    Collection<String> alClasses = conf.getStringCollection(DFS_NAMENODE_AUDIT_LOGGERS_KEY);
926    List<AuditLogger> auditLoggers = Lists.newArrayList();
927    if (alClasses != null && !alClasses.isEmpty()) {
928      for (String className : alClasses) {
929        try {
930          AuditLogger logger;
931          if (DFS_NAMENODE_DEFAULT_AUDIT_LOGGER_NAME.equals(className)) {
932            logger = new DefaultAuditLogger();
933          } else {
934            logger = (AuditLogger) Class.forName(className).newInstance();
935          }
936          logger.initialize(conf);
937          auditLoggers.add(logger);
938        } catch (RuntimeException re) {
939          throw re;
940        } catch (Exception e) {
941          throw new RuntimeException(e);
942        }
943      }
944    }
945
946    // Make sure there is at least one logger installed.
947    if (auditLoggers.isEmpty()) {
948      auditLoggers.add(new DefaultAuditLogger());
949    }
950
951    // Add audit logger to calculate top users
952    if (topConf.isEnabled) {
953      topMetrics = new TopMetrics(conf, topConf.nntopReportingPeriodsMs);
954      auditLoggers.add(new TopAuditLogger(topMetrics));
955    }
956
957    return Collections.unmodifiableList(auditLoggers);
958  }
959
  /**
   * Load the namespace from the image and edits into this namesystem:
   * optionally format first (FORMAT startup option), save a fresh image when
   * the loaded one is stale, and open the edit log for write unless coming
   * up in (non-upgrading) standby state.
   *
   * @param startOpt how the NameNode was started; FORMAT is downgraded to
   *                 REGULAR after the format pass
   * @throws IOException if the image or edits cannot be read or written
   */
  private void loadFSImage(StartupOption startOpt) throws IOException {
    final FSImage fsImage = getFSImage();

    // format before starting up if requested
    if (startOpt == StartupOption.FORMAT) {
      
      fsImage.format(this, fsImage.getStorage().determineClusterId());// reuse current id

      startOpt = StartupOption.REGULAR;
    }
    boolean success = false;
    writeLock();
    try {
      // We shouldn't be calling saveNamespace if we've come up in standby state.
      MetaRecoveryContext recovery = startOpt.createRecoveryContext();
      final boolean staleImage
          = fsImage.recoverTransitionRead(startOpt, this, recovery);
      // Rolling-upgrade rollback/downgrade discards any in-flight upgrade info.
      if (RollingUpgradeStartupOption.ROLLBACK.matches(startOpt) ||
          RollingUpgradeStartupOption.DOWNGRADE.matches(startOpt)) {
        rollingUpgradeInfo = null;
      }
      final boolean needToSave = staleImage && !haEnabled && !isRollingUpgrade(); 
      LOG.info("Need to save fs image? " + needToSave
          + " (staleImage=" + staleImage + ", haEnabled=" + haEnabled
          + ", isRollingUpgrade=" + isRollingUpgrade() + ")");
      if (needToSave) {
        fsImage.saveNamespace(this);
      } else {
        updateStorageVersionForRollingUpgrade(fsImage.getLayoutVersion(),
            startOpt);
        // No need to save, so mark the phase done.
        StartupProgress prog = NameNode.getStartupProgress();
        prog.beginPhase(Phase.SAVING_CHECKPOINT);
        prog.endPhase(Phase.SAVING_CHECKPOINT);
      }
      // This will start a new log segment and write to the seen_txid file, so
      // we shouldn't do it when coming up in standby state
      if (!haEnabled || (haEnabled && startOpt == StartupOption.UPGRADE)
          || (haEnabled && startOpt == StartupOption.UPGRADEONLY)) {
        fsImage.openEditLogForWrite();
      }
      success = true;
    } finally {
      if (!success) {
        // Loading failed part-way; release the image's storage resources.
        fsImage.close();
      }
      writeUnlock();
    }
    imageLoadComplete();
  }
1010
1011  private void updateStorageVersionForRollingUpgrade(final long layoutVersion,
1012      StartupOption startOpt) throws IOException {
1013    boolean rollingStarted = RollingUpgradeStartupOption.STARTED
1014        .matches(startOpt) && layoutVersion > HdfsConstants
1015        .NAMENODE_LAYOUT_VERSION;
1016    boolean rollingRollback = RollingUpgradeStartupOption.ROLLBACK
1017        .matches(startOpt);
1018    if (rollingRollback || rollingStarted) {
1019      fsImage.updateStorageVersion();
1020    }
1021  }
1022
1023  private void startSecretManager() {
1024    if (dtSecretManager != null) {
1025      try {
1026        dtSecretManager.startThreads();
1027      } catch (IOException e) {
1028        // Inability to start secret manager
1029        // can't be recovered from.
1030        throw new RuntimeException(e);
1031      }
1032    }
1033  }
1034  
1035  private void startSecretManagerIfNecessary() {
1036    boolean shouldRun = shouldUseDelegationTokens() &&
1037      !isInSafeMode() && getEditLog().isOpenForWrite();
1038    boolean running = dtSecretManager.isRunning();
1039    if (shouldRun && !running) {
1040      startSecretManager();
1041    }
1042  }
1043
1044  private void stopSecretManager() {
1045    if (dtSecretManager != null) {
1046      dtSecretManager.stopThreads();
1047    }
1048  }
1049  
1050  /** 
1051   * Start services common to both active and standby states
1052   */
1053  void startCommonServices(Configuration conf, HAContext haContext) throws IOException {
1054    this.registerMBean(); // register the MBean for the FSNamesystemState
1055    writeLock();
1056    this.haContext = haContext;
1057    try {
1058      nnResourceChecker = new NameNodeResourceChecker(conf);
1059      checkAvailableResources();
1060      assert safeMode != null && !isPopulatingReplQueues();
1061      StartupProgress prog = NameNode.getStartupProgress();
1062      prog.beginPhase(Phase.SAFEMODE);
1063      prog.setTotal(Phase.SAFEMODE, STEP_AWAITING_REPORTED_BLOCKS,
1064        getCompleteBlocksTotal());
1065      setBlockTotal();
1066      blockManager.activate(conf);
1067    } finally {
1068      writeUnlock();
1069    }
1070    
1071    registerMXBean();
1072    DefaultMetricsSystem.instance().register(this);
1073    if (inodeAttributeProvider != null) {
1074      inodeAttributeProvider.start();
1075      dir.setINodeAttributeProvider(inodeAttributeProvider);
1076    }
1077    snapshotManager.registerMXBean();
1078  }
1079  
1080  /** 
1081   * Stop services common to both active and standby states
1082   */
1083  void stopCommonServices() {
1084    writeLock();
1085    if (inodeAttributeProvider != null) {
1086      dir.setINodeAttributeProvider(null);
1087      inodeAttributeProvider.stop();
1088    }
1089    try {
1090      if (blockManager != null) blockManager.close();
1091    } finally {
1092      writeUnlock();
1093    }
1094    RetryCache.clear(retryCache);
1095  }
1096  
1097  /**
1098   * Start services required in active state
1099   * @throws IOException
1100   */
1101  void startActiveServices() throws IOException {
1102    startingActiveService = true;
1103    LOG.info("Starting services required for active state");
1104    writeLock();
1105    try {
1106      FSEditLog editLog = getFSImage().getEditLog();
1107      
1108      if (!editLog.isOpenForWrite()) {
1109        // During startup, we're already open for write during initialization.
1110        editLog.initJournalsForWrite();
1111        // May need to recover
1112        editLog.recoverUnclosedStreams();
1113        
1114        LOG.info("Catching up to latest edits from old active before " +
1115            "taking over writer role in edits logs");
1116        editLogTailer.catchupDuringFailover();
1117        
1118        blockManager.setPostponeBlocksFromFuture(false);
1119        blockManager.getDatanodeManager().markAllDatanodesStale();
1120        blockManager.clearQueues();
1121        blockManager.processAllPendingDNMessages();
1122
1123        // Only need to re-process the queue, If not in SafeMode.
1124        if (!isInSafeMode()) {
1125          LOG.info("Reprocessing replication and invalidation queues");
1126          initializeReplQueues();
1127        }
1128
1129        if (LOG.isDebugEnabled()) {
1130          LOG.debug("NameNode metadata after re-processing " +
1131              "replication and invalidation queues during failover:\n" +
1132              metaSaveAsString());
1133        }
1134        
1135        long nextTxId = getFSImage().getLastAppliedTxId() + 1;
1136        LOG.info("Will take over writing edit logs at txnid " + 
1137            nextTxId);
1138        editLog.setNextTxId(nextTxId);
1139
1140        getFSImage().editLog.openForWrite();
1141      }
1142
1143      // Enable quota checks.
1144      dir.enableQuotaChecks();
1145      if (haEnabled) {
1146        // Renew all of the leases before becoming active.
1147        // This is because, while we were in standby mode,
1148        // the leases weren't getting renewed on this NN.
1149        // Give them all a fresh start here.
1150        leaseManager.renewAllLeases();
1151      }
1152      leaseManager.startMonitor();
1153      startSecretManagerIfNecessary();
1154
1155      //ResourceMonitor required only at ActiveNN. See HDFS-2914
1156      this.nnrmthread = new Daemon(new NameNodeResourceMonitor());
1157      nnrmthread.start();
1158
1159      nnEditLogRoller = new Daemon(new NameNodeEditLogRoller(
1160          editLogRollerThreshold, editLogRollerInterval));
1161      nnEditLogRoller.start();
1162
1163      if (lazyPersistFileScrubIntervalSec > 0) {
1164        lazyPersistFileScrubber = new Daemon(new LazyPersistFileScrubber(
1165            lazyPersistFileScrubIntervalSec));
1166        lazyPersistFileScrubber.start();
1167      }
1168
1169      cacheManager.startMonitorThread();
1170      blockManager.getDatanodeManager().setShouldSendCachingCommands(true);
1171    } finally {
1172      startingActiveService = false;
1173      checkSafeMode();
1174      writeUnlock();
1175    }
1176  }
1177
1178  /**
1179   * Initialize replication queues.
1180   */
1181  private void initializeReplQueues() {
1182    LOG.info("initializing replication queues");
1183    blockManager.processMisReplicatedBlocks();
1184    initializedReplQueues = true;
1185  }
1186
  /** @return true if an HA context exists and reports the ACTIVE state. */
  private boolean inActiveState() {
    return haContext != null &&
        haContext.getState().getServiceState() == HAServiceState.ACTIVE;
  }
1191
1192  /**
1193   * @return Whether the namenode is transitioning to active state and is in the
1194   *         middle of the {@link #startActiveServices()}
1195   */
1196  public boolean inTransitionToActive() {
1197    return haEnabled && inActiveState() && startingActiveService;
1198  }
1199
  // Delegation tokens are used when security is enabled, or unconditionally
  // when the test-only override flag is set.
  private boolean shouldUseDelegationTokens() {
    return UserGroupInformation.isSecurityEnabled() ||
      alwaysUseDelegationTokensForTests;
  }
1204
1205  /** 
1206   * Stop services required in active state
1207   */
1208  void stopActiveServices() {
1209    LOG.info("Stopping services started for active state");
1210    writeLock();
1211    try {
1212      stopSecretManager();
1213      leaseManager.stopMonitor();
1214      if (nnrmthread != null) {
1215        ((NameNodeResourceMonitor) nnrmthread.getRunnable()).stopMonitor();
1216        nnrmthread.interrupt();
1217      }
1218      if (nnEditLogRoller != null) {
1219        ((NameNodeEditLogRoller)nnEditLogRoller.getRunnable()).stop();
1220        nnEditLogRoller.interrupt();
1221      }
1222      if (lazyPersistFileScrubber != null) {
1223        ((LazyPersistFileScrubber) lazyPersistFileScrubber.getRunnable()).stop();
1224        lazyPersistFileScrubber.interrupt();
1225      }
1226      if (dir != null && getFSImage() != null) {
1227        if (getFSImage().editLog != null) {
1228          getFSImage().editLog.close();
1229        }
1230        // Update the fsimage with the last txid that we wrote
1231        // so that the tailer starts from the right spot.
1232        getFSImage().updateLastAppliedTxIdFromWritten();
1233      }
1234      if (cacheManager != null) {
1235        cacheManager.stopMonitorThread();
1236        cacheManager.clearDirectiveStats();
1237      }
1238      blockManager.getDatanodeManager().clearPendingCachingCommands();
1239      blockManager.getDatanodeManager().setShouldSendCachingCommands(false);
1240      // Don't want to keep replication queues when not in Active.
1241      blockManager.clearQueues();
1242      initializedReplQueues = false;
1243    } finally {
1244      writeUnlock();
1245    }
1246  }
1247  
1248  /**
1249   * Start services required in standby state 
1250   * 
1251   * @throws IOException
1252   */
1253  void startStandbyServices(final Configuration conf) throws IOException {
1254    LOG.info("Starting services required for standby state");
1255    if (!getFSImage().editLog.isOpenForRead()) {
1256      // During startup, we're already open for read.
1257      getFSImage().editLog.initSharedJournalsForRead();
1258    }
1259    
1260    blockManager.setPostponeBlocksFromFuture(true);
1261
1262    // Disable quota checks while in standby.
1263    dir.disableQuotaChecks();
1264    editLogTailer = new EditLogTailer(this, conf);
1265    editLogTailer.start();
1266    if (standbyShouldCheckpoint) {
1267      standbyCheckpointer = new StandbyCheckpointer(conf, this);
1268      standbyCheckpointer.start();
1269    }
1270  }
1271
1272  /**
1273   * Called when the NN is in Standby state and the editlog tailer tails the
1274   * OP_ROLLING_UPGRADE_START.
1275   */
1276  void triggerRollbackCheckpoint() {
1277    setNeedRollbackFsImage(true);
1278    if (standbyCheckpointer != null) {
1279      standbyCheckpointer.triggerRollbackCheckpoint();
1280    }
1281  }
1282
1283  /**
1284   * Called while the NN is in Standby state, but just about to be
1285   * asked to enter Active state. This cancels any checkpoints
1286   * currently being taken.
1287   */
1288  void prepareToStopStandbyServices() throws ServiceFailedException {
1289    if (standbyCheckpointer != null) {
1290      standbyCheckpointer.cancelAndPreventCheckpoints(
1291          "About to leave standby state");
1292    }
1293  }
1294
1295  /** Stop services required in standby state */
1296  void stopStandbyServices() throws IOException {
1297    LOG.info("Stopping services started for standby state");
1298    if (standbyCheckpointer != null) {
1299      standbyCheckpointer.stop();
1300    }
1301    if (editLogTailer != null) {
1302      editLogTailer.stop();
1303    }
1304    if (dir != null && getFSImage() != null && getFSImage().editLog != null) {
1305      getFSImage().editLog.close();
1306    }
1307  }
1308  
1309  @Override
1310  public void checkOperation(OperationCategory op) throws StandbyException {
1311    if (haContext != null) {
1312      // null in some unit tests
1313      haContext.checkOperation(op);
1314    }
1315  }
1316  
1317  /**
1318   * @throws RetriableException
1319   *           If 1) The NameNode is in SafeMode, 2) HA is enabled, and 3)
1320   *           NameNode is in active state
1321   * @throws SafeModeException
1322   *           Otherwise if NameNode is in SafeMode.
1323   */
1324  void checkNameNodeSafeMode(String errorMsg)
1325      throws RetriableException, SafeModeException {
1326    if (isInSafeMode()) {
1327      SafeModeException se = new SafeModeException(errorMsg, safeMode);
1328      if (haEnabled && haContext != null
1329          && haContext.getState().getServiceState() == HAServiceState.ACTIVE
1330          && shouldRetrySafeMode(this.safeMode)) {
1331        throw new RetriableException(se);
1332      } else {
1333        throw se;
1334      }
1335    }
1336  }
1337
  /** @return whether permission checking is enabled for this namesystem. */
  boolean isPermissionEnabled() {
    return isPermissionEnabled;
  }
1341
1342  /**
1343   * We already know that the safemode is on. We will throw a RetriableException
1344   * if the safemode is not manual or caused by low resource.
1345   */
1346  private boolean shouldRetrySafeMode(SafeModeInfo safeMode) {
1347    if (safeMode == null) {
1348      return false;
1349    } else {
1350      return !safeMode.isManual() && !safeMode.areResourcesLow();
1351    }
1352  }
1353  
  /** @return the configured fsimage (name) storage directories. */
  public static Collection<URI> getNamespaceDirs(Configuration conf) {
    return getStorageDirs(conf, DFS_NAMENODE_NAME_DIR_KEY);
  }
1357
1358  /**
1359   * Get all edits dirs which are required. If any shared edits dirs are
1360   * configured, these are also included in the set of required dirs.
1361   * 
1362   * @param conf the HDFS configuration.
1363   * @return all required dirs.
1364   */
1365  public static Collection<URI> getRequiredNamespaceEditsDirs(Configuration conf) {
1366    Set<URI> ret = new HashSet<URI>();
1367    ret.addAll(getStorageDirs(conf, DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY));
1368    ret.addAll(getSharedEditsDirs(conf));
1369    return ret;
1370  }
1371
1372  private static Collection<URI> getStorageDirs(Configuration conf,
1373                                                String propertyName) {
1374    Collection<String> dirNames = conf.getTrimmedStringCollection(propertyName);
1375    StartupOption startOpt = NameNode.getStartupOption(conf);
1376    if(startOpt == StartupOption.IMPORT) {
1377      // In case of IMPORT this will get rid of default directories 
1378      // but will retain directories specified in hdfs-site.xml
1379      // When importing image from a checkpoint, the name-node can
1380      // start with empty set of storage directories.
1381      Configuration cE = new HdfsConfiguration(false);
1382      cE.addResource("core-default.xml");
1383      cE.addResource("core-site.xml");
1384      cE.addResource("hdfs-default.xml");
1385      Collection<String> dirNames2 = cE.getTrimmedStringCollection(propertyName);
1386      dirNames.removeAll(dirNames2);
1387      if(dirNames.isEmpty())
1388        LOG.warn("!!! WARNING !!!" +
1389          "\n\tThe NameNode currently runs without persistent storage." +
1390          "\n\tAny changes to the file system meta-data may be lost." +
1391          "\n\tRecommended actions:" +
1392          "\n\t\t- shutdown and restart NameNode with configured \"" 
1393          + propertyName + "\" in hdfs-site.xml;" +
1394          "\n\t\t- use Backup Node as a persistent and up-to-date storage " +
1395          "of the file system meta-data.");
1396    } else if (dirNames.isEmpty()) {
1397      dirNames = Collections.singletonList(
1398          DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_DEFAULT);
1399    }
1400    return Util.stringCollectionAsURIs(dirNames);
1401  }
1402
1403  /**
1404   * Return an ordered list of edits directories to write to.
1405   * The list is ordered such that all shared edits directories
1406   * are ordered before non-shared directories, and any duplicates
1407   * are removed. The order they are specified in the configuration
1408   * is retained.
1409   * @return Collection of shared edits directories.
1410   * @throws IOException if multiple shared edits directories are configured
1411   */
1412  public static List<URI> getNamespaceEditsDirs(Configuration conf)
1413      throws IOException {
1414    return getNamespaceEditsDirs(conf, true);
1415  }
1416  
1417  public static List<URI> getNamespaceEditsDirs(Configuration conf,
1418      boolean includeShared)
1419      throws IOException {
1420    // Use a LinkedHashSet so that order is maintained while we de-dup
1421    // the entries.
1422    LinkedHashSet<URI> editsDirs = new LinkedHashSet<URI>();
1423    
1424    if (includeShared) {
1425      List<URI> sharedDirs = getSharedEditsDirs(conf);
1426  
1427      // Fail until multiple shared edits directories are supported (HDFS-2782)
1428      if (sharedDirs.size() > 1) {
1429        throw new IOException(
1430            "Multiple shared edits directories are not yet supported");
1431      }
1432  
1433      // First add the shared edits dirs. It's critical that the shared dirs
1434      // are added first, since JournalSet syncs them in the order they are listed,
1435      // and we need to make sure all edits are in place in the shared storage
1436      // before they are replicated locally. See HDFS-2874.
1437      for (URI dir : sharedDirs) {
1438        if (!editsDirs.add(dir)) {
1439          LOG.warn("Edits URI " + dir + " listed multiple times in " + 
1440              DFS_NAMENODE_SHARED_EDITS_DIR_KEY + ". Ignoring duplicates.");
1441        }
1442      }
1443    }    
1444    // Now add the non-shared dirs.
1445    for (URI dir : getStorageDirs(conf, DFS_NAMENODE_EDITS_DIR_KEY)) {
1446      if (!editsDirs.add(dir)) {
1447        LOG.warn("Edits URI " + dir + " listed multiple times in " + 
1448            DFS_NAMENODE_SHARED_EDITS_DIR_KEY + " and " +
1449            DFS_NAMENODE_EDITS_DIR_KEY + ". Ignoring duplicates.");
1450      }
1451    }
1452
1453    if (editsDirs.isEmpty()) {
1454      // If this is the case, no edit dirs have been explicitly configured.
1455      // Image dirs are to be used for edits too.
1456      return Lists.newArrayList(getNamespaceDirs(conf));
1457    } else {
1458      return Lists.newArrayList(editsDirs);
1459    }
1460  }
1461  
1462  /**
1463   * Returns edit directories that are shared between primary and secondary.
1464   * @param conf configuration
1465   * @return collection of edit directories from {@code conf}
1466   */
1467  public static List<URI> getSharedEditsDirs(Configuration conf) {
1468    // don't use getStorageDirs here, because we want an empty default
1469    // rather than the dir in /tmp
1470    Collection<String> dirNames = conf.getTrimmedStringCollection(
1471        DFS_NAMENODE_SHARED_EDITS_DIR_KEY);
1472    return Util.stringCollectionAsURIs(dirNames);
1473  }
1474
  /** Acquire the namesystem read lock. */
  @Override
  public void readLock() {
    this.fsLock.readLock().lock();
  }
  /** Release the namesystem read lock. */
  @Override
  public void readUnlock() {
    this.fsLock.readLock().unlock();
  }
  /** Acquire the namesystem write lock. */
  @Override
  public void writeLock() {
    this.fsLock.writeLock().lock();
  }
  /**
   * Acquire the namesystem write lock, aborting if the thread is interrupted.
   * @throws InterruptedException if interrupted while waiting for the lock
   */
  @Override
  public void writeLockInterruptibly() throws InterruptedException {
    this.fsLock.writeLock().lockInterruptibly();
  }
  /** Release the namesystem write lock. */
  @Override
  public void writeUnlock() {
    this.fsLock.writeLock().unlock();
  }
  /** @return true if the current thread holds the write lock. */
  @Override
  public boolean hasWriteLock() {
    return this.fsLock.isWriteLockedByCurrentThread();
  }
  /** @return true if the current thread holds the read or write lock. */
  @Override
  public boolean hasReadLock() {
    // Holding the write lock also counts as having read access.
    return this.fsLock.getReadHoldCount() > 0 || hasWriteLock();
  }
1503
  /** @return the read-lock hold count of the current thread. */
  public int getReadHoldCount() {
    return this.fsLock.getReadHoldCount();
  }
1507
  /** @return the write-lock hold count of the current thread. */
  public int getWriteHoldCount() {
    return this.fsLock.getWriteHoldCount();
  }
1511
  /** Acquire the checkpoint lock. */
  public void cpLock() {
    this.cpLock.lock();
  }
1516
  /**
   * Acquire the checkpoint lock interruptibly.
   * @throws InterruptedException if interrupted while waiting for the lock
   */
  public void cpLockInterruptibly() throws InterruptedException {
    this.cpLock.lockInterruptibly();
  }
1521
  /** Release the checkpoint lock. */
  public void cpUnlock() {
    this.cpLock.unlock();
  }
1526    
1527
  /** @return the namespace info for this namesystem, read under the read lock. */
  NamespaceInfo getNamespaceInfo() {
    readLock();
    try {
      return unprotectedGetNamespaceInfo();
    } finally {
      readUnlock();
    }
  }
1536
1537  /**
1538   * Version of @see #getNamespaceInfo() that is not protected by a lock.
1539   */
1540  NamespaceInfo unprotectedGetNamespaceInfo() {
1541    return new NamespaceInfo(getFSImage().getStorage().getNamespaceID(),
1542        getClusterId(), getBlockPoolId(),
1543        getFSImage().getStorage().getCTime());
1544  }
1545
1546  /**
1547   * Close down this file system manager.
1548   * Causes heartbeat and lease daemons to stop; waits briefly for
1549   * them to finish, but a short timeout returns control back to caller.
1550   */
1551  void close() {
1552    fsRunning = false;
1553    try {
1554      stopCommonServices();
1555      if (smmthread != null) smmthread.interrupt();
1556    } finally {
1557      // using finally to ensure we also wait for lease daemon
1558      try {
1559        stopActiveServices();
1560        stopStandbyServices();
1561      } catch (IOException ie) {
1562      } finally {
1563        IOUtils.cleanup(LOG, dir);
1564        IOUtils.cleanup(LOG, fsImage);
1565      }
1566    }
1567  }
1568
  /** @return true while the namesystem is running (i.e. not closed). */
  @Override
  public boolean isRunning() {
    return fsRunning;
  }
1573  
  /** @return true if this namesystem is (or will start) in standby state. */
  @Override
  public boolean isInStandbyState() {
    if (haContext == null || haContext.getState() == null) {
      // We're still starting up. In this case, if HA is
      // on for the cluster, we always start in standby. Otherwise
      // start in active.
      return haEnabled;
    }

    return HAServiceState.STANDBY == haContext.getState().getServiceState();
  }
1585
1586  /**
1587   * Dump all metadata into specified file
1588   */
1589  void metaSave(String filename) throws IOException {
1590    checkSuperuserPrivilege();
1591    checkOperation(OperationCategory.UNCHECKED);
1592    writeLock();
1593    try {
1594      checkOperation(OperationCategory.UNCHECKED);
1595      File file = new File(System.getProperty("hadoop.log.dir"), filename);
1596      PrintWriter out = new PrintWriter(new BufferedWriter(
1597          new OutputStreamWriter(new FileOutputStream(file), Charsets.UTF_8)));
1598      metaSave(out);
1599      out.flush();
1600      out.close();
1601    } finally {
1602      writeUnlock();
1603    }
1604  }
1605
  /**
   * Write the metadata dump (inode/block totals plus the block manager's
   * detail) to {@code out}. Caller must hold the write lock.
   */
  private void metaSave(PrintWriter out) {
    assert hasWriteLock();
    long totalInodes = this.dir.totalInodes();
    long totalBlocks = this.getBlocksTotal();
    out.println(totalInodes + " files and directories, " + totalBlocks
        + " blocks = " + (totalInodes + totalBlocks) + " total");

    blockManager.metaSave(out);
  }
1615
1616  private String metaSaveAsString() {
1617    StringWriter sw = new StringWriter();
1618    PrintWriter pw = new PrintWriter(sw);
1619    metaSave(pw);
1620    pw.flush();
1621    return sw.toString();
1622  }
1623
  /**
   * @return the cached server defaults
   * @throws StandbyException if READ operations are not currently allowed
   */
  FsServerDefaults getServerDefaults() throws StandbyException {
    checkOperation(OperationCategory.READ);
    return serverDefaults;
  }
1628
  /** @return the configured access time precision, in milliseconds. */
  long getAccessTimePrecision() {
    return accessTimePrecision;
  }
1632
  /** @return true if access time tracking is enabled (precision > 0). */
  private boolean isAccessTimeSupported() {
    return accessTimePrecision > 0;
  }
1636
1637  /////////////////////////////////////////////////////////
1638  //
1639  // These methods are called by HadoopFS clients
1640  //
1641  /////////////////////////////////////////////////////////
1642  /**
1643   * Set permissions for an existing file.
1644   * @throws IOException
1645   */
1646  void setPermission(String src, FsPermission permission) throws IOException {
1647    HdfsFileStatus auditStat;
1648    checkOperation(OperationCategory.WRITE);
1649    writeLock();
1650    try {
1651      checkOperation(OperationCategory.WRITE);
1652      checkNameNodeSafeMode("Cannot set permission for " + src);
1653      auditStat = FSDirAttrOp.setPermission(dir, src, permission);
1654    } catch (AccessControlException e) {
1655      logAuditEvent(false, "setPermission", src);
1656      throw e;
1657    } finally {
1658      writeUnlock();
1659    }
1660    getEditLog().logSync();
1661    logAuditEvent(true, "setPermission", src, null, auditStat);
1662  }
1663
1664  /**
1665   * Set owner for an existing file.
1666   * @throws IOException
1667   */
1668  void setOwner(String src, String username, String group)
1669      throws IOException {
1670    HdfsFileStatus auditStat;
1671    checkOperation(OperationCategory.WRITE);
1672    writeLock();
1673    try {
1674      checkOperation(OperationCategory.WRITE);
1675      checkNameNodeSafeMode("Cannot set owner for " + src);
1676      auditStat = FSDirAttrOp.setOwner(dir, src, username, group);
1677    } catch (AccessControlException e) {
1678      logAuditEvent(false, "setOwner", src);
1679      throw e;
1680    } finally {
1681      writeUnlock();
1682    }
1683    getEditLog().logSync();
1684    logAuditEvent(true, "setOwner", src, null, auditStat);
1685  }
1686
  /**
   * Result holder for a block-locations lookup: the located blocks plus,
   * when the file's access time should be updated, the resolved path.
   */
  static class GetBlockLocationsResult {
    // Non-null only when the caller should update the inode's access time.
    final INodesInPath iip;
    final LocatedBlocks blocks;
    /** @return true if the caller should update the file's access time. */
    boolean updateAccessTime() {
      return iip != null;
    }
    private GetBlockLocationsResult(INodesInPath iip, LocatedBlocks blocks) {
      this.iip = iip;
      this.blocks = blocks;
    }
  }
1698
1699  /**
1700   * Get block locations within the specified range.
1701   * @see ClientProtocol#getBlockLocations(String, long, long)
1702   */
1703  LocatedBlocks getBlockLocations(String clientMachine, String src,
1704      long offset, long length) throws IOException {
1705    checkOperation(OperationCategory.READ);
1706    GetBlockLocationsResult res = null;
1707    readLock();
1708    try {
1709      checkOperation(OperationCategory.READ);
1710      res = getBlockLocations(src, offset, length, true, true);
1711    } catch (AccessControlException e) {
1712      logAuditEvent(false, "open", src);
1713      throw e;
1714    } finally {
1715      readUnlock();
1716    }
1717
1718    logAuditEvent(true, "open", src);
1719
1720    if (res.updateAccessTime()) {
1721      writeLock();
1722      final long now = now();
1723      try {
1724        checkOperation(OperationCategory.WRITE);
1725        INode inode = res.iip.getLastINode();
1726        boolean updateAccessTime = now > inode.getAccessTime() +
1727            getAccessTimePrecision();
1728        if (!isInSafeMode() && updateAccessTime) {
1729          boolean changed = FSDirAttrOp.setTimes(dir,
1730              inode, -1, now, false, res.iip.getLatestSnapshotId());
1731          if (changed) {
1732            getEditLog().logTimes(src, -1, now);
1733          }
1734        }
1735      } catch (Throwable e) {
1736        LOG.warn("Failed to update the access time of " + src, e);
1737      } finally {
1738        writeUnlock();
1739      }
1740    }
1741
1742    LocatedBlocks blocks = res.blocks;
1743    if (blocks != null) {
1744      blockManager.getDatanodeManager().sortLocatedBlocks(
1745          clientMachine, blocks.getLocatedBlocks());
1746
1747      // lastBlock is not part of getLocatedBlocks(), might need to sort it too
1748      LocatedBlock lastBlock = blocks.getLastLocatedBlock();
1749      if (lastBlock != null) {
1750        ArrayList<LocatedBlock> lastBlockList = Lists.newArrayList(lastBlock);
1751        blockManager.getDatanodeManager().sortLocatedBlocks(
1752            clientMachine, lastBlockList);
1753      }
1754    }
1755    return blocks;
1756  }
1757
1758  /**
1759   * Get block locations within the specified range.
1760   * @see ClientProtocol#getBlockLocations(String, long, long)
1761   * @throws IOException
1762   */
1763  GetBlockLocationsResult getBlockLocations(
1764      String src, long offset, long length, boolean needBlockToken,
1765      boolean checkSafeMode) throws IOException {
1766    if (offset < 0) {
1767      throw new HadoopIllegalArgumentException(
1768          "Negative offset is not supported. File: " + src);
1769    }
1770    if (length < 0) {
1771      throw new HadoopIllegalArgumentException(
1772          "Negative length is not supported. File: " + src);
1773    }
1774    final GetBlockLocationsResult ret = getBlockLocationsInt(
1775        src, offset, length, needBlockToken);
1776
1777    if (checkSafeMode && isInSafeMode()) {
1778      for (LocatedBlock b : ret.blocks.getLocatedBlocks()) {
1779        // if safemode & no block locations yet then throw safemodeException
1780        if ((b.getLocations() == null) || (b.getLocations().length == 0)) {
1781          SafeModeException se = new SafeModeException(
1782              "Zero blocklocations for " + src, safeMode);
1783          if (haEnabled && haContext != null &&
1784              haContext.getState().getServiceState() == HAServiceState.ACTIVE) {
1785            throw new RetriableException(se);
1786          } else {
1787            throw se;
1788          }
1789        }
1790      }
1791    }
1792    return ret;
1793  }
1794
  /**
   * Resolve {@code srcArg}, check permissions, and build the located-block
   * list for the requested range. Callers in this class invoke this under
   * the namesystem read lock.
   *
   * @param srcArg path as supplied by the caller (may be a reserved path)
   * @param offset starting byte offset
   * @param length number of bytes
   * @param needBlockToken whether to attach block access tokens
   * @return result carrying the blocks and, if the access time should be
   *         updated, the resolved path
   */
  private GetBlockLocationsResult getBlockLocationsInt(
      final String srcArg, long offset, long length, boolean needBlockToken)
      throws IOException {
    String src = srcArg;
    FSPermissionChecker pc = getPermissionChecker();
    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
    src = dir.resolvePath(pc, src, pathComponents);
    final INodesInPath iip = dir.getINodesInPath(src, true);
    final INodeFile inode = INodeFile.valueOf(iip.getLastINode(), src);
    if (isPermissionEnabled) {
      dir.checkPathAccess(pc, iip, FsAction.READ);
      checkUnreadableBySuperuser(pc, inode, iip.getPathSnapshotId());
    }

    final long fileSize = iip.isSnapshot()
        ? inode.computeFileSize(iip.getPathSnapshotId())
        : inode.computeFileSizeNotIncludingLastUcBlock();
    boolean isUc = inode.isUnderConstruction();
    if (iip.isSnapshot()) {
      // if src indicates a snapshot file, we need to make sure the returned
      // blocks do not exceed the size of the snapshot file.
      length = Math.min(length, fileSize - offset);
      isUc = false;
    }

    // Raw paths bypass encryption info; otherwise look it up for the inode.
    final FileEncryptionInfo feInfo =
        FSDirectory.isReservedRawName(srcArg) ? null
            : dir.getFileEncryptionInfo(inode, iip.getPathSnapshotId(), iip);

    final LocatedBlocks blocks = blockManager.createLocatedBlocks(
        inode.getBlocks(iip.getPathSnapshotId()), fileSize,
        isUc, offset, length, needBlockToken, iip.isSnapshot(), feInfo);

    // Set caching information for the located blocks.
    for (LocatedBlock lb : blocks.getLocatedBlocks()) {
      cacheManager.setCachedLocations(lb);
    }

    // Only request an access-time update when atime is enabled, we are not
    // in safe mode, the path is not a snapshot, and the precision threshold
    // has been crossed.
    final long now = now();
    boolean updateAccessTime = isAccessTimeSupported() && !isInSafeMode()
        && !iip.isSnapshot()
        && now > inode.getAccessTime() + getAccessTimePrecision();
    return new GetBlockLocationsResult(updateAccessTime ? iip : null, blocks);
  }
1839
1840  /**
1841   * Moves all the blocks from {@code srcs} and appends them to {@code target}
1842   * To avoid rollbacks we will verify validity of ALL of the args
1843   * before we start actual move.
1844   * 
1845   * This does not support ".inodes" relative path
1846   * @param target target to concat into
1847   * @param srcs file that will be concatenated
1848   * @throws IOException on error
1849   */
1850  void concat(String target, String [] srcs, boolean logRetryCache)
1851      throws IOException {
1852    checkOperation(OperationCategory.WRITE);
1853    waitForLoadingFSImage();
1854    HdfsFileStatus stat = null;
1855    boolean success = false;
1856    writeLock();
1857    try {
1858      checkOperation(OperationCategory.WRITE);
1859      checkNameNodeSafeMode("Cannot concat " + target);
1860      stat = FSDirConcatOp.concat(dir, target, srcs, logRetryCache);
1861      success = true;
1862    } finally {
1863      writeUnlock();
1864      if (success) {
1865        getEditLog().logSync();
1866      }
1867      logAuditEvent(success, "concat", Arrays.toString(srcs), target, stat);
1868    }
1869  }
1870
1871  /**
1872   * stores the modification and access time for this inode. 
1873   * The access time is precise up to an hour. The transaction, if needed, is
1874   * written to the edits log but is not flushed.
1875   */
1876  void setTimes(String src, long mtime, long atime) throws IOException {
1877    HdfsFileStatus auditStat;
1878    checkOperation(OperationCategory.WRITE);
1879    writeLock();
1880    try {
1881      checkOperation(OperationCategory.WRITE);
1882      checkNameNodeSafeMode("Cannot set times " + src);
1883      auditStat = FSDirAttrOp.setTimes(dir, src, mtime, atime);
1884    } catch (AccessControlException e) {
1885      logAuditEvent(false, "setTimes", src);
1886      throw e;
1887    } finally {
1888      writeUnlock();
1889    }
1890    getEditLog().logSync();
1891    logAuditEvent(true, "setTimes", src, null, auditStat);
1892  }
1893
1894  /**
1895   * Create a symbolic link.
1896   */
1897  @SuppressWarnings("deprecation")
1898  void createSymlink(String target, String link,
1899      PermissionStatus dirPerms, boolean createParent, boolean logRetryCache)
1900      throws IOException {
1901    if (!FileSystem.areSymlinksEnabled()) {
1902      throw new UnsupportedOperationException("Symlinks not supported");
1903    }
1904    HdfsFileStatus auditStat = null;
1905    checkOperation(OperationCategory.WRITE);
1906    writeLock();
1907    try {
1908      checkOperation(OperationCategory.WRITE);
1909      checkNameNodeSafeMode("Cannot create symlink " + link);
1910      auditStat = FSDirSymlinkOp.createSymlinkInt(this, target, link, dirPerms,
1911                                                  createParent, logRetryCache);
1912    } catch (AccessControlException e) {
1913      logAuditEvent(false, "createSymlink", link, target, null);
1914      throw e;
1915    } finally {
1916      writeUnlock();
1917    }
1918    getEditLog().logSync();
1919    logAuditEvent(true, "createSymlink", link, target, auditStat);
1920  }
1921
1922  /**
1923   * Set replication for an existing file.
1924   * 
1925   * The NameNode sets new replication and schedules either replication of 
1926   * under-replicated data blocks or removal of the excessive block copies 
1927   * if the blocks are over-replicated.
1928   * 
1929   * @see ClientProtocol#setReplication(String, short)
1930   * @param src file name
1931   * @param replication new replication
1932   * @return true if successful; 
1933   *         false if file does not exist or is a directory
1934   */
1935  boolean setReplication(final String src, final short replication)
1936      throws IOException {
1937    boolean success = false;
1938    waitForLoadingFSImage();
1939    checkOperation(OperationCategory.WRITE);
1940    writeLock();
1941    try {
1942      checkOperation(OperationCategory.WRITE);
1943      checkNameNodeSafeMode("Cannot set replication for " + src);
1944      success = FSDirAttrOp.setReplication(dir, blockManager, src, replication);
1945    } catch (AccessControlException e) {
1946      logAuditEvent(false, "setReplication", src);
1947      throw e;
1948    } finally {
1949      writeUnlock();
1950    }
1951    if (success) {
1952      getEditLog().logSync();
1953      logAuditEvent(true, "setReplication", src);
1954    }
1955    return success;
1956  }
1957
1958  /**
1959   * Truncate file to a lower length.
1960   * Truncate cannot be reverted / recovered from as it causes data loss.
1961   * Truncation at block boundary is atomic, otherwise it requires
1962   * block recovery to truncate the last block of the file.
1963   *
1964   * @return true if client does not need to wait for block recovery,
1965   * false if client needs to wait for block recovery.
1966   */
1967  boolean truncate(String src, long newLength,
1968                   String clientName, String clientMachine,
1969                   long mtime)
1970      throws IOException, UnresolvedLinkException {
1971    boolean ret;
1972    try {
1973      ret = truncateInt(src, newLength, clientName, clientMachine, mtime);
1974    } catch (AccessControlException e) {
1975      logAuditEvent(false, "truncate", src);
1976      throw e;
1977    }
1978    return ret;
1979  }
1980
  /**
   * Implementation of {@link #truncate}: resolves the path, performs the
   * truncation under the write lock, then syncs the edit log and removes
   * any now-unreferenced blocks outside the lock.
   *
   * @param srcArg path as supplied by the client
   * @param newLength target length; must be non-negative
   * @return true if no block recovery is needed, false otherwise
   */
  boolean truncateInt(String srcArg, long newLength,
                      String clientName, String clientMachine,
                      long mtime)
      throws IOException, UnresolvedLinkException {
    String src = srcArg;
    if (NameNode.stateChangeLog.isDebugEnabled()) {
      NameNode.stateChangeLog.debug("DIR* NameSystem.truncate: src="
          + src + " newLength=" + newLength);
    }
    if (newLength < 0) {
      throw new HadoopIllegalArgumentException(
          "Cannot truncate to a negative file size: " + newLength + ".");
    }
    HdfsFileStatus stat = null;
    FSPermissionChecker pc = getPermissionChecker();
    checkOperation(OperationCategory.WRITE);
    boolean res;
    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
    writeLock();
    // Collects blocks to delete so they can be removed after the lock is
    // released.
    BlocksMapUpdateInfo toRemoveBlocks = new BlocksMapUpdateInfo();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot truncate for " + src);
      src = dir.resolvePath(pc, src, pathComponents);
      res = truncateInternal(src, newLength, clientName,
          clientMachine, mtime, pc, toRemoveBlocks);
      stat = dir.getAuditFileInfo(dir.getINodesInPath4Write(src, false));
    } finally {
      writeUnlock();
    }
    getEditLog().logSync();
    // Remove the dropped blocks outside the write lock.
    if (!toRemoveBlocks.getToDeleteList().isEmpty()) {
      removeBlocks(toRemoveBlocks);
      toRemoveBlocks.clear();
    }
    logAuditEvent(true, "truncate", src, null, stat);
    return res;
  }
2019
2020  /**
2021   * Truncate a file to a given size
2022   * Update the count at each ancestor directory with quota
2023   */
2024  boolean truncateInternal(String src, long newLength,
2025                           String clientName, String clientMachine,
2026                           long mtime, FSPermissionChecker pc,
2027                           BlocksMapUpdateInfo toRemoveBlocks)
2028      throws IOException, UnresolvedLinkException {
2029    assert hasWriteLock();
2030    INodesInPath iip = dir.getINodesInPath4Write(src, true);
2031    if (isPermissionEnabled) {
2032      dir.checkPathAccess(pc, iip, FsAction.WRITE);
2033    }
2034    INodeFile file = INodeFile.valueOf(iip.getLastINode(), src);
2035    final BlockStoragePolicy lpPolicy =
2036        blockManager.getStoragePolicy("LAZY_PERSIST");
2037
2038    if (lpPolicy != null &&
2039        lpPolicy.getId() == file.getStoragePolicyID()) {
2040      throw new UnsupportedOperationException(
2041          "Cannot truncate lazy persist file " + src);
2042    }
2043
2044    // Check if the file is already being truncated with the same length
2045    final BlockInfoContiguous last = file.getLastBlock();
2046    if (last != null && last.getBlockUCState() == BlockUCState.UNDER_RECOVERY) {
2047      final Block truncateBlock
2048          = ((BlockInfoContiguousUnderConstruction)last).getTruncateBlock();
2049      if (truncateBlock != null) {
2050        final long truncateLength = file.computeFileSize(false, false)
2051            + truncateBlock.getNumBytes();
2052        if (newLength == truncateLength) {
2053          return false;
2054        }
2055      }
2056    }
2057
2058    // Opening an existing file for truncate. May need lease recovery.
2059    recoverLeaseInternal(RecoverLeaseOp.TRUNCATE_FILE,
2060        iip, src, clientName, clientMachine, false);
2061    // Truncate length check.
2062    long oldLength = file.computeFileSize();
2063    if(oldLength == newLength) {
2064      return true;
2065    }
2066    if(oldLength < newLength) {
2067      throw new HadoopIllegalArgumentException(
2068          "Cannot truncate to a larger file size. Current size: " + oldLength +
2069              ", truncate size: " + newLength + ".");
2070    }
2071    // Perform INodeFile truncation.
2072    final QuotaCounts delta = new QuotaCounts.Builder().build();
2073    boolean onBlockBoundary = dir.truncate(iip, newLength, toRemoveBlocks,
2074        mtime, delta);
2075    Block truncateBlock = null;
2076    if(!onBlockBoundary) {
2077      // Open file for write, but don't log into edits
2078      long lastBlockDelta = file.computeFileSize() - newLength;
2079      assert lastBlockDelta > 0 : "delta is 0 only if on block bounday";
2080      truncateBlock = prepareFileForTruncate(iip, clientName, clientMachine,
2081          lastBlockDelta, null);
2082    }
2083
2084    // update the quota: use the preferred block size for UC block
2085    dir.writeLock();
2086    try {
2087      dir.updateCountNoQuotaCheck(iip, iip.length() - 1, delta);
2088    } finally {
2089      dir.writeUnlock();
2090    }
2091
2092    getEditLog().logTruncate(src, clientName, clientMachine, newLength, mtime,
2093        truncateBlock);
2094    return onBlockBoundary;
2095  }
2096
2097  /**
2098   * Convert current INode to UnderConstruction.
2099   * Recreate lease.
2100   * Create new block for the truncated copy.
2101   * Schedule truncation of the replicas.
2102   *
2103   * @return the returned block will be written to editLog and passed back into
2104   * this method upon loading.
2105   */
2106  Block prepareFileForTruncate(INodesInPath iip,
2107                               String leaseHolder,
2108                               String clientMachine,
2109                               long lastBlockDelta,
2110                               Block newBlock)
2111      throws IOException {
2112    INodeFile file = iip.getLastINode().asFile();
2113    String src = iip.getPath();
2114    file.recordModification(iip.getLatestSnapshotId());
2115    file.toUnderConstruction(leaseHolder, clientMachine);
2116    assert file.isUnderConstruction() : "inode should be under construction.";
2117    leaseManager.addLease(
2118        file.getFileUnderConstructionFeature().getClientName(), src);
2119    boolean shouldRecoverNow = (newBlock == null);
2120    BlockInfoContiguous oldBlock = file.getLastBlock();
2121    boolean shouldCopyOnTruncate = shouldCopyOnTruncate(file, oldBlock);
2122    if(newBlock == null) {
2123      newBlock = (shouldCopyOnTruncate) ? createNewBlock() :
2124          new Block(oldBlock.getBlockId(), oldBlock.getNumBytes(),
2125              nextGenerationStamp(blockIdManager.isLegacyBlock(oldBlock)));
2126    }
2127
2128    BlockInfoContiguousUnderConstruction truncatedBlockUC;
2129    if(shouldCopyOnTruncate) {
2130      // Add new truncateBlock into blocksMap and
2131      // use oldBlock as a source for copy-on-truncate recovery
2132      truncatedBlockUC = new BlockInfoContiguousUnderConstruction(newBlock,
2133          file.getBlockReplication());
2134      truncatedBlockUC.setNumBytes(oldBlock.getNumBytes() - lastBlockDelta);
2135      truncatedBlockUC.setTruncateBlock(oldBlock);
2136      file.setLastBlock(truncatedBlockUC, blockManager.getStorages(oldBlock));
2137      getBlockManager().addBlockCollection(truncatedBlockUC, file);
2138
2139      NameNode.stateChangeLog.info("BLOCK* prepareFileForTruncate: "
2140          + "Scheduling copy-on-truncate to new size "
2141          + truncatedBlockUC.getNumBytes() + " new block " + newBlock
2142          + " old block " + truncatedBlockUC.getTruncateBlock());
2143    } else {
2144      // Use new generation stamp for in-place truncate recovery
2145      blockManager.convertLastBlockToUnderConstruction(file, lastBlockDelta);
2146      oldBlock = file.getLastBlock();
2147      assert !oldBlock.isComplete() : "oldBlock should be under construction";
2148      truncatedBlockUC = (BlockInfoContiguousUnderConstruction) oldBlock;
2149      truncatedBlockUC.setTruncateBlock(new Block(oldBlock));
2150      truncatedBlockUC.getTruncateBlock().setNumBytes(
2151          oldBlock.getNumBytes() - lastBlockDelta);
2152      truncatedBlockUC.getTruncateBlock().setGenerationStamp(
2153          newBlock.getGenerationStamp());
2154
2155      NameNode.stateChangeLog.debug("BLOCK* prepareFileForTruncate: "
2156          + "Scheduling in-place block truncate to new size "
2157          + truncatedBlockUC.getTruncateBlock().getNumBytes()
2158          + " block=" + truncatedBlockUC);
2159    }
2160    if (shouldRecoverNow) {
2161      truncatedBlockUC.initializeBlockRecovery(newBlock.getGenerationStamp());
2162    }
2163
2164    return newBlock;
2165  }
2166
2167  /**
2168   * Defines if a replica needs to be copied on truncate or
2169   * can be truncated in place.
2170   */
2171  boolean shouldCopyOnTruncate(INodeFile file, BlockInfoContiguous blk) {
2172    if(!isUpgradeFinalized()) {
2173      return true;
2174    }
2175    if (isRollingUpgrade()) {
2176      return true;
2177    }
2178    return file.isBlockInLatestSnapshot(blk);
2179  }
2180
2181  /**
2182   * Set the storage policy for a file or a directory.
2183   *
2184   * @param src file/directory path
2185   * @param policyName storage policy name
2186   */
2187  void setStoragePolicy(String src, String policyName) throws IOException {
2188    HdfsFileStatus auditStat;
2189    waitForLoadingFSImage();
2190    checkOperation(OperationCategory.WRITE);
2191    writeLock();
2192    try {
2193      checkOperation(OperationCategory.WRITE);
2194      checkNameNodeSafeMode("Cannot set storage policy for " + src);
2195      auditStat = FSDirAttrOp.setStoragePolicy(
2196          dir, blockManager, src, policyName);
2197    } catch (AccessControlException e) {
2198      logAuditEvent(false, "setStoragePolicy", src);
2199      throw e;
2200    } finally {
2201      writeUnlock();
2202    }
2203    getEditLog().logSync();
2204    logAuditEvent(true, "setStoragePolicy", src, null, auditStat);
2205  }
2206
2207  /**
2208   * @return All the existing block storage policies
2209   */
2210  BlockStoragePolicy[] getStoragePolicies() throws IOException {
2211    checkOperation(OperationCategory.READ);
2212    waitForLoadingFSImage();
2213    readLock();
2214    try {
2215      checkOperation(OperationCategory.READ);
2216      return FSDirAttrOp.getStoragePolicies(blockManager);
2217    } finally {
2218      readUnlock();
2219    }
2220  }
2221
  /**
   * @param src path of the file
   * @return the preferred block size of the file at {@code src}
   * @throws IOException if the lookup fails or reads are not allowed
   */
  long getPreferredBlockSize(String src) throws IOException {
    checkOperation(OperationCategory.READ);
    readLock();
    try {
      checkOperation(OperationCategory.READ);
      return FSDirAttrOp.getPreferredBlockSize(dir, src);
    } finally {
      readUnlock();
    }
  }
2232
2233  /**
2234   * If the file is within an encryption zone, select the appropriate 
2235   * CryptoProtocolVersion from the list provided by the client. Since the
2236   * client may be newer, we need to handle unknown versions.
2237   *
2238   * @param zone EncryptionZone of the file
2239   * @param supportedVersions List of supported protocol versions
2240   * @return chosen protocol version
2241   * @throws IOException
2242   */
2243  private CryptoProtocolVersion chooseProtocolVersion(EncryptionZone zone,
2244      CryptoProtocolVersion[] supportedVersions)
2245      throws UnknownCryptoProtocolVersionException, UnresolvedLinkException,
2246        SnapshotAccessControlException {
2247    Preconditions.checkNotNull(zone);
2248    Preconditions.checkNotNull(supportedVersions);
2249    // Right now, we only support a single protocol version,
2250    // so simply look for it in the list of provided options
2251    final CryptoProtocolVersion required = zone.getVersion();
2252
2253    for (CryptoProtocolVersion c : supportedVersions) {
2254      if (c.equals(CryptoProtocolVersion.UNKNOWN)) {
2255        if (LOG.isDebugEnabled()) {
2256          LOG.debug("Ignoring unknown CryptoProtocolVersion provided by " +
2257              "client: " + c.getUnknownValue());
2258        }
2259        continue;
2260      }
2261      if (c.equals(required)) {
2262        return c;
2263      }
2264    }
2265    throw new UnknownCryptoProtocolVersionException(
2266        "No crypto protocol versions provided by the client are supported."
2267            + " Client provided: " + Arrays.toString(supportedVersions)
2268            + " NameNode supports: " + Arrays.toString(CryptoProtocolVersion
2269            .values()));
2270  }
2271
2272  /**
2273   * Invoke KeyProvider APIs to generate an encrypted data encryption key for an
2274   * encryption zone. Should not be called with any locks held.
2275   *
2276   * @param ezKeyName key name of an encryption zone
2277   * @return New EDEK, or null if ezKeyName is null
2278   * @throws IOException
2279   */
2280  private EncryptedKeyVersion generateEncryptedDataEncryptionKey(String
2281      ezKeyName) throws IOException {
2282    if (ezKeyName == null) {
2283      return null;
2284    }
2285    EncryptedKeyVersion edek = null;
2286    try {
2287      edek = provider.generateEncryptedKey(ezKeyName);
2288    } catch (GeneralSecurityException e) {
2289      throw new IOException(e);
2290    }
2291    Preconditions.checkNotNull(edek);
2292    return edek;
2293  }
2294
2295  /**
2296   * Create a new file entry in the namespace.
2297   * 
2298   * For description of parameters and exceptions thrown see
2299   * {@link ClientProtocol#create}, except it returns valid file status upon
2300   * success
2301   */
2302  HdfsFileStatus startFile(String src, PermissionStatus permissions,
2303      String holder, String clientMachine, EnumSet<CreateFlag> flag,
2304      boolean createParent, short replication, long blockSize, 
2305      CryptoProtocolVersion[] supportedVersions, boolean logRetryCache)
2306      throws AccessControlException, SafeModeException,
2307      FileAlreadyExistsException, UnresolvedLinkException,
2308      FileNotFoundException, ParentNotDirectoryException, IOException {
2309
2310    HdfsFileStatus status = null;
2311    try {
2312      status = startFileInt(src, permissions, holder, clientMachine, flag,
2313          createParent, replication, blockSize, supportedVersions,
2314          logRetryCache);
2315    } catch (AccessControlException e) {
2316      logAuditEvent(false, "create", src);
2317      throw e;
2318    }
2319    return status;
2320  }
2321
2322  private HdfsFileStatus startFileInt(final String srcArg,
2323      PermissionStatus permissions, String holder, String clientMachine,
2324      EnumSet<CreateFlag> flag, boolean createParent, short replication,
2325      long blockSize, CryptoProtocolVersion[] supportedVersions,
2326      boolean logRetryCache)
2327      throws AccessControlException, SafeModeException,
2328      FileAlreadyExistsException, UnresolvedLinkException,
2329      FileNotFoundException, ParentNotDirectoryException, IOException {
2330    String src = srcArg;
2331    if (NameNode.stateChangeLog.isDebugEnabled()) {
2332      StringBuilder builder = new StringBuilder();
2333      builder.append("DIR* NameSystem.startFile: src=" + src
2334              + ", holder=" + holder
2335              + ", clientMachine=" + clientMachine
2336              + ", createParent=" + createParent
2337              + ", replication=" + replication
2338              + ", createFlag=" + flag.toString()
2339              + ", blockSize=" + blockSize);
2340      builder.append(", supportedVersions=");
2341      if (supportedVersions != null) {
2342        builder.append(Arrays.toString(supportedVersions));
2343      } else {
2344        builder.append("null");
2345      }
2346      NameNode.stateChangeLog.debug(builder.toString());
2347    }
2348    if (!DFSUtil.isValidName(src)) {
2349      throw new InvalidPathException(src);
2350    }
2351    blockManager.verifyReplication(src, replication, clientMachine);
2352
2353    boolean skipSync = false;
2354    HdfsFileStatus stat = null;
2355    FSPermissionChecker pc = getPermissionChecker();
2356    checkOperation(OperationCategory.WRITE);
2357    if (blockSize < minBlockSize) {
2358      throw new IOException("Specified block size is less than configured" +
2359          " minimum value (" + DFSConfigKeys.DFS_NAMENODE_MIN_BLOCK_SIZE_KEY
2360          + "): " + blockSize + " < " + minBlockSize);
2361    }
2362    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
2363    boolean create = flag.contains(CreateFlag.CREATE);
2364    boolean overwrite = flag.contains(CreateFlag.OVERWRITE);
2365    boolean isLazyPersist = flag.contains(CreateFlag.LAZY_PERSIST);
2366
2367    waitForLoadingFSImage();
2368
2369    /**
2370     * If the file is in an encryption zone, we optimistically create an
2371     * EDEK for the file by calling out to the configured KeyProvider.
2372     * Since this typically involves doing an RPC, we take the readLock
2373     * initially, then drop it to do the RPC.
2374     * 
2375     * Since the path can flip-flop between being in an encryption zone and not
2376     * in the meantime, we need to recheck the preconditions when we retake the
2377     * lock to do the create. If the preconditions are not met, we throw a
2378     * special RetryStartFileException to ask the DFSClient to try the create
2379     * again later.
2380     */
2381    CryptoProtocolVersion protocolVersion = null;
2382    CipherSuite suite = null;
2383    String ezKeyName = null;
2384    EncryptedKeyVersion edek = null;
2385
2386    if (provider != null) {
2387      readLock();
2388      try {
2389        src = dir.resolvePath(pc, src, pathComponents);
2390        INodesInPath iip = dir.getINodesInPath4Write(src);
2391        // Nothing to do if the path is not within an EZ
2392        final EncryptionZone zone = dir.getEZForPath(iip);
2393        if (zone != null) {
2394          protocolVersion = chooseProtocolVersion(zone, supportedVersions);
2395          suite = zone.getSuite();
2396          ezKeyName = zone.getKeyName();
2397
2398          Preconditions.checkNotNull(protocolVersion);
2399          Preconditions.checkNotNull(suite);
2400          Preconditions.checkArgument(!suite.equals(CipherSuite.UNKNOWN),
2401              "Chose an UNKNOWN CipherSuite!");
2402          Preconditions.checkNotNull(ezKeyName);
2403        }
2404      } finally {
2405        readUnlock();
2406      }
2407
2408      Preconditions.checkState(
2409          (suite == null && ezKeyName == null) ||
2410              (suite != null && ezKeyName != null),
2411          "Both suite and ezKeyName should both be null or not null");
2412
2413      // Generate EDEK if necessary while not holding the lock
2414      edek = generateEncryptedDataEncryptionKey(ezKeyName);
2415      EncryptionFaultInjector.getInstance().startFileAfterGenerateKey();
2416    }
2417
2418    // Proceed with the create, using the computed cipher suite and 
2419    // generated EDEK
2420    BlocksMapUpdateInfo toRemoveBlocks = null;
2421    writeLock();
2422    try {
2423      checkOperation(OperationCategory.WRITE);
2424      checkNameNodeSafeMode("Cannot create file" + src);
2425      dir.writeLock();
2426      try {
2427        src = dir.resolvePath(pc, src, pathComponents);
2428        final INodesInPath iip = dir.getINodesInPath4Write(src);
2429        toRemoveBlocks = startFileInternal(
2430            pc, iip, permissions, holder,
2431            clientMachine, create, overwrite,
2432            createParent, replication, blockSize,
2433            isLazyPersist, suite, protocolVersion, edek,
2434            logRetryCache);
2435        stat = FSDirStatAndListingOp.getFileInfo(
2436            dir, src, false, FSDirectory.isReservedRawName(srcArg), true);
2437      } finally {
2438        dir.writeUnlock();
2439      }
2440    } catch (StandbyException se) {
2441      skipSync = true;
2442      throw se;
2443    } finally {
2444      writeUnlock();
2445      // There might be transactions logged while trying to recover the lease.
2446      // They need to be sync'ed even when an exception was thrown.
2447      if (!skipSync) {
2448        getEditLog().logSync();
2449        if (toRemoveBlocks != null) {
2450          removeBlocks(toRemoveBlocks);
2451          toRemoveBlocks.clear();
2452        }
2453      }
2454    }
2455
2456    logAuditEvent(true, "create", srcArg, null, stat);
2457    return stat;
2458  }
2459
2460  /**
2461   * Create a new file or overwrite an existing file<br>
2462   * 
2463   * Once the file is create the client then allocates a new block with the next
2464   * call using {@link ClientProtocol#addBlock}.
2465   * <p>
2466   * For description of parameters and exceptions thrown see
2467   * {@link ClientProtocol#create}
2468   */
2469  private BlocksMapUpdateInfo startFileInternal(FSPermissionChecker pc, 
2470      INodesInPath iip, PermissionStatus permissions, String holder,
2471      String clientMachine, boolean create, boolean overwrite, 
2472      boolean createParent, short replication, long blockSize, 
2473      boolean isLazyPersist, CipherSuite suite, CryptoProtocolVersion version,
2474      EncryptedKeyVersion edek, boolean logRetryEntry)
2475      throws IOException {
2476    assert hasWriteLock();
2477    // Verify that the destination does not exist as a directory already.
2478    final INode inode = iip.getLastINode();
2479    final String src = iip.getPath();
2480    if (inode != null && inode.isDirectory()) {
2481      throw new FileAlreadyExistsException(src +
2482          " already exists as a directory");
2483    }
2484
2485    final INodeFile myFile = INodeFile.valueOf(inode, src, true);
2486    if (isPermissionEnabled) {
2487      if (overwrite && myFile != null) {
2488        dir.checkPathAccess(pc, iip, FsAction.WRITE);
2489      }
2490      /*
2491       * To overwrite existing file, need to check 'w' permission 
2492       * of parent (equals to ancestor in this case)
2493       */
2494      dir.checkAncestorAccess(pc, iip, FsAction.WRITE);
2495    }
2496    if (!createParent) {
2497      dir.verifyParentDir(iip, src);
2498    }
2499
2500    FileEncryptionInfo feInfo = null;
2501
2502    final EncryptionZone zone = dir.getEZForPath(iip);
2503    if (zone != null) {
2504      // The path is now within an EZ, but we're missing encryption parameters
2505      if (suite == null || edek == null) {
2506        throw new RetryStartFileException();
2507      }
2508      // Path is within an EZ and we have provided encryption parameters.
2509      // Make sure that the generated EDEK matches the settings of the EZ.
2510      final String ezKeyName = zone.getKeyName();
2511      if (!ezKeyName.equals(edek.getEncryptionKeyName())) {
2512        throw new RetryStartFileException();
2513      }
2514      feInfo = new FileEncryptionInfo(suite, version,
2515          edek.getEncryptedKeyVersion().getMaterial(),
2516          edek.getEncryptedKeyIv(),
2517          ezKeyName, edek.getEncryptionKeyVersionName());
2518    }
2519
2520    try {
2521      BlocksMapUpdateInfo toRemoveBlocks = null;
2522      if (myFile == null) {
2523        if (!create) {
2524          throw new FileNotFoundException("Can't overwrite non-existent " +
2525              src + " for client " + clientMachine);
2526        }
2527      } else {
2528        if (overwrite) {
2529          toRemoveBlocks = new BlocksMapUpdateInfo();
2530          List<INode> toRemoveINodes = new ChunkedArrayList<INode>();
2531          long ret = FSDirDeleteOp.delete(dir, iip, toRemoveBlocks,
2532                                          toRemoveINodes, now());
2533          if (ret >= 0) {
2534            iip = INodesInPath.replace(iip, iip.length() - 1, null);
2535            FSDirDeleteOp.incrDeletedFileCount(ret);
2536            removeLeasesAndINodes(src, toRemoveINodes, true);
2537          }
2538        } else {
2539          // If lease soft limit time is expired, recover the lease
2540          recoverLeaseInternal(RecoverLeaseOp.CREATE_FILE,
2541              iip, src, holder, clientMachine, false);
2542          throw new FileAlreadyExistsException(src + " for client " +
2543              clientMachine + " already exists");
2544        }
2545      }
2546
2547      checkFsObjectLimit();
2548      INodeFile newNode = null;
2549
2550      // Always do an implicit mkdirs for parent directory tree.
2551      Map.Entry<INodesInPath, String> parent = FSDirMkdirOp
2552          .createAncestorDirectories(dir, iip, permissions);
2553      if (parent != null) {
2554        iip = dir.addFile(parent.getKey(), parent.getValue(), permissions,
2555            replication, blockSize, holder, clientMachine);
2556        newNode = iip != null ? iip.getLastINode().asFile() : null;
2557      }
2558
2559      if (newNode == null) {
2560        throw new IOException("Unable to add " + src +  " to namespace");
2561      }
2562      leaseManager.addLease(newNode.getFileUnderConstructionFeature()
2563          .getClientName(), src);
2564
2565      // Set encryption attributes if necessary
2566      if (feInfo != null) {
2567        dir.setFileEncryptionInfo(src, feInfo);
2568        newNode = dir.getInode(newNode.getId()).asFile();
2569      }
2570
2571      setNewINodeStoragePolicy(newNode, iip, isLazyPersist);
2572
2573      // record file record in log, record new generation stamp
2574      getEditLog().logOpenFile(src, newNode, overwrite, logRetryEntry);
2575      if (NameNode.stateChangeLog.isDebugEnabled()) {
2576        NameNode.stateChangeLog.debug("DIR* NameSystem.startFile: added " +
2577            src + " inode " + newNode.getId() + " " + holder);
2578      }
2579      return toRemoveBlocks;
2580    } catch (IOException ie) {
2581      NameNode.stateChangeLog.warn("DIR* NameSystem.startFile: " + src + " " +
2582          ie.getMessage());
2583      throw ie;
2584    }
2585  }
2586
2587  private void setNewINodeStoragePolicy(INodeFile inode,
2588                                        INodesInPath iip,
2589                                        boolean isLazyPersist)
2590      throws IOException {
2591
2592    if (isLazyPersist) {
2593      BlockStoragePolicy lpPolicy =
2594          blockManager.getStoragePolicy("LAZY_PERSIST");
2595
2596      // Set LAZY_PERSIST storage policy if the flag was passed to
2597      // CreateFile.
2598      if (lpPolicy == null) {
2599        throw new HadoopIllegalArgumentException(
2600            "The LAZY_PERSIST storage policy has been disabled " +
2601            "by the administrator.");
2602      }
2603      inode.setStoragePolicyID(lpPolicy.getId(),
2604                                 iip.getLatestSnapshotId());
2605    } else {
2606      BlockStoragePolicy effectivePolicy =
2607          blockManager.getStoragePolicy(inode.getStoragePolicyID());
2608
2609      if (effectivePolicy != null &&
2610          effectivePolicy.isCopyOnCreateFile()) {
2611        // Copy effective policy from ancestor directory to current file.
2612        inode.setStoragePolicyID(effectivePolicy.getId(),
2613                                 iip.getLatestSnapshotId());
2614      }
2615    }
2616  }
2617
2618  /**
2619   * Append to an existing file for append.
2620   * <p>
2621   * 
2622   * The method returns the last block of the file if this is a partial block,
2623   * which can still be used for writing more data. The client uses the returned
2624   * block locations to form the data pipeline for this block.<br>
2625   * The method returns null if the last block is full. The client then
2626   * allocates a new block with the next call using
2627   * {@link ClientProtocol#addBlock}.
2628   * <p>
2629   * 
2630   * For description of parameters and exceptions thrown see
2631   * {@link ClientProtocol#append(String, String, EnumSetWritable)}
2632   *
2633   * @return the last block locations if the block is partial or null otherwise
2634   */
2635  private LocatedBlock appendFileInternal(FSPermissionChecker pc,
2636      INodesInPath iip, String holder, String clientMachine, boolean newBlock,
2637      boolean logRetryCache) throws IOException {
2638    assert hasWriteLock();
2639    // Verify that the destination does not exist as a directory already.
2640    final INode inode = iip.getLastINode();
2641    final String src = iip.getPath();
2642    if (inode != null && inode.isDirectory()) {
2643      throw new FileAlreadyExistsException("Cannot append to directory " + src
2644          + "; already exists as a directory.");
2645    }
2646    if (isPermissionEnabled) {
2647      dir.checkPathAccess(pc, iip, FsAction.WRITE);
2648    }
2649
2650    try {
2651      if (inode == null) {
2652        throw new FileNotFoundException("failed to append to non-existent file "
2653          + src + " for client " + clientMachine);
2654      }
2655      INodeFile myFile = INodeFile.valueOf(inode, src, true);
2656      final BlockStoragePolicy lpPolicy =
2657          blockManager.getStoragePolicy("LAZY_PERSIST");
2658      if (lpPolicy != null &&
2659          lpPolicy.getId() == myFile.getStoragePolicyID()) {
2660        throw new UnsupportedOperationException(
2661            "Cannot append to lazy persist file " + src);
2662      }
2663      // Opening an existing file for append - may need to recover lease.
2664      recoverLeaseInternal(RecoverLeaseOp.APPEND_FILE,
2665          iip, src, holder, clientMachine, false);
2666      
2667      final BlockInfoContiguous lastBlock = myFile.getLastBlock();
2668      // Check that the block has at least minimum replication.
2669      if(lastBlock != null && lastBlock.isComplete() &&
2670          !getBlockManager().isSufficientlyReplicated(lastBlock)) {
2671        throw new IOException("append: lastBlock=" + lastBlock +
2672            " of src=" + src + " is not sufficiently replicated yet.");
2673      }
2674      return prepareFileForAppend(src, iip, holder, clientMachine, newBlock,
2675          true, logRetryCache);
2676    } catch (IOException ie) {
2677      NameNode.stateChangeLog.warn("DIR* NameSystem.append: " +ie.getMessage());
2678      throw ie;
2679    }
2680  }
2681  
2682  /**
2683   * Convert current node to under construction.
2684   * Recreate in-memory lease record.
2685   * 
2686   * @param src path to the file
2687   * @param leaseHolder identifier of the lease holder on this file
2688   * @param clientMachine identifier of the client machine
2689   * @param newBlock if the data is appended to a new block
2690   * @param writeToEditLog whether to persist this change to the edit log
2691   * @param logRetryCache whether to record RPC ids in editlog for retry cache
2692   *                      rebuilding
2693   * @return the last block locations if the block is partial or null otherwise
2694   * @throws UnresolvedLinkException
2695   * @throws IOException
2696   */
2697  LocatedBlock prepareFileForAppend(String src, INodesInPath iip,
2698      String leaseHolder, String clientMachine, boolean newBlock,
2699      boolean writeToEditLog, boolean logRetryCache) throws IOException {
2700    final INodeFile file = iip.getLastINode().asFile();
2701    final QuotaCounts delta = verifyQuotaForUCBlock(file, iip);
2702
2703    file.recordModification(iip.getLatestSnapshotId());
2704    file.toUnderConstruction(leaseHolder, clientMachine);
2705
2706    leaseManager.addLease(
2707        file.getFileUnderConstructionFeature().getClientName(), src);
2708
2709    LocatedBlock ret = null;
2710    if (!newBlock) {
2711      ret = blockManager.convertLastBlockToUnderConstruction(file, 0);
2712      if (ret != null && delta != null) {
2713        Preconditions.checkState(delta.getStorageSpace() >= 0,
2714            "appending to a block with size larger than the preferred block size");
2715        dir.writeLock();
2716        try {
2717          dir.updateCountNoQuotaCheck(iip, iip.length() - 1, delta);
2718        } finally {
2719          dir.writeUnlock();
2720        }
2721      }
2722    } else {
2723      BlockInfoContiguous lastBlock = file.getLastBlock();
2724      if (lastBlock != null) {
2725        ExtendedBlock blk = new ExtendedBlock(this.getBlockPoolId(), lastBlock);
2726        ret = new LocatedBlock(blk, new DatanodeInfo[0]);
2727      }
2728    }
2729
2730    if (writeToEditLog) {
2731      getEditLog().logAppendFile(src, file, newBlock, logRetryCache);
2732    }
2733    return ret;
2734  }
2735
2736  /**
2737   * Verify quota when using the preferred block size for UC block. This is
2738   * usually used by append and truncate
2739   * @throws QuotaExceededException when violating the storage quota
2740   * @return expected quota usage update. null means no change or no need to
2741   *         update quota usage later
2742   */
2743  private QuotaCounts verifyQuotaForUCBlock(INodeFile file, INodesInPath iip)
2744      throws QuotaExceededException {
2745    if (!isImageLoaded() || dir.shouldSkipQuotaChecks()) {
2746      // Do not check quota if editlog is still being processed
2747      return null;
2748    }
2749    if (file.getLastBlock() != null) {
2750      final QuotaCounts delta = computeQuotaDeltaForUCBlock(file);
2751      dir.readLock();
2752      try {
2753        FSDirectory.verifyQuota(iip, iip.length() - 1, delta, null);
2754        return delta;
2755      } finally {
2756        dir.readUnlock();
2757      }
2758    }
2759    return null;
2760  }
2761
2762  /** Compute quota change for converting a complete block to a UC block */
2763  private QuotaCounts computeQuotaDeltaForUCBlock(INodeFile file) {
2764    final QuotaCounts delta = new QuotaCounts.Builder().build();
2765    final BlockInfoContiguous lastBlock = file.getLastBlock();
2766    if (lastBlock != null) {
2767      final long diff = file.getPreferredBlockSize() - lastBlock.getNumBytes();
2768      final short repl = file.getBlockReplication();
2769      delta.addStorageSpace(diff * repl);
2770      final BlockStoragePolicy policy = dir.getBlockStoragePolicySuite()
2771          .getPolicy(file.getStoragePolicyID());
2772      List<StorageType> types = policy.chooseStorageTypes(repl);
2773      for (StorageType t : types) {
2774        if (t.supportTypeQuota()) {
2775          delta.addTypeSpace(t, diff);
2776        }
2777      }
2778    }
2779    return delta;
2780  }
2781
2782  /**
2783   * Recover lease;
2784   * Immediately revoke the lease of the current lease holder and start lease
2785   * recovery so that the file can be forced to be closed.
2786   * 
2787   * @param src the path of the file to start lease recovery
2788   * @param holder the lease holder's name
2789   * @param clientMachine the client machine's name
2790   * @return true if the file is already closed
2791   * @throws IOException
2792   */
2793  boolean recoverLease(String src, String holder, String clientMachine)
2794      throws IOException {
2795    if (!DFSUtil.isValidName(src)) {
2796      throw new IOException("Invalid file name: " + src);
2797    }
2798  
2799    boolean skipSync = false;
2800    FSPermissionChecker pc = getPermissionChecker();
2801    checkOperation(OperationCategory.WRITE);
2802    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
2803    writeLock();
2804    try {
2805      checkOperation(OperationCategory.WRITE);
2806      checkNameNodeSafeMode("Cannot recover the lease of " + src);
2807      src = dir.resolvePath(pc, src, pathComponents);
2808      final INodesInPath iip = dir.getINodesInPath4Write(src);
2809      final INodeFile inode = INodeFile.valueOf(iip.getLastINode(), src);
2810      if (!inode.isUnderConstruction()) {
2811        return true;
2812      }
2813      if (isPermissionEnabled) {
2814        dir.checkPathAccess(pc, iip, FsAction.WRITE);
2815      }
2816  
2817      recoverLeaseInternal(RecoverLeaseOp.RECOVER_LEASE,
2818          iip, src, holder, clientMachine, true);
2819    } catch (StandbyException se) {
2820      skipSync = true;
2821      throw se;
2822    } finally {
2823      writeUnlock();
2824      // There might be transactions logged while trying to recover the lease.
2825      // They need to be sync'ed even when an exception was thrown.
2826      if (!skipSync) {
2827        getEditLog().logSync();
2828      }
2829    }
2830    return false;
2831  }
2832
2833  private enum RecoverLeaseOp {
2834    CREATE_FILE,
2835    APPEND_FILE,
2836    TRUNCATE_FILE,
2837    RECOVER_LEASE;
2838    
2839    private String getExceptionMessage(String src, String holder,
2840        String clientMachine, String reason) {
2841      return "Failed to " + this + " " + src + " for " + holder +
2842          " on " + clientMachine + " because " + reason;
2843    }
2844  }
2845
  /**
   * Enforce single-writer semantics on a file that may be under
   * construction. If the file is under construction, either recover the
   * lease (when forced, or when the holder's soft limit has expired) or
   * throw to tell the caller why the file cannot be opened for write.
   *
   * Must be called with the namesystem write lock held.
   *
   * @param op the operation being attempted, used in exception messages
   * @param iip resolved path to the file
   * @param src path string of the file
   * @param holder the client requesting the operation
   * @param clientMachine the requesting client's machine
   * @param force if true, release the current lease immediately
   * @throws AlreadyBeingCreatedException if the holder already owns the
   *         lease, or the UC file has no lease, or another client holds it
   * @throws RecoveryInProgressException if lease/block recovery is ongoing
   */
  void recoverLeaseInternal(RecoverLeaseOp op, INodesInPath iip,
      String src, String holder, String clientMachine, boolean force)
      throws IOException {
    assert hasWriteLock();
    INodeFile file = iip.getLastINode().asFile();
    if (file != null && file.isUnderConstruction()) {
      //
      // If the file is under construction , then it must be in our
      // leases. Find the appropriate lease record.
      //
      Lease lease = leaseManager.getLease(holder);

      if (!force && lease != null) {
        Lease leaseFile = leaseManager.getLeaseByPath(src);
        if (leaseFile != null && leaseFile.equals(lease)) {
          // We found the lease for this file but the original
          // holder is trying to obtain it again.
          throw new AlreadyBeingCreatedException(
              op.getExceptionMessage(src, holder, clientMachine,
                  holder + " is already the current lease holder."));
        }
      }
      //
      // Find the original holder.
      //
      FileUnderConstructionFeature uc = file.getFileUnderConstructionFeature();
      String clientName = uc.getClientName();
      lease = leaseManager.getLease(clientName);
      if (lease == null) {
        // Inconsistent state: UC file without a lease record.
        throw new AlreadyBeingCreatedException(
            op.getExceptionMessage(src, holder, clientMachine,
                "the file is under construction but no leases found."));
      }
      if (force) {
        // close now: no need to wait for soft lease expiration and 
        // close only the file src
        LOG.info("recoverLease: " + lease + ", src=" + src +
          " from client " + clientName);
        internalReleaseLease(lease, src, iip, holder);
      } else {
        assert lease.getHolder().equals(clientName) :
          "Current lease holder " + lease.getHolder() +
          " does not match file creator " + clientName;
        //
        // If the original holder has not renewed in the last SOFTLIMIT 
        // period, then start lease recovery.
        //
        if (lease.expiredSoftLimit()) {
          LOG.info("startFile: recover " + lease + ", src=" + src + " client "
              + clientName);
          boolean isClosed = internalReleaseLease(lease, src, iip, null);
          if(!isClosed)
            throw new RecoveryInProgressException(
                op.getExceptionMessage(src, holder, clientMachine,
                    "lease recovery is in progress. Try again later."));
        } else {
          // Holder's lease is still live: report why the request fails.
          final BlockInfoContiguous lastBlock = file.getLastBlock();
          if (lastBlock != null
              && lastBlock.getBlockUCState() == BlockUCState.UNDER_RECOVERY) {
            throw new RecoveryInProgressException(
                op.getExceptionMessage(src, holder, clientMachine,
                    "another recovery is in progress by "
                        + clientName + " on " + uc.getClientMachine()));
          } else {
            throw new AlreadyBeingCreatedException(
                op.getExceptionMessage(src, holder, clientMachine,
                    "this file lease is currently owned by "
                        + clientName + " on " + uc.getClientMachine()));
          }
        }
      }
    }
  }
2919
2920  /**
2921   * Append to an existing file in the namespace.
2922   */
2923  LastBlockWithStatus appendFile(String src, String holder,
2924      String clientMachine, EnumSet<CreateFlag> flag, boolean logRetryCache)
2925      throws IOException {
2926    try {
2927      return appendFileInt(src, holder, clientMachine,
2928          flag.contains(CreateFlag.NEW_BLOCK), logRetryCache);
2929    } catch (AccessControlException e) {
2930      logAuditEvent(false, "append", src);
2931      throw e;
2932    }
2933  }
2934
2935  private LastBlockWithStatus appendFileInt(final String srcArg, String holder,
2936      String clientMachine, boolean newBlock, boolean logRetryCache)
2937      throws IOException {
2938    String src = srcArg;
2939    if (NameNode.stateChangeLog.isDebugEnabled()) {
2940      NameNode.stateChangeLog.debug("DIR* NameSystem.appendFile: src=" + src
2941          + ", holder=" + holder
2942          + ", clientMachine=" + clientMachine);
2943    }
2944    boolean skipSync = false;
2945    if (!supportAppends) {
2946      throw new UnsupportedOperationException(
2947          "Append is not enabled on this NameNode. Use the " +
2948          DFS_SUPPORT_APPEND_KEY + " configuration option to enable it.");
2949    }
2950
2951    LocatedBlock lb = null;
2952    HdfsFileStatus stat = null;
2953    FSPermissionChecker pc = getPermissionChecker();
2954    checkOperation(OperationCategory.WRITE);
2955    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
2956    writeLock();
2957    try {
2958      checkOperation(OperationCategory.WRITE);
2959      checkNameNodeSafeMode("Cannot append to file" + src);
2960      src = dir.resolvePath(pc, src, pathComponents);
2961      final INodesInPath iip = dir.getINodesInPath4Write(src);
2962      lb = appendFileInternal(pc, iip, holder, clientMachine, newBlock,
2963          logRetryCache);
2964      stat = FSDirStatAndListingOp.getFileInfo(dir, src, false,
2965          FSDirectory.isReservedRawName(srcArg), true);
2966    } catch (StandbyException se) {
2967      skipSync = true;
2968      throw se;
2969    } finally {
2970      writeUnlock();
2971      // There might be transactions logged while trying to recover the lease.
2972      // They need to be sync'ed even when an exception was thrown.
2973      if (!skipSync) {
2974        getEditLog().logSync();
2975      }
2976    }
2977    if (lb != null) {
2978      if (NameNode.stateChangeLog.isDebugEnabled()) {
2979        NameNode.stateChangeLog.debug("DIR* NameSystem.appendFile: file "
2980            +src+" for "+holder+" at "+clientMachine
2981            +" block " + lb.getBlock()
2982            +" block size " + lb.getBlock().getNumBytes());
2983      }
2984    }
2985    logAuditEvent(true, "append", srcArg);
2986    return new LastBlockWithStatus(lb, stat);
2987  }
2988
2989  ExtendedBlock getExtendedBlock(Block blk) {
2990    return new ExtendedBlock(blockPoolId, blk);
2991  }
2992  
2993  void setBlockPoolId(String bpid) {
2994    blockPoolId = bpid;
2995    blockManager.setBlockPoolId(blockPoolId);
2996  }
2997
2998  /**
2999   * The client would like to obtain an additional block for the indicated
3000   * filename (which is being written-to).  Return an array that consists
3001   * of the block, plus a set of machines.  The first on this list should
3002   * be where the client writes data.  Subsequent items in the list must
3003   * be provided in the connection to the first datanode.
3004   *
3005   * Make sure the previous blocks have been reported by datanodes and
3006   * are replicated.  Will return an empty 2-elt array if we want the
3007   * client to "try again later".
3008   */
  LocatedBlock getAdditionalBlock(String src, long fileId, String clientName,
      ExtendedBlock previous, Set<Node> excludedNodes, 
      List<String> favoredNodes) throws IOException {
    final long blockSize;
    final int replication;
    final byte storagePolicyID;
    Node clientNode = null;
    String clientMachine = null;

    if(NameNode.stateChangeLog.isDebugEnabled()) {
      NameNode.stateChangeLog.debug("BLOCK* getAdditionalBlock: "
          + src + " inodeId " +  fileId  + " for " + clientName);
    }

    // Part I. Analyze the state of the file with respect to the input data.
    // Runs under the read lock only; no namesystem state is modified here.
    // Target selection below happens with NO lock held, and Part II
    // re-validates everything under the write lock before mutating state.
    checkOperation(OperationCategory.READ);
    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
    FSPermissionChecker pc = getPermissionChecker();
    readLock();
    try {
      checkOperation(OperationCategory.READ);
      src = dir.resolvePath(pc, src, pathComponents);
      LocatedBlock[] onRetryBlock = new LocatedBlock[1];
      FileState fileState = analyzeFileState(
          src, fileId, clientName, previous, onRetryBlock);
      final INodeFile pendingFile = fileState.inode;
      // Check if the penultimate block is minimally replicated
      if (!checkFileProgress(src, pendingFile, false)) {
        throw new NotReplicatedYetException("Not replicated yet: " + src);
      }
      src = fileState.path;

      if (onRetryBlock[0] != null && onRetryBlock[0].getLocations().length > 0) {
        // This is a retry. Just return the last block if having locations.
        return onRetryBlock[0];
      }
      if (pendingFile.getBlocks().length >= maxBlocksPerFile) {
        throw new IOException("File has reached the limit on maximum number of"
            + " blocks (" + DFSConfigKeys.DFS_NAMENODE_MAX_BLOCKS_PER_FILE_KEY
            + "): " + pendingFile.getBlocks().length + " >= "
            + maxBlocksPerFile);
      }
      // Capture everything target selection needs so the read lock can be
      // released before the (potentially slow) chooseTarget call.
      blockSize = pendingFile.getPreferredBlockSize();
      clientMachine = pendingFile.getFileUnderConstructionFeature()
          .getClientMachine();
      clientNode = blockManager.getDatanodeManager().getDatanodeByHost(
          clientMachine);
      replication = pendingFile.getFileReplication();
      storagePolicyID = pendingFile.getStoragePolicyID();
    } finally {
      readUnlock();
    }

    if (clientNode == null) {
      // Client host is not itself a datanode; resolve its network location
      // so the placement policy can still prefer rack-local targets.
      clientNode = getClientNode(clientMachine);
    }

    // choose targets for the new block to be allocated.
    // NB: executed without holding any namesystem lock.
    final DatanodeStorageInfo targets[] = getBlockManager().chooseTarget4NewBlock( 
        src, replication, clientNode, excludedNodes, blockSize, favoredNodes,
        storagePolicyID);

    // Part II.
    // Allocate a new block, add it to the INode and the BlocksMap. 
    Block newBlock = null;
    long offset;
    checkOperation(OperationCategory.WRITE);
    waitForLoadingFSImage();
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      // Run the full analysis again, since things could have changed
      // while chooseTarget() was executing.
      LocatedBlock[] onRetryBlock = new LocatedBlock[1];
      FileState fileState = 
          analyzeFileState(src, fileId, clientName, previous, onRetryBlock);
      final INodeFile pendingFile = fileState.inode;
      src = fileState.path;

      if (onRetryBlock[0] != null) {
        if (onRetryBlock[0].getLocations().length > 0) {
          // This is a retry. Just return the last block if having locations.
          return onRetryBlock[0];
        } else {
          // add new chosen targets to already allocated block and return
          BlockInfoContiguous lastBlockInFile = pendingFile.getLastBlock();
          ((BlockInfoContiguousUnderConstruction) lastBlockInFile)
              .setExpectedLocations(targets);
          offset = pendingFile.computeFileSize();
          return makeLocatedBlock(lastBlockInFile, targets, offset);
        }
      }

      // commit the last block and complete it if it has minimum replicas
      commitOrCompleteLastBlock(pendingFile, fileState.iip,
                                ExtendedBlock.getLocalBlock(previous));

      // allocate new block, record block locations in INode.
      newBlock = createNewBlock();
      INodesInPath inodesInPath = INodesInPath.fromINode(pendingFile);
      saveAllocatedBlock(src, inodesInPath, newBlock, targets);

      persistNewBlock(src, pendingFile);
      offset = pendingFile.computeFileSize();
    } finally {
      writeUnlock();
    }
    // Sync the edit log outside the write lock to avoid blocking other ops.
    getEditLog().logSync();

    // Return located block
    return makeLocatedBlock(newBlock, targets, offset);
  }
3121
3122  /*
3123   * Resolve clientmachine address to get a network location path
3124   */
3125  private Node getClientNode(String clientMachine) {
3126    List<String> hosts = new ArrayList<String>(1);
3127    hosts.add(clientMachine);
3128    List<String> rName = getBlockManager().getDatanodeManager()
3129        .resolveNetworkLocation(hosts);
3130    Node clientNode = null;
3131    if (rName != null) {
3132      // Able to resolve clientMachine mapping.
3133      // Create a temp node to findout the rack local nodes
3134      clientNode = new NodeBase(rName.get(0) + NodeBase.PATH_SEPARATOR_STR
3135          + clientMachine);
3136    }
3137    return clientNode;
3138  }
3139
3140  static class FileState {
3141    public final INodeFile inode;
3142    public final String path;
3143    public final INodesInPath iip;
3144
3145    public FileState(INodeFile inode, String fullPath, INodesInPath iip) {
3146      this.inode = inode;
3147      this.path = fullPath;
3148      this.iip = iip;
3149    }
3150  }
3151
  /**
   * Validate a getAdditionalBlock() request against the current namesystem
   * state: safe mode, fs-object limits, lease ownership, and whether the
   * client's notion of the last block matches ours (which detects RPC
   * retries; see the four cases commented below).
   *
   * @param src path of the file being written (re-resolved from fileId for
   *        newer clients)
   * @param fileId inode id of the file, or GRANDFATHER_INODE_ID from older
   *        clients that only pass the path
   * @param clientName lease holder issuing the request
   * @param previous the block the client believes is the file's last block
   * @param onRetryBlock out-parameter; set to the already-allocated last
   *        block when the request is recognized as a retry (case 2 below),
   *        cleared to null otherwise
   * @return the resolved file state (inode, path, INodesInPath)
   * @throws IOException on safe mode, quota, lease, or block-mismatch errors
   */
  FileState analyzeFileState(String src,
                                long fileId,
                                String clientName,
                                ExtendedBlock previous,
                                LocatedBlock[] onRetryBlock)
          throws IOException  {
    assert hasReadLock();

    checkBlock(previous);
    onRetryBlock[0] = null;
    checkNameNodeSafeMode("Cannot add block to " + src);

    // have we exceeded the configured limit of fs objects.
    checkFsObjectLimit();

    Block previousBlock = ExtendedBlock.getLocalBlock(previous);
    final INode inode;
    final INodesInPath iip;
    if (fileId == INodeId.GRANDFATHER_INODE_ID) {
      // Older clients may not have given us an inode ID to work with.
      // In this case, we have to try to resolve the path and hope it
      // hasn't changed or been deleted since the file was opened for write.
      iip = dir.getINodesInPath4Write(src);
      inode = iip.getLastINode();
    } else {
      // Newer clients pass the inode ID, so we can just get the inode
      // directly.
      inode = dir.getInode(fileId);
      iip = INodesInPath.fromINode(inode);
      if (inode != null) {
        src = iip.getPath();
      }
    }
    final INodeFile pendingFile = checkLease(src, clientName, inode, fileId);
    BlockInfoContiguous lastBlockInFile = pendingFile.getLastBlock();
    if (!Block.matchingIdAndGenStamp(previousBlock, lastBlockInFile)) {
      // The block that the client claims is the current last block
      // doesn't match up with what we think is the last block. There are
      // four possibilities:
      // 1) This is the first block allocation of an append() pipeline
      //    which started appending exactly at or exceeding the block boundary.
      //    In this case, the client isn't passed the previous block,
      //    so it makes the allocateBlock() call with previous=null.
      //    We can distinguish this since the last block of the file
      //    will be exactly a full block.
      // 2) This is a retry from a client that missed the response of a
      //    prior getAdditionalBlock() call, perhaps because of a network
      //    timeout, or because of an HA failover. In that case, we know
      //    by the fact that the client is re-issuing the RPC that it
      //    never began to write to the old block. Hence it is safe to
      //    to return the existing block.
      // 3) This is an entirely bogus request/bug -- we should error out
      //    rather than potentially appending a new block with an empty
      //    one in the middle, etc
      // 4) This is a retry from a client that timed out while
      //    the prior getAdditionalBlock() is still being processed,
      //    currently working on chooseTarget(). 
      //    There are no means to distinguish between the first and 
      //    the second attempts in Part I, because the first one hasn't
      //    changed the namesystem state yet.
      //    We run this analysis again in Part II where case 4 is impossible.

      BlockInfoContiguous penultimateBlock = pendingFile.getPenultimateBlock();
      if (previous == null &&
          lastBlockInFile != null &&
          lastBlockInFile.getNumBytes() >= pendingFile.getPreferredBlockSize() &&
          lastBlockInFile.isComplete()) {
        // Case 1
        if (NameNode.stateChangeLog.isDebugEnabled()) {
           NameNode.stateChangeLog.debug(
               "BLOCK* NameSystem.allocateBlock: handling block allocation" +
               " writing to a file with a complete previous block: src=" +
               src + " lastBlock=" + lastBlockInFile);
        }
      } else if (Block.matchingIdAndGenStamp(penultimateBlock, previousBlock)) {
        if (lastBlockInFile.getNumBytes() != 0) {
          throw new IOException(
              "Request looked like a retry to allocate block " +
              lastBlockInFile + " but it already contains " +
              lastBlockInFile.getNumBytes() + " bytes");
        }

        // Case 2
        // Return the last block.
        NameNode.stateChangeLog.info("BLOCK* allocateBlock: " +
            "caught retry for allocation of a new block in " +
            src + ". Returning previously allocated block " + lastBlockInFile);
        long offset = pendingFile.computeFileSize();
        onRetryBlock[0] = makeLocatedBlock(lastBlockInFile,
            ((BlockInfoContiguousUnderConstruction)lastBlockInFile).getExpectedStorageLocations(),
            offset);
        return new FileState(pendingFile, src, iip);
      } else {
        // Case 3
        throw new IOException("Cannot allocate block in " + src + ": " +
            "passed 'previous' block " + previous + " does not match actual " +
            "last block in file " + lastBlockInFile);
      }
    }
    return new FileState(pendingFile, src, iip);
  }
3253
3254  LocatedBlock makeLocatedBlock(Block blk, DatanodeStorageInfo[] locs,
3255                                        long offset) throws IOException {
3256    LocatedBlock lBlk = new LocatedBlock(
3257        getExtendedBlock(blk), locs, offset, false);
3258    getBlockManager().setBlockToken(
3259        lBlk, BlockTokenSecretManager.AccessMode.WRITE);
3260    return lBlk;
3261  }
3262
  /**
   * Choose {@code numAdditionalNodes} extra datanodes for an existing block,
   * given the datanodes/storages already chosen and a set to exclude.
   *
   * @see ClientProtocol#getAdditionalDatanode
   * @param src path of the file being written
   * @param fileId inode id of the file, or GRANDFATHER_INODE_ID from older
   *        clients that only pass the path
   * @param blk the block that needs additional datanodes
   * @param existings datanodes already holding/receiving the block
   * @param storageIDs storage IDs on the existing datanodes
   * @param excludes datanodes that must not be chosen
   * @param numAdditionalNodes how many new datanodes to pick
   * @param clientName the lease holder
   * @return a LocatedBlock whose locations are the newly chosen targets,
   *         stamped with a COPY block token
   * @throws IOException if the replace-datanode-on-failure feature is
   *         disabled, the namenode is in safe mode, or the lease check fails
   */
  LocatedBlock getAdditionalDatanode(String src, long fileId,
      final ExtendedBlock blk, final DatanodeInfo[] existings,
      final String[] storageIDs,
      final Set<Node> excludes,
      final int numAdditionalNodes, final String clientName
      ) throws IOException {
    //check if the feature is enabled
    dtpReplaceDatanodeOnFailure.checkEnabled();

    Node clientnode = null;
    String clientMachine;
    final long preferredblocksize;
    final byte storagePolicyID;
    final List<DatanodeStorageInfo> chosen;
    checkOperation(OperationCategory.READ);
    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
    FSPermissionChecker pc = getPermissionChecker();
    readLock();
    try {
      checkOperation(OperationCategory.READ);
      //check safe mode
      checkNameNodeSafeMode("Cannot add datanode; src=" + src + ", blk=" + blk);
      src = dir.resolvePath(pc, src, pathComponents);

      //check lease
      final INode inode;
      if (fileId == INodeId.GRANDFATHER_INODE_ID) {
        // Older clients may not have given us an inode ID to work with.
        // In this case, we have to try to resolve the path and hope it
        // hasn't changed or been deleted since the file was opened for write.
        inode = dir.getINode(src);
      } else {
        inode = dir.getInode(fileId);
        if (inode != null) src = inode.getFullPathName();
      }
      final INodeFile file = checkLease(src, clientName, inode, fileId);
      clientMachine = file.getFileUnderConstructionFeature().getClientMachine();
      clientnode = blockManager.getDatanodeManager().getDatanodeByHost(clientMachine);
      preferredblocksize = file.getPreferredBlockSize();
      storagePolicyID = file.getStoragePolicyID();

      //find datanode storages
      final DatanodeManager dm = blockManager.getDatanodeManager();
      chosen = Arrays.asList(dm.getDatanodeStorageInfos(existings, storageIDs));
    } finally {
      readUnlock();
    }

    if (clientnode == null) {
      // Client host is not a datanode; resolve its network location so the
      // placement policy can still consider topology.
      clientnode = getClientNode(clientMachine);
    }

    // choose new datanodes.
    // NB: executed without holding the namesystem lock.
    final DatanodeStorageInfo[] targets = blockManager.chooseTarget4AdditionalDatanode(
        src, numAdditionalNodes, clientnode, chosen, 
        excludes, preferredblocksize, storagePolicyID);
    final LocatedBlock lb = new LocatedBlock(blk, targets);
    blockManager.setBlockToken(lb, AccessMode.COPY);
    return lb;
  }
3324
3325  /**
3326   * The client would like to let go of the given block
3327   */
3328  boolean abandonBlock(ExtendedBlock b, long fileId, String src, String holder)
3329      throws IOException {
3330    if(NameNode.stateChangeLog.isDebugEnabled()) {
3331      NameNode.stateChangeLog.debug("BLOCK* NameSystem.abandonBlock: " + b
3332          + "of file " + src);
3333    }
3334    checkOperation(OperationCategory.WRITE);
3335    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
3336    FSPermissionChecker pc = getPermissionChecker();
3337    waitForLoadingFSImage();
3338    writeLock();
3339    try {
3340      checkOperation(OperationCategory.WRITE);
3341      checkNameNodeSafeMode("Cannot abandon block " + b + " for file" + src);
3342      src = dir.resolvePath(pc, src, pathComponents);
3343
3344      final INode inode;
3345      final INodesInPath iip;
3346      if (fileId == INodeId.GRANDFATHER_INODE_ID) {
3347        // Older clients may not have given us an inode ID to work with.
3348        // In this case, we have to try to resolve the path and hope it
3349        // hasn't changed or been deleted since the file was opened for write.
3350        iip = dir.getINodesInPath(src, true);
3351        inode = iip.getLastINode();
3352      } else {
3353        inode = dir.getInode(fileId);
3354        iip = INodesInPath.fromINode(inode);
3355        if (inode != null) {
3356          src = iip.getPath();
3357        }
3358      }
3359      final INodeFile file = checkLease(src, holder, inode, fileId);
3360
3361      // Remove the block from the pending creates list
3362      boolean removed = dir.removeBlock(src, iip, file,
3363          ExtendedBlock.getLocalBlock(b));
3364      if (!removed) {
3365        return true;
3366      }
3367      if(NameNode.stateChangeLog.isDebugEnabled()) {
3368        NameNode.stateChangeLog.debug("BLOCK* NameSystem.abandonBlock: "
3369                                      + b + " is removed from pendingCreates");
3370      }
3371      persistBlocks(src, file, false);
3372    } finally {
3373      writeUnlock();
3374    }
3375    getEditLog().logSync();
3376
3377    return true;
3378  }
3379
3380  private INodeFile checkLease(String src, String holder, INode inode,
3381      long fileId) throws LeaseExpiredException, FileNotFoundException {
3382    assert hasReadLock();
3383    final String ident = src + " (inode " + fileId + ")";
3384    if (inode == null) {
3385      Lease lease = leaseManager.getLease(holder);
3386      throw new LeaseExpiredException(
3387          "No lease on " + ident + ": File does not exist. "
3388          + (lease != null ? lease.toString()
3389              : "Holder " + holder + " does not have any open files."));
3390    }
3391    if (!inode.isFile()) {
3392      Lease lease = leaseManager.getLease(holder);
3393      throw new LeaseExpiredException(
3394          "No lease on " + ident + ": INode is not a regular file. "
3395              + (lease != null ? lease.toString()
3396              : "Holder " + holder + " does not have any open files."));
3397    }
3398    final INodeFile file = inode.asFile();
3399    if (!file.isUnderConstruction()) {
3400      Lease lease = leaseManager.getLease(holder);
3401      throw new LeaseExpiredException(
3402          "No lease on " + ident + ": File is not open for writing. "
3403          + (lease != null ? lease.toString()
3404              : "Holder " + holder + " does not have any open files."));
3405    }
3406    // No further modification is allowed on a deleted file.
3407    // A file is considered deleted, if it is not in the inodeMap or is marked
3408    // as deleted in the snapshot feature.
3409    if (isFileDeleted(file)) {
3410      throw new FileNotFoundException(src);
3411    }
3412    String clientName = file.getFileUnderConstructionFeature().getClientName();
3413    if (holder != null && !clientName.equals(holder)) {
3414      throw new LeaseExpiredException("Lease mismatch on " + ident +
3415          " owned by " + clientName + " but is accessed by " + holder);
3416    }
3417    return file;
3418  }
3419 
3420  /**
3421   * Complete in-progress write to the given file.
3422   * @return true if successful, false if the client should continue to retry
3423   *         (e.g if not all blocks have reached minimum replication yet)
3424   * @throws IOException on error (eg lease mismatch, file not open, file deleted)
3425   */
3426  boolean completeFile(final String srcArg, String holder,
3427                       ExtendedBlock last, long fileId)
3428    throws SafeModeException, UnresolvedLinkException, IOException {
3429    String src = srcArg;
3430    if (NameNode.stateChangeLog.isDebugEnabled()) {
3431      NameNode.stateChangeLog.debug("DIR* NameSystem.completeFile: " +
3432          src + " for " + holder);
3433    }
3434    checkBlock(last);
3435    boolean success = false;
3436    checkOperation(OperationCategory.WRITE);
3437    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
3438    FSPermissionChecker pc = getPermissionChecker();
3439    waitForLoadingFSImage();
3440    writeLock();
3441    try {
3442      checkOperation(OperationCategory.WRITE);
3443      checkNameNodeSafeMode("Cannot complete file " + src);
3444      src = dir.resolvePath(pc, src, pathComponents);
3445      success = completeFileInternal(src, holder,
3446        ExtendedBlock.getLocalBlock(last), fileId);
3447    } finally {
3448      writeUnlock();
3449    }
3450    getEditLog().logSync();
3451    if (success) {
3452      NameNode.stateChangeLog.info("DIR* completeFile: " + srcArg
3453          + " is closed by " + holder);
3454    }
3455    return success;
3456  }
3457
  /**
   * Do the work of completing a file under the write lock.
   *
   * A LeaseExpiredException on a file that is already closed is treated as a
   * retried close RPC (see HDFS-3031): if the client's view of the last
   * block matches the file's actual last block, the close is reported as
   * successful instead of failing.
   *
   * @return true if the file was finalized (or the request was a benign
   *         retry); false if the client should retry because not all blocks
   *         have reached minimum replication yet
   */
  private boolean completeFileInternal(String src, String holder, Block last,
      long fileId) throws IOException {
    assert hasWriteLock();
    final INodeFile pendingFile;
    final INodesInPath iip;
    INode inode = null;
    try {
      if (fileId == INodeId.GRANDFATHER_INODE_ID) {
        // Older clients may not have given us an inode ID to work with.
        // In this case, we have to try to resolve the path and hope it
        // hasn't changed or been deleted since the file was opened for write.
        iip = dir.getINodesInPath(src, true);
        inode = iip.getLastINode();
      } else {
        inode = dir.getInode(fileId);
        iip = INodesInPath.fromINode(inode);
        if (inode != null) {
          src = iip.getPath();
        }
      }
      pendingFile = checkLease(src, holder, inode, fileId);
    } catch (LeaseExpiredException lee) {
      if (inode != null && inode.isFile() &&
          !inode.asFile().isUnderConstruction()) {
        // This could be a retry RPC - i.e the client tried to close
        // the file, but missed the RPC response. Thus, it is trying
        // again to close the file. If the file still exists and
        // the client's view of the last block matches the actual
        // last block, then we'll treat it as a successful close.
        // See HDFS-3031.
        final Block realLastBlock = inode.asFile().getLastBlock();
        if (Block.matchingIdAndGenStamp(last, realLastBlock)) {
          NameNode.stateChangeLog.info("DIR* completeFile: " +
              "request from " + holder + " to complete inode " + fileId +
              "(" + src + ") which is already closed. But, it appears to be " +
              "an RPC retry. Returning success");
          return true;
        }
      }
      throw lee;
    }
    // Check the state of the penultimate block. It should be completed
    // before attempting to complete the last one.
    if (!checkFileProgress(src, pendingFile, false)) {
      return false;
    }

    // commit the last block and complete it if it has minimum replicas
    commitOrCompleteLastBlock(pendingFile, iip, last);

    // Re-check all blocks now that the last one has been committed.
    if (!checkFileProgress(src, pendingFile, true)) {
      return false;
    }

    finalizeINodeFileUnderConstruction(src, pendingFile,
        Snapshot.CURRENT_STATE_ID);
    return true;
  }
3516
3517  /**
3518   * Save allocated block at the given pending filename
3519   * 
3520   * @param src path to the file
3521   * @param inodesInPath representing each of the components of src.
3522   *                     The last INode is the INode for {@code src} file.
3523   * @param newBlock newly allocated block to be save
3524   * @param targets target datanodes where replicas of the new block is placed
3525   * @throws QuotaExceededException If addition of block exceeds space quota
3526   */
3527  BlockInfoContiguous saveAllocatedBlock(String src, INodesInPath inodesInPath,
3528      Block newBlock, DatanodeStorageInfo[] targets)
3529          throws IOException {
3530    assert hasWriteLock();
3531    BlockInfoContiguous b = dir.addBlock(src, inodesInPath, newBlock, targets);
3532    NameNode.stateChangeLog.info("BLOCK* allocate " + b + " for " + src);
3533    DatanodeStorageInfo.incrementBlocksScheduled(targets);
3534    return b;
3535  }
3536
3537  /**
3538   * Create new block with a unique block id and a new generation stamp.
3539   */
3540  Block createNewBlock() throws IOException {
3541    assert hasWriteLock();
3542    Block b = new Block(nextBlockId(), 0, 0);
3543    // Increment the generation stamp for every new block.
3544    b.setGenerationStamp(nextGenerationStamp(false));
3545    return b;
3546  }
3547
3548  /**
3549   * Check that the indicated file's blocks are present and
3550   * replicated.  If not, return false. If checkall is true, then check
3551   * all blocks, otherwise check only penultimate block.
3552   */
3553  boolean checkFileProgress(String src, INodeFile v, boolean checkall) {
3554    if (checkall) {
3555      // check all blocks of the file.
3556      for (BlockInfoContiguous block: v.getBlocks()) {
3557        if (!isCompleteBlock(src, block, blockManager.minReplication)) {
3558          return false;
3559        }
3560      }
3561    } else {
3562      // check the penultimate block of this file
3563      BlockInfoContiguous b = v.getPenultimateBlock();
3564      if (b != null
3565          && !isCompleteBlock(src, b, blockManager.minReplication)) {
3566        return false;
3567      }
3568    }
3569    return true;
3570  }
3571
3572  private static boolean isCompleteBlock(String src, BlockInfoContiguous b, int minRepl) {
3573    if (!b.isComplete()) {
3574      final BlockInfoContiguousUnderConstruction uc = (BlockInfoContiguousUnderConstruction)b;
3575      final int numNodes = b.numNodes();
3576      LOG.info("BLOCK* " + b + " is not COMPLETE (ucState = "
3577          + uc.getBlockUCState() + ", replication# = " + numNodes
3578          + (numNodes < minRepl? " < ": " >= ")
3579          + " minimum = " + minRepl + ") in file " + src);
3580      return false;
3581    }
3582    return true;
3583  }
3584
3585  ////////////////////////////////////////////////////////////////
3586  // Here's how to handle block-copy failure during client write:
3587  // -- As usual, the client's write should result in a streaming
3588  // backup write to a k-machine sequence.
3589  // -- If one of the backup machines fails, no worries.  Fail silently.
3590  // -- Before client is allowed to close and finalize file, make sure
3591  // that the blocks are backed up.  Namenode may have to issue specific backup
3592  // commands to make up for earlier datanode failures.  Once all copies
3593  // are made, edit namespace and return to client.
3594  ////////////////////////////////////////////////////////////////
3595
3596  /** 
3597   * Change the indicated filename. 
3598   * @deprecated Use {@link #renameTo(String, String, boolean,
3599   * Options.Rename...)} instead.
3600   */
3601  @Deprecated
3602  boolean renameTo(String src, String dst, boolean logRetryCache)
3603      throws IOException {
3604    waitForLoadingFSImage();
3605    checkOperation(OperationCategory.WRITE);
3606    FSDirRenameOp.RenameOldResult ret = null;
3607    writeLock();
3608    try {
3609      checkOperation(OperationCategory.WRITE);
3610      checkNameNodeSafeMode("Cannot rename " + src);
3611      ret = FSDirRenameOp.renameToInt(dir, src, dst, logRetryCache);
3612    } catch (AccessControlException e)  {
3613      logAuditEvent(false, "rename", src, dst, null);
3614      throw e;
3615    } finally {
3616      writeUnlock();
3617    }
3618    boolean success = ret != null && ret.success;
3619    if (success) {
3620      getEditLog().logSync();
3621    }
3622    logAuditEvent(success, "rename", src, dst,
3623        ret == null ? null : ret.auditStat);
3624    return success;
3625  }
3626
  /**
   * Change the indicated filename according to the given rename options.
   *
   * @param src existing path to rename
   * @param dst destination path
   * @param logRetryCache whether to record this call in the retry cache
   * @param options rename semantics flags passed through to FSDirRenameOp
   * @throws IOException if the rename fails, access is denied, or the
   *         namenode is in safe mode
   */
  void renameTo(final String src, final String dst,
                boolean logRetryCache, Options.Rename... options)
      throws IOException {
    waitForLoadingFSImage();
    checkOperation(OperationCategory.WRITE);
    Map.Entry<BlocksMapUpdateInfo, HdfsFileStatus> res = null;
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot rename " + src);
      res = FSDirRenameOp.renameToInt(dir, src, dst, logRetryCache, options);
    } catch (AccessControlException e) {
      logAuditEvent(false, "rename (options=" + Arrays.toString(options) +
          ")", src, dst, null);
      throw e;
    } finally {
      writeUnlock();
    }

    getEditLog().logSync();

    // If the rename overwrote an existing destination, its blocks are
    // collected above and removed here, outside the write lock.
    BlocksMapUpdateInfo collectedBlocks = res.getKey();
    HdfsFileStatus auditStat = res.getValue();
    if (!collectedBlocks.getToDeleteList().isEmpty()) {
      removeBlocks(collectedBlocks);
      collectedBlocks.clear();
    }

    logAuditEvent(true, "rename (options=" + Arrays.toString(options) +
        ")", src, dst, auditStat);
  }
3658
3659  /**
3660   * Remove the indicated file from namespace.
3661   * 
3662   * @see ClientProtocol#delete(String, boolean) for detailed description and 
3663   * description of exceptions
3664   */
3665  boolean delete(String src, boolean recursive, boolean logRetryCache)
3666      throws IOException {
3667    waitForLoadingFSImage();
3668    checkOperation(OperationCategory.WRITE);
3669    BlocksMapUpdateInfo toRemovedBlocks = null;
3670    writeLock();
3671    boolean ret = false;
3672    try {
3673      checkOperation(OperationCategory.WRITE);
3674      checkNameNodeSafeMode("Cannot delete " + src);
3675      toRemovedBlocks = FSDirDeleteOp.delete(
3676          this, src, recursive, logRetryCache);
3677      ret = toRemovedBlocks != null;
3678    } catch (AccessControlException e) {
3679      logAuditEvent(false, "delete", src);
3680      throw e;
3681    } finally {
3682      writeUnlock();
3683    }
3684    if (toRemovedBlocks != null) {
3685      removeBlocks(toRemovedBlocks); // Incremental deletion of blocks
3686    }
3687    logAuditEvent(true, "delete", src);
3688    return ret;
3689  }
3690
  /**
   * Obtain a permission checker for the current caller by delegating to the
   * directory tree.
   *
   * @throws AccessControlException if the checker cannot be constructed for
   *         the caller
   */
  FSPermissionChecker getPermissionChecker()
      throws AccessControlException {
    return dir.getPermissionChecker();
  }
3695
3696  /**
3697   * From the given list, incrementally remove the blocks from blockManager
3698   * Writelock is dropped and reacquired every BLOCK_DELETION_INCREMENT to
3699   * ensure that other waiters on the lock can get in. See HDFS-2938
3700   * 
3701   * @param blocks
3702   *          An instance of {@link BlocksMapUpdateInfo} which contains a list
3703   *          of blocks that need to be removed from blocksMap
3704   */
3705  void removeBlocks(BlocksMapUpdateInfo blocks) {
3706    List<Block> toDeleteList = blocks.getToDeleteList();
3707    Iterator<Block> iter = toDeleteList.iterator();
3708    while (iter.hasNext()) {
3709      writeLock();
3710      try {
3711        for (int i = 0; i < BLOCK_DELETION_INCREMENT && iter.hasNext(); i++) {
3712          blockManager.removeBlock(iter.next());
3713        }
3714      } finally {
3715        writeUnlock();
3716      }
3717    }
3718  }
3719  
3720  /**
3721   * Remove leases and inodes related to a given path
3722   * @param src The given path
3723   * @param removedINodes Containing the list of inodes to be removed from
3724   *                      inodesMap
3725   * @param acquireINodeMapLock Whether to acquire the lock for inode removal
3726   */
3727  void removeLeasesAndINodes(String src, List<INode> removedINodes,
3728      final boolean acquireINodeMapLock) {
3729    assert hasWriteLock();
3730    leaseManager.removeLeaseWithPrefixPath(src);
3731    // remove inodes from inodesMap
3732    if (removedINodes != null) {
3733      if (acquireINodeMapLock) {
3734        dir.writeLock();
3735      }
3736      try {
3737        dir.removeFromInodeMap(removedINodes);
3738      } finally {
3739        if (acquireINodeMapLock) {
3740          dir.writeUnlock();
3741        }
3742      }
3743      removedINodes.clear();
3744    }
3745  }
3746
3747  /**
3748   * Removes the blocks from blocksmap and updates the safemode blocks total
3749   * 
3750   * @param blocks
3751   *          An instance of {@link BlocksMapUpdateInfo} which contains a list
3752   *          of blocks that need to be removed from blocksMap
3753   */
3754  void removeBlocksAndUpdateSafemodeTotal(BlocksMapUpdateInfo blocks) {
3755    assert hasWriteLock();
3756    // In the case that we are a Standby tailing edits from the
3757    // active while in safe-mode, we need to track the total number
3758    // of blocks and safe blocks in the system.
3759    boolean trackBlockCounts = isSafeModeTrackingBlocks();
3760    int numRemovedComplete = 0, numRemovedSafe = 0;
3761
3762    for (Block b : blocks.getToDeleteList()) {
3763      if (trackBlockCounts) {
3764        BlockInfoContiguous bi = getStoredBlock(b);
3765        if (bi.isComplete()) {
3766          numRemovedComplete++;
3767          if (bi.numNodes() >= blockManager.minReplication) {
3768            numRemovedSafe++;
3769          }
3770        }
3771      }
3772      blockManager.removeBlock(b);
3773    }
3774    if (trackBlockCounts) {
3775      if (LOG.isDebugEnabled()) {
3776        LOG.debug("Adjusting safe-mode totals for deletion."
3777            + "decreasing safeBlocks by " + numRemovedSafe
3778            + ", totalBlocks by " + numRemovedComplete);
3779      }
3780      adjustSafeModeBlockTotals(-numRemovedSafe, -numRemovedComplete);
3781    }
3782  }
3783
3784  /**
3785   * @see SafeModeInfo#shouldIncrementallyTrackBlocks
3786   */
3787  private boolean isSafeModeTrackingBlocks() {
3788    if (!haEnabled) {
3789      // Never track blocks incrementally in non-HA code.
3790      return false;
3791    }
3792    SafeModeInfo sm = this.safeMode;
3793    return sm != null && sm.shouldIncrementallyTrackBlocks();
3794  }
3795
3796  /**
3797   * Get the file info for a specific file.
3798   *
3799   * @param src The string representation of the path to the file
3800   * @param resolveLink whether to throw UnresolvedLinkException
3801   *        if src refers to a symlink
3802   *
3803   * @throws AccessControlException if access is denied
3804   * @throws UnresolvedLinkException if a symlink is encountered.
3805   *
3806   * @return object containing information regarding the file
3807   *         or null if file not found
3808   * @throws StandbyException
3809   */
3810  HdfsFileStatus getFileInfo(final String src, boolean resolveLink)
3811    throws IOException {
3812    checkOperation(OperationCategory.READ);
3813    HdfsFileStatus stat = null;
3814    readLock();
3815    try {
3816      checkOperation(OperationCategory.READ);
3817      stat = FSDirStatAndListingOp.getFileInfo(dir, src, resolveLink);
3818    } catch (AccessControlException e) {
3819      logAuditEvent(false, "getfileinfo", src);
3820      throw e;
3821    } finally {
3822      readUnlock();
3823    }
3824    logAuditEvent(true, "getfileinfo", src);
3825    return stat;
3826  }
3827
3828  /**
3829   * Returns true if the file is closed
3830   */
3831  boolean isFileClosed(final String src) throws IOException {
3832    checkOperation(OperationCategory.READ);
3833    readLock();
3834    try {
3835      checkOperation(OperationCategory.READ);
3836      return FSDirStatAndListingOp.isFileClosed(dir, src);
3837    } catch (AccessControlException e) {
3838      logAuditEvent(false, "isFileClosed", src);
3839      throw e;
3840    } finally {
3841      readUnlock();
3842    }
3843  }
3844
3845  /**
3846   * Create all the necessary directories
3847   */
3848  boolean mkdirs(String src, PermissionStatus permissions,
3849      boolean createParent) throws IOException {
3850    HdfsFileStatus auditStat = null;
3851    checkOperation(OperationCategory.WRITE);
3852    writeLock();
3853    try {
3854      checkOperation(OperationCategory.WRITE);
3855      checkNameNodeSafeMode("Cannot create directory " + src);
3856      auditStat = FSDirMkdirOp.mkdirs(this, src, permissions, createParent);
3857    } catch (AccessControlException e) {
3858      logAuditEvent(false, "mkdirs", src);
3859      throw e;
3860    } finally {
3861      writeUnlock();
3862    }
3863    getEditLog().logSync();
3864    logAuditEvent(true, "mkdirs", src, null, auditStat);
3865    return true;
3866  }
3867
3868  /**
3869   * Get the content summary for a specific file/dir.
3870   *
3871   * @param src The string representation of the path to the file
3872   *
3873   * @throws AccessControlException if access is denied
3874   * @throws UnresolvedLinkException if a symlink is encountered.
3875   * @throws FileNotFoundException if no file exists
3876   * @throws StandbyException
3877   * @throws IOException for issues with writing to the audit log
3878   *
3879   * @return object containing information regarding the file
3880   *         or null if file not found
3881   */
3882  ContentSummary getContentSummary(final String src) throws IOException {
3883    readLock();
3884    boolean success = true;
3885    try {
3886      return FSDirStatAndListingOp.getContentSummary(dir, src);
3887    } catch (AccessControlException ace) {
3888      success = false;
3889      throw ace;
3890    } finally {
3891      readUnlock();
3892      logAuditEvent(success, "contentSummary", src);
3893    }
3894  }
3895
3896  /**
3897   * Set the namespace quota and storage space quota for a directory.
3898   * See {@link ClientProtocol#setQuota(String, long, long, StorageType)} for the
3899   * contract.
3900   * 
3901   * Note: This does not support ".inodes" relative path.
3902   */
3903  void setQuota(String src, long nsQuota, long ssQuota, StorageType type)
3904      throws IOException {
3905    checkOperation(OperationCategory.WRITE);
3906    writeLock();
3907    boolean success = false;
3908    try {
3909      checkOperation(OperationCategory.WRITE);
3910      checkNameNodeSafeMode("Cannot set quota on " + src);
3911      FSDirAttrOp.setQuota(dir, src, nsQuota, ssQuota, type);
3912      success = true;
3913    } finally {
3914      writeUnlock();
3915      if (success) {
3916        getEditLog().logSync();
3917      }
3918      logAuditEvent(success, "setQuota", src);
3919    }
3920  }
3921
  /** Persist all metadata about this file.
   * @param src The string representation of the path
   * @param fileId The inode ID that we're fsyncing.  Older clients will pass
   *               INodeId.GRANDFATHER_INODE_ID here.
   * @param clientName The string representation of the client
   * @param lastBlockLength The length of the last block 
   *                        under construction reported from client.
   * @throws IOException if path does not exist
   */
  void fsync(String src, long fileId, String clientName, long lastBlockLength)
      throws IOException {
    NameNode.stateChangeLog.info("BLOCK* fsync: " + src + " for " + clientName);
    checkOperation(OperationCategory.WRITE);
    // Resolve reserved-path components before taking the lock.
    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);

    FSPermissionChecker pc = getPermissionChecker();
    waitForLoadingFSImage();
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot fsync file " + src);
      src = dir.resolvePath(pc, src, pathComponents);
      final INode inode;
      if (fileId == INodeId.GRANDFATHER_INODE_ID) {
        // Older clients may not have given us an inode ID to work with.
        // In this case, we have to try to resolve the path and hope it
        // hasn't changed or been deleted since the file was opened for write.
        inode = dir.getINode(src);
      } else {
        // Prefer the inode ID: it is stable across renames of the open file.
        inode = dir.getInode(fileId);
        if (inode != null) src = inode.getFullPathName();
      }
      // Verify the caller still holds the lease on this file.
      final INodeFile pendingFile = checkLease(src, clientName, inode, fileId);
      if (lastBlockLength > 0) {
        // Record the client-reported length of the in-progress last block.
        pendingFile.getFileUnderConstructionFeature().updateLengthOfLastBlock(
            pendingFile, lastBlockLength);
      }
      // Log the current block list; synced below, outside the lock.
      persistBlocks(src, pendingFile, false);
    } finally {
      writeUnlock();
    }
    getEditLog().logSync();
  }
3965
3966  /**
3967   * Move a file that is being written to be immutable.
3968   * @param src The filename
3969   * @param lease The lease for the client creating the file
3970   * @param recoveryLeaseHolder reassign lease to this holder if the last block
3971   *        needs recovery; keep current holder if null.
3972   * @throws AlreadyBeingCreatedException if file is waiting to achieve minimal
3973   *         replication;<br>
3974   *         RecoveryInProgressException if lease recovery is in progress.<br>
3975   *         IOException in case of an error.
3976   * @return true  if file has been successfully finalized and closed or 
3977   *         false if block recovery has been initiated. Since the lease owner
3978   *         has been changed and logged, caller should call logSync().
3979   */
3980  boolean internalReleaseLease(Lease lease, String src, INodesInPath iip,
3981      String recoveryLeaseHolder) throws IOException {
3982    LOG.info("Recovering " + lease + ", src=" + src);
3983    assert !isInSafeMode();
3984    assert hasWriteLock();
3985
3986    final INodeFile pendingFile = iip.getLastINode().asFile();
3987    int nrBlocks = pendingFile.numBlocks();
3988    BlockInfoContiguous[] blocks = pendingFile.getBlocks();
3989
3990    int nrCompleteBlocks;
3991    BlockInfoContiguous curBlock = null;
3992    for(nrCompleteBlocks = 0; nrCompleteBlocks < nrBlocks; nrCompleteBlocks++) {
3993      curBlock = blocks[nrCompleteBlocks];
3994      if(!curBlock.isComplete())
3995        break;
3996      assert blockManager.checkMinReplication(curBlock) :
3997              "A COMPLETE block is not minimally replicated in " + src;
3998    }
3999
4000    // If there are no incomplete blocks associated with this file,
4001    // then reap lease immediately and close the file.
4002    if(nrCompleteBlocks == nrBlocks) {
4003      finalizeINodeFileUnderConstruction(src, pendingFile,
4004          iip.getLatestSnapshotId());
4005      NameNode.stateChangeLog.warn("BLOCK*"
4006        + " internalReleaseLease: All existing blocks are COMPLETE,"
4007        + " lease removed, file closed.");
4008      return true;  // closed!
4009    }
4010
4011    // Only the last and the penultimate blocks may be in non COMPLETE state.
4012    // If the penultimate block is not COMPLETE, then it must be COMMITTED.
4013    if(nrCompleteBlocks < nrBlocks - 2 ||
4014       nrCompleteBlocks == nrBlocks - 2 &&
4015         curBlock != null &&
4016         curBlock.getBlockUCState() != BlockUCState.COMMITTED) {
4017      final String message = "DIR* NameSystem.internalReleaseLease: "
4018        + "attempt to release a create lock on "
4019        + src + " but file is already closed.";
4020      NameNode.stateChangeLog.warn(message);
4021      throw new IOException(message);
4022    }
4023
4024    // The last block is not COMPLETE, and
4025    // that the penultimate block if exists is either COMPLETE or COMMITTED
4026    final BlockInfoContiguous lastBlock = pendingFile.getLastBlock();
4027    BlockUCState lastBlockState = lastBlock.getBlockUCState();
4028    BlockInfoContiguous penultimateBlock = pendingFile.getPenultimateBlock();
4029
4030    // If penultimate block doesn't exist then its minReplication is met
4031    boolean penultimateBlockMinReplication = penultimateBlock == null ? true :
4032        blockManager.checkMinReplication(penultimateBlock);
4033
4034    switch(lastBlockState) {
4035    case COMPLETE:
4036      assert false : "Already checked that the last block is incomplete";
4037      break;
4038    case COMMITTED:
4039      // Close file if committed blocks are minimally replicated
4040      if(penultimateBlockMinReplication &&
4041          blockManager.checkMinReplication(lastBlock)) {
4042        finalizeINodeFileUnderConstruction(src, pendingFile,
4043            iip.getLatestSnapshotId());
4044        NameNode.stateChangeLog.warn("BLOCK*"
4045          + " internalReleaseLease: Committed blocks are minimally replicated,"
4046          + " lease removed, file closed.");
4047        return true;  // closed!
4048      }
4049      // Cannot close file right now, since some blocks 
4050      // are not yet minimally replicated.
4051      // This may potentially cause infinite loop in lease recovery
4052      // if there are no valid replicas on data-nodes.
4053      String message = "DIR* NameSystem.internalReleaseLease: " +
4054          "Failed to release lease for file " + src +
4055          ". Committed blocks are waiting to be minimally replicated." +
4056          " Try again later.";
4057      NameNode.stateChangeLog.warn(message);
4058      throw new AlreadyBeingCreatedException(message);
4059    case UNDER_CONSTRUCTION:
4060    case UNDER_RECOVERY:
4061      final BlockInfoContiguousUnderConstruction uc = (BlockInfoContiguousUnderConstruction)lastBlock;
4062      // determine if last block was intended to be truncated
4063      Block recoveryBlock = uc.getTruncateBlock();
4064      boolean truncateRecovery = recoveryBlock != null;
4065      boolean copyOnTruncate = truncateRecovery &&
4066          recoveryBlock.getBlockId() != uc.getBlockId();
4067      assert !copyOnTruncate ||
4068          recoveryBlock.getBlockId() < uc.getBlockId() &&
4069          recoveryBlock.getGenerationStamp() < uc.getGenerationStamp() &&
4070          recoveryBlock.getNumBytes() > uc.getNumBytes() :
4071            "wrong recoveryBlock";
4072
4073      // setup the last block locations from the blockManager if not known
4074      if (uc.getNumExpectedLocations() == 0) {
4075        uc.setExpectedLocations(blockManager.getStorages(lastBlock));
4076      }
4077
4078      if (uc.getNumExpectedLocations() == 0 && uc.getNumBytes() == 0) {
4079        // There is no datanode reported to this block.
4080        // may be client have crashed before writing data to pipeline.
4081        // This blocks doesn't need any recovery.
4082        // We can remove this block and close the file.
4083        pendingFile.removeLastBlock(lastBlock);
4084        finalizeINodeFileUnderConstruction(src, pendingFile,
4085            iip.getLatestSnapshotId());
4086        NameNode.stateChangeLog.warn("BLOCK* internalReleaseLease: "
4087            + "Removed empty last block and closed file.");
4088        return true;
4089      }
4090      // start recovery of the last block for this file
4091      long blockRecoveryId = nextGenerationStamp(blockIdManager.isLegacyBlock(uc));
4092      lease = reassignLease(lease, src, recoveryLeaseHolder, pendingFile);
4093      if(copyOnTruncate) {
4094        uc.setGenerationStamp(blockRecoveryId);
4095      } else if(truncateRecovery) {
4096        recoveryBlock.setGenerationStamp(blockRecoveryId);
4097      }
4098      uc.initializeBlockRecovery(blockRecoveryId);
4099      leaseManager.renewLease(lease);
4100      // Cannot close file right now, since the last block requires recovery.
4101      // This may potentially cause infinite loop in lease recovery
4102      // if there are no valid replicas on data-nodes.
4103      NameNode.stateChangeLog.warn(
4104                "DIR* NameSystem.internalReleaseLease: " +
4105                "File " + src + " has not been closed." +
4106               " Lease recovery is in progress. " +
4107                "RecoveryId = " + blockRecoveryId + " for block " + lastBlock);
4108      break;
4109    }
4110    return false;
4111  }
4112
4113  private Lease reassignLease(Lease lease, String src, String newHolder,
4114      INodeFile pendingFile) {
4115    assert hasWriteLock();
4116    if(newHolder == null)
4117      return lease;
4118    // The following transaction is not synced. Make sure it's sync'ed later.
4119    logReassignLease(lease.getHolder(), src, newHolder);
4120    return reassignLeaseInternal(lease, src, newHolder, pendingFile);
4121  }
4122  
4123  Lease reassignLeaseInternal(Lease lease, String src, String newHolder,
4124      INodeFile pendingFile) {
4125    assert hasWriteLock();
4126    pendingFile.getFileUnderConstructionFeature().setClientName(newHolder);
4127    return leaseManager.reassignLease(lease, src, newHolder);
4128  }
4129
4130  private void commitOrCompleteLastBlock(final INodeFile fileINode,
4131      final INodesInPath iip, final Block commitBlock) throws IOException {
4132    assert hasWriteLock();
4133    Preconditions.checkArgument(fileINode.isUnderConstruction());
4134    if (!blockManager.commitOrCompleteLastBlock(fileINode, commitBlock)) {
4135      return;
4136    }
4137
4138    // Adjust disk space consumption if required
4139    final long diff = fileINode.getPreferredBlockSize() - commitBlock.getNumBytes();    
4140    if (diff > 0) {
4141      try {
4142        dir.updateSpaceConsumed(iip, 0, -diff, fileINode.getFileReplication());
4143      } catch (IOException e) {
4144        LOG.warn("Unexpected exception while updating disk space.", e);
4145      }
4146    }
4147  }
4148
4149  private void finalizeINodeFileUnderConstruction(String src,
4150      INodeFile pendingFile, int latestSnapshot) throws IOException {
4151    assert hasWriteLock();
4152
4153    FileUnderConstructionFeature uc = pendingFile.getFileUnderConstructionFeature();
4154    Preconditions.checkArgument(uc != null);
4155    leaseManager.removeLease(uc.getClientName(), src);
4156    
4157    pendingFile.recordModification(latestSnapshot);
4158
4159    // The file is no longer pending.
4160    // Create permanent INode, update blocks. No need to replace the inode here
4161    // since we just remove the uc feature from pendingFile
4162    pendingFile.toCompleteFile(now());
4163
4164    waitForLoadingFSImage();
4165    // close file and persist block allocations for this file
4166    closeFile(src, pendingFile);
4167
4168    blockManager.checkReplication(pendingFile);
4169  }
4170
4171  @VisibleForTesting
4172  BlockInfoContiguous getStoredBlock(Block block) {
4173    return blockManager.getStoredBlock(block);
4174  }
4175  
4176  @Override
4177  public boolean isInSnapshot(BlockInfoContiguousUnderConstruction blockUC) {
4178    assert hasReadLock();
4179    final BlockCollection bc = blockUC.getBlockCollection();
4180    if (bc == null || !(bc instanceof INodeFile)
4181        || !bc.isUnderConstruction()) {
4182      return false;
4183    }
4184
4185    String fullName = bc.getName();
4186    try {
4187      if (fullName != null && fullName.startsWith(Path.SEPARATOR)
4188          && dir.getINode(fullName) == bc) {
4189        // If file exists in normal path then no need to look in snapshot
4190        return false;
4191      }
4192    } catch (UnresolvedLinkException e) {
4193      LOG.error("Error while resolving the link : " + fullName, e);
4194      return false;
4195    }
4196    /*
4197     * 1. if bc is under construction and also with snapshot, and
4198     * bc is not in the current fsdirectory tree, bc must represent a snapshot
4199     * file. 
4200     * 2. if fullName is not an absolute path, bc cannot be existent in the 
4201     * current fsdirectory tree. 
4202     * 3. if bc is not the current node associated with fullName, bc must be a
4203     * snapshot inode.
4204     */
4205    return true;
4206  }
4207
  /**
   * Called by a datanode to commit the result of a block recovery:
   * update the recovered block's generation stamp and length, optionally
   * record the surviving replica locations, and optionally close the file.
   * No-ops safely on retries where the block has already been deleted.
   */
  void commitBlockSynchronization(ExtendedBlock oldBlock,
      long newgenerationstamp, long newlength,
      boolean closeFile, boolean deleteblock, DatanodeID[] newtargets,
      String[] newtargetstorages) throws IOException {
    LOG.info("commitBlockSynchronization(oldBlock=" + oldBlock
             + ", newgenerationstamp=" + newgenerationstamp
             + ", newlength=" + newlength
             + ", newtargets=" + Arrays.asList(newtargets)
             + ", closeFile=" + closeFile
             + ", deleteBlock=" + deleteblock
             + ")");
    checkOperation(OperationCategory.WRITE);
    String src = "";
    waitForLoadingFSImage();
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      // If a DN tries to commit to the standby, the recovery will
      // fail, and the next retry will succeed on the new NN.
  
      checkNameNodeSafeMode(
          "Cannot commitBlockSynchronization while in safe mode");
      final BlockInfoContiguous storedBlock = getStoredBlock(
          ExtendedBlock.getLocalBlock(oldBlock));
      if (storedBlock == null) {
        if (deleteblock) {
          // This may be a retry attempt so ignore the failure
          // to locate the block.
          if (LOG.isDebugEnabled()) {
            LOG.debug("Block (=" + oldBlock + ") not found");
          }
          return;
        } else {
          throw new IOException("Block (=" + oldBlock + ") not found");
        }
      }
      // Remember the pre-recovery identity of the block so stale replicas
      // can be marked corrupt after the update (see closeFile path below).
      final long oldGenerationStamp = storedBlock.getGenerationStamp();
      final long oldNumBytes = storedBlock.getNumBytes();
      //
      // The implementation of delete operation (see @deleteInternal method)
      // first removes the file paths from namespace, and delays the removal
      // of blocks to later time for better performance. When
      // commitBlockSynchronization (this method) is called in between, the
      // blockCollection of storedBlock could have been assigned to null by
      // the delete operation, throw IOException here instead of NPE; if the
      // file path is already removed from namespace by the delete operation,
      // throw FileNotFoundException here, so not to proceed to the end of
      // this method to add a CloseOp to the edit log for an already deleted
      // file (See HDFS-6825).
      //
      BlockCollection blockCollection = storedBlock.getBlockCollection();
      if (blockCollection == null) {
        throw new IOException("The blockCollection of " + storedBlock
            + " is null, likely because the file owning this block was"
            + " deleted and the block removal is delayed");
      }
      INodeFile iFile = ((INode)blockCollection).asFile();
      if (isFileDeleted(iFile)) {
        throw new FileNotFoundException("File not found: "
            + iFile.getFullPathName() + ", likely due to delayed block"
            + " removal");
      }
      // A completed last block means a previous commit already closed the
      // file; treat this call as a duplicate and return quietly.
      if ((!iFile.isUnderConstruction() || storedBlock.isComplete()) &&
          iFile.getLastBlock().isComplete()) {
        if (LOG.isDebugEnabled()) {
          LOG.debug("Unexpected block (=" + oldBlock
                    + ") since the file (=" + iFile.getLocalName()
                    + ") is not under construction");
        }
        return;
      }

      BlockInfoContiguousUnderConstruction truncatedBlock =
          (BlockInfoContiguousUnderConstruction) iFile.getLastBlock();
      long recoveryId = truncatedBlock.getBlockRecoveryId();
      // copy-truncate recovery targets a different block id than the one
      // stored in the blocks map.
      boolean copyTruncate =
          truncatedBlock.getBlockId() != storedBlock.getBlockId();
      // Reject a commit from a stale recovery attempt.
      if(recoveryId != newgenerationstamp) {
        throw new IOException("The recovery id " + newgenerationstamp
                              + " does not match current recovery id "
                              + recoveryId + " for block " + oldBlock);
      }

      if (deleteblock) {
        Block blockToDel = ExtendedBlock.getLocalBlock(oldBlock);
        boolean remove = iFile.removeLastBlock(blockToDel);
        if (remove) {
          blockManager.removeBlock(storedBlock);
        }
      }
      else {
        // update last block
        if(!copyTruncate) {
          storedBlock.setGenerationStamp(newgenerationstamp);
          storedBlock.setNumBytes(newlength);
        }

        // find the DatanodeDescriptor objects
        ArrayList<DatanodeDescriptor> trimmedTargets =
            new ArrayList<DatanodeDescriptor>(newtargets.length);
        ArrayList<String> trimmedStorages =
            new ArrayList<String>(newtargets.length);
        if (newtargets.length > 0) {
          for (int i = 0; i < newtargets.length; ++i) {
            // try to get targetNode
            DatanodeDescriptor targetNode =
                blockManager.getDatanodeManager().getDatanode(newtargets[i]);
            if (targetNode != null) {
              trimmedTargets.add(targetNode);
              trimmedStorages.add(newtargetstorages[i]);
            } else if (LOG.isDebugEnabled()) {
              LOG.debug("DatanodeDescriptor (=" + newtargets[i] + ") not found");
            }
          }
        }
        if ((closeFile) && !trimmedTargets.isEmpty()) {
          // the file is getting closed. Insert block locations into blockManager.
          // Otherwise fsck will report these blocks as MISSING, especially if the
          // blocksReceived from Datanodes take a long time to arrive.
          for (int i = 0; i < trimmedTargets.size(); i++) {
            DatanodeStorageInfo storageInfo =
                trimmedTargets.get(i).getStorageInfo(trimmedStorages.get(i));
            if (storageInfo != null) {
              if(copyTruncate) {
                storageInfo.addBlock(truncatedBlock);
              } else {
                storageInfo.addBlock(storedBlock);
              }
            }
          }
        }

        // add pipeline locations into the INodeUnderConstruction
        DatanodeStorageInfo[] trimmedStorageInfos =
            blockManager.getDatanodeManager().getDatanodeStorageInfos(
                trimmedTargets.toArray(new DatanodeID[trimmedTargets.size()]),
                trimmedStorages.toArray(new String[trimmedStorages.size()]));
        if(copyTruncate) {
          iFile.setLastBlock(truncatedBlock, trimmedStorageInfos);
        } else {
          iFile.setLastBlock(storedBlock, trimmedStorageInfos);
          if (closeFile) {
            // Replicas still carrying the old genstamp/length are stale.
            blockManager.markBlockReplicasAsCorrupt(storedBlock,
                oldGenerationStamp, oldNumBytes, trimmedStorageInfos);
          }
        }
      }

      if (closeFile) {
        if(copyTruncate) {
          src = closeFileCommitBlocks(iFile, truncatedBlock);
          if(!iFile.isBlockInLatestSnapshot(storedBlock)) {
            blockManager.removeBlock(storedBlock);
          }
        } else {
          src = closeFileCommitBlocks(iFile, storedBlock);
        }
      } else {
        // If this commit does not want to close the file, persist blocks
        src = iFile.getFullPathName();
        persistBlocks(src, iFile, false);
      }
    } finally {
      writeUnlock();
    }
    // Sync the edit log outside the write lock.
    getEditLog().logSync();
    if (closeFile) {
      LOG.info("commitBlockSynchronization(oldBlock=" + oldBlock
          + ", file=" + src
          + ", newgenerationstamp=" + newgenerationstamp
          + ", newlength=" + newlength
          + ", newtargets=" + Arrays.asList(newtargets) + ") successful");
    } else {
      LOG.info("commitBlockSynchronization(" + oldBlock + ") successful");
    }
  }
4384
4385  /**
4386   * @param pendingFile open file that needs to be closed
4387   * @param storedBlock last block
4388   * @return Path of the file that was closed.
4389   * @throws IOException on error
4390   */
4391  @VisibleForTesting
4392  String closeFileCommitBlocks(INodeFile pendingFile, BlockInfoContiguous storedBlock)
4393      throws IOException {
4394    final INodesInPath iip = INodesInPath.fromINode(pendingFile);
4395    final String src = iip.getPath();
4396
4397    // commit the last block and complete it if it has minimum replicas
4398    commitOrCompleteLastBlock(pendingFile, iip, storedBlock);
4399
4400    //remove lease, close file
4401    finalizeINodeFileUnderConstruction(src, pendingFile,
4402        Snapshot.findLatestSnapshot(pendingFile, Snapshot.CURRENT_STATE_ID));
4403
4404    return src;
4405  }
4406
4407  /**
4408   * Renew the lease(s) held by the given client
4409   */
4410  void renewLease(String holder) throws IOException {
4411    checkOperation(OperationCategory.WRITE);
4412    readLock();
4413    try {
4414      checkOperation(OperationCategory.WRITE);
4415      checkNameNodeSafeMode("Cannot renew lease for " + holder);
4416      leaseManager.renewLease(holder);
4417    } finally {
4418      readUnlock();
4419    }
4420  }
4421
4422  /**
4423   * Get a partial listing of the indicated directory
4424   *
4425   * @param src the directory name
4426   * @param startAfter the name to start after
4427   * @param needLocation if blockLocations need to be returned
4428   * @return a partial listing starting after startAfter
4429   * 
4430   * @throws AccessControlException if access is denied
4431   * @throws UnresolvedLinkException if symbolic link is encountered
4432   * @throws IOException if other I/O error occurred
4433   */
4434  DirectoryListing getListing(String src, byte[] startAfter,
4435      boolean needLocation) 
4436      throws IOException {
4437    checkOperation(OperationCategory.READ);
4438    DirectoryListing dl = null;
4439    readLock();
4440    try {
4441      checkOperation(NameNode.OperationCategory.READ);
4442      dl = FSDirStatAndListingOp.getListingInt(dir, src, startAfter,
4443          needLocation);
4444    } catch (AccessControlException e) {
4445      logAuditEvent(false, "listStatus", src);
4446      throw e;
4447    } finally {
4448      readUnlock();
4449    }
4450    logAuditEvent(true, "listStatus", src);
4451    return dl;
4452  }
4453
4454  /////////////////////////////////////////////////////////
4455  //
4456  // These methods are called by datanodes
4457  //
4458  /////////////////////////////////////////////////////////
4459  /**
4460   * Register Datanode.
4461   * <p>
4462   * The purpose of registration is to identify whether the new datanode
4463   * serves a new data storage, and will report new data block copies,
4464   * which the namenode was not aware of; or the datanode is a replacement
4465   * node for the data storage that was previously served by a different
4466   * or the same (in terms of host:port) datanode.
4467   * The data storages are distinguished by their storageIDs. When a new
4468   * data storage is reported the namenode issues a new unique storageID.
4469   * <p>
4470   * Finally, the namenode returns its namespaceID as the registrationID
4471   * for the datanodes. 
4472   * namespaceID is a persistent attribute of the name space.
4473   * The registrationID is checked every time the datanode is communicating
4474   * with the namenode. 
4475   * Datanodes with inappropriate registrationID are rejected.
4476   * If the namenode stops, and then restarts it can restore its 
4477   * namespaceID and will continue serving the datanodes that has previously
4478   * registered with the namenode without restarting the whole cluster.
4479   * 
4480   * @see org.apache.hadoop.hdfs.server.datanode.DataNode
4481   */
4482  void registerDatanode(DatanodeRegistration nodeReg) throws IOException {
4483    writeLock();
4484    try {
4485      getBlockManager().getDatanodeManager().registerDatanode(nodeReg);
4486      checkSafeMode();
4487    } finally {
4488      writeUnlock();
4489    }
4490  }
4491  
4492  /**
4493   * Get registrationID for datanodes based on the namespaceID.
4494   * 
4495   * @see #registerDatanode(DatanodeRegistration)
4496   * @return registration ID
4497   */
4498  String getRegistrationID() {
4499    return Storage.getRegistrationID(getFSImage().getStorage());
4500  }
4501
4502  /**
4503   * The given node has reported in.  This method should:
4504   * 1) Record the heartbeat, so the datanode isn't timed out
4505   * 2) Adjust usage stats for future block allocation
4506   * 
4507   * If a substantial amount of time passed since the last datanode 
4508   * heartbeat then request an immediate block report.  
4509   * 
4510   * @return an array of datanode commands 
4511   * @throws IOException
4512   */
4513  HeartbeatResponse handleHeartbeat(DatanodeRegistration nodeReg,
4514      StorageReport[] reports, long cacheCapacity, long cacheUsed,
4515      int xceiverCount, int xmitsInProgress, int failedVolumes,
4516      VolumeFailureSummary volumeFailureSummary) throws IOException {
4517    readLock();
4518    try {
4519      //get datanode commands
4520      final int maxTransfer = blockManager.getMaxReplicationStreams()
4521          - xmitsInProgress;
4522      DatanodeCommand[] cmds = blockManager.getDatanodeManager().handleHeartbeat(
4523          nodeReg, reports, blockPoolId, cacheCapacity, cacheUsed,
4524          xceiverCount, maxTransfer, failedVolumes, volumeFailureSummary);
4525      
4526      //create ha status
4527      final NNHAStatusHeartbeat haState = new NNHAStatusHeartbeat(
4528          haContext.getState().getServiceState(),
4529          getFSImage().getLastAppliedOrWrittenTxId());
4530
4531      return new HeartbeatResponse(cmds, haState, rollingUpgradeInfo);
4532    } finally {
4533      readUnlock();
4534    }
4535  }
4536
4537  /**
4538   * Returns whether or not there were available resources at the last check of
4539   * resources.
4540   *
4541   * @return true if there were sufficient resources available, false otherwise.
4542   */
4543  boolean nameNodeHasResourcesAvailable() {
4544    return hasResourcesAvailable;
4545  }
4546
4547  /**
4548   * Perform resource checks and cache the results.
4549   */
4550  void checkAvailableResources() {
4551    Preconditions.checkState(nnResourceChecker != null,
4552        "nnResourceChecker not initialized");
4553    hasResourcesAvailable = nnResourceChecker.hasAvailableDiskSpace();
4554  }
4555
4556  /**
4557   * Persist the block list for the inode.
4558   * @param path
4559   * @param file
4560   * @param logRetryCache
4561   */
4562  private void persistBlocks(String path, INodeFile file,
4563                             boolean logRetryCache) {
4564    assert hasWriteLock();
4565    Preconditions.checkArgument(file.isUnderConstruction());
4566    getEditLog().logUpdateBlocks(path, file, logRetryCache);
4567    if(NameNode.stateChangeLog.isDebugEnabled()) {
4568      NameNode.stateChangeLog.debug("persistBlocks: " + path
4569              + " with " + file.getBlocks().length + " blocks is persisted to" +
4570              " the file system");
4571    }
4572  }
4573
4574  /**
4575   * Close file.
4576   * @param path
4577   * @param file
4578   */
4579  private void closeFile(String path, INodeFile file) {
4580    assert hasWriteLock();
4581    waitForLoadingFSImage();
4582    // file is closed
4583    getEditLog().logCloseFile(path, file);
4584    if (NameNode.stateChangeLog.isDebugEnabled()) {
4585      NameNode.stateChangeLog.debug("closeFile: "
4586              +path+" with "+ file.getBlocks().length
4587              +" blocks is persisted to the file system");
4588    }
4589  }
4590
4591  /**
4592   * Periodically calls hasAvailableResources of NameNodeResourceChecker, and if
4593   * there are found to be insufficient resources available, causes the NN to
4594   * enter safe mode. If resources are later found to have returned to
4595   * acceptable levels, this daemon will cause the NN to exit safe mode.
4596   */
4597  class NameNodeResourceMonitor implements Runnable  {
4598    boolean shouldNNRmRun = true;
4599    @Override
4600    public void run () {
4601      try {
4602        while (fsRunning && shouldNNRmRun) {
4603          checkAvailableResources();
4604          if(!nameNodeHasResourcesAvailable()) {
4605            String lowResourcesMsg = "NameNode low on available disk space. ";
4606            if (!isInSafeMode()) {
4607              LOG.warn(lowResourcesMsg + "Entering safe mode.");
4608            } else {
4609              LOG.warn(lowResourcesMsg + "Already in safe mode.");
4610            }
4611            enterSafeMode(true);
4612          }
4613          try {
4614            Thread.sleep(resourceRecheckInterval);
4615          } catch (InterruptedException ie) {
4616            // Deliberately ignore
4617          }
4618        }
4619      } catch (Exception e) {
4620        FSNamesystem.LOG.error("Exception in NameNodeResourceMonitor: ", e);
4621      }
4622    }
4623
4624    public void stopMonitor() {
4625      shouldNNRmRun = false;
4626    }
4627 }
4628
4629  class NameNodeEditLogRoller implements Runnable {
4630
4631    private boolean shouldRun = true;
4632    private final long rollThreshold;
4633    private final long sleepIntervalMs;
4634
4635    public NameNodeEditLogRoller(long rollThreshold, int sleepIntervalMs) {
4636        this.rollThreshold = rollThreshold;
4637        this.sleepIntervalMs = sleepIntervalMs;
4638    }
4639
4640    @Override
4641    public void run() {
4642      while (fsRunning && shouldRun) {
4643        try {
4644          FSEditLog editLog = getFSImage().getEditLog();
4645          long numEdits =
4646              editLog.getLastWrittenTxId() - editLog.getCurSegmentTxId();
4647          if (numEdits > rollThreshold) {
4648            FSNamesystem.LOG.info("NameNode rolling its own edit log because"
4649                + " number of edits in open segment exceeds threshold of "
4650                + rollThreshold);
4651            rollEditLog();
4652          }
4653        } catch (Exception e) {
4654          FSNamesystem.LOG.error("Swallowing exception in "
4655              + NameNodeEditLogRoller.class.getSimpleName() + ":", e);
4656        }
4657        try {
4658          Thread.sleep(sleepIntervalMs);
4659        } catch (InterruptedException e) {
4660          FSNamesystem.LOG.info(NameNodeEditLogRoller.class.getSimpleName()
4661              + " was interrupted, exiting");
4662          break;
4663        }
4664      }
4665    }
4666
4667    public void stop() {
4668      shouldRun = false;
4669    }
4670  }
4671
4672  /**
4673   * Daemon to periodically scan the namespace for lazyPersist files
4674   * with missing blocks and unlink them.
4675   */
4676  class LazyPersistFileScrubber implements Runnable {
4677    private volatile boolean shouldRun = true;
4678    final int scrubIntervalSec;
4679    public LazyPersistFileScrubber(final int scrubIntervalSec) {
4680      this.scrubIntervalSec = scrubIntervalSec;
4681    }
4682
4683    /**
4684     * Periodically go over the list of lazyPersist files with missing
4685     * blocks and unlink them from the namespace.
4686     */
4687    private void clearCorruptLazyPersistFiles()
4688        throws SafeModeException, AccessControlException,
4689        UnresolvedLinkException, IOException {
4690
4691      BlockStoragePolicy lpPolicy = blockManager.getStoragePolicy("LAZY_PERSIST");
4692
4693      List<BlockCollection> filesToDelete = new ArrayList<BlockCollection>();
4694
4695      writeLock();
4696
4697      try {
4698        final Iterator<Block> it = blockManager.getCorruptReplicaBlockIterator();
4699
4700        while (it.hasNext()) {
4701          Block b = it.next();
4702          BlockInfoContiguous blockInfo = blockManager.getStoredBlock(b);
4703          if (blockInfo.getBlockCollection().getStoragePolicyID() == lpPolicy.getId()) {
4704            filesToDelete.add(blockInfo.getBlockCollection());
4705          }
4706        }
4707
4708        for (BlockCollection bc : filesToDelete) {
4709          LOG.warn("Removing lazyPersist file " + bc.getName() + " with no replicas.");
4710          BlocksMapUpdateInfo toRemoveBlocks =
4711          FSDirDeleteOp.deleteInternal(
4712              FSNamesystem.this, bc.getName(),
4713              INodesInPath.fromINode((INodeFile) bc), false);
4714          if (toRemoveBlocks != null) {
4715            removeBlocks(toRemoveBlocks); // Incremental deletion of blocks
4716          }
4717        }
4718      } finally {
4719        writeUnlock();
4720      }
4721    }
4722
4723    @Override
4724    public void run() {
4725      while (fsRunning && shouldRun) {
4726        try {
4727          clearCorruptLazyPersistFiles();
4728          Thread.sleep(scrubIntervalSec * 1000);
4729        } catch (InterruptedException e) {
4730          FSNamesystem.LOG.info(
4731              "LazyPersistFileScrubber was interrupted, exiting");
4732          break;
4733        } catch (Exception e) {
4734          FSNamesystem.LOG.error(
4735              "Ignoring exception in LazyPersistFileScrubber:", e);
4736        }
4737      }
4738    }
4739
4740    public void stop() {
4741      shouldRun = false;
4742    }
4743  }
4744
4745  public FSImage getFSImage() {
4746    return fsImage;
4747  }
4748
4749  public FSEditLog getEditLog() {
4750    return getFSImage().getEditLog();
4751  }    
4752
4753  private void checkBlock(ExtendedBlock block) throws IOException {
4754    if (block != null && !this.blockPoolId.equals(block.getBlockPoolId())) {
4755      throw new IOException("Unexpected BlockPoolId " + block.getBlockPoolId()
4756          + " - expected " + blockPoolId);
4757    }
4758  }
4759
4760  @Metric({"MissingBlocks", "Number of missing blocks"})
4761  public long getMissingBlocksCount() {
4762    // not locking
4763    return blockManager.getMissingBlocksCount();
4764  }
4765
4766  @Metric({"MissingReplOneBlocks", "Number of missing blocks " +
4767      "with replication factor 1"})
4768  public long getMissingReplOneBlocksCount() {
4769    // not locking
4770    return blockManager.getMissingReplOneBlocksCount();
4771  }
4772  
4773  @Metric({"ExpiredHeartbeats", "Number of expired heartbeats"})
4774  public int getExpiredHeartbeats() {
4775    return datanodeStatistics.getExpiredHeartbeats();
4776  }
4777  
4778  @Metric({"TransactionsSinceLastCheckpoint",
4779      "Number of transactions since last checkpoint"})
4780  public long getTransactionsSinceLastCheckpoint() {
4781    return getEditLog().getLastWrittenTxId() -
4782        getFSImage().getStorage().getMostRecentCheckpointTxId();
4783  }
4784  
4785  @Metric({"TransactionsSinceLastLogRoll",
4786      "Number of transactions since last edit log roll"})
4787  public long getTransactionsSinceLastLogRoll() {
4788    if (isInStandbyState() || !getEditLog().isSegmentOpen()) {
4789      return 0;
4790    } else {
4791      return getEditLog().getLastWrittenTxId() -
4792        getEditLog().getCurSegmentTxId() + 1;
4793    }
4794  }
4795  
4796  @Metric({"LastWrittenTransactionId", "Transaction ID written to the edit log"})
4797  public long getLastWrittenTransactionId() {
4798    return getEditLog().getLastWrittenTxId();
4799  }
4800  
4801  @Metric({"LastCheckpointTime",
4802      "Time in milliseconds since the epoch of the last checkpoint"})
4803  public long getLastCheckpointTime() {
4804    return getFSImage().getStorage().getMostRecentCheckpointTime();
4805  }
4806
4807  /** @see ClientProtocol#getStats() */
4808  long[] getStats() {
4809    final long[] stats = datanodeStatistics.getStats();
4810    stats[ClientProtocol.GET_STATS_UNDER_REPLICATED_IDX] = getUnderReplicatedBlocks();
4811    stats[ClientProtocol.GET_STATS_CORRUPT_BLOCKS_IDX] = getCorruptReplicaBlocks();
4812    stats[ClientProtocol.GET_STATS_MISSING_BLOCKS_IDX] = getMissingBlocksCount();
4813    stats[ClientProtocol.GET_STATS_MISSING_REPL_ONE_BLOCKS_IDX] =
4814        getMissingReplOneBlocksCount();
4815    return stats;
4816  }
4817
4818  @Override // FSNamesystemMBean
4819  @Metric({"CapacityTotal",
4820      "Total raw capacity of data nodes in bytes"})
4821  public long getCapacityTotal() {
4822    return datanodeStatistics.getCapacityTotal();
4823  }
4824
4825  @Metric({"CapacityTotalGB",
4826      "Total raw capacity of data nodes in GB"})
4827  public float getCapacityTotalGB() {
4828    return DFSUtil.roundBytesToGB(getCapacityTotal());
4829  }
4830
4831  @Override // FSNamesystemMBean
4832  @Metric({"CapacityUsed",
4833      "Total used capacity across all data nodes in bytes"})
4834  public long getCapacityUsed() {
4835    return datanodeStatistics.getCapacityUsed();
4836  }
4837
4838  @Metric({"CapacityUsedGB",
4839      "Total used capacity across all data nodes in GB"})
4840  public float getCapacityUsedGB() {
4841    return DFSUtil.roundBytesToGB(getCapacityUsed());
4842  }
4843
4844  @Override // FSNamesystemMBean
4845  @Metric({"CapacityRemaining", "Remaining capacity in bytes"})
4846  public long getCapacityRemaining() {
4847    return datanodeStatistics.getCapacityRemaining();
4848  }
4849
4850  @Metric({"CapacityRemainingGB", "Remaining capacity in GB"})
4851  public float getCapacityRemainingGB() {
4852    return DFSUtil.roundBytesToGB(getCapacityRemaining());
4853  }
4854
4855  @Metric({"CapacityUsedNonDFS",
4856      "Total space used by data nodes for non DFS purposes in bytes"})
4857  public long getCapacityUsedNonDFS() {
4858    return datanodeStatistics.getCapacityUsedNonDFS();
4859  }
4860
4861  /**
4862   * Total number of connections.
4863   */
4864  @Override // FSNamesystemMBean
4865  @Metric
4866  public int getTotalLoad() {
4867    return datanodeStatistics.getXceiverCount();
4868  }
4869  
4870  @Metric({ "SnapshottableDirectories", "Number of snapshottable directories" })
4871  public int getNumSnapshottableDirs() {
4872    return this.snapshotManager.getNumSnapshottableDirs();
4873  }
4874
4875  @Metric({ "Snapshots", "The number of snapshots" })
4876  public int getNumSnapshots() {
4877    return this.snapshotManager.getNumSnapshots();
4878  }
4879
4880  @Override
4881  public String getSnapshotStats() {
4882    Map<String, Object> info = new HashMap<String, Object>();
4883    info.put("SnapshottableDirectories", this.getNumSnapshottableDirs());
4884    info.put("Snapshots", this.getNumSnapshots());
4885    return JSON.toString(info);
4886  }
4887
4888  int getNumberOfDatanodes(DatanodeReportType type) {
4889    readLock();
4890    try {
4891      return getBlockManager().getDatanodeManager().getDatanodeListForReport(
4892          type).size(); 
4893    } finally {
4894      readUnlock();
4895    }
4896  }
4897
4898  DatanodeInfo[] datanodeReport(final DatanodeReportType type
4899      ) throws AccessControlException, StandbyException {
4900    checkSuperuserPrivilege();
4901    checkOperation(OperationCategory.UNCHECKED);
4902    readLock();
4903    try {
4904      checkOperation(OperationCategory.UNCHECKED);
4905      final DatanodeManager dm = getBlockManager().getDatanodeManager();      
4906      final List<DatanodeDescriptor> results = dm.getDatanodeListForReport(type);
4907
4908      DatanodeInfo[] arr = new DatanodeInfo[results.size()];
4909      for (int i=0; i<arr.length; i++) {
4910        arr[i] = new DatanodeInfo(results.get(i));
4911      }
4912      return arr;
4913    } finally {
4914      readUnlock();
4915    }
4916  }
4917
4918  DatanodeStorageReport[] getDatanodeStorageReport(final DatanodeReportType type
4919      ) throws AccessControlException, StandbyException {
4920    checkSuperuserPrivilege();
4921    checkOperation(OperationCategory.UNCHECKED);
4922    readLock();
4923    try {
4924      checkOperation(OperationCategory.UNCHECKED);
4925      final DatanodeManager dm = getBlockManager().getDatanodeManager();      
4926      final List<DatanodeDescriptor> datanodes = dm.getDatanodeListForReport(type);
4927
4928      DatanodeStorageReport[] reports = new DatanodeStorageReport[datanodes.size()];
4929      for (int i = 0; i < reports.length; i++) {
4930        final DatanodeDescriptor d = datanodes.get(i);
4931        reports[i] = new DatanodeStorageReport(new DatanodeInfo(d),
4932            d.getStorageReports());
4933      }
4934      return reports;
4935    } finally {
4936      readUnlock();
4937    }
4938  }
4939
4940  /**
4941   * Save namespace image.
4942   * This will save current namespace into fsimage file and empty edits file.
4943   * Requires superuser privilege and safe mode.
4944   * 
4945   * @throws AccessControlException if superuser privilege is violated.
4946   * @throws IOException if 
4947   */
4948  void saveNamespace() throws AccessControlException, IOException {
4949    checkOperation(OperationCategory.UNCHECKED);
4950    checkSuperuserPrivilege();
4951
4952    cpLock();  // Block if a checkpointing is in progress on standby.
4953    readLock();
4954    try {
4955      checkOperation(OperationCategory.UNCHECKED);
4956
4957      if (!isInSafeMode()) {
4958        throw new IOException("Safe mode should be turned ON "
4959            + "in order to create namespace image.");
4960      }
4961      getFSImage().saveNamespace(this);
4962    } finally {
4963      readUnlock();
4964      cpUnlock();
4965    }
4966    LOG.info("New namespace image has been created");
4967  }
4968  
4969  /**
4970   * Enables/Disables/Checks restoring failed storage replicas if the storage becomes available again.
4971   * Requires superuser privilege.
4972   * 
4973   * @throws AccessControlException if superuser privilege is violated.
4974   */
4975  boolean restoreFailedStorage(String arg) throws AccessControlException,
4976      StandbyException {
4977    checkSuperuserPrivilege();
4978    checkOperation(OperationCategory.UNCHECKED);
4979    cpLock();  // Block if a checkpointing is in progress on standby.
4980    writeLock();
4981    try {
4982      checkOperation(OperationCategory.UNCHECKED);
4983      
4984      // if it is disabled - enable it and vice versa.
4985      if(arg.equals("check"))
4986        return getFSImage().getStorage().getRestoreFailedStorage();
4987      
4988      boolean val = arg.equals("true");  // false if not
4989      getFSImage().getStorage().setRestoreFailedStorage(val);
4990      
4991      return val;
4992    } finally {
4993      writeUnlock();
4994      cpUnlock();
4995    }
4996  }
4997
4998  Date getStartTime() {
4999    return new Date(startTime); 
5000  }
5001    
  void finalizeUpgrade() throws IOException {
    checkSuperuserPrivilege();
    checkOperation(OperationCategory.UNCHECKED);
    cpLock();  // Block if a checkpointing is in progress on standby.
    writeLock();
    try {
      checkOperation(OperationCategory.UNCHECKED);
      // NOTE(review): the boolean flag appears to tell FSImage whether this
      // is the active NN of an HA pair -- confirm against
      // FSImage.finalizeUpgrade's contract.
      getFSImage().finalizeUpgrade(this.isHaEnabled() && inActiveState());
    } finally {
      writeUnlock();
      cpUnlock();
    }
  }
5015
5016  void refreshNodes() throws IOException {
5017    checkOperation(OperationCategory.UNCHECKED);
5018    checkSuperuserPrivilege();
5019    getBlockManager().getDatanodeManager().refreshNodes(new HdfsConfiguration());
5020  }
5021
5022  void setBalancerBandwidth(long bandwidth) throws IOException {
5023    checkOperation(OperationCategory.UNCHECKED);
5024    checkSuperuserPrivilege();
5025    getBlockManager().getDatanodeManager().setBalancerBandwidth(bandwidth);
5026  }
5027
5028  /**
5029   * Persist the new block (the last block of the given file).
5030   * @param path
5031   * @param file
5032   */
5033  private void persistNewBlock(String path, INodeFile file) {
5034    Preconditions.checkArgument(file.isUnderConstruction());
5035    getEditLog().logAddBlock(path, file);
5036    if (NameNode.stateChangeLog.isDebugEnabled()) {
5037      NameNode.stateChangeLog.debug("persistNewBlock: "
5038              + path + " with new block " + file.getLastBlock().toString()
5039              + ", current total block count is " + file.getBlocks().length);
5040    }
5041  }
5042
5043  /**
5044   * SafeModeInfo contains information related to the safe mode.
5045   * <p>
5046   * An instance of {@link SafeModeInfo} is created when the name node
5047   * enters safe mode.
5048   * <p>
5049   * During name node startup {@link SafeModeInfo} counts the number of
5050   * <em>safe blocks</em>, those that have at least the minimal number of
5051   * replicas, and calculates the ratio of safe blocks to the total number
5052   * of blocks in the system, which is the size of blocks in
5053   * {@link FSNamesystem#blockManager}. When the ratio reaches the
5054   * {@link #threshold} it starts the SafeModeMonitor daemon in order
5055   * to monitor whether the safe mode {@link #extension} is passed.
5056   * Then it leaves safe mode and destroys itself.
5057   * <p>
5058   * If safe mode is turned on manually then the number of safe blocks is
5059   * not tracked because the name node is not intended to leave safe mode
5060   * automatically in the case.
5061   *
5062   * @see ClientProtocol#setSafeMode(HdfsConstants.SafeModeAction, boolean)
5063   */
5064  public class SafeModeInfo {
5065    // configuration fields
5066    /** Safe mode threshold condition %.*/
5067    private final double threshold;
5068    /** Safe mode minimum number of datanodes alive */
5069    private final int datanodeThreshold;
5070    /**
5071     * Safe mode extension after the threshold.
5072     * Make it volatile so that getSafeModeTip can read the latest value
5073     * without taking a lock.
5074     */
5075    private volatile int extension;
5076    /** Min replication required by safe mode. */
5077    private final int safeReplication;
5078    /** threshold for populating needed replication queues */
5079    private final double replQueueThreshold;
5080    // internal fields
5081    /** Time when threshold was reached.
5082     * <br> -1 safe mode is off
5083     * <br> 0 safe mode is on, and threshold is not reached yet
5084     * <br> >0 safe mode is on, but we are in extension period 
5085     */
5086    private long reached = -1;  
5087    private long reachedTimestamp = -1;
5088    /** Total number of blocks. */
5089    int blockTotal; 
5090    /** Number of safe blocks. */
5091    int blockSafe;
5092    /** Number of blocks needed to satisfy safe mode threshold condition */
5093    private int blockThreshold;
5094    /** Number of blocks needed before populating replication queues */
5095    private int blockReplQueueThreshold;
5096    /** time of the last status printout */
5097    private long lastStatusReport = 0;
5098    /**
5099     * Was safemode entered automatically because available resources were low.
5100     * Make it volatile so that getSafeModeTip can read the latest value
5101     * without taking a lock.
5102     */
5103    private volatile boolean resourcesLow = false;
5104    /** Should safemode adjust its block totals as blocks come in */
5105    private boolean shouldIncrementallyTrackBlocks = false;
5106    /** counter for tracking startup progress of reported blocks */
5107    private Counter awaitingReportedBlocksCounter;
5108    
5109    /**
5110     * Creates SafeModeInfo when the name node enters
5111     * automatic safe mode at startup.
5112     *  
5113     * @param conf configuration
5114     */
5115    private SafeModeInfo(Configuration conf) {
5116      this.threshold = conf.getFloat(DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_KEY,
5117          DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_DEFAULT);
5118      if(threshold > 1.0) {
5119        LOG.warn("The threshold value should't be greater than 1, threshold: " + threshold);
5120      }
5121      this.datanodeThreshold = conf.getInt(
5122        DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY,
5123        DFS_NAMENODE_SAFEMODE_MIN_DATANODES_DEFAULT);
5124      this.extension = conf.getInt(DFS_NAMENODE_SAFEMODE_EXTENSION_KEY, 0);
5125      this.safeReplication = conf.getInt(DFS_NAMENODE_REPLICATION_MIN_KEY, 
5126                                         DFS_NAMENODE_REPLICATION_MIN_DEFAULT);
5127      
5128      LOG.info(DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_KEY + " = " + threshold);
5129      LOG.info(DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY + " = " + datanodeThreshold);
5130      LOG.info(DFS_NAMENODE_SAFEMODE_EXTENSION_KEY + "     = " + extension);
5131
5132      // default to safe mode threshold (i.e., don't populate queues before leaving safe mode)
5133      this.replQueueThreshold = 
5134        conf.getFloat(DFS_NAMENODE_REPL_QUEUE_THRESHOLD_PCT_KEY,
5135                      (float) threshold);
5136      this.blockTotal = 0; 
5137      this.blockSafe = 0;
5138    }
5139
5140    /**
5141     * In the HA case, the StandbyNode can be in safemode while the namespace
5142     * is modified by the edit log tailer. In this case, the number of total
5143     * blocks changes as edits are processed (eg blocks are added and deleted).
5144     * However, we don't want to do the incremental tracking during the
5145     * startup-time loading process -- only once the initial total has been
5146     * set after the image has been loaded.
5147     */
5148    private boolean shouldIncrementallyTrackBlocks() {
5149      return shouldIncrementallyTrackBlocks;
5150    }
5151
5152    /**
5153     * Creates SafeModeInfo when safe mode is entered manually, or because
5154     * available resources are low.
5155     *
5156     * The {@link #threshold} is set to 1.5 so that it could never be reached.
5157     * {@link #blockTotal} is set to -1 to indicate that safe mode is manual.
5158     * 
5159     * @see SafeModeInfo
5160     */
    private SafeModeInfo(boolean resourcesLow) {
      this.threshold = 1.5f;  // this threshold can never be reached
      this.datanodeThreshold = Integer.MAX_VALUE;
      this.extension = Integer.MAX_VALUE;  // also marks this as "manual" (see isManual)
      this.safeReplication = Short.MAX_VALUE + 1; // more than maxReplication
      this.replQueueThreshold = 1.5f; // can never be reached
      this.blockTotal = -1;
      this.blockSafe = -1;
      this.resourcesLow = resourcesLow;
      // Enter immediately; with unreachable thresholds this safe mode can
      // only be left explicitly.
      enter();
      reportStatus("STATE* Safe mode is ON.", true);
    }
5173      
5174    /**
5175     * Check if safe mode is on.
5176     * @return true if in safe mode
5177     */
    private synchronized boolean isOn() {
      // Run the internal consistency check before reporting state.
      doConsistencyCheck();
      // reached >= 0 means safe mode is on (see the 'reached' field doc).
      return this.reached >= 0;
    }
5182      
5183    /**
5184     * Enter safe mode.
5185     */
    private void enter() {
      // reached == 0: safe mode is on and the threshold is not reached yet.
      this.reached = 0;
      this.reachedTimestamp = 0;
    }
5190      
5191    /**
5192     * Leave safe mode.
5193     * <p>
5194     * Check for invalid, under- & over-replicated blocks in the end of startup.
5195     */
    private synchronized void leave() {
      // if not done yet, initialize replication queues.
      // In the standby, do not populate repl queues
      if (!isPopulatingReplQueues() && shouldPopulateReplQueues()) {
        initializeReplQueues();
      }
      long timeInSafemode = now() - startTime;
      NameNode.stateChangeLog.info("STATE* Leaving safe mode after " 
                                    + timeInSafemode/1000 + " secs");
      NameNode.getNameNodeMetrics().setSafeModeTime((int) timeInSafemode);

      //Log the following only once (when transitioning from ON -> OFF)
      if (reached >= 0) {
        NameNode.stateChangeLog.info("STATE* Safe mode is OFF"); 
      }
      // Mark safe mode off and drop the outer class's reference to this
      // SafeModeInfo instance.
      reached = -1;
      reachedTimestamp = -1;
      safeMode = null;
      final NetworkTopology nt = blockManager.getDatanodeManager().getNetworkTopology();
      NameNode.stateChangeLog.info("STATE* Network topology has "
          + nt.getNumOfRacks() + " racks and "
          + nt.getNumOfLeaves() + " datanodes");
      NameNode.stateChangeLog.info("STATE* UnderReplicatedBlocks has "
          + blockManager.numOfUnderReplicatedBlocks() + " blocks");

      startSecretManagerIfNecessary();

      // If startup has not yet completed, end safemode phase.
      StartupProgress prog = NameNode.getStartupProgress();
      if (prog.getStatus(Phase.SAFEMODE) != Status.COMPLETE) {
        prog.endStep(Phase.SAFEMODE, STEP_AWAITING_REPORTED_BLOCKS);
        prog.endPhase(Phase.SAFEMODE);
      }
    }
5230
5231    /**
5232     * Check whether we have reached the threshold for 
5233     * initializing replication queues.
5234     */
    private synchronized boolean canInitializeReplQueues() {
      // Requires both: this NN should populate repl queues at all (not done
      // in the standby), and enough blocks have been reported safe.
      return shouldPopulateReplQueues()
          && blockSafe >= blockReplQueueThreshold;
    }
5239      
5240    /** 
5241     * Safe mode can be turned off iff 
5242     * the threshold is reached and 
5243     * the extension time have passed.
5244     * @return true if can leave or false otherwise.
5245     */
    private synchronized boolean canLeave() {
      // Threshold not yet reached (reached == 0 means on, still waiting).
      if (reached == 0) {
        return false;
      }

      // Still inside the configured extension window since the threshold
      // was first met.
      if (monotonicNow() - reached < extension) {
        reportStatus("STATE* Safe mode ON, in safe mode extension.", false);
        return false;
      }

      // Thresholds may have regressed since 'reached' (e.g. datanodes lost).
      if (needEnter()) {
        reportStatus("STATE* Safe mode ON, thresholds not met.", false);
        return false;
      }

      return true;
    }
5263      
5264    /** 
5265     * There is no need to enter safe mode 
5266     * if DFS is empty or {@link #threshold} == 0
5267     */
5268    private boolean needEnter() {
5269      return (threshold != 0 && blockSafe < blockThreshold) ||
5270        (datanodeThreshold != 0 && getNumLiveDataNodes() < datanodeThreshold) ||
5271        (!nameNodeHasResourcesAvailable());
5272    }
5273      
5274    /**
5275     * Check and trigger safe mode if needed. 
5276     */
    private void checkMode() {
      // Have to have write-lock since leaving safemode initializes
      // repl queues, which requires write lock
      assert hasWriteLock();
      // Defer safe mode decisions while transitioning to active.
      if (inTransitionToActive()) {
        return;
      }
      // if smmthread is already running, the block threshold must have been 
      // reached before, there is no need to enter the safe mode again
      if (smmthread == null && needEnter()) {
        enter();
        // check if we are ready to initialize replication queues
        if (canInitializeReplQueues() && !isPopulatingReplQueues()
            && !haEnabled) {
          initializeReplQueues();
        }
        reportStatus("STATE* Safe mode ON.", false);
        return;
      }
      // the threshold is reached or was reached before
      if (!isOn() ||                           // safe mode is off
          extension <= 0 || threshold <= 0) {  // don't need to wait
        this.leave(); // leave safe mode
        return;
      }
      if (reached > 0) {  // threshold has already been reached before
        reportStatus("STATE* Safe mode ON.", false);
        return;
      }
      // start monitor
      // 'reached' uses the monotonic clock (for duration math in canLeave);
      // 'reachedTimestamp' records the wall-clock time separately.
      reached = monotonicNow();
      reachedTimestamp = now();
      if (smmthread == null) {
        smmthread = new Daemon(new SafeModeMonitor());
        smmthread.start();
        reportStatus("STATE* Safe mode extension entered.", true);
      }

      // check if we are ready to initialize replication queues
      if (canInitializeReplQueues() && !isPopulatingReplQueues() && !haEnabled) {
        initializeReplQueues();
      }
    }
5320      
5321    /**
5322     * Set total number of blocks.
5323     */
5324    private synchronized void setBlockTotal(int total) {
5325      this.blockTotal = total;
5326      this.blockThreshold = (int) (blockTotal * threshold);
5327      this.blockReplQueueThreshold = 
5328        (int) (blockTotal * replQueueThreshold);
5329      if (haEnabled) {
5330        // After we initialize the block count, any further namespace
5331        // modifications done while in safe mode need to keep track
5332        // of the number of total blocks in the system.
5333        this.shouldIncrementallyTrackBlocks = true;
5334      }
5335      if(blockSafe < 0)
5336        this.blockSafe = 0;
5337      checkMode();
5338    }
5339      
5340    /**
5341     * Increment number of safe blocks if current block has 
5342     * reached minimal replication.
5343     * @param replication current replication 
5344     */
    private synchronized void incrementSafeBlockCount(short replication) {
      // Count the block exactly once: when its replication first reaches the
      // safe minimum.
      if (replication == safeReplication) {
        this.blockSafe++;

        // Report startup progress only if we haven't completed startup yet.
        StartupProgress prog = NameNode.getStartupProgress();
        if (prog.getStatus(Phase.SAFEMODE) != Status.COMPLETE) {
          if (this.awaitingReportedBlocksCounter == null) {
            // Lazily resolve the counter on first use.
            this.awaitingReportedBlocksCounter = prog.getCounter(Phase.SAFEMODE,
              STEP_AWAITING_REPORTED_BLOCKS);
          }
          this.awaitingReportedBlocksCounter.increment();
        }

        checkMode();
      }
    }
5362      
5363    /**
5364     * Decrement number of safe blocks if current block has 
5365     * fallen below minimal replication.
5366     * @param replication current replication 
5367     */
    private synchronized void decrementSafeBlockCount(short replication) {
      // Only decrement when the block just dropped below the safe minimum
      // (from safeReplication to safeReplication - 1).
      if (replication == safeReplication-1) {
        this.blockSafe--;
        //blockSafe is set to -1 in manual / low resources safemode
        assert blockSafe >= 0 || isManual() || areResourcesLow();
        checkMode();
      }
    }
5376
5377    /**
5378     * Check if safe mode was entered manually
5379     */
    private boolean isManual() {
      // Manual safe mode is encoded by setManual() as an unreachable
      // extension value.
      return extension == Integer.MAX_VALUE;
    }
5383
5384    /**
5385     * Set manual safe mode.
5386     */
    private synchronized void setManual() {
      // An unreachable extension marks this safe mode as manual; isManual()
      // tests for exactly this value.
      extension = Integer.MAX_VALUE;
    }
5390
5391    /**
5392     * Check if safe mode was entered due to resources being low.
5393     */
5394    private boolean areResourcesLow() {
5395      return resourcesLow;
5396    }
5397
5398    /**
5399     * Set that resources are low for this instance of safe mode.
5400     */
5401    private void setResourcesLow() {
5402      resourcesLow = true;
5403    }
5404
5405    /**
5406     * A tip on how safe mode is to be turned off: manually or automatically.
5407     */
5408    String getTurnOffTip() {
5409      if(!isOn()) {
5410        return "Safe mode is OFF.";
5411      }
5412
5413      //Manual OR low-resource safemode. (Admin intervention required)
5414      String adminMsg = "It was turned on manually. ";
5415      if (areResourcesLow()) {
5416        adminMsg = "Resources are low on NN. Please add or free up more "
5417          + "resources then turn off safe mode manually. NOTE:  If you turn off"
5418          + " safe mode before adding resources, "
5419          + "the NN will immediately return to safe mode. ";
5420      }
5421      if (isManual() || areResourcesLow()) {
5422        return adminMsg
5423          + "Use \"hdfs dfsadmin -safemode leave\" to turn safe mode off.";
5424      }
5425
5426      boolean thresholdsMet = true;
5427      int numLive = getNumLiveDataNodes();
5428      String msg = "";
5429      if (blockSafe < blockThreshold) {
5430        msg += String.format(
5431          "The reported blocks %d needs additional %d"
5432          + " blocks to reach the threshold %.4f of total blocks %d.%n",
5433          blockSafe, (blockThreshold - blockSafe) + 1, threshold, blockTotal);
5434        thresholdsMet = false;
5435      } else {
5436        msg += String.format("The reported blocks %d has reached the threshold"
5437            + " %.4f of total blocks %d. ", blockSafe, threshold, blockTotal);
5438      }
5439      if (numLive < datanodeThreshold) {
5440        msg += String.format(
5441          "The number of live datanodes %d needs an additional %d live "
5442          + "datanodes to reach the minimum number %d.%n",
5443          numLive, (datanodeThreshold - numLive), datanodeThreshold);
5444        thresholdsMet = false;
5445      } else {
5446        msg += String.format("The number of live datanodes %d has reached "
5447            + "the minimum number %d. ",
5448            numLive, datanodeThreshold);
5449      }
5450      msg += (reached > 0) ? "In safe mode extension. " : "";
5451      msg += "Safe mode will be turned off automatically ";
5452
5453      if (!thresholdsMet) {
5454        msg += "once the thresholds have been reached.";
5455      } else if (reached + extension - monotonicNow() > 0) {
5456        msg += ("in " + (reached + extension - monotonicNow()) / 1000 + " seconds.");
5457      } else {
5458        msg += "soon.";
5459      }
5460
5461      return msg;
5462    }
5463
5464    /**
5465     * Print status every 20 seconds.
5466     */
5467    private void reportStatus(String msg, boolean rightNow) {
5468      long curTime = now();
5469      if(!rightNow && (curTime - lastStatusReport < 20 * 1000))
5470        return;
5471      NameNode.stateChangeLog.info(msg + " \n" + getTurnOffTip());
5472      lastStatusReport = curTime;
5473    }
5474
5475    @Override
5476    public String toString() {
5477      String resText = "Current safe blocks = " 
5478        + blockSafe 
5479        + ". Target blocks = " + blockThreshold + " for threshold = %" + threshold
5480        + ". Minimal replication = " + safeReplication + ".";
5481      if (reached > 0) 
5482        resText += " Threshold was reached " + new Date(reachedTimestamp) + ".";
5483      return resText;
5484    }
5485      
5486    /**
5487     * Checks consistency of the class state.
5488     * This is costly so only runs if asserts are enabled.
5489     */
5490    private void doConsistencyCheck() {
5491      boolean assertsOn = false;
5492      assert assertsOn = true; // set to true if asserts are on
5493      if (!assertsOn) return;
5494      
5495      if (blockTotal == -1 && blockSafe == -1) {
5496        return; // manual safe mode
5497      }
5498      int activeBlocks = blockManager.getActiveBlockCount();
5499      if ((blockTotal != activeBlocks) &&
5500          !(blockSafe >= 0 && blockSafe <= blockTotal)) {
5501        throw new AssertionError(
5502            " SafeMode: Inconsistent filesystem state: "
5503        + "SafeMode data: blockTotal=" + blockTotal
5504        + " blockSafe=" + blockSafe + "; "
5505        + "BlockManager data: active="  + activeBlocks);
5506      }
5507    }
5508
    /**
     * Adjust the safe and total block counts while in safe mode.
     * No-op unless blocks are being tracked incrementally (only enabled for
     * an HA standby that entered safe mode after startup).
     * @param deltaSafe  change in the number of safe blocks
     * @param deltaTotal change in the expected total number of blocks
     */
    private synchronized void adjustBlockTotals(int deltaSafe, int deltaTotal) {
      if (!shouldIncrementallyTrackBlocks) {
        return;
      }
      // Incremental tracking is only ever turned on in HA deployments.
      assert haEnabled;
      
      if (LOG.isDebugEnabled()) {
        LOG.debug("Adjusting block totals from " +
            blockSafe + "/" + blockTotal + " to " +
            (blockSafe + deltaSafe) + "/" + (blockTotal + deltaTotal));
      }
      assert blockSafe + deltaSafe >= 0 : "Can't reduce blockSafe " +
        blockSafe + " by " + deltaSafe + ": would be negative";
      assert blockTotal + deltaTotal >= 0 : "Can't reduce blockTotal " +
        blockTotal + " by " + deltaTotal + ": would be negative";
      
      blockSafe += deltaSafe;
      // setBlockTotal recomputes the threshold and re-checks the mode.
      setBlockTotal(blockTotal + deltaTotal);
    }
5528  }
5529    
5530  /**
5531   * Periodically check whether it is time to leave safe mode.
5532   * This thread starts when the threshold level is reached.
5533   *
5534   */
5535  class SafeModeMonitor implements Runnable {
5536    /** interval in msec for checking safe mode: {@value} */
5537    private static final long recheckInterval = 1000;
5538      
5539    /**
5540     */
5541    @Override
5542    public void run() {
5543      while (fsRunning) {
5544        writeLock();
5545        try {
5546          if (safeMode == null) { // Not in safe mode.
5547            break;
5548          }
5549          if (safeMode.canLeave()) {
5550            // Leave safe mode.
5551            safeMode.leave();
5552            smmthread = null;
5553            break;
5554          }
5555        } finally {
5556          writeUnlock();
5557        }
5558
5559        try {
5560          Thread.sleep(recheckInterval);
5561        } catch (InterruptedException ie) {
5562          // Ignored
5563        }
5564      }
5565      if (!fsRunning) {
5566        LOG.info("NameNode is being shutdown, exit SafeModeMonitor thread");
5567      }
5568    }
5569  }
5570    
5571  boolean setSafeMode(SafeModeAction action) throws IOException {
5572    if (action != SafeModeAction.SAFEMODE_GET) {
5573      checkSuperuserPrivilege();
5574      switch(action) {
5575      case SAFEMODE_LEAVE: // leave safe mode
5576        leaveSafeMode();
5577        break;
5578      case SAFEMODE_ENTER: // enter safe mode
5579        enterSafeMode(false);
5580        break;
5581      default:
5582        LOG.error("Unexpected safe mode action");
5583      }
5584    }
5585    return isInSafeMode();
5586  }
5587
5588  @Override
5589  public void checkSafeMode() {
5590    // safeMode is volatile, and may be set to null at any time
5591    SafeModeInfo safeMode = this.safeMode;
5592    if (safeMode != null) {
5593      safeMode.checkMode();
5594    }
5595  }
5596
5597  @Override
5598  public boolean isInSafeMode() {
5599    // safeMode is volatile, and may be set to null at any time
5600    SafeModeInfo safeMode = this.safeMode;
5601    if (safeMode == null)
5602      return false;
5603    return safeMode.isOn();
5604  }
5605
5606  @Override
5607  public boolean isInStartupSafeMode() {
5608    // safeMode is volatile, and may be set to null at any time
5609    SafeModeInfo safeMode = this.safeMode;
5610    if (safeMode == null)
5611      return false;
5612    // If the NN is in safemode, and not due to manual / low resources, we
5613    // assume it must be because of startup. If the NN had low resources during
5614    // startup, we assume it came out of startup safemode and it is now in low
5615    // resources safemode
5616    return !safeMode.isManual() && !safeMode.areResourcesLow()
5617      && safeMode.isOn();
5618  }
5619
5620  /**
5621   * Check if replication queues are to be populated
5622   * @return true when node is HAState.Active and not in the very first safemode
5623   */
5624  @Override
5625  public boolean isPopulatingReplQueues() {
5626    if (!shouldPopulateReplQueues()) {
5627      return false;
5628    }
5629    return initializedReplQueues;
5630  }
5631
5632  private boolean shouldPopulateReplQueues() {
5633    if(haContext == null || haContext.getState() == null)
5634      return false;
5635    return haContext.getState().shouldPopulateReplQueues();
5636  }
5637
5638  @Override
5639  public void incrementSafeBlockCount(int replication) {
5640    // safeMode is volatile, and may be set to null at any time
5641    SafeModeInfo safeMode = this.safeMode;
5642    if (safeMode == null)
5643      return;
5644    safeMode.incrementSafeBlockCount((short)replication);
5645  }
5646
5647  @Override
5648  public void decrementSafeBlockCount(Block b) {
5649    // safeMode is volatile, and may be set to null at any time
5650    SafeModeInfo safeMode = this.safeMode;
5651    if (safeMode == null) // mostly true
5652      return;
5653    BlockInfoContiguous storedBlock = getStoredBlock(b);
5654    if (storedBlock.isComplete()) {
5655      safeMode.decrementSafeBlockCount((short)blockManager.countNodes(b).liveReplicas());
5656    }
5657  }
5658  
5659  /**
5660   * Adjust the total number of blocks safe and expected during safe mode.
5661   * If safe mode is not currently on, this is a no-op.
5662   * @param deltaSafe the change in number of safe blocks
5663   * @param deltaTotal the change i nnumber of total blocks expected
5664   */
5665  @Override
5666  public void adjustSafeModeBlockTotals(int deltaSafe, int deltaTotal) {
5667    // safeMode is volatile, and may be set to null at any time
5668    SafeModeInfo safeMode = this.safeMode;
5669    if (safeMode == null)
5670      return;
5671    safeMode.adjustBlockTotals(deltaSafe, deltaTotal);
5672  }
5673
5674  /**
5675   * Set the total number of blocks in the system. 
5676   */
5677  public void setBlockTotal() {
5678    // safeMode is volatile, and may be set to null at any time
5679    SafeModeInfo safeMode = this.safeMode;
5680    if (safeMode == null)
5681      return;
5682    safeMode.setBlockTotal((int)getCompleteBlocksTotal());
5683  }
5684
5685  /**
5686   * Get the total number of blocks in the system. 
5687   */
5688  @Override // FSNamesystemMBean
5689  @Metric
5690  public long getBlocksTotal() {
5691    return blockManager.getTotalBlocks();
5692  }
5693
5694  /**
5695   * Get the total number of COMPLETE blocks in the system.
5696   * For safe mode only complete blocks are counted.
5697   */
5698  private long getCompleteBlocksTotal() {
5699    // Calculate number of blocks under construction
5700    long numUCBlocks = 0;
5701    readLock();
5702    numUCBlocks = leaseManager.getNumUnderConstructionBlocks();
5703    try {
5704      return getBlocksTotal() - numUCBlocks;
5705    } finally {
5706      readUnlock();
5707    }
5708  }
5709
5710  /**
5711   * Enter safe mode. If resourcesLow is false, then we assume it is manual
5712   * @throws IOException
5713   */
5714  void enterSafeMode(boolean resourcesLow) throws IOException {
5715    writeLock();
5716    try {
5717      // Stop the secret manager, since rolling the master key would
5718      // try to write to the edit log
5719      stopSecretManager();
5720
5721      // Ensure that any concurrent operations have been fully synced
5722      // before entering safe mode. This ensures that the FSImage
5723      // is entirely stable on disk as soon as we're in safe mode.
5724      boolean isEditlogOpenForWrite = getEditLog().isOpenForWrite();
5725      // Before Editlog is in OpenForWrite mode, editLogStream will be null. So,
5726      // logSyncAll call can be called only when Edlitlog is in OpenForWrite mode
5727      if (isEditlogOpenForWrite) {
5728        getEditLog().logSyncAll();
5729      }
5730      if (!isInSafeMode()) {
5731        safeMode = new SafeModeInfo(resourcesLow);
5732        return;
5733      }
5734      if (resourcesLow) {
5735        safeMode.setResourcesLow();
5736      } else {
5737        safeMode.setManual();
5738      }
5739      if (isEditlogOpenForWrite) {
5740        getEditLog().logSyncAll();
5741      }
5742      NameNode.stateChangeLog.info("STATE* Safe mode is ON"
5743          + safeMode.getTurnOffTip());
5744    } finally {
5745      writeUnlock();
5746    }
5747  }
5748
5749  /**
5750   * Leave safe mode.
5751   */
5752  void leaveSafeMode() {
5753    writeLock();
5754    try {
5755      if (!isInSafeMode()) {
5756        NameNode.stateChangeLog.info("STATE* Safe mode is already OFF"); 
5757        return;
5758      }
5759      safeMode.leave();
5760    } finally {
5761      writeUnlock();
5762    }
5763  }
5764    
5765  String getSafeModeTip() {
5766    // There is no need to take readLock.
5767    // Don't use isInSafeMode as this.safeMode might be set to null.
5768    // after isInSafeMode returns.
5769    boolean inSafeMode;
5770    SafeModeInfo safeMode = this.safeMode;
5771    if (safeMode == null) {
5772      inSafeMode = false;
5773    } else {
5774      inSafeMode = safeMode.isOn();
5775    }
5776
5777    if (!inSafeMode) {
5778      return "";
5779    } else {
5780      return safeMode.getTurnOffTip();
5781    }
5782  }
5783
  /**
   * Roll the edit log. Requires superuser privilege, is refused in safe
   * mode, and runs under the namesystem write lock.
   * @return the checkpoint signature of the image after the roll
   */
  CheckpointSignature rollEditLog() throws IOException {
    checkSuperuserPrivilege();
    checkOperation(OperationCategory.JOURNAL);
    writeLock();
    try {
      // Re-check under the lock: the HA state may have changed.
      checkOperation(OperationCategory.JOURNAL);
      checkNameNodeSafeMode("Log not rolled");
      if (Server.isRpcInvocation()) {
        LOG.info("Roll Edit Log from " + Server.getRemoteAddress());
      }
      return getFSImage().rollEditLog();
    } finally {
      writeUnlock();
    }
  }
5799
  /**
   * Start a checkpoint on behalf of a backup node. Refused in safe mode.
   * @param backupNode registration of the node requesting the checkpoint
   * @param activeNamenode registration of this (active) namenode
   * @return the command the backup node should execute
   */
  NamenodeCommand startCheckpoint(NamenodeRegistration backupNode,
      NamenodeRegistration activeNamenode) throws IOException {
    checkOperation(OperationCategory.CHECKPOINT);
    writeLock();
    try {
      // Re-check under the lock: the HA state may have changed.
      checkOperation(OperationCategory.CHECKPOINT);
      checkNameNodeSafeMode("Checkpoint not started");
      
      LOG.info("Start checkpoint for " + backupNode.getAddress());
      NamenodeCommand cmd = getFSImage().startCheckpoint(backupNode,
          activeNamenode);
      getEditLog().logSync();
      return cmd;
    } finally {
      writeUnlock();
    }
  }
5817
  /**
   * Process an incremental (received/deleted) block report from a datanode,
   * delegating to the block manager under the namesystem write lock.
   * @param nodeID the reporting datanode
   * @param srdb the received/deleted blocks for one storage
   */
  public void processIncrementalBlockReport(final DatanodeID nodeID,
      final StorageReceivedDeletedBlocks srdb)
      throws IOException {
    writeLock();
    try {
      blockManager.processIncrementalBlockReport(nodeID, srdb);
    } finally {
      writeUnlock();
    }
  }
5828  
  /**
   * Finish a checkpoint previously started by a backup node.
   * Refused in safe mode; only needs the read lock since the image
   * is not modified here.
   * @param registration the node that performed the checkpoint
   * @param sig the signature returned by startCheckpoint
   */
  void endCheckpoint(NamenodeRegistration registration,
                            CheckpointSignature sig) throws IOException {
    checkOperation(OperationCategory.CHECKPOINT);
    readLock();
    try {
      // Re-check under the lock: the HA state may have changed.
      checkOperation(OperationCategory.CHECKPOINT);
      checkNameNodeSafeMode("Checkpoint not ended");
      LOG.info("End checkpoint for " + registration.getAddress());
      getFSImage().endCheckpoint(sig);
    } finally {
      readUnlock();
    }
  }
5842
  /** Build a PermissionStatus owned by the NN's fsOwner and supergroup. */
  PermissionStatus createFsOwnerPermissions(FsPermission permission) {
    return new PermissionStatus(fsOwner.getShortUserName(), supergroup, permission);
  }
5846
5847  private void checkUnreadableBySuperuser(FSPermissionChecker pc,
5848      INode inode, int snapshotId)
5849      throws IOException {
5850    if (pc.isSuperUser()) {
5851      for (XAttr xattr : FSDirXAttrOp.getXAttrs(dir, inode, snapshotId)) {
5852        if (XAttrHelper.getPrefixName(xattr).
5853            equals(SECURITY_XATTR_UNREADABLE_BY_SUPERUSER)) {
5854          throw new AccessControlException("Access is denied for " +
5855              pc.getUser() + " since the superuser is not allowed to " +
5856              "perform this operation.");
5857        }
5858      }
5859    }
5860  }
5861
  /**
   * Verify the caller is a superuser; a no-op when permission checking
   * is disabled.
   * @throws AccessControlException if permissions are enabled and the
   *         caller is not a superuser
   */
  @Override
  public void checkSuperuserPrivilege()
      throws AccessControlException {
    if (isPermissionEnabled) {
      FSPermissionChecker pc = getPermissionChecker();
      pc.checkSuperuserPrivilege();
    }
  }
5870
5871  /**
5872   * Check to see if we have exceeded the limit on the number
5873   * of inodes.
5874   */
5875  void checkFsObjectLimit() throws IOException {
5876    if (maxFsObjects != 0 &&
5877        maxFsObjects <= dir.totalInodes() + getBlocksTotal()) {
5878      throw new IOException("Exceeded the configured number of objects " +
5879                             maxFsObjects + " in the filesystem.");
5880    }
5881  }
5882
5883  /**
5884   * Get the total number of objects in the system. 
5885   */
5886  @Override // FSNamesystemMBean
5887  public long getMaxObjects() {
5888    return maxFsObjects;
5889  }
5890
  /** @return total number of inodes, straight from FSDirectory. */
  @Override // FSNamesystemMBean
  @Metric
  public long getFilesTotal() {
    // There is no need to take fSNamesystem's lock as
    // FSDirectory has its own lock.
    return this.dir.totalInodes();
  }
5898
  /** @return number of blocks currently pending replication. */
  @Override // FSNamesystemMBean
  @Metric
  public long getPendingReplicationBlocks() {
    return blockManager.getPendingReplicationBlocksCount();
  }
5904
  /** @return number of under-replicated blocks, from the block manager. */
  @Override // FSNamesystemMBean
  @Metric
  public long getUnderReplicatedBlocks() {
    return blockManager.getUnderReplicatedBlocksCount();
  }
5910
  /** Returns number of blocks with corrupt replicas, from the block manager. */
  @Metric({"CorruptBlocks", "Number of blocks with corrupt replicas"})
  public long getCorruptReplicaBlocks() {
    return blockManager.getCorruptReplicaBlocksCount();
  }
5916
  /** @return number of blocks with replication currently scheduled. */
  @Override // FSNamesystemMBean
  @Metric
  public long getScheduledReplicationBlocks() {
    return blockManager.getScheduledReplicationBlocksCount();
  }
5922
  /** @return number of blocks queued for deletion. */
  @Override
  @Metric
  public long getPendingDeletionBlocks() {
    return blockManager.getPendingDeletionBlocksCount();
  }
5928
  /** @return time (ms) when block deletion begins: NN start plus startup delay. */
  @Override
  public long getBlockDeletionStartTime() {
    return startTime + blockManager.getStartupDelayBlockDeletionInMs();
  }
5933
  /** @return number of excess (over-replicated) block replicas. */
  @Metric
  public long getExcessBlocks() {
    return blockManager.getExcessBlocksCount();
  }
5938  
  // HA-only metric: misreplicated blocks whose processing is postponed
  // until stale storages have reported.
  @Metric
  public long getPostponedMisreplicatedBlocks() {
    return blockManager.getPostponedMisreplicatedBlocksCount();
  }
5944
  // HA-only metric: queued datanode messages awaiting processing.
  @Metric
  public int getPendingDataNodeMessageCount() {
    return blockManager.getPendingDataNodeMessageCount();
  }
5950  
  // HA-only metric: the current HA state name (e.g. active/standby).
  @Metric
  public String getHAState() {
    return haContext.getState().toString();
  }
5956
5957  // HA-only metric
5958  @Metric
5959  public long getMillisSinceLastLoadedEdits() {
5960    if (isInStandbyState() && editLogTailer != null) {
5961      return monotonicNow() - editLogTailer.getLastLoadTimeMs();
5962    } else {
5963      return 0;
5964    }
5965  }
5966  
  /** @return capacity of the block manager's blocks map. */
  @Metric
  public int getBlockCapacity() {
    return blockManager.getCapacity();
  }
5971
  /** @return "safeMode" while in safe mode, otherwise "Operational". */
  @Override // FSNamesystemMBean
  public String getFSState() {
    return isInSafeMode() ? "safeMode" : "Operational";
  }
5976  
  // JMX name of the FSNamesystemState MBean (registered in registerMBean()).
  private ObjectName mbeanName;
  // JMX name of the NameNode MXBean; registration not shown in this section,
  // unregistered in shutdown().
  private ObjectName mxbeanName;
5979
5980  /**
5981   * Register the FSNamesystem MBean using the name
5982   *        "hadoop:service=NameNode,name=FSNamesystemState"
5983   */
5984  private void registerMBean() {
5985    // We can only implement one MXBean interface, so we keep the old one.
5986    try {
5987      StandardMBean bean = new StandardMBean(this, FSNamesystemMBean.class);
5988      mbeanName = MBeans.register("NameNode", "FSNamesystemState", bean);
5989    } catch (NotCompliantMBeanException e) {
5990      throw new RuntimeException("Bad MBean setup", e);
5991    }
5992
5993    LOG.info("Registered FSNamesystemState MBean");
5994  }
5995
5996  /**
5997   * shutdown FSNamesystem
5998   */
5999  void shutdown() {
6000    if (snapshotManager != null) {
6001      snapshotManager.shutdown();
6002    }
6003    if (mbeanName != null) {
6004      MBeans.unregister(mbeanName);
6005      mbeanName = null;
6006    }
6007    if (mxbeanName != null) {
6008      MBeans.unregister(mxbeanName);
6009      mxbeanName = null;
6010    }
6011    if (dir != null) {
6012      dir.shutdown();
6013    }
6014    if (blockManager != null) {
6015      blockManager.shutdown();
6016    }
6017  }
6018
  /** @return number of live datanodes, from the datanode manager. */
  @Override // FSNamesystemMBean
  public int getNumLiveDataNodes() {
    return getBlockManager().getDatanodeManager().getNumLiveDataNodes();
  }
6023
  /** @return number of dead datanodes, from the datanode manager. */
  @Override // FSNamesystemMBean
  public int getNumDeadDataNodes() {
    return getBlockManager().getDatanodeManager().getNumDeadDataNodes();
  }
6028  
6029  @Override // FSNamesystemMBean
6030  public int getNumDecomLiveDataNodes() {
6031    final List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
6032    getBlockManager().getDatanodeManager().fetchDatanodes(live, null, true);
6033    int liveDecommissioned = 0;
6034    for (DatanodeDescriptor node : live) {
6035      liveDecommissioned += node.isDecommissioned() ? 1 : 0;
6036    }
6037    return liveDecommissioned;
6038  }
6039
6040  @Override // FSNamesystemMBean
6041  public int getNumDecomDeadDataNodes() {
6042    final List<DatanodeDescriptor> dead = new ArrayList<DatanodeDescriptor>();
6043    getBlockManager().getDatanodeManager().fetchDatanodes(null, dead, true);
6044    int deadDecommissioned = 0;
6045    for (DatanodeDescriptor node : dead) {
6046      deadDecommissioned += node.isDecommissioned() ? 1 : 0;
6047    }
6048    return deadDecommissioned;
6049  }
6050
6051  @Override // FSNamesystemMBean
6052  public int getVolumeFailuresTotal() {
6053    List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
6054    getBlockManager().getDatanodeManager().fetchDatanodes(live, null, true);
6055    int volumeFailuresTotal = 0;
6056    for (DatanodeDescriptor node: live) {
6057      volumeFailuresTotal += node.getVolumeFailures();
6058    }
6059    return volumeFailuresTotal;
6060  }
6061
6062  @Override // FSNamesystemMBean
6063  public long getEstimatedCapacityLostTotal() {
6064    List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
6065    getBlockManager().getDatanodeManager().fetchDatanodes(live, null, true);
6066    long estimatedCapacityLostTotal = 0;
6067    for (DatanodeDescriptor node: live) {
6068      VolumeFailureSummary volumeFailureSummary = node.getVolumeFailureSummary();
6069      if (volumeFailureSummary != null) {
6070        estimatedCapacityLostTotal +=
6071            volumeFailureSummary.getEstimatedCapacityLostTotal();
6072      }
6073    }
6074    return estimatedCapacityLostTotal;
6075  }
6076
  /** @return number of datanodes currently being decommissioned. */
  @Override // FSNamesystemMBean
  public int getNumDecommissioningDataNodes() {
    return getBlockManager().getDatanodeManager().getDecommissioningNodes()
        .size();
  }
6082
  /** @return number of datanodes marked stale due to delayed heartbeats. */
  @Override // FSNamesystemMBean
  @Metric({"StaleDataNodes", 
    "Number of datanodes marked stale due to delayed heartbeat"})
  public int getNumStaleDataNodes() {
    return getBlockManager().getDatanodeManager().getNumStaleNodes();
  }
6089
6090  /**
6091   * Storages are marked as "content stale" after NN restart or fails over and
6092   * before NN receives the first Heartbeat followed by the first Blockreport.
6093   */
6094  @Override // FSNamesystemMBean
6095  public int getNumStaleStorages() {
6096    return getBlockManager().getDatanodeManager().getNumStaleStorages();
6097  }
6098
6099  @Override // FSNamesystemMBean
6100  public String getTopUserOpCounts() {
6101    if (!topConf.isEnabled) {
6102      return null;
6103    }
6104
6105    Date now = new Date();
6106    final List<RollingWindowManager.TopWindow> topWindows =
6107        topMetrics.getTopWindows();
6108    Map<String, Object> topMap = new TreeMap<String, Object>();
6109    topMap.put("windows", topWindows);
6110    topMap.put("timestamp", DFSUtil.dateToIso8601String(now));
6111    ObjectMapper mapper = new ObjectMapper();
6112    try {
6113      return mapper.writeValueAsString(topMap);
6114    } catch (IOException e) {
6115      LOG.warn("Failed to fetch TopUser metrics", e);
6116    }
6117    return null;
6118  }
6119
6120  /**
6121   * Increments, logs and then returns the stamp
6122   */
6123  long nextGenerationStamp(boolean legacyBlock)
6124      throws IOException, SafeModeException {
6125    assert hasWriteLock();
6126    checkNameNodeSafeMode("Cannot get next generation stamp");
6127
6128    long gs = blockIdManager.nextGenerationStamp(legacyBlock);
6129    if (legacyBlock) {
6130      getEditLog().logGenerationStampV1(gs);
6131    } else {
6132      getEditLog().logGenerationStampV2(gs);
6133    }
6134
6135    // NB: callers sync the log
6136    return gs;
6137  }
6138
6139  /**
6140   * Increments, logs and then returns the block ID
6141   */
6142  private long nextBlockId() throws IOException {
6143    assert hasWriteLock();
6144    checkNameNodeSafeMode("Cannot get next block ID");
6145    final long blockId = blockIdManager.nextBlockId();
6146    getEditLog().logAllocateBlockId(blockId);
6147    // NB: callers sync the log
6148    return blockId;
6149  }
6150
  /**
   * Determine whether a file has been (effectively) deleted: removed from
   * the inode map, orphaned by a recursive delete of an ancestor, or marked
   * deleted in its snapshot feature.
   * @param file the file inode to test
   * @return true if the file should be treated as deleted
   */
  private boolean isFileDeleted(INodeFile file) {
    // Not in the inodeMap or in the snapshot but marked deleted.
    if (dir.getInode(file.getId()) == null) {
      return true;
    }

    // look at the path hierarchy to see if one parent is deleted by recursive
    // deletion
    INode tmpChild = file;
    INodeDirectory tmpParent = file.getParent();
    while (true) {
      if (tmpParent == null) {
        // Detached from the tree: an ancestor was deleted.
        return true;
      }

      INode childINode = tmpParent.getChild(tmpChild.getLocalNameBytes(),
          Snapshot.CURRENT_STATE_ID);
      if (childINode == null || !childINode.equals(tmpChild)) {
        // a newly created INode with the same name as an already deleted one
        // would be a different INode than the deleted one
        return true;
      }

      if (tmpParent.isRoot()) {
        // Reached the root: the whole path is still live.
        break;
      }

      tmpChild = tmpParent;
      tmpParent = tmpParent.getParent();
    }

    // Still linked into the tree, but the current file may have been
    // deleted while retained in a snapshot.
    if (file.isWithSnapshot() &&
        file.getFileWithSnapshotFeature().isCurrentFileDeleted()) {
      return true;
    }
    return false;
  }
6188
  /**
   * Validate that a block is under construction, belongs to a live
   * under-construction file, and is leased by the given client.
   * Caller must hold the write lock; refused in safe mode.
   * @param block the block to validate
   * @param clientName the client claiming the lease
   * @return the file inode the block belongs to
   * @throws IOException if the block or file is missing/complete
   * @throws LeaseExpiredException if the client does not hold the lease
   */
  private INodeFile checkUCBlock(ExtendedBlock block,
      String clientName) throws IOException {
    assert hasWriteLock();
    checkNameNodeSafeMode("Cannot get a new generation stamp and an "
        + "access token for block " + block);
    
    // check stored block state
    BlockInfoContiguous storedBlock = getStoredBlock(ExtendedBlock.getLocalBlock(block));
    if (storedBlock == null || 
        storedBlock.getBlockUCState() != BlockUCState.UNDER_CONSTRUCTION) {
        throw new IOException(block + 
            " does not exist or is not under Construction" + storedBlock);
    }
    
    // check file inode
    final INodeFile file = ((INode)storedBlock.getBlockCollection()).asFile();
    if (file == null || !file.isUnderConstruction() || isFileDeleted(file)) {
      throw new IOException("The file " + storedBlock + 
          " belonged to does not exist or it is not under construction.");
    }
    
    // check lease
    if (clientName == null
        || !clientName.equals(file.getFileUnderConstructionFeature()
            .getClientName())) {
      throw new LeaseExpiredException("Lease mismatch: " + block + 
          " is accessed by a non lease holder " + clientName); 
    }

    return file;
  }
6220  
6221  /**
6222   * Client is reporting some bad block locations.
6223   */
6224  void reportBadBlocks(LocatedBlock[] blocks) throws IOException {
6225    checkOperation(OperationCategory.WRITE);
6226    NameNode.stateChangeLog.info("*DIR* reportBadBlocks");
6227    writeLock();
6228    try {
6229      checkOperation(OperationCategory.WRITE);
6230      for (int i = 0; i < blocks.length; i++) {
6231        ExtendedBlock blk = blocks[i].getBlock();
6232        DatanodeInfo[] nodes = blocks[i].getLocations();
6233        String[] storageIDs = blocks[i].getStorageIDs();
6234        for (int j = 0; j < nodes.length; j++) {
6235          blockManager.findAndMarkBlockAsCorrupt(blk, nodes[j],
6236              storageIDs == null ? null: storageIDs[j], 
6237              "client machine reported it");
6238        }
6239      }
6240    } finally {
6241      writeUnlock();
6242    }
6243  }
6244
6245  /**
6246   * Get a new generation stamp together with an access token for 
6247   * a block under construction
6248   * 
6249   * This method is called for recovering a failed pipeline or setting up
6250   * a pipeline to append to a block.
6251   * 
6252   * @param block a block
6253   * @param clientName the name of a client
6254   * @return a located block with a new generation stamp and an access token
6255   * @throws IOException if any error occurs
6256   */
6257  LocatedBlock updateBlockForPipeline(ExtendedBlock block, 
6258      String clientName) throws IOException {
6259    LocatedBlock locatedBlock;
6260    checkOperation(OperationCategory.WRITE);
6261    writeLock();
6262    try {
6263      checkOperation(OperationCategory.WRITE);
6264
6265      // check vadility of parameters
6266      checkUCBlock(block, clientName);
6267  
6268      // get a new generation stamp and an access token
6269      block.setGenerationStamp(nextGenerationStamp(blockIdManager.isLegacyBlock(block.getLocalBlock())));
6270      locatedBlock = new LocatedBlock(block, new DatanodeInfo[0]);
6271      blockManager.setBlockToken(locatedBlock, AccessMode.WRITE);
6272    } finally {
6273      writeUnlock();
6274    }
6275    // Ensure we record the new generation stamp
6276    getEditLog().logSync();
6277    return locatedBlock;
6278  }
6279  
6280  /**
6281   * Update a pipeline for a block under construction
6282   * 
6283   * @param clientName the name of the client
6284   * @param oldBlock and old block
6285   * @param newBlock a new block with a new generation stamp and length
6286   * @param newNodes datanodes in the pipeline
6287   * @throws IOException if any error occurs
6288   */
6289  void updatePipeline(
6290      String clientName, ExtendedBlock oldBlock, ExtendedBlock newBlock,
6291      DatanodeID[] newNodes, String[] newStorageIDs, boolean logRetryCache)
6292      throws IOException {
6293    checkOperation(OperationCategory.WRITE);
6294
6295    LOG.info("updatePipeline(" + oldBlock.getLocalBlock()
6296             + ", newGS=" + newBlock.getGenerationStamp()
6297             + ", newLength=" + newBlock.getNumBytes()
6298             + ", newNodes=" + Arrays.asList(newNodes)
6299             + ", client=" + clientName
6300             + ")");
6301    waitForLoadingFSImage();
6302    writeLock();
6303    try {
6304      checkOperation(OperationCategory.WRITE);
6305      checkNameNodeSafeMode("Pipeline not updated");
6306      assert newBlock.getBlockId()==oldBlock.getBlockId() : newBlock + " and "
6307        + oldBlock + " has different block identifier";
6308      updatePipelineInternal(clientName, oldBlock, newBlock, newNodes,
6309          newStorageIDs, logRetryCache);
6310    } finally {
6311      writeUnlock();
6312    }
6313    getEditLog().logSync();
6314    LOG.info("updatePipeline(" + oldBlock.getLocalBlock() + " => "
6315        + newBlock.getLocalBlock() + ") success");
6316  }
6317
  /**
   * Apply a pipeline update: advance the last block's generation stamp and
   * length, and replace its expected replica locations.
   * Caller must hold the FSN write lock and have passed safe-mode checks.
   */
  private void updatePipelineInternal(String clientName, ExtendedBlock oldBlock,
      ExtendedBlock newBlock, DatanodeID[] newNodes, String[] newStorageIDs,
      boolean logRetryCache)
      throws IOException {
    assert hasWriteLock();
    // check the validity of the block and lease holder name
    final INodeFile pendingFile = checkUCBlock(oldBlock, clientName);
    final BlockInfoContiguousUnderConstruction blockinfo
        = (BlockInfoContiguousUnderConstruction)pendingFile.getLastBlock();

    // check new GS & length: this is not expected. A pipeline update must
    // strictly advance the generation stamp and must never shrink the block.
    if (newBlock.getGenerationStamp() <= blockinfo.getGenerationStamp() ||
        newBlock.getNumBytes() < blockinfo.getNumBytes()) {
      String msg = "Update " + oldBlock + " (len = " + 
        blockinfo.getNumBytes() + ") to an older state: " + newBlock + 
        " (len = " + newBlock.getNumBytes() +")";
      LOG.warn(msg);
      throw new IOException(msg);
    }

    // Update old block with the new generation stamp and new length
    blockinfo.setNumBytes(newBlock.getNumBytes());
    blockinfo.setGenerationStampAndVerifyReplicas(newBlock.getGenerationStamp());

    // find the DatanodeDescriptor objects for the new pipeline members
    final DatanodeStorageInfo[] storages = blockManager.getDatanodeManager()
        .getDatanodeStorageInfos(newNodes, newStorageIDs);
    blockinfo.setExpectedLocations(storages);

    // Persist the updated block list for the file to the edit log.
    String src = pendingFile.getFullPathName();
    persistBlocks(src, pendingFile, logRetryCache);
  }
6350
  // rename was successful. If any part of the renamed subtree had
  // files that were being written to, update the lease records with the
  // new path so lease recovery can still find them.
  // Caller must hold the FSN write lock.
  void unprotectedChangeLease(String src, String dst) {
    assert hasWriteLock();
    leaseManager.changeLease(src, dst);
  }
6357
6358  /**
6359   * Serializes leases.
6360   */
6361  void saveFilesUnderConstruction(DataOutputStream out,
6362      Map<Long, INodeFile> snapshotUCMap) throws IOException {
6363    // This is run by an inferior thread of saveNamespace, which holds a read
6364    // lock on our behalf. If we took the read lock here, we could block
6365    // for fairness if a writer is waiting on the lock.
6366    synchronized (leaseManager) {
6367      Map<String, INodeFile> nodes = leaseManager.getINodesUnderConstruction();
6368      for (Map.Entry<String, INodeFile> entry : nodes.entrySet()) {
6369        // TODO: for HDFS-5428, because of rename operations, some
6370        // under-construction files that are
6371        // in the current fs directory can also be captured in the
6372        // snapshotUCMap. We should remove them from the snapshotUCMap.
6373        snapshotUCMap.remove(entry.getValue().getId());
6374      }
6375
6376      out.writeInt(nodes.size() + snapshotUCMap.size()); // write the size
6377      for (Map.Entry<String, INodeFile> entry : nodes.entrySet()) {
6378        FSImageSerialization.writeINodeUnderConstruction(
6379            out, entry.getValue(), entry.getKey());
6380      }
6381      for (Map.Entry<Long, INodeFile> entry : snapshotUCMap.entrySet()) {
6382        // for those snapshot INodeFileUC, we use "/.reserved/.inodes/<inodeid>"
6383        // as their paths
6384        StringBuilder b = new StringBuilder();
6385        b.append(FSDirectory.DOT_RESERVED_PATH_PREFIX)
6386            .append(Path.SEPARATOR).append(FSDirectory.DOT_INODES_STRING)
6387            .append(Path.SEPARATOR).append(entry.getValue().getId());
6388        FSImageSerialization.writeINodeUnderConstruction(
6389            out, entry.getValue(), b.toString());
6390      }
6391    }
6392  }
6393
6394  /**
6395   * @return all the under-construction files in the lease map
6396   */
6397  Map<String, INodeFile> getFilesUnderConstruction() {
6398    synchronized (leaseManager) {
6399      return leaseManager.getINodesUnderConstruction();
6400    }
6401  }
6402
6403  /**
6404   * Register a Backup name-node, verifying that it belongs
6405   * to the correct namespace, and adding it to the set of
6406   * active journals if necessary.
6407   * 
6408   * @param bnReg registration of the new BackupNode
6409   * @param nnReg registration of this NameNode
6410   * @throws IOException if the namespace IDs do not match
6411   */
6412  void registerBackupNode(NamenodeRegistration bnReg,
6413      NamenodeRegistration nnReg) throws IOException {
6414    writeLock();
6415    try {
6416      if(getFSImage().getStorage().getNamespaceID() 
6417         != bnReg.getNamespaceID())
6418        throw new IOException("Incompatible namespaceIDs: "
6419            + " Namenode namespaceID = "
6420            + getFSImage().getStorage().getNamespaceID() + "; "
6421            + bnReg.getRole() +
6422            " node namespaceID = " + bnReg.getNamespaceID());
6423      if (bnReg.getRole() == NamenodeRole.BACKUP) {
6424        getFSImage().getEditLog().registerBackupNode(
6425            bnReg, nnReg);
6426      }
6427    } finally {
6428      writeUnlock();
6429    }
6430  }
6431
6432  /**
6433   * Release (unregister) backup node.
6434   * <p>
6435   * Find and remove the backup stream corresponding to the node.
6436   * @throws IOException
6437   */
6438  void releaseBackupNode(NamenodeRegistration registration)
6439    throws IOException {
6440    checkOperation(OperationCategory.WRITE);
6441    writeLock();
6442    try {
6443      checkOperation(OperationCategory.WRITE);
6444      if(getFSImage().getStorage().getNamespaceID()
6445         != registration.getNamespaceID())
6446        throw new IOException("Incompatible namespaceIDs: "
6447            + " Namenode namespaceID = "
6448            + getFSImage().getStorage().getNamespaceID() + "; "
6449            + registration.getRole() +
6450            " node namespaceID = " + registration.getNamespaceID());
6451      getEditLog().releaseBackupStream(registration);
6452    } finally {
6453      writeUnlock();
6454    }
6455  }
6456
  /** A (file path, corrupt block) pair reported by listCorruptFileBlocks. */
  static class CorruptFileBlockInfo {
    // full path of the file that contains the corrupt block
    final String path;
    // the corrupt block itself
    final Block block;
    
    public CorruptFileBlockInfo(String p, Block b) {
      path = p;
      block = b;
    }
    
    @Override
    public String toString() {
      // block name and file path, separated by a tab
      return block.getBlockName() + "\t" + path;
    }
  }
6471  /**
6472   * @param path Restrict corrupt files to this portion of namespace.
6473   * @param cookieTab Support for continuation; cookieTab  tells where
6474   *                  to start from
6475   * @return a list in which each entry describes a corrupt file/block
6476   * @throws IOException
6477   */
6478  Collection<CorruptFileBlockInfo> listCorruptFileBlocks(String path,
6479  String[] cookieTab) throws IOException {
6480    checkSuperuserPrivilege();
6481    checkOperation(OperationCategory.READ);
6482
6483    int count = 0;
6484    ArrayList<CorruptFileBlockInfo> corruptFiles =
6485        new ArrayList<CorruptFileBlockInfo>();
6486    if (cookieTab == null) {
6487      cookieTab = new String[] { null };
6488    }
6489
6490    // Do a quick check if there are any corrupt files without taking the lock
6491    if (blockManager.getMissingBlocksCount() == 0) {
6492      if (cookieTab[0] == null) {
6493        cookieTab[0] = String.valueOf(getIntCookie(cookieTab[0]));
6494      }
6495      LOG.info("there are no corrupt file blocks.");
6496      return corruptFiles;
6497    }
6498
6499    readLock();
6500    try {
6501      checkOperation(OperationCategory.READ);
6502      if (!isPopulatingReplQueues()) {
6503        throw new IOException("Cannot run listCorruptFileBlocks because " +
6504                              "replication queues have not been initialized.");
6505      }
6506      // print a limited # of corrupt files per call
6507
6508      final Iterator<Block> blkIterator = blockManager.getCorruptReplicaBlockIterator();
6509
6510      int skip = getIntCookie(cookieTab[0]);
6511      for (int i = 0; i < skip && blkIterator.hasNext(); i++) {
6512        blkIterator.next();
6513      }
6514
6515      while (blkIterator.hasNext()) {
6516        Block blk = blkIterator.next();
6517        final INode inode = (INode)blockManager.getBlockCollection(blk);
6518        skip++;
6519        if (inode != null && blockManager.countNodes(blk).liveReplicas() == 0) {
6520          String src = FSDirectory.getFullPathName(inode);
6521          if (src.startsWith(path)){
6522            corruptFiles.add(new CorruptFileBlockInfo(src, blk));
6523            count++;
6524            if (count >= DEFAULT_MAX_CORRUPT_FILEBLOCKS_RETURNED)
6525              break;
6526          }
6527        }
6528      }
6529      cookieTab[0] = String.valueOf(skip);
6530      LOG.info("list corrupt file blocks returned: " + count);
6531      return corruptFiles;
6532    } finally {
6533      readUnlock();
6534    }
6535  }
6536
6537  /**
6538   * Convert string cookie to integer.
6539   */
6540  private static int getIntCookie(String cookie){
6541    int c;
6542    if(cookie == null){
6543      c = 0;
6544    } else {
6545      try{
6546        c = Integer.parseInt(cookie);
6547      }catch (NumberFormatException e) {
6548        c = 0;
6549      }
6550    }
6551    c = Math.max(0, c);
6552    return c;
6553  }
6554
6555  /**
6556   * Create delegation token secret manager
6557   */
6558  private DelegationTokenSecretManager createDelegationTokenSecretManager(
6559      Configuration conf) {
6560    return new DelegationTokenSecretManager(conf.getLong(
6561        DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_KEY,
6562        DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_DEFAULT),
6563        conf.getLong(DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_KEY,
6564            DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_DEFAULT),
6565        conf.getLong(DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_KEY,
6566            DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_DEFAULT),
6567        DELEGATION_TOKEN_REMOVER_SCAN_INTERVAL,
6568        conf.getBoolean(DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_KEY,
6569            DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_DEFAULT),
6570        this);
6571  }
6572
6573  /**
6574   * Returns the DelegationTokenSecretManager instance in the namesystem.
6575   * @return delegation token secret manager object
6576   */
6577  DelegationTokenSecretManager getDelegationTokenSecretManager() {
6578    return dtSecretManager;
6579  }
6580
6581  /**
6582   * @param renewer Renewer information
6583   * @return delegation toek
6584   * @throws IOException on error
6585   */
6586  Token<DelegationTokenIdentifier> getDelegationToken(Text renewer)
6587      throws IOException {
6588    Token<DelegationTokenIdentifier> token;
6589    checkOperation(OperationCategory.WRITE);
6590    writeLock();
6591    try {
6592      checkOperation(OperationCategory.WRITE);
6593      checkNameNodeSafeMode("Cannot issue delegation token");
6594      if (!isAllowedDelegationTokenOp()) {
6595        throw new IOException(
6596          "Delegation Token can be issued only with kerberos or web authentication");
6597      }
6598      if (dtSecretManager == null || !dtSecretManager.isRunning()) {
6599        LOG.warn("trying to get DT with no secret manager running");
6600        return null;
6601      }
6602
6603      UserGroupInformation ugi = getRemoteUser();
6604      String user = ugi.getUserName();
6605      Text owner = new Text(user);
6606      Text realUser = null;
6607      if (ugi.getRealUser() != null) {
6608        realUser = new Text(ugi.getRealUser().getUserName());
6609      }
6610      DelegationTokenIdentifier dtId = new DelegationTokenIdentifier(owner,
6611        renewer, realUser);
6612      token = new Token<DelegationTokenIdentifier>(
6613        dtId, dtSecretManager);
6614      long expiryTime = dtSecretManager.getTokenExpiryTime(dtId);
6615      getEditLog().logGetDelegationToken(dtId, expiryTime);
6616    } finally {
6617      writeUnlock();
6618    }
6619    getEditLog().logSync();
6620    return token;
6621  }
6622
6623  /**
6624   * 
6625   * @param token token to renew
6626   * @return new expiryTime of the token
6627   * @throws InvalidToken if {@code token} is invalid
6628   * @throws IOException on other errors
6629   */
6630  long renewDelegationToken(Token<DelegationTokenIdentifier> token)
6631      throws InvalidToken, IOException {
6632    long expiryTime;
6633    checkOperation(OperationCategory.WRITE);
6634    writeLock();
6635    try {
6636      checkOperation(OperationCategory.WRITE);
6637
6638      checkNameNodeSafeMode("Cannot renew delegation token");
6639      if (!isAllowedDelegationTokenOp()) {
6640        throw new IOException(
6641            "Delegation Token can be renewed only with kerberos or web authentication");
6642      }
6643      String renewer = getRemoteUser().getShortUserName();
6644      expiryTime = dtSecretManager.renewToken(token, renewer);
6645      DelegationTokenIdentifier id = new DelegationTokenIdentifier();
6646      ByteArrayInputStream buf = new ByteArrayInputStream(token.getIdentifier());
6647      DataInputStream in = new DataInputStream(buf);
6648      id.readFields(in);
6649      getEditLog().logRenewDelegationToken(id, expiryTime);
6650    } finally {
6651      writeUnlock();
6652    }
6653    getEditLog().logSync();
6654    return expiryTime;
6655  }
6656
6657  /**
6658   * 
6659   * @param token token to cancel
6660   * @throws IOException on error
6661   */
6662  void cancelDelegationToken(Token<DelegationTokenIdentifier> token)
6663      throws IOException {
6664    checkOperation(OperationCategory.WRITE);
6665    writeLock();
6666    try {
6667      checkOperation(OperationCategory.WRITE);
6668
6669      checkNameNodeSafeMode("Cannot cancel delegation token");
6670      String canceller = getRemoteUser().getUserName();
6671      DelegationTokenIdentifier id = dtSecretManager
6672        .cancelToken(token, canceller);
6673      getEditLog().logCancelDelegationToken(id);
6674    } finally {
6675      writeUnlock();
6676    }
6677    getEditLog().logSync();
6678  }
6679
6680  /**
6681   * @param out save state of the secret manager
6682   * @param sdPath String storage directory path
6683   */
6684  void saveSecretManagerStateCompat(DataOutputStream out, String sdPath)
6685      throws IOException {
6686    dtSecretManager.saveSecretManagerStateCompat(out, sdPath);
6687  }
6688
6689  SecretManagerState saveSecretManagerState() {
6690    return dtSecretManager.saveSecretManagerState();
6691  }
6692
6693  /**
6694   * @param in load the state of secret manager from input stream
6695   */
6696  void loadSecretManagerStateCompat(DataInput in) throws IOException {
6697    dtSecretManager.loadSecretManagerStateCompat(in);
6698  }
6699
6700  void loadSecretManagerState(SecretManagerSection s,
6701      List<SecretManagerSection.DelegationKey> keys,
6702      List<SecretManagerSection.PersistToken> tokens) throws IOException {
6703    dtSecretManager.loadSecretManagerState(new SecretManagerState(s, keys, tokens));
6704  }
6705
6706  /**
6707   * Log the updateMasterKey operation to edit logs
6708   * 
6709   * @param key new delegation key.
6710   */
6711  public void logUpdateMasterKey(DelegationKey key) {
6712    
6713    assert !isInSafeMode() :
6714      "this should never be called while in safemode, since we stop " +
6715      "the DT manager before entering safemode!";
6716    // No need to hold FSN lock since we don't access any internal
6717    // structures, and this is stopped before the FSN shuts itself
6718    // down, etc.
6719    getEditLog().logUpdateMasterKey(key);
6720    getEditLog().logSync();
6721  }
6722  
6723  /**
6724   * Log the cancellation of expired tokens to edit logs
6725   * 
6726   * @param id token identifier to cancel
6727   */
6728  public void logExpireDelegationToken(DelegationTokenIdentifier id) {
6729    assert !isInSafeMode() :
6730      "this should never be called while in safemode, since we stop " +
6731      "the DT manager before entering safemode!";
6732    // No need to hold FSN lock since we don't access any internal
6733    // structures, and this is stopped before the FSN shuts itself
6734    // down, etc.
6735    getEditLog().logCancelDelegationToken(id);
6736  }  
6737  
6738  private void logReassignLease(String leaseHolder, String src,
6739      String newHolder) {
6740    assert hasWriteLock();
6741    getEditLog().logReassignLease(leaseHolder, src, newHolder);
6742  }
6743  
6744  /**
6745   * 
6746   * @return true if delegation token operation is allowed
6747   */
6748  private boolean isAllowedDelegationTokenOp() throws IOException {
6749    AuthenticationMethod authMethod = getConnectionAuthenticationMethod();
6750    if (UserGroupInformation.isSecurityEnabled()
6751        && (authMethod != AuthenticationMethod.KERBEROS)
6752        && (authMethod != AuthenticationMethod.KERBEROS_SSL)
6753        && (authMethod != AuthenticationMethod.CERTIFICATE)) {
6754      return false;
6755    }
6756    return true;
6757  }
6758  
6759  /**
6760   * Returns authentication method used to establish the connection
6761   * @return AuthenticationMethod used to establish connection
6762   * @throws IOException
6763   */
6764  private AuthenticationMethod getConnectionAuthenticationMethod()
6765      throws IOException {
6766    UserGroupInformation ugi = getRemoteUser();
6767    AuthenticationMethod authMethod = ugi.getAuthenticationMethod();
6768    if (authMethod == AuthenticationMethod.PROXY) {
6769      authMethod = ugi.getRealUser().getAuthenticationMethod();
6770    }
6771    return authMethod;
6772  }
6773  
6774  /**
6775   * Client invoked methods are invoked over RPC and will be in 
6776   * RPC call context even if the client exits.
6777   */
6778  boolean isExternalInvocation() {
6779    return Server.isRpcInvocation() || NamenodeWebHdfsMethods.isWebHdfsInvocation();
6780  }
6781
6782  private static InetAddress getRemoteIp() {
6783    InetAddress ip = Server.getRemoteIp();
6784    if (ip != null) {
6785      return ip;
6786    }
6787    return NamenodeWebHdfsMethods.getRemoteIp();
6788  }
6789  
6790  // optimize ugi lookup for RPC operations to avoid a trip through
6791  // UGI.getCurrentUser which is synch'ed
6792  private static UserGroupInformation getRemoteUser() throws IOException {
6793    return NameNode.getRemoteUser();
6794  }
6795  
6796  /**
6797   * Log fsck event in the audit log 
6798   */
6799  void logFsckEvent(String src, InetAddress remoteAddress) throws IOException {
6800    if (isAuditEnabled()) {
6801      logAuditEvent(true, getRemoteUser(),
6802                    remoteAddress,
6803                    "fsck", src, null, null);
6804    }
6805  }
6806  /**
6807   * Register NameNodeMXBean
6808   */
6809  private void registerMXBean() {
6810    mxbeanName = MBeans.register("NameNode", "NameNodeInfo", this);
6811  }
6812
6813  /**
6814   * Class representing Namenode information for JMX interfaces
6815   */
6816  @Override // NameNodeMXBean
6817  public String getVersion() {
6818    return VersionInfo.getVersion() + ", r" + VersionInfo.getRevision();
6819  }
6820
6821  @Override // NameNodeMXBean
6822  public long getUsed() {
6823    return this.getCapacityUsed();
6824  }
6825
6826  @Override // NameNodeMXBean
6827  public long getFree() {
6828    return this.getCapacityRemaining();
6829  }
6830
6831  @Override // NameNodeMXBean
6832  public long getTotal() {
6833    return this.getCapacityTotal();
6834  }
6835
6836  @Override // NameNodeMXBean
6837  public String getSafemode() {
6838    if (!this.isInSafeMode())
6839      return "";
6840    return "Safe mode is ON. " + this.getSafeModeTip();
6841  }
6842
6843  @Override // NameNodeMXBean
6844  public boolean isUpgradeFinalized() {
6845    return this.getFSImage().isUpgradeFinalized();
6846  }
6847
6848  @Override // NameNodeMXBean
6849  public long getNonDfsUsedSpace() {
6850    return datanodeStatistics.getCapacityUsedNonDFS();
6851  }
6852
6853  @Override // NameNodeMXBean
6854  public float getPercentUsed() {
6855    return datanodeStatistics.getCapacityUsedPercent();
6856  }
6857
6858  @Override // NameNodeMXBean
6859  public long getBlockPoolUsedSpace() {
6860    return datanodeStatistics.getBlockPoolUsed();
6861  }
6862
6863  @Override // NameNodeMXBean
6864  public float getPercentBlockPoolUsed() {
6865    return datanodeStatistics.getPercentBlockPoolUsed();
6866  }
6867
6868  @Override // NameNodeMXBean
6869  public float getPercentRemaining() {
6870    return datanodeStatistics.getCapacityRemainingPercent();
6871  }
6872
6873  @Override // NameNodeMXBean
6874  public long getCacheCapacity() {
6875    return datanodeStatistics.getCacheCapacity();
6876  }
6877
6878  @Override // NameNodeMXBean
6879  public long getCacheUsed() {
6880    return datanodeStatistics.getCacheUsed();
6881  }
6882
6883  @Override // NameNodeMXBean
6884  public long getTotalBlocks() {
6885    return getBlocksTotal();
6886  }
6887
6888  @Override // NameNodeMXBean
6889  @Metric
6890  public long getTotalFiles() {
6891    return getFilesTotal();
6892  }
6893
6894  @Override // NameNodeMXBean
6895  public long getNumberOfMissingBlocks() {
6896    return getMissingBlocksCount();
6897  }
6898  
6899  @Override // NameNodeMXBean
6900  public long getNumberOfMissingBlocksWithReplicationFactorOne() {
6901    return getMissingReplOneBlocksCount();
6902  }
6903
6904  @Override // NameNodeMXBean
6905  public int getThreads() {
6906    return ManagementFactory.getThreadMXBean().getThreadCount();
6907  }
6908
6909  /**
6910   * Returned information is a JSON representation of map with host name as the
6911   * key and value is a map of live node attribute keys to its values
6912   */
6913  @Override // NameNodeMXBean
6914  public String getLiveNodes() {
6915    final Map<String, Map<String,Object>> info = 
6916      new HashMap<String, Map<String,Object>>();
6917    final List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
6918    blockManager.getDatanodeManager().fetchDatanodes(live, null, true);
6919    for (DatanodeDescriptor node : live) {
6920      ImmutableMap.Builder<String, Object> innerinfo =
6921          ImmutableMap.<String,Object>builder();
6922      innerinfo
6923          .put("infoAddr", node.getInfoAddr())
6924          .put("infoSecureAddr", node.getInfoSecureAddr())
6925          .put("xferaddr", node.getXferAddr())
6926          .put("lastContact", getLastContact(node))
6927          .put("usedSpace", getDfsUsed(node))
6928          .put("adminState", node.getAdminState().toString())
6929          .put("nonDfsUsedSpace", node.getNonDfsUsed())
6930          .put("capacity", node.getCapacity())
6931          .put("numBlocks", node.numBlocks())
6932          .put("version", node.getSoftwareVersion())
6933          .put("used", node.getDfsUsed())
6934          .put("remaining", node.getRemaining())
6935          .put("blockScheduled", node.getBlocksScheduled())
6936          .put("blockPoolUsed", node.getBlockPoolUsed())
6937          .put("blockPoolUsedPercent", node.getBlockPoolUsedPercent())
6938          .put("volfails", node.getVolumeFailures());
6939      VolumeFailureSummary volumeFailureSummary = node.getVolumeFailureSummary();
6940      if (volumeFailureSummary != null) {
6941        innerinfo
6942            .put("failedStorageLocations",
6943                volumeFailureSummary.getFailedStorageLocations())
6944            .put("lastVolumeFailureDate",
6945                volumeFailureSummary.getLastVolumeFailureDate())
6946            .put("estimatedCapacityLostTotal",
6947                volumeFailureSummary.getEstimatedCapacityLostTotal());
6948      }
6949      info.put(node.getHostName() + ":" + node.getXferPort(), innerinfo.build());
6950    }
6951    return JSON.toString(info);
6952  }
6953
6954  /**
6955   * Returned information is a JSON representation of map with host name as the
6956   * key and value is a map of dead node attribute keys to its values
6957   */
6958  @Override // NameNodeMXBean
6959  public String getDeadNodes() {
6960    final Map<String, Map<String, Object>> info = 
6961      new HashMap<String, Map<String, Object>>();
6962    final List<DatanodeDescriptor> dead = new ArrayList<DatanodeDescriptor>();
6963    blockManager.getDatanodeManager().fetchDatanodes(null, dead, true);
6964    for (DatanodeDescriptor node : dead) {
6965      Map<String, Object> innerinfo = ImmutableMap.<String, Object>builder()
6966          .put("lastContact", getLastContact(node))
6967          .put("decommissioned", node.isDecommissioned())
6968          .put("xferaddr", node.getXferAddr())
6969          .build();
6970      info.put(node.getHostName() + ":" + node.getXferPort(), innerinfo);
6971    }
6972    return JSON.toString(info);
6973  }
6974
6975  /**
6976   * Returned information is a JSON representation of map with host name as the
6977   * key and value is a map of decommissioning node attribute keys to its
6978   * values
6979   */
6980  @Override // NameNodeMXBean
6981  public String getDecomNodes() {
6982    final Map<String, Map<String, Object>> info = 
6983      new HashMap<String, Map<String, Object>>();
6984    final List<DatanodeDescriptor> decomNodeList = blockManager.getDatanodeManager(
6985        ).getDecommissioningNodes();
6986    for (DatanodeDescriptor node : decomNodeList) {
6987      Map<String, Object> innerinfo = ImmutableMap
6988          .<String, Object> builder()
6989          .put("xferaddr", node.getXferAddr())
6990          .put("underReplicatedBlocks",
6991              node.decommissioningStatus.getUnderReplicatedBlocks())
6992          .put("decommissionOnlyReplicas",
6993              node.decommissioningStatus.getDecommissionOnlyReplicas())
6994          .put("underReplicateInOpenFiles",
6995              node.decommissioningStatus.getUnderReplicatedInOpenFiles())
6996          .build();
6997      info.put(node.getHostName() + ":" + node.getXferPort(), innerinfo);
6998    }
6999    return JSON.toString(info);
7000  }
7001
  /** @return seconds since the given live datanode's last heartbeat. */
  private long getLastContact(DatanodeDescriptor alivenode) {
    return (monotonicNow() - alivenode.getLastUpdateMonotonic())/1000;
  }

  /** @return the given datanode's DFS used space, in bytes. */
  private long getDfsUsed(DatanodeDescriptor alivenode) {
    return alivenode.getDfsUsed();
  }
7009
  /** @return the cluster ID from the storage metadata. */
  @Override  // NameNodeMXBean
  public String getClusterId() {
    return getFSImage().getStorage().getClusterID();
  }
  
  /** @return this namesystem's block pool ID. */
  @Override  // NameNodeMXBean
  public String getBlockPoolId() {
    return blockPoolId;
  }
7019  
7020  @Override  // NameNodeMXBean
7021  public String getNameDirStatuses() {
7022    Map<String, Map<File, StorageDirType>> statusMap =
7023      new HashMap<String, Map<File, StorageDirType>>();
7024    
7025    Map<File, StorageDirType> activeDirs = new HashMap<File, StorageDirType>();
7026    for (Iterator<StorageDirectory> it
7027        = getFSImage().getStorage().dirIterator(); it.hasNext();) {
7028      StorageDirectory st = it.next();
7029      activeDirs.put(st.getRoot(), st.getStorageDirType());
7030    }
7031    statusMap.put("active", activeDirs);
7032    
7033    List<Storage.StorageDirectory> removedStorageDirs
7034        = getFSImage().getStorage().getRemovedStorageDirs();
7035    Map<File, StorageDirType> failedDirs = new HashMap<File, StorageDirType>();
7036    for (StorageDirectory st : removedStorageDirs) {
7037      failedDirs.put(st.getRoot(), st.getStorageDirType());
7038    }
7039    statusMap.put("failed", failedDirs);
7040    
7041    return JSON.toString(statusMap);
7042  }
7043
  /**
   * JSON summary (min/median/max/stdDev) of DFS-used percentage across
   * live datanodes. All values are zero when there are no live nodes.
   */
  @Override // NameNodeMXBean
  public String getNodeUsage() {
    float median = 0;
    float max = 0;
    float min = 0;
    float dev = 0;

    final Map<String, Map<String,Object>> info =
        new HashMap<String, Map<String,Object>>();
    final List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
    blockManager.getDatanodeManager().fetchDatanodes(live, null, true);

    if (live.size() > 0) {
      float totalDfsUsed = 0;
      float[] usages = new float[live.size()];
      int i = 0;
      for (DatanodeDescriptor dn : live) {
        usages[i++] = dn.getDfsUsedPercent();
        totalDfsUsed += dn.getDfsUsedPercent();
      }
      // totalDfsUsed now holds the mean usage percentage.
      totalDfsUsed /= live.size();
      Arrays.sort(usages);
      median = usages[usages.length / 2];
      max = usages[usages.length - 1];
      min = usages[0];

      // Population standard deviation around the mean.
      for (i = 0; i < usages.length; i++) {
        dev += (usages[i] - totalDfsUsed) * (usages[i] - totalDfsUsed);
      }
      dev = (float) Math.sqrt(dev / usages.length);
    }

    final Map<String, Object> innerInfo = new HashMap<String, Object>();
    innerInfo.put("min", StringUtils.format("%.2f%%", min));
    innerInfo.put("median", StringUtils.format("%.2f%%", median));
    innerInfo.put("max", StringUtils.format("%.2f%%", max));
    innerInfo.put("stdDev", StringUtils.format("%.2f%%", dev));
    info.put("nodeUsage", innerInfo);

    return JSON.toString(info);
  }
7085
  /**
   * JSON list describing each configured journal: its manager, whether it
   * is required/disabled, and the state of its current output stream.
   */
  @Override  // NameNodeMXBean
  public String getNameJournalStatus() {
    List<Map<String, String>> jasList = new ArrayList<Map<String, String>>();
    FSEditLog log = getFSImage().getEditLog();
    if (log != null) {
      boolean openForWrite = log.isOpenForWrite();
      for (JournalAndStream jas : log.getJournals()) {
        final Map<String, String> jasMap = new HashMap<String, String>();
        String manager = jas.getManager().toString();

        jasMap.put("required", String.valueOf(jas.isRequired()));
        jasMap.put("disabled", String.valueOf(jas.isDisabled()));
        jasMap.put("manager", manager);

        // Describe the stream state: failed, writing, idle, or read-only.
        if (jas.isDisabled()) {
          jasMap.put("stream", "Failed");
        } else if (openForWrite) {
          EditLogOutputStream elos = jas.getCurrentStream();
          if (elos != null) {
            jasMap.put("stream", elos.generateReport());
          } else {
            jasMap.put("stream", "not currently writing");
          }
        } else {
          jasMap.put("stream", "open for read");
        }
        jasList.add(jasMap);
      }
    }
    return JSON.toString(jasList);
  }
7117
7118  @Override // NameNodeMxBean
7119  public String getJournalTransactionInfo() {
7120    Map<String, String> txnIdMap = new HashMap<String, String>();
7121    txnIdMap.put("LastAppliedOrWrittenTxId",
7122        Long.toString(this.getFSImage().getLastAppliedOrWrittenTxId()));
7123    txnIdMap.put("MostRecentCheckpointTxId",
7124        Long.toString(this.getFSImage().getMostRecentCheckpointTxId()));
7125    return JSON.toString(txnIdMap);
7126  }
7127  
  /** @return the NameNode start time as a string. */
  @Override  // NameNodeMXBean
  public String getNNStarted() {
    return getStartTime().toString();
  }

  /** @return build date, user, and branch of this software. */
  @Override  // NameNodeMXBean
  public String getCompileInfo() {
    return VersionInfo.getDate() + " by " + VersionInfo.getUser() +
        " from " + VersionInfo.getBranch();
  }
7138
7139  /** @return the block manager. */
7140  public BlockManager getBlockManager() {
7141    return blockManager;
7142  }
7143
7144  public BlockIdManager getBlockIdManager() {
7145    return blockIdManager;
7146  }
7147
7148  /** @return the FSDirectory. */
7149  public FSDirectory getFSDirectory() {
7150    return dir;
7151  }
  /** Set the FSDirectory. Test hook only. */
  @VisibleForTesting
  public void setFSDirectory(FSDirectory dir) {
    this.dir = dir;
  }
  /** @return the cache manager used by this namesystem. */
  public CacheManager getCacheManager() {
    return cacheManager;
  }
7161
7162  @Override  // NameNodeMXBean
7163  public String getCorruptFiles() {
7164    List<String> list = new ArrayList<String>();
7165    Collection<FSNamesystem.CorruptFileBlockInfo> corruptFileBlocks;
7166    try {
7167      corruptFileBlocks = listCorruptFileBlocks("/", null);
7168      int corruptFileCount = corruptFileBlocks.size();
7169      if (corruptFileCount != 0) {
7170        for (FSNamesystem.CorruptFileBlockInfo c : corruptFileBlocks) {
7171          list.add(c.toString());
7172        }
7173      }
7174    } catch (IOException e) {
7175      LOG.warn("Get corrupt file blocks returned error: " + e.getMessage());
7176    }
7177    return JSON.toString(list);
7178  }
7179
  @Override  // NameNodeMXBean
  public int getDistinctVersionCount() {
    // Number of distinct datanode software versions currently known to the
    // DatanodeManager.
    return blockManager.getDatanodeManager().getDatanodesSoftwareVersions()
      .size();
  }
7185
  @Override  // NameNodeMXBean
  public Map<String, Integer> getDistinctVersions() {
    // Presumably maps software version string -> number of datanodes running
    // it; the exact value semantics live in DatanodeManager — verify there.
    return blockManager.getDatanodeManager().getDatanodesSoftwareVersions();
  }
7190
  @Override  // NameNodeMXBean
  public String getSoftwareVersion() {
    // This NameNode's own compiled-in software version.
    return VersionInfo.getVersion();
  }
7195
7196  /**
7197   * Verifies that the given identifier and password are valid and match.
7198   * @param identifier Token identifier.
7199   * @param password Password in the token.
7200   */
7201  public synchronized void verifyToken(DelegationTokenIdentifier identifier,
7202      byte[] password) throws InvalidToken, RetriableException {
7203    try {
7204      getDelegationTokenSecretManager().verifyToken(identifier, password);
7205    } catch (InvalidToken it) {
7206      if (inTransitionToActive()) {
7207        throw new RetriableException(it);
7208      }
7209      throw it;
7210    }
7211  }
7212  
  @Override
  public boolean isGenStampInFuture(Block block) {
    // Delegates the generation-stamp check to the block ID manager.
    return blockIdManager.isGenStampInFuture(block);
  }
7217
  /** @return the edit log tailer. Test hook only. */
  @VisibleForTesting
  public EditLogTailer getEditLogTailer() {
    return editLogTailer;
  }
7222  
  /** Replace the edit log tailer. Test hook only. */
  @VisibleForTesting
  public void setEditLogTailerForTests(EditLogTailer tailer) {
    this.editLogTailer = tailer;
  }
7227  
  /** Replace the namesystem's coarse-grained lock. Test hook only. */
  @VisibleForTesting
  void setFsLockForTests(ReentrantReadWriteLock lock) {
    this.fsLock.coarseLock = lock;
  }
7232  
  /** @return the namesystem's coarse-grained lock. Test hook only. */
  @VisibleForTesting
  public ReentrantReadWriteLock getFsLockForTests() {
    return fsLock.coarseLock;
  }
7237  
  /** @return the checkpoint lock. Test hook only. */
  @VisibleForTesting
  public ReentrantLock getCpLockForTests() {
    return cpLock;
  }
7242
  /** @return the current safe mode info. Test hook only. */
  @VisibleForTesting
  public SafeModeInfo getSafeModeInfoForTests() {
    return safeMode;
  }
7247  
  /** Replace the NameNode resource checker. Test hook only. */
  @VisibleForTesting
  public void setNNResourceChecker(NameNodeResourceChecker nnResourceChecker) {
    this.nnResourceChecker = nnResourceChecker;
  }
7252
  /** @return the snapshot manager used by this namesystem. */
  public SnapshotManager getSnapshotManager() {
    return snapshotManager;
  }
7256  
  /**
   * Allow snapshot on a directory. Superuser only; audited.
   * @param path directory to make snapshottable
   * @throws IOException if in safe mode or the operation fails
   */
  void allowSnapshot(String path) throws IOException {
    // Fail fast before taking the lock; re-checked under the lock below in
    // case the NN's state changed while acquiring it.
    checkOperation(OperationCategory.WRITE);
    boolean success = false;
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot allow snapshot for " + path);
      checkSuperuserPrivilege();
      FSDirSnapshotOp.allowSnapshot(dir, snapshotManager, path);
      success = true;
    } finally {
      writeUnlock();
    }
    // Sync the edit log outside the write lock to shorten lock hold time.
    getEditLog().logSync();
    logAuditEvent(success, "allowSnapshot", path, null, null);
  }
7274  
  /**
   * Disallow snapshot on a directory. Superuser only; audited.
   * @param path directory that should no longer be snapshottable
   * @throws IOException if in safe mode or the operation fails
   */
  void disallowSnapshot(String path) throws IOException {
    // Fail fast before taking the lock; re-checked under the lock below.
    checkOperation(OperationCategory.WRITE);
    boolean success = false;
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot disallow snapshot for " + path);
      checkSuperuserPrivilege();
      FSDirSnapshotOp.disallowSnapshot(dir, snapshotManager, path);
      success = true;
    } finally {
      writeUnlock();
    }
    // Sync the edit log outside the write lock.
    getEditLog().logSync();
    logAuditEvent(success, "disallowSnapshot", path, null, null);
  }
7292  
7293  /**
7294   * Create a snapshot
7295   * @param snapshotRoot The directory path where the snapshot is taken
7296   * @param snapshotName The name of the snapshot
7297   */
7298  String createSnapshot(String snapshotRoot, String snapshotName,
7299                        boolean logRetryCache) throws IOException {
7300    String snapshotPath = null;
7301    writeLock();
7302    try {
7303      checkOperation(OperationCategory.WRITE);
7304      checkNameNodeSafeMode("Cannot create snapshot for " + snapshotRoot);
7305      snapshotPath = FSDirSnapshotOp.createSnapshot(dir,
7306          snapshotManager, snapshotRoot, snapshotName, logRetryCache);
7307    } finally {
7308      writeUnlock();
7309    }
7310    getEditLog().logSync();
7311    logAuditEvent(snapshotPath != null, "createSnapshot", snapshotRoot,
7312        snapshotPath, null);
7313    return snapshotPath;
7314  }
7315  
7316  /**
7317   * Rename a snapshot
7318   * @param path The directory path where the snapshot was taken
7319   * @param snapshotOldName Old snapshot name
7320   * @param snapshotNewName New snapshot name
7321   * @throws SafeModeException
7322   * @throws IOException 
7323   */
7324  void renameSnapshot(
7325      String path, String snapshotOldName, String snapshotNewName,
7326      boolean logRetryCache) throws IOException {
7327    checkOperation(OperationCategory.WRITE);
7328    boolean success = false;
7329    writeLock();
7330    try {
7331      checkOperation(OperationCategory.WRITE);
7332      checkNameNodeSafeMode("Cannot rename snapshot for " + path);
7333      FSDirSnapshotOp.renameSnapshot(dir, snapshotManager, path,
7334          snapshotOldName, snapshotNewName, logRetryCache);
7335      success = true;
7336    } finally {
7337      writeUnlock();
7338    }
7339    getEditLog().logSync();
7340    String oldSnapshotRoot = Snapshot.getSnapshotPath(path, snapshotOldName);
7341    String newSnapshotRoot = Snapshot.getSnapshotPath(path, snapshotNewName);
7342    logAuditEvent(success, "renameSnapshot", oldSnapshotRoot,
7343        newSnapshotRoot, null);
7344  }
7345  
7346  /**
7347   * Get the list of snapshottable directories that are owned 
7348   * by the current user. Return all the snapshottable directories if the 
7349   * current user is a super user.
7350   * @return The list of all the current snapshottable directories
7351   * @throws IOException
7352   */
7353  public SnapshottableDirectoryStatus[] getSnapshottableDirListing()
7354      throws IOException {
7355    SnapshottableDirectoryStatus[] status = null;
7356    checkOperation(OperationCategory.READ);
7357    boolean success = false;
7358    readLock();
7359    try {
7360      checkOperation(OperationCategory.READ);
7361      status = FSDirSnapshotOp.getSnapshottableDirListing(dir, snapshotManager);
7362      success = true;
7363    } finally {
7364      readUnlock();
7365    }
7366    logAuditEvent(success, "listSnapshottableDirectory", null, null, null);
7367    return status;
7368  }
7369  
7370  /**
7371   * Get the difference between two snapshots (or between a snapshot and the
7372   * current status) of a snapshottable directory.
7373   * 
7374   * @param path The full path of the snapshottable directory.
7375   * @param fromSnapshot Name of the snapshot to calculate the diff from. Null
7376   *          or empty string indicates the current tree.
7377   * @param toSnapshot Name of the snapshot to calculated the diff to. Null or
7378   *          empty string indicates the current tree.
7379   * @return A report about the difference between {@code fromSnapshot} and 
7380   *         {@code toSnapshot}. Modified/deleted/created/renamed files and 
7381   *         directories belonging to the snapshottable directories are listed 
7382   *         and labeled as M/-/+/R respectively. 
7383   * @throws IOException
7384   */
7385  SnapshotDiffReport getSnapshotDiffReport(String path,
7386      String fromSnapshot, String toSnapshot) throws IOException {
7387    SnapshotDiffReport diffs = null;
7388    checkOperation(OperationCategory.READ);
7389    readLock();
7390    try {
7391      checkOperation(OperationCategory.READ);
7392      diffs = FSDirSnapshotOp.getSnapshotDiffReport(dir, snapshotManager,
7393          path, fromSnapshot, toSnapshot);
7394    } finally {
7395      readUnlock();
7396    }
7397
7398    logAuditEvent(diffs != null, "computeSnapshotDiff", null, null, null);
7399    return diffs;
7400  }
7401  
7402  /**
7403   * Delete a snapshot of a snapshottable directory
7404   * @param snapshotRoot The snapshottable directory
7405   * @param snapshotName The name of the to-be-deleted snapshot
7406   * @throws SafeModeException
7407   * @throws IOException
7408   */
7409  void deleteSnapshot(String snapshotRoot, String snapshotName,
7410      boolean logRetryCache) throws IOException {
7411    checkOperation(OperationCategory.WRITE);
7412    boolean success = false;
7413    writeLock();
7414    BlocksMapUpdateInfo blocksToBeDeleted = null;
7415    try {
7416      checkOperation(OperationCategory.WRITE);
7417      checkNameNodeSafeMode("Cannot delete snapshot for " + snapshotRoot);
7418
7419      blocksToBeDeleted = FSDirSnapshotOp.deleteSnapshot(dir, snapshotManager,
7420          snapshotRoot, snapshotName, logRetryCache);
7421      success = true;
7422    } finally {
7423      writeUnlock();
7424    }
7425    getEditLog().logSync();
7426
7427    // Breaking the pattern as removing blocks have to happen outside of the
7428    // global lock
7429    if (blocksToBeDeleted != null) {
7430      removeBlocks(blocksToBeDeleted);
7431    }
7432
7433    String rootPath = Snapshot.getSnapshotPath(snapshotRoot, snapshotName);
7434    logAuditEvent(success, "deleteSnapshot", rootPath, null, null);
7435  }
7436
7437  /**
7438   * Remove a list of INodeDirectorySnapshottable from the SnapshotManager
7439   * @param toRemove the list of INodeDirectorySnapshottable to be removed
7440   */
7441  void removeSnapshottableDirs(List<INodeDirectory> toRemove) {
7442    if (snapshotManager != null) {
7443      snapshotManager.removeSnapshottable(toRemove);
7444    }
7445  }
7446
  /**
   * Query the rolling upgrade status. Superuser only.
   * @return the current rolling upgrade info (with its rollback-image flag
   *         refreshed), or null if no rolling upgrade is in progress
   */
  RollingUpgradeInfo queryRollingUpgrade() throws IOException {
    checkSuperuserPrivilege();
    checkOperation(OperationCategory.READ);
    readLock();
    try {
      if (rollingUpgradeInfo != null) {
        // Refresh the flag from the FSImage before returning the info.
        boolean hasRollbackImage = this.getFSImage().hasRollbackFSImage();
        rollingUpgradeInfo.setCreatedRollbackImages(hasRollbackImage);
      }
      return rollingUpgradeInfo;
    } finally {
      readUnlock();
    }
  }
7461
  /**
   * Start a rolling upgrade, or return the existing info if one is already
   * in progress. Superuser only; audited.
   * @return the rolling upgrade info after the operation
   */
  RollingUpgradeInfo startRollingUpgrade() throws IOException {
    checkSuperuserPrivilege();
    checkOperation(OperationCategory.WRITE);
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      // Idempotent: a second start request returns the existing info.
      if (isRollingUpgrade()) {
        return rollingUpgradeInfo;
      }
      long startTime = now();
      if (!haEnabled) { // for non-HA, we require NN to be in safemode
        startRollingUpgradeInternalForNonHA(startTime);
      } else { // for HA, NN cannot be in safemode
        checkNameNodeSafeMode("Failed to start rolling upgrade");
        startRollingUpgradeInternal(startTime);
      }

      getEditLog().logStartRollingUpgrade(rollingUpgradeInfo.getStartTime());
      if (haEnabled) {
        // roll the edit log to make sure the standby NameNode can tail
        getFSImage().rollEditLog();
      }
    } finally {
      writeUnlock();
    }

    // Sync the edit log outside the write lock.
    getEditLog().logSync();
    if (auditLog.isInfoEnabled() && isExternalInvocation()) {
      logAuditEvent(true, "startRollingUpgrade", null, null, null);
    }
    return rollingUpgradeInfo;
  }
7494
7495  /**
7496   * Update internal state to indicate that a rolling upgrade is in progress.
7497   * @param startTime rolling upgrade start time
7498   */
7499  void startRollingUpgradeInternal(long startTime)
7500      throws IOException {
7501    checkRollingUpgrade("start rolling upgrade");
7502    getFSImage().checkUpgrade(this);
7503    setRollingUpgradeInfo(false, startTime);
7504  }
7505
7506  /**
7507   * Update internal state to indicate that a rolling upgrade is in progress for
7508   * non-HA setup. This requires the namesystem is in SafeMode and after doing a
7509   * checkpoint for rollback the namesystem will quit the safemode automatically 
7510   */
7511  private void startRollingUpgradeInternalForNonHA(long startTime)
7512      throws IOException {
7513    Preconditions.checkState(!haEnabled);
7514    if (!isInSafeMode()) {
7515      throw new IOException("Safe mode should be turned ON "
7516          + "in order to create namespace image.");
7517    }
7518    checkRollingUpgrade("start rolling upgrade");
7519    getFSImage().checkUpgrade(this);
7520    // in non-HA setup, we do an extra checkpoint to generate a rollback image
7521    getFSImage().saveNamespace(this, NameNodeFile.IMAGE_ROLLBACK, null);
7522    LOG.info("Successfully saved namespace for preparing rolling upgrade.");
7523
7524    // leave SafeMode automatically
7525    setSafeMode(SafeModeAction.SAFEMODE_LEAVE);
7526    setRollingUpgradeInfo(true, startTime);
7527  }
7528
  /** Record that a rolling upgrade started at {@code startTime}. */
  void setRollingUpgradeInfo(boolean createdRollbackImages, long startTime) {
    // A finalize time of 0 marks the upgrade as not yet finalized
    // (cf. finalizeRollingUpgradeInternal).
    rollingUpgradeInfo = new RollingUpgradeInfo(blockPoolId,
        createdRollbackImages, startTime, 0L);
  }
7533
7534  public void setCreatedRollbackImages(boolean created) {
7535    if (rollingUpgradeInfo != null) {
7536      rollingUpgradeInfo.setCreatedRollbackImages(created);
7537    }
7538  }
7539
  /** @return the rolling upgrade info, or null if none is in progress. */
  public RollingUpgradeInfo getRollingUpgradeInfo() {
    return rollingUpgradeInfo;
  }
7543
  /** @return whether a rollback fsimage still needs to be created. */
  public boolean isNeedRollbackFsImage() {
    return needRollbackFsImage;
  }
7547
  /** Set whether a rollback fsimage still needs to be created. */
  public void setNeedRollbackFsImage(boolean needRollbackFsImage) {
    this.needRollbackFsImage = needRollbackFsImage;
  }
7551
7552  @Override  // NameNodeMXBean
7553  public RollingUpgradeInfo.Bean getRollingUpgradeStatus() {
7554    RollingUpgradeInfo upgradeInfo = getRollingUpgradeInfo();
7555    if (upgradeInfo != null) {
7556      return new RollingUpgradeInfo.Bean(upgradeInfo);
7557    }
7558    return null;
7559  }
7560
  /** Is rolling upgrade in progress? A non-null info object means yes. */
  public boolean isRollingUpgrade() {
    return rollingUpgradeInfo != null;
  }
7565
7566  void checkRollingUpgrade(String action) throws RollingUpgradeException {
7567    if (isRollingUpgrade()) {
7568      throw new RollingUpgradeException("Failed to " + action
7569          + " since a rolling upgrade is already in progress."
7570          + " Existing rolling upgrade info:\n" + rollingUpgradeInfo);
7571    }
7572  }
7573
  /**
   * Finalize the rolling upgrade in progress. Superuser only; audited.
   * @return the finalized upgrade info, or null if none was in progress
   */
  RollingUpgradeInfo finalizeRollingUpgrade() throws IOException {
    checkSuperuserPrivilege();
    checkOperation(OperationCategory.WRITE);
    writeLock();
    final RollingUpgradeInfo returnInfo;
    try {
      checkOperation(OperationCategory.WRITE);
      if (!isRollingUpgrade()) {
        return null;
      }
      checkNameNodeSafeMode("Failed to finalize rolling upgrade");

      returnInfo = finalizeRollingUpgradeInternal(now());
      getEditLog().logFinalizeRollingUpgrade(returnInfo.getFinalizeTime());
      if (haEnabled) {
        // roll the edit log to make sure the standby NameNode can tail
        getFSImage().rollEditLog();
      }
      getFSImage().updateStorageVersion();
      // Promote the rollback image to the regular image name.
      getFSImage().renameCheckpoint(NameNodeFile.IMAGE_ROLLBACK,
          NameNodeFile.IMAGE);
    } finally {
      writeUnlock();
    }

    if (!haEnabled) {
      // Sync not needed for ha since the edit was rolled after logging.
      getEditLog().logSync();
    }

    if (auditLog.isInfoEnabled() && isExternalInvocation()) {
      logAuditEvent(true, "finalizeRollingUpgrade", null, null, null);
    }
    return returnInfo;
  }
7609
  /**
   * Clear the in-progress rolling upgrade state and build the finalized info.
   * Called with the write lock held (see finalizeRollingUpgrade).
   * @param finalizeTime time at which the upgrade was finalized
   */
  RollingUpgradeInfo finalizeRollingUpgradeInternal(long finalizeTime)
      throws RollingUpgradeException {
    final long startTime = rollingUpgradeInfo.getStartTime();
    rollingUpgradeInfo = null;
    return new RollingUpgradeInfo(blockPoolId, false, startTime, finalizeTime);
  }
7616
  /**
   * Add a new cache directive. Audited.
   * @param directive the directive to add
   * @param flags FORCE skips waiting for a cache rescan
   * @param logRetryCache whether to record this operation in the retry cache
   * @return the id of the added directive, or 0 if none was created
   */
  long addCacheDirective(CacheDirectiveInfo directive,
                         EnumSet<CacheFlag> flags, boolean logRetryCache)
      throws IOException {
    checkOperation(OperationCategory.WRITE);
    CacheDirectiveInfo effectiveDirective = null;
    if (!flags.contains(CacheFlag.FORCE)) {
      // Wait outside the lock; FORCE skips this.
      cacheManager.waitForRescanIfNeeded();
    }
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      if (isInSafeMode()) {
        throw new SafeModeException(
            "Cannot add cache directive", safeMode);
      }
      effectiveDirective = FSNDNCacheOp.addCacheDirective(this, cacheManager,
          directive, flags, logRetryCache);
    } finally {
      writeUnlock();
      // Sync and audit in the finally block so failures are audited too.
      boolean success = effectiveDirective != null;
      if (success) {
        getEditLog().logSync();
      }

      String effectiveDirectiveStr = effectiveDirective != null ?
          effectiveDirective.toString() : null;
      logAuditEvent(success, "addCacheDirective", effectiveDirectiveStr,
          null, null);
    }
    return effectiveDirective != null ? effectiveDirective.getId() : 0;
  }
7648
7649  void modifyCacheDirective(CacheDirectiveInfo directive,
7650      EnumSet<CacheFlag> flags, boolean logRetryCache) throws IOException {
7651    checkOperation(OperationCategory.WRITE);
7652    boolean success = false;
7653    if (!flags.contains(CacheFlag.FORCE)) {
7654      cacheManager.waitForRescanIfNeeded();
7655    }
7656    writeLock();
7657    try {
7658      checkOperation(OperationCategory.WRITE);
7659      if (isInSafeMode()) {
7660        throw new SafeModeException(
7661            "Cannot add cache directive", safeMode);
7662      }
7663      FSNDNCacheOp.modifyCacheDirective(this, cacheManager, directive, flags,
7664          logRetryCache);
7665      success = true;
7666    } finally {
7667      writeUnlock();
7668      if (success) {
7669        getEditLog().logSync();
7670      }
7671      String idStr = "{id: " + directive.getId().toString() + "}";
7672      logAuditEvent(success, "modifyCacheDirective", idStr,
7673          directive.toString(), null);
7674    }
7675  }
7676
  /**
   * Remove a cache directive by id. Audited.
   * @param id id of the directive to remove
   * @param logRetryCache whether to record this operation in the retry cache
   * @throws IOException if in safe mode or the removal fails
   */
  void removeCacheDirective(long id, boolean logRetryCache) throws IOException {
    checkOperation(OperationCategory.WRITE);
    boolean success = false;
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      if (isInSafeMode()) {
        throw new SafeModeException(
            "Cannot remove cache directives", safeMode);
      }
      FSNDNCacheOp.removeCacheDirective(this, cacheManager, id, logRetryCache);
      success = true;
    } finally {
      writeUnlock();
      // Audit in the finally block so failures are audited too.
      String idStr = "{id: " + Long.toString(id) + "}";
      logAuditEvent(success, "removeCacheDirective", idStr, null,
          null);
    }
    getEditLog().logSync();
  }
7697
  /**
   * List cache directives starting after {@code startId}. Audited.
   * @param startId id to resume listing from
   * @param filter directive fields to filter on
   * @return a batch of matching directive entries
   */
  BatchedListEntries<CacheDirectiveEntry> listCacheDirectives(
      long startId, CacheDirectiveInfo filter) throws IOException {
    checkOperation(OperationCategory.READ);
    BatchedListEntries<CacheDirectiveEntry> results;
    cacheManager.waitForRescanIfNeeded();
    readLock();
    boolean success = false;
    try {
      checkOperation(OperationCategory.READ);
      results = FSNDNCacheOp.listCacheDirectives(this, cacheManager, startId,
          filter);
      success = true;
    } finally {
      readUnlock();
      // NOTE(review): filter is dereferenced unconditionally for the audit
      // log; a null filter would NPE here — confirm callers always pass one.
      logAuditEvent(success, "listCacheDirectives", filter.toString(), null,
          null);
    }
    return results;
  }
7717
  /**
   * Add a new cache pool. Audited.
   * @param req the pool to create
   * @param logRetryCache whether to record this operation in the retry cache
   * @throws IOException if in safe mode or the pool cannot be added
   */
  void addCachePool(CachePoolInfo req, boolean logRetryCache)
      throws IOException {
    checkOperation(OperationCategory.WRITE);
    writeLock();
    boolean success = false;
    String poolInfoStr = null;
    try {
      checkOperation(OperationCategory.WRITE);
      if (isInSafeMode()) {
        throw new SafeModeException(
            "Cannot add cache pool " + req.getPoolName(), safeMode);
      }
      CachePoolInfo info = FSNDNCacheOp.addCachePool(this, cacheManager, req,
          logRetryCache);
      poolInfoStr = info.toString();
      success = true;
    } finally {
      writeUnlock();
      // Audit in the finally block so failures are audited too.
      logAuditEvent(success, "addCachePool", poolInfoStr, null, null);
    }
    
    getEditLog().logSync();
  }
7741
  /**
   * Modify an existing cache pool. Audited.
   * @param req the pool fields to change (identified by pool name)
   * @param logRetryCache whether to record this operation in the retry cache
   * @throws IOException if in safe mode or the modification fails
   */
  void modifyCachePool(CachePoolInfo req, boolean logRetryCache)
      throws IOException {
    checkOperation(OperationCategory.WRITE);
    writeLock();
    boolean success = false;
    try {
      checkOperation(OperationCategory.WRITE);
      if (isInSafeMode()) {
        throw new SafeModeException(
            "Cannot modify cache pool " + req.getPoolName(), safeMode);
      }
      FSNDNCacheOp.modifyCachePool(this, cacheManager, req, logRetryCache);
      success = true;
    } finally {
      writeUnlock();
      // Audit in the finally block so failures are audited too.
      String poolNameStr = "{poolName: " +
          (req == null ? null : req.getPoolName()) + "}";
      logAuditEvent(success, "modifyCachePool", poolNameStr,
                    req == null ? null : req.toString(), null);
    }

    getEditLog().logSync();
  }
7765
  /**
   * Remove a cache pool by name. Audited.
   * @param cachePoolName name of the pool to remove
   * @param logRetryCache whether to record this operation in the retry cache
   * @throws IOException if in safe mode or the removal fails
   */
  void removeCachePool(String cachePoolName, boolean logRetryCache)
      throws IOException {
    checkOperation(OperationCategory.WRITE);
    writeLock();
    boolean success = false;
    try {
      checkOperation(OperationCategory.WRITE);
      if (isInSafeMode()) {
        throw new SafeModeException(
            "Cannot remove cache pool " + cachePoolName, safeMode);
      }
      FSNDNCacheOp.removeCachePool(this, cacheManager, cachePoolName,
          logRetryCache);
      success = true;
    } finally {
      writeUnlock();
      // Audit in the finally block so failures are audited too.
      String poolNameStr = "{poolName: " + cachePoolName + "}";
      logAuditEvent(success, "removeCachePool", poolNameStr, null, null);
    }
    
    getEditLog().logSync();
  }
7788
  /**
   * List cache pools starting after {@code prevKey}. Audited.
   * @param prevKey pool name to resume listing from
   * @return a batch of cache pool entries
   */
  BatchedListEntries<CachePoolEntry> listCachePools(String prevKey)
      throws IOException {
    BatchedListEntries<CachePoolEntry> results;
    checkOperation(OperationCategory.READ);
    boolean success = false;
    cacheManager.waitForRescanIfNeeded();
    readLock();
    try {
      checkOperation(OperationCategory.READ);
      results = FSNDNCacheOp.listCachePools(this, cacheManager, prevKey);
      success = true;
    } finally {
      readUnlock();
      logAuditEvent(success, "listCachePools", null, null, null);
    }
    return results;
  }
7806
  /**
   * Apply the given ACL modifications to {@code src}. Audited.
   * @param src path whose ACL entries are modified
   * @param aclSpec the ACL entries to apply
   * @throws IOException if in safe mode or the update fails
   */
  void modifyAclEntries(final String src, List<AclEntry> aclSpec)
      throws IOException {
    HdfsFileStatus auditStat = null;
    checkOperation(OperationCategory.WRITE);
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot modify ACL entries on " + src);
      auditStat = FSDirAclOp.modifyAclEntries(dir, src, aclSpec);
    } catch (AccessControlException e) {
      // Audit the denial, then rethrow for the caller.
      logAuditEvent(false, "modifyAclEntries", src);
      throw e;
    } finally {
      writeUnlock();
    }
    getEditLog().logSync();
    logAuditEvent(true, "modifyAclEntries", src, null, auditStat);
  }
7825
  /**
   * Remove the given ACL entries from {@code src}. Audited.
   * @param src path whose ACL entries are removed
   * @param aclSpec the ACL entries to remove
   * @throws IOException if in safe mode or the update fails
   */
  void removeAclEntries(final String src, List<AclEntry> aclSpec)
      throws IOException {
    checkOperation(OperationCategory.WRITE);
    HdfsFileStatus auditStat = null;
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot remove ACL entries on " + src);
      auditStat = FSDirAclOp.removeAclEntries(dir, src, aclSpec);
    } catch (AccessControlException e) {
      // Audit the denial, then rethrow for the caller.
      logAuditEvent(false, "removeAclEntries", src);
      throw e;
    } finally {
      writeUnlock();
    }
    getEditLog().logSync();
    logAuditEvent(true, "removeAclEntries", src, null, auditStat);
  }
7844
  /**
   * Remove the default ACL entries from {@code src}. Audited.
   * @param src path whose default ACL entries are removed
   * @throws IOException if in safe mode or the update fails
   */
  void removeDefaultAcl(final String src) throws IOException {
    HdfsFileStatus auditStat = null;
    checkOperation(OperationCategory.WRITE);
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot remove default ACL entries on " + src);
      auditStat = FSDirAclOp.removeDefaultAcl(dir, src);
    } catch (AccessControlException e) {
      // Audit the denial, then rethrow for the caller.
      logAuditEvent(false, "removeDefaultAcl", src);
      throw e;
    } finally {
      writeUnlock();
    }
    getEditLog().logSync();
    logAuditEvent(true, "removeDefaultAcl", src, null, auditStat);
  }
7862
  /**
   * Remove the entire ACL from {@code src}. Audited.
   * @param src path whose ACL is removed
   * @throws IOException if in safe mode or the update fails
   */
  void removeAcl(final String src) throws IOException {
    HdfsFileStatus auditStat = null;
    checkOperation(OperationCategory.WRITE);
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot remove ACL on " + src);
      auditStat = FSDirAclOp.removeAcl(dir, src);
    } catch (AccessControlException e) {
      // Audit the denial, then rethrow for the caller.
      logAuditEvent(false, "removeAcl", src);
      throw e;
    } finally {
      writeUnlock();
    }
    getEditLog().logSync();
    logAuditEvent(true, "removeAcl", src, null, auditStat);
  }
7880
  /**
   * Replace the ACL on {@code src} with the given entries. Audited.
   * @param src path whose ACL is set
   * @param aclSpec the full replacement ACL
   * @throws IOException if in safe mode or the update fails
   */
  void setAcl(final String src, List<AclEntry> aclSpec) throws IOException {
    HdfsFileStatus auditStat = null;
    checkOperation(OperationCategory.WRITE);
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot set ACL on " + src);
      auditStat = FSDirAclOp.setAcl(dir, src, aclSpec);
    } catch (AccessControlException e) {
      // Audit the denial, then rethrow for the caller.
      logAuditEvent(false, "setAcl", src);
      throw e;
    } finally {
      writeUnlock();
    }
    getEditLog().logSync();
    logAuditEvent(true, "setAcl", src, null, auditStat);
  }
7898
  /**
   * Get the ACL status of {@code src}. Audited.
   * @param src path to query
   * @return the ACL status
   */
  AclStatus getAclStatus(String src) throws IOException {
    checkOperation(OperationCategory.READ);
    boolean success = false;
    readLock();
    try {
      checkOperation(OperationCategory.READ);
      final AclStatus ret = FSDirAclOp.getAclStatus(dir, src);
      success = true;
      return ret;
    } finally {
      readUnlock();
      logAuditEvent(success, "getAclStatus", src);
    }
  }
7913
7914  /**
7915   * Create an encryption zone on directory src using the specified key.
7916   *
7917   * @param src     the path of a directory which will be the root of the
7918   *                encryption zone. The directory must be empty.
7919   * @param keyName name of a key which must be present in the configured
7920   *                KeyProvider.
7921   * @throws AccessControlException  if the caller is not the superuser.
7922   * @throws UnresolvedLinkException if the path can't be resolved.
7923   * @throws SafeModeException       if the Namenode is in safe mode.
7924   */
7925  void createEncryptionZone(final String src, final String keyName,
7926                            boolean logRetryCache)
7927    throws IOException, UnresolvedLinkException,
7928      SafeModeException, AccessControlException {
7929    try {
7930      if (provider == null) {
7931        throw new IOException(
7932            "Can't create an encryption zone for " + src +
7933            " since no key provider is available.");
7934      }
7935      if (keyName == null || keyName.isEmpty()) {
7936        throw new IOException("Must specify a key name when creating an " +
7937            "encryption zone");
7938      }
7939      KeyProvider.Metadata metadata = provider.getMetadata(keyName);
7940      if (metadata == null) {
7941        /*
7942         * It would be nice if we threw something more specific than
7943         * IOException when the key is not found, but the KeyProvider API
7944         * doesn't provide for that. If that API is ever changed to throw
7945         * something more specific (e.g. UnknownKeyException) then we can
7946         * update this to match it, or better yet, just rethrow the
7947         * KeyProvider's exception.
7948         */
7949        throw new IOException("Key " + keyName + " doesn't exist.");
7950      }
7951      // If the provider supports pool for EDEKs, this will fill in the pool
7952      generateEncryptedDataEncryptionKey(keyName);
7953      createEncryptionZoneInt(src, metadata.getCipher(),
7954          keyName, logRetryCache);
7955    } catch (AccessControlException e) {
7956      logAuditEvent(false, "createEncryptionZone", src);
7957      throw e;
7958    }
7959  }
7960
  /**
   * Internal worker for createEncryptionZone: marks {@code srcArg} as an
   * encryption zone via an xattr and logs the change. Superuser only; audited.
   * @param srcArg  root directory of the new encryption zone
   * @param cipher  cipher name from the key's metadata
   * @param keyName name of the key backing the zone
   * @param logRetryCache whether to record this operation in the retry cache
   */
  private void createEncryptionZoneInt(final String srcArg, String cipher,
      String keyName, final boolean logRetryCache) throws IOException {
    String src = srcArg;
    HdfsFileStatus resultingStat = null;
    checkSuperuserPrivilege();
    checkOperation(OperationCategory.WRITE);
    final byte[][] pathComponents =
      FSDirectory.getPathComponentsForReservedPath(src);
    FSPermissionChecker pc = getPermissionChecker();
    writeLock();
    try {
      // Privilege/operation checks repeated under the lock.
      checkSuperuserPrivilege();
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot create encryption zone on " + src);
      src = dir.resolvePath(pc, src, pathComponents);

      final CipherSuite suite = CipherSuite.convert(cipher);
      // For now this is hardcoded, as we only support one method.
      final CryptoProtocolVersion version =
          CryptoProtocolVersion.ENCRYPTION_ZONES;
      final XAttr ezXAttr = dir.createEncryptionZone(src, suite,
          version, keyName);
      List<XAttr> xAttrs = Lists.newArrayListWithCapacity(1);
      xAttrs.add(ezXAttr);
      getEditLog().logSetXAttrs(src, xAttrs, logRetryCache);
      final INodesInPath iip = dir.getINodesInPath4Write(src, false);
      resultingStat = dir.getAuditFileInfo(iip);
    } finally {
      writeUnlock();
    }
    getEditLog().logSync();
    logAuditEvent(true, "createEncryptionZone", srcArg, null, resultingStat);
  }
7994
7995  /**
7996   * Get the encryption zone for the specified path.
7997   *
7998   * @param srcArg the path of a file or directory to get the EZ for.
7999   * @return the EZ of the of the path or null if none.
8000   * @throws AccessControlException  if the caller is not the superuser.
8001   * @throws UnresolvedLinkException if the path can't be resolved.
8002   */
8003  EncryptionZone getEZForPath(final String srcArg)
8004    throws AccessControlException, UnresolvedLinkException, IOException {
8005    String src = srcArg;
8006    HdfsFileStatus resultingStat = null;
8007    final byte[][] pathComponents =
8008        FSDirectory.getPathComponentsForReservedPath(src);
8009    boolean success = false;
8010    final FSPermissionChecker pc = getPermissionChecker();
8011    checkOperation(OperationCategory.READ);
8012    readLock();
8013    try {
8014      checkOperation(OperationCategory.READ);
8015      src = dir.resolvePath(pc, src, pathComponents);
8016      final INodesInPath iip = dir.getINodesInPath(src, true);
8017      if (isPermissionEnabled) {
8018        dir.checkPathAccess(pc, iip, FsAction.READ);
8019      }
8020      final EncryptionZone ret = dir.getEZForPath(iip);
8021      resultingStat = dir.getAuditFileInfo(iip);
8022      success = true;
8023      return ret;
8024    } finally {
8025      readUnlock();
8026      logAuditEvent(success, "getEZForPath", srcArg, null, resultingStat);
8027    }
8028  }
8029
  /**
   * List one batch of encryption zones, resuming after {@code prevId}
   * (batched-cursor style). Requires superuser privilege.
   *
   * @param prevId id to resume listing after; presumably the last zone id
   *               returned by the previous batch — verify against
   *               FSDirectory#listEncryptionZones
   * @return one batch of encryption zones
   * @throws IOException if the caller is not the superuser or listing fails
   */
  BatchedListEntries<EncryptionZone> listEncryptionZones(long prevId)
      throws IOException {
    boolean success = false;
    // Fast-fail before taking the read lock; both checks are repeated
    // under the lock because conditions (e.g. HA state) may change.
    checkSuperuserPrivilege();
    checkOperation(OperationCategory.READ);
    readLock();
    try {
      checkSuperuserPrivilege();
      checkOperation(OperationCategory.READ);
      final BatchedListEntries<EncryptionZone> ret =
          dir.listEncryptionZones(prevId);
      success = true;
      return ret;
    } finally {
      readUnlock();
      logAuditEvent(success, "listEncryptionZones", null);
    }
  }
8048
  /**
   * Set an extended attribute on a file or directory, logging the change to
   * the edit log and the audit log.
   *
   * @param src path of the target inode
   * @param xAttr the extended attribute (name/value) to set
   * @param flag flags controlling set semantics, passed through to
   *             FSDirXAttrOp
   * @param logRetryCache whether to record this op in the retry cache
   * @throws IOException if the operation fails (e.g. in safe mode)
   */
  void setXAttr(String src, XAttr xAttr, EnumSet<XAttrSetFlag> flag,
                boolean logRetryCache)
      throws IOException {
    checkOperation(OperationCategory.WRITE);
    HdfsFileStatus auditStat = null;
    writeLock();
    try {
      // Re-check under the lock: HA state may have changed.
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot set XAttr on " + src);
      auditStat = FSDirXAttrOp.setXAttr(dir, src, xAttr, flag, logRetryCache);
    } catch (AccessControlException e) {
      // Denied operations are still audited (as failures).
      logAuditEvent(false, "setXAttr", src);
      throw e;
    } finally {
      writeUnlock();
    }
    // Sync outside the lock to avoid blocking other namesystem operations
    // on the edit-log flush.
    getEditLog().logSync();
    logAuditEvent(true, "setXAttr", src, null, auditStat);
  }
8068
  /**
   * Get extended attributes for a path.
   *
   * @param src path of the target inode
   * @param xAttrs the attributes requested; filtering/permission semantics
   *               are delegated to FSDirXAttrOp
   * @return the matching extended attributes
   * @throws IOException if the read fails or access is denied
   */
  List<XAttr> getXAttrs(final String src, List<XAttr> xAttrs)
      throws IOException {
    checkOperation(OperationCategory.READ);
    readLock();
    try {
      // Re-check under the lock: HA state may have changed.
      checkOperation(OperationCategory.READ);
      return FSDirXAttrOp.getXAttrs(dir, src, xAttrs);
    } catch (AccessControlException e) {
      // Denied operations are still audited (as failures).
      logAuditEvent(false, "getXAttrs", src);
      throw e;
    } finally {
      readUnlock();
    }
  }
8083
  /**
   * List the extended attributes visible to the caller for a path.
   *
   * @param src path of the target inode
   * @return the attributes (name-only visibility rules are delegated to
   *         FSDirXAttrOp)
   * @throws IOException if the read fails or access is denied
   */
  List<XAttr> listXAttrs(String src) throws IOException {
    checkOperation(OperationCategory.READ);
    readLock();
    try {
      // Re-check under the lock: HA state may have changed.
      checkOperation(OperationCategory.READ);
      return FSDirXAttrOp.listXAttrs(dir, src);
    } catch (AccessControlException e) {
      // Denied operations are still audited (as failures).
      logAuditEvent(false, "listXAttrs", src);
      throw e;
    } finally {
      readUnlock();
    }
  }
8097
  /**
   * Remove an extended attribute from a file or directory, logging the
   * change to the edit log and the audit log.
   *
   * @param src path of the target inode
   * @param xAttr the extended attribute to remove
   * @param logRetryCache whether to record this op in the retry cache
   * @throws IOException if the operation fails (e.g. in safe mode)
   */
  void removeXAttr(String src, XAttr xAttr, boolean logRetryCache)
      throws IOException {
    checkOperation(OperationCategory.WRITE);
    HdfsFileStatus auditStat = null;
    writeLock();
    try {
      // Re-check under the lock: HA state may have changed.
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot remove XAttr entry on " + src);
      auditStat = FSDirXAttrOp.removeXAttr(dir, src, xAttr, logRetryCache);
    } catch (AccessControlException e) {
      // Denied operations are still audited (as failures).
      logAuditEvent(false, "removeXAttr", src);
      throw e;
    } finally {
      writeUnlock();
    }
    // Sync outside the lock to avoid blocking other namesystem operations
    // on the edit-log flush.
    getEditLog().logSync();
    logAuditEvent(true, "removeXAttr", src, null, auditStat);
  }
8116
8117  void checkAccess(String src, FsAction mode) throws IOException {
8118    checkOperation(OperationCategory.READ);
8119    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
8120    readLock();
8121    try {
8122      checkOperation(OperationCategory.READ);
8123      src = FSDirectory.resolvePath(src, pathComponents, dir);
8124      final INodesInPath iip = dir.getINodesInPath(src, true);
8125      INode inode = iip.getLastINode();
8126      if (inode == null) {
8127        throw new FileNotFoundException("Path not found");
8128      }
8129      if (isPermissionEnabled) {
8130        FSPermissionChecker pc = getPermissionChecker();
8131        dir.checkPathAccess(pc, iip, mode);
8132      }
8133    } catch (AccessControlException e) {
8134      logAuditEvent(false, "checkAccess", src);
8135      throw e;
8136    } finally {
8137      readUnlock();
8138    }
8139  }
8140
8141  /**
8142   * Default AuditLogger implementation; used when no access logger is
8143   * defined in the config file. It can also be explicitly listed in the
8144   * config file.
8145   */
8146  private static class DefaultAuditLogger extends HdfsAuditLogger {
8147
8148    private boolean logTokenTrackingId;
8149
8150    @Override
8151    public void initialize(Configuration conf) {
8152      logTokenTrackingId = conf.getBoolean(
8153          DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_KEY,
8154          DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_DEFAULT);
8155    }
8156
8157    @Override
8158    public void logAuditEvent(boolean succeeded, String userName,
8159        InetAddress addr, String cmd, String src, String dst,
8160        FileStatus status, UserGroupInformation ugi,
8161        DelegationTokenSecretManager dtSecretManager) {
8162      if (auditLog.isInfoEnabled()) {
8163        final StringBuilder sb = auditBuffer.get();
8164        sb.setLength(0);
8165        sb.append("allowed=").append(succeeded).append("\t");
8166        sb.append("ugi=").append(userName).append("\t");
8167        sb.append("ip=").append(addr).append("\t");
8168        sb.append("cmd=").append(cmd).append("\t");
8169        sb.append("src=").append(src).append("\t");
8170        sb.append("dst=").append(dst).append("\t");
8171        if (null == status) {
8172          sb.append("perm=null");
8173        } else {
8174          sb.append("perm=");
8175          sb.append(status.getOwner()).append(":");
8176          sb.append(status.getGroup()).append(":");
8177          sb.append(status.getPermission());
8178        }
8179        if (logTokenTrackingId) {
8180          sb.append("\t").append("trackingId=");
8181          String trackingId = null;
8182          if (ugi != null && dtSecretManager != null
8183              && ugi.getAuthenticationMethod() == AuthenticationMethod.TOKEN) {
8184            for (TokenIdentifier tid: ugi.getTokenIdentifiers()) {
8185              if (tid instanceof DelegationTokenIdentifier) {
8186                DelegationTokenIdentifier dtid =
8187                    (DelegationTokenIdentifier)tid;
8188                trackingId = dtSecretManager.getTokenTrackingId(dtid);
8189                break;
8190              }
8191            }
8192          }
8193          sb.append(trackingId);
8194        }
8195        sb.append("\t").append("proto=");
8196        sb.append(NamenodeWebHdfsMethods.isWebHdfsInvocation() ? "webhdfs" : "rpc");
8197        logAuditMessage(sb.toString());
8198      }
8199    }
8200
8201    public void logAuditMessage(String message) {
8202      auditLog.info(message);
8203    }
8204  }
8205
8206  private static void enableAsyncAuditLog() {
8207    if (!(auditLog instanceof Log4JLogger)) {
8208      LOG.warn("Log4j is required to enable async auditlog");
8209      return;
8210    }
8211    Logger logger = ((Log4JLogger)auditLog).getLogger();
8212    @SuppressWarnings("unchecked")
8213    List<Appender> appenders = Collections.list(logger.getAllAppenders());
8214    // failsafe against trying to async it more than once
8215    if (!appenders.isEmpty() && !(appenders.get(0) instanceof AsyncAppender)) {
8216      AsyncAppender asyncAppender = new AsyncAppender();
8217      // change logger to have an async appender containing all the
8218      // previously configured appenders
8219      for (Appender appender : appenders) {
8220        logger.removeAppender(appender);
8221        asyncAppender.addAppender(appender);
8222      }
8223      logger.addAppender(asyncAppender);        
8224    }
8225  }
8226
8227}
8228