001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hdfs.server.namenode;
019
020import static org.apache.hadoop.crypto.key.KeyProviderCryptoExtension.EncryptedKeyVersion;
021import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_TRASH_INTERVAL_DEFAULT;
022import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_TRASH_INTERVAL_KEY;
023import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_FILE_BUFFER_SIZE_DEFAULT;
024import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_FILE_BUFFER_SIZE_KEY;
025import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BLOCK_SIZE_DEFAULT;
026import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BLOCK_SIZE_KEY;
027import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BYTES_PER_CHECKSUM_DEFAULT;
028import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BYTES_PER_CHECKSUM_KEY;
029import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CHECKSUM_TYPE_DEFAULT;
030import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CHECKSUM_TYPE_KEY;
031import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_WRITE_PACKET_SIZE_DEFAULT;
032import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_WRITE_PACKET_SIZE_KEY;
033import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_ENCRYPT_DATA_TRANSFER_DEFAULT;
034import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_ENCRYPT_DATA_TRANSFER_KEY;
035import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_STANDBY_CHECKPOINTS_DEFAULT;
036import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_STANDBY_CHECKPOINTS_KEY;
037import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ACCESSTIME_PRECISION_DEFAULT;
038import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ACCESSTIME_PRECISION_KEY;
039import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOGGERS_KEY;
040import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_ASYNC_DEFAULT;
041import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_ASYNC_KEY;
042import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_DEFAULT;
043import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_KEY;
044import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT;
045import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_KEY;
046import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DEFAULT_AUDIT_LOGGER_NAME;
047import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_DEFAULT;
048import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_KEY;
049import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_DEFAULT;
050import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY;
051import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_DEFAULT;
052import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_KEY;
053import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_DEFAULT;
054import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_KEY;
055import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY;
056import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY;
057import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDIT_LOG_AUTOROLL_CHECK_INTERVAL_MS;
058import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDIT_LOG_AUTOROLL_CHECK_INTERVAL_MS_DEFAULT;
059import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDIT_LOG_AUTOROLL_MULTIPLIER_THRESHOLD;
060import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDIT_LOG_AUTOROLL_MULTIPLIER_THRESHOLD_DEFAULT;
061import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ENABLE_RETRY_CACHE_DEFAULT;
062import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ENABLE_RETRY_CACHE_KEY;
063import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_LAZY_PERSIST_FILE_SCRUB_INTERVAL_SEC;
064import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_LAZY_PERSIST_FILE_SCRUB_INTERVAL_SEC_DEFAULT;
065import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_INODE_ATTRIBUTES_PROVIDER_KEY;
066import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_MAX_OBJECTS_DEFAULT;
067import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_MAX_OBJECTS_KEY;
068import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_NAME_DIR_KEY;
069import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_REPLICATION_MIN_DEFAULT;
070import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_REPLICATION_MIN_KEY;
071import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_REPL_QUEUE_THRESHOLD_PCT_KEY;
072import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_DEFAULT;
073import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_KEY;
074import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_DEFAULT;
075import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_KEY;
076import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_HEAP_PERCENT_DEFAULT;
077import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_HEAP_PERCENT_KEY;
078import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_EXTENSION_KEY;
079import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_MIN_DATANODES_DEFAULT;
080import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY;
081import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_DEFAULT;
082import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_KEY;
083import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SHARED_EDITS_DIR_KEY;
084import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_ENABLED_DEFAULT;
085import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_ENABLED_KEY;
086import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_SUPERUSERGROUP_DEFAULT;
087import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_SUPERUSERGROUP_KEY;
088import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_REPLICATION_DEFAULT;
089import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_REPLICATION_KEY;
090import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_SUPPORT_APPEND_DEFAULT;
091import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_SUPPORT_APPEND_KEY;
092import static org.apache.hadoop.hdfs.server.common.HdfsServerConstants.SECURITY_XATTR_UNREADABLE_BY_SUPERUSER;
093import static org.apache.hadoop.util.Time.now;
094import static org.apache.hadoop.util.Time.monotonicNow;
095import static org.apache.hadoop.hdfs.server.namenode.top.metrics.TopMetrics.TOPMETRICS_METRICS_SOURCE_NAME;
096
097import java.io.BufferedWriter;
098import java.io.ByteArrayInputStream;
099import java.io.DataInput;
100import java.io.DataInputStream;
101import java.io.DataOutputStream;
102import java.io.File;
103import java.io.FileNotFoundException;
104import java.io.FileOutputStream;
105import java.io.IOException;
106import java.io.OutputStreamWriter;
107import java.io.PrintWriter;
108import java.io.StringWriter;
109import java.lang.management.ManagementFactory;
110import java.net.InetAddress;
111import java.net.URI;
112import java.security.GeneralSecurityException;
113import java.util.ArrayList;
114import java.util.Arrays;
115import java.util.Collection;
116import java.util.Collections;
117import java.util.Date;
118import java.util.EnumSet;
119import java.util.HashMap;
120import java.util.HashSet;
121import java.util.Iterator;
122import java.util.LinkedHashSet;
123import java.util.List;
124import java.util.Map;
125import java.util.Set;
126import java.util.TreeMap;
127import java.util.concurrent.TimeUnit;
128import java.util.concurrent.locks.Condition;
129import java.util.concurrent.locks.ReentrantLock;
130import java.util.concurrent.locks.ReentrantReadWriteLock;
131
132import javax.management.NotCompliantMBeanException;
133import javax.management.ObjectName;
134import javax.management.StandardMBean;
135
136import org.apache.commons.logging.Log;
137import org.apache.commons.logging.LogFactory;
138import org.apache.commons.logging.impl.Log4JLogger;
139import org.apache.hadoop.HadoopIllegalArgumentException;
140import org.apache.hadoop.classification.InterfaceAudience;
141import org.apache.hadoop.conf.Configuration;
142import org.apache.hadoop.crypto.CipherSuite;
143import org.apache.hadoop.crypto.CryptoProtocolVersion;
144import org.apache.hadoop.crypto.key.KeyProvider;
145import org.apache.hadoop.crypto.key.KeyProviderCryptoExtension;
146import org.apache.hadoop.fs.BatchedRemoteIterator.BatchedListEntries;
147import org.apache.hadoop.fs.CacheFlag;
148import org.apache.hadoop.fs.ContentSummary;
149import org.apache.hadoop.fs.CreateFlag;
150import org.apache.hadoop.fs.FileAlreadyExistsException;
151import org.apache.hadoop.fs.FileEncryptionInfo;
152import org.apache.hadoop.fs.FileStatus;
153import org.apache.hadoop.fs.FileSystem;
154import org.apache.hadoop.fs.FsServerDefaults;
155import org.apache.hadoop.fs.InvalidPathException;
156import org.apache.hadoop.fs.Options;
157import org.apache.hadoop.fs.ParentNotDirectoryException;
158import org.apache.hadoop.fs.Path;
159import org.apache.hadoop.fs.UnresolvedLinkException;
160import org.apache.hadoop.fs.XAttr;
161import org.apache.hadoop.fs.XAttrSetFlag;
162import org.apache.hadoop.fs.permission.AclEntry;
163import org.apache.hadoop.fs.permission.AclStatus;
164import org.apache.hadoop.fs.permission.FsAction;
165import org.apache.hadoop.fs.permission.FsPermission;
166import org.apache.hadoop.fs.permission.PermissionStatus;
167import org.apache.hadoop.fs.StorageType;
168import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
169import org.apache.hadoop.ha.ServiceFailedException;
170import org.apache.hadoop.hdfs.protocol.BlockStoragePolicy;
171import org.apache.hadoop.hdfs.DFSConfigKeys;
172import org.apache.hadoop.hdfs.DFSUtil;
173import org.apache.hadoop.hdfs.HAUtil;
174import org.apache.hadoop.hdfs.HdfsConfiguration;
175import org.apache.hadoop.hdfs.UnknownCryptoProtocolVersionException;
176import org.apache.hadoop.hdfs.XAttrHelper;
177import org.apache.hadoop.hdfs.protocol.AlreadyBeingCreatedException;
178import org.apache.hadoop.hdfs.protocol.Block;
179import org.apache.hadoop.hdfs.protocol.CacheDirectiveEntry;
180import org.apache.hadoop.hdfs.protocol.CacheDirectiveInfo;
181import org.apache.hadoop.hdfs.protocol.CachePoolEntry;
182import org.apache.hadoop.hdfs.protocol.CachePoolInfo;
183import org.apache.hadoop.hdfs.protocol.ClientProtocol;
184import org.apache.hadoop.hdfs.protocol.DatanodeID;
185import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
186import org.apache.hadoop.hdfs.protocol.DirectoryListing;
187import org.apache.hadoop.hdfs.protocol.EncryptionZone;
188import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
189import org.apache.hadoop.hdfs.protocol.HdfsConstants;
190import org.apache.hadoop.hdfs.protocol.LastBlockWithStatus;
191import org.apache.hadoop.hdfs.protocol.HdfsConstants.DatanodeReportType;
192import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction;
193import org.apache.hadoop.hdfs.protocol.HdfsFileStatus;
194import org.apache.hadoop.hdfs.protocol.LocatedBlock;
195import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
196import org.apache.hadoop.hdfs.protocol.QuotaExceededException;
197import org.apache.hadoop.hdfs.protocol.RecoveryInProgressException;
198import org.apache.hadoop.hdfs.protocol.RollingUpgradeException;
199import org.apache.hadoop.hdfs.protocol.RollingUpgradeInfo;
200import org.apache.hadoop.hdfs.protocol.SnapshotAccessControlException;
201import org.apache.hadoop.hdfs.protocol.SnapshotDiffReport;
202import org.apache.hadoop.hdfs.protocol.SnapshottableDirectoryStatus;
203import org.apache.hadoop.hdfs.protocol.datatransfer.ReplaceDatanodeOnFailure;
204import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager;
205import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager.AccessMode;
206import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier;
207import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenSecretManager;
208import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenSecretManager.SecretManagerState;
209import org.apache.hadoop.hdfs.server.blockmanagement.BlockCollection;
210import org.apache.hadoop.hdfs.server.blockmanagement.BlockIdManager;
211import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoContiguous;
212import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoContiguousUnderConstruction;
213import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;
214import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor;
215import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeManager;
216import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeStatistics;
217import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeStorageInfo;
218import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.BlockUCState;
219import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.NamenodeRole;
220import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.RollingUpgradeStartupOption;
221import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption;
222import org.apache.hadoop.hdfs.server.common.Storage;
223import org.apache.hadoop.hdfs.server.common.Storage.StorageDirType;
224import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
225import org.apache.hadoop.hdfs.server.common.Util;
226import org.apache.hadoop.hdfs.server.namenode.FsImageProto.SecretManagerSection;
227import org.apache.hadoop.hdfs.server.namenode.INode.BlocksMapUpdateInfo;
228import org.apache.hadoop.hdfs.server.namenode.JournalSet.JournalAndStream;
229import org.apache.hadoop.hdfs.server.namenode.LeaseManager.Lease;
230import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeFile;
231import org.apache.hadoop.hdfs.server.namenode.NameNode.OperationCategory;
232import org.apache.hadoop.hdfs.server.namenode.ha.EditLogTailer;
233import org.apache.hadoop.hdfs.server.namenode.ha.HAContext;
234import org.apache.hadoop.hdfs.server.namenode.ha.StandbyCheckpointer;
235import org.apache.hadoop.hdfs.server.namenode.metrics.FSNamesystemMBean;
236import org.apache.hadoop.hdfs.server.namenode.metrics.NameNodeMetrics;
237import org.apache.hadoop.hdfs.server.namenode.snapshot.Snapshot;
238import org.apache.hadoop.hdfs.server.namenode.snapshot.SnapshotManager;
239import org.apache.hadoop.hdfs.server.namenode.startupprogress.Phase;
240import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress;
241import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress.Counter;
242import org.apache.hadoop.hdfs.server.namenode.startupprogress.Status;
243import org.apache.hadoop.hdfs.server.namenode.startupprogress.Step;
244import org.apache.hadoop.hdfs.server.namenode.startupprogress.StepType;
245import org.apache.hadoop.hdfs.server.namenode.top.TopAuditLogger;
246import org.apache.hadoop.hdfs.server.namenode.top.TopConf;
247import org.apache.hadoop.hdfs.server.namenode.top.metrics.TopMetrics;
248import org.apache.hadoop.hdfs.server.namenode.top.window.RollingWindowManager;
249import org.apache.hadoop.hdfs.server.namenode.web.resources.NamenodeWebHdfsMethods;
250import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand;
251import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
252import org.apache.hadoop.hdfs.server.protocol.DatanodeStorageReport;
253import org.apache.hadoop.hdfs.server.protocol.HeartbeatResponse;
254import org.apache.hadoop.hdfs.server.protocol.NNHAStatusHeartbeat;
255import org.apache.hadoop.hdfs.server.protocol.NamenodeCommand;
256import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration;
257import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
258import org.apache.hadoop.hdfs.server.protocol.StorageReceivedDeletedBlocks;
259import org.apache.hadoop.hdfs.server.protocol.StorageReport;
260import org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary;
261import org.apache.hadoop.io.EnumSetWritable;
262import org.apache.hadoop.io.IOUtils;
263import org.apache.hadoop.io.Text;
264import org.apache.hadoop.ipc.RetriableException;
265import org.apache.hadoop.ipc.RetryCache;
266import org.apache.hadoop.ipc.Server;
267import org.apache.hadoop.ipc.StandbyException;
268import org.apache.hadoop.metrics2.annotation.Metric;
269import org.apache.hadoop.metrics2.annotation.Metrics;
270import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
271import org.apache.hadoop.metrics2.lib.MetricsRegistry;
272import org.apache.hadoop.metrics2.lib.MutableRatesWithAggregation;
273import org.apache.hadoop.metrics2.util.MBeans;
274import org.apache.hadoop.net.NetworkTopology;
275import org.apache.hadoop.net.Node;
276import org.apache.hadoop.net.NodeBase;
277import org.apache.hadoop.security.AccessControlException;
278import org.apache.hadoop.security.UserGroupInformation;
279import org.apache.hadoop.security.UserGroupInformation.AuthenticationMethod;
280import org.apache.hadoop.security.token.SecretManager.InvalidToken;
281import org.apache.hadoop.security.token.Token;
282import org.apache.hadoop.security.token.TokenIdentifier;
283import org.apache.hadoop.security.token.delegation.DelegationKey;
284import org.apache.hadoop.util.ChunkedArrayList;
285import org.apache.hadoop.util.Daemon;
286import org.apache.hadoop.util.DataChecksum;
287import org.apache.hadoop.util.ReflectionUtils;
288import org.apache.hadoop.util.StringUtils;
289import org.apache.hadoop.util.VersionInfo;
290import org.apache.log4j.Appender;
291import org.apache.log4j.AsyncAppender;
292import org.apache.log4j.Logger;
293import org.codehaus.jackson.map.ObjectMapper;
294import org.mortbay.util.ajax.JSON;
295
296import com.google.common.annotations.VisibleForTesting;
297import com.google.common.base.Charsets;
298import com.google.common.base.Preconditions;
299import com.google.common.collect.ImmutableMap;
300import com.google.common.collect.Lists;
301
302/***************************************************
303 * FSNamesystem does the actual bookkeeping work for the
304 * DataNode.
305 *
306 * It tracks several important tables.
307 *
308 * 1)  valid fsname --> blocklist  (kept on disk, logged)
309 * 2)  Set of all valid blocks (inverted #1)
310 * 3)  block --> machinelist (kept in memory, rebuilt dynamically from reports)
311 * 4)  machine --> blocklist (inverted #2)
312 * 5)  LRU cache of updated-heartbeat machines
313 ***************************************************/
314@InterfaceAudience.Private
315@Metrics(context="dfs")
316public class FSNamesystem implements Namesystem, FSNamesystemMBean,
317  NameNodeMXBean {
318  public static final Log LOG = LogFactory.getLog(FSNamesystem.class);
319  private final MetricsRegistry registry = new MetricsRegistry("FSNamesystem");
320  @Metric final MutableRatesWithAggregation detailedLockHoldTimeMetrics =
321      registry.newRatesWithAggregation("detailedLockHoldTimeMetrics");
322
  // Per-thread reusable buffer, presumably used to assemble audit log lines
  // without a new StringBuilder per event -- confirm at the usage sites,
  // which are not visible in this chunk.
  private static final ThreadLocal<StringBuilder> auditBuffer =
    new ThreadLocal<StringBuilder>() {
      @Override
      protected StringBuilder initialValue() {
        return new StringBuilder();
      }
  };
330
331  private final BlockIdManager blockIdManager;
332
333  @VisibleForTesting
334  public boolean isAuditEnabled() {
335    return !isDefaultAuditLogger || auditLog.isInfoEnabled();
336  }
337
338  private void logAuditEvent(boolean succeeded, String cmd, String src)
339      throws IOException {
340    logAuditEvent(succeeded, cmd, src, null, null);
341  }
342  
343  private void logAuditEvent(boolean succeeded, String cmd, String src,
344      String dst, HdfsFileStatus stat) throws IOException {
345    if (isAuditEnabled() && isExternalInvocation()) {
346      logAuditEvent(succeeded, getRemoteUser(), getRemoteIp(),
347                    cmd, src, dst, stat);
348    }
349  }
350
351  private void logAuditEvent(boolean succeeded,
352      UserGroupInformation ugi, InetAddress addr, String cmd, String src,
353      String dst, HdfsFileStatus stat) {
354    FileStatus status = null;
355    if (stat != null) {
356      Path symlink = stat.isSymlink() ? new Path(stat.getSymlink()) : null;
357      Path path = dst != null ? new Path(dst) : new Path(src);
358      status = new FileStatus(stat.getLen(), stat.isDir(),
359          stat.getReplication(), stat.getBlockSize(), stat.getModificationTime(),
360          stat.getAccessTime(), stat.getPermission(), stat.getOwner(),
361          stat.getGroup(), symlink, path);
362    }
363    for (AuditLogger logger : auditLoggers) {
364      if (logger instanceof HdfsAuditLogger) {
365        HdfsAuditLogger hdfsLogger = (HdfsAuditLogger) logger;
366        hdfsLogger.logAuditEvent(succeeded, ugi.toString(), addr, cmd, src, dst,
367            status, ugi, dtSecretManager);
368      } else {
369        logger.logAuditEvent(succeeded, ugi.toString(), addr,
370            cmd, src, dst, status);
371      }
372    }
373  }
374
375  /**
376   * Logger for audit events, noting successful FSNamesystem operations. Emits
377   * to FSNamesystem.audit at INFO. Each event causes a set of tab-separated
378   * <code>key=value</code> pairs to be written for the following properties:
379   * <code>
380   * ugi=&lt;ugi in RPC&gt;
381   * ip=&lt;remote IP&gt;
382   * cmd=&lt;command&gt;
383   * src=&lt;src path&gt;
384   * dst=&lt;dst path (optional)&gt;
385   * perm=&lt;permissions (optional)&gt;
386   * </code>
387   */
388  public static final Log auditLog = LogFactory.getLog(
389      FSNamesystem.class.getName() + ".audit");
390
  static final int DEFAULT_MAX_CORRUPT_FILEBLOCKS_RETURNED = 100;
  // Non-final and package-visible, presumably so tests can tune the block
  // deletion batch size -- TODO confirm at the usage sites.
  static int BLOCK_DELETION_INCREMENT = 1000;
  private final boolean isPermissionEnabled;
  // Presumably the UGI the NameNode process runs as -- confirm where it is
  // initialized (constructor not visible in this chunk).
  private final UserGroupInformation fsOwner;
  private final String supergroup;
  private final boolean standbyShouldCheckpoint;
397  
398  // Scan interval is not configurable.
399  private static final long DELEGATION_TOKEN_REMOVER_SCAN_INTERVAL =
400    TimeUnit.MILLISECONDS.convert(1, TimeUnit.HOURS);
401  final DelegationTokenSecretManager dtSecretManager;
402  private final boolean alwaysUseDelegationTokensForTests;
403
404  private static final Step STEP_AWAITING_REPORTED_BLOCKS =
405    new Step(StepType.AWAITING_REPORTED_BLOCKS);
406
407  // Tracks whether the default audit logger is the only configured audit
408  // logger; this allows isAuditEnabled() to return false in case the
409  // underlying logger is disabled, and avoid some unnecessary work.
410  private final boolean isDefaultAuditLogger;
411  private final List<AuditLogger> auditLoggers;
412
413  /** The namespace tree. */
414  FSDirectory dir;
415  private final BlockManager blockManager;
416  private final SnapshotManager snapshotManager;
417  private final CacheManager cacheManager;
418  private final DatanodeStatistics datanodeStatistics;
419
420  private String nameserviceId;
421
422  private volatile RollingUpgradeInfo rollingUpgradeInfo = null;
423  /**
424   * A flag that indicates whether the checkpointer should checkpoint a rollback
425   * fsimage. The edit log tailer sets this flag. The checkpoint will create a
426   * rollback fsimage if the flag is true, and then change the flag to false.
427   */
428  private volatile boolean needRollbackFsImage;
429
430  // Block pool ID used by this namenode
431  private String blockPoolId;
432
433  final LeaseManager leaseManager = new LeaseManager(this); 
434
435  volatile Daemon smmthread = null;  // SafeModeMonitor thread
436  
437  Daemon nnrmthread = null; // NamenodeResourceMonitor thread
438
439  Daemon nnEditLogRoller = null; // NameNodeEditLogRoller thread
440
441  // A daemon to periodically clean up corrupt lazyPersist files
442  // from the name space.
443  Daemon lazyPersistFileScrubber = null;
444  /**
445   * When an active namenode will roll its own edit log, in # edits
446   */
447  private final long editLogRollerThreshold;
448  /**
449   * Check interval of an active namenode's edit log roller thread 
450   */
451  private final int editLogRollerInterval;
452
453  /**
454   * How frequently we scan and unlink corrupt lazyPersist files.
455   * (In seconds)
456   */
457  private final int lazyPersistFileScrubIntervalSec;
458
459  private volatile boolean hasResourcesAvailable = false;
460  private volatile boolean fsRunning = true;
461  
462  /** The start time of the namesystem. */
463  private final long startTime = now();
464
465  /** The interval of namenode checking for the disk space availability */
466  private final long resourceRecheckInterval;
467
468  // The actual resource checker instance.
469  NameNodeResourceChecker nnResourceChecker;
470
471  private final FsServerDefaults serverDefaults;
472  private final boolean supportAppends;
473  private final ReplaceDatanodeOnFailure dtpReplaceDatanodeOnFailure;
474
475  private volatile SafeModeInfo safeMode;  // safe mode information
476
477  private final long maxFsObjects;          // maximum number of fs objects
478
479  private final long minBlockSize;         // minimum block size
480  private final long maxBlocksPerFile;     // maximum # of blocks per file
481
482  // precision of access times.
483  private final long accessTimePrecision;
484
485  /** Lock to protect FSNamesystem. */
486  private final FSNamesystemLock fsLock;
487
488  /** 
489   * Checkpoint lock to protect FSNamesystem modification on standby NNs.
490   * Unlike fsLock, it does not affect block updates. On active NNs, this lock
491   * does not provide proper protection, because there are operations that
492   * modify both block and name system state.  Even on standby, fsLock is 
493   * used when block state changes need to be blocked.
494   */
495  private final ReentrantLock cpLock;
496
497  /**
498   * Used when this NN is in standby state to read from the shared edit log.
499   */
500  private EditLogTailer editLogTailer = null;
501
502  /**
503   * Used when this NN is in standby state to perform checkpoints.
504   */
505  private StandbyCheckpointer standbyCheckpointer;
506
507  /**
508   * Reference to the NN's HAContext object. This is only set once
509   * {@link #startCommonServices(Configuration, HAContext)} is called. 
510   */
511  private HAContext haContext;
512
513  private final boolean haEnabled;
514
515  /** flag indicating whether replication queues have been initialized */
516  boolean initializedReplQueues = false;
517
518  /**
519   * Whether the namenode is in the middle of starting the active service
520   */
521  private volatile boolean startingActiveService = false;
522
523  private final RetryCache retryCache;
524
525  private KeyProviderCryptoExtension provider = null;
526
527  private volatile boolean imageLoaded = false;
528  private final Condition cond;
529
530  private final FSImage fsImage;
531
532  private final TopConf topConf;
533  private TopMetrics topMetrics;
534
535  private INodeAttributeProvider inodeAttributeProvider;
536
537  /**
538   * Notify that loading of this FSDirectory is complete, and
539   * it is imageLoaded for use
540   */
541  void imageLoadComplete() {
542    Preconditions.checkState(!imageLoaded, "FSDirectory already loaded");
543    setImageLoaded();
544  }
545
546  void setImageLoaded() {
547    if(imageLoaded) return;
548    writeLock();
549    try {
550      setImageLoaded(true);
551      dir.markNameCacheInitialized();
552      cond.signalAll();
553    } finally {
554      writeUnlock("setImageLoaded");
555    }
556  }
557
  //This is for testing purposes only
  @VisibleForTesting
  boolean isImageLoaded() {
    // Volatile read; no lock needed for a plain visibility check.
    return imageLoaded;
  }
563
  // exposed for unit tests; also used internally by clear() (sets false)
  // and setImageLoaded() (sets true)
  protected void setImageLoaded(boolean flag) {
    imageLoaded = flag;
  }
568
569  /**
570   * Block until the object is imageLoaded to be used.
571   */
572  void waitForLoadingFSImage() {
573    if (!imageLoaded) {
574      writeLock();
575      try {
576        while (!imageLoaded) {
577          try {
578            cond.await(5000, TimeUnit.MILLISECONDS);
579          } catch (InterruptedException ignored) {
580          }
581        }
582      } finally {
583        writeUnlock();
584      }
585    }
586  }
587
588  /**
589   * Clear all loaded data
590   */
591  void clear() {
592    dir.reset();
593    dtSecretManager.reset();
594    blockIdManager.clear();
595    leaseManager.removeAllLeases();
596    snapshotManager.clearSnapshottableDirs();
597    cacheManager.clear();
598    setImageLoaded(false);
599    blockManager.clear();
600  }
601
  /** @return the lease manager; exposed for tests */
  @VisibleForTesting
  LeaseManager getLeaseManager() {
    return leaseManager;
  }
606  
  /** @return whether HA is enabled for this namesystem */
  boolean isHaEnabled() {
    return haEnabled;
  }
610  
611  /**
612   * Check the supplied configuration for correctness.
613   * @param conf Supplies the configuration to validate.
614   * @throws IOException if the configuration could not be queried.
615   * @throws IllegalArgumentException if the configuration is invalid.
616   */
617  private static void checkConfiguration(Configuration conf)
618      throws IOException {
619
620    final Collection<URI> namespaceDirs =
621        FSNamesystem.getNamespaceDirs(conf);
622    final Collection<URI> editsDirs =
623        FSNamesystem.getNamespaceEditsDirs(conf);
624    final Collection<URI> requiredEditsDirs =
625        FSNamesystem.getRequiredNamespaceEditsDirs(conf);
626    final Collection<URI> sharedEditsDirs =
627        FSNamesystem.getSharedEditsDirs(conf);
628
629    for (URI u : requiredEditsDirs) {
630      if (u.toString().compareTo(
631              DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_DEFAULT) == 0) {
632        continue;
633      }
634
635      // Each required directory must also be in editsDirs or in
636      // sharedEditsDirs.
637      if (!editsDirs.contains(u) &&
638          !sharedEditsDirs.contains(u)) {
639        throw new IllegalArgumentException(
640            "Required edits directory " + u.toString() + " not present in " +
641            DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY + ". " +
642            DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY + "=" +
643            editsDirs.toString() + "; " +
644            DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY + "=" +
645            requiredEditsDirs.toString() + ". " +
646            DFSConfigKeys.DFS_NAMENODE_SHARED_EDITS_DIR_KEY + "=" +
647            sharedEditsDirs.toString() + ".");
648      }
649    }
650
651    if (namespaceDirs.size() == 1) {
652      LOG.warn("Only one image storage directory ("
653          + DFS_NAMENODE_NAME_DIR_KEY + ") configured. Beware of data loss"
654          + " due to lack of redundant storage directories!");
655    }
656    if (editsDirs.size() == 1) {
657      LOG.warn("Only one namespace edits storage directory ("
658          + DFS_NAMENODE_EDITS_DIR_KEY + ") configured. Beware of data loss"
659          + " due to lack of redundant storage directories!");
660    }
661  }
662
663  /**
664   * Instantiates an FSNamesystem loaded from the image and edits
665   * directories specified in the passed Configuration.
666   *
667   * @param conf the Configuration which specifies the storage directories
668   *             from which to load
669   * @return an FSNamesystem which contains the loaded namespace
670   * @throws IOException if loading fails
671   */
672  static FSNamesystem loadFromDisk(Configuration conf) throws IOException {
673
674    checkConfiguration(conf);
675    FSImage fsImage = new FSImage(conf,
676        FSNamesystem.getNamespaceDirs(conf),
677        FSNamesystem.getNamespaceEditsDirs(conf));
678    FSNamesystem namesystem = new FSNamesystem(conf, fsImage, false);
679    StartupOption startOpt = NameNode.getStartupOption(conf);
680    if (startOpt == StartupOption.RECOVER) {
681      namesystem.setSafeMode(SafeModeAction.SAFEMODE_ENTER);
682    }
683
684    long loadStart = monotonicNow();
685    try {
686      namesystem.loadFSImage(startOpt);
687    } catch (IOException ioe) {
688      LOG.warn("Encountered exception loading fsimage", ioe);
689      fsImage.close();
690      throw ioe;
691    }
692    long timeTakenToLoadFSImage = monotonicNow() - loadStart;
693    LOG.info("Finished loading FSImage in " + timeTakenToLoadFSImage + " msecs");
694    NameNodeMetrics nnMetrics = NameNode.getNameNodeMetrics();
695    if (nnMetrics != null) {
696      nnMetrics.setFsImageLoadTime((int) timeTakenToLoadFSImage);
697    }
698    return namesystem;
699  }
700  
  /** Convenience constructor; equivalent to {@code FSNamesystem(conf, fsImage, false)}. */
  FSNamesystem(Configuration conf, FSImage fsImage) throws IOException {
    this(conf, fsImage, false);
  }
704  
705  /**
706   * Create an FSNamesystem associated with the specified image.
707   * 
708   * Note that this does not load any data off of disk -- if you would
709   * like that behavior, use {@link #loadFromDisk(Configuration)}
710   *
711   * @param conf configuration
712   * @param fsImage The FSImage to associate with
713   * @param ignoreRetryCache Whether or not should ignore the retry cache setup
714   *                         step. For Secondary NN this should be set to true.
715   * @throws IOException on bad configuration
716   */
717  FSNamesystem(Configuration conf, FSImage fsImage, boolean ignoreRetryCache)
718      throws IOException {
719    provider = DFSUtil.createKeyProviderCryptoExtension(conf);
720    if (provider == null) {
721      LOG.info("No KeyProvider found.");
722    } else {
723      LOG.info("Found KeyProvider: " + provider.toString());
724    }
725    if (conf.getBoolean(DFS_NAMENODE_AUDIT_LOG_ASYNC_KEY,
726                        DFS_NAMENODE_AUDIT_LOG_ASYNC_DEFAULT)) {
727      LOG.info("Enabling async auditlog");
728      enableAsyncAuditLog();
729    }
730    fsLock = new FSNamesystemLock(conf, detailedLockHoldTimeMetrics);
731    cond = fsLock.newWriteLockCondition();
732    cpLock = new ReentrantLock();
733
734    this.fsImage = fsImage;
735    try {
736      resourceRecheckInterval = conf.getLong(
737          DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_KEY,
738          DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_DEFAULT);
739
740      this.blockManager = new BlockManager(this, conf);
741      this.datanodeStatistics = blockManager.getDatanodeManager().getDatanodeStatistics();
742      this.blockIdManager = new BlockIdManager(blockManager);
743
744      this.fsOwner = UserGroupInformation.getCurrentUser();
745      this.supergroup = conf.get(DFS_PERMISSIONS_SUPERUSERGROUP_KEY, 
746                                 DFS_PERMISSIONS_SUPERUSERGROUP_DEFAULT);
747      this.isPermissionEnabled = conf.getBoolean(DFS_PERMISSIONS_ENABLED_KEY,
748                                                 DFS_PERMISSIONS_ENABLED_DEFAULT);
749      LOG.info("fsOwner             = " + fsOwner);
750      LOG.info("supergroup          = " + supergroup);
751      LOG.info("isPermissionEnabled = " + isPermissionEnabled);
752
753      // block allocation has to be persisted in HA using a shared edits directory
754      // so that the standby has up-to-date namespace information
755      nameserviceId = DFSUtil.getNamenodeNameServiceId(conf);
756      this.haEnabled = HAUtil.isHAEnabled(conf, nameserviceId);  
757      
758      // Sanity check the HA-related config.
759      if (nameserviceId != null) {
760        LOG.info("Determined nameservice ID: " + nameserviceId);
761      }
762      LOG.info("HA Enabled: " + haEnabled);
763      if (!haEnabled && HAUtil.usesSharedEditsDir(conf)) {
764        LOG.warn("Configured NNs:\n" + DFSUtil.nnAddressesAsString(conf));
765        throw new IOException("Invalid configuration: a shared edits dir " +
766            "must not be specified if HA is not enabled.");
767      }
768
769      // Get the checksum type from config
770      String checksumTypeStr = conf.get(DFS_CHECKSUM_TYPE_KEY, DFS_CHECKSUM_TYPE_DEFAULT);
771      DataChecksum.Type checksumType;
772      try {
773         checksumType = DataChecksum.Type.valueOf(checksumTypeStr);
774      } catch (IllegalArgumentException iae) {
775         throw new IOException("Invalid checksum type in "
776            + DFS_CHECKSUM_TYPE_KEY + ": " + checksumTypeStr);
777      }
778
779      this.serverDefaults = new FsServerDefaults(
780          conf.getLongBytes(DFS_BLOCK_SIZE_KEY, DFS_BLOCK_SIZE_DEFAULT),
781          conf.getInt(DFS_BYTES_PER_CHECKSUM_KEY, DFS_BYTES_PER_CHECKSUM_DEFAULT),
782          conf.getInt(DFS_CLIENT_WRITE_PACKET_SIZE_KEY, DFS_CLIENT_WRITE_PACKET_SIZE_DEFAULT),
783          (short) conf.getInt(DFS_REPLICATION_KEY, DFS_REPLICATION_DEFAULT),
784          conf.getInt(IO_FILE_BUFFER_SIZE_KEY, IO_FILE_BUFFER_SIZE_DEFAULT),
785          conf.getBoolean(DFS_ENCRYPT_DATA_TRANSFER_KEY, DFS_ENCRYPT_DATA_TRANSFER_DEFAULT),
786          conf.getLong(FS_TRASH_INTERVAL_KEY, FS_TRASH_INTERVAL_DEFAULT),
787          checksumType);
788      
789      this.maxFsObjects = conf.getLong(DFS_NAMENODE_MAX_OBJECTS_KEY, 
790                                       DFS_NAMENODE_MAX_OBJECTS_DEFAULT);
791
792      this.minBlockSize = conf.getLong(DFSConfigKeys.DFS_NAMENODE_MIN_BLOCK_SIZE_KEY,
793          DFSConfigKeys.DFS_NAMENODE_MIN_BLOCK_SIZE_DEFAULT);
794      this.maxBlocksPerFile = conf.getLong(DFSConfigKeys.DFS_NAMENODE_MAX_BLOCKS_PER_FILE_KEY,
795          DFSConfigKeys.DFS_NAMENODE_MAX_BLOCKS_PER_FILE_DEFAULT);
796      this.accessTimePrecision = conf.getLong(DFS_NAMENODE_ACCESSTIME_PRECISION_KEY,
797          DFS_NAMENODE_ACCESSTIME_PRECISION_DEFAULT);
798      this.supportAppends = conf.getBoolean(DFS_SUPPORT_APPEND_KEY, DFS_SUPPORT_APPEND_DEFAULT);
799      LOG.info("Append Enabled: " + supportAppends);
800
801      this.dtpReplaceDatanodeOnFailure = ReplaceDatanodeOnFailure.get(conf);
802      
803      this.standbyShouldCheckpoint = conf.getBoolean(
804          DFS_HA_STANDBY_CHECKPOINTS_KEY, DFS_HA_STANDBY_CHECKPOINTS_DEFAULT);
805      // # edit autoroll threshold is a multiple of the checkpoint threshold 
806      this.editLogRollerThreshold = (long)
807          (conf.getFloat(
808              DFS_NAMENODE_EDIT_LOG_AUTOROLL_MULTIPLIER_THRESHOLD,
809              DFS_NAMENODE_EDIT_LOG_AUTOROLL_MULTIPLIER_THRESHOLD_DEFAULT) *
810          conf.getLong(
811              DFS_NAMENODE_CHECKPOINT_TXNS_KEY,
812              DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT));
813      this.editLogRollerInterval = conf.getInt(
814          DFS_NAMENODE_EDIT_LOG_AUTOROLL_CHECK_INTERVAL_MS,
815          DFS_NAMENODE_EDIT_LOG_AUTOROLL_CHECK_INTERVAL_MS_DEFAULT);
816
817      this.lazyPersistFileScrubIntervalSec = conf.getInt(
818          DFS_NAMENODE_LAZY_PERSIST_FILE_SCRUB_INTERVAL_SEC,
819          DFS_NAMENODE_LAZY_PERSIST_FILE_SCRUB_INTERVAL_SEC_DEFAULT);
820
821      if (this.lazyPersistFileScrubIntervalSec == 0) {
822        throw new IllegalArgumentException(
823            DFS_NAMENODE_LAZY_PERSIST_FILE_SCRUB_INTERVAL_SEC + " must be non-zero.");
824      }
825
826      // For testing purposes, allow the DT secret manager to be started regardless
827      // of whether security is enabled.
828      alwaysUseDelegationTokensForTests = conf.getBoolean(
829          DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY,
830          DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_DEFAULT);
831      
832      this.dtSecretManager = createDelegationTokenSecretManager(conf);
833      this.dir = new FSDirectory(this, conf);
834      this.snapshotManager = new SnapshotManager(dir);
835      this.cacheManager = new CacheManager(this, conf, blockManager);
836      this.safeMode = new SafeModeInfo(conf);
837      this.topConf = new TopConf(conf);
838      this.auditLoggers = initAuditLoggers(conf);
839      this.isDefaultAuditLogger = auditLoggers.size() == 1 &&
840        auditLoggers.get(0) instanceof DefaultAuditLogger;
841      this.retryCache = ignoreRetryCache ? null : initRetryCache(conf);
842      Class<? extends INodeAttributeProvider> klass = conf.getClass(
843          DFS_NAMENODE_INODE_ATTRIBUTES_PROVIDER_KEY,
844          null, INodeAttributeProvider.class);
845      if (klass != null) {
846        inodeAttributeProvider = ReflectionUtils.newInstance(klass, conf);
847        LOG.info("Using INode attribute provider: " + klass.getName());
848      }
849    } catch(IOException e) {
850      LOG.error(getClass().getSimpleName() + " initialization failed.", e);
851      close();
852      throw e;
853    } catch (RuntimeException re) {
854      LOG.error(getClass().getSimpleName() + " initialization failed.", re);
855      close();
856      throw re;
857    }
858  }
859
  /** @return the configured audit loggers; exposed for tests. */
  @VisibleForTesting
  public List<AuditLogger> getAuditLoggers() {
    return auditLoggers;
  }

  /** @return the retry cache, or null if it is disabled; exposed for tests. */
  @VisibleForTesting
  public RetryCache getRetryCache() {
    return retryCache;
  }

  /** Lock the retry cache, if one is configured. */
  void lockRetryCache() {
    if (retryCache != null) {
      retryCache.lock();
    }
  }

  /** Unlock the retry cache, if one is configured. */
  void unlockRetryCache() {
    if (retryCache != null) {
      retryCache.unlock();
    }
  }

  /** Whether or not retry cache is enabled */
  boolean hasRetryCache() {
    return retryCache != null;
  }
  
  /** Record a retry-cache entry carrying a result payload; no-op when disabled. */
  void addCacheEntryWithPayload(byte[] clientId, int callId, Object payload) {
    if (retryCache != null) {
      retryCache.addCacheEntryWithPayload(clientId, callId, payload);
    }
  }
  
  /** Record a retry-cache entry for the given client call; no-op when disabled. */
  void addCacheEntry(byte[] clientId, int callId) {
    if (retryCache != null) {
      retryCache.addCacheEntry(clientId, callId);
    }
  }

  /** @return the key provider crypto extension, or null if none was configured. */
  @VisibleForTesting
  public KeyProviderCryptoExtension getProvider() {
    return provider;
  }
903
904  @VisibleForTesting
905  static RetryCache initRetryCache(Configuration conf) {
906    boolean enable = conf.getBoolean(DFS_NAMENODE_ENABLE_RETRY_CACHE_KEY,
907                                     DFS_NAMENODE_ENABLE_RETRY_CACHE_DEFAULT);
908    LOG.info("Retry cache on namenode is " + (enable ? "enabled" : "disabled"));
909    if (enable) {
910      float heapPercent = conf.getFloat(
911          DFS_NAMENODE_RETRY_CACHE_HEAP_PERCENT_KEY,
912          DFS_NAMENODE_RETRY_CACHE_HEAP_PERCENT_DEFAULT);
913      long entryExpiryMillis = conf.getLong(
914          DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_KEY,
915          DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_DEFAULT);
916      LOG.info("Retry cache will use " + heapPercent
917          + " of total heap and retry cache entry expiry time is "
918          + entryExpiryMillis + " millis");
919      long entryExpiryNanos = entryExpiryMillis * 1000 * 1000;
920      return new RetryCache("NameNodeRetryCache", heapPercent,
921          entryExpiryNanos);
922    }
923    return null;
924  }
925
926  private List<AuditLogger> initAuditLoggers(Configuration conf) {
927    // Initialize the custom access loggers if configured.
928    Collection<String> alClasses = conf.getStringCollection(DFS_NAMENODE_AUDIT_LOGGERS_KEY);
929    List<AuditLogger> auditLoggers = Lists.newArrayList();
930    if (alClasses != null && !alClasses.isEmpty()) {
931      for (String className : alClasses) {
932        try {
933          AuditLogger logger;
934          if (DFS_NAMENODE_DEFAULT_AUDIT_LOGGER_NAME.equals(className)) {
935            logger = new DefaultAuditLogger();
936          } else {
937            logger = (AuditLogger) Class.forName(className).newInstance();
938          }
939          logger.initialize(conf);
940          auditLoggers.add(logger);
941        } catch (RuntimeException re) {
942          throw re;
943        } catch (Exception e) {
944          throw new RuntimeException(e);
945        }
946      }
947    }
948
949    // Make sure there is at least one logger installed.
950    if (auditLoggers.isEmpty()) {
951      auditLoggers.add(new DefaultAuditLogger());
952    }
953
954    // Add audit logger to calculate top users
955    if (topConf.isEnabled) {
956      topMetrics = new TopMetrics(conf, topConf.nntopReportingPeriodsMs);
957      if (DefaultMetricsSystem.instance().getSource(
958          TOPMETRICS_METRICS_SOURCE_NAME) == null) {
959        DefaultMetricsSystem.instance().register(TOPMETRICS_METRICS_SOURCE_NAME,
960            "Top N operations by user", topMetrics);
961      }
962      auditLoggers.add(new TopAuditLogger(topMetrics));
963    }
964
965    return Collections.unmodifiableList(auditLoggers);
966  }
967
  /**
   * Load the namespace from the fsimage and edit logs according to the
   * startup option: format first if requested, save a fresh image when the
   * one read is stale, and open the edit log for write unless coming up in
   * standby state.
   */
  private void loadFSImage(StartupOption startOpt) throws IOException {
    final FSImage fsImage = getFSImage();

    // format before starting up if requested
    if (startOpt == StartupOption.FORMAT) {
      
      fsImage.format(this, fsImage.getStorage().determineClusterId());// reuse current id

      startOpt = StartupOption.REGULAR;
    }
    boolean success = false;
    writeLock();
    try {
      // We shouldn't be calling saveNamespace if we've come up in standby state.
      MetaRecoveryContext recovery = startOpt.createRecoveryContext();
      final boolean staleImage
          = fsImage.recoverTransitionRead(startOpt, this, recovery);
      if (RollingUpgradeStartupOption.ROLLBACK.matches(startOpt) ||
          RollingUpgradeStartupOption.DOWNGRADE.matches(startOpt)) {
        // Rollback/downgrade discards any in-flight rolling upgrade state.
        rollingUpgradeInfo = null;
      }
      final boolean needToSave = staleImage && !haEnabled && !isRollingUpgrade(); 
      LOG.info("Need to save fs image? " + needToSave
          + " (staleImage=" + staleImage + ", haEnabled=" + haEnabled
          + ", isRollingUpgrade=" + isRollingUpgrade() + ")");
      if (needToSave) {
        fsImage.saveNamespace(this);
      } else {
        updateStorageVersionForRollingUpgrade(fsImage.getLayoutVersion(),
            startOpt);
        // No need to save, so mark the phase done.
        StartupProgress prog = NameNode.getStartupProgress();
        prog.beginPhase(Phase.SAVING_CHECKPOINT);
        prog.endPhase(Phase.SAVING_CHECKPOINT);
      }
      // This will start a new log segment and write to the seen_txid file, so
      // we shouldn't do it when coming up in standby state
      if (!haEnabled || (haEnabled && startOpt == StartupOption.UPGRADE)
          || (haEnabled && startOpt == StartupOption.UPGRADEONLY)) {
        fsImage.openEditLogForWrite();
      }
      success = true;
    } finally {
      if (!success) {
        // On any failure, close the image so storage directories are released.
        fsImage.close();
      }
      writeUnlock("loadFSImage");
    }
    imageLoadComplete();
  }
1018
1019  private void updateStorageVersionForRollingUpgrade(final long layoutVersion,
1020      StartupOption startOpt) throws IOException {
1021    boolean rollingStarted = RollingUpgradeStartupOption.STARTED
1022        .matches(startOpt) && layoutVersion > HdfsConstants
1023        .NAMENODE_LAYOUT_VERSION;
1024    boolean rollingRollback = RollingUpgradeStartupOption.ROLLBACK
1025        .matches(startOpt);
1026    if (rollingRollback || rollingStarted) {
1027      fsImage.updateStorageVersion();
1028    }
1029  }
1030
1031  private void startSecretManager() {
1032    if (dtSecretManager != null) {
1033      try {
1034        dtSecretManager.startThreads();
1035      } catch (IOException e) {
1036        // Inability to start secret manager
1037        // can't be recovered from.
1038        throw new RuntimeException(e);
1039      }
1040    }
1041  }
1042  
1043  private void startSecretManagerIfNecessary() {
1044    boolean shouldRun = shouldUseDelegationTokens() &&
1045      !isInSafeMode() && getEditLog().isOpenForWrite();
1046    boolean running = dtSecretManager.isRunning();
1047    if (shouldRun && !running) {
1048      startSecretManager();
1049    }
1050  }
1051
  /** Stop the delegation token secret manager threads, if one exists. */
  private void stopSecretManager() {
    if (dtSecretManager != null) {
      dtSecretManager.stopThreads();
    }
  }
1057  
1058  /** 
1059   * Start services common to both active and standby states
1060   */
1061  void startCommonServices(Configuration conf, HAContext haContext) throws IOException {
1062    this.registerMBean(); // register the MBean for the FSNamesystemState
1063    writeLock();
1064    this.haContext = haContext;
1065    try {
1066      nnResourceChecker = new NameNodeResourceChecker(conf);
1067      checkAvailableResources();
1068      assert safeMode != null && !isPopulatingReplQueues();
1069      StartupProgress prog = NameNode.getStartupProgress();
1070      prog.beginPhase(Phase.SAFEMODE);
1071      prog.setTotal(Phase.SAFEMODE, STEP_AWAITING_REPORTED_BLOCKS,
1072        getCompleteBlocksTotal());
1073      setBlockTotal();
1074      blockManager.activate(conf);
1075    } finally {
1076      writeUnlock("startCommonServices");
1077    }
1078    
1079    registerMXBean();
1080    DefaultMetricsSystem.instance().register(this);
1081    if (inodeAttributeProvider != null) {
1082      inodeAttributeProvider.start();
1083      dir.setINodeAttributeProvider(inodeAttributeProvider);
1084    }
1085    snapshotManager.registerMXBean();
1086  }
1087  
1088  /** 
1089   * Stop services common to both active and standby states
1090   */
1091  void stopCommonServices() {
1092    writeLock();
1093    if (inodeAttributeProvider != null) {
1094      dir.setINodeAttributeProvider(null);
1095      inodeAttributeProvider.stop();
1096    }
1097    try {
1098      if (blockManager != null) blockManager.close();
1099    } finally {
1100      writeUnlock("stopCommonServices");
1101    }
1102    RetryCache.clear(retryCache);
1103  }
1104  
1105  /**
1106   * Start services required in active state
1107   * @throws IOException
1108   */
1109  void startActiveServices() throws IOException {
1110    startingActiveService = true;
1111    LOG.info("Starting services required for active state");
1112    writeLock();
1113    try {
1114      FSEditLog editLog = getFSImage().getEditLog();
1115      
1116      if (!editLog.isOpenForWrite()) {
1117        // During startup, we're already open for write during initialization.
1118        editLog.initJournalsForWrite();
1119        // May need to recover
1120        editLog.recoverUnclosedStreams();
1121        
1122        LOG.info("Catching up to latest edits from old active before " +
1123            "taking over writer role in edits logs");
1124        editLogTailer.catchupDuringFailover();
1125        
1126        blockManager.setPostponeBlocksFromFuture(false);
1127        blockManager.getDatanodeManager().markAllDatanodesStale();
1128        blockManager.clearQueues();
1129        blockManager.processAllPendingDNMessages();
1130
1131        // Only need to re-process the queue, If not in SafeMode.
1132        if (!isInSafeMode()) {
1133          LOG.info("Reprocessing replication and invalidation queues");
1134          initializeReplQueues();
1135        }
1136
1137        if (LOG.isDebugEnabled()) {
1138          LOG.debug("NameNode metadata after re-processing " +
1139              "replication and invalidation queues during failover:\n" +
1140              metaSaveAsString());
1141        }
1142        
1143        long nextTxId = getFSImage().getLastAppliedTxId() + 1;
1144        LOG.info("Will take over writing edit logs at txnid " + 
1145            nextTxId);
1146        editLog.setNextTxId(nextTxId);
1147
1148        getFSImage().editLog.openForWrite();
1149      }
1150
1151      // Enable quota checks.
1152      dir.enableQuotaChecks();
1153      if (haEnabled) {
1154        // Renew all of the leases before becoming active.
1155        // This is because, while we were in standby mode,
1156        // the leases weren't getting renewed on this NN.
1157        // Give them all a fresh start here.
1158        leaseManager.renewAllLeases();
1159      }
1160      leaseManager.startMonitor();
1161      startSecretManagerIfNecessary();
1162
1163      //ResourceMonitor required only at ActiveNN. See HDFS-2914
1164      this.nnrmthread = new Daemon(new NameNodeResourceMonitor());
1165      nnrmthread.start();
1166
1167      nnEditLogRoller = new Daemon(new NameNodeEditLogRoller(
1168          editLogRollerThreshold, editLogRollerInterval));
1169      nnEditLogRoller.start();
1170
1171      if (lazyPersistFileScrubIntervalSec > 0) {
1172        lazyPersistFileScrubber = new Daemon(new LazyPersistFileScrubber(
1173            lazyPersistFileScrubIntervalSec));
1174        lazyPersistFileScrubber.start();
1175      }
1176
1177      cacheManager.startMonitorThread();
1178      blockManager.getDatanodeManager().setShouldSendCachingCommands(true);
1179    } finally {
1180      startingActiveService = false;
1181      checkSafeMode();
1182      writeUnlock("startActiveServices");
1183    }
1184  }
1185
1186  /**
1187   * Initialize replication queues.
1188   */
1189  private void initializeReplQueues() {
1190    LOG.info("initializing replication queues");
1191    blockManager.processMisReplicatedBlocks();
1192    initializedReplQueues = true;
1193  }
1194
1195  private boolean inActiveState() {
1196    return haContext != null &&
1197        haContext.getState().getServiceState() == HAServiceState.ACTIVE;
1198  }
1199
1200  /**
1201   * @return Whether the namenode is transitioning to active state and is in the
1202   *         middle of the {@link #startActiveServices()}
1203   */
1204  public boolean inTransitionToActive() {
1205    return haEnabled && inActiveState() && startingActiveService;
1206  }
1207
1208  private boolean shouldUseDelegationTokens() {
1209    return UserGroupInformation.isSecurityEnabled() ||
1210      alwaysUseDelegationTokensForTests;
1211  }
1212
1213  /** 
1214   * Stop services required in active state
1215   */
1216  void stopActiveServices() {
1217    LOG.info("Stopping services started for active state");
1218    writeLock();
1219    try {
1220      stopSecretManager();
1221      leaseManager.stopMonitor();
1222      if (nnrmthread != null) {
1223        ((NameNodeResourceMonitor) nnrmthread.getRunnable()).stopMonitor();
1224        nnrmthread.interrupt();
1225      }
1226      if (nnEditLogRoller != null) {
1227        ((NameNodeEditLogRoller)nnEditLogRoller.getRunnable()).stop();
1228        nnEditLogRoller.interrupt();
1229      }
1230      if (lazyPersistFileScrubber != null) {
1231        ((LazyPersistFileScrubber) lazyPersistFileScrubber.getRunnable()).stop();
1232        lazyPersistFileScrubber.interrupt();
1233      }
1234      if (dir != null && getFSImage() != null) {
1235        if (getFSImage().editLog != null) {
1236          getFSImage().editLog.close();
1237        }
1238        // Update the fsimage with the last txid that we wrote
1239        // so that the tailer starts from the right spot.
1240        getFSImage().updateLastAppliedTxIdFromWritten();
1241      }
1242      if (cacheManager != null) {
1243        cacheManager.stopMonitorThread();
1244        cacheManager.clearDirectiveStats();
1245      }
1246      blockManager.getDatanodeManager().clearPendingCachingCommands();
1247      blockManager.getDatanodeManager().setShouldSendCachingCommands(false);
1248      // Don't want to keep replication queues when not in Active.
1249      blockManager.clearQueues();
1250      initializedReplQueues = false;
1251    } finally {
1252      writeUnlock("stopActiveServices");
1253    }
1254  }
1255  
1256  /**
1257   * Start services required in standby state 
1258   * 
1259   * @throws IOException
1260   */
1261  void startStandbyServices(final Configuration conf) throws IOException {
1262    LOG.info("Starting services required for standby state");
1263    if (!getFSImage().editLog.isOpenForRead()) {
1264      // During startup, we're already open for read.
1265      getFSImage().editLog.initSharedJournalsForRead();
1266    }
1267    
1268    blockManager.setPostponeBlocksFromFuture(true);
1269
1270    // Disable quota checks while in standby.
1271    dir.disableQuotaChecks();
1272    editLogTailer = new EditLogTailer(this, conf);
1273    editLogTailer.start();
1274    if (standbyShouldCheckpoint) {
1275      standbyCheckpointer = new StandbyCheckpointer(conf, this);
1276      standbyCheckpointer.start();
1277    }
1278  }
1279
1280  /**
1281   * Called when the NN is in Standby state and the editlog tailer tails the
1282   * OP_ROLLING_UPGRADE_START.
1283   */
1284  void triggerRollbackCheckpoint() {
1285    setNeedRollbackFsImage(true);
1286    if (standbyCheckpointer != null) {
1287      standbyCheckpointer.triggerRollbackCheckpoint();
1288    }
1289  }
1290
1291  /**
1292   * Called while the NN is in Standby state, but just about to be
1293   * asked to enter Active state. This cancels any checkpoints
1294   * currently being taken.
1295   */
1296  void prepareToStopStandbyServices() throws ServiceFailedException {
1297    if (standbyCheckpointer != null) {
1298      standbyCheckpointer.cancelAndPreventCheckpoints(
1299          "About to leave standby state");
1300    }
1301  }
1302
  /** Stop services required in standby state */
  void stopStandbyServices() throws IOException {
    LOG.info("Stopping services started for standby state");
    if (standbyCheckpointer != null) {
      standbyCheckpointer.stop();
    }
    if (editLogTailer != null) {
      editLogTailer.stop();
    }
    // Close the edit log if it was opened for read in standby state.
    if (dir != null && getFSImage() != null && getFSImage().editLog != null) {
      getFSImage().editLog.close();
    }
  }
1316  
  /** Delegate the operation-category check to the HA context, if any. */
  @Override
  public void checkOperation(OperationCategory op) throws StandbyException {
    if (haContext != null) {
      // null in some unit tests
      haContext.checkOperation(op);
    }
  }
  
  /**
   * @throws RetriableException
   *           If 1) The NameNode is in SafeMode, 2) HA is enabled, and 3)
   *           NameNode is in active state
   * @throws SafeModeException
   *           Otherwise if NameNode is in SafeMode.
   */
  void checkNameNodeSafeMode(String errorMsg)
      throws RetriableException, SafeModeException {
    if (isInSafeMode()) {
      SafeModeException se = new SafeModeException(errorMsg, safeMode);
      // An active HA NN in automatic (non-manual, resources OK) safe mode is
      // expected to leave it, so tell the client to retry instead of failing.
      if (haEnabled && haContext != null
          && haContext.getState().getServiceState() == HAServiceState.ACTIVE
          && shouldRetrySafeMode(this.safeMode)) {
        throw new RetriableException(se);
      } else {
        throw se;
      }
    }
  }

  /** @return true if permission checking is enabled for this namesystem. */
  boolean isPermissionEnabled() {
    return isPermissionEnabled;
  }
1349
1350  /**
1351   * We already know that the safemode is on. We will throw a RetriableException
1352   * if the safemode is not manual or caused by low resource.
1353   */
1354  private boolean shouldRetrySafeMode(SafeModeInfo safeMode) {
1355    if (safeMode == null) {
1356      return false;
1357    } else {
1358      return !safeMode.isManual() && !safeMode.areResourcesLow();
1359    }
1360  }
1361  
  /** @return the configured fsimage (name) storage directory URIs. */
  public static Collection<URI> getNamespaceDirs(Configuration conf) {
    return getStorageDirs(conf, DFS_NAMENODE_NAME_DIR_KEY);
  }
1365
1366  /**
1367   * Get all edits dirs which are required. If any shared edits dirs are
1368   * configured, these are also included in the set of required dirs.
1369   * 
1370   * @param conf the HDFS configuration.
1371   * @return all required dirs.
1372   */
1373  public static Collection<URI> getRequiredNamespaceEditsDirs(Configuration conf) {
1374    Set<URI> ret = new HashSet<URI>();
1375    ret.addAll(getStorageDirs(conf, DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY));
1376    ret.addAll(getSharedEditsDirs(conf));
1377    return ret;
1378  }
1379
  /**
   * Read the storage directory URIs named by {@code propertyName}.
   * For an IMPORT startup, directories coming from the default resources are
   * filtered out so the NameNode may start with an empty set; otherwise an
   * empty configuration falls back to the default edits directory.
   */
  private static Collection<URI> getStorageDirs(Configuration conf,
                                                String propertyName) {
    Collection<String> dirNames = conf.getTrimmedStringCollection(propertyName);
    StartupOption startOpt = NameNode.getStartupOption(conf);
    if(startOpt == StartupOption.IMPORT) {
      // In case of IMPORT this will get rid of default directories 
      // but will retain directories specified in hdfs-site.xml
      // When importing image from a checkpoint, the name-node can
      // start with empty set of storage directories.
      Configuration cE = new HdfsConfiguration(false);
      cE.addResource("core-default.xml");
      cE.addResource("core-site.xml");
      cE.addResource("hdfs-default.xml");
      Collection<String> dirNames2 = cE.getTrimmedStringCollection(propertyName);
      dirNames.removeAll(dirNames2);
      if(dirNames.isEmpty())
        LOG.warn("!!! WARNING !!!" +
          "\n\tThe NameNode currently runs without persistent storage." +
          "\n\tAny changes to the file system meta-data may be lost." +
          "\n\tRecommended actions:" +
          "\n\t\t- shutdown and restart NameNode with configured \"" 
          + propertyName + "\" in hdfs-site.xml;" +
          "\n\t\t- use Backup Node as a persistent and up-to-date storage " +
          "of the file system meta-data.");
    } else if (dirNames.isEmpty()) {
      // Nothing configured: use the default edits directory.
      dirNames = Collections.singletonList(
          DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_DEFAULT);
    }
    return Util.stringCollectionAsURIs(dirNames);
  }
1410
1411  /**
1412   * Return an ordered list of edits directories to write to.
1413   * The list is ordered such that all shared edits directories
1414   * are ordered before non-shared directories, and any duplicates
1415   * are removed. The order they are specified in the configuration
1416   * is retained.
1417   * @return Collection of shared edits directories.
1418   * @throws IOException if multiple shared edits directories are configured
1419   */
1420  public static List<URI> getNamespaceEditsDirs(Configuration conf)
1421      throws IOException {
1422    return getNamespaceEditsDirs(conf, true);
1423  }
1424  
  /**
   * Return an ordered, de-duplicated list of edits directories.
   *
   * @param conf configuration to read the directory keys from
   * @param includeShared whether to include the shared (HA) edits
   *        directories; when included, they are placed ahead of the
   *        local directories
   * @return edits directories; falls back to the image directories when no
   *         edits directory is explicitly configured
   * @throws IOException if more than one shared edits directory is configured
   */
  public static List<URI> getNamespaceEditsDirs(Configuration conf,
      boolean includeShared)
      throws IOException {
    // Use a LinkedHashSet so that order is maintained while we de-dup
    // the entries.
    LinkedHashSet<URI> editsDirs = new LinkedHashSet<URI>();
    
    if (includeShared) {
      List<URI> sharedDirs = getSharedEditsDirs(conf);
  
      // Fail until multiple shared edits directories are supported (HDFS-2782)
      if (sharedDirs.size() > 1) {
        throw new IOException(
            "Multiple shared edits directories are not yet supported");
      }
  
      // First add the shared edits dirs. It's critical that the shared dirs
      // are added first, since JournalSet syncs them in the order they are listed,
      // and we need to make sure all edits are in place in the shared storage
      // before they are replicated locally. See HDFS-2874.
      for (URI dir : sharedDirs) {
        if (!editsDirs.add(dir)) {
          LOG.warn("Edits URI " + dir + " listed multiple times in " + 
              DFS_NAMENODE_SHARED_EDITS_DIR_KEY + ". Ignoring duplicates.");
        }
      }
    }    
    // Now add the non-shared dirs.
    for (URI dir : getStorageDirs(conf, DFS_NAMENODE_EDITS_DIR_KEY)) {
      if (!editsDirs.add(dir)) {
        LOG.warn("Edits URI " + dir + " listed multiple times in " + 
            DFS_NAMENODE_SHARED_EDITS_DIR_KEY + " and " +
            DFS_NAMENODE_EDITS_DIR_KEY + ". Ignoring duplicates.");
      }
    }

    if (editsDirs.isEmpty()) {
      // If this is the case, no edit dirs have been explicitly configured.
      // Image dirs are to be used for edits too.
      return Lists.newArrayList(getNamespaceDirs(conf));
    } else {
      return Lists.newArrayList(editsDirs);
    }
  }
1469  
1470  /**
1471   * Returns edit directories that are shared between primary and secondary.
1472   * @param conf configuration
1473   * @return collection of edit directories from {@code conf}
1474   */
1475  public static List<URI> getSharedEditsDirs(Configuration conf) {
1476    // don't use getStorageDirs here, because we want an empty default
1477    // rather than the dir in /tmp
1478    Collection<String> dirNames = conf.getTrimmedStringCollection(
1479        DFS_NAMENODE_SHARED_EDITS_DIR_KEY);
1480    return Util.stringCollectionAsURIs(dirNames);
1481  }
1482
  /** Acquire the namesystem read lock (delegates to the shared FSN lock). */
  @Override
  public void readLock() {
    this.fsLock.readLock();
  }
  /** Acquire the read lock, aborting if the thread is interrupted. */
  @Override
  public void readLockInterruptibly() throws InterruptedException {
    this.fsLock.readLockInterruptibly();
  }
  /** Release the namesystem read lock. */
  @Override
  public void readUnlock() {
    this.fsLock.readUnlock();
  }
  /**
   * Release the read lock, attributing the hold to {@code opName}
   * (used by the lock for reporting/metrics).
   */
  public void readUnlock(String opName) {
    this.fsLock.readUnlock(opName);
  }
  /** Acquire the namesystem write lock (delegates to the shared FSN lock). */
  @Override
  public void writeLock() {
    this.fsLock.writeLock();
  }
  /** Acquire the write lock, aborting if the thread is interrupted. */
  @Override
  public void writeLockInterruptibly() throws InterruptedException {
    this.fsLock.writeLockInterruptibly();
  }
  /** Release the namesystem write lock. */
  @Override
  public void writeUnlock() {
    this.fsLock.writeUnlock();
  }
  /**
   * Release the write lock, attributing the hold to {@code opName}
   * (used by the lock for reporting/metrics).
   */
  public void writeUnlock(String opName) {
    this.fsLock.writeUnlock(opName);
  }
  /** @return true if the current thread holds the write lock. */
  @Override
  public boolean hasWriteLock() {
    return this.fsLock.isWriteLockedByCurrentThread();
  }
  /**
   * @return true if the current thread holds the read lock.
   * Holding the write lock counts as holding the read lock.
   */
  @Override
  public boolean hasReadLock() {
    return this.fsLock.getReadHoldCount() > 0 || hasWriteLock();
  }
1521
  /** @return the current thread's reentrant read-lock hold count. */
  public int getReadHoldCount() {
    return this.fsLock.getReadHoldCount();
  }
1525
  /** @return the current thread's reentrant write-lock hold count. */
  public int getWriteHoldCount() {
    return this.fsLock.getWriteHoldCount();
  }
1529
  /** Lock the checkpoint lock. */
  public void cpLock() {
    this.cpLock.lock();
  }
1534
  /** Lock the checkpoint lock interruptibly. */
  public void cpLockInterruptibly() throws InterruptedException {
    this.cpLock.lockInterruptibly();
  }
1539
  /** Unlock the checkpoint lock. */
  public void cpUnlock() {
    this.cpLock.unlock();
  }
1544    
1545
  /** @return the current namespace info, read under the FSN read lock. */
  NamespaceInfo getNamespaceInfo() {
    readLock();
    try {
      return unprotectedGetNamespaceInfo();
    } finally {
      readUnlock("getNamespaceInfo");
    }
  }
1554
1555  /**
1556   * Version of @see #getNamespaceInfo() that is not protected by a lock.
1557   */
1558  NamespaceInfo unprotectedGetNamespaceInfo() {
1559    return new NamespaceInfo(getFSImage().getStorage().getNamespaceID(),
1560        getClusterId(), getBlockPoolId(),
1561        getFSImage().getStorage().getCTime());
1562  }
1563
1564  /**
1565   * Close down this file system manager.
1566   * Causes heartbeat and lease daemons to stop; waits briefly for
1567   * them to finish, but a short timeout returns control back to caller.
1568   */
1569  void close() {
1570    fsRunning = false;
1571    try {
1572      stopCommonServices();
1573      if (smmthread != null) smmthread.interrupt();
1574    } finally {
1575      // using finally to ensure we also wait for lease daemon
1576      try {
1577        stopActiveServices();
1578        stopStandbyServices();
1579      } catch (IOException ie) {
1580      } finally {
1581        IOUtils.cleanup(LOG, dir);
1582        IOUtils.cleanup(LOG, fsImage);
1583      }
1584    }
1585  }
1586
  /** @return true until {@link #close()} has been invoked. */
  @Override
  public boolean isRunning() {
    return fsRunning;
  }
1591  
1592  @Override
1593  public boolean isInStandbyState() {
1594    if (haContext == null || haContext.getState() == null) {
1595      // We're still starting up. In this case, if HA is
1596      // on for the cluster, we always start in standby. Otherwise
1597      // start in active.
1598      return haEnabled;
1599    }
1600
1601    return HAServiceState.STANDBY == haContext.getState().getServiceState();
1602  }
1603
1604  /**
1605   * Dump all metadata into specified file
1606   */
1607  void metaSave(String filename) throws IOException {
1608    checkSuperuserPrivilege();
1609    checkOperation(OperationCategory.UNCHECKED);
1610    writeLock();
1611    try {
1612      checkOperation(OperationCategory.UNCHECKED);
1613      File file = new File(System.getProperty("hadoop.log.dir"), filename);
1614      PrintWriter out = new PrintWriter(new BufferedWriter(
1615          new OutputStreamWriter(new FileOutputStream(file), Charsets.UTF_8)));
1616      metaSave(out);
1617      out.flush();
1618      out.close();
1619    } finally {
1620      writeUnlock("metaSave");
1621    }
1622  }
1623
  /**
   * Write the namesystem summary (inode/block totals) followed by the
   * block manager's metadata dump to {@code out}.
   * Caller must hold the write lock.
   */
  private void metaSave(PrintWriter out) {
    assert hasWriteLock();
    long totalInodes = this.dir.totalInodes();
    long totalBlocks = this.getBlocksTotal();
    out.println(totalInodes + " files and directories, " + totalBlocks
        + " blocks = " + (totalInodes + totalBlocks) + " total");

    blockManager.metaSave(out);
  }
1633
1634  private String metaSaveAsString() {
1635    StringWriter sw = new StringWriter();
1636    PrintWriter pw = new PrintWriter(sw);
1637    metaSave(pw);
1638    pw.flush();
1639    return sw.toString();
1640  }
1641
  /** @return the cached server defaults; fails over if in standby. */
  FsServerDefaults getServerDefaults() throws StandbyException {
    checkOperation(OperationCategory.READ);
    return serverDefaults;
  }
1646
  /** @return the configured access-time precision in milliseconds. */
  long getAccessTimePrecision() {
    return accessTimePrecision;
  }
1650
  /** @return true when access-time tracking is enabled (precision &gt; 0). */
  private boolean isAccessTimeSupported() {
    return accessTimePrecision > 0;
  }
1654
1655  /////////////////////////////////////////////////////////
1656  //
1657  // These methods are called by HadoopFS clients
1658  //
1659  /////////////////////////////////////////////////////////
1660  /**
1661   * Set permissions for an existing file.
1662   * @throws IOException
1663   */
1664  void setPermission(String src, FsPermission permission) throws IOException {
1665    final String operationName = "setPermission";
1666    HdfsFileStatus auditStat;
1667    checkOperation(OperationCategory.WRITE);
1668    writeLock();
1669    try {
1670      checkOperation(OperationCategory.WRITE);
1671      checkNameNodeSafeMode("Cannot set permission for " + src);
1672      auditStat = FSDirAttrOp.setPermission(dir, src, permission);
1673    } catch (AccessControlException e) {
1674      logAuditEvent(false, operationName, src);
1675      throw e;
1676    } finally {
1677      writeUnlock(operationName);
1678    }
1679    getEditLog().logSync();
1680    logAuditEvent(true, operationName, src, null, auditStat);
1681  }
1682
1683  /**
1684   * Set owner for an existing file.
1685   * @throws IOException
1686   */
1687  void setOwner(String src, String username, String group)
1688      throws IOException {
1689    final String operationName = "setOwner";
1690    HdfsFileStatus auditStat;
1691    checkOperation(OperationCategory.WRITE);
1692    writeLock();
1693    try {
1694      checkOperation(OperationCategory.WRITE);
1695      checkNameNodeSafeMode("Cannot set owner for " + src);
1696      auditStat = FSDirAttrOp.setOwner(dir, src, username, group);
1697    } catch (AccessControlException e) {
1698      logAuditEvent(false, operationName, src);
1699      throw e;
1700    } finally {
1701      writeUnlock(operationName);
1702    }
1703    getEditLog().logSync();
1704    logAuditEvent(true, operationName, src, null, auditStat);
1705  }
1706
  /**
   * Result of a block-location lookup: the located blocks plus a flag
   * telling the caller whether the file's access time still needs to be
   * updated (done afterwards, under the write lock).
   */
  static class GetBlockLocationsResult {
    // true when the caller should follow up with an atime update
    final boolean updateAccessTime;
    final LocatedBlocks blocks;
    boolean updateAccessTime() {
      return updateAccessTime;
    }
    private GetBlockLocationsResult(
        boolean updateAccessTime, LocatedBlocks blocks) {
      this.updateAccessTime = updateAccessTime;
      this.blocks = blocks;
    }
  }
1719
1720  /**
1721   * Get block locations within the specified range.
1722   * @see ClientProtocol#getBlockLocations(String, long, long)
1723   */
1724  LocatedBlocks getBlockLocations(String clientMachine, String srcArg,
1725      long offset, long length) throws IOException {
1726    final String operationName = "open";
1727    checkOperation(OperationCategory.READ);
1728    GetBlockLocationsResult res = null;
1729    FSPermissionChecker pc = getPermissionChecker();
1730    readLock();
1731    try {
1732      checkOperation(OperationCategory.READ);
1733      res = getBlockLocations(pc, srcArg, offset, length, true, true);
1734    } catch (AccessControlException e) {
1735      logAuditEvent(false, operationName, srcArg);
1736      throw e;
1737    } finally {
1738      readUnlock(operationName);
1739    }
1740
1741    logAuditEvent(true, operationName, srcArg);
1742
1743    if (res.updateAccessTime()) {
1744      String src = srcArg;
1745      checkOperation(OperationCategory.WRITE);
1746      writeLock();
1747      final long now = now();
1748      try {
1749        checkOperation(OperationCategory.WRITE);
1750        /**
1751         * Resolve the path again and update the atime only when the file
1752         * exists.
1753         *
1754         * XXX: Races can still occur even after resolving the path again.
1755         * For example:
1756         *
1757         * <ul>
1758         *   <li>Get the block location for "/a/b"</li>
1759         *   <li>Rename "/a/b" to "/c/b"</li>
1760         *   <li>The second resolution still points to "/a/b", which is
1761         *   wrong.</li>
1762         * </ul>
1763         *
1764         * The behavior is incorrect but consistent with the one before
1765         * HDFS-7463. A better fix is to change the edit log of SetTime to
1766         * use inode id instead of a path.
1767         */
1768        final INodesInPath iip = dir.resolvePath(pc, src);
1769        src = iip.getPath();
1770        INode inode = iip.getLastINode();
1771        boolean updateAccessTime = inode != null &&
1772            now > inode.getAccessTime() + getAccessTimePrecision();
1773        if (!isInSafeMode() && updateAccessTime) {
1774          boolean changed = FSDirAttrOp.setTimes(dir,
1775              inode, -1, now, false, iip.getLatestSnapshotId());
1776          if (changed) {
1777            getEditLog().logTimes(src, -1, now);
1778          }
1779        }
1780      } catch (Throwable e) {
1781        LOG.warn("Failed to update the access time of " + src, e);
1782      } finally {
1783        writeUnlock(operationName);
1784      }
1785    }
1786
1787    LocatedBlocks blocks = res.blocks;
1788    if (blocks != null) {
1789      blockManager.getDatanodeManager().sortLocatedBlocks(
1790          clientMachine, blocks.getLocatedBlocks());
1791
1792      // lastBlock is not part of getLocatedBlocks(), might need to sort it too
1793      LocatedBlock lastBlock = blocks.getLastLocatedBlock();
1794      if (lastBlock != null) {
1795        ArrayList<LocatedBlock> lastBlockList = Lists.newArrayList(lastBlock);
1796        blockManager.getDatanodeManager().sortLocatedBlocks(
1797            clientMachine, lastBlockList);
1798      }
1799    }
1800    return blocks;
1801  }
1802
1803  /**
1804   * Get block locations within the specified range.
1805   * @see ClientProtocol#getBlockLocations(String, long, long)
1806   * @throws IOException
1807   */
1808  GetBlockLocationsResult getBlockLocations(
1809      FSPermissionChecker pc, String src, long offset, long length,
1810      boolean needBlockToken, boolean checkSafeMode) throws IOException {
1811    if (offset < 0) {
1812      throw new HadoopIllegalArgumentException(
1813          "Negative offset is not supported. File: " + src);
1814    }
1815    if (length < 0) {
1816      throw new HadoopIllegalArgumentException(
1817          "Negative length is not supported. File: " + src);
1818    }
1819    final GetBlockLocationsResult ret = getBlockLocationsInt(
1820        pc, src, offset, length, needBlockToken);
1821
1822    if (checkSafeMode && isInSafeMode()) {
1823      for (LocatedBlock b : ret.blocks.getLocatedBlocks()) {
1824        // if safemode & no block locations yet then throw safemodeException
1825        if ((b.getLocations() == null) || (b.getLocations().length == 0)) {
1826          SafeModeException se = new SafeModeException(
1827              "Zero blocklocations for " + src, safeMode);
1828          if (haEnabled && haContext != null &&
1829              haContext.getState().getServiceState() == HAServiceState.ACTIVE) {
1830            throw new RetriableException(se);
1831          } else {
1832            throw se;
1833          }
1834        }
1835      }
1836    }
1837    return ret;
1838  }
1839
  /**
   * Resolve {@code srcArg}, check read permission, and build the located
   * blocks for the requested range. Also decides whether the caller should
   * follow up with an access-time update.
   * @param pc permission checker for the calling user
   * @param srcArg path of the file (may reference a snapshot)
   * @param offset starting byte offset
   * @param length number of bytes to return locations for
   * @param needBlockToken whether to attach block access tokens
   */
  private GetBlockLocationsResult getBlockLocationsInt(
      FSPermissionChecker pc, final String srcArg, long offset, long length,
      boolean needBlockToken)
      throws IOException {
    String src = srcArg;
    final INodesInPath iip = dir.resolvePath(pc, src);
    src = iip.getPath();
    final INodeFile inode = INodeFile.valueOf(iip.getLastINode(), src);
    if (isPermissionEnabled) {
      dir.checkPathAccess(pc, iip, FsAction.READ);
      checkUnreadableBySuperuser(pc, inode, iip.getPathSnapshotId());
    }

    final long fileSize = iip.isSnapshot()
        ? inode.computeFileSize(iip.getPathSnapshotId())
        : inode.computeFileSizeNotIncludingLastUcBlock();
    boolean isUc = inode.isUnderConstruction();
    if (iip.isSnapshot()) {
      // if src indicates a snapshot file, we need to make sure the returned
      // blocks do not exceed the size of the snapshot file.
      length = Math.min(length, fileSize - offset);
      isUc = false;
    }

    // Raw paths expose the encrypted bytes directly, so no encryption info.
    final FileEncryptionInfo feInfo =
        FSDirectory.isReservedRawName(srcArg) ? null
            : dir.getFileEncryptionInfo(inode, iip.getPathSnapshotId(), iip);

    final LocatedBlocks blocks = blockManager.createLocatedBlocks(
        inode.getBlocks(iip.getPathSnapshotId()), fileSize,
        isUc, offset, length, needBlockToken, iip.isSnapshot(), feInfo);

    // Set caching information for the located blocks.
    for (LocatedBlock lb : blocks.getLocatedBlocks()) {
      cacheManager.setCachedLocations(lb);
    }

    // Only request an atime update when it would actually change the
    // stored value by more than the configured precision.
    final long now = now();
    boolean updateAccessTime = isAccessTimeSupported() && !isInSafeMode()
        && !iip.isSnapshot()
        && now > inode.getAccessTime() + getAccessTimePrecision();
    return new GetBlockLocationsResult(updateAccessTime, blocks);
  }
1883
1884  /**
1885   * Moves all the blocks from {@code srcs} and appends them to {@code target}
1886   * To avoid rollbacks we will verify validity of ALL of the args
1887   * before we start actual move.
1888   * 
1889   * This does not support ".inodes" relative path
1890   * @param target target to concat into
1891   * @param srcs file that will be concatenated
1892   * @throws IOException on error
1893   */
1894  void concat(String target, String [] srcs, boolean logRetryCache)
1895      throws IOException {
1896    waitForLoadingFSImage();
1897    final String operationName = "concat";
1898    HdfsFileStatus stat = null;
1899    boolean success = false;
1900    checkOperation(OperationCategory.WRITE);
1901    writeLock();
1902    try {
1903      checkOperation(OperationCategory.WRITE);
1904      checkNameNodeSafeMode("Cannot concat " + target);
1905      stat = FSDirConcatOp.concat(dir, target, srcs, logRetryCache);
1906      success = true;
1907    } finally {
1908      writeUnlock(operationName);
1909      if (success) {
1910        getEditLog().logSync();
1911      }
1912      logAuditEvent(success, operationName, Arrays.toString(srcs),
1913          target, stat);
1914    }
1915  }
1916
1917  /**
1918   * stores the modification and access time for this inode. 
1919   * The access time is precise up to an hour. The transaction, if needed, is
1920   * written to the edits log but is not flushed.
1921   */
1922  void setTimes(String src, long mtime, long atime) throws IOException {
1923    final String operationName = "setTimes";
1924    HdfsFileStatus auditStat;
1925    checkOperation(OperationCategory.WRITE);
1926    writeLock();
1927    try {
1928      checkOperation(OperationCategory.WRITE);
1929      checkNameNodeSafeMode("Cannot set times " + src);
1930      auditStat = FSDirAttrOp.setTimes(dir, src, mtime, atime);
1931    } catch (AccessControlException e) {
1932      logAuditEvent(false, operationName, src);
1933      throw e;
1934    } finally {
1935      writeUnlock(operationName);
1936    }
1937    getEditLog().logSync();
1938    logAuditEvent(true, operationName, src, null, auditStat);
1939  }
1940
1941  /**
1942   * Create a symbolic link.
1943   */
1944  @SuppressWarnings("deprecation")
1945  void createSymlink(String target, String link,
1946      PermissionStatus dirPerms, boolean createParent, boolean logRetryCache)
1947      throws IOException {
1948    final String operationName = "createSymlink";
1949    if (!FileSystem.areSymlinksEnabled()) {
1950      throw new UnsupportedOperationException("Symlinks not supported");
1951    }
1952    HdfsFileStatus auditStat = null;
1953    checkOperation(OperationCategory.WRITE);
1954    writeLock();
1955    try {
1956      checkOperation(OperationCategory.WRITE);
1957      checkNameNodeSafeMode("Cannot create symlink " + link);
1958      auditStat = FSDirSymlinkOp.createSymlinkInt(this, target, link, dirPerms,
1959                                                  createParent, logRetryCache);
1960    } catch (AccessControlException e) {
1961      logAuditEvent(false, operationName, link, target, null);
1962      throw e;
1963    } finally {
1964      writeUnlock(operationName);
1965    }
1966    getEditLog().logSync();
1967    logAuditEvent(true, operationName, link, target, auditStat);
1968  }
1969
1970  /**
1971   * Set replication for an existing file.
1972   * 
1973   * The NameNode sets new replication and schedules either replication of 
1974   * under-replicated data blocks or removal of the excessive block copies 
1975   * if the blocks are over-replicated.
1976   * 
1977   * @see ClientProtocol#setReplication(String, short)
1978   * @param src file name
1979   * @param replication new replication
1980   * @return true if successful; 
1981   *         false if file does not exist or is a directory
1982   */
1983  boolean setReplication(final String src, final short replication)
1984      throws IOException {
1985    final String operationName = "setReplication";
1986    boolean success = false;
1987    waitForLoadingFSImage();
1988    checkOperation(OperationCategory.WRITE);
1989    writeLock();
1990    try {
1991      checkOperation(OperationCategory.WRITE);
1992      checkNameNodeSafeMode("Cannot set replication for " + src);
1993      success = FSDirAttrOp.setReplication(dir, blockManager, src, replication);
1994    } catch (AccessControlException e) {
1995      logAuditEvent(false, operationName, src);
1996      throw e;
1997    } finally {
1998      writeUnlock(operationName);
1999    }
2000    if (success) {
2001      getEditLog().logSync();
2002      logAuditEvent(true, operationName, src);
2003    }
2004    return success;
2005  }
2006
2007  /**
2008   * Truncate file to a lower length.
2009   * Truncate cannot be reverted / recovered from as it causes data loss.
2010   * Truncation at block boundary is atomic, otherwise it requires
2011   * block recovery to truncate the last block of the file.
2012   *
2013   * @return true if client does not need to wait for block recovery,
2014   * false if client needs to wait for block recovery.
2015   */
2016  boolean truncate(String src, long newLength,
2017                   String clientName, String clientMachine,
2018                   long mtime)
2019      throws IOException, UnresolvedLinkException {
2020    boolean ret;
2021    try {
2022      ret = truncateInt(src, newLength, clientName, clientMachine, mtime);
2023    } catch (AccessControlException e) {
2024      logAuditEvent(false, "truncate", src);
2025      throw e;
2026    }
2027    return ret;
2028  }
2029
  /**
   * Validate and perform the truncate under the write lock, then sync the
   * edit log and release any blocks the truncate removed.
   * @param srcArg path of the file to truncate
   * @param newLength target length; must be non-negative
   * @param clientName name of the client performing the truncate
   * @param clientMachine address of the client machine
   * @param mtime modification time to record
   * @return true if the truncate completed without needing block recovery
   */
  boolean truncateInt(String srcArg, long newLength,
                      String clientName, String clientMachine,
                      long mtime)
      throws IOException, UnresolvedLinkException {
    final String operationName = "truncate";
    String src = srcArg;
    NameNode.stateChangeLog.debug(
        "DIR* NameSystem.truncate: src={} newLength={}", src, newLength);
    if (newLength < 0) {
      throw new HadoopIllegalArgumentException(
          "Cannot truncate to a negative file size: " + newLength + ".");
    }
    HdfsFileStatus stat = null;
    FSPermissionChecker pc = getPermissionChecker();
    checkOperation(OperationCategory.WRITE);
    boolean res;
    writeLock();
    BlocksMapUpdateInfo toRemoveBlocks = new BlocksMapUpdateInfo();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot truncate for " + src);
      INodesInPath iip = dir.resolvePath(pc, src);
      src = iip.getPath();
      res = truncateInternal(src, newLength, clientName,
          clientMachine, mtime, pc, toRemoveBlocks);
      stat = dir.getAuditFileInfo(dir.getINodesInPath4Write(src, false));
    } finally {
      writeUnlock(operationName);
    }
    getEditLog().logSync();
    // Release the blocks that fell off the end of the file, outside the lock.
    if (!toRemoveBlocks.getToDeleteList().isEmpty()) {
      removeBlocks(toRemoveBlocks);
      toRemoveBlocks.clear();
    }
    logAuditEvent(true, operationName, src, null, stat);
    return res;
  }
2067
2068  /**
2069   * Truncate a file to a given size
2070   * Update the count at each ancestor directory with quota
2071   */
2072  boolean truncateInternal(String src, long newLength,
2073                           String clientName, String clientMachine,
2074                           long mtime, FSPermissionChecker pc,
2075                           BlocksMapUpdateInfo toRemoveBlocks)
2076      throws IOException, UnresolvedLinkException {
2077    assert hasWriteLock();
2078    INodesInPath iip = dir.getINodesInPath4Write(src, true);
2079    if (isPermissionEnabled) {
2080      dir.checkPathAccess(pc, iip, FsAction.WRITE);
2081    }
2082    INodeFile file = INodeFile.valueOf(iip.getLastINode(), src);
2083    final BlockStoragePolicy lpPolicy =
2084        blockManager.getStoragePolicy("LAZY_PERSIST");
2085
2086    if (lpPolicy != null &&
2087        lpPolicy.getId() == file.getStoragePolicyID()) {
2088      throw new UnsupportedOperationException(
2089          "Cannot truncate lazy persist file " + src);
2090    }
2091
2092    // Check if the file is already being truncated with the same length
2093    final BlockInfoContiguous last = file.getLastBlock();
2094    if (last != null && last.getBlockUCState() == BlockUCState.UNDER_RECOVERY) {
2095      final Block truncateBlock
2096          = ((BlockInfoContiguousUnderConstruction)last).getTruncateBlock();
2097      if (truncateBlock != null) {
2098        final long truncateLength = file.computeFileSize(false, false)
2099            + truncateBlock.getNumBytes();
2100        if (newLength == truncateLength) {
2101          return false;
2102        }
2103      }
2104    }
2105
2106    // Opening an existing file for truncate. May need lease recovery.
2107    recoverLeaseInternal(RecoverLeaseOp.TRUNCATE_FILE,
2108        iip, src, clientName, clientMachine, false);
2109    // Truncate length check.
2110    long oldLength = file.computeFileSize();
2111    if(oldLength == newLength) {
2112      return true;
2113    }
2114    if(oldLength < newLength) {
2115      throw new HadoopIllegalArgumentException(
2116          "Cannot truncate to a larger file size. Current size: " + oldLength +
2117              ", truncate size: " + newLength + ".");
2118    }
2119    // Perform INodeFile truncation.
2120    final QuotaCounts delta = new QuotaCounts.Builder().build();
2121    boolean onBlockBoundary = dir.truncate(iip, newLength, toRemoveBlocks,
2122        mtime, delta);
2123    Block truncateBlock = null;
2124    if(!onBlockBoundary) {
2125      // Open file for write, but don't log into edits
2126      long lastBlockDelta = file.computeFileSize() - newLength;
2127      assert lastBlockDelta > 0 : "delta is 0 only if on block bounday";
2128      truncateBlock = prepareFileForTruncate(iip, clientName, clientMachine,
2129          lastBlockDelta, null);
2130    }
2131
2132    // update the quota: use the preferred block size for UC block
2133    dir.writeLock();
2134    try {
2135      dir.updateCountNoQuotaCheck(iip, iip.length() - 1, delta);
2136    } finally {
2137      dir.writeUnlock();
2138    }
2139
2140    getEditLog().logTruncate(src, clientName, clientMachine, newLength, mtime,
2141        truncateBlock);
2142    return onBlockBoundary;
2143  }
2144
2145  /**
2146   * Convert current INode to UnderConstruction.
2147   * Recreate lease.
2148   * Create new block for the truncated copy.
2149   * Schedule truncation of the replicas.
2150   *
2151   * @return the returned block will be written to editLog and passed back into
2152   * this method upon loading.
2153   */
2154  Block prepareFileForTruncate(INodesInPath iip,
2155                               String leaseHolder,
2156                               String clientMachine,
2157                               long lastBlockDelta,
2158                               Block newBlock)
2159      throws IOException {
2160    INodeFile file = iip.getLastINode().asFile();
2161    String src = iip.getPath();
2162    file.recordModification(iip.getLatestSnapshotId());
2163    file.toUnderConstruction(leaseHolder, clientMachine);
2164    assert file.isUnderConstruction() : "inode should be under construction.";
2165    leaseManager.addLease(
2166        file.getFileUnderConstructionFeature().getClientName(), src);
2167    boolean shouldRecoverNow = (newBlock == null);
2168    BlockInfoContiguous oldBlock = file.getLastBlock();
2169    boolean shouldCopyOnTruncate = shouldCopyOnTruncate(file, oldBlock);
2170    if(newBlock == null) {
2171      newBlock = (shouldCopyOnTruncate) ? createNewBlock() :
2172          new Block(oldBlock.getBlockId(), oldBlock.getNumBytes(),
2173              nextGenerationStamp(blockIdManager.isLegacyBlock(oldBlock)));
2174    }
2175
2176    BlockInfoContiguousUnderConstruction truncatedBlockUC;
2177    if(shouldCopyOnTruncate) {
2178      // Add new truncateBlock into blocksMap and
2179      // use oldBlock as a source for copy-on-truncate recovery
2180      truncatedBlockUC = new BlockInfoContiguousUnderConstruction(newBlock,
2181          file.getBlockReplication());
2182      truncatedBlockUC.setNumBytes(oldBlock.getNumBytes() - lastBlockDelta);
2183      truncatedBlockUC.setTruncateBlock(oldBlock);
2184      file.setLastBlock(truncatedBlockUC, blockManager.getStorages(oldBlock));
2185      getBlockManager().addBlockCollection(truncatedBlockUC, file);
2186
2187      NameNode.stateChangeLog.debug(
2188          "BLOCK* prepareFileForTruncate: Scheduling copy-on-truncate to new" +
2189          " size {}  new block {} old block {}", truncatedBlockUC.getNumBytes(),
2190          newBlock, truncatedBlockUC.getTruncateBlock());
2191    } else {
2192      // Use new generation stamp for in-place truncate recovery
2193      blockManager.convertLastBlockToUnderConstruction(file, lastBlockDelta);
2194      oldBlock = file.getLastBlock();
2195      assert !oldBlock.isComplete() : "oldBlock should be under construction";
2196      truncatedBlockUC = (BlockInfoContiguousUnderConstruction) oldBlock;
2197      truncatedBlockUC.setTruncateBlock(new BlockInfoContiguous(oldBlock,
2198          file.getBlockReplication()));
2199      truncatedBlockUC.getTruncateBlock().setNumBytes(
2200          oldBlock.getNumBytes() - lastBlockDelta);
2201      truncatedBlockUC.getTruncateBlock().setGenerationStamp(
2202          newBlock.getGenerationStamp());
2203
2204      NameNode.stateChangeLog.debug(
2205          "BLOCK* prepareFileForTruncate: {} Scheduling in-place block " +
2206          "truncate to new size {}",
2207          truncatedBlockUC.getTruncateBlock().getNumBytes(), truncatedBlockUC);
2208    }
2209    if (shouldRecoverNow) {
2210      truncatedBlockUC.initializeBlockRecovery(newBlock.getGenerationStamp(),
2211          true);
2212    }
2213
2214    return newBlock;
2215  }
2216
2217  /**
2218   * Defines if a replica needs to be copied on truncate or
2219   * can be truncated in place.
2220   */
2221  boolean shouldCopyOnTruncate(INodeFile file, BlockInfoContiguous blk) {
2222    if(!isUpgradeFinalized()) {
2223      return true;
2224    }
2225    if (isRollingUpgrade()) {
2226      return true;
2227    }
2228    return file.isBlockInLatestSnapshot(blk);
2229  }
2230
2231  /**
2232   * Set the storage policy for a file or a directory.
2233   *
2234   * @param src file/directory path
2235   * @param policyName storage policy name
2236   */
2237  void setStoragePolicy(String src, String policyName) throws IOException {
2238    HdfsFileStatus auditStat;
2239    waitForLoadingFSImage();
2240    checkOperation(OperationCategory.WRITE);
2241    final String operationName = "setStoragePolicy";
2242    writeLock();
2243    try {
2244      checkOperation(OperationCategory.WRITE);
2245      checkNameNodeSafeMode("Cannot set storage policy for " + src);
2246      auditStat = FSDirAttrOp.setStoragePolicy(
2247          dir, blockManager, src, policyName);
2248    } catch (AccessControlException e) {
2249      logAuditEvent(false, operationName, src);
2250      throw e;
2251    } finally {
2252      writeUnlock(operationName);
2253    }
2254    getEditLog().logSync();
2255    logAuditEvent(true, operationName, src, null, auditStat);
2256  }
2257
2258  /**
2259   * @return All the existing block storage policies
2260   */
2261  BlockStoragePolicy[] getStoragePolicies() throws IOException {
2262    checkOperation(OperationCategory.READ);
2263    waitForLoadingFSImage();
2264    readLock();
2265    try {
2266      checkOperation(OperationCategory.READ);
2267      return FSDirAttrOp.getStoragePolicies(blockManager);
2268    } finally {
2269      readUnlock("getStoragePolicies");
2270    }
2271  }
2272
  /**
   * @return the preferred block size of the file at {@code src}.
   * @throws IOException if the path cannot be resolved to a file
   */
  long getPreferredBlockSize(String src) throws IOException {
    checkOperation(OperationCategory.READ);
    readLock();
    try {
      checkOperation(OperationCategory.READ);
      return FSDirAttrOp.getPreferredBlockSize(dir, src);
    } finally {
      readUnlock("getPreferredBlockSize");
    }
  }
2283
2284  /**
2285   * If the file is within an encryption zone, select the appropriate 
2286   * CryptoProtocolVersion from the list provided by the client. Since the
2287   * client may be newer, we need to handle unknown versions.
2288   *
2289   * @param zone EncryptionZone of the file
2290   * @param supportedVersions List of supported protocol versions
2291   * @return chosen protocol version
2292   * @throws IOException
2293   */
2294  private CryptoProtocolVersion chooseProtocolVersion(EncryptionZone zone,
2295      CryptoProtocolVersion[] supportedVersions)
2296      throws UnknownCryptoProtocolVersionException, UnresolvedLinkException,
2297        SnapshotAccessControlException {
2298    Preconditions.checkNotNull(zone);
2299    Preconditions.checkNotNull(supportedVersions);
2300    // Right now, we only support a single protocol version,
2301    // so simply look for it in the list of provided options
2302    final CryptoProtocolVersion required = zone.getVersion();
2303
2304    for (CryptoProtocolVersion c : supportedVersions) {
2305      if (c.equals(CryptoProtocolVersion.UNKNOWN)) {
2306        if (LOG.isDebugEnabled()) {
2307          LOG.debug("Ignoring unknown CryptoProtocolVersion provided by " +
2308              "client: " + c.getUnknownValue());
2309        }
2310        continue;
2311      }
2312      if (c.equals(required)) {
2313        return c;
2314      }
2315    }
2316    throw new UnknownCryptoProtocolVersionException(
2317        "No crypto protocol versions provided by the client are supported."
2318            + " Client provided: " + Arrays.toString(supportedVersions)
2319            + " NameNode supports: " + Arrays.toString(CryptoProtocolVersion
2320            .values()));
2321  }
2322
2323  /**
2324   * Invoke KeyProvider APIs to generate an encrypted data encryption key for an
2325   * encryption zone. Should not be called with any locks held.
2326   *
2327   * @param ezKeyName key name of an encryption zone
2328   * @return New EDEK, or null if ezKeyName is null
2329   * @throws IOException
2330   */
2331  private EncryptedKeyVersion generateEncryptedDataEncryptionKey(String
2332      ezKeyName) throws IOException {
2333    if (ezKeyName == null) {
2334      return null;
2335    }
2336    EncryptedKeyVersion edek = null;
2337    try {
2338      edek = provider.generateEncryptedKey(ezKeyName);
2339    } catch (GeneralSecurityException e) {
2340      throw new IOException(e);
2341    }
2342    Preconditions.checkNotNull(edek);
2343    return edek;
2344  }
2345
2346  /**
2347   * Create a new file entry in the namespace.
2348   * 
2349   * For description of parameters and exceptions thrown see
2350   * {@link ClientProtocol#create}, except it returns valid file status upon
2351   * success
2352   */
2353  HdfsFileStatus startFile(String src, PermissionStatus permissions,
2354      String holder, String clientMachine, EnumSet<CreateFlag> flag,
2355      boolean createParent, short replication, long blockSize, 
2356      CryptoProtocolVersion[] supportedVersions, boolean logRetryCache)
2357      throws AccessControlException, SafeModeException,
2358      FileAlreadyExistsException, UnresolvedLinkException,
2359      FileNotFoundException, ParentNotDirectoryException, IOException {
2360
2361    HdfsFileStatus status = null;
2362    try {
2363      status = startFileInt(src, permissions, holder, clientMachine, flag,
2364          createParent, replication, blockSize, supportedVersions,
2365          logRetryCache);
2366    } catch (AccessControlException e) {
2367      logAuditEvent(false, "create", src);
2368      throw e;
2369    }
2370    return status;
2371  }
2372
  /**
   * Worker for {@link #startFile}: validates the request, optimistically
   * pre-generates an EDEK when the target path lies in an encryption zone
   * (without holding the write lock), and then performs the create under the
   * write lock.
   *
   * @param srcArg path as supplied by the client
   * @param permissions permissions for the new file
   * @param holder lease holder (client name)
   * @param clientMachine client machine, used in error messages
   * @param flag create flags (CREATE / OVERWRITE / LAZY_PERSIST)
   * @param createParent whether to implicitly create missing parents
   * @param replication requested replication factor
   * @param blockSize requested block size; must be at least minBlockSize
   * @param supportedVersions crypto protocol versions the client supports
   * @param logRetryCache whether to record RPC ids for retry-cache rebuilding
   * @return the status of the newly created file
   */
  private HdfsFileStatus startFileInt(final String srcArg,
      PermissionStatus permissions, String holder, String clientMachine,
      EnumSet<CreateFlag> flag, boolean createParent, short replication,
      long blockSize, CryptoProtocolVersion[] supportedVersions,
      boolean logRetryCache)
      throws AccessControlException, SafeModeException,
      FileAlreadyExistsException, UnresolvedLinkException,
      FileNotFoundException, ParentNotDirectoryException, IOException {
    String src = srcArg;
    final String operationName = "create";
    if (NameNode.stateChangeLog.isDebugEnabled()) {
      StringBuilder builder = new StringBuilder();
      builder.append("DIR* NameSystem.startFile: src=" + src
              + ", holder=" + holder
              + ", clientMachine=" + clientMachine
              + ", createParent=" + createParent
              + ", replication=" + replication
              + ", createFlag=" + flag.toString()
              + ", blockSize=" + blockSize);
      builder.append(", supportedVersions=");
      if (supportedVersions != null) {
        builder.append(Arrays.toString(supportedVersions));
      } else {
        builder.append("null");
      }
      NameNode.stateChangeLog.debug(builder.toString());
    }
    if (!DFSUtil.isValidName(src)) {
      throw new InvalidPathException(src);
    }
    blockManager.verifyReplication(src, replication, clientMachine);

    boolean skipSync = false;
    HdfsFileStatus stat = null;
    FSPermissionChecker pc = getPermissionChecker();
    // NOTE(review): message reads "...(key): N < M" with no space before the
    // blockSize value's preceding colon context — matches upstream wording.
    if (blockSize < minBlockSize) {
      throw new IOException("Specified block size is less than configured" +
          " minimum value (" + DFSConfigKeys.DFS_NAMENODE_MIN_BLOCK_SIZE_KEY
          + "): " + blockSize + " < " + minBlockSize);
    }
    boolean create = flag.contains(CreateFlag.CREATE);
    boolean overwrite = flag.contains(CreateFlag.OVERWRITE);
    boolean isLazyPersist = flag.contains(CreateFlag.LAZY_PERSIST);

    waitForLoadingFSImage();

    /**
     * If the file is in an encryption zone, we optimistically create an
     * EDEK for the file by calling out to the configured KeyProvider.
     * Since this typically involves doing an RPC, we take the readLock
     * initially, then drop it to do the RPC.
     * 
     * Since the path can flip-flop between being in an encryption zone and not
     * in the meantime, we need to recheck the preconditions when we retake the
     * lock to do the create. If the preconditions are not met, we throw a
     * special RetryStartFileException to ask the DFSClient to try the create
     * again later.
     */
    CryptoProtocolVersion protocolVersion = null;
    CipherSuite suite = null;
    String ezKeyName = null;
    EncryptedKeyVersion edek = null;

    if (provider != null) {
      readLock();
      try {
        INodesInPath iip = dir.resolvePathForWrite(pc, src);
        src = iip.getPath();
        // Nothing to do if the path is not within an EZ
        final EncryptionZone zone = dir.getEZForPath(iip);
        if (zone != null) {
          protocolVersion = chooseProtocolVersion(zone, supportedVersions);
          suite = zone.getSuite();
          ezKeyName = zone.getKeyName();

          Preconditions.checkNotNull(protocolVersion);
          Preconditions.checkNotNull(suite);
          Preconditions.checkArgument(!suite.equals(CipherSuite.UNKNOWN),
              "Chose an UNKNOWN CipherSuite!");
          Preconditions.checkNotNull(ezKeyName);
        }
      } finally {
        readUnlock(operationName);
      }

      Preconditions.checkState(
          (suite == null && ezKeyName == null) ||
              (suite != null && ezKeyName != null),
          "Both suite and ezKeyName should both be null or not null");

      // Generate EDEK if necessary while not holding the lock
      edek = generateEncryptedDataEncryptionKey(ezKeyName);
      EncryptionFaultInjector.getInstance().startFileAfterGenerateKey();
    }

    // Proceed with the create, using the computed cipher suite and 
    // generated EDEK
    BlocksMapUpdateInfo toRemoveBlocks = null;
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot create file" + src);
      dir.writeLock();
      try {
        // Re-resolve under the write lock: the namespace may have changed
        // while the lock was dropped for EDEK generation above.
        final INodesInPath iip = dir.resolvePathForWrite(pc, src);
        src = iip.getPath();
        toRemoveBlocks = startFileInternal(
            pc, iip, permissions, holder,
            clientMachine, create, overwrite,
            createParent, replication, blockSize,
            isLazyPersist, suite, protocolVersion, edek,
            logRetryCache);
        stat = FSDirStatAndListingOp.getFileInfo(
            dir, src, false, FSDirectory.isReservedRawName(srcArg));
      } finally {
        dir.writeUnlock();
      }
    } catch (StandbyException se) {
      skipSync = true;
      throw se;
    } finally {
      writeUnlock(operationName);
      // There might be transactions logged while trying to recover the lease.
      // They need to be sync'ed even when an exception was thrown.
      if (!skipSync) {
        getEditLog().logSync();
        // Blocks freed by an OVERWRITE delete are released here, after the
        // write lock has been dropped.
        if (toRemoveBlocks != null) {
          removeBlocks(toRemoveBlocks);
          toRemoveBlocks.clear();
        }
      }
    }

    logAuditEvent(true, operationName, srcArg, null, stat);
    return stat;
  }
2509
2510  /**
2511   * Create a new file or overwrite an existing file<br>
2512   * 
2513   * Once the file is create the client then allocates a new block with the next
2514   * call using {@link ClientProtocol#addBlock}.
2515   * <p>
2516   * For description of parameters and exceptions thrown see
2517   * {@link ClientProtocol#create}
2518   */
2519  private BlocksMapUpdateInfo startFileInternal(FSPermissionChecker pc, 
2520      INodesInPath iip, PermissionStatus permissions, String holder,
2521      String clientMachine, boolean create, boolean overwrite, 
2522      boolean createParent, short replication, long blockSize, 
2523      boolean isLazyPersist, CipherSuite suite, CryptoProtocolVersion version,
2524      EncryptedKeyVersion edek, boolean logRetryEntry)
2525      throws IOException {
2526    assert hasWriteLock();
2527    // Verify that the destination does not exist as a directory already.
2528    final INode inode = iip.getLastINode();
2529    final String src = iip.getPath();
2530    if (inode != null && inode.isDirectory()) {
2531      throw new FileAlreadyExistsException(src +
2532          " already exists as a directory");
2533    }
2534
2535    final INodeFile myFile = INodeFile.valueOf(inode, src, true);
2536    if (isPermissionEnabled) {
2537      if (overwrite && myFile != null) {
2538        dir.checkPathAccess(pc, iip, FsAction.WRITE);
2539      }
2540      /*
2541       * To overwrite existing file, need to check 'w' permission 
2542       * of parent (equals to ancestor in this case)
2543       */
2544      dir.checkAncestorAccess(pc, iip, FsAction.WRITE);
2545    }
2546    if (!createParent) {
2547      dir.verifyParentDir(iip, src);
2548    }
2549
2550    FileEncryptionInfo feInfo = null;
2551
2552    final EncryptionZone zone = dir.getEZForPath(iip);
2553    if (zone != null) {
2554      // The path is now within an EZ, but we're missing encryption parameters
2555      if (suite == null || edek == null) {
2556        throw new RetryStartFileException();
2557      }
2558      // Path is within an EZ and we have provided encryption parameters.
2559      // Make sure that the generated EDEK matches the settings of the EZ.
2560      final String ezKeyName = zone.getKeyName();
2561      if (!ezKeyName.equals(edek.getEncryptionKeyName())) {
2562        throw new RetryStartFileException();
2563      }
2564      feInfo = new FileEncryptionInfo(suite, version,
2565          edek.getEncryptedKeyVersion().getMaterial(),
2566          edek.getEncryptedKeyIv(),
2567          ezKeyName, edek.getEncryptionKeyVersionName());
2568    }
2569
2570    try {
2571      BlocksMapUpdateInfo toRemoveBlocks = null;
2572      if (myFile == null) {
2573        if (!create) {
2574          throw new FileNotFoundException("Can't overwrite non-existent " +
2575              src + " for client " + clientMachine);
2576        }
2577      } else {
2578        if (overwrite) {
2579          toRemoveBlocks = new BlocksMapUpdateInfo();
2580          List<INode> toRemoveINodes = new ChunkedArrayList<INode>();
2581          long ret = FSDirDeleteOp.delete(dir, iip, toRemoveBlocks,
2582                                          toRemoveINodes, now());
2583          if (ret >= 0) {
2584            iip = INodesInPath.replace(iip, iip.length() - 1, null);
2585            FSDirDeleteOp.incrDeletedFileCount(ret);
2586            removeLeasesAndINodes(src, toRemoveINodes, true);
2587          }
2588        } else {
2589          // If lease soft limit time is expired, recover the lease
2590          recoverLeaseInternal(RecoverLeaseOp.CREATE_FILE,
2591              iip, src, holder, clientMachine, false);
2592          throw new FileAlreadyExistsException(src + " for client " +
2593              clientMachine + " already exists");
2594        }
2595      }
2596
2597      checkFsObjectLimit();
2598      INodeFile newNode = null;
2599
2600      // Always do an implicit mkdirs for parent directory tree.
2601      Map.Entry<INodesInPath, String> parent = FSDirMkdirOp
2602          .createAncestorDirectories(dir, iip, permissions);
2603      if (parent != null) {
2604        iip = dir.addFile(parent.getKey(), parent.getValue(), permissions,
2605            replication, blockSize, holder, clientMachine);
2606        newNode = iip != null ? iip.getLastINode().asFile() : null;
2607      }
2608
2609      if (newNode == null) {
2610        throw new IOException("Unable to add " + src +  " to namespace");
2611      }
2612      leaseManager.addLease(newNode.getFileUnderConstructionFeature()
2613          .getClientName(), src);
2614
2615      // Set encryption attributes if necessary
2616      if (feInfo != null) {
2617        dir.setFileEncryptionInfo(src, feInfo);
2618        newNode = dir.getInode(newNode.getId()).asFile();
2619      }
2620
2621      setNewINodeStoragePolicy(newNode, iip, isLazyPersist);
2622
2623      // record file record in log, record new generation stamp
2624      getEditLog().logOpenFile(src, newNode, overwrite, logRetryEntry);
2625      NameNode.stateChangeLog.debug("DIR* NameSystem.startFile: added {}" +
2626          " inode {} holder {}", src, newNode.getId(), holder);
2627      return toRemoveBlocks;
2628    } catch (IOException ie) {
2629      NameNode.stateChangeLog.warn("DIR* NameSystem.startFile: " + src + " " +
2630          ie.getMessage());
2631      throw ie;
2632    }
2633  }
2634
2635  private void setNewINodeStoragePolicy(INodeFile inode,
2636                                        INodesInPath iip,
2637                                        boolean isLazyPersist)
2638      throws IOException {
2639
2640    if (isLazyPersist) {
2641      BlockStoragePolicy lpPolicy =
2642          blockManager.getStoragePolicy("LAZY_PERSIST");
2643
2644      // Set LAZY_PERSIST storage policy if the flag was passed to
2645      // CreateFile.
2646      if (lpPolicy == null) {
2647        throw new HadoopIllegalArgumentException(
2648            "The LAZY_PERSIST storage policy has been disabled " +
2649            "by the administrator.");
2650      }
2651      inode.setStoragePolicyID(lpPolicy.getId(),
2652                                 iip.getLatestSnapshotId());
2653    } else {
2654      BlockStoragePolicy effectivePolicy =
2655          blockManager.getStoragePolicy(inode.getStoragePolicyID());
2656
2657      if (effectivePolicy != null &&
2658          effectivePolicy.isCopyOnCreateFile()) {
2659        // Copy effective policy from ancestor directory to current file.
2660        inode.setStoragePolicyID(effectivePolicy.getId(),
2661                                 iip.getLatestSnapshotId());
2662      }
2663    }
2664  }
2665
2666  /**
2667   * Append to an existing file for append.
2668   * <p>
2669   * 
2670   * The method returns the last block of the file if this is a partial block,
2671   * which can still be used for writing more data. The client uses the returned
2672   * block locations to form the data pipeline for this block.<br>
2673   * The method returns null if the last block is full. The client then
2674   * allocates a new block with the next call using
2675   * {@link ClientProtocol#addBlock}.
2676   * <p>
2677   * 
2678   * For description of parameters and exceptions thrown see
2679   * {@link ClientProtocol#append(String, String, EnumSetWritable)}
2680   *
2681   * @return the last block locations if the block is partial or null otherwise
2682   */
2683  private LocatedBlock appendFileInternal(FSPermissionChecker pc,
2684      INodesInPath iip, String holder, String clientMachine, boolean newBlock,
2685      boolean logRetryCache) throws IOException {
2686    assert hasWriteLock();
2687    // Verify that the destination does not exist as a directory already.
2688    final INode inode = iip.getLastINode();
2689    final String src = iip.getPath();
2690    if (inode != null && inode.isDirectory()) {
2691      throw new FileAlreadyExistsException("Cannot append to directory " + src
2692          + "; already exists as a directory.");
2693    }
2694    if (isPermissionEnabled) {
2695      dir.checkPathAccess(pc, iip, FsAction.WRITE);
2696    }
2697
2698    try {
2699      if (inode == null) {
2700        throw new FileNotFoundException("failed to append to non-existent file "
2701          + src + " for client " + clientMachine);
2702      }
2703      INodeFile myFile = INodeFile.valueOf(inode, src, true);
2704      final BlockStoragePolicy lpPolicy =
2705          blockManager.getStoragePolicy("LAZY_PERSIST");
2706      if (lpPolicy != null &&
2707          lpPolicy.getId() == myFile.getStoragePolicyID()) {
2708        throw new UnsupportedOperationException(
2709            "Cannot append to lazy persist file " + src);
2710      }
2711      // Opening an existing file for append - may need to recover lease.
2712      recoverLeaseInternal(RecoverLeaseOp.APPEND_FILE,
2713          iip, src, holder, clientMachine, false);
2714      
2715      final BlockInfoContiguous lastBlock = myFile.getLastBlock();
2716      // Check that the block has at least minimum replication.
2717      if(lastBlock != null && lastBlock.isComplete() &&
2718          !getBlockManager().isSufficientlyReplicated(lastBlock)) {
2719        throw new IOException("append: lastBlock=" + lastBlock +
2720            " of src=" + src + " is not sufficiently replicated yet.");
2721      }
2722      return prepareFileForAppend(src, iip, holder, clientMachine, newBlock,
2723          true, logRetryCache);
2724    } catch (IOException ie) {
2725      NameNode.stateChangeLog.warn("DIR* NameSystem.append: " +ie.getMessage());
2726      throw ie;
2727    }
2728  }
2729  
2730  /**
2731   * Convert current node to under construction.
2732   * Recreate in-memory lease record.
2733   * 
2734   * @param src path to the file
2735   * @param leaseHolder identifier of the lease holder on this file
2736   * @param clientMachine identifier of the client machine
2737   * @param newBlock if the data is appended to a new block
2738   * @param writeToEditLog whether to persist this change to the edit log
2739   * @param logRetryCache whether to record RPC ids in editlog for retry cache
2740   *                      rebuilding
2741   * @return the last block locations if the block is partial or null otherwise
2742   * @throws UnresolvedLinkException
2743   * @throws IOException
2744   */
2745  LocatedBlock prepareFileForAppend(String src, INodesInPath iip,
2746      String leaseHolder, String clientMachine, boolean newBlock,
2747      boolean writeToEditLog, boolean logRetryCache) throws IOException {
2748    final INodeFile file = iip.getLastINode().asFile();
2749    final QuotaCounts delta = verifyQuotaForUCBlock(file, iip);
2750
2751    file.recordModification(iip.getLatestSnapshotId());
2752    file.toUnderConstruction(leaseHolder, clientMachine);
2753
2754    leaseManager.addLease(
2755        file.getFileUnderConstructionFeature().getClientName(), src);
2756
2757    LocatedBlock ret = null;
2758    if (!newBlock) {
2759      ret = blockManager.convertLastBlockToUnderConstruction(file, 0);
2760      if (ret != null && delta != null) {
2761        Preconditions.checkState(delta.getStorageSpace() >= 0,
2762            "appending to a block with size larger than the preferred block size");
2763        dir.writeLock();
2764        try {
2765          dir.updateCountNoQuotaCheck(iip, iip.length() - 1, delta);
2766        } finally {
2767          dir.writeUnlock();
2768        }
2769      }
2770    } else {
2771      BlockInfoContiguous lastBlock = file.getLastBlock();
2772      if (lastBlock != null) {
2773        ExtendedBlock blk = new ExtendedBlock(this.getBlockPoolId(), lastBlock);
2774        ret = new LocatedBlock(blk, new DatanodeInfo[0]);
2775      }
2776    }
2777
2778    if (writeToEditLog) {
2779      getEditLog().logAppendFile(src, file, newBlock, logRetryCache);
2780    }
2781    return ret;
2782  }
2783
2784  /**
2785   * Verify quota when using the preferred block size for UC block. This is
2786   * usually used by append and truncate
2787   * @throws QuotaExceededException when violating the storage quota
2788   * @return expected quota usage update. null means no change or no need to
2789   *         update quota usage later
2790   */
2791  private QuotaCounts verifyQuotaForUCBlock(INodeFile file, INodesInPath iip)
2792      throws QuotaExceededException {
2793    if (!isImageLoaded() || dir.shouldSkipQuotaChecks()) {
2794      // Do not check quota if editlog is still being processed
2795      return null;
2796    }
2797    if (file.getLastBlock() != null) {
2798      final QuotaCounts delta = computeQuotaDeltaForUCBlock(file);
2799      dir.readLock();
2800      try {
2801        FSDirectory.verifyQuota(iip, iip.length() - 1, delta, null);
2802        return delta;
2803      } finally {
2804        dir.readUnlock();
2805      }
2806    }
2807    return null;
2808  }
2809
2810  /** Compute quota change for converting a complete block to a UC block */
2811  private QuotaCounts computeQuotaDeltaForUCBlock(INodeFile file) {
2812    final QuotaCounts delta = new QuotaCounts.Builder().build();
2813    final BlockInfoContiguous lastBlock = file.getLastBlock();
2814    if (lastBlock != null) {
2815      final long diff = file.getPreferredBlockSize() - lastBlock.getNumBytes();
2816      final short repl = file.getBlockReplication();
2817      delta.addStorageSpace(diff * repl);
2818      final BlockStoragePolicy policy = dir.getBlockStoragePolicySuite()
2819          .getPolicy(file.getStoragePolicyID());
2820      List<StorageType> types = policy.chooseStorageTypes(repl);
2821      for (StorageType t : types) {
2822        if (t.supportTypeQuota()) {
2823          delta.addTypeSpace(t, diff);
2824        }
2825      }
2826    }
2827    return delta;
2828  }
2829
2830  /**
2831   * Recover lease;
2832   * Immediately revoke the lease of the current lease holder and start lease
2833   * recovery so that the file can be forced to be closed.
2834   * 
2835   * @param src the path of the file to start lease recovery
2836   * @param holder the lease holder's name
2837   * @param clientMachine the client machine's name
2838   * @return true if the file is already closed or
2839   *         if the lease can be released and the file can be closed.
2840   * @throws IOException
2841   */
2842  boolean recoverLease(String src, String holder, String clientMachine)
2843      throws IOException {
2844    if (!DFSUtil.isValidName(src)) {
2845      throw new IOException("Invalid file name: " + src);
2846    }
2847  
2848    boolean skipSync = false;
2849    FSPermissionChecker pc = getPermissionChecker();
2850    checkOperation(OperationCategory.WRITE);
2851    writeLock();
2852    try {
2853      checkOperation(OperationCategory.WRITE);
2854      checkNameNodeSafeMode("Cannot recover the lease of " + src);
2855      final INodesInPath iip = dir.resolvePathForWrite(pc, src);
2856      src = iip.getPath();
2857      final INodeFile inode = INodeFile.valueOf(iip.getLastINode(), src);
2858      if (!inode.isUnderConstruction()) {
2859        return true;
2860      }
2861      if (isPermissionEnabled) {
2862        dir.checkPathAccess(pc, iip, FsAction.WRITE);
2863      }
2864  
2865      return recoverLeaseInternal(RecoverLeaseOp.RECOVER_LEASE,
2866          iip, src, holder, clientMachine, true);
2867    } catch (StandbyException se) {
2868      skipSync = true;
2869      throw se;
2870    } finally {
2871      writeUnlock("recoverLease");
2872      // There might be transactions logged while trying to recover the lease.
2873      // They need to be sync'ed even when an exception was thrown.
2874      if (!skipSync) {
2875        getEditLog().logSync();
2876      }
2877    }
2878  }
2879
2880  private enum RecoverLeaseOp {
2881    CREATE_FILE,
2882    APPEND_FILE,
2883    TRUNCATE_FILE,
2884    RECOVER_LEASE;
2885    
2886    private String getExceptionMessage(String src, String holder,
2887        String clientMachine, String reason) {
2888      return "Failed to " + this + " " + src + " for " + holder +
2889          " on " + clientMachine + " because " + reason;
2890    }
2891  }
2892
  /**
   * Check the lease state of {@code src} and, where permitted, release or
   * recover the lease.
   *
   * If the file is not under construction, returns true immediately. With
   * {@code force}, the lease is released right away via
   * {@link #internalReleaseLease}. Without it, the method throws if the
   * caller already holds the lease, attempts recovery if the holder's soft
   * limit has expired, and otherwise throws to signal the file is busy.
   *
   * @param op operation attempting the recovery, used in error messages
   * @param iip resolved path of the file
   * @param src path to the file
   * @param holder client requesting the recovery
   * @param clientMachine client machine, used in error messages
   * @param force whether to revoke the current holder's lease immediately
   * @return true if the file is closed or the lease was released
   */
  boolean recoverLeaseInternal(RecoverLeaseOp op, INodesInPath iip,
      String src, String holder, String clientMachine, boolean force)
      throws IOException {
    assert hasWriteLock();
    INodeFile file = iip.getLastINode().asFile();
    if (file.isUnderConstruction()) {
      //
      // If the file is under construction , then it must be in our
      // leases. Find the appropriate lease record.
      //
      Lease lease = leaseManager.getLease(holder);

      if (!force && lease != null) {
        Lease leaseFile = leaseManager.getLeaseByPath(src);
        if (leaseFile != null && leaseFile.equals(lease)) {
          // We found the lease for this file but the original
          // holder is trying to obtain it again.
          throw new AlreadyBeingCreatedException(
              op.getExceptionMessage(src, holder, clientMachine,
                  holder + " is already the current lease holder."));
        }
      }
      //
      // Find the original holder.
      //
      FileUnderConstructionFeature uc = file.getFileUnderConstructionFeature();
      String clientName = uc.getClientName();
      lease = leaseManager.getLease(clientName);
      if (lease == null) {
        // Inconsistent state: under construction but no lease record.
        throw new AlreadyBeingCreatedException(
            op.getExceptionMessage(src, holder, clientMachine,
                "the file is under construction but no leases found."));
      }
      if (force) {
        // close now: no need to wait for soft lease expiration and 
        // close only the file src
        LOG.info("recoverLease: " + lease + ", src=" + src +
          " from client " + clientName);
        return internalReleaseLease(lease, src, iip, holder);
      } else {
        assert lease.getHolder().equals(clientName) :
          "Current lease holder " + lease.getHolder() +
          " does not match file creator " + clientName;
        //
        // If the original holder has not renewed in the last SOFTLIMIT 
        // period, then start lease recovery.
        //
        if (lease.expiredSoftLimit()) {
          LOG.info("startFile: recover " + lease + ", src=" + src + " client "
              + clientName);
          if (internalReleaseLease(lease, src, iip, null)) {
            return true;
          } else {
            throw new RecoveryInProgressException(
                op.getExceptionMessage(src, holder, clientMachine,
                    "lease recovery is in progress. Try again later."));
          }
        } else {
          final BlockInfoContiguous lastBlock = file.getLastBlock();
          if (lastBlock != null
              && lastBlock.getBlockUCState() == BlockUCState.UNDER_RECOVERY) {
            throw new RecoveryInProgressException(
                op.getExceptionMessage(src, holder, clientMachine,
                    "another recovery is in progress by "
                        + clientName + " on " + uc.getClientMachine()));
          } else {
            throw new AlreadyBeingCreatedException(
                op.getExceptionMessage(src, holder, clientMachine,
                    "this file lease is currently owned by "
                        + clientName + " on " + uc.getClientMachine()));
          }
        }
      }
    } else {
      // Not under construction: the file is already closed.
      return true;
     }
  }
2970
2971  /**
2972   * Append to an existing file in the namespace.
2973   */
2974  LastBlockWithStatus appendFile(String src, String holder,
2975      String clientMachine, EnumSet<CreateFlag> flag, boolean logRetryCache)
2976      throws IOException {
2977    try {
2978      return appendFileInt(src, holder, clientMachine,
2979          flag.contains(CreateFlag.NEW_BLOCK), logRetryCache);
2980    } catch (AccessControlException e) {
2981      logAuditEvent(false, "append", src);
2982      throw e;
2983    }
2984  }
2985
  /**
   * Worker for {@link #appendFile}: performs the append under the write lock
   * and syncs the edit log afterwards.
   *
   * @param srcArg path as supplied by the client
   * @param holder lease holder (client name)
   * @param clientMachine client machine, used in error messages
   * @param newBlock whether the appended data goes to a new block
   * @param logRetryCache whether to record RPC ids for retry-cache rebuilding
   * @return the last partial block (or null) together with the file status
   */
  private LastBlockWithStatus appendFileInt(final String srcArg, String holder,
      String clientMachine, boolean newBlock, boolean logRetryCache)
      throws IOException {
    String src = srcArg;
    final String operationName = "append";
    NameNode.stateChangeLog.debug(
        "DIR* NameSystem.appendFile: src={}, holder={}, clientMachine={}",
        src, holder, clientMachine);
    boolean skipSync = false;
    if (!supportAppends) {
      throw new UnsupportedOperationException(
          "Append is not enabled on this NameNode. Use the " +
          DFS_SUPPORT_APPEND_KEY + " configuration option to enable it.");
    }

    LocatedBlock lb = null;
    HdfsFileStatus stat = null;
    FSPermissionChecker pc = getPermissionChecker();
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot append to file" + src);
      final INodesInPath iip = dir.resolvePathForWrite(pc, src);
      src = iip.getPath();
      lb = appendFileInternal(pc, iip, holder, clientMachine, newBlock,
          logRetryCache);
      stat = FSDirStatAndListingOp.getFileInfo(dir, src, false,
          FSDirectory.isReservedRawName(srcArg));
    } catch (StandbyException se) {
      // Do not sync the edit log when failing over to standby.
      skipSync = true;
      throw se;
    } finally {
      writeUnlock(operationName);
      // There might be transactions logged while trying to recover the lease.
      // They need to be sync'ed even when an exception was thrown.
      if (!skipSync) {
        getEditLog().logSync();
      }
    }
    if (lb != null) {
      NameNode.stateChangeLog.debug(
          "DIR* NameSystem.appendFile: file {} for {} at {} block {} block" +
          " size {}", src, holder, clientMachine, lb.getBlock(),
          lb.getBlock().getNumBytes());
    }
    logAuditEvent(true, operationName, srcArg);
    return new LastBlockWithStatus(lb, stat);
  }
3034
  /** Wrap {@code blk} with this namesystem's block pool id. */
  ExtendedBlock getExtendedBlock(Block blk) {
    return new ExtendedBlock(blockPoolId, blk);
  }
3038  
3039  void setBlockPoolId(String bpid) {
3040    blockPoolId = bpid;
3041    blockManager.setBlockPoolId(blockPoolId);
3042  }
3043
3044  /**
3045   * The client would like to obtain an additional block for the indicated
3046   * filename (which is being written-to).  Return an array that consists
3047   * of the block, plus a set of machines.  The first on this list should
3048   * be where the client writes data.  Subsequent items in the list must
3049   * be provided in the connection to the first datanode.
3050   *
3051   * Make sure the previous blocks have been reported by datanodes and
3052   * are replicated.  Will return an empty 2-elt array if we want the
3053   * client to "try again later".
3054   */
3055  LocatedBlock getAdditionalBlock(String src, long fileId, String clientName,
3056      ExtendedBlock previous, Set<Node> excludedNodes, 
3057      List<String> favoredNodes) throws IOException {
3058    LocatedBlock[] onRetryBlock = new LocatedBlock[1];
3059    DatanodeStorageInfo targets[] = getNewBlockTargets(src, fileId,
3060        clientName, previous, excludedNodes, favoredNodes, onRetryBlock);
3061    if (targets == null) {
3062      assert onRetryBlock[0] != null : "Retry block is null";
3063      // This is a retry. Just return the last block.
3064      return onRetryBlock[0];
3065    }
3066    LocatedBlock newBlock = storeAllocatedBlock(
3067        src, fileId, clientName, previous, targets);
3068    return newBlock;
3069  }
3070
3071  /**
3072   * Part I of getAdditionalBlock().
3073   * Analyze the state of the file under read lock to determine if the client
3074   * can add a new block, detect potential retries, lease mismatches,
3075   * and minimal replication of the penultimate block.
3076   * 
3077   * Generate target DataNode locations for the new block,
3078   * but do not create the new block yet.
3079   */
3080  DatanodeStorageInfo[] getNewBlockTargets(String src, long fileId,
3081      String clientName, ExtendedBlock previous, Set<Node> excludedNodes,
3082      List<String> favoredNodes, LocatedBlock[] onRetryBlock) throws IOException {
3083    final long blockSize;
3084    final int replication;
3085    final byte storagePolicyID;
3086    Node clientNode = null;
3087    String clientMachine = null;
3088
3089    NameNode.stateChangeLog.debug("BLOCK* getAdditionalBlock: {}  inodeId {}" +
3090        " for {}", src, fileId, clientName);
3091
3092    checkOperation(OperationCategory.READ);
3093    FSPermissionChecker pc = getPermissionChecker();
3094    readLock();
3095    try {
3096      checkOperation(OperationCategory.READ);
3097      INodesInPath iip = dir.resolvePath(pc, src, fileId);
3098      src = iip.getPath();
3099      FileState fileState = analyzeFileState(
3100          iip, fileId, clientName, previous, onRetryBlock);
3101      if (onRetryBlock[0] != null && onRetryBlock[0].getLocations().length > 0) {
3102        // This is a retry. No need to generate new locations.
3103        // Use the last block if it has locations.
3104        return null;
3105      }
3106
3107      final INodeFile pendingFile = fileState.inode;
3108      if (!checkFileProgress(src, pendingFile, false)) {
3109        throw new NotReplicatedYetException("Not replicated yet: " + src);
3110      }
3111      src = fileState.path;
3112
3113      if (pendingFile.getBlocks().length >= maxBlocksPerFile) {
3114        throw new IOException("File has reached the limit on maximum number of"
3115            + " blocks (" + DFSConfigKeys.DFS_NAMENODE_MAX_BLOCKS_PER_FILE_KEY
3116            + "): " + pendingFile.getBlocks().length + " >= "
3117            + maxBlocksPerFile);
3118      }
3119      blockSize = pendingFile.getPreferredBlockSize();
3120      clientMachine = pendingFile.getFileUnderConstructionFeature()
3121          .getClientMachine();
3122      clientNode = blockManager.getDatanodeManager().getDatanodeByHost(
3123          clientMachine);
3124      replication = pendingFile.getFileReplication();
3125      storagePolicyID = pendingFile.getStoragePolicyID();
3126    } finally {
3127      readUnlock("getNewBlockTargets");
3128    }
3129
3130    if (clientNode == null) {
3131      clientNode = getClientNode(clientMachine);
3132    }
3133
3134    // choose targets for the new block to be allocated.
3135    return getBlockManager().chooseTarget4NewBlock( 
3136        src, replication, clientNode, excludedNodes, blockSize, favoredNodes,
3137        storagePolicyID);
3138  }
3139
3140  /**
3141   * Part II of getAdditionalBlock().
3142   * Should repeat the same analysis of the file state as in Part 1,
3143   * but under the write lock.
3144   * If the conditions still hold, then allocate a new block with
3145   * the new targets, add it to the INode and to the BlocksMap.
3146   */
3147  LocatedBlock storeAllocatedBlock(String src, long fileId, String clientName,
3148      ExtendedBlock previous, DatanodeStorageInfo[] targets) throws IOException {
3149    Block newBlock = null;
3150    long offset;
3151    checkOperation(OperationCategory.WRITE);
3152    waitForLoadingFSImage();
3153    writeLock();
3154    try {
3155      checkOperation(OperationCategory.WRITE);
3156      // Run the full analysis again, since things could have changed
3157      // while chooseTarget() was executing.
3158      LocatedBlock[] onRetryBlock = new LocatedBlock[1];
3159      final INodesInPath iip = dir.resolvePath(null, src, fileId);
3160      FileState fileState = 
3161          analyzeFileState(iip, fileId, clientName, previous, onRetryBlock);
3162      final INodeFile pendingFile = fileState.inode;
3163      src = fileState.path;
3164
3165      if (onRetryBlock[0] != null) {
3166        if (onRetryBlock[0].getLocations().length > 0) {
3167          // This is a retry. Just return the last block if having locations.
3168          return onRetryBlock[0];
3169        } else {
3170          // add new chosen targets to already allocated block and return
3171          BlockInfoContiguous lastBlockInFile = pendingFile.getLastBlock();
3172          ((BlockInfoContiguousUnderConstruction) lastBlockInFile)
3173              .setExpectedLocations(targets);
3174          offset = pendingFile.computeFileSize();
3175          return makeLocatedBlock(lastBlockInFile, targets, offset);
3176        }
3177      }
3178
3179      // commit the last block and complete it if it has minimum replicas
3180      commitOrCompleteLastBlock(pendingFile, fileState.iip,
3181                                ExtendedBlock.getLocalBlock(previous));
3182
3183      // allocate new block, record block locations in INode.
3184      newBlock = createNewBlock();
3185      INodesInPath inodesInPath = INodesInPath.fromINode(pendingFile);
3186      saveAllocatedBlock(src, inodesInPath, newBlock, targets);
3187
3188      persistNewBlock(src, pendingFile);
3189      offset = pendingFile.computeFileSize();
3190    } finally {
3191      writeUnlock("storeAllocatedBlock");
3192    }
3193    getEditLog().logSync();
3194
3195    // Return located block
3196    return makeLocatedBlock(newBlock, targets, offset);
3197  }
3198
3199  /*
3200   * Resolve clientmachine address to get a network location path
3201   */
3202  private Node getClientNode(String clientMachine) {
3203    List<String> hosts = new ArrayList<String>(1);
3204    hosts.add(clientMachine);
3205    List<String> rName = getBlockManager().getDatanodeManager()
3206        .resolveNetworkLocation(hosts);
3207    Node clientNode = null;
3208    if (rName != null) {
3209      // Able to resolve clientMachine mapping.
3210      // Create a temp node to findout the rack local nodes
3211      clientNode = new NodeBase(rName.get(0) + NodeBase.PATH_SEPARATOR_STR
3212          + clientMachine);
3213    }
3214    return clientNode;
3215  }
3216
3217  static class FileState {
3218    public final INodeFile inode;
3219    public final String path;
3220    public final INodesInPath iip;
3221
3222    public FileState(INodeFile inode, String fullPath, INodesInPath iip) {
3223      this.inode = inode;
3224      this.path = fullPath;
3225      this.iip = iip;
3226    }
3227  }
3228
  /**
   * Validate that a new block may be added to the file at {@code iip}:
   * checks safe mode, fs-object limits, the caller's lease, and that
   * {@code previous} matches the file's actual last block. Recognizes
   * retried allocations (see the case analysis below); for case 2 the
   * previously allocated block is published via {@code onRetryBlock[0]}.
   * Caller must hold at least the read lock.
   *
   * @throws IOException on safe mode, quota, lease, or block-mismatch errors
   */
  private FileState analyzeFileState(
      INodesInPath iip, long fileId, String clientName,
      ExtendedBlock previous, LocatedBlock[] onRetryBlock)
          throws IOException  {
    assert hasReadLock();
    String src = iip.getPath();
    checkBlock(previous);
    onRetryBlock[0] = null;
    checkNameNodeSafeMode("Cannot add block to " + src);

    // have we exceeded the configured limit of fs objects.
    checkFsObjectLimit();

    Block previousBlock = ExtendedBlock.getLocalBlock(previous);
    final INodeFile pendingFile = checkLease(iip, clientName, fileId);
    BlockInfoContiguous lastBlockInFile = pendingFile.getLastBlock();
    if (!Block.matchingIdAndGenStamp(previousBlock, lastBlockInFile)) {
      // The block that the client claims is the current last block
      // doesn't match up with what we think is the last block. There are
      // four possibilities:
      // 1) This is the first block allocation of an append() pipeline
      //    which started appending exactly at or exceeding the block boundary.
      //    In this case, the client isn't passed the previous block,
      //    so it makes the allocateBlock() call with previous=null.
      //    We can distinguish this since the last block of the file
      //    will be exactly a full block.
      // 2) This is a retry from a client that missed the response of a
      //    prior getAdditionalBlock() call, perhaps because of a network
      //    timeout, or because of an HA failover. In that case, we know
      //    by the fact that the client is re-issuing the RPC that it
      //    never began to write to the old block. Hence it is safe to
      //    to return the existing block.
      // 3) This is an entirely bogus request/bug -- we should error out
      //    rather than potentially appending a new block with an empty
      //    one in the middle, etc
      // 4) This is a retry from a client that timed out while
      //    the prior getAdditionalBlock() is still being processed,
      //    currently working on chooseTarget(). 
      //    There are no means to distinguish between the first and 
      //    the second attempts in Part I, because the first one hasn't
      //    changed the namesystem state yet.
      //    We run this analysis again in Part II where case 4 is impossible.

      BlockInfoContiguous penultimateBlock = pendingFile.getPenultimateBlock();
      if (previous == null &&
          lastBlockInFile != null &&
          lastBlockInFile.getNumBytes() >= pendingFile.getPreferredBlockSize() &&
          lastBlockInFile.isComplete()) {
        // Case 1
        NameNode.stateChangeLog.debug(
            "BLOCK* NameSystem.allocateBlock: handling block allocation" +
            " writing to a file with a complete previous block: src={}" +
            " lastBlock={}", src, lastBlockInFile);
      } else if (Block.matchingIdAndGenStamp(penultimateBlock, previousBlock)) {
        if (lastBlockInFile.getNumBytes() != 0) {
          throw new IOException(
              "Request looked like a retry to allocate block " +
              lastBlockInFile + " but it already contains " +
              lastBlockInFile.getNumBytes() + " bytes");
        }

        // Case 2
        // Return the last block.
        NameNode.stateChangeLog.info("BLOCK* allocateBlock: " +
            "caught retry for allocation of a new block in " +
            src + ". Returning previously allocated block " + lastBlockInFile);
        long offset = pendingFile.computeFileSize();
        onRetryBlock[0] = makeLocatedBlock(lastBlockInFile,
            ((BlockInfoContiguousUnderConstruction)lastBlockInFile).getExpectedStorageLocations(),
            offset);
        return new FileState(pendingFile, src, iip);
      } else {
        // Case 3
        throw new IOException("Cannot allocate block in " + src + ": " +
            "passed 'previous' block " + previous + " does not match actual " +
            "last block in file " + lastBlockInFile);
      }
    }
    return new FileState(pendingFile, src, iip);
  }
3309
3310  LocatedBlock makeLocatedBlock(Block blk, DatanodeStorageInfo[] locs,
3311                                        long offset) throws IOException {
3312    LocatedBlock lBlk = new LocatedBlock(
3313        getExtendedBlock(blk), locs, offset, false);
3314    getBlockManager().setBlockToken(
3315        lBlk, BlockTokenSecretManager.AccessMode.WRITE);
3316    return lBlk;
3317  }
3318
  /**
   * Choose additional datanodes for an existing write pipeline
   * (replace-datanode-on-failure).
   *
   * @see ClientProtocol#getAdditionalDatanode
   */
  LocatedBlock getAdditionalDatanode(String src, long fileId,
      final ExtendedBlock blk, final DatanodeInfo[] existings,
      final String[] storageIDs,
      final Set<Node> excludes,
      final int numAdditionalNodes, final String clientName
      ) throws IOException {
    //check if the feature is enabled
    dtpReplaceDatanodeOnFailure.checkEnabled();

    Node clientnode = null;
    String clientMachine;
    final long preferredblocksize;
    final byte storagePolicyID;
    final List<DatanodeStorageInfo> chosen;
    checkOperation(OperationCategory.READ);
    FSPermissionChecker pc = getPermissionChecker();
    readLock();
    try {
      // Re-check after acquiring the lock in case of an HA state change.
      checkOperation(OperationCategory.READ);
      //check safe mode
      checkNameNodeSafeMode("Cannot add datanode; src=" + src + ", blk=" + blk);
      final INodesInPath iip = dir.resolvePath(pc, src, fileId);
      src = iip.getPath();

      //check lease
      final INodeFile file = checkLease(iip, clientName, fileId);
      // Snapshot file attributes under the lock; targets are chosen after
      // the lock is released.
      clientMachine = file.getFileUnderConstructionFeature().getClientMachine();
      clientnode = blockManager.getDatanodeManager().getDatanodeByHost(clientMachine);
      preferredblocksize = file.getPreferredBlockSize();
      storagePolicyID = file.getStoragePolicyID();

      //find datanode storages
      final DatanodeManager dm = blockManager.getDatanodeManager();
      chosen = Arrays.asList(dm.getDatanodeStorageInfos(existings, storageIDs,
          "src=%s, fileId=%d, blk=%s, clientName=%s, clientMachine=%s",
          src, fileId, blk, clientName, clientMachine));
    } finally {
      readUnlock("getAdditionalDatanode");
    }

    // Client host is not itself a datanode: resolve its network location.
    if (clientnode == null) {
      clientnode = getClientNode(clientMachine);
    }

    // choose new datanodes.
    final DatanodeStorageInfo[] targets = blockManager.chooseTarget4AdditionalDatanode(
        src, numAdditionalNodes, clientnode, chosen, 
        excludes, preferredblocksize, storagePolicyID);
    final LocatedBlock lb = new LocatedBlock(blk, targets);
    blockManager.setBlockToken(lb, AccessMode.COPY);
    return lb;
  }
3372
3373  /**
3374   * The client would like to let go of the given block
3375   */
3376  boolean abandonBlock(ExtendedBlock b, long fileId, String src, String holder)
3377      throws IOException {
3378    NameNode.stateChangeLog.debug(
3379        "BLOCK* NameSystem.abandonBlock: {} of file {}", b, src);
3380    checkOperation(OperationCategory.WRITE);
3381    FSPermissionChecker pc = getPermissionChecker();
3382    waitForLoadingFSImage();
3383    writeLock();
3384    final INodesInPath iip = dir.resolvePath(pc, src, fileId);
3385    src = iip.getPath();
3386    try {
3387      checkOperation(OperationCategory.WRITE);
3388      checkNameNodeSafeMode("Cannot abandon block " + b + " for file" + src);
3389      final INodeFile file = checkLease(iip, holder, fileId);
3390
3391      // Remove the block from the pending creates list
3392      boolean removed = dir.removeBlock(src, iip, file,
3393          ExtendedBlock.getLocalBlock(b));
3394      if (!removed) {
3395        return true;
3396      }
3397      NameNode.stateChangeLog.debug("BLOCK* NameSystem.abandonBlock: {} is " +
3398          "removed from pendingCreates", b);
3399      persistBlocks(src, file, false);
3400    } finally {
3401      writeUnlock("abandonBlock");
3402    }
3403    getEditLog().logSync();
3404
3405    return true;
3406  }
3407
3408  private INodeFile checkLease(INodesInPath iip, String holder, long fileId)
3409      throws LeaseExpiredException, FileNotFoundException {
3410    String src = iip.getPath();
3411    INode inode = iip.getLastINode();
3412    assert hasReadLock();
3413    final String ident = src + " (inode " + fileId + ")";
3414    if (inode == null) {
3415      Lease lease = leaseManager.getLease(holder);
3416      throw new LeaseExpiredException(
3417          "No lease on " + ident + ": File does not exist. "
3418          + (lease != null ? lease.toString()
3419              : "Holder " + holder + " does not have any open files."));
3420    }
3421    if (!inode.isFile()) {
3422      Lease lease = leaseManager.getLease(holder);
3423      throw new LeaseExpiredException(
3424          "No lease on " + ident + ": INode is not a regular file. "
3425              + (lease != null ? lease.toString()
3426              : "Holder " + holder + " does not have any open files."));
3427    }
3428    final INodeFile file = inode.asFile();
3429    if (!file.isUnderConstruction()) {
3430      Lease lease = leaseManager.getLease(holder);
3431      throw new LeaseExpiredException(
3432          "No lease on " + ident + ": File is not open for writing. "
3433          + (lease != null ? lease.toString()
3434              : "Holder " + holder + " does not have any open files."));
3435    }
3436    // No further modification is allowed on a deleted file.
3437    // A file is considered deleted, if it is not in the inodeMap or is marked
3438    // as deleted in the snapshot feature.
3439    if (isFileDeleted(file)) {
3440      throw new FileNotFoundException(src);
3441    }
3442    String clientName = file.getFileUnderConstructionFeature().getClientName();
3443    if (holder != null && !clientName.equals(holder)) {
3444      throw new LeaseExpiredException("Lease mismatch on " + ident +
3445          " owned by " + clientName + " but is accessed by " + holder);
3446    }
3447    return file;
3448  }
3449 
3450  /**
3451   * Complete in-progress write to the given file.
3452   * @return true if successful, false if the client should continue to retry
3453   *         (e.g if not all blocks have reached minimum replication yet)
3454   * @throws IOException on error (eg lease mismatch, file not open, file deleted)
3455   */
3456  boolean completeFile(final String srcArg, String holder,
3457                       ExtendedBlock last, long fileId)
3458    throws SafeModeException, UnresolvedLinkException, IOException {
3459    String src = srcArg;
3460    NameNode.stateChangeLog.debug("DIR* NameSystem.completeFile: {} for {}",
3461        src, holder);
3462    checkBlock(last);
3463    boolean success = false;
3464    checkOperation(OperationCategory.WRITE);
3465    waitForLoadingFSImage();
3466    writeLock();
3467    try {
3468      checkOperation(OperationCategory.WRITE);
3469      checkNameNodeSafeMode("Cannot complete file " + src);
3470      success = completeFileInternal(src, holder,
3471        ExtendedBlock.getLocalBlock(last), fileId);
3472    } finally {
3473      writeUnlock("completeFile");
3474    }
3475    getEditLog().logSync();
3476    if (success) {
3477      NameNode.stateChangeLog.info("DIR* completeFile: " + srcArg
3478          + " is closed by " + holder);
3479    }
3480    return success;
3481  }
3482
  /**
   * Do the work of completing a file while the write lock is held.
   * Tolerates retried RPCs: if the file is already closed but the client's
   * view of the last block matches the actual last block, the earlier close
   * is assumed to have succeeded and true is returned (HDFS-3031).
   *
   * @return true if the file was (or already had been) closed; false if the
   *         client should retry because blocks lack minimum replication
   */
  private boolean completeFileInternal(String src, String holder, Block last,
      long fileId) throws IOException {
    assert hasWriteLock();
    final INodeFile pendingFile;
    FSPermissionChecker pc = getPermissionChecker();
    final INodesInPath iip = dir.resolvePath(pc, src, fileId);
    src = iip.getPath();
    INode inode = null;
    try {
      inode = iip.getLastINode();
      pendingFile = checkLease(iip, holder, fileId);
    } catch (LeaseExpiredException lee) {
      if (inode != null && inode.isFile() &&
          !inode.asFile().isUnderConstruction()) {
        // This could be a retry RPC - i.e the client tried to close
        // the file, but missed the RPC response. Thus, it is trying
        // again to close the file. If the file still exists and
        // the client's view of the last block matches the actual
        // last block, then we'll treat it as a successful close.
        // See HDFS-3031.
        final Block realLastBlock = inode.asFile().getLastBlock();
        if (Block.matchingIdAndGenStamp(last, realLastBlock)) {
          NameNode.stateChangeLog.info("DIR* completeFile: " +
              "request from " + holder + " to complete inode " + fileId +
              "(" + src + ") which is already closed. But, it appears to be " +
              "an RPC retry. Returning success");
          return true;
        }
      }
      throw lee;
    }
    // Check the state of the penultimate block. It should be completed
    // before attempting to complete the last one.
    if (!checkFileProgress(src, pendingFile, false)) {
      return false;
    }

    // commit the last block and complete it if it has minimum replicas
    commitOrCompleteLastBlock(pendingFile, iip, last);

    // All blocks (including the just-committed last one) must reach
    // minimum replication before the file can be finalized.
    if (!checkFileProgress(src, pendingFile, true)) {
      return false;
    }

    finalizeINodeFileUnderConstruction(src, pendingFile,
        Snapshot.CURRENT_STATE_ID);
    return true;
  }
3531
3532  /**
3533   * Save allocated block at the given pending filename
3534   * 
3535   * @param src path to the file
3536   * @param inodesInPath representing each of the components of src.
3537   *                     The last INode is the INode for {@code src} file.
3538   * @param newBlock newly allocated block to be save
3539   * @param targets target datanodes where replicas of the new block is placed
3540   * @throws QuotaExceededException If addition of block exceeds space quota
3541   */
3542  BlockInfoContiguous saveAllocatedBlock(String src, INodesInPath inodesInPath,
3543      Block newBlock, DatanodeStorageInfo[] targets)
3544          throws IOException {
3545    assert hasWriteLock();
3546    BlockInfoContiguous b = dir.addBlock(src, inodesInPath, newBlock, targets);
3547    NameNode.stateChangeLog.info("BLOCK* allocate " + b + " for " + src);
3548    DatanodeStorageInfo.incrementBlocksScheduled(targets);
3549    return b;
3550  }
3551
3552  /**
3553   * Create new block with a unique block id and a new generation stamp.
3554   */
3555  Block createNewBlock() throws IOException {
3556    assert hasWriteLock();
3557    Block b = new Block(nextBlockId(), 0, 0);
3558    // Increment the generation stamp for every new block.
3559    b.setGenerationStamp(nextGenerationStamp(false));
3560    return b;
3561  }
3562
3563  /**
3564   * Check that the indicated file's blocks are present and
3565   * replicated.  If not, return false. If checkall is true, then check
3566   * all blocks, otherwise check only penultimate block.
3567   */
3568  boolean checkFileProgress(String src, INodeFile v, boolean checkall) {
3569    if (checkall) {
3570      // check all blocks of the file.
3571      for (BlockInfoContiguous block: v.getBlocks()) {
3572        if (!isCompleteBlock(src, block, blockManager.minReplication)) {
3573          return false;
3574        }
3575      }
3576    } else {
3577      // check the penultimate block of this file
3578      BlockInfoContiguous b = v.getPenultimateBlock();
3579      if (b != null
3580          && !isCompleteBlock(src, b, blockManager.minReplication)) {
3581        return false;
3582      }
3583    }
3584    return true;
3585  }
3586
3587  private static boolean isCompleteBlock(String src, BlockInfoContiguous b, int minRepl) {
3588    if (!b.isComplete()) {
3589      final BlockInfoContiguousUnderConstruction uc = (BlockInfoContiguousUnderConstruction)b;
3590      final int numNodes = b.numNodes();
3591      LOG.info("BLOCK* " + b + " is not COMPLETE (ucState = "
3592          + uc.getBlockUCState() + ", replication# = " + numNodes
3593          + (numNodes < minRepl? " < ": " >= ")
3594          + " minimum = " + minRepl + ") in file " + src);
3595      return false;
3596    }
3597    return true;
3598  }
3599
3600  ////////////////////////////////////////////////////////////////
3601  // Here's how to handle block-copy failure during client write:
3602  // -- As usual, the client's write should result in a streaming
3603  // backup write to a k-machine sequence.
3604  // -- If one of the backup machines fails, no worries.  Fail silently.
3605  // -- Before client is allowed to close and finalize file, make sure
3606  // that the blocks are backed up.  Namenode may have to issue specific backup
3607  // commands to make up for earlier datanode failures.  Once all copies
3608  // are made, edit namespace and return to client.
3609  ////////////////////////////////////////////////////////////////
3610
3611  /** 
3612   * Change the indicated filename. 
3613   * @deprecated Use {@link #renameTo(String, String, boolean,
3614   * Options.Rename...)} instead.
3615   */
3616  @Deprecated
3617  boolean renameTo(String src, String dst, boolean logRetryCache)
3618      throws IOException {
3619    final String operationName = "rename";
3620    waitForLoadingFSImage();
3621    FSDirRenameOp.RenameOldResult ret = null;
3622    writeLock();
3623    try {
3624      checkOperation(OperationCategory.WRITE);
3625      checkNameNodeSafeMode("Cannot rename " + src);
3626      ret = FSDirRenameOp.renameToInt(dir, src, dst, logRetryCache);
3627    } catch (AccessControlException e)  {
3628      logAuditEvent(false, operationName, src, dst, null);
3629      throw e;
3630    } finally {
3631      writeUnlock(operationName);
3632    }
3633    boolean success = ret != null && ret.success;
3634    if (success) {
3635      getEditLog().logSync();
3636    }
3637    logAuditEvent(success, "rename", src, dst,
3638        ret == null ? null : ret.auditStat);
3639    return success;
3640  }
3641
3642  void renameTo(final String src, final String dst,
3643                boolean logRetryCache, Options.Rename... options)
3644      throws IOException {
3645    final String operationName = "rename";
3646    waitForLoadingFSImage();
3647    Map.Entry<BlocksMapUpdateInfo, HdfsFileStatus> res = null;
3648    writeLock();
3649    try {
3650      checkOperation(OperationCategory.WRITE);
3651      checkNameNodeSafeMode("Cannot rename " + src);
3652      res = FSDirRenameOp.renameToInt(dir, src, dst, logRetryCache, options);
3653    } catch (AccessControlException e) {
3654      logAuditEvent(false, operationName + " (options=" +
3655          Arrays.toString(options) + ")", src, dst, null);
3656      throw e;
3657    } finally {
3658      writeUnlock(operationName);
3659    }
3660
3661    getEditLog().logSync();
3662
3663    BlocksMapUpdateInfo collectedBlocks = res.getKey();
3664    HdfsFileStatus auditStat = res.getValue();
3665    if (!collectedBlocks.getToDeleteList().isEmpty()) {
3666      removeBlocks(collectedBlocks);
3667      collectedBlocks.clear();
3668    }
3669
3670    logAuditEvent(true, operationName + " (options=" +
3671        Arrays.toString(options) + ")", src, dst, auditStat);
3672  }
3673
3674  /**
3675   * Remove the indicated file from namespace.
3676   * 
3677   * @see ClientProtocol#delete(String, boolean) for detailed description and 
3678   * description of exceptions
3679   */
3680  boolean delete(String src, boolean recursive, boolean logRetryCache)
3681      throws IOException {
3682    waitForLoadingFSImage();
3683    final String operationName = "delete";
3684    BlocksMapUpdateInfo toRemovedBlocks = null;
3685    writeLock();
3686    boolean ret = false;
3687    try {
3688      checkOperation(OperationCategory.WRITE);
3689      checkNameNodeSafeMode("Cannot delete " + src);
3690      toRemovedBlocks = FSDirDeleteOp.delete(
3691          this, src, recursive, logRetryCache);
3692      ret = toRemovedBlocks != null;
3693    } catch (AccessControlException e) {
3694      logAuditEvent(false, operationName, src);
3695      throw e;
3696    } finally {
3697      writeUnlock(operationName);
3698    }
3699    getEditLog().logSync();
3700    if (toRemovedBlocks != null) {
3701      removeBlocks(toRemovedBlocks); // Incremental deletion of blocks
3702    }
3703    logAuditEvent(true, operationName, src);
3704    return ret;
3705  }
3706
  /** Returns a permission checker bound to the current caller's identity. */
  FSPermissionChecker getPermissionChecker()
      throws AccessControlException {
    return dir.getPermissionChecker();
  }
3711
3712  /**
3713   * From the given list, incrementally remove the blocks from blockManager
3714   * Writelock is dropped and reacquired every BLOCK_DELETION_INCREMENT to
3715   * ensure that other waiters on the lock can get in. See HDFS-2938
3716   * 
3717   * @param blocks
3718   *          An instance of {@link BlocksMapUpdateInfo} which contains a list
3719   *          of blocks that need to be removed from blocksMap
3720   */
3721  void removeBlocks(BlocksMapUpdateInfo blocks) {
3722    List<Block> toDeleteList = blocks.getToDeleteList();
3723    Iterator<Block> iter = toDeleteList.iterator();
3724    while (iter.hasNext()) {
3725      writeLock();
3726      try {
3727        for (int i = 0; i < BLOCK_DELETION_INCREMENT && iter.hasNext(); i++) {
3728          blockManager.removeBlock(iter.next());
3729        }
3730      } finally {
3731        writeUnlock("removeBlocks");
3732      }
3733    }
3734  }
3735  
3736  /**
3737   * Remove leases and inodes related to a given path
3738   * @param src The given path
3739   * @param removedINodes Containing the list of inodes to be removed from
3740   *                      inodesMap
3741   * @param acquireINodeMapLock Whether to acquire the lock for inode removal
3742   */
3743  void removeLeasesAndINodes(String src, List<INode> removedINodes,
3744      final boolean acquireINodeMapLock) {
3745    assert hasWriteLock();
3746    leaseManager.removeLeaseWithPrefixPath(src);
3747    // remove inodes from inodesMap
3748    if (removedINodes != null) {
3749      if (acquireINodeMapLock) {
3750        dir.writeLock();
3751      }
3752      try {
3753        dir.removeFromInodeMap(removedINodes);
3754      } finally {
3755        if (acquireINodeMapLock) {
3756          dir.writeUnlock();
3757        }
3758      }
3759      removedINodes.clear();
3760    }
3761  }
3762
3763  /**
3764   * Removes the blocks from blocksmap and updates the safemode blocks total
3765   * 
3766   * @param blocks
3767   *          An instance of {@link BlocksMapUpdateInfo} which contains a list
3768   *          of blocks that need to be removed from blocksMap
3769   */
3770  void removeBlocksAndUpdateSafemodeTotal(BlocksMapUpdateInfo blocks) {
3771    assert hasWriteLock();
3772    // In the case that we are a Standby tailing edits from the
3773    // active while in safe-mode, we need to track the total number
3774    // of blocks and safe blocks in the system.
3775    boolean trackBlockCounts = isSafeModeTrackingBlocks();
3776    int numRemovedComplete = 0, numRemovedSafe = 0;
3777
3778    for (Block b : blocks.getToDeleteList()) {
3779      if (trackBlockCounts) {
3780        BlockInfoContiguous bi = getStoredBlock(b);
3781        if (bi.isComplete()) {
3782          numRemovedComplete++;
3783          if (bi.numNodes() >= blockManager.minReplication) {
3784            numRemovedSafe++;
3785          }
3786        }
3787      }
3788      blockManager.removeBlock(b);
3789    }
3790    if (trackBlockCounts) {
3791      if (LOG.isDebugEnabled()) {
3792        LOG.debug("Adjusting safe-mode totals for deletion."
3793            + "decreasing safeBlocks by " + numRemovedSafe
3794            + ", totalBlocks by " + numRemovedComplete);
3795      }
3796      adjustSafeModeBlockTotals(-numRemovedSafe, -numRemovedComplete);
3797    }
3798  }
3799
3800  /**
3801   * @see SafeModeInfo#shouldIncrementallyTrackBlocks
3802   */
3803  private boolean isSafeModeTrackingBlocks() {
3804    if (!haEnabled) {
3805      // Never track blocks incrementally in non-HA code.
3806      return false;
3807    }
3808    SafeModeInfo sm = this.safeMode;
3809    return sm != null && sm.shouldIncrementallyTrackBlocks();
3810  }
3811
3812  /**
3813   * Get the file info for a specific file.
3814   *
3815   * @param src The string representation of the path to the file
3816   * @param resolveLink whether to throw UnresolvedLinkException
3817   *        if src refers to a symlink
3818   *
3819   * @throws AccessControlException if access is denied
3820   * @throws UnresolvedLinkException if a symlink is encountered.
3821   *
3822   * @return object containing information regarding the file
3823   *         or null if file not found
3824   * @throws StandbyException
3825   */
3826  HdfsFileStatus getFileInfo(final String src, boolean resolveLink)
3827    throws IOException {
3828    final String operationName = "getfileinfo";
3829    checkOperation(OperationCategory.READ);
3830    HdfsFileStatus stat = null;
3831    readLock();
3832    try {
3833      checkOperation(OperationCategory.READ);
3834      stat = FSDirStatAndListingOp.getFileInfo(dir, src, resolveLink);
3835    } catch (AccessControlException e) {
3836      logAuditEvent(false, operationName, src);
3837      throw e;
3838    } finally {
3839      readUnlock(operationName);
3840    }
3841    logAuditEvent(true, operationName, src);
3842    return stat;
3843  }
3844
3845  /**
3846   * Returns true if the file is closed
3847   */
3848  boolean isFileClosed(final String src) throws IOException {
3849    final String operationName = "isFileClosed";
3850    checkOperation(OperationCategory.READ);
3851    readLock();
3852    try {
3853      checkOperation(OperationCategory.READ);
3854      return FSDirStatAndListingOp.isFileClosed(dir, src);
3855    } catch (AccessControlException e) {
3856      logAuditEvent(false, operationName, src);
3857      throw e;
3858    } finally {
3859      readUnlock(operationName);
3860    }
3861  }
3862
3863  /**
3864   * Create all the necessary directories
3865   */
3866  boolean mkdirs(String src, PermissionStatus permissions,
3867      boolean createParent) throws IOException {
3868    final String operationName = "mkdirs";
3869    HdfsFileStatus auditStat = null;
3870    checkOperation(OperationCategory.WRITE);
3871    writeLock();
3872    try {
3873      checkOperation(OperationCategory.WRITE);
3874      checkNameNodeSafeMode("Cannot create directory " + src);
3875      auditStat = FSDirMkdirOp.mkdirs(this, src, permissions, createParent);
3876    } catch (AccessControlException e) {
3877      logAuditEvent(false, operationName, src);
3878      throw e;
3879    } finally {
3880      writeUnlock(operationName);
3881    }
3882    getEditLog().logSync();
3883    logAuditEvent(true, operationName, src, null, auditStat);
3884    return true;
3885  }
3886
3887  /**
3888   * Get the content summary for a specific file/dir.
3889   *
3890   * @param src The string representation of the path to the file
3891   *
3892   * @throws AccessControlException if access is denied
3893   * @throws UnresolvedLinkException if a symlink is encountered.
3894   * @throws FileNotFoundException if no file exists
3895   * @throws StandbyException
3896   * @throws IOException for issues with writing to the audit log
3897   *
3898   * @return object containing information regarding the file
3899   *         or null if file not found
3900   */
3901  ContentSummary getContentSummary(final String src) throws IOException {
3902    checkOperation(OperationCategory.READ);
3903    final String operationName = "contentSummary";
3904    readLock();
3905    boolean success = true;
3906    try {
3907      checkOperation(OperationCategory.READ);
3908      return FSDirStatAndListingOp.getContentSummary(dir, src);
3909    } catch (AccessControlException ace) {
3910      success = false;
3911      throw ace;
3912    } finally {
3913      readUnlock(operationName);
3914      logAuditEvent(success, operationName, src);
3915    }
3916  }
3917
3918  /**
3919   * Set the namespace quota and storage space quota for a directory.
3920   * See {@link ClientProtocol#setQuota(String, long, long, StorageType)} for the
3921   * contract.
3922   * 
3923   * Note: This does not support ".inodes" relative path.
3924   */
3925  void setQuota(String src, long nsQuota, long ssQuota, StorageType type)
3926      throws IOException {
3927    checkOperation(OperationCategory.WRITE);
3928    final String operationName = "setQuota";
3929    writeLock();
3930    boolean success = false;
3931    try {
3932      checkOperation(OperationCategory.WRITE);
3933      checkNameNodeSafeMode("Cannot set quota on " + src);
3934      FSDirAttrOp.setQuota(dir, src, nsQuota, ssQuota, type);
3935      success = true;
3936    } finally {
3937      writeUnlock(operationName);
3938      if (success) {
3939        getEditLog().logSync();
3940      }
3941      logAuditEvent(success, operationName, src);
3942    }
3943  }
3944
  /** Persist all metadata about this file.
   * @param src The string representation of the path
   * @param fileId The inode ID that we're fsyncing.  Older clients will pass
   *               INodeId.GRANDFATHER_INODE_ID here.
   * @param clientName The string representation of the client
   * @param lastBlockLength The length of the last block 
   *                        under construction reported from client.
   * @throws IOException if path does not exist
   */
  void fsync(String src, long fileId, String clientName, long lastBlockLength)
      throws IOException {
    NameNode.stateChangeLog.info("BLOCK* fsync: " + src + " for " + clientName);
    checkOperation(OperationCategory.WRITE);

    FSPermissionChecker pc = getPermissionChecker();
    waitForLoadingFSImage();
    writeLock();
    try {
      // Re-check under the lock in case of an HA state transition.
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot fsync file " + src);
      INodesInPath iip = dir.resolvePath(pc, src, fileId);
      src = iip.getPath();
      // checkLease verifies clientName against the lease on the open file.
      final INodeFile pendingFile = checkLease(iip, clientName, fileId);
      if (lastBlockLength > 0) {
        // The client reported how many bytes of the last block it has
        // actually written; record that on the under-construction feature.
        pendingFile.getFileUnderConstructionFeature().updateLengthOfLastBlock(
            pendingFile, lastBlockLength);
      }
      persistBlocks(src, pendingFile, false);
    } finally {
      writeUnlock("fsync");
    }
    // Sync outside the write lock.
    getEditLog().logSync();
  }
3978
3979  /**
3980   * Move a file that is being written to be immutable.
3981   * @param src The filename
3982   * @param lease The lease for the client creating the file
3983   * @param recoveryLeaseHolder reassign lease to this holder if the last block
3984   *        needs recovery; keep current holder if null.
3985   * @throws AlreadyBeingCreatedException if file is waiting to achieve minimal
3986   *         replication;<br>
3987   *         RecoveryInProgressException if lease recovery is in progress.<br>
3988   *         IOException in case of an error.
3989   * @return true  if file has been successfully finalized and closed or 
3990   *         false if block recovery has been initiated. Since the lease owner
3991   *         has been changed and logged, caller should call logSync().
3992   */
3993  boolean internalReleaseLease(Lease lease, String src, INodesInPath iip,
3994      String recoveryLeaseHolder) throws IOException {
3995    LOG.info("Recovering " + lease + ", src=" + src);
3996    assert !isInSafeMode();
3997    assert hasWriteLock();
3998
3999    final INodeFile pendingFile = iip.getLastINode().asFile();
4000    int nrBlocks = pendingFile.numBlocks();
4001    BlockInfoContiguous[] blocks = pendingFile.getBlocks();
4002
4003    int nrCompleteBlocks;
4004    BlockInfoContiguous curBlock = null;
4005    for(nrCompleteBlocks = 0; nrCompleteBlocks < nrBlocks; nrCompleteBlocks++) {
4006      curBlock = blocks[nrCompleteBlocks];
4007      if(!curBlock.isComplete())
4008        break;
4009      assert blockManager.checkMinReplication(curBlock) :
4010              "A COMPLETE block is not minimally replicated in " + src;
4011    }
4012
4013    // If there are no incomplete blocks associated with this file,
4014    // then reap lease immediately and close the file.
4015    if(nrCompleteBlocks == nrBlocks) {
4016      finalizeINodeFileUnderConstruction(src, pendingFile,
4017          iip.getLatestSnapshotId());
4018      NameNode.stateChangeLog.warn("BLOCK*"
4019        + " internalReleaseLease: All existing blocks are COMPLETE,"
4020        + " lease removed, file closed.");
4021      return true;  // closed!
4022    }
4023
4024    // Only the last and the penultimate blocks may be in non COMPLETE state.
4025    // If the penultimate block is not COMPLETE, then it must be COMMITTED.
4026    if(nrCompleteBlocks < nrBlocks - 2 ||
4027       nrCompleteBlocks == nrBlocks - 2 &&
4028         curBlock != null &&
4029         curBlock.getBlockUCState() != BlockUCState.COMMITTED) {
4030      final String message = "DIR* NameSystem.internalReleaseLease: "
4031        + "attempt to release a create lock on "
4032        + src + " but file is already closed.";
4033      NameNode.stateChangeLog.warn(message);
4034      throw new IOException(message);
4035    }
4036
4037    // The last block is not COMPLETE, and
4038    // that the penultimate block if exists is either COMPLETE or COMMITTED
4039    final BlockInfoContiguous lastBlock = pendingFile.getLastBlock();
4040    BlockUCState lastBlockState = lastBlock.getBlockUCState();
4041    BlockInfoContiguous penultimateBlock = pendingFile.getPenultimateBlock();
4042
4043    // If penultimate block doesn't exist then its minReplication is met
4044    boolean penultimateBlockMinReplication = penultimateBlock == null ? true :
4045        blockManager.checkMinReplication(penultimateBlock);
4046
4047    switch(lastBlockState) {
4048    case COMPLETE:
4049      assert false : "Already checked that the last block is incomplete";
4050      break;
4051    case COMMITTED:
4052      // Close file if committed blocks are minimally replicated
4053      if(penultimateBlockMinReplication &&
4054          blockManager.checkMinReplication(lastBlock)) {
4055        finalizeINodeFileUnderConstruction(src, pendingFile,
4056            iip.getLatestSnapshotId());
4057        NameNode.stateChangeLog.warn("BLOCK*"
4058          + " internalReleaseLease: Committed blocks are minimally replicated,"
4059          + " lease removed, file closed.");
4060        return true;  // closed!
4061      }
4062      // Cannot close file right now, since some blocks 
4063      // are not yet minimally replicated.
4064      // This may potentially cause infinite loop in lease recovery
4065      // if there are no valid replicas on data-nodes.
4066      String message = "DIR* NameSystem.internalReleaseLease: " +
4067          "Failed to release lease for file " + src +
4068          ". Committed blocks are waiting to be minimally replicated." +
4069          " Try again later.";
4070      NameNode.stateChangeLog.warn(message);
4071      throw new AlreadyBeingCreatedException(message);
4072    case UNDER_CONSTRUCTION:
4073    case UNDER_RECOVERY:
4074      final BlockInfoContiguousUnderConstruction uc = (BlockInfoContiguousUnderConstruction)lastBlock;
4075      // determine if last block was intended to be truncated
4076      BlockInfoContiguous recoveryBlock = uc.getTruncateBlock();
4077      boolean truncateRecovery = recoveryBlock != null;
4078      boolean copyOnTruncate = truncateRecovery &&
4079          recoveryBlock.getBlockId() != uc.getBlockId();
4080      assert !copyOnTruncate ||
4081          recoveryBlock.getBlockId() < uc.getBlockId() &&
4082          recoveryBlock.getGenerationStamp() < uc.getGenerationStamp() &&
4083          recoveryBlock.getNumBytes() > uc.getNumBytes() :
4084            "wrong recoveryBlock";
4085
4086      // setup the last block locations from the blockManager if not known
4087      if (uc.getNumExpectedLocations() == 0) {
4088        uc.setExpectedLocations(blockManager.getStorages(lastBlock));
4089      }
4090
4091      if (uc.getNumExpectedLocations() == 0 && uc.getNumBytes() == 0) {
4092        // There is no datanode reported to this block.
4093        // may be client have crashed before writing data to pipeline.
4094        // This blocks doesn't need any recovery.
4095        // We can remove this block and close the file.
4096        pendingFile.removeLastBlock(lastBlock);
4097        finalizeINodeFileUnderConstruction(src, pendingFile,
4098            iip.getLatestSnapshotId());
4099        NameNode.stateChangeLog.warn("BLOCK* internalReleaseLease: "
4100            + "Removed empty last block and closed file.");
4101        return true;
4102      }
4103      // start recovery of the last block for this file
4104      long blockRecoveryId = nextGenerationStamp(blockIdManager.isLegacyBlock(uc));
4105      lease = reassignLease(lease, src, recoveryLeaseHolder, pendingFile);
4106      if(copyOnTruncate) {
4107        uc.setGenerationStamp(blockRecoveryId);
4108      } else if(truncateRecovery) {
4109        recoveryBlock.setGenerationStamp(blockRecoveryId);
4110      }
4111      uc.initializeBlockRecovery(blockRecoveryId, true);
4112      leaseManager.renewLease(lease);
4113      // Cannot close file right now, since the last block requires recovery.
4114      // This may potentially cause infinite loop in lease recovery
4115      // if there are no valid replicas on data-nodes.
4116      NameNode.stateChangeLog.warn(
4117                "DIR* NameSystem.internalReleaseLease: " +
4118                "File " + src + " has not been closed." +
4119               " Lease recovery is in progress. " +
4120                "RecoveryId = " + blockRecoveryId + " for block " + lastBlock);
4121      break;
4122    }
4123    return false;
4124  }
4125
4126  private Lease reassignLease(Lease lease, String src, String newHolder,
4127      INodeFile pendingFile) {
4128    assert hasWriteLock();
4129    if(newHolder == null)
4130      return lease;
4131    // The following transaction is not synced. Make sure it's sync'ed later.
4132    logReassignLease(lease.getHolder(), src, newHolder);
4133    return reassignLeaseInternal(lease, src, newHolder, pendingFile);
4134  }
4135  
  /**
   * Record {@code newHolder} as the client of the under-construction file and
   * reassign the lease in the lease manager. Does not write to the edit log;
   * see {@link #reassignLease} for the logged variant.
   */
  Lease reassignLeaseInternal(Lease lease, String src, String newHolder,
      INodeFile pendingFile) {
    assert hasWriteLock();
    pendingFile.getFileUnderConstructionFeature().setClientName(newHolder);
    return leaseManager.reassignLease(lease, src, newHolder);
  }
4142
  /**
   * Commit (or complete, if it has enough replicas) the last block of the
   * given under-construction file. Delegates to the block manager.
   */
  private void commitOrCompleteLastBlock(final INodeFile fileINode,
      final INodesInPath iip, final Block commitBlock) throws IOException {
    assert hasWriteLock();
    Preconditions.checkArgument(fileINode.isUnderConstruction());
    blockManager.commitOrCompleteLastBlock(fileINode, commitBlock, iip);
  }
4149
  /**
   * Turn an under-construction file into a finalized one: record the
   * modification against the latest snapshot, strip the under-construction
   * feature, release the lease, log the close to the edit log, and trigger a
   * replication check.
   *
   * @param src path of the file being finalized
   * @param pendingFile the file inode; must be under construction
   * @param latestSnapshot snapshot id to record the modification against
   * @throws IOException if the file is not under construction
   */
  private void finalizeINodeFileUnderConstruction(String src,
      INodeFile pendingFile, int latestSnapshot) throws IOException {
    assert hasWriteLock();

    FileUnderConstructionFeature uc = pendingFile.getFileUnderConstructionFeature();
    if (uc == null) {
      throw new IOException("Cannot finalize file " + src
          + " because it is not under construction");
    }
    
    pendingFile.recordModification(latestSnapshot);

    // The file is no longer pending.
    // Create permanent INode, update blocks. No need to replace the inode here
    // since we just remove the uc feature from pendingFile
    pendingFile.toCompleteFile(now());

    leaseManager.removeLease(uc.getClientName(), src);

    waitForLoadingFSImage();
    // close file and persist block allocations for this file
    closeFile(src, pendingFile);

    blockManager.checkReplication(pendingFile);
  }
4175
  /** Look up the stored block for {@code block} in the block manager. */
  @VisibleForTesting
  BlockInfoContiguous getStoredBlock(Block block) {
    return blockManager.getStoredBlock(block);
  }
4180  
4181  @Override
4182  public boolean isInSnapshot(BlockInfoContiguousUnderConstruction blockUC) {
4183    assert hasReadLock();
4184    final BlockCollection bc = blockUC.getBlockCollection();
4185    if (bc == null || !(bc instanceof INodeFile)
4186        || !bc.isUnderConstruction()) {
4187      return false;
4188    }
4189
4190    String fullName = bc.getName();
4191    try {
4192      if (fullName != null && fullName.startsWith(Path.SEPARATOR)
4193          && dir.getINode(fullName) == bc) {
4194        // If file exists in normal path then no need to look in snapshot
4195        return false;
4196      }
4197    } catch (UnresolvedLinkException e) {
4198      LOG.error("Error while resolving the link : " + fullName, e);
4199      return false;
4200    }
4201    /*
4202     * 1. if bc is under construction and also with snapshot, and
4203     * bc is not in the current fsdirectory tree, bc must represent a snapshot
4204     * file. 
4205     * 2. if fullName is not an absolute path, bc cannot be existent in the 
4206     * current fsdirectory tree. 
4207     * 3. if bc is not the current node associated with fullName, bc must be a
4208     * snapshot inode.
4209     */
4210    return true;
4211  }
4212
  /**
   * Commit the results of block recovery for {@code oldBlock}: update the
   * block's generation stamp and length, record the recovered replica
   * locations, and optionally remove the block ({@code deleteblock}) or
   * finalize and close the owning file ({@code closeFile}).
   *
   * @param oldBlock the block as known before recovery
   * @param newgenerationstamp must match the block's current recovery id
   * @param newlength length of the recovered block
   * @param closeFile whether to commit the last block and close the file
   * @param deleteblock whether to remove the last block instead of updating it
   * @param newtargets datanodes holding the recovered replicas
   * @param newtargetstorages storage IDs, parallel to {@code newtargets}
   * @throws IOException if the NN is in safe mode, the block is unknown, the
   *         owning file was deleted, or the recovery id does not match
   */
  void commitBlockSynchronization(ExtendedBlock oldBlock,
      long newgenerationstamp, long newlength,
      boolean closeFile, boolean deleteblock, DatanodeID[] newtargets,
      String[] newtargetstorages) throws IOException {
    LOG.info("commitBlockSynchronization(oldBlock=" + oldBlock
             + ", newgenerationstamp=" + newgenerationstamp
             + ", newlength=" + newlength
             + ", newtargets=" + Arrays.asList(newtargets)
             + ", closeFile=" + closeFile
             + ", deleteBlock=" + deleteblock
             + ")");
    checkOperation(OperationCategory.WRITE);
    final String src;
    waitForLoadingFSImage();
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      // If a DN tries to commit to the standby, the recovery will
      // fail, and the next retry will succeed on the new NN.
  
      checkNameNodeSafeMode(
          "Cannot commitBlockSynchronization while in safe mode");
      final BlockInfoContiguous storedBlock = getStoredBlock(
          ExtendedBlock.getLocalBlock(oldBlock));
      if (storedBlock == null) {
        if (deleteblock) {
          // This may be a retry attempt so ignore the failure
          // to locate the block.
          if (LOG.isDebugEnabled()) {
            LOG.debug("Block (=" + oldBlock + ") not found");
          }
          return;
        } else {
          throw new IOException("Block (=" + oldBlock + ") not found");
        }
      }
      // Remember pre-recovery values so stale replicas can be marked corrupt
      // below when the file is being closed.
      final long oldGenerationStamp = storedBlock.getGenerationStamp();
      final long oldNumBytes = storedBlock.getNumBytes();
      //
      // The implementation of delete operation (see @deleteInternal method)
      // first removes the file paths from namespace, and delays the removal
      // of blocks to later time for better performance. When
      // commitBlockSynchronization (this method) is called in between, the
      // blockCollection of storedBlock could have been assigned to null by
      // the delete operation, throw IOException here instead of NPE; if the
      // file path is already removed from namespace by the delete operation,
      // throw FileNotFoundException here, so not to proceed to the end of
      // this method to add a CloseOp to the edit log for an already deleted
      // file (See HDFS-6825).
      //
      BlockCollection blockCollection = storedBlock.getBlockCollection();
      if (blockCollection == null) {
        throw new IOException("The blockCollection of " + storedBlock
            + " is null, likely because the file owning this block was"
            + " deleted and the block removal is delayed");
      }
      INodeFile iFile = ((INode)blockCollection).asFile();
      src = iFile.getFullPathName();
      if (isFileDeleted(iFile)) {
        throw new FileNotFoundException("File not found: "
            + src + ", likely due to delayed block removal");
      }
      if ((!iFile.isUnderConstruction() || storedBlock.isComplete()) &&
          iFile.getLastBlock().isComplete()) {
        if (LOG.isDebugEnabled()) {
          LOG.debug("Unexpected block (=" + oldBlock
                    + ") since the file (=" + iFile.getLocalName()
                    + ") is not under construction");
        }
        return;
      }

      BlockInfoContiguousUnderConstruction truncatedBlock =
          (BlockInfoContiguousUnderConstruction) iFile.getLastBlock();
      long recoveryId = truncatedBlock.getBlockRecoveryId();
      // copy-on-truncate recovery targets a different block id than the one
      // that was recovered
      boolean copyTruncate =
          truncatedBlock.getBlockId() != storedBlock.getBlockId();
      if(recoveryId != newgenerationstamp) {
        throw new IOException("The recovery id " + newgenerationstamp
                              + " does not match current recovery id "
                              + recoveryId + " for block " + oldBlock);
      }

      if (deleteblock) {
        Block blockToDel = ExtendedBlock.getLocalBlock(oldBlock);
        boolean remove = iFile.removeLastBlock(blockToDel);
        if (remove) {
          blockManager.removeBlock(storedBlock);
        }
      }
      else {
        // update last block
        if(!copyTruncate) {
          storedBlock.setGenerationStamp(newgenerationstamp);
          storedBlock.setNumBytes(newlength);
        }

        // find the DatanodeDescriptor objects
        ArrayList<DatanodeDescriptor> trimmedTargets =
            new ArrayList<DatanodeDescriptor>(newtargets.length);
        ArrayList<String> trimmedStorages =
            new ArrayList<String>(newtargets.length);
        if (newtargets.length > 0) {
          for (int i = 0; i < newtargets.length; ++i) {
            // try to get targetNode
            DatanodeDescriptor targetNode =
                blockManager.getDatanodeManager().getDatanode(newtargets[i]);
            if (targetNode != null) {
              trimmedTargets.add(targetNode);
              trimmedStorages.add(newtargetstorages[i]);
            } else if (LOG.isDebugEnabled()) {
              LOG.debug("DatanodeDescriptor (=" + newtargets[i] + ") not found");
            }
          }
        }
        if ((closeFile) && !trimmedTargets.isEmpty()) {
          // the file is getting closed. Insert block locations into blockManager.
          // Otherwise fsck will report these blocks as MISSING, especially if the
          // blocksReceived from Datanodes take a long time to arrive.
          for (int i = 0; i < trimmedTargets.size(); i++) {
            DatanodeStorageInfo storageInfo =
                trimmedTargets.get(i).getStorageInfo(trimmedStorages.get(i));
            if (storageInfo != null) {
              if(copyTruncate) {
                storageInfo.addBlock(truncatedBlock);
              } else {
                storageInfo.addBlock(storedBlock);
              }
            }
          }
        }

        // add pipeline locations into the INodeUnderConstruction
        DatanodeStorageInfo[] trimmedStorageInfos =
            blockManager.getDatanodeManager().getDatanodeStorageInfos(
                trimmedTargets.toArray(new DatanodeID[trimmedTargets.size()]),
                trimmedStorages.toArray(new String[trimmedStorages.size()]),
                "src=%s, oldBlock=%s, newgenerationstamp=%d, newlength=%d",
                src, oldBlock, newgenerationstamp, newlength);

        if(copyTruncate) {
          iFile.setLastBlock(truncatedBlock, trimmedStorageInfos);
        } else {
          iFile.setLastBlock(storedBlock, trimmedStorageInfos);
          if (closeFile) {
            // Replicas still carrying the pre-recovery genstamp/length are
            // now stale; mark them corrupt.
            blockManager.markBlockReplicasAsCorrupt(storedBlock,
                oldGenerationStamp, oldNumBytes, trimmedStorageInfos);
          }
        }
      }

      if (closeFile) {
        if(copyTruncate) {
          closeFileCommitBlocks(src, iFile, truncatedBlock);
          if(!iFile.isBlockInLatestSnapshot(storedBlock)) {
            blockManager.removeBlock(storedBlock);
          }
        } else {
          closeFileCommitBlocks(src, iFile, storedBlock);
        }
      } else {
        // If this commit does not want to close the file, persist blocks
        persistBlocks(src, iFile, false);
      }
    } finally {
      writeUnlock("commitBlockSynchronization");
    }
    // Sync the edit log outside the write lock.
    getEditLog().logSync();
    if (closeFile) {
      LOG.info("commitBlockSynchronization(oldBlock=" + oldBlock
          + ", file=" + src
          + ", newgenerationstamp=" + newgenerationstamp
          + ", newlength=" + newlength
          + ", newtargets=" + Arrays.asList(newtargets) + ") successful");
    } else {
      LOG.info("commitBlockSynchronization(" + oldBlock + ") successful");
    }
  }
4391
4392  /**
4393   * @param pendingFile open file that needs to be closed
4394   * @param storedBlock last block
4395   * @throws IOException on error
4396   */
4397  @VisibleForTesting
4398  void closeFileCommitBlocks(String src, INodeFile pendingFile,
4399      BlockInfoContiguous storedBlock) throws IOException {
4400    final INodesInPath iip = INodesInPath.fromINode(pendingFile);
4401
4402    // commit the last block and complete it if it has minimum replicas
4403    commitOrCompleteLastBlock(pendingFile, iip, storedBlock);
4404
4405    //remove lease, close file
4406    finalizeINodeFileUnderConstruction(src, pendingFile,
4407        Snapshot.findLatestSnapshot(pendingFile, Snapshot.CURRENT_STATE_ID));
4408  }
4409
4410  /**
4411   * Renew the lease(s) held by the given client
4412   */
4413  void renewLease(String holder) throws IOException {
4414    checkOperation(OperationCategory.WRITE);
4415    readLock();
4416    try {
4417      checkOperation(OperationCategory.WRITE);
4418      checkNameNodeSafeMode("Cannot renew lease for " + holder);
4419      leaseManager.renewLease(holder);
4420    } finally {
4421      readUnlock("renewLease");
4422    }
4423  }
4424
4425  /**
4426   * Get a partial listing of the indicated directory
4427   *
4428   * @param src the directory name
4429   * @param startAfter the name to start after
4430   * @param needLocation if blockLocations need to be returned
4431   * @return a partial listing starting after startAfter
4432   * 
4433   * @throws AccessControlException if access is denied
4434   * @throws UnresolvedLinkException if symbolic link is encountered
4435   * @throws IOException if other I/O error occurred
4436   */
4437  DirectoryListing getListing(String src, byte[] startAfter,
4438      boolean needLocation) 
4439      throws IOException {
4440    checkOperation(OperationCategory.READ);
4441    final String operationName = "listStatus";
4442    DirectoryListing dl = null;
4443    readLock();
4444    try {
4445      checkOperation(NameNode.OperationCategory.READ);
4446      dl = FSDirStatAndListingOp.getListingInt(dir, src, startAfter,
4447          needLocation);
4448    } catch (AccessControlException e) {
4449      logAuditEvent(false, operationName, src);
4450      throw e;
4451    } finally {
4452      readUnlock(operationName);
4453    }
4454    logAuditEvent(true, operationName, src);
4455    return dl;
4456  }
4457
4458  /////////////////////////////////////////////////////////
4459  //
4460  // These methods are called by datanodes
4461  //
4462  /////////////////////////////////////////////////////////
4463  /**
4464   * Register Datanode.
4465   * <p>
4466   * The purpose of registration is to identify whether the new datanode
4467   * serves a new data storage, and will report new data block copies,
4468   * which the namenode was not aware of; or the datanode is a replacement
4469   * node for the data storage that was previously served by a different
4470   * or the same (in terms of host:port) datanode.
4471   * The data storages are distinguished by their storageIDs. When a new
4472   * data storage is reported the namenode issues a new unique storageID.
4473   * <p>
4474   * Finally, the namenode returns its namespaceID as the registrationID
4475   * for the datanodes. 
4476   * namespaceID is a persistent attribute of the name space.
4477   * The registrationID is checked every time the datanode is communicating
4478   * with the namenode. 
4479   * Datanodes with inappropriate registrationID are rejected.
4480   * If the namenode stops, and then restarts it can restore its 
4481   * namespaceID and will continue serving the datanodes that has previously
4482   * registered with the namenode without restarting the whole cluster.
4483   * 
4484   * @see org.apache.hadoop.hdfs.server.datanode.DataNode
4485   */
  void registerDatanode(DatanodeRegistration nodeReg) throws IOException {
    writeLock();
    try {
      getBlockManager().getDatanodeManager().registerDatanode(nodeReg);
      // Registration changes cluster membership, so re-evaluate safe mode.
      checkSafeMode();
    } finally {
      writeUnlock("registerDatanode");
    }
  }
4495  
4496  /**
4497   * Get registrationID for datanodes based on the namespaceID.
4498   * 
4499   * @see #registerDatanode(DatanodeRegistration)
4500   * @return registration ID
4501   */
4502  String getRegistrationID() {
4503    return Storage.getRegistrationID(getFSImage().getStorage());
4504  }
4505
4506  /**
4507   * The given node has reported in.  This method should:
4508   * 1) Record the heartbeat, so the datanode isn't timed out
4509   * 2) Adjust usage stats for future block allocation
4510   * 
4511   * If a substantial amount of time passed since the last datanode 
4512   * heartbeat then request an immediate block report.  
4513   * 
4514   * @return an array of datanode commands 
4515   * @throws IOException
4516   */
4517  HeartbeatResponse handleHeartbeat(DatanodeRegistration nodeReg,
4518      StorageReport[] reports, long cacheCapacity, long cacheUsed,
4519      int xceiverCount, int xmitsInProgress, int failedVolumes,
4520      VolumeFailureSummary volumeFailureSummary) throws IOException {
4521    readLock();
4522    try {
4523      //get datanode commands
4524      final int maxTransfer = blockManager.getMaxReplicationStreams()
4525          - xmitsInProgress;
4526      DatanodeCommand[] cmds = blockManager.getDatanodeManager().handleHeartbeat(
4527          nodeReg, reports, blockPoolId, cacheCapacity, cacheUsed,
4528          xceiverCount, maxTransfer, failedVolumes, volumeFailureSummary);
4529      
4530      //create ha status
4531      final NNHAStatusHeartbeat haState = new NNHAStatusHeartbeat(
4532          haContext.getState().getServiceState(),
4533          getFSImage().getCorrectLastAppliedOrWrittenTxId());
4534
4535      return new HeartbeatResponse(cmds, haState, rollingUpgradeInfo);
4536    } finally {
4537      readUnlock("handleHeartbeat");
4538    }
4539  }
4540
4541  /**
4542   * Returns whether or not there were available resources at the last check of
4543   * resources.
4544   *
4545   * @return true if there were sufficient resources available, false otherwise.
4546   */
4547  boolean nameNodeHasResourcesAvailable() {
4548    return hasResourcesAvailable;
4549  }
4550
4551  /**
4552   * Perform resource checks and cache the results.
4553   */
4554  void checkAvailableResources() {
4555    Preconditions.checkState(nnResourceChecker != null,
4556        "nnResourceChecker not initialized");
4557    hasResourcesAvailable = nnResourceChecker.hasAvailableDiskSpace();
4558  }
4559
4560  /**
4561   * Persist the block list for the inode.
4562   * @param path
4563   * @param file
4564   * @param logRetryCache
4565   */
4566  private void persistBlocks(String path, INodeFile file,
4567                             boolean logRetryCache) {
4568    assert hasWriteLock();
4569    Preconditions.checkArgument(file.isUnderConstruction());
4570    getEditLog().logUpdateBlocks(path, file, logRetryCache);
4571    NameNode.stateChangeLog.debug("persistBlocks: {} with {} blocks is" +
4572        " peristed to the file system", path, file.getBlocks().length);
4573  }
4574
4575  /**
4576   * Close file.
4577   * @param path
4578   * @param file
4579   */
4580  private void closeFile(String path, INodeFile file) {
4581    assert hasWriteLock();
4582    waitForLoadingFSImage();
4583    // file is closed
4584    getEditLog().logCloseFile(path, file);
4585    NameNode.stateChangeLog.debug("closeFile: {} with {} blocks is persisted" +
4586        " to the file system", path, file.getBlocks().length);
4587  }
4588
4589  /**
4590   * Periodically calls hasAvailableResources of NameNodeResourceChecker, and if
4591   * there are found to be insufficient resources available, causes the NN to
4592   * enter safe mode. If resources are later found to have returned to
4593   * acceptable levels, this daemon will cause the NN to exit safe mode.
4594   */
4595  class NameNodeResourceMonitor implements Runnable  {
4596    boolean shouldNNRmRun = true;
4597    @Override
4598    public void run () {
4599      try {
4600        while (fsRunning && shouldNNRmRun) {
4601          checkAvailableResources();
4602          if(!nameNodeHasResourcesAvailable()) {
4603            String lowResourcesMsg = "NameNode low on available disk space. ";
4604            if (!isInSafeMode()) {
4605              LOG.warn(lowResourcesMsg + "Entering safe mode.");
4606            } else {
4607              LOG.warn(lowResourcesMsg + "Already in safe mode.");
4608            }
4609            enterSafeMode(true);
4610          }
4611          try {
4612            Thread.sleep(resourceRecheckInterval);
4613          } catch (InterruptedException ie) {
4614            // Deliberately ignore
4615          }
4616        }
4617      } catch (Exception e) {
4618        FSNamesystem.LOG.error("Exception in NameNodeResourceMonitor: ", e);
4619      }
4620    }
4621
4622    public void stopMonitor() {
4623      shouldNNRmRun = false;
4624    }
4625 }
4626
4627  class NameNodeEditLogRoller implements Runnable {
4628
4629    private boolean shouldRun = true;
4630    private final long rollThreshold;
4631    private final long sleepIntervalMs;
4632
4633    public NameNodeEditLogRoller(long rollThreshold, int sleepIntervalMs) {
4634        this.rollThreshold = rollThreshold;
4635        this.sleepIntervalMs = sleepIntervalMs;
4636    }
4637
4638    @Override
4639    public void run() {
4640      while (fsRunning && shouldRun) {
4641        try {
4642          FSEditLog editLog = getFSImage().getEditLog();
4643          long numEdits =
4644              editLog.getLastWrittenTxId() - editLog.getCurSegmentTxId();
4645          if (numEdits > rollThreshold) {
4646            FSNamesystem.LOG.info("NameNode rolling its own edit log because"
4647                + " number of edits in open segment exceeds threshold of "
4648                + rollThreshold);
4649            rollEditLog();
4650          }
4651        } catch (Exception e) {
4652          FSNamesystem.LOG.error("Swallowing exception in "
4653              + NameNodeEditLogRoller.class.getSimpleName() + ":", e);
4654        }
4655        try {
4656          Thread.sleep(sleepIntervalMs);
4657        } catch (InterruptedException e) {
4658          FSNamesystem.LOG.info(NameNodeEditLogRoller.class.getSimpleName()
4659              + " was interrupted, exiting");
4660          break;
4661        }
4662      }
4663    }
4664
4665    public void stop() {
4666      shouldRun = false;
4667    }
4668  }
4669
4670  /**
4671   * Daemon to periodically scan the namespace for lazyPersist files
4672   * with missing blocks and unlink them.
4673   */
4674  class LazyPersistFileScrubber implements Runnable {
4675    private volatile boolean shouldRun = true;
4676    final int scrubIntervalSec;
4677    public LazyPersistFileScrubber(final int scrubIntervalSec) {
4678      this.scrubIntervalSec = scrubIntervalSec;
4679    }
4680
4681    /**
4682     * Periodically go over the list of lazyPersist files with missing
4683     * blocks and unlink them from the namespace.
4684     */
4685    private void clearCorruptLazyPersistFiles()
4686        throws IOException {
4687
4688      BlockStoragePolicy lpPolicy = blockManager.getStoragePolicy("LAZY_PERSIST");
4689
4690      List<BlockCollection> filesToDelete = new ArrayList<>();
4691      boolean changed = false;
4692      writeLock();
4693      try {
4694        final Iterator<Block> it = blockManager.getCorruptReplicaBlockIterator();
4695
4696        while (it.hasNext()) {
4697          Block b = it.next();
4698          BlockInfoContiguous blockInfo = blockManager.getStoredBlock(b);
4699          if (blockInfo == null) {
4700            LOG.info("Cannot find block info for block " + b);
4701          } else {
4702            if (blockInfo.getBlockCollection().getStoragePolicyID()
4703                == lpPolicy.getId()) {
4704              filesToDelete.add(blockInfo.getBlockCollection());
4705            }
4706          }
4707        }
4708
4709        for (BlockCollection bc : filesToDelete) {
4710          LOG.warn("Removing lazyPersist file " + bc.getName() + " with no replicas.");
4711          BlocksMapUpdateInfo toRemoveBlocks =
4712              FSDirDeleteOp.deleteInternal(
4713                  FSNamesystem.this, bc.getName(),
4714                  INodesInPath.fromINode((INodeFile) bc), false);
4715          changed |= toRemoveBlocks != null;
4716          if (toRemoveBlocks != null) {
4717            removeBlocks(toRemoveBlocks); // Incremental deletion of blocks
4718          }
4719        }
4720      } finally {
4721        writeUnlock("clearCorruptLazyPersistFiles");
4722      }
4723      if (changed) {
4724        getEditLog().logSync();
4725      }
4726    }
4727
4728    @Override
4729    public void run() {
4730      while (fsRunning && shouldRun) {
4731        try {
4732          clearCorruptLazyPersistFiles();
4733        } catch (Exception e) {
4734          FSNamesystem.LOG.error(
4735              "Ignoring exception in LazyPersistFileScrubber:", e);
4736        }
4737
4738        try {
4739          Thread.sleep(scrubIntervalSec * 1000);
4740        } catch (InterruptedException e) {
4741          FSNamesystem.LOG.info(
4742              "LazyPersistFileScrubber was interrupted, exiting");
4743          break;
4744        }
4745      }
4746    }
4747
4748    public void stop() {
4749      shouldRun = false;
4750    }
4751  }
4752
  /** @return the FSImage backing this namesystem. */
  public FSImage getFSImage() {
    return fsImage;
  }
4756
  /** @return the edit log of the backing FSImage. */
  public FSEditLog getEditLog() {
    return getFSImage().getEditLog();
  }
4760
4761  private void checkBlock(ExtendedBlock block) throws IOException {
4762    if (block != null && !this.blockPoolId.equals(block.getBlockPoolId())) {
4763      throw new IOException("Unexpected BlockPoolId " + block.getBlockPoolId()
4764          + " - expected " + blockPoolId);
4765    }
4766  }
4767
  /** @return number of blocks with no remaining replicas (lock-free read). */
  @Metric({"MissingBlocks", "Number of missing blocks"})
  public long getMissingBlocksCount() {
    // not locking
    return blockManager.getMissingBlocksCount();
  }
4773
  /** @return number of missing blocks whose replication factor is 1 (lock-free read). */
  @Metric({"MissingReplOneBlocks", "Number of missing blocks " +
      "with replication factor 1"})
  public long getMissingReplOneBlocksCount() {
    // not locking
    return blockManager.getMissingReplOneBlocksCount();
  }
4780  
  /** @return number of datanode heartbeats that have expired. */
  @Metric({"ExpiredHeartbeats", "Number of expired heartbeats"})
  public int getExpiredHeartbeats() {
    return datanodeStatistics.getExpiredHeartbeats();
  }
4785  
  /** @return transactions written since the most recent checkpoint. */
  @Metric({"TransactionsSinceLastCheckpoint",
      "Number of transactions since last checkpoint"})
  public long getTransactionsSinceLastCheckpoint() {
    // Lock-free reads: this is a metric, approximate values are acceptable.
    return getEditLog().getLastWrittenTxIdWithoutLock() -
        getFSImage().getStorage().getMostRecentCheckpointTxId();
  }
4792  
4793  @Metric({"TransactionsSinceLastLogRoll",
4794      "Number of transactions since last edit log roll"})
4795  public long getTransactionsSinceLastLogRoll() {
4796    if (isInStandbyState() || !getEditLog().isSegmentOpenWithoutLock()) {
4797      return 0;
4798    } else {
4799      return getEditLog().getLastWrittenTxIdWithoutLock() -
4800          getEditLog().getCurSegmentTxIdWithoutLock() + 1;
4801    }
4802  }
4803
  /** @return the last transaction id written to the edit log (lock-free). */
  @Metric({"LastWrittenTransactionId", "Transaction ID written to the edit log"})
  public long getLastWrittenTransactionId() {
    return getEditLog().getLastWrittenTxIdWithoutLock();
  }
4808  
  /** @return epoch millis of the most recent checkpoint. */
  @Metric({"LastCheckpointTime",
      "Time in milliseconds since the epoch of the last checkpoint"})
  public long getLastCheckpointTime() {
    return getFSImage().getStorage().getMostRecentCheckpointTime();
  }
4814
4815  /** @see ClientProtocol#getStats() */
4816  long[] getStats() {
4817    final long[] stats = datanodeStatistics.getStats();
4818    stats[ClientProtocol.GET_STATS_UNDER_REPLICATED_IDX] = getUnderReplicatedBlocks();
4819    stats[ClientProtocol.GET_STATS_CORRUPT_BLOCKS_IDX] = getCorruptReplicaBlocks();
4820    stats[ClientProtocol.GET_STATS_MISSING_BLOCKS_IDX] = getMissingBlocksCount();
4821    stats[ClientProtocol.GET_STATS_MISSING_REPL_ONE_BLOCKS_IDX] =
4822        getMissingReplOneBlocksCount();
4823    return stats;
4824  }
4825
  /** @return total raw capacity across all datanodes, in bytes. */
  @Override // FSNamesystemMBean
  @Metric({"CapacityTotal",
      "Total raw capacity of data nodes in bytes"})
  public long getCapacityTotal() {
    return datanodeStatistics.getCapacityTotal();
  }
4832
  /** @return total raw capacity across all datanodes, rounded to GB. */
  @Metric({"CapacityTotalGB",
      "Total raw capacity of data nodes in GB"})
  public float getCapacityTotalGB() {
    return DFSUtil.roundBytesToGB(getCapacityTotal());
  }
4838
  /** @return total DFS-used capacity across all datanodes, in bytes. */
  @Override // FSNamesystemMBean
  @Metric({"CapacityUsed",
      "Total used capacity across all data nodes in bytes"})
  public long getCapacityUsed() {
    return datanodeStatistics.getCapacityUsed();
  }
4845
  /** @return total DFS-used capacity across all datanodes, rounded to GB. */
  @Metric({"CapacityUsedGB",
      "Total used capacity across all data nodes in GB"})
  public float getCapacityUsedGB() {
    return DFSUtil.roundBytesToGB(getCapacityUsed());
  }
4851
  /** @return remaining capacity across all datanodes, in bytes. */
  @Override // FSNamesystemMBean
  @Metric({"CapacityRemaining", "Remaining capacity in bytes"})
  public long getCapacityRemaining() {
    return datanodeStatistics.getCapacityRemaining();
  }
4857
  /** @return remaining capacity across all datanodes, rounded to GB. */
  @Metric({"CapacityRemainingGB", "Remaining capacity in GB"})
  public float getCapacityRemainingGB() {
    return DFSUtil.roundBytesToGB(getCapacityRemaining());
  }
4862
  /** @return space used on datanodes for non-DFS purposes, in bytes. */
  @Metric({"CapacityUsedNonDFS",
      "Total space used by data nodes for non DFS purposes in bytes"})
  public long getCapacityUsedNonDFS() {
    return datanodeStatistics.getCapacityUsedNonDFS();
  }
4868
4869  /**
4870   * Total number of connections.
4871   */
4872  @Override // FSNamesystemMBean
4873  @Metric
4874  public int getTotalLoad() {
4875    return datanodeStatistics.getXceiverCount();
4876  }
4877  
  /** @return number of directories marked snapshottable. */
  @Metric({ "SnapshottableDirectories", "Number of snapshottable directories" })
  public int getNumSnapshottableDirs() {
    return this.snapshotManager.getNumSnapshottableDirs();
  }
4882
  /** @return total number of snapshots in the namespace. */
  @Metric({ "Snapshots", "The number of snapshots" })
  public int getNumSnapshots() {
    return this.snapshotManager.getNumSnapshots();
  }
4887
4888  @Override
4889  public String getSnapshotStats() {
4890    Map<String, Object> info = new HashMap<String, Object>();
4891    info.put("SnapshottableDirectories", this.getNumSnapshottableDirs());
4892    info.put("Snapshots", this.getNumSnapshots());
4893    return JSON.toString(info);
4894  }
4895
  /** @return number of encryption zones in the namespace. */
  @Override // FSNamesystemMBean
  @Metric({ "NumEncryptionZones", "The number of encryption zones" })
  public int getNumEncryptionZones() {
    return dir.ezManager.getNumEncryptionZones();
  }
4901
4902  /**
4903   * Returns the length of the wait Queue for the FSNameSystemLock.
4904   *
4905   * A larger number here indicates lots of threads are waiting for
4906   * FSNameSystemLock.
4907   *
4908   * @return int - Number of Threads waiting to acquire FSNameSystemLock
4909   */
4910  @Override
4911  @Metric({"LockQueueLength", "Number of threads waiting to " +
4912      "acquire FSNameSystemLock"})
4913  public int getFsLockQueueLength() {
4914    return fsLock.getQueueLength();
4915  }
4916
  /**
   * @param type which subset of datanodes to count (e.g. LIVE, DEAD, ALL)
   * @return number of datanodes matching the given report type
   */
  int getNumberOfDatanodes(DatanodeReportType type) {
    readLock();
    try {
      return getBlockManager().getDatanodeManager().getDatanodeListForReport(
          type).size(); 
    } finally {
      readUnlock("getNumberOfDatanodes");
    }
  }
4926
4927  DatanodeInfo[] datanodeReport(final DatanodeReportType type
4928      ) throws AccessControlException, StandbyException {
4929    checkSuperuserPrivilege();
4930    checkOperation(OperationCategory.UNCHECKED);
4931    readLock();
4932    try {
4933      checkOperation(OperationCategory.UNCHECKED);
4934      final DatanodeManager dm = getBlockManager().getDatanodeManager();      
4935      final List<DatanodeDescriptor> results = dm.getDatanodeListForReport(type);
4936
4937      DatanodeInfo[] arr = new DatanodeInfo[results.size()];
4938      for (int i=0; i<arr.length; i++) {
4939        arr[i] = new DatanodeInfo(results.get(i));
4940      }
4941      return arr;
4942    } finally {
4943      readUnlock("datanodeReport");
4944    }
4945  }
4946
4947  DatanodeStorageReport[] getDatanodeStorageReport(final DatanodeReportType type
4948      ) throws AccessControlException, StandbyException {
4949    checkSuperuserPrivilege();
4950    checkOperation(OperationCategory.UNCHECKED);
4951    readLock();
4952    try {
4953      checkOperation(OperationCategory.UNCHECKED);
4954      final DatanodeManager dm = getBlockManager().getDatanodeManager();      
4955      final List<DatanodeDescriptor> datanodes = dm.getDatanodeListForReport(type);
4956
4957      DatanodeStorageReport[] reports = new DatanodeStorageReport[datanodes.size()];
4958      for (int i = 0; i < reports.length; i++) {
4959        final DatanodeDescriptor d = datanodes.get(i);
4960        reports[i] = new DatanodeStorageReport(new DatanodeInfo(d),
4961            d.getStorageReports());
4962      }
4963      return reports;
4964    } finally {
4965      readUnlock("getDatanodeStorageReport");
4966    }
4967  }
4968
4969  /**
4970   * Save namespace image.
4971   * This will save current namespace into fsimage file and empty edits file.
4972   * Requires superuser privilege and safe mode.
4973   * 
4974   * @throws AccessControlException if superuser privilege is violated.
4975   * @throws IOException if 
4976   */
4977  void saveNamespace() throws AccessControlException, IOException {
4978    checkOperation(OperationCategory.UNCHECKED);
4979    checkSuperuserPrivilege();
4980
4981    cpLock();  // Block if a checkpointing is in progress on standby.
4982    readLock();
4983    try {
4984      checkOperation(OperationCategory.UNCHECKED);
4985
4986      if (!isInSafeMode()) {
4987        throw new IOException("Safe mode should be turned ON "
4988            + "in order to create namespace image.");
4989      }
4990      getFSImage().saveNamespace(this);
4991    } finally {
4992      readUnlock("saveNamespace");
4993      cpUnlock();
4994    }
4995    LOG.info("New namespace image has been created");
4996  }
4997  
4998  /**
4999   * Enables/Disables/Checks restoring failed storage replicas if the storage becomes available again.
5000   * Requires superuser privilege.
5001   * 
5002   * @throws AccessControlException if superuser privilege is violated.
5003   */
5004  boolean restoreFailedStorage(String arg) throws AccessControlException,
5005      StandbyException {
5006    checkSuperuserPrivilege();
5007    checkOperation(OperationCategory.UNCHECKED);
5008    cpLock();  // Block if a checkpointing is in progress on standby.
5009    writeLock();
5010    try {
5011      checkOperation(OperationCategory.UNCHECKED);
5012      
5013      // if it is disabled - enable it and vice versa.
5014      if(arg.equals("check"))
5015        return getFSImage().getStorage().getRestoreFailedStorage();
5016      
5017      boolean val = arg.equals("true");  // false if not
5018      getFSImage().getStorage().setRestoreFailedStorage(val);
5019      
5020      return val;
5021    } finally {
5022      writeUnlock("restoreFailedStorage");
5023      cpUnlock();
5024    }
5025  }
5026
  /** @return the time this namesystem started, as a new Date instance. */
  Date getStartTime() {
    return new Date(startTime); 
  }
5030    
  /**
   * Finalize a pending cluster upgrade, discarding the pre-upgrade state.
   * Requires superuser privilege.
   */
  void finalizeUpgrade() throws IOException {
    checkSuperuserPrivilege();
    checkOperation(OperationCategory.UNCHECKED);
    cpLock();  // Block if a checkpointing is in progress on standby.
    writeLock();
    try {
      checkOperation(OperationCategory.UNCHECKED);
      getFSImage().finalizeUpgrade(this.isHaEnabled() && inActiveState());
    } finally {
      writeUnlock("finalizeUpgrade");
      cpUnlock();
    }
  }
5044
  /**
   * Re-read the include/exclude host lists from a freshly loaded
   * configuration. Requires superuser privilege.
   */
  void refreshNodes() throws IOException {
    checkOperation(OperationCategory.UNCHECKED);
    checkSuperuserPrivilege();
    getBlockManager().getDatanodeManager().refreshNodes(new HdfsConfiguration());
  }
5050
  /**
   * Push a new balancer bandwidth limit (bytes/sec) to all datanodes.
   * Requires superuser privilege.
   */
  void setBalancerBandwidth(long bandwidth) throws IOException {
    checkOperation(OperationCategory.UNCHECKED);
    checkSuperuserPrivilege();
    getBlockManager().getDatanodeManager().setBalancerBandwidth(bandwidth);
  }
5056
5057  /**
5058   * Persist the new block (the last block of the given file).
5059   * @param path
5060   * @param file
5061   */
5062  private void persistNewBlock(String path, INodeFile file) {
5063    Preconditions.checkArgument(file.isUnderConstruction());
5064    getEditLog().logAddBlock(path, file);
5065    NameNode.stateChangeLog.debug("persistNewBlock: {} with new block {}," +
5066        " current total block count is {}", path,
5067        file.getLastBlock().toString(), file.getBlocks().length);
5068  }
5069
5070  /**
5071   * SafeModeInfo contains information related to the safe mode.
5072   * <p>
5073   * An instance of {@link SafeModeInfo} is created when the name node
5074   * enters safe mode.
5075   * <p>
5076   * During name node startup {@link SafeModeInfo} counts the number of
5077   * <em>safe blocks</em>, those that have at least the minimal number of
5078   * replicas, and calculates the ratio of safe blocks to the total number
5079   * of blocks in the system, which is the size of blocks in
5080   * {@link FSNamesystem#blockManager}. When the ratio reaches the
5081   * {@link #threshold} it starts the SafeModeMonitor daemon in order
5082   * to monitor whether the safe mode {@link #extension} is passed.
5083   * Then it leaves safe mode and destroys itself.
5084   * <p>
5085   * If safe mode is turned on manually then the number of safe blocks is
5086   * not tracked because the name node is not intended to leave safe mode
5087   * automatically in the case.
5088   *
5089   * @see ClientProtocol#setSafeMode(HdfsConstants.SafeModeAction, boolean)
5090   */
5091  public class SafeModeInfo {
5092    // configuration fields
5093    /** Safe mode threshold condition %.*/
5094    private final double threshold;
5095    /** Safe mode minimum number of datanodes alive */
5096    private final int datanodeThreshold;
5097    /**
5098     * Safe mode extension after the threshold.
5099     * Make it volatile so that getSafeModeTip can read the latest value
5100     * without taking a lock.
5101     */
5102    private volatile int extension;
5103    /** Min replication required by safe mode. */
5104    private final int safeReplication;
5105    /** threshold for populating needed replication queues */
5106    private final double replQueueThreshold;
5107    // internal fields
5108    /** Time when threshold was reached.
5109     * <br> -1 safe mode is off
5110     * <br> 0 safe mode is on, and threshold is not reached yet
5111     * <br> >0 safe mode is on, but we are in extension period 
5112     */
5113    private long reached = -1;  
5114    private long reachedTimestamp = -1;
5115    /** Total number of blocks. */
5116    int blockTotal; 
5117    /** Number of safe blocks. */
5118    int blockSafe;
5119    /** Number of blocks needed to satisfy safe mode threshold condition */
5120    private int blockThreshold;
5121    /** Number of blocks needed before populating replication queues */
5122    private int blockReplQueueThreshold;
5123    /** time of the last status printout */
5124    private long lastStatusReport = 0;
5125    /**
5126     * Was safemode entered automatically because available resources were low.
5127     * Make it volatile so that getSafeModeTip can read the latest value
5128     * without taking a lock.
5129     */
5130    private volatile boolean resourcesLow = false;
5131    /** Should safemode adjust its block totals as blocks come in */
5132    private boolean shouldIncrementallyTrackBlocks = false;
5133    /** counter for tracking startup progress of reported blocks */
5134    private Counter awaitingReportedBlocksCounter;
5135    
5136    /**
5137     * Creates SafeModeInfo when the name node enters
5138     * automatic safe mode at startup.
5139     *  
5140     * @param conf configuration
5141     */
5142    private SafeModeInfo(Configuration conf) {
5143      this.threshold = conf.getFloat(DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_KEY,
5144          DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_DEFAULT);
5145      if(threshold > 1.0) {
5146        LOG.warn("The threshold value should't be greater than 1, threshold: " + threshold);
5147      }
5148      this.datanodeThreshold = conf.getInt(
5149        DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY,
5150        DFS_NAMENODE_SAFEMODE_MIN_DATANODES_DEFAULT);
5151      this.extension = conf.getInt(DFS_NAMENODE_SAFEMODE_EXTENSION_KEY, 0);
5152      this.safeReplication = conf.getInt(DFS_NAMENODE_REPLICATION_MIN_KEY, 
5153                                         DFS_NAMENODE_REPLICATION_MIN_DEFAULT);
5154      
5155      LOG.info(DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_KEY + " = " + threshold);
5156      LOG.info(DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY + " = " + datanodeThreshold);
5157      LOG.info(DFS_NAMENODE_SAFEMODE_EXTENSION_KEY + "     = " + extension);
5158
5159      // default to safe mode threshold (i.e., don't populate queues before leaving safe mode)
5160      this.replQueueThreshold = 
5161        conf.getFloat(DFS_NAMENODE_REPL_QUEUE_THRESHOLD_PCT_KEY,
5162                      (float) threshold);
5163      this.blockTotal = 0; 
5164      this.blockSafe = 0;
5165    }
5166
5167    /**
5168     * In the HA case, the StandbyNode can be in safemode while the namespace
5169     * is modified by the edit log tailer. In this case, the number of total
5170     * blocks changes as edits are processed (eg blocks are added and deleted).
5171     * However, we don't want to do the incremental tracking during the
5172     * startup-time loading process -- only once the initial total has been
5173     * set after the image has been loaded.
5174     */
5175    private boolean shouldIncrementallyTrackBlocks() {
5176      return shouldIncrementallyTrackBlocks;
5177    }
5178
5179    /**
5180     * Creates SafeModeInfo when safe mode is entered manually, or because
5181     * available resources are low.
5182     *
5183     * The {@link #threshold} is set to 1.5 so that it could never be reached.
5184     * {@link #blockTotal} is set to -1 to indicate that safe mode is manual.
5185     * 
5186     * @see SafeModeInfo
5187     */
5188    private SafeModeInfo(boolean resourcesLow) {
5189      this.threshold = 1.5f;  // this threshold can never be reached
5190      this.datanodeThreshold = Integer.MAX_VALUE;
5191      this.extension = Integer.MAX_VALUE;
5192      this.safeReplication = Short.MAX_VALUE + 1; // more than maxReplication
5193      this.replQueueThreshold = 1.5f; // can never be reached
5194      this.blockTotal = -1;
5195      this.blockSafe = -1;
5196      this.resourcesLow = resourcesLow;
5197      enter();
5198      reportStatus("STATE* Safe mode is ON.", true);
5199    }
5200      
5201    /**
5202     * Check if safe mode is on.
5203     * @return true if in safe mode
5204     */
5205    private synchronized boolean isOn() {
5206      doConsistencyCheck();
5207      return this.reached >= 0;
5208    }
5209      
5210    /**
5211     * Enter safe mode.
5212     */
5213    private void enter() {
5214      this.reached = 0;
5215      this.reachedTimestamp = 0;
5216    }
5217      
5218    /**
5219     * Leave safe mode.
5220     * <p>
5221     * Check for invalid, under- & over-replicated blocks in the end of startup.
5222     */
5223    private synchronized void leave() {
5224      // if not done yet, initialize replication queues.
5225      // In the standby, do not populate repl queues
5226      if (!isPopulatingReplQueues() && shouldPopulateReplQueues()) {
5227        initializeReplQueues();
5228      }
5229      long timeInSafemode = now() - startTime;
5230      NameNode.stateChangeLog.info("STATE* Leaving safe mode after " 
5231                                    + timeInSafemode/1000 + " secs");
5232      NameNode.getNameNodeMetrics().setSafeModeTime((int) timeInSafemode);
5233
5234      //Log the following only once (when transitioning from ON -> OFF)
5235      if (reached >= 0) {
5236        NameNode.stateChangeLog.info("STATE* Safe mode is OFF"); 
5237      }
5238      reached = -1;
5239      reachedTimestamp = -1;
5240      safeMode = null;
5241      final NetworkTopology nt = blockManager.getDatanodeManager().getNetworkTopology();
5242      NameNode.stateChangeLog.info("STATE* Network topology has "
5243          + nt.getNumOfRacks() + " racks and "
5244          + nt.getNumOfLeaves() + " datanodes");
5245      NameNode.stateChangeLog.info("STATE* UnderReplicatedBlocks has "
5246          + blockManager.numOfUnderReplicatedBlocks() + " blocks");
5247
5248      startSecretManagerIfNecessary();
5249
5250      // If startup has not yet completed, end safemode phase.
5251      StartupProgress prog = NameNode.getStartupProgress();
5252      if (prog.getStatus(Phase.SAFEMODE) != Status.COMPLETE) {
5253        prog.endStep(Phase.SAFEMODE, STEP_AWAITING_REPORTED_BLOCKS);
5254        prog.endPhase(Phase.SAFEMODE);
5255      }
5256    }
5257
5258    /**
5259     * Check whether we have reached the threshold for 
5260     * initializing replication queues.
5261     */
5262    private synchronized boolean canInitializeReplQueues() {
5263      return shouldPopulateReplQueues()
5264          && blockSafe >= blockReplQueueThreshold;
5265    }
5266      
5267    /** 
5268     * Safe mode can be turned off iff 
5269     * the threshold is reached and 
5270     * the extension time have passed.
5271     * @return true if can leave or false otherwise.
5272     */
5273    private synchronized boolean canLeave() {
5274      if (reached == 0) {
5275        return false;
5276      }
5277
5278      if (monotonicNow() - reached < extension) {
5279        reportStatus("STATE* Safe mode ON, in safe mode extension.", false);
5280        return false;
5281      }
5282
5283      if (needEnter()) {
5284        reportStatus("STATE* Safe mode ON, thresholds not met.", false);
5285        return false;
5286      }
5287
5288      return true;
5289    }
5290      
5291    /** 
5292     * There is no need to enter safe mode 
5293     * if DFS is empty or {@link #threshold} == 0
5294     */
5295    private boolean needEnter() {
5296      return (threshold != 0 && blockSafe < blockThreshold) ||
5297        (datanodeThreshold != 0 && getNumLiveDataNodes() < datanodeThreshold) ||
5298        (!nameNodeHasResourcesAvailable());
5299    }
5300      
5301    /**
5302     * Check and trigger safe mode if needed. 
5303     */
5304    private void checkMode() {
5305      // Have to have write-lock since leaving safemode initializes
5306      // repl queues, which requires write lock
5307      assert hasWriteLock();
5308      if (inTransitionToActive()) {
5309        return;
5310      }
5311      // if smmthread is already running, the block threshold must have been 
5312      // reached before, there is no need to enter the safe mode again
5313      if (smmthread == null && needEnter()) {
5314        enter();
5315        // check if we are ready to initialize replication queues
5316        if (canInitializeReplQueues() && !isPopulatingReplQueues()
5317            && !haEnabled) {
5318          initializeReplQueues();
5319        }
5320        reportStatus("STATE* Safe mode ON.", false);
5321        return;
5322      }
5323      // the threshold is reached or was reached before
5324      if (!isOn() ||                           // safe mode is off
5325          extension <= 0 || threshold <= 0) {  // don't need to wait
5326        this.leave(); // leave safe mode
5327        return;
5328      }
5329      if (reached > 0) {  // threshold has already been reached before
5330        reportStatus("STATE* Safe mode ON.", false);
5331        return;
5332      }
5333      // start monitor
5334      reached = monotonicNow();
5335      reachedTimestamp = now();
5336      if (smmthread == null) {
5337        smmthread = new Daemon(new SafeModeMonitor());
5338        smmthread.start();
5339        reportStatus("STATE* Safe mode extension entered.", true);
5340      }
5341
5342      // check if we are ready to initialize replication queues
5343      if (canInitializeReplQueues() && !isPopulatingReplQueues() && !haEnabled) {
5344        initializeReplQueues();
5345      }
5346    }
5347      
5348    /**
5349     * Set total number of blocks.
5350     */
5351    private synchronized void setBlockTotal(int total) {
5352      this.blockTotal = total;
5353      this.blockThreshold = (int) (blockTotal * threshold);
5354      this.blockReplQueueThreshold = 
5355        (int) (blockTotal * replQueueThreshold);
5356      if (haEnabled) {
5357        // After we initialize the block count, any further namespace
5358        // modifications done while in safe mode need to keep track
5359        // of the number of total blocks in the system.
5360        this.shouldIncrementallyTrackBlocks = true;
5361      }
5362      if(blockSafe < 0)
5363        this.blockSafe = 0;
5364      checkMode();
5365    }
5366      
5367    /**
5368     * Increment number of safe blocks if current block has 
5369     * reached minimal replication.
5370     * @param replication current replication 
5371     */
5372    private synchronized void incrementSafeBlockCount(short replication) {
5373      if (replication == safeReplication) {
5374        this.blockSafe++;
5375
5376        // Report startup progress only if we haven't completed startup yet.
5377        StartupProgress prog = NameNode.getStartupProgress();
5378        if (prog.getStatus(Phase.SAFEMODE) != Status.COMPLETE) {
5379          if (this.awaitingReportedBlocksCounter == null) {
5380            this.awaitingReportedBlocksCounter = prog.getCounter(Phase.SAFEMODE,
5381              STEP_AWAITING_REPORTED_BLOCKS);
5382          }
5383          this.awaitingReportedBlocksCounter.increment();
5384        }
5385
5386        checkMode();
5387      }
5388    }
5389      
5390    /**
5391     * Decrement number of safe blocks if current block has 
5392     * fallen below minimal replication.
5393     * @param replication current replication 
5394     */
5395    private synchronized void decrementSafeBlockCount(short replication) {
5396      if (replication == safeReplication-1) {
5397        this.blockSafe--;
5398        //blockSafe is set to -1 in manual / low resources safemode
5399        assert blockSafe >= 0 || isManual() || areResourcesLow();
5400        checkMode();
5401      }
5402    }
5403
5404    /**
5405     * Check if safe mode was entered manually
5406     */
5407    private boolean isManual() {
5408      return extension == Integer.MAX_VALUE;
5409    }
5410
5411    /**
5412     * Set manual safe mode.
5413     */
5414    private synchronized void setManual() {
5415      extension = Integer.MAX_VALUE;
5416    }
5417
5418    /**
5419     * Check if safe mode was entered due to resources being low.
5420     */
5421    private boolean areResourcesLow() {
5422      return resourcesLow;
5423    }
5424
5425    /**
5426     * Set that resources are low for this instance of safe mode.
5427     */
5428    private void setResourcesLow() {
5429      resourcesLow = true;
5430    }
5431
5432    /**
5433     * A tip on how safe mode is to be turned off: manually or automatically.
5434     */
5435    String getTurnOffTip() {
5436      if(!isOn()) {
5437        return "Safe mode is OFF.";
5438      }
5439
5440      //Manual OR low-resource safemode. (Admin intervention required)
5441      String adminMsg = "It was turned on manually. ";
5442      if (areResourcesLow()) {
5443        adminMsg = "Resources are low on NN. Please add or free up more "
5444          + "resources then turn off safe mode manually. NOTE:  If you turn off"
5445          + " safe mode before adding resources, "
5446          + "the NN will immediately return to safe mode. ";
5447      }
5448      if (isManual() || areResourcesLow()) {
5449        return adminMsg
5450          + "Use \"hdfs dfsadmin -safemode leave\" to turn safe mode off.";
5451      }
5452
5453      boolean thresholdsMet = true;
5454      int numLive = getNumLiveDataNodes();
5455      String msg = "";
5456      if (blockSafe < blockThreshold) {
5457        msg += String.format(
5458          "The reported blocks %d needs additional %d"
5459          + " blocks to reach the threshold %.4f of total blocks %d.%n",
5460          blockSafe, (blockThreshold - blockSafe) + 1, threshold, blockTotal);
5461        thresholdsMet = false;
5462      } else {
5463        msg += String.format("The reported blocks %d has reached the threshold"
5464            + " %.4f of total blocks %d. ", blockSafe, threshold, blockTotal);
5465      }
5466      if (numLive < datanodeThreshold) {
5467        msg += String.format(
5468          "The number of live datanodes %d needs an additional %d live "
5469          + "datanodes to reach the minimum number %d.%n",
5470          numLive, (datanodeThreshold - numLive), datanodeThreshold);
5471        thresholdsMet = false;
5472      } else {
5473        msg += String.format("The number of live datanodes %d has reached "
5474            + "the minimum number %d. ",
5475            numLive, datanodeThreshold);
5476      }
5477      msg += (reached > 0) ? "In safe mode extension. " : "";
5478      msg += "Safe mode will be turned off automatically ";
5479
5480      if (!thresholdsMet) {
5481        msg += "once the thresholds have been reached.";
5482      } else if (reached + extension - monotonicNow() > 0) {
5483        msg += ("in " + (reached + extension - monotonicNow()) / 1000 + " seconds.");
5484      } else {
5485        msg += "soon.";
5486      }
5487
5488      return msg;
5489    }
5490
5491    /**
5492     * Print status every 20 seconds.
5493     */
5494    private void reportStatus(String msg, boolean rightNow) {
5495      long curTime = now();
5496      if(!rightNow && (curTime - lastStatusReport < 20 * 1000))
5497        return;
5498      NameNode.stateChangeLog.info(msg + " \n" + getTurnOffTip());
5499      lastStatusReport = curTime;
5500    }
5501
5502    @Override
5503    public String toString() {
5504      String resText = "Current safe blocks = " 
5505        + blockSafe 
5506        + ". Target blocks = " + blockThreshold + " for threshold = %" + threshold
5507        + ". Minimal replication = " + safeReplication + ".";
5508      if (reached > 0) 
5509        resText += " Threshold was reached " + new Date(reachedTimestamp) + ".";
5510      return resText;
5511    }
5512      
5513    /**
5514     * Checks consistency of the class state.
5515     * This is costly so only runs if asserts are enabled.
5516     */
5517    private void doConsistencyCheck() {
5518      boolean assertsOn = false;
5519      assert assertsOn = true; // set to true if asserts are on
5520      if (!assertsOn) return;
5521      
5522      if (blockTotal == -1 && blockSafe == -1) {
5523        return; // manual safe mode
5524      }
5525      int activeBlocks = blockManager.getActiveBlockCount();
5526      if ((blockTotal != activeBlocks) &&
5527          !(blockSafe >= 0 && blockSafe <= blockTotal)) {
5528        throw new AssertionError(
5529            " SafeMode: Inconsistent filesystem state: "
5530        + "SafeMode data: blockTotal=" + blockTotal
5531        + " blockSafe=" + blockSafe + "; "
5532        + "BlockManager data: active="  + activeBlocks);
5533      }
5534    }
5535
5536    private synchronized void adjustBlockTotals(int deltaSafe, int deltaTotal) {
5537      if (!shouldIncrementallyTrackBlocks) {
5538        return;
5539      }
5540      assert haEnabled;
5541      
5542      if (LOG.isDebugEnabled()) {
5543        LOG.debug("Adjusting block totals from " +
5544            blockSafe + "/" + blockTotal + " to " +
5545            (blockSafe + deltaSafe) + "/" + (blockTotal + deltaTotal));
5546      }
5547      assert blockSafe + deltaSafe >= 0 : "Can't reduce blockSafe " +
5548        blockSafe + " by " + deltaSafe + ": would be negative";
5549      assert blockTotal + deltaTotal >= 0 : "Can't reduce blockTotal " +
5550        blockTotal + " by " + deltaTotal + ": would be negative";
5551      
5552      blockSafe += deltaSafe;
5553      setBlockTotal(blockTotal + deltaTotal);
5554    }
5555  }
5556    
5557  /**
5558   * Periodically check whether it is time to leave safe mode.
5559   * This thread starts when the threshold level is reached.
5560   *
5561   */
5562  class SafeModeMonitor implements Runnable {
5563    /** interval in msec for checking safe mode: {@value} */
5564    private static final long recheckInterval = 1000;
5565      
5566    /**
5567     */
5568    @Override
5569    public void run() {
5570      while (fsRunning) {
5571        writeLock();
5572        try {
5573          if (safeMode == null) { // Not in safe mode.
5574            break;
5575          }
5576          if (safeMode.canLeave()) {
5577            // Leave safe mode.
5578            safeMode.leave();
5579            smmthread = null;
5580            break;
5581          }
5582        } finally {
5583          writeUnlock();
5584        }
5585
5586        try {
5587          Thread.sleep(recheckInterval);
5588        } catch (InterruptedException ie) {
5589          // Ignored
5590        }
5591      }
5592      if (!fsRunning) {
5593        LOG.info("NameNode is being shutdown, exit SafeModeMonitor thread");
5594      }
5595    }
5596  }
5597    
  /**
   * Apply the given safe mode action (enter / leave / get).
   * Mutating actions require superuser privilege; SAFEMODE_GET does not.
   * @return whether the namenode is in safe mode after the action
   */
  boolean setSafeMode(SafeModeAction action) throws IOException {
    if (action != SafeModeAction.SAFEMODE_GET) {
      checkSuperuserPrivilege();
      switch(action) {
      case SAFEMODE_LEAVE: // leave safe mode
        leaveSafeMode();
        break;
      case SAFEMODE_ENTER: // enter safe mode
        enterSafeMode(false);
        break;
      default:
        // Unknown mutating action: logged, not thrown, so the current
        // safe mode state is still returned below.
        LOG.error("Unexpected safe mode action");
      }
    }
    return isInSafeMode();
  }
5614
5615  @Override
5616  public void checkSafeMode() {
5617    // safeMode is volatile, and may be set to null at any time
5618    SafeModeInfo safeMode = this.safeMode;
5619    if (safeMode != null) {
5620      safeMode.checkMode();
5621    }
5622  }
5623
5624  @Override
5625  public boolean isInSafeMode() {
5626    // safeMode is volatile, and may be set to null at any time
5627    SafeModeInfo safeMode = this.safeMode;
5628    if (safeMode == null)
5629      return false;
5630    return safeMode.isOn();
5631  }
5632
5633  @Override
5634  public boolean isInStartupSafeMode() {
5635    // safeMode is volatile, and may be set to null at any time
5636    SafeModeInfo safeMode = this.safeMode;
5637    if (safeMode == null)
5638      return false;
5639    // If the NN is in safemode, and not due to manual / low resources, we
5640    // assume it must be because of startup. If the NN had low resources during
5641    // startup, we assume it came out of startup safemode and it is now in low
5642    // resources safemode
5643    return !safeMode.isManual() && !safeMode.areResourcesLow()
5644      && safeMode.isOn();
5645  }
5646
5647  /**
5648   * Check if replication queues are to be populated
5649   * @return true when node is HAState.Active and not in the very first safemode
5650   */
5651  @Override
5652  public boolean isPopulatingReplQueues() {
5653    if (!shouldPopulateReplQueues()) {
5654      return false;
5655    }
5656    return initializedReplQueues;
5657  }
5658
5659  private boolean shouldPopulateReplQueues() {
5660    if(haContext == null || haContext.getState() == null)
5661      return false;
5662    return haContext.getState().shouldPopulateReplQueues();
5663  }
5664
5665  @Override
5666  public void incrementSafeBlockCount(int replication) {
5667    // safeMode is volatile, and may be set to null at any time
5668    SafeModeInfo safeMode = this.safeMode;
5669    if (safeMode == null)
5670      return;
5671    safeMode.incrementSafeBlockCount((short)replication);
5672  }
5673
5674  @Override
5675  public void decrementSafeBlockCount(Block b) {
5676    // safeMode is volatile, and may be set to null at any time
5677    SafeModeInfo safeMode = this.safeMode;
5678    if (safeMode == null) // mostly true
5679      return;
5680    BlockInfoContiguous storedBlock = getStoredBlock(b);
5681    if (storedBlock.isComplete()) {
5682      safeMode.decrementSafeBlockCount((short)blockManager.countNodes(b).liveReplicas());
5683    }
5684  }
5685  
5686  /**
5687   * Adjust the total number of blocks safe and expected during safe mode.
5688   * If safe mode is not currently on, this is a no-op.
5689   * @param deltaSafe the change in number of safe blocks
5690   * @param deltaTotal the change i nnumber of total blocks expected
5691   */
5692  @Override
5693  public void adjustSafeModeBlockTotals(int deltaSafe, int deltaTotal) {
5694    // safeMode is volatile, and may be set to null at any time
5695    SafeModeInfo safeMode = this.safeMode;
5696    if (safeMode == null)
5697      return;
5698    safeMode.adjustBlockTotals(deltaSafe, deltaTotal);
5699  }
5700
5701  /**
5702   * Set the total number of blocks in the system. 
5703   */
5704  public void setBlockTotal() {
5705    // safeMode is volatile, and may be set to null at any time
5706    SafeModeInfo safeMode = this.safeMode;
5707    if (safeMode == null)
5708      return;
5709    safeMode.setBlockTotal((int)getCompleteBlocksTotal());
5710  }
5711
5712  /**
5713   * Get the total number of blocks in the system. 
5714   */
5715  @Override // FSNamesystemMBean
5716  @Metric
5717  public long getBlocksTotal() {
5718    return blockManager.getTotalBlocks();
5719  }
5720
5721  /**
5722   * Get the total number of COMPLETE blocks in the system.
5723   * For safe mode only complete blocks are counted.
5724   */
5725  private long getCompleteBlocksTotal() {
5726    // Calculate number of blocks under construction
5727    long numUCBlocks = 0;
5728    readLock();
5729    numUCBlocks = leaseManager.getNumUnderConstructionBlocks();
5730    try {
5731      return getBlocksTotal() - numUCBlocks;
5732    } finally {
5733      readUnlock("getCompleteBlocksTotal");
5734    }
5735  }
5736
5737  /**
5738   * Enter safe mode. If resourcesLow is false, then we assume it is manual
5739   * @throws IOException
5740   */
5741  void enterSafeMode(boolean resourcesLow) throws IOException {
5742    writeLock();
5743    try {
5744      // Stop the secret manager, since rolling the master key would
5745      // try to write to the edit log
5746      stopSecretManager();
5747
5748      // Ensure that any concurrent operations have been fully synced
5749      // before entering safe mode. This ensures that the FSImage
5750      // is entirely stable on disk as soon as we're in safe mode.
5751      boolean isEditlogOpenForWrite = getEditLog().isOpenForWrite();
5752      // Before Editlog is in OpenForWrite mode, editLogStream will be null. So,
5753      // logSyncAll call can be called only when Edlitlog is in OpenForWrite mode
5754      if (isEditlogOpenForWrite) {
5755        getEditLog().logSyncAll();
5756      }
5757      if (!isInSafeMode()) {
5758        safeMode = new SafeModeInfo(resourcesLow);
5759        return;
5760      }
5761      if (resourcesLow) {
5762        safeMode.setResourcesLow();
5763      } else {
5764        safeMode.setManual();
5765      }
5766      if (isEditlogOpenForWrite) {
5767        getEditLog().logSyncAll();
5768      }
5769      NameNode.stateChangeLog.info("STATE* Safe mode is ON"
5770          + safeMode.getTurnOffTip());
5771    } finally {
5772      writeUnlock("enterSafeMode");
5773    }
5774  }
5775
5776  /**
5777   * Leave safe mode.
5778   */
5779  void leaveSafeMode() {
5780    writeLock();
5781    try {
5782      if (!isInSafeMode()) {
5783        NameNode.stateChangeLog.info("STATE* Safe mode is already OFF"); 
5784        return;
5785      }
5786      safeMode.leave();
5787    } finally {
5788      writeUnlock("leaveSafeMode");
5789    }
5790  }
5791    
5792  String getSafeModeTip() {
5793    // There is no need to take readLock.
5794    // Don't use isInSafeMode as this.safeMode might be set to null.
5795    // after isInSafeMode returns.
5796    boolean inSafeMode;
5797    SafeModeInfo safeMode = this.safeMode;
5798    if (safeMode == null) {
5799      inSafeMode = false;
5800    } else {
5801      inSafeMode = safeMode.isOn();
5802    }
5803
5804    if (!inSafeMode) {
5805      return "";
5806    } else {
5807      return safeMode.getTurnOffTip();
5808    }
5809  }
5810
  /**
   * Roll the edit log; invoked on behalf of checkpointing nodes.
   * @return the checkpoint signature after the roll
   * @throws IOException if in safe mode or the roll fails
   */
  CheckpointSignature rollEditLog() throws IOException {
    checkSuperuserPrivilege();
    checkOperation(OperationCategory.JOURNAL);
    writeLock();
    try {
      // Re-check under the write lock: HA state may have changed since the
      // unlocked check above.
      checkOperation(OperationCategory.JOURNAL);
      checkNameNodeSafeMode("Log not rolled");
      if (Server.isRpcInvocation()) {
        LOG.info("Roll Edit Log from " + Server.getRemoteAddress());
      }
      return getFSImage().rollEditLog();
    } finally {
      writeUnlock("rollEditLog");
    }
  }
5826
  /**
   * Start a checkpoint for the given backup node.
   * @param backupNode registration of the node performing the checkpoint
   * @param activeNamenode registration of the active namenode
   * @return the command the backup node should execute
   * @throws IOException if in safe mode or the checkpoint cannot start
   */
  NamenodeCommand startCheckpoint(NamenodeRegistration backupNode,
      NamenodeRegistration activeNamenode) throws IOException {
    checkOperation(OperationCategory.CHECKPOINT);
    writeLock();
    try {
      // Re-check under the write lock: HA state may have changed.
      checkOperation(OperationCategory.CHECKPOINT);
      checkNameNodeSafeMode("Checkpoint not started");
      
      LOG.info("Start checkpoint for " + backupNode.getAddress());
      NamenodeCommand cmd = getFSImage().startCheckpoint(backupNode,
          activeNamenode);
      getEditLog().logSync();
      return cmd;
    } finally {
      writeUnlock("startCheckpoint");
    }
  }
5844
  /**
   * Process an incremental block report from a datanode, delegating to the
   * block manager under the namesystem write lock.
   */
  public void processIncrementalBlockReport(final DatanodeID nodeID,
      final StorageReceivedDeletedBlocks srdb)
      throws IOException {
    writeLock();
    try {
      blockManager.processIncrementalBlockReport(nodeID, srdb);
    } finally {
      writeUnlock("processIncrementalBlockReport");
    }
  }
5855  
  /**
   * End a checkpoint previously started by the given node.
   * Only a read lock is taken: ending a checkpoint does not modify the
   * namespace itself.
   * @throws IOException if in safe mode or the checkpoint cannot be ended
   */
  void endCheckpoint(NamenodeRegistration registration,
                            CheckpointSignature sig) throws IOException {
    checkOperation(OperationCategory.CHECKPOINT);
    readLock();
    try {
      // Re-check under the lock: HA state may have changed.
      checkOperation(OperationCategory.CHECKPOINT);
      checkNameNodeSafeMode("Checkpoint not ended");
      LOG.info("End checkpoint for " + registration.getAddress());
      getFSImage().endCheckpoint(sig);
    } finally {
      readUnlock("endCheckpoint");
    }
  }
5869
  /**
   * Build a PermissionStatus owned by the NN's fsOwner and supergroup,
   * with the given permission bits.
   */
  PermissionStatus createFsOwnerPermissions(FsPermission permission) {
    return new PermissionStatus(fsOwner.getShortUserName(), supergroup, permission);
  }
5873
5874  private void checkUnreadableBySuperuser(FSPermissionChecker pc,
5875      INode inode, int snapshotId)
5876      throws IOException {
5877    if (pc.isSuperUser()) {
5878      for (XAttr xattr : FSDirXAttrOp.getXAttrs(dir, inode, snapshotId)) {
5879        if (XAttrHelper.getPrefixName(xattr).
5880            equals(SECURITY_XATTR_UNREADABLE_BY_SUPERUSER)) {
5881          throw new AccessControlException("Access is denied for " +
5882              pc.getUser() + " since the superuser is not allowed to " +
5883              "perform this operation.");
5884        }
5885      }
5886    }
5887  }
5888
  /**
   * Verify the caller is a superuser; a no-op when permission checking
   * is disabled.
   * @throws AccessControlException if permissions are enabled and the
   *         caller is not a superuser
   */
  @Override
  public void checkSuperuserPrivilege()
      throws AccessControlException {
    if (isPermissionEnabled) {
      FSPermissionChecker pc = getPermissionChecker();
      pc.checkSuperuserPrivilege();
    }
  }
5897
5898  /**
5899   * Check to see if we have exceeded the limit on the number
5900   * of inodes.
5901   */
5902  void checkFsObjectLimit() throws IOException {
5903    if (maxFsObjects != 0 &&
5904        maxFsObjects <= dir.totalInodes() + getBlocksTotal()) {
5905      throw new IOException("Exceeded the configured number of objects " +
5906                             maxFsObjects + " in the filesystem.");
5907    }
5908  }
5909
5910  /**
5911   * Get the total number of objects in the system. 
5912   */
5913  @Override // FSNamesystemMBean
5914  public long getMaxObjects() {
5915    return maxFsObjects;
5916  }
5917
  /** @return the total number of inodes in the namespace. */
  @Override // FSNamesystemMBean
  @Metric
  public long getFilesTotal() {
    // There is no need to take fSNamesystem's lock as
    // FSDirectory has its own lock.
    return this.dir.totalInodes();
  }
5925
  /** @return number of blocks with pending replication work. */
  @Override // FSNamesystemMBean
  @Metric
  public long getPendingReplicationBlocks() {
    return blockManager.getPendingReplicationBlocksCount();
  }
5931
  /** @return number of under-replicated blocks. */
  @Override // FSNamesystemMBean
  @Metric
  public long getUnderReplicatedBlocks() {
    return blockManager.getUnderReplicatedBlocksCount();
  }
5937
5938  /** Returns number of blocks with corrupt replicas */
5939  @Metric({"CorruptBlocks", "Number of blocks with corrupt replicas"})
5940  public long getCorruptReplicaBlocks() {
5941    return blockManager.getCorruptReplicaBlocksCount();
5942  }
5943
  /** @return number of blocks with replication currently scheduled. */
  @Override // FSNamesystemMBean
  @Metric
  public long getScheduledReplicationBlocks() {
    return blockManager.getScheduledReplicationBlocksCount();
  }
5949
  /** @return number of blocks queued for deletion. */
  @Override
  @Metric
  public long getPendingDeletionBlocks() {
    return blockManager.getPendingDeletionBlocksCount();
  }
5955
  /** @return time (ms since epoch) at which block deletion may begin:
   *  NN start time plus the configured startup delay. */
  @Override
  public long getBlockDeletionStartTime() {
    return startTime + blockManager.getStartupDelayBlockDeletionInMs();
  }
5960
  /** @return number of over-replicated (excess) block replicas. */
  @Metric
  public long getExcessBlocks() {
    return blockManager.getExcessBlocksCount();
  }
5965  
  // HA-only metric
  /** @return number of mis-replicated blocks whose processing is postponed. */
  @Metric
  public long getPostponedMisreplicatedBlocks() {
    return blockManager.getPostponedMisreplicatedBlocksCount();
  }
5971
  // HA-only metric
  /** @return number of queued datanode messages awaiting processing. */
  @Metric
  public int getPendingDataNodeMessageCount() {
    return blockManager.getPendingDataNodeMessageCount();
  }
5977  
  // HA-only metric
  /** @return the current HA state name (e.g. active/standby). */
  @Metric
  public String getHAState() {
    return haContext.getState().toString();
  }
5983
5984  // HA-only metric
5985  @Metric
5986  public long getMillisSinceLastLoadedEdits() {
5987    if (isInStandbyState() && editLogTailer != null) {
5988      return monotonicNow() - editLogTailer.getLastLoadTimeMs();
5989    } else {
5990      return 0;
5991    }
5992  }
5993  
  /** @return current capacity of the block manager's blocks map. */
  @Metric
  public int getBlockCapacity() {
    return blockManager.getCapacity();
  }
5998
  /** @return "safeMode" while in safe mode, otherwise "Operational". */
  @Override // FSNamesystemMBean
  public String getFSState() {
    return isInSafeMode() ? "safeMode" : "Operational";
  }
6003  
  /** JMX name of the FSNamesystemState MBean; kept so shutdown() can
   *  unregister it. */
  private ObjectName mbeanName;
  /** JMX name of the MXBean (registered outside this chunk); kept so
   *  shutdown() can unregister it. */
  private ObjectName mxbeanName;
6006
6007  /**
6008   * Register the FSNamesystem MBean using the name
6009   *        "hadoop:service=NameNode,name=FSNamesystemState"
6010   */
6011  private void registerMBean() {
6012    // We can only implement one MXBean interface, so we keep the old one.
6013    try {
6014      StandardMBean bean = new StandardMBean(this, FSNamesystemMBean.class);
6015      mbeanName = MBeans.register("NameNode", "FSNamesystemState", bean);
6016    } catch (NotCompliantMBeanException e) {
6017      throw new RuntimeException("Bad MBean setup", e);
6018    }
6019
6020    LOG.info("Registered FSNamesystemState MBean");
6021  }
6022
6023  /**
6024   * shutdown FSNamesystem
6025   */
6026  void shutdown() {
6027    if (snapshotManager != null) {
6028      snapshotManager.shutdown();
6029    }
6030    if (mbeanName != null) {
6031      MBeans.unregister(mbeanName);
6032      mbeanName = null;
6033    }
6034    if (mxbeanName != null) {
6035      MBeans.unregister(mxbeanName);
6036      mxbeanName = null;
6037    }
6038    if (dir != null) {
6039      dir.shutdown();
6040    }
6041    if (blockManager != null) {
6042      blockManager.shutdown();
6043    }
6044  }
6045
  /** @return number of currently live datanodes. */
  @Override // FSNamesystemMBean
  @Metric({"NumLiveDataNodes", "Number of datanodes which are currently live"})
  public int getNumLiveDataNodes() {
    return getBlockManager().getDatanodeManager().getNumLiveDataNodes();
  }
6051
  /** @return number of currently dead datanodes. */
  @Override // FSNamesystemMBean
  @Metric({"NumDeadDataNodes", "Number of datanodes which are currently dead"})
  public int getNumDeadDataNodes() {
    return getBlockManager().getDatanodeManager().getNumDeadDataNodes();
  }
6057  
6058  @Override // FSNamesystemMBean
6059  @Metric({"NumDecomLiveDataNodes",
6060      "Number of datanodes which have been decommissioned and are now live"})
6061  public int getNumDecomLiveDataNodes() {
6062    final List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
6063    getBlockManager().getDatanodeManager().fetchDatanodes(live, null, false);
6064    int liveDecommissioned = 0;
6065    for (DatanodeDescriptor node : live) {
6066      liveDecommissioned += node.isDecommissioned() ? 1 : 0;
6067    }
6068    return liveDecommissioned;
6069  }
6070
6071  @Override // FSNamesystemMBean
6072  @Metric({"NumDecomDeadDataNodes",
6073      "Number of datanodes which have been decommissioned and are now dead"})
6074  public int getNumDecomDeadDataNodes() {
6075    final List<DatanodeDescriptor> dead = new ArrayList<DatanodeDescriptor>();
6076    getBlockManager().getDatanodeManager().fetchDatanodes(null, dead, false);
6077    int deadDecommissioned = 0;
6078    for (DatanodeDescriptor node : dead) {
6079      deadDecommissioned += node.isDecommissioned() ? 1 : 0;
6080    }
6081    return deadDecommissioned;
6082  }
6083
6084  @Override // FSNamesystemMBean
6085  @Metric({"VolumeFailuresTotal",
6086      "Total number of volume failures across all Datanodes"})
6087  public int getVolumeFailuresTotal() {
6088    List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
6089    getBlockManager().getDatanodeManager().fetchDatanodes(live, null, false);
6090    int volumeFailuresTotal = 0;
6091    for (DatanodeDescriptor node: live) {
6092      volumeFailuresTotal += node.getVolumeFailures();
6093    }
6094    return volumeFailuresTotal;
6095  }
6096
6097  @Override // FSNamesystemMBean
6098  @Metric({"EstimatedCapacityLostTotal",
6099      "An estimate of the total capacity lost due to volume failures"})
6100  public long getEstimatedCapacityLostTotal() {
6101    List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
6102    getBlockManager().getDatanodeManager().fetchDatanodes(live, null, false);
6103    long estimatedCapacityLostTotal = 0;
6104    for (DatanodeDescriptor node: live) {
6105      VolumeFailureSummary volumeFailureSummary = node.getVolumeFailureSummary();
6106      if (volumeFailureSummary != null) {
6107        estimatedCapacityLostTotal +=
6108            volumeFailureSummary.getEstimatedCapacityLostTotal();
6109      }
6110    }
6111    return estimatedCapacityLostTotal;
6112  }
6113
  /** @return number of datanodes currently in the decommissioning state. */
  @Override // FSNamesystemMBean
  @Metric({"NumDecommissioningDataNodes",
      "Number of datanodes in decommissioning state"})
  public int getNumDecommissioningDataNodes() {
    return getBlockManager().getDatanodeManager().getDecommissioningNodes()
        .size();
  }
6121
  /** @return number of datanodes marked stale due to delayed heartbeats. */
  @Override // FSNamesystemMBean
  @Metric({"StaleDataNodes", 
    "Number of datanodes marked stale due to delayed heartbeat"})
  public int getNumStaleDataNodes() {
    return getBlockManager().getDatanodeManager().getNumStaleNodes();
  }
6128
6129  /**
6130   * Storages are marked as "content stale" after NN restart or fails over and
6131   * before NN receives the first Heartbeat followed by the first Blockreport.
6132   */
6133  @Override // FSNamesystemMBean
6134  @Metric({"NumStaleStorages",
6135      "Number of storages marked as content stale"})
6136  public int getNumStaleStorages() {
6137    return getBlockManager().getDatanodeManager().getNumStaleStorages();
6138  }
6139
  /**
   * @return JSON describing the top-user operation windows plus a
   *         timestamp, or null when the top metrics feature is disabled
   *         or serialization fails.
   */
  @Override // FSNamesystemMBean
  public String getTopUserOpCounts() {
    if (!topConf.isEnabled) {
      return null;
    }

    Date now = new Date();
    final List<RollingWindowManager.TopWindow> topWindows =
        topMetrics.getTopWindows();
    Map<String, Object> topMap = new TreeMap<String, Object>();
    topMap.put("windows", topWindows);
    topMap.put("timestamp", DFSUtil.dateToIso8601String(now));
    // NOTE(review): a new ObjectMapper is built per call; caching one as a
    // static final field would avoid the construction cost — confirm no
    // configuration reason prevents it.
    ObjectMapper mapper = new ObjectMapper();
    try {
      return mapper.writeValueAsString(topMap);
    } catch (IOException e) {
      LOG.warn("Failed to fetch TopUser metrics", e);
    }
    return null;
  }
6160
6161  /**
6162   * Increments, logs and then returns the stamp
6163   */
6164  long nextGenerationStamp(boolean legacyBlock)
6165      throws IOException, SafeModeException {
6166    assert hasWriteLock();
6167    checkNameNodeSafeMode("Cannot get next generation stamp");
6168
6169    long gs = blockIdManager.nextGenerationStamp(legacyBlock);
6170    if (legacyBlock) {
6171      getEditLog().logGenerationStampV1(gs);
6172    } else {
6173      getEditLog().logGenerationStampV2(gs);
6174    }
6175
6176    // NB: callers sync the log
6177    return gs;
6178  }
6179
6180  /**
6181   * Increments, logs and then returns the block ID
6182   */
6183  private long nextBlockId() throws IOException {
6184    assert hasWriteLock();
6185    checkNameNodeSafeMode("Cannot get next block ID");
6186    final long blockId = blockIdManager.nextBlockId();
6187    getEditLog().logAllocateBlockId(blockId);
6188    // NB: callers sync the log
6189    return blockId;
6190  }
6191
  /**
   * Determine whether a file inode has effectively been deleted: it is no
   * longer in the inode map, one of its ancestors was removed (recursive
   * delete), or it survives only inside a snapshot.
   */
  private boolean isFileDeleted(INodeFile file) {
    // Not in the inodeMap or in the snapshot but marked deleted.
    if (dir.getInode(file.getId()) == null) {
      return true;
    }

    // look at the path hierarchy to see if one parent is deleted by recursive
    // deletion
    INode tmpChild = file;
    INodeDirectory tmpParent = file.getParent();
    while (true) {
      if (tmpParent == null) {
        // Detached from the tree: some ancestor was removed.
        return true;
      }

      INode childINode = tmpParent.getChild(tmpChild.getLocalNameBytes(),
          Snapshot.CURRENT_STATE_ID);
      if (childINode == null || !childINode.equals(tmpChild)) {
        // a newly created INode with the same name as an already deleted one
        // would be a different INode than the deleted one
        return true;
      }

      if (tmpParent.isRoot()) {
        // Reached the root with every link intact: path is live.
        break;
      }

      tmpChild = tmpParent;
      tmpParent = tmpParent.getParent();
    }

    // The file may still exist only as a snapshot copy while the current
    // version has been deleted.
    if (file.isWithSnapshot() &&
        file.getFileWithSnapshotFeature().isCurrentFileDeleted()) {
      return true;
    }
    return false;
  }
6229
  /**
   * Validate that a block is under construction, belongs to a live
   * under-construction file, and is leased by the given client.
   * Caller must hold the write lock.
   * @return the file the block belongs to
   * @throws IOException if the block or file is missing/complete
   * @throws LeaseExpiredException if the client does not hold the lease
   */
  private INodeFile checkUCBlock(ExtendedBlock block,
      String clientName) throws IOException {
    assert hasWriteLock();
    checkNameNodeSafeMode("Cannot get a new generation stamp and an "
        + "access token for block " + block);
    
    // check stored block state
    BlockInfoContiguous storedBlock = getStoredBlock(ExtendedBlock.getLocalBlock(block));
    if (storedBlock == null || 
        storedBlock.getBlockUCState() != BlockUCState.UNDER_CONSTRUCTION) {
        throw new IOException(block + 
            " does not exist or is not under Construction" + storedBlock);
    }
    
    // check file inode
    final INodeFile file = ((INode)storedBlock.getBlockCollection()).asFile();
    if (file == null || !file.isUnderConstruction() || isFileDeleted(file)) {
      throw new IOException("The file " + storedBlock + 
          " belonged to does not exist or it is not under construction.");
    }
    
    // check lease: only the client holding the file's lease may mutate it
    if (clientName == null
        || !clientName.equals(file.getFileUnderConstructionFeature()
            .getClientName())) {
      throw new LeaseExpiredException("Lease mismatch: " + block + 
          " is accessed by a non lease holder " + clientName); 
    }

    return file;
  }
6261  
6262  /**
6263   * Client is reporting some bad block locations.
6264   */
6265  void reportBadBlocks(LocatedBlock[] blocks) throws IOException {
6266    checkOperation(OperationCategory.WRITE);
6267    writeLock();
6268    try {
6269      checkOperation(OperationCategory.WRITE);
6270      for (int i = 0; i < blocks.length; i++) {
6271        ExtendedBlock blk = blocks[i].getBlock();
6272        DatanodeInfo[] nodes = blocks[i].getLocations();
6273        String[] storageIDs = blocks[i].getStorageIDs();
6274        for (int j = 0; j < nodes.length; j++) {
6275          NameNode.stateChangeLog.info("*DIR* reportBadBlocks for block: {} on"
6276              + " datanode: {}", blk, nodes[j].getXferAddr());
6277          blockManager.findAndMarkBlockAsCorrupt(blk, nodes[j],
6278              storageIDs == null ? null: storageIDs[j], 
6279              "client machine reported it");
6280        }
6281      }
6282    } finally {
6283      writeUnlock("reportBadBlocks");
6284    }
6285  }
6286
6287  /**
6288   * Get a new generation stamp together with an access token for 
6289   * a block under construction
6290   * 
6291   * This method is called for recovering a failed pipeline or setting up
6292   * a pipeline to append to a block.
6293   * 
6294   * @param block a block
6295   * @param clientName the name of a client
6296   * @return a located block with a new generation stamp and an access token
6297   * @throws IOException if any error occurs
6298   */
6299  LocatedBlock updateBlockForPipeline(ExtendedBlock block, 
6300      String clientName) throws IOException {
6301    LocatedBlock locatedBlock;
6302    checkOperation(OperationCategory.WRITE);
6303    writeLock();
6304    try {
6305      checkOperation(OperationCategory.WRITE);
6306
6307      // check vadility of parameters
6308      checkUCBlock(block, clientName);
6309  
6310      // get a new generation stamp and an access token
6311      block.setGenerationStamp(nextGenerationStamp(blockIdManager.isLegacyBlock(block.getLocalBlock())));
6312      locatedBlock = new LocatedBlock(block, new DatanodeInfo[0]);
6313      blockManager.setBlockToken(locatedBlock, AccessMode.WRITE);
6314    } finally {
6315      writeUnlock("bumpBlockGenerationStamp");
6316    }
6317    // Ensure we record the new generation stamp
6318    getEditLog().logSync();
6319    return locatedBlock;
6320  }
6321  
6322  /**
6323   * Update a pipeline for a block under construction
6324   * 
6325   * @param clientName the name of the client
6326   * @param oldBlock and old block
6327   * @param newBlock a new block with a new generation stamp and length
6328   * @param newNodes datanodes in the pipeline
6329   * @throws IOException if any error occurs
6330   */
6331  void updatePipeline(
6332      String clientName, ExtendedBlock oldBlock, ExtendedBlock newBlock,
6333      DatanodeID[] newNodes, String[] newStorageIDs, boolean logRetryCache)
6334      throws IOException {
6335    checkOperation(OperationCategory.WRITE);
6336    LOG.info("updatePipeline(" + oldBlock.getLocalBlock()
6337             + ", newGS=" + newBlock.getGenerationStamp()
6338             + ", newLength=" + newBlock.getNumBytes()
6339             + ", newNodes=" + Arrays.asList(newNodes)
6340             + ", client=" + clientName
6341             + ")");
6342    waitForLoadingFSImage();
6343    writeLock();
6344    try {
6345      checkOperation(OperationCategory.WRITE);
6346      checkNameNodeSafeMode("Pipeline not updated");
6347      assert newBlock.getBlockId()==oldBlock.getBlockId() : newBlock + " and "
6348        + oldBlock + " has different block identifier";
6349      updatePipelineInternal(clientName, oldBlock, newBlock, newNodes,
6350          newStorageIDs, logRetryCache);
6351    } finally {
6352      writeUnlock("updatePipeline");
6353    }
6354    getEditLog().logSync();
6355    LOG.info("updatePipeline(" + oldBlock.getLocalBlock() + " => "
6356        + newBlock.getLocalBlock() + ") success");
6357  }
6358
  /**
   * Apply a pipeline update while holding the write lock: bump the file's
   * last block to the new generation stamp/length and reset its expected
   * locations to the new pipeline.
   *
   * @param clientName lease holder; must match the file's current lease
   * @param oldBlock the block being updated (same id as {@code newBlock})
   * @param newBlock replacement with a newer generation stamp and a length
   *                 no smaller than the current one
   * @param newNodes datanodes forming the new pipeline
   * @param newStorageIDs storages on those nodes, parallel to newNodes
   * @param logRetryCache whether to record this operation in the retry cache
   */
  private void updatePipelineInternal(String clientName, ExtendedBlock oldBlock,
      ExtendedBlock newBlock, DatanodeID[] newNodes, String[] newStorageIDs,
      boolean logRetryCache)
      throws IOException {
    assert hasWriteLock();
    // check the validity of the block and lease holder name
    final INodeFile pendingFile = checkUCBlock(oldBlock, clientName);
    final String src = pendingFile.getFullPathName();
    final BlockInfoContiguousUnderConstruction blockinfo
        = (BlockInfoContiguousUnderConstruction)pendingFile.getLastBlock();

    // check new GS & length: this is not expected — a pipeline update must
    // move the block forward (strictly newer GS, length not shrinking)
    if (newBlock.getGenerationStamp() <= blockinfo.getGenerationStamp() ||
        newBlock.getNumBytes() < blockinfo.getNumBytes()) {
      String msg = "Update " + oldBlock + " (len = " + 
        blockinfo.getNumBytes() + ") to an older state: " + newBlock + 
        " (len = " + newBlock.getNumBytes() +")";
      LOG.warn(msg);
      throw new IOException(msg);
    }

    // Update old block with the new generation stamp and new length
    blockManager.updateLastBlock(blockinfo, newBlock);

    // find the DatanodeDescriptor objects for the new pipeline
    final DatanodeStorageInfo[] storages = blockManager.getDatanodeManager()
        .getDatanodeStorageInfos(newNodes, newStorageIDs,
            "src=%s, oldBlock=%s, newBlock=%s, clientName=%s",
            src, oldBlock, newBlock, clientName);
    blockinfo.setExpectedLocations(storages);

    // record the updated block state in the edit log
    persistBlocks(src, pendingFile, logRetryCache);
  }
6392
  /**
   * Rename was successful. If any part of the renamed subtree had
   * files that were being written to, update the lease records with the
   * new filename. Caller must hold the FSNamesystem write lock.
   */
  void unprotectedChangeLease(String src, String dst) {
    assert hasWriteLock();
    leaseManager.changeLease(src, dst);
  }
6399
6400  /**
6401   * Serializes leases.
6402   */
6403  void saveFilesUnderConstruction(DataOutputStream out,
6404      Map<Long, INodeFile> snapshotUCMap) throws IOException {
6405    // This is run by an inferior thread of saveNamespace, which holds a read
6406    // lock on our behalf. If we took the read lock here, we could block
6407    // for fairness if a writer is waiting on the lock.
6408    synchronized (leaseManager) {
6409      Map<String, INodeFile> nodes = leaseManager.getINodesUnderConstruction();
6410      for (Map.Entry<String, INodeFile> entry : nodes.entrySet()) {
6411        // TODO: for HDFS-5428, because of rename operations, some
6412        // under-construction files that are
6413        // in the current fs directory can also be captured in the
6414        // snapshotUCMap. We should remove them from the snapshotUCMap.
6415        snapshotUCMap.remove(entry.getValue().getId());
6416      }
6417
6418      out.writeInt(nodes.size() + snapshotUCMap.size()); // write the size
6419      for (Map.Entry<String, INodeFile> entry : nodes.entrySet()) {
6420        FSImageSerialization.writeINodeUnderConstruction(
6421            out, entry.getValue(), entry.getKey());
6422      }
6423      for (Map.Entry<Long, INodeFile> entry : snapshotUCMap.entrySet()) {
6424        // for those snapshot INodeFileUC, we use "/.reserved/.inodes/<inodeid>"
6425        // as their paths
6426        StringBuilder b = new StringBuilder();
6427        b.append(FSDirectory.DOT_RESERVED_PATH_PREFIX)
6428            .append(Path.SEPARATOR).append(FSDirectory.DOT_INODES_STRING)
6429            .append(Path.SEPARATOR).append(entry.getValue().getId());
6430        FSImageSerialization.writeINodeUnderConstruction(
6431            out, entry.getValue(), b.toString());
6432      }
6433    }
6434  }
6435
6436  /**
6437   * @return all the under-construction files in the lease map
6438   */
6439  Map<String, INodeFile> getFilesUnderConstruction() {
6440    synchronized (leaseManager) {
6441      return leaseManager.getINodesUnderConstruction();
6442    }
6443  }
6444
6445  /**
6446   * Register a Backup name-node, verifying that it belongs
6447   * to the correct namespace, and adding it to the set of
6448   * active journals if necessary.
6449   * 
6450   * @param bnReg registration of the new BackupNode
6451   * @param nnReg registration of this NameNode
6452   * @throws IOException if the namespace IDs do not match
6453   */
6454  void registerBackupNode(NamenodeRegistration bnReg,
6455      NamenodeRegistration nnReg) throws IOException {
6456    writeLock();
6457    try {
6458      if(getFSImage().getStorage().getNamespaceID() 
6459         != bnReg.getNamespaceID())
6460        throw new IOException("Incompatible namespaceIDs: "
6461            + " Namenode namespaceID = "
6462            + getFSImage().getStorage().getNamespaceID() + "; "
6463            + bnReg.getRole() +
6464            " node namespaceID = " + bnReg.getNamespaceID());
6465      if (bnReg.getRole() == NamenodeRole.BACKUP) {
6466        getFSImage().getEditLog().registerBackupNode(
6467            bnReg, nnReg);
6468      }
6469    } finally {
6470      writeUnlock("registerBackupNode");
6471    }
6472  }
6473
6474  /**
6475   * Release (unregister) backup node.
6476   * <p>
6477   * Find and remove the backup stream corresponding to the node.
6478   * @throws IOException
6479   */
6480  void releaseBackupNode(NamenodeRegistration registration)
6481    throws IOException {
6482    checkOperation(OperationCategory.WRITE);
6483    writeLock();
6484    try {
6485      checkOperation(OperationCategory.WRITE);
6486      if(getFSImage().getStorage().getNamespaceID()
6487         != registration.getNamespaceID())
6488        throw new IOException("Incompatible namespaceIDs: "
6489            + " Namenode namespaceID = "
6490            + getFSImage().getStorage().getNamespaceID() + "; "
6491            + registration.getRole() +
6492            " node namespaceID = " + registration.getNamespaceID());
6493      getEditLog().releaseBackupStream(registration);
6494    } finally {
6495      writeUnlock("releaseBackupNode");
6496    }
6497  }
6498
6499  static class CorruptFileBlockInfo {
6500    final String path;
6501    final Block block;
6502    
6503    public CorruptFileBlockInfo(String p, Block b) {
6504      path = p;
6505      block = b;
6506    }
6507    
6508    @Override
6509    public String toString() {
6510      return block.getBlockName() + "\t" + path;
6511    }
6512  }
6513  /**
6514   * @param path Restrict corrupt files to this portion of namespace.
6515   * @param cookieTab Support for continuation; cookieTab  tells where
6516   *                  to start from
6517   * @return a list in which each entry describes a corrupt file/block
6518   * @throws IOException
6519   */
6520  Collection<CorruptFileBlockInfo> listCorruptFileBlocks(String path,
6521  String[] cookieTab) throws IOException {
6522    checkSuperuserPrivilege();
6523    checkOperation(OperationCategory.READ);
6524
6525    int count = 0;
6526    ArrayList<CorruptFileBlockInfo> corruptFiles =
6527        new ArrayList<CorruptFileBlockInfo>();
6528    if (cookieTab == null) {
6529      cookieTab = new String[] { null };
6530    }
6531
6532    // Do a quick check if there are any corrupt files without taking the lock
6533    if (blockManager.getMissingBlocksCount() == 0) {
6534      if (cookieTab[0] == null) {
6535        cookieTab[0] = String.valueOf(getIntCookie(cookieTab[0]));
6536      }
6537      if (LOG.isDebugEnabled()) {
6538        LOG.debug("there are no corrupt file blocks.");
6539      }
6540      return corruptFiles;
6541    }
6542
6543    readLock();
6544    try {
6545      checkOperation(OperationCategory.READ);
6546      if (!isPopulatingReplQueues()) {
6547        throw new IOException("Cannot run listCorruptFileBlocks because " +
6548                              "replication queues have not been initialized.");
6549      }
6550      // print a limited # of corrupt files per call
6551
6552      final Iterator<Block> blkIterator = blockManager.getCorruptReplicaBlockIterator();
6553
6554      int skip = getIntCookie(cookieTab[0]);
6555      for (int i = 0; i < skip && blkIterator.hasNext(); i++) {
6556        blkIterator.next();
6557      }
6558
6559      while (blkIterator.hasNext()) {
6560        Block blk = blkIterator.next();
6561        final INode inode = (INode)blockManager.getBlockCollection(blk);
6562        skip++;
6563        if (inode != null && blockManager.countNodes(blk).liveReplicas() == 0) {
6564          String src = inode.getFullPathName();
6565          if (src.startsWith(path)){
6566            corruptFiles.add(new CorruptFileBlockInfo(src, blk));
6567            count++;
6568            if (count >= DEFAULT_MAX_CORRUPT_FILEBLOCKS_RETURNED)
6569              break;
6570          }
6571        }
6572      }
6573      cookieTab[0] = String.valueOf(skip);
6574      if (LOG.isDebugEnabled()) {
6575        LOG.debug("list corrupt file blocks returned: " + count);
6576      }
6577      return corruptFiles;
6578    } finally {
6579      readUnlock("listCorruptFileBlocks");
6580    }
6581  }
6582
6583  /**
6584   * Convert string cookie to integer.
6585   */
6586  private static int getIntCookie(String cookie){
6587    int c;
6588    if(cookie == null){
6589      c = 0;
6590    } else {
6591      try{
6592        c = Integer.parseInt(cookie);
6593      }catch (NumberFormatException e) {
6594        c = 0;
6595      }
6596    }
6597    c = Math.max(0, c);
6598    return c;
6599  }
6600
6601  /**
6602   * Create delegation token secret manager
6603   */
6604  private DelegationTokenSecretManager createDelegationTokenSecretManager(
6605      Configuration conf) {
6606    return new DelegationTokenSecretManager(conf.getLong(
6607        DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_KEY,
6608        DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_DEFAULT),
6609        conf.getLong(DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_KEY,
6610            DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_DEFAULT),
6611        conf.getLong(DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_KEY,
6612            DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_DEFAULT),
6613        DELEGATION_TOKEN_REMOVER_SCAN_INTERVAL,
6614        conf.getBoolean(DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_KEY,
6615            DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_DEFAULT),
6616        this);
6617  }
6618
6619  /**
6620   * Returns the DelegationTokenSecretManager instance in the namesystem.
6621   * @return delegation token secret manager object
6622   */
6623  DelegationTokenSecretManager getDelegationTokenSecretManager() {
6624    return dtSecretManager;
6625  }
6626
6627  /**
6628   * @param renewer Renewer information
6629   * @return delegation toek
6630   * @throws IOException on error
6631   */
6632  Token<DelegationTokenIdentifier> getDelegationToken(Text renewer)
6633      throws IOException {
6634    Token<DelegationTokenIdentifier> token;
6635    checkOperation(OperationCategory.WRITE);
6636    writeLock();
6637    try {
6638      checkOperation(OperationCategory.WRITE);
6639      checkNameNodeSafeMode("Cannot issue delegation token");
6640      if (!isAllowedDelegationTokenOp()) {
6641        throw new IOException(
6642          "Delegation Token can be issued only with kerberos or web authentication");
6643      }
6644      if (dtSecretManager == null || !dtSecretManager.isRunning()) {
6645        LOG.warn("trying to get DT with no secret manager running");
6646        return null;
6647      }
6648
6649      UserGroupInformation ugi = getRemoteUser();
6650      String user = ugi.getUserName();
6651      Text owner = new Text(user);
6652      Text realUser = null;
6653      if (ugi.getRealUser() != null) {
6654        realUser = new Text(ugi.getRealUser().getUserName());
6655      }
6656      DelegationTokenIdentifier dtId = new DelegationTokenIdentifier(owner,
6657        renewer, realUser);
6658      token = new Token<DelegationTokenIdentifier>(
6659        dtId, dtSecretManager);
6660      long expiryTime = dtSecretManager.getTokenExpiryTime(dtId);
6661      getEditLog().logGetDelegationToken(dtId, expiryTime);
6662    } finally {
6663      writeUnlock("getDelegationToken");
6664    }
6665    getEditLog().logSync();
6666    return token;
6667  }
6668
6669  /**
6670   * 
6671   * @param token token to renew
6672   * @return new expiryTime of the token
6673   * @throws InvalidToken if {@code token} is invalid
6674   * @throws IOException on other errors
6675   */
6676  long renewDelegationToken(Token<DelegationTokenIdentifier> token)
6677      throws InvalidToken, IOException {
6678    long expiryTime;
6679    checkOperation(OperationCategory.WRITE);
6680    writeLock();
6681    try {
6682      checkOperation(OperationCategory.WRITE);
6683
6684      checkNameNodeSafeMode("Cannot renew delegation token");
6685      if (!isAllowedDelegationTokenOp()) {
6686        throw new IOException(
6687            "Delegation Token can be renewed only with kerberos or web authentication");
6688      }
6689      String renewer = getRemoteUser().getShortUserName();
6690      expiryTime = dtSecretManager.renewToken(token, renewer);
6691      DelegationTokenIdentifier id = new DelegationTokenIdentifier();
6692      ByteArrayInputStream buf = new ByteArrayInputStream(token.getIdentifier());
6693      DataInputStream in = new DataInputStream(buf);
6694      id.readFields(in);
6695      getEditLog().logRenewDelegationToken(id, expiryTime);
6696    } finally {
6697      writeUnlock("renewDelegationToken");
6698    }
6699    getEditLog().logSync();
6700    return expiryTime;
6701  }
6702
6703  /**
6704   * 
6705   * @param token token to cancel
6706   * @throws IOException on error
6707   */
6708  void cancelDelegationToken(Token<DelegationTokenIdentifier> token)
6709      throws IOException {
6710    checkOperation(OperationCategory.WRITE);
6711    writeLock();
6712    try {
6713      checkOperation(OperationCategory.WRITE);
6714
6715      checkNameNodeSafeMode("Cannot cancel delegation token");
6716      String canceller = getRemoteUser().getUserName();
6717      DelegationTokenIdentifier id = dtSecretManager
6718        .cancelToken(token, canceller);
6719      getEditLog().logCancelDelegationToken(id);
6720    } finally {
6721      writeUnlock("cancelDelegationToken");
6722    }
6723    getEditLog().logSync();
6724  }
6725
6726  /**
6727   * @param out save state of the secret manager
6728   * @param sdPath String storage directory path
6729   */
6730  void saveSecretManagerStateCompat(DataOutputStream out, String sdPath)
6731      throws IOException {
6732    dtSecretManager.saveSecretManagerStateCompat(out, sdPath);
6733  }
6734
  /** @return the current serializable state of the DT secret manager. */
  SecretManagerState saveSecretManagerState() {
    return dtSecretManager.saveSecretManagerState();
  }
6738
6739  /**
6740   * @param in load the state of secret manager from input stream
6741   */
6742  void loadSecretManagerStateCompat(DataInput in) throws IOException {
6743    dtSecretManager.loadSecretManagerStateCompat(in);
6744  }
6745
  /**
   * Restore the DT secret manager from a protobuf fsimage section plus its
   * delegation keys and persisted tokens.
   */
  void loadSecretManagerState(SecretManagerSection s,
      List<SecretManagerSection.DelegationKey> keys,
      List<SecretManagerSection.PersistToken> tokens) throws IOException {
    dtSecretManager.loadSecretManagerState(new SecretManagerState(s, keys, tokens));
  }
6751
6752  /**
6753   * Log the updateMasterKey operation to edit logs
6754   * 
6755   * @param key new delegation key.
6756   */
6757  public void logUpdateMasterKey(DelegationKey key) {
6758    
6759    assert !isInSafeMode() :
6760      "this should never be called while in safemode, since we stop " +
6761      "the DT manager before entering safemode!";
6762    // edit log rolling is not thread-safe and must be protected by the
6763    // fsn lock.  not updating namespace so read lock is sufficient.
6764    assert hasReadLock();
6765    getEditLog().logUpdateMasterKey(key);
6766    getEditLog().logSync();
6767  }
6768  
6769  /**
6770   * Log the cancellation of expired tokens to edit logs
6771   * 
6772   * @param id token identifier to cancel
6773   */
6774  public void logExpireDelegationToken(DelegationTokenIdentifier id) {
6775    assert !isInSafeMode() :
6776      "this should never be called while in safemode, since we stop " +
6777      "the DT manager before entering safemode!";
6778    // edit log rolling is not thread-safe and must be protected by the
6779    // fsn lock.  not updating namespace so read lock is sufficient.
6780    assert hasReadLock();
6781    // do not logSync so expiration edits are batched
6782    getEditLog().logCancelDelegationToken(id);
6783  }  
6784  
  /**
   * Record a lease reassignment (old holder -> new holder for {@code src})
   * in the edit log. Caller must hold the write lock.
   */
  private void logReassignLease(String leaseHolder, String src,
      String newHolder) {
    assert hasWriteLock();
    getEditLog().logReassignLease(leaseHolder, src, newHolder);
  }
6790  
6791  /**
6792   * 
6793   * @return true if delegation token operation is allowed
6794   */
6795  private boolean isAllowedDelegationTokenOp() throws IOException {
6796    AuthenticationMethod authMethod = getConnectionAuthenticationMethod();
6797    if (UserGroupInformation.isSecurityEnabled()
6798        && (authMethod != AuthenticationMethod.KERBEROS)
6799        && (authMethod != AuthenticationMethod.KERBEROS_SSL)
6800        && (authMethod != AuthenticationMethod.CERTIFICATE)) {
6801      return false;
6802    }
6803    return true;
6804  }
6805  
6806  /**
6807   * Returns authentication method used to establish the connection
6808   * @return AuthenticationMethod used to establish connection
6809   * @throws IOException
6810   */
6811  private AuthenticationMethod getConnectionAuthenticationMethod()
6812      throws IOException {
6813    UserGroupInformation ugi = getRemoteUser();
6814    AuthenticationMethod authMethod = ugi.getAuthenticationMethod();
6815    if (authMethod == AuthenticationMethod.PROXY) {
6816      authMethod = ugi.getRealUser().getAuthenticationMethod();
6817    }
6818    return authMethod;
6819  }
6820  
6821  /**
6822   * Client invoked methods are invoked over RPC and will be in 
6823   * RPC call context even if the client exits.
6824   */
6825  boolean isExternalInvocation() {
6826    return Server.isRpcInvocation() || NamenodeWebHdfsMethods.isWebHdfsInvocation();
6827  }
6828
6829  private static InetAddress getRemoteIp() {
6830    InetAddress ip = Server.getRemoteIp();
6831    if (ip != null) {
6832      return ip;
6833    }
6834    return NamenodeWebHdfsMethods.getRemoteIp();
6835  }
6836  
  // optimize ugi lookup for RPC operations to avoid a trip through
  // UGI.getCurrentUser which is synch'ed
  /** @return the UGI of the remote caller, resolved via the NameNode helper. */
  private static UserGroupInformation getRemoteUser() throws IOException {
    return NameNode.getRemoteUser();
  }
6842  
6843  /**
6844   * Log fsck event in the audit log 
6845   */
6846  void logFsckEvent(String src, InetAddress remoteAddress) throws IOException {
6847    if (isAuditEnabled()) {
6848      logAuditEvent(true, getRemoteUser(),
6849                    remoteAddress,
6850                    "fsck", src, null, null);
6851    }
6852  }
6853  /**
6854   * Register NameNodeMXBean
6855   */
6856  private void registerMXBean() {
6857    mxbeanName = MBeans.register("NameNode", "NameNodeInfo", this);
6858  }
6859
6860  /**
6861   * Class representing Namenode information for JMX interfaces
6862   */
6863  @Override // NameNodeMXBean
6864  public String getVersion() {
6865    return VersionInfo.getVersion() + ", r" + VersionInfo.getRevision();
6866  }
6867
  /** @return used cluster capacity, delegated to {@code getCapacityUsed()}. */
  @Override // NameNodeMXBean
  public long getUsed() {
    return this.getCapacityUsed();
  }
6872
  /** @return remaining cluster capacity, from {@code getCapacityRemaining()}. */
  @Override // NameNodeMXBean
  public long getFree() {
    return this.getCapacityRemaining();
  }
6877
  /** @return total cluster capacity, from {@code getCapacityTotal()}. */
  @Override // NameNodeMXBean
  public long getTotal() {
    return this.getCapacityTotal();
  }
6882
6883  @Override // NameNodeMXBean
6884  public String getSafemode() {
6885    if (!this.isInSafeMode())
6886      return "";
6887    return "Safe mode is ON. " + this.getSafeModeTip();
6888  }
6889
  /** @return whether the fsimage upgrade has been finalized. */
  @Override // NameNodeMXBean
  public boolean isUpgradeFinalized() {
    return this.getFSImage().isUpgradeFinalized();
  }
6894
  /** @return non-DFS used capacity, from datanode statistics. */
  @Override // NameNodeMXBean
  public long getNonDfsUsedSpace() {
    return datanodeStatistics.getCapacityUsedNonDFS();
  }
6899
  /** @return percentage of capacity used, from datanode statistics. */
  @Override // NameNodeMXBean
  public float getPercentUsed() {
    return datanodeStatistics.getCapacityUsedPercent();
  }
6904
  /** @return space used by this block pool, from datanode statistics. */
  @Override // NameNodeMXBean
  public long getBlockPoolUsedSpace() {
    return datanodeStatistics.getBlockPoolUsed();
  }
6909
  /** @return percentage of capacity used by this block pool. */
  @Override // NameNodeMXBean
  public float getPercentBlockPoolUsed() {
    return datanodeStatistics.getPercentBlockPoolUsed();
  }
6914
  /** @return percentage of capacity remaining, from datanode statistics. */
  @Override // NameNodeMXBean
  public float getPercentRemaining() {
    return datanodeStatistics.getCapacityRemainingPercent();
  }
6919
  /** @return total datanode cache capacity, from datanode statistics. */
  @Override // NameNodeMXBean
  public long getCacheCapacity() {
    return datanodeStatistics.getCacheCapacity();
  }
6924
  /** @return datanode cache space in use, from datanode statistics. */
  @Override // NameNodeMXBean
  public long getCacheUsed() {
    return datanodeStatistics.getCacheUsed();
  }
6929
  /** @return total number of blocks, delegated to {@code getBlocksTotal()}. */
  @Override // NameNodeMXBean
  public long getTotalBlocks() {
    return getBlocksTotal();
  }
6934
  /**
   * Total number of files, also exported as a metric via {@code @Metric}.
   * @return value of {@code getFilesTotal()}
   */
  @Override // NameNodeMXBean
  @Metric
  public long getTotalFiles() {
    return getFilesTotal();
  }
6940
  /** @return number of missing blocks, from {@code getMissingBlocksCount()}. */
  @Override // NameNodeMXBean
  public long getNumberOfMissingBlocks() {
    return getMissingBlocksCount();
  }
6945  
  /** @return missing blocks whose replication factor is one. */
  @Override // NameNodeMXBean
  public long getNumberOfMissingBlocksWithReplicationFactorOne() {
    return getMissingReplOneBlocksCount();
  }
6950
  /** @return live JVM thread count, via the platform ThreadMXBean. */
  @Override // NameNodeMXBean
  public int getThreads() {
    return ManagementFactory.getThreadMXBean().getThreadCount();
  }
6955
6956  /**
6957   * Returned information is a JSON representation of map with host name as the
6958   * key and value is a map of live node attribute keys to its values
6959   */
6960  @Override // NameNodeMXBean
6961  public String getLiveNodes() {
6962    final Map<String, Map<String,Object>> info = 
6963      new HashMap<String, Map<String,Object>>();
6964    final List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
6965    blockManager.getDatanodeManager().fetchDatanodes(live, null, false);
6966    for (DatanodeDescriptor node : live) {
6967      ImmutableMap.Builder<String, Object> innerinfo =
6968          ImmutableMap.<String,Object>builder();
6969      innerinfo
6970          .put("infoAddr", node.getInfoAddr())
6971          .put("infoSecureAddr", node.getInfoSecureAddr())
6972          .put("xferaddr", node.getXferAddr())
6973          .put("lastContact", getLastContact(node))
6974          .put("usedSpace", getDfsUsed(node))
6975          .put("adminState", node.getAdminState().toString())
6976          .put("nonDfsUsedSpace", node.getNonDfsUsed())
6977          .put("capacity", node.getCapacity())
6978          .put("numBlocks", node.numBlocks())
6979          .put("version", node.getSoftwareVersion())
6980          .put("used", node.getDfsUsed())
6981          .put("remaining", node.getRemaining())
6982          .put("blockScheduled", node.getBlocksScheduled())
6983          .put("blockPoolUsed", node.getBlockPoolUsed())
6984          .put("blockPoolUsedPercent", node.getBlockPoolUsedPercent())
6985          .put("volfails", node.getVolumeFailures());
6986      VolumeFailureSummary volumeFailureSummary = node.getVolumeFailureSummary();
6987      if (volumeFailureSummary != null) {
6988        innerinfo
6989            .put("failedStorageLocations",
6990                volumeFailureSummary.getFailedStorageLocations())
6991            .put("lastVolumeFailureDate",
6992                volumeFailureSummary.getLastVolumeFailureDate())
6993            .put("estimatedCapacityLostTotal",
6994                volumeFailureSummary.getEstimatedCapacityLostTotal());
6995      }
6996      info.put(node.getHostName() + ":" + node.getXferPort(), innerinfo.build());
6997    }
6998    return JSON.toString(info);
6999  }
7000
7001  /**
7002   * Returned information is a JSON representation of map with host name as the
7003   * key and value is a map of dead node attribute keys to its values
7004   */
7005  @Override // NameNodeMXBean
7006  public String getDeadNodes() {
7007    final Map<String, Map<String, Object>> info = 
7008      new HashMap<String, Map<String, Object>>();
7009    final List<DatanodeDescriptor> dead = new ArrayList<DatanodeDescriptor>();
7010    blockManager.getDatanodeManager().fetchDatanodes(null, dead, false);
7011    for (DatanodeDescriptor node : dead) {
7012      Map<String, Object> innerinfo = ImmutableMap.<String, Object>builder()
7013          .put("lastContact", getLastContact(node))
7014          .put("decommissioned", node.isDecommissioned())
7015          .put("xferaddr", node.getXferAddr())
7016          .build();
7017      info.put(node.getHostName() + ":" + node.getXferPort(), innerinfo);
7018    }
7019    return JSON.toString(info);
7020  }
7021
7022  /**
7023   * Returned information is a JSON representation of map with host name as the
7024   * key and value is a map of decommissioning node attribute keys to its
7025   * values
7026   */
7027  @Override // NameNodeMXBean
7028  public String getDecomNodes() {
7029    final Map<String, Map<String, Object>> info = 
7030      new HashMap<String, Map<String, Object>>();
7031    final List<DatanodeDescriptor> decomNodeList = blockManager.getDatanodeManager(
7032        ).getDecommissioningNodes();
7033    for (DatanodeDescriptor node : decomNodeList) {
7034      Map<String, Object> innerinfo = ImmutableMap
7035          .<String, Object> builder()
7036          .put("xferaddr", node.getXferAddr())
7037          .put("underReplicatedBlocks",
7038              node.decommissioningStatus.getUnderReplicatedBlocks())
7039          .put("decommissionOnlyReplicas",
7040              node.decommissioningStatus.getDecommissionOnlyReplicas())
7041          .put("underReplicateInOpenFiles",
7042              node.decommissioningStatus.getUnderReplicatedInOpenFiles())
7043          .build();
7044      info.put(node.getHostName() + ":" + node.getXferPort(), innerinfo);
7045    }
7046    return JSON.toString(info);
7047  }
7048
  /** @return seconds elapsed since the node's last update (monotonic clock). */
  private long getLastContact(DatanodeDescriptor alivenode) {
    return (monotonicNow() - alivenode.getLastUpdateMonotonic())/1000;
  }
7052
  /** @return the node's DFS used space, delegated to the descriptor. */
  private long getDfsUsed(DatanodeDescriptor alivenode) {
    return alivenode.getDfsUsed();
  }
7056
  /** @return the cluster ID from the fsimage storage. */
  @Override  // NameNodeMXBean
  public String getClusterId() {
    return getFSImage().getStorage().getClusterID();
  }
7061  
  /** @return the block pool ID served by this namesystem. */
  @Override  // NameNodeMXBean
  public String getBlockPoolId() {
    return blockPoolId;
  }
7066  
7067  @Override  // NameNodeMXBean
7068  public String getNameDirStatuses() {
7069    Map<String, Map<File, StorageDirType>> statusMap =
7070      new HashMap<String, Map<File, StorageDirType>>();
7071    
7072    Map<File, StorageDirType> activeDirs = new HashMap<File, StorageDirType>();
7073    for (Iterator<StorageDirectory> it
7074        = getFSImage().getStorage().dirIterator(); it.hasNext();) {
7075      StorageDirectory st = it.next();
7076      activeDirs.put(st.getRoot(), st.getStorageDirType());
7077    }
7078    statusMap.put("active", activeDirs);
7079    
7080    List<Storage.StorageDirectory> removedStorageDirs
7081        = getFSImage().getStorage().getRemovedStorageDirs();
7082    Map<File, StorageDirType> failedDirs = new HashMap<File, StorageDirType>();
7083    for (StorageDirectory st : removedStorageDirs) {
7084      failedDirs.put(st.getRoot(), st.getStorageDirType());
7085    }
7086    statusMap.put("failed", failedDirs);
7087    
7088    return JSON.toString(statusMap);
7089  }
7090
  @Override // NameNodeMXBean
  public String getNodeUsage() {
    // Summarizes DFS-used percentage across all live datanodes as a JSON
    // map {"nodeUsage": {min, median, max, stdDev}}. All values stay 0
    // when there are no live nodes.
    float median = 0;
    float max = 0;
    float min = 0;
    float dev = 0;

    final Map<String, Map<String,Object>> info =
        new HashMap<String, Map<String,Object>>();
    final List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
    blockManager.getDatanodeManager().fetchDatanodes(live, null, true);

    if (live.size() > 0) {
      float totalDfsUsed = 0;
      float[] usages = new float[live.size()];
      int i = 0;
      for (DatanodeDescriptor dn : live) {
        usages[i++] = dn.getDfsUsedPercent();
        totalDfsUsed += dn.getDfsUsedPercent();
      }
      // After this division, totalDfsUsed holds the mean usage percent.
      totalDfsUsed /= live.size();
      Arrays.sort(usages);
      // Note: for an even number of nodes this picks the upper-middle
      // element rather than averaging the two middle elements.
      median = usages[usages.length / 2];
      max = usages[usages.length - 1];
      min = usages[0];

      // Population standard deviation around the mean.
      for (i = 0; i < usages.length; i++) {
        dev += (usages[i] - totalDfsUsed) * (usages[i] - totalDfsUsed);
      }
      dev = (float) Math.sqrt(dev / usages.length);
    }

    final Map<String, Object> innerInfo = new HashMap<String, Object>();
    innerInfo.put("min", StringUtils.format("%.2f%%", min));
    innerInfo.put("median", StringUtils.format("%.2f%%", median));
    innerInfo.put("max", StringUtils.format("%.2f%%", max));
    innerInfo.put("stdDev", StringUtils.format("%.2f%%", dev));
    info.put("nodeUsage", innerInfo);

    return JSON.toString(info);
  }
7132
  @Override  // NameNodeMXBean
  public String getNameJournalStatus() {
    // Reports one JSON entry per configured journal: its manager, the
    // required/disabled flags, and a human-readable stream state.
    List<Map<String, String>> jasList = new ArrayList<Map<String, String>>();
    FSEditLog log = getFSImage().getEditLog();
    if (log != null) {
      // This flag can be false because we cannot hold a lock of FSEditLog
      // for metrics.
      boolean openForWrite = log.isOpenForWriteWithoutLock();
      for (JournalAndStream jas : log.getJournals()) {
        final Map<String, String> jasMap = new HashMap<String, String>();
        String manager = jas.getManager().toString();

        jasMap.put("required", String.valueOf(jas.isRequired()));
        jasMap.put("disabled", String.valueOf(jas.isDisabled()));
        jasMap.put("manager", manager);

        if (jas.isDisabled()) {
          // Journal has been marked failed by the edit log.
          jasMap.put("stream", "Failed");
        } else if (openForWrite) {
          EditLogOutputStream elos = jas.getCurrentStream();
          if (elos != null) {
            jasMap.put("stream", elos.generateReport());
          } else {
            jasMap.put("stream", "not currently writing");
          }
        } else {
          jasMap.put("stream", "open for read");
        }
        jasList.add(jasMap);
      }
    }
    return JSON.toString(jasList);
  }
7166
7167  @Override // NameNodeMxBean
7168  public String getJournalTransactionInfo() {
7169    Map<String, String> txnIdMap = new HashMap<String, String>();
7170    txnIdMap.put("LastAppliedOrWrittenTxId",
7171        Long.toString(this.getFSImage().getLastAppliedOrWrittenTxId()));
7172    txnIdMap.put("MostRecentCheckpointTxId",
7173        Long.toString(this.getFSImage().getMostRecentCheckpointTxId()));
7174    return JSON.toString(txnIdMap);
7175  }
7176  
  /** @return the NameNode start time rendered via Date#toString (MXBean). */
  @Override  // NameNodeMXBean
  public String getNNStarted() {
    return getStartTime().toString();
  }
7181
7182  @Override  // NameNodeMXBean
7183  public String getCompileInfo() {
7184    return VersionInfo.getDate() + " by " + VersionInfo.getUser() +
7185        " from " + VersionInfo.getBranch();
7186  }
7187
  /** @return the block manager for this namesystem. */
  public BlockManager getBlockManager() {
    return blockManager;
  }
7192
  /** @return the block ID manager (block id / generation stamp allocation). */
  public BlockIdManager getBlockIdManager() {
    return blockIdManager;
  }
7196
  /** @return the FSDirectory (in-memory namespace tree). */
  @Override
  public FSDirectory getFSDirectory() {
    return dir;
  }
  /** Set the FSDirectory. Test hook only; not for production use. */
  @VisibleForTesting
  public void setFSDirectory(FSDirectory dir) {
    this.dir = dir;
  }
  /** @return the cache manager (centralized cache directives/pools). */
  public CacheManager getCacheManager() {
    return cacheManager;
  }
7211
7212  @Override  // NameNodeMXBean
7213  public String getCorruptFiles() {
7214    List<String> list = new ArrayList<String>();
7215    Collection<FSNamesystem.CorruptFileBlockInfo> corruptFileBlocks;
7216    try {
7217      corruptFileBlocks = listCorruptFileBlocks("/", null);
7218      int corruptFileCount = corruptFileBlocks.size();
7219      if (corruptFileCount != 0) {
7220        for (FSNamesystem.CorruptFileBlockInfo c : corruptFileBlocks) {
7221          list.add(c.toString());
7222        }
7223      }
7224    } catch (IOException e) {
7225      LOG.warn("Get corrupt file blocks returned error: " + e.getMessage());
7226    }
7227    return JSON.toString(list);
7228  }
7229
  /** @return the number of distinct datanode software versions reported. */
  @Override  //NameNodeMXBean
  public int getDistinctVersionCount() {
    return blockManager.getDatanodeManager().getDatanodesSoftwareVersions()
      .size();
  }
7235
  /** @return a map from datanode software version to datanode count. */
  @Override  //NameNodeMXBean
  public Map<String, Integer> getDistinctVersions() {
    return blockManager.getDatanodeManager().getDatanodesSoftwareVersions();
  }
7240
  /** @return the software version of this NameNode (MXBean). */
  @Override  //NameNodeMXBean
  public String getSoftwareVersion() {
    return VersionInfo.getVersion();
  }
7245
7246  /**
7247   * Verifies that the given identifier and password are valid and match.
7248   * @param identifier Token identifier.
7249   * @param password Password in the token.
7250   */
7251  public synchronized void verifyToken(DelegationTokenIdentifier identifier,
7252      byte[] password) throws InvalidToken, RetriableException {
7253    try {
7254      getDelegationTokenSecretManager().verifyToken(identifier, password);
7255    } catch (InvalidToken it) {
7256      if (inTransitionToActive()) {
7257        throw new RetriableException(it);
7258      }
7259      throw it;
7260    }
7261  }
7262  
  /** @return whether the block's generation stamp is in the future;
   *  delegates to the block ID manager. */
  @Override
  public boolean isGenStampInFuture(Block block) {
    return blockIdManager.isGenStampInFuture(block);
  }
7267
  /** @return the edit log tailer. Test hook only. */
  @VisibleForTesting
  public EditLogTailer getEditLogTailer() {
    return editLogTailer;
  }
7272  
  /** Replace the edit log tailer. Test hook only. */
  @VisibleForTesting
  public void setEditLogTailerForTests(EditLogTailer tailer) {
    this.editLogTailer = tailer;
  }
7277  
  /** Replace the coarse-grained namesystem lock. Test hook only. */
  @VisibleForTesting
  void setFsLockForTests(ReentrantReadWriteLock lock) {
    this.fsLock.coarseLock = lock;
  }
7282  
  /** @return the coarse-grained namesystem lock. Test hook only. */
  @VisibleForTesting
  public ReentrantReadWriteLock getFsLockForTests() {
    return fsLock.coarseLock;
  }
7287  
  /** @return the checkpoint lock. Test hook only. */
  @VisibleForTesting
  public ReentrantLock getCpLockForTests() {
    return cpLock;
  }
7292
  /** @return the current safe mode info (may be null). Test hook only. */
  @VisibleForTesting
  public SafeModeInfo getSafeModeInfoForTests() {
    return safeMode;
  }
7297  
  /** Replace the NameNode resource checker. Test hook only. */
  @VisibleForTesting
  public void setNNResourceChecker(NameNodeResourceChecker nnResourceChecker) {
    this.nnResourceChecker = nnResourceChecker;
  }
7302
  /** @return the snapshot manager for this namesystem. */
  public SnapshotManager getSnapshotManager() {
    return snapshotManager;
  }
7306  
  /**
   * Allow snapshot on a directory.
   * Requires superuser privilege; audited under "allowSnapshot".
   * @param path directory to mark snapshottable
   * @throws IOException on safe mode, permission, or namespace errors
   */
  void allowSnapshot(String path) throws IOException {
    // Check before taking the lock, then re-check under the lock since the
    // operation category state may change while waiting.
    checkOperation(OperationCategory.WRITE);
    final String operationName = "allowSnapshot";
    boolean success = false;
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot allow snapshot for " + path);
      checkSuperuserPrivilege();
      FSDirSnapshotOp.allowSnapshot(dir, snapshotManager, path);
      success = true;
    } finally {
      writeUnlock(operationName);
    }
    // Sync the edit log outside the namesystem lock.
    getEditLog().logSync();
    logAuditEvent(success, operationName, path, null, null);
  }
7325  
  /**
   * Disallow snapshot on a directory.
   * Requires superuser privilege; audited under "disallowSnapshot".
   * @param path directory to mark not snapshottable
   * @throws IOException on safe mode, permission, or namespace errors
   */
  void disallowSnapshot(String path) throws IOException {
    // Same lock + re-check pattern as allowSnapshot.
    checkOperation(OperationCategory.WRITE);
    final String operationName = "disallowSnapshot";
    boolean success = false;
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot disallow snapshot for " + path);
      checkSuperuserPrivilege();
      FSDirSnapshotOp.disallowSnapshot(dir, snapshotManager, path);
      success = true;
    } finally {
      writeUnlock(operationName);
    }
    // Sync the edit log outside the namesystem lock.
    getEditLog().logSync();
    logAuditEvent(success, operationName, path, null, null);
  }
7344  
7345  /**
7346   * Create a snapshot
7347   * @param snapshotRoot The directory path where the snapshot is taken
7348   * @param snapshotName The name of the snapshot
7349   */
7350  String createSnapshot(String snapshotRoot, String snapshotName,
7351                        boolean logRetryCache) throws IOException {
7352    final String operationName = "createSnapshot";
7353    String snapshotPath = null;
7354    writeLock();
7355    try {
7356      checkOperation(OperationCategory.WRITE);
7357      checkNameNodeSafeMode("Cannot create snapshot for " + snapshotRoot);
7358      snapshotPath = FSDirSnapshotOp.createSnapshot(dir,
7359          snapshotManager, snapshotRoot, snapshotName, logRetryCache);
7360    } finally {
7361      writeUnlock(operationName);
7362    }
7363    getEditLog().logSync();
7364    logAuditEvent(snapshotPath != null, operationName, snapshotRoot,
7365        snapshotPath, null);
7366    return snapshotPath;
7367  }
7368  
7369  /**
7370   * Rename a snapshot
7371   * @param path The directory path where the snapshot was taken
7372   * @param snapshotOldName Old snapshot name
7373   * @param snapshotNewName New snapshot name
7374   * @throws SafeModeException
7375   * @throws IOException 
7376   */
7377  void renameSnapshot(
7378      String path, String snapshotOldName, String snapshotNewName,
7379      boolean logRetryCache) throws IOException {
7380    final String operationName = "renameSnapshot";
7381    boolean success = false;
7382    writeLock();
7383    try {
7384      checkOperation(OperationCategory.WRITE);
7385      checkNameNodeSafeMode("Cannot rename snapshot for " + path);
7386      FSDirSnapshotOp.renameSnapshot(dir, snapshotManager, path,
7387          snapshotOldName, snapshotNewName, logRetryCache);
7388      success = true;
7389    } finally {
7390      writeUnlock(operationName);
7391    }
7392    getEditLog().logSync();
7393    String oldSnapshotRoot = Snapshot.getSnapshotPath(path, snapshotOldName);
7394    String newSnapshotRoot = Snapshot.getSnapshotPath(path, snapshotNewName);
7395    logAuditEvent(success, operationName, oldSnapshotRoot,
7396        newSnapshotRoot, null);
7397  }
7398  
7399  /**
7400   * Get the list of snapshottable directories that are owned 
7401   * by the current user. Return all the snapshottable directories if the 
7402   * current user is a super user.
7403   * @return The list of all the current snapshottable directories
7404   * @throws IOException
7405   */
7406  public SnapshottableDirectoryStatus[] getSnapshottableDirListing()
7407      throws IOException {
7408    final String operationName = "listSnapshottableDirectory";
7409    SnapshottableDirectoryStatus[] status = null;
7410    checkOperation(OperationCategory.READ);
7411    boolean success = false;
7412    readLock();
7413    try {
7414      checkOperation(OperationCategory.READ);
7415      status = FSDirSnapshotOp.getSnapshottableDirListing(dir, snapshotManager);
7416      success = true;
7417    } finally {
7418      readUnlock(operationName);
7419    }
7420    logAuditEvent(success, operationName, null, null, null);
7421    return status;
7422  }
7423  
7424  /**
7425   * Get the difference between two snapshots (or between a snapshot and the
7426   * current status) of a snapshottable directory.
7427   * 
7428   * @param path The full path of the snapshottable directory.
7429   * @param fromSnapshot Name of the snapshot to calculate the diff from. Null
7430   *          or empty string indicates the current tree.
7431   * @param toSnapshot Name of the snapshot to calculated the diff to. Null or
7432   *          empty string indicates the current tree.
7433   * @return A report about the difference between {@code fromSnapshot} and 
7434   *         {@code toSnapshot}. Modified/deleted/created/renamed files and 
7435   *         directories belonging to the snapshottable directories are listed 
7436   *         and labeled as M/-/+/R respectively. 
7437   * @throws IOException
7438   */
7439  SnapshotDiffReport getSnapshotDiffReport(String path,
7440      String fromSnapshot, String toSnapshot) throws IOException {
7441    final String operationName = "computeSnapshotDiff";
7442    SnapshotDiffReport diffs = null;
7443    checkOperation(OperationCategory.READ);
7444    readLock();
7445    try {
7446      checkOperation(OperationCategory.READ);
7447      diffs = FSDirSnapshotOp.getSnapshotDiffReport(dir, snapshotManager,
7448          path, fromSnapshot, toSnapshot);
7449    } finally {
7450      readUnlock(operationName);
7451    }
7452
7453    logAuditEvent(diffs != null, operationName, null, null, null);
7454    return diffs;
7455  }
7456  
7457  /**
7458   * Delete a snapshot of a snapshottable directory
7459   * @param snapshotRoot The snapshottable directory
7460   * @param snapshotName The name of the to-be-deleted snapshot
7461   * @throws SafeModeException
7462   * @throws IOException
7463   */
7464  void deleteSnapshot(String snapshotRoot, String snapshotName,
7465      boolean logRetryCache) throws IOException {
7466    final String operationName = "deleteSnapshot";
7467    boolean success = false;
7468    checkOperation(OperationCategory.WRITE);
7469    writeLock();
7470    BlocksMapUpdateInfo blocksToBeDeleted = null;
7471    try {
7472      checkOperation(OperationCategory.WRITE);
7473      checkNameNodeSafeMode("Cannot delete snapshot for " + snapshotRoot);
7474
7475      blocksToBeDeleted = FSDirSnapshotOp.deleteSnapshot(dir, snapshotManager,
7476          snapshotRoot, snapshotName, logRetryCache);
7477      success = true;
7478    } finally {
7479      writeUnlock(operationName);
7480    }
7481    getEditLog().logSync();
7482
7483    // Breaking the pattern as removing blocks have to happen outside of the
7484    // global lock
7485    if (blocksToBeDeleted != null) {
7486      removeBlocks(blocksToBeDeleted);
7487    }
7488
7489    String rootPath = Snapshot.getSnapshotPath(snapshotRoot, snapshotName);
7490    logAuditEvent(success, operationName, rootPath, null, null);
7491  }
7492
7493  /**
7494   * Remove a list of INodeDirectorySnapshottable from the SnapshotManager
7495   * @param toRemove the list of INodeDirectorySnapshottable to be removed
7496   */
7497  void removeSnapshottableDirs(List<INodeDirectory> toRemove) {
7498    if (snapshotManager != null) {
7499      snapshotManager.removeSnapshottable(toRemove);
7500    }
7501  }
7502
  /**
   * Query the status of the current rolling upgrade.
   * Superuser only.
   * @return the rolling upgrade info with a freshly-computed
   *         rollback-image flag, or null if no rolling upgrade is in
   *         progress.
   * @throws IOException if the FSImage check fails
   */
  RollingUpgradeInfo queryRollingUpgrade() throws IOException {
    checkSuperuserPrivilege();
    checkOperation(OperationCategory.READ);
    readLock();
    try {
      checkOperation(OperationCategory.READ);
      if (!isRollingUpgrade()) {
        return null;
      }
      Preconditions.checkNotNull(rollingUpgradeInfo);
      // Refresh the flag from the on-disk image state on every query.
      boolean hasRollbackImage = this.getFSImage().hasRollbackFSImage();
      rollingUpgradeInfo.setCreatedRollbackImages(hasRollbackImage);
      return rollingUpgradeInfo;
    } finally {
      readUnlock("queryRollingUpgrade");
    }
  }
7520
  /**
   * Start a rolling upgrade. Superuser only. Idempotent: if an upgrade is
   * already in progress, its info is returned unchanged.
   * @return the rolling upgrade info
   * @throws IOException on safe mode, upgrade-state, or edit log errors
   */
  RollingUpgradeInfo startRollingUpgrade() throws IOException {
    final String operationName = "startRollingUpgrade";
    checkSuperuserPrivilege();
    checkOperation(OperationCategory.WRITE);
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      if (isRollingUpgrade()) {
        return rollingUpgradeInfo;
      }
      long startTime = now();
      if (!haEnabled) { // for non-HA, we require NN to be in safemode
        startRollingUpgradeInternalForNonHA(startTime);
      } else { // for HA, NN cannot be in safemode
        checkNameNodeSafeMode("Failed to start rolling upgrade");
        startRollingUpgradeInternal(startTime);
      }

      getEditLog().logStartRollingUpgrade(rollingUpgradeInfo.getStartTime());
      if (haEnabled) {
        // roll the edit log to make sure the standby NameNode can tail
        getFSImage().rollEditLog();
      }
    } finally {
      writeUnlock(operationName);
    }

    getEditLog().logSync();
    if (auditLog.isInfoEnabled() && isExternalInvocation()) {
      logAuditEvent(true, operationName, null, null, null);
    }
    return rollingUpgradeInfo;
  }
7554
7555  /**
7556   * Update internal state to indicate that a rolling upgrade is in progress.
7557   * @param startTime rolling upgrade start time
7558   */
7559  void startRollingUpgradeInternal(long startTime)
7560      throws IOException {
7561    checkRollingUpgrade("start rolling upgrade");
7562    getFSImage().checkUpgrade();
7563    setRollingUpgradeInfo(false, startTime);
7564  }
7565
7566  /**
7567   * Update internal state to indicate that a rolling upgrade is in progress for
7568   * non-HA setup. This requires the namesystem is in SafeMode and after doing a
7569   * checkpoint for rollback the namesystem will quit the safemode automatically 
7570   */
7571  private void startRollingUpgradeInternalForNonHA(long startTime)
7572      throws IOException {
7573    Preconditions.checkState(!haEnabled);
7574    if (!isInSafeMode()) {
7575      throw new IOException("Safe mode should be turned ON "
7576          + "in order to create namespace image.");
7577    }
7578    checkRollingUpgrade("start rolling upgrade");
7579    getFSImage().checkUpgrade();
7580    // in non-HA setup, we do an extra checkpoint to generate a rollback image
7581    getFSImage().saveNamespace(this, NameNodeFile.IMAGE_ROLLBACK, null);
7582    LOG.info("Successfully saved namespace for preparing rolling upgrade.");
7583
7584    // leave SafeMode automatically
7585    setSafeMode(SafeModeAction.SAFEMODE_LEAVE);
7586    setRollingUpgradeInfo(true, startTime);
7587  }
7588
  /**
   * Install a fresh rolling upgrade info record for this block pool.
   * @param createdRollbackImages whether a rollback image already exists
   * @param startTime rolling upgrade start time
   */
  void setRollingUpgradeInfo(boolean createdRollbackImages, long startTime) {
    rollingUpgradeInfo = new RollingUpgradeInfo(blockPoolId,
        createdRollbackImages, startTime, 0L);
  }
7593
7594  public void setCreatedRollbackImages(boolean created) {
7595    if (rollingUpgradeInfo != null) {
7596      rollingUpgradeInfo.setCreatedRollbackImages(created);
7597    }
7598  }
7599
  /** @return the current rolling upgrade info, or null if none was set. */
  public RollingUpgradeInfo getRollingUpgradeInfo() {
    return rollingUpgradeInfo;
  }
7603
  /** @return whether a rollback fsimage still needs to be created. */
  public boolean isNeedRollbackFsImage() {
    return needRollbackFsImage;
  }
7607
  /** Set whether a rollback fsimage still needs to be created. */
  public void setNeedRollbackFsImage(boolean needRollbackFsImage) {
    this.needRollbackFsImage = needRollbackFsImage;
  }
7611
  /**
   * @return a bean view of the rolling upgrade status, or null if no
   *         rolling upgrade is in progress (MXBean).
   */
  @Override  // NameNodeMXBean
  public RollingUpgradeInfo.Bean getRollingUpgradeStatus() {
    if (!isRollingUpgrade()) {
      return null;
    }
    // Fast path: if the rollback image is already recorded, avoid taking
    // the read lock.
    RollingUpgradeInfo upgradeInfo = getRollingUpgradeInfo();
    if (upgradeInfo.createdRollbackImages()) {
      return new RollingUpgradeInfo.Bean(upgradeInfo);
    }
    readLock();
    try {
      // check again after acquiring the read lock.
      upgradeInfo = getRollingUpgradeInfo();
      if (upgradeInfo == null) {
        return null;
      }
      if (!upgradeInfo.createdRollbackImages()) {
        // Consult the FSImage for whether a rollback image now exists.
        boolean hasRollbackImage = this.getFSImage().hasRollbackFSImage();
        upgradeInfo.setCreatedRollbackImages(hasRollbackImage);
      }
    } catch (IOException ioe) {
      // Best-effort for metrics: report the info we have.
      LOG.warn("Encountered exception setting Rollback Image", ioe);
    } finally {
      readUnlock("getRollingUpgradeStatus");
    }
    return new RollingUpgradeInfo.Bean(upgradeInfo);
  }
7639
  /** @return true if a rolling upgrade has started and is not finalized. */
  public boolean isRollingUpgrade() {
    return rollingUpgradeInfo != null && !rollingUpgradeInfo.isFinalized();
  }
7644
7645  void checkRollingUpgrade(String action) throws RollingUpgradeException {
7646    if (isRollingUpgrade()) {
7647      throw new RollingUpgradeException("Failed to " + action
7648          + " since a rolling upgrade is already in progress."
7649          + " Existing rolling upgrade info:\n" + rollingUpgradeInfo);
7650    }
7651  }
7652
  /**
   * Finalize the current rolling upgrade: log the finalization, roll the
   * edit log in HA setups, bump the storage version and promote the
   * rollback image to the regular image name. Superuser only.
   * @return the finalized rolling upgrade info, or null if no rolling
   *         upgrade was in progress.
   * @throws IOException on safe mode or storage errors
   */
  RollingUpgradeInfo finalizeRollingUpgrade() throws IOException {
    final String operationName = "finalizeRollingUpgrade";
    checkSuperuserPrivilege();
    checkOperation(OperationCategory.WRITE);
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      if (!isRollingUpgrade()) {
        return null;
      }
      checkNameNodeSafeMode("Failed to finalize rolling upgrade");

      finalizeRollingUpgradeInternal(now());
      getEditLog().logFinalizeRollingUpgrade(rollingUpgradeInfo.getFinalizeTime());
      if (haEnabled) {
        // roll the edit log to make sure the standby NameNode can tail
        getFSImage().rollEditLog();
      }
      getFSImage().updateStorageVersion();
      getFSImage().renameCheckpoint(NameNodeFile.IMAGE_ROLLBACK,
          NameNodeFile.IMAGE);
    } finally {
      writeUnlock(operationName);
    }

    if (!haEnabled) {
      // Sync not needed for ha since the edit was rolled after logging.
      getEditLog().logSync();
    }

    if (auditLog.isInfoEnabled() && isExternalInvocation()) {
      logAuditEvent(true, operationName, null, null, null);
    }
    return rollingUpgradeInfo;
  }
7688
  /** Mark the current rolling upgrade info finalized at the given time. */
  void finalizeRollingUpgradeInternal(long finalizeTime) {
    // Set the finalize time
    rollingUpgradeInfo.finalize(finalizeTime);
  }
7693
  /**
   * Add a cache directive.
   * @param directive the directive to add
   * @param flags if FORCE is absent, wait for a cache rescan first
   * @param logRetryCache whether to record this call in the retry cache
   * @return the id of the new directive, or 0 on failure
   * @throws IOException on safe mode or validation errors
   */
  long addCacheDirective(CacheDirectiveInfo directive,
                         EnumSet<CacheFlag> flags, boolean logRetryCache)
      throws IOException {
    final String operationName = "addCacheDirective";
    CacheDirectiveInfo effectiveDirective = null;
    if (!flags.contains(CacheFlag.FORCE)) {
      cacheManager.waitForRescanIfNeeded();
    }
    checkOperation(OperationCategory.WRITE);
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      if (isInSafeMode()) {
        throw new SafeModeException(
            "Cannot add cache directive", safeMode);
      }
      effectiveDirective = FSNDNCacheOp.addCacheDirective(this, cacheManager,
          directive, flags, logRetryCache);
    } finally {
      writeUnlock(operationName);
      // Sync and audit happen in the finally block, after releasing the
      // lock; success is inferred from whether a directive was produced.
      boolean success = effectiveDirective != null;
      if (success) {
        getEditLog().logSync();
      }

      String effectiveDirectiveStr = effectiveDirective != null ?
          effectiveDirective.toString() : null;
      logAuditEvent(success, operationName, effectiveDirectiveStr,
          null, null);
    }
    return effectiveDirective != null ? effectiveDirective.getId() : 0;
  }
7726
7727  void modifyCacheDirective(CacheDirectiveInfo directive,
7728      EnumSet<CacheFlag> flags, boolean logRetryCache) throws IOException {
7729    final String operationName = "modifyCacheDirective";
7730    boolean success = false;
7731    if (!flags.contains(CacheFlag.FORCE)) {
7732      cacheManager.waitForRescanIfNeeded();
7733    }
7734    checkOperation(OperationCategory.WRITE);
7735    writeLock();
7736    try {
7737      checkOperation(OperationCategory.WRITE);
7738      if (isInSafeMode()) {
7739        throw new SafeModeException(
7740            "Cannot add cache directive", safeMode);
7741      }
7742      FSNDNCacheOp.modifyCacheDirective(this, cacheManager, directive, flags,
7743          logRetryCache);
7744      success = true;
7745    } finally {
7746      writeUnlock(operationName);
7747      if (success) {
7748        getEditLog().logSync();
7749      }
7750      String idStr = "{id: " + directive.getId().toString() + "}";
7751      logAuditEvent(success, "modifyCacheDirective", idStr,
7752          directive.toString(), null);
7753    }
7754  }
7755
  /**
   * Remove a cache directive by id.
   * @param id the directive id to remove
   * @param logRetryCache whether to record this call in the retry cache
   * @throws IOException on safe mode or validation errors
   */
  void removeCacheDirective(long id, boolean logRetryCache) throws IOException {
    final String operationName = "removeCacheDirective";
    boolean success = false;
    checkOperation(OperationCategory.WRITE);
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      if (isInSafeMode()) {
        throw new SafeModeException(
            "Cannot remove cache directives", safeMode);
      }
      FSNDNCacheOp.removeCacheDirective(this, cacheManager, id, logRetryCache);
      success = true;
    } finally {
      writeUnlock(operationName);
      String idStr = "{id: " + Long.toString(id) + "}";
      logAuditEvent(success, operationName, idStr, null,
          null);
    }
    // Sync the edit log outside the lock.
    getEditLog().logSync();
  }
7777
7778  BatchedListEntries<CacheDirectiveEntry> listCacheDirectives(
7779      long startId, CacheDirectiveInfo filter) throws IOException {
7780    final String operationName = "listCacheDirectives";
7781    checkOperation(OperationCategory.READ);
7782    BatchedListEntries<CacheDirectiveEntry> results;
7783    cacheManager.waitForRescanIfNeeded();
7784    readLock();
7785    boolean success = false;
7786    try {
7787      checkOperation(OperationCategory.READ);
7788      results = FSNDNCacheOp.listCacheDirectives(this, cacheManager, startId,
7789          filter);
7790      success = true;
7791    } finally {
7792      readUnlock(operationName);
7793      logAuditEvent(success, operationName, filter.toString(), null,
7794          null);
7795    }
7796    return results;
7797  }
7798
  /**
   * Add a cache pool.
   * @param req the pool to create
   * @param logRetryCache whether to record this call in the retry cache
   * @throws IOException on safe mode, permission, or validation errors
   */
  void addCachePool(CachePoolInfo req, boolean logRetryCache)
      throws IOException {
    final String operationName = "addCachePool";
    checkOperation(OperationCategory.WRITE);
    writeLock();
    boolean success = false;
    String poolInfoStr = null;
    try {
      checkOperation(OperationCategory.WRITE);
      if (isInSafeMode()) {
        throw new SafeModeException(
            "Cannot add cache pool " + req.getPoolName(), safeMode);
      }
      CachePoolInfo info = FSNDNCacheOp.addCachePool(this, cacheManager, req,
          logRetryCache);
      poolInfoStr = info.toString();
      success = true;
    } finally {
      writeUnlock(operationName);
      logAuditEvent(success, operationName, poolInfoStr, null, null);
    }
    // Sync the edit log outside the lock.
    getEditLog().logSync();
  }
7823
  /**
   * Modify an existing cache pool.
   * @param req the pool changes; must carry the pool name
   * @param logRetryCache whether to record this call in the retry cache
   * @throws IOException on safe mode, permission, or validation errors
   */
  void modifyCachePool(CachePoolInfo req, boolean logRetryCache)
      throws IOException {
    final String operationName = "modifyCachePool";
    checkOperation(OperationCategory.WRITE);
    writeLock();
    boolean success = false;
    try {
      checkOperation(OperationCategory.WRITE);
      if (isInSafeMode()) {
        throw new SafeModeException(
            "Cannot modify cache pool " + req.getPoolName(), safeMode);
      }
      FSNDNCacheOp.modifyCachePool(this, cacheManager, req, logRetryCache);
      success = true;
    } finally {
      writeUnlock(operationName);
      // Null-safe audit string: req may be null if validation failed early.
      String poolNameStr = "{poolName: " +
          (req == null ? null : req.getPoolName()) + "}";
      logAuditEvent(success, operationName, poolNameStr,
                    req == null ? null : req.toString(), null);
    }

    // Sync the edit log outside the lock.
    getEditLog().logSync();
  }
7848
  /**
   * Remove a cache pool and all directives in it.
   * @param cachePoolName name of the pool to remove
   * @param logRetryCache whether to record this call in the retry cache
   * @throws IOException on safe mode, permission, or validation errors
   */
  void removeCachePool(String cachePoolName, boolean logRetryCache)
      throws IOException {
    final String operationName = "removeCachePool";
    checkOperation(OperationCategory.WRITE);
    writeLock();
    boolean success = false;
    try {
      checkOperation(OperationCategory.WRITE);
      if (isInSafeMode()) {
        throw new SafeModeException(
            "Cannot remove cache pool " + cachePoolName, safeMode);
      }
      FSNDNCacheOp.removeCachePool(this, cacheManager, cachePoolName,
          logRetryCache);
      success = true;
    } finally {
      writeUnlock(operationName);
      String poolNameStr = "{poolName: " + cachePoolName + "}";
      logAuditEvent(success, operationName, poolNameStr, null, null);
    }
    // Sync the edit log outside the lock.
    getEditLog().logSync();
  }
7872
  /**
   * List cache pools.
   * @param prevKey pool name to resume listing after
   * @return a batch of cache pool entries
   * @throws IOException on listing errors
   */
  BatchedListEntries<CachePoolEntry> listCachePools(String prevKey)
      throws IOException {
    final String operationName = "listCachePools";
    BatchedListEntries<CachePoolEntry> results;
    checkOperation(OperationCategory.READ);
    boolean success = false;
    cacheManager.waitForRescanIfNeeded();
    readLock();
    try {
      checkOperation(OperationCategory.READ);
      results = FSNDNCacheOp.listCachePools(this, cacheManager, prevKey);
      success = true;
    } finally {
      readUnlock(operationName);
      logAuditEvent(success, operationName, null, null, null);
    }
    return results;
  }
7891
  /**
   * Merge the given ACL entries into the ACL of {@code src}.
   * @param src path whose ACL is modified
   * @param aclSpec entries to merge
   * @throws IOException on safe mode, permission, or namespace errors
   */
  void modifyAclEntries(final String src, List<AclEntry> aclSpec)
      throws IOException {
    final String operationName = "modifyAclEntries";
    HdfsFileStatus auditStat = null;
    checkOperation(OperationCategory.WRITE);
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot modify ACL entries on " + src);
      auditStat = FSDirAclOp.modifyAclEntries(dir, src, aclSpec);
    } catch (AccessControlException e) {
      // Permission failures are audited explicitly before rethrowing.
      logAuditEvent(false, operationName, src);
      throw e;
    } finally {
      writeUnlock(operationName);
    }
    getEditLog().logSync();
    logAuditEvent(true, operationName, src, null, auditStat);
  }
7911
  /**
   * Remove the given ACL entries from the ACL of {@code src}.
   * @param src path whose ACL is modified
   * @param aclSpec entries to remove
   * @throws IOException on safe mode, permission, or namespace errors
   */
  void removeAclEntries(final String src, List<AclEntry> aclSpec)
      throws IOException {
    final String operationName = "removeAclEntries";
    checkOperation(OperationCategory.WRITE);
    HdfsFileStatus auditStat = null;
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot remove ACL entries on " + src);
      auditStat = FSDirAclOp.removeAclEntries(dir, src, aclSpec);
    } catch (AccessControlException e) {
      // Permission failures are audited explicitly before rethrowing.
      logAuditEvent(false, operationName, src);
      throw e;
    } finally {
      writeUnlock(operationName);
    }
    getEditLog().logSync();
    logAuditEvent(true, operationName, src, null, auditStat);
  }
7931
  /**
   * Remove all default ACL entries from {@code src}.
   * @param src path whose default ACL is removed
   * @throws IOException on safe mode, permission, or namespace errors
   */
  void removeDefaultAcl(final String src) throws IOException {
    final String operationName = "removeDefaultAcl";
    HdfsFileStatus auditStat = null;
    checkOperation(OperationCategory.WRITE);
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot remove default ACL entries on " + src);
      auditStat = FSDirAclOp.removeDefaultAcl(dir, src);
    } catch (AccessControlException e) {
      // Permission failures are audited explicitly before rethrowing.
      logAuditEvent(false, operationName, src);
      throw e;
    } finally {
      writeUnlock(operationName);
    }
    getEditLog().logSync();
    logAuditEvent(true, operationName, src, null, auditStat);
  }
7950
  /**
   * Remove the entire ACL from {@code src}.
   * @param src path whose ACL is removed
   * @throws IOException on safe mode, permission, or namespace errors
   */
  void removeAcl(final String src) throws IOException {
    final String operationName = "removeAcl";
    HdfsFileStatus auditStat = null;
    checkOperation(OperationCategory.WRITE);
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot remove ACL on " + src);
      auditStat = FSDirAclOp.removeAcl(dir, src);
    } catch (AccessControlException e) {
      // Permission failures are audited explicitly before rethrowing.
      logAuditEvent(false, operationName, src);
      throw e;
    } finally {
      writeUnlock(operationName);
    }
    getEditLog().logSync();
    logAuditEvent(true, operationName, src, null, auditStat);
  }
7969
  /**
   * Replace the ACL of {@code src} with the given entries.
   * @param src path whose ACL is set
   * @param aclSpec the complete new ACL
   * @throws IOException on safe mode, permission, or namespace errors
   */
  void setAcl(final String src, List<AclEntry> aclSpec) throws IOException {
    final String operationName = "setAcl";
    HdfsFileStatus auditStat = null;
    checkOperation(OperationCategory.WRITE);
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot set ACL on " + src);
      auditStat = FSDirAclOp.setAcl(dir, src, aclSpec);
    } catch (AccessControlException e) {
      // Permission failures are audited explicitly before rethrowing.
      logAuditEvent(false, operationName, src);
      throw e;
    } finally {
      writeUnlock(operationName);
    }
    getEditLog().logSync();
    logAuditEvent(true, operationName, src, null, auditStat);
  }
7988
  /**
   * Get the ACL status for the inode at the given path.
   *
   * @param src path to query.
   * @return the {@link AclStatus} of {@code src}.
   * @throws IOException if the path cannot be resolved, access is denied,
   *           or this NameNode cannot currently serve read operations.
   */
  AclStatus getAclStatus(String src) throws IOException {
    final String operationName = "getAclStatus";
    // Fail fast before taking the read lock if this NameNode cannot
    // currently serve read operations.
    checkOperation(OperationCategory.READ);
    boolean success = false;
    readLock();
    try {
      checkOperation(OperationCategory.READ);
      final AclStatus ret = FSDirAclOp.getAclStatus(dir, src);
      success = true;
      return ret;
    } finally {
      readUnlock(operationName);
      // Audited on every path; 'success' remains false if
      // FSDirAclOp.getAclStatus threw.
      logAuditEvent(success, operationName, src);
    }
  }
8004
8005  /**
8006   * Create an encryption zone on directory src using the specified key.
8007   *
8008   * @param src     the path of a directory which will be the root of the
8009   *                encryption zone. The directory must be empty.
8010   * @param keyName name of a key which must be present in the configured
8011   *                KeyProvider.
8012   * @throws AccessControlException  if the caller is not the superuser.
8013   * @throws UnresolvedLinkException if the path can't be resolved.
8014   * @throws SafeModeException       if the Namenode is in safe mode.
8015   */
8016  void createEncryptionZone(final String src, final String keyName,
8017                            boolean logRetryCache)
8018    throws IOException, UnresolvedLinkException,
8019      SafeModeException, AccessControlException {
8020    try {
8021      if (provider == null) {
8022        throw new IOException(
8023            "Can't create an encryption zone for " + src +
8024            " since no key provider is available.");
8025      }
8026      if (keyName == null || keyName.isEmpty()) {
8027        throw new IOException("Must specify a key name when creating an " +
8028            "encryption zone");
8029      }
8030      KeyProvider.Metadata metadata = provider.getMetadata(keyName);
8031      if (metadata == null) {
8032        /*
8033         * It would be nice if we threw something more specific than
8034         * IOException when the key is not found, but the KeyProvider API
8035         * doesn't provide for that. If that API is ever changed to throw
8036         * something more specific (e.g. UnknownKeyException) then we can
8037         * update this to match it, or better yet, just rethrow the
8038         * KeyProvider's exception.
8039         */
8040        throw new IOException("Key " + keyName + " doesn't exist.");
8041      }
8042      // If the provider supports pool for EDEKs, this will fill in the pool
8043      generateEncryptedDataEncryptionKey(keyName);
8044      createEncryptionZoneInt(src, metadata.getCipher(),
8045          keyName, logRetryCache);
8046    } catch (AccessControlException e) {
8047      logAuditEvent(false, "createEncryptionZone", src);
8048      throw e;
8049    }
8050  }
8051
8052  private void createEncryptionZoneInt(final String srcArg, String cipher,
8053      String keyName, final boolean logRetryCache) throws IOException {
8054    final String operationName = "createEncryptionZone";
8055    String src = srcArg;
8056    HdfsFileStatus resultingStat = null;
8057    checkSuperuserPrivilege();
8058    FSPermissionChecker pc = getPermissionChecker();
8059    writeLock();
8060    try {
8061      checkSuperuserPrivilege();
8062      checkOperation(OperationCategory.WRITE);
8063      checkNameNodeSafeMode("Cannot create encryption zone on " + src);
8064      final INodesInPath iip = dir.resolvePathForWrite(pc, src);
8065      src = iip.getPath();
8066
8067      final CipherSuite suite = CipherSuite.convert(cipher);
8068      // For now this is hardcoded, as we only support one method.
8069      final CryptoProtocolVersion version =
8070          CryptoProtocolVersion.ENCRYPTION_ZONES;
8071      final XAttr ezXAttr = dir.createEncryptionZone(src, suite,
8072          version, keyName);
8073      List<XAttr> xAttrs = Lists.newArrayListWithCapacity(1);
8074      xAttrs.add(ezXAttr);
8075      getEditLog().logSetXAttrs(src, xAttrs, logRetryCache);
8076      resultingStat = dir.getAuditFileInfo(iip);
8077    } finally {
8078      writeUnlock(operationName);
8079    }
8080    getEditLog().logSync();
8081    logAuditEvent(true, operationName, srcArg, null, resultingStat);
8082  }
8083
8084  /**
8085   * Get the encryption zone for the specified path.
8086   *
8087   * @param srcArg the path of a file or directory to get the EZ for.
8088   * @return the EZ of the of the path or null if none.
8089   * @throws AccessControlException  if the caller is not the superuser.
8090   * @throws UnresolvedLinkException if the path can't be resolved.
8091   */
8092  EncryptionZone getEZForPath(final String srcArg)
8093    throws AccessControlException, UnresolvedLinkException, IOException {
8094    String src = srcArg;
8095    final String operationName = "getEZForPath";
8096    HdfsFileStatus resultingStat = null;
8097    boolean success = false;
8098    final FSPermissionChecker pc = getPermissionChecker();
8099    checkOperation(OperationCategory.READ);
8100    readLock();
8101    try {
8102      checkOperation(OperationCategory.READ);
8103      INodesInPath iip = dir.resolvePath(pc, src);
8104      if (isPermissionEnabled) {
8105        dir.checkPathAccess(pc, iip, FsAction.READ);
8106      }
8107      final EncryptionZone ret = dir.getEZForPath(iip);
8108      resultingStat = dir.getAuditFileInfo(iip);
8109      success = true;
8110      return ret;
8111    } finally {
8112      readUnlock(operationName);
8113      logAuditEvent(success, operationName, srcArg, null, resultingStat);
8114    }
8115  }
8116
  /**
   * List encryption zones, starting after the zone with the given id.
   * Superuser-only; the privilege is checked both before and after taking
   * the read lock.
   *
   * @param prevId cursor: only zones with id greater than this are returned.
   * @return a batch of encryption zones for paging through the full set.
   * @throws IOException if the caller is not the superuser or this NameNode
   *           cannot currently serve read operations.
   */
  BatchedListEntries<EncryptionZone> listEncryptionZones(long prevId)
      throws IOException {
    final String operationName = "listEncryptionZones";
    boolean success = false;
    // Fail fast before taking the read lock.
    checkSuperuserPrivilege();
    checkOperation(OperationCategory.READ);
    readLock();
    try {
      // Re-check under the lock: state may have changed since the
      // pre-lock checks.
      checkSuperuserPrivilege();
      checkOperation(OperationCategory.READ);
      final BatchedListEntries<EncryptionZone> ret =
          dir.listEncryptionZones(prevId);
      success = true;
      return ret;
    } finally {
      readUnlock(operationName);
      // No path is associated with this operation, hence the null src.
      logAuditEvent(success, operationName, null);
    }
  }
8136
8137  void setXAttr(String src, XAttr xAttr, EnumSet<XAttrSetFlag> flag,
8138                boolean logRetryCache)
8139      throws IOException {
8140    final String operationName = "setXAttr";
8141    HdfsFileStatus auditStat = null;
8142    writeLock();
8143    try {
8144      checkOperation(OperationCategory.WRITE);
8145      checkNameNodeSafeMode("Cannot set XAttr on " + src);
8146      auditStat = FSDirXAttrOp.setXAttr(dir, src, xAttr, flag, logRetryCache);
8147    } catch (AccessControlException e) {
8148      logAuditEvent(false, operationName, src);
8149      throw e;
8150    } finally {
8151      writeUnlock(operationName);
8152    }
8153    getEditLog().logSync();
8154    logAuditEvent(true, operationName, src, null, auditStat);
8155  }
8156
  /**
   * Get the values of the requested extended attributes of a file or
   * directory.
   *
   * @param src path of the inode to query.
   * @param xAttrs the attributes to fetch; semantics of an empty/null list
   *          are delegated to FSDirXAttrOp.
   * @return the matching extended attributes with their values.
   * @throws IOException if access is denied or this NameNode cannot
   *           currently serve read operations.
   */
  List<XAttr> getXAttrs(final String src, List<XAttr> xAttrs)
      throws IOException {
    final String operationName = "getXAttrs";
    // Fail fast before taking the read lock if this NameNode cannot
    // currently serve read operations.
    checkOperation(OperationCategory.READ);
    readLock();
    try {
      checkOperation(OperationCategory.READ);
      // NOTE(review): unlike setXAttr/removeXAttr, no success audit event
      // is emitted for this read — confirm that is intentional.
      return FSDirXAttrOp.getXAttrs(dir, src, xAttrs);
    } catch (AccessControlException e) {
      // Audit the denied attempt before propagating to the caller.
      logAuditEvent(false, operationName, src);
      throw e;
    } finally {
      readUnlock(operationName);
    }
  }
8172
  /**
   * List the names of all extended attributes of a file or directory.
   *
   * @param src path of the inode to query.
   * @return the extended attributes visible to the caller.
   * @throws IOException if access is denied or this NameNode cannot
   *           currently serve read operations.
   */
  List<XAttr> listXAttrs(String src) throws IOException {
    final String operationName = "listXAttrs";
    // Fail fast before taking the read lock if this NameNode cannot
    // currently serve read operations.
    checkOperation(OperationCategory.READ);
    readLock();
    try {
      checkOperation(OperationCategory.READ);
      // NOTE(review): no success audit event is emitted for this read —
      // confirm that is intentional.
      return FSDirXAttrOp.listXAttrs(dir, src);
    } catch (AccessControlException e) {
      // Audit the denied attempt before propagating to the caller.
      logAuditEvent(false, operationName, src);
      throw e;
    } finally {
      readUnlock(operationName);
    }
  }
8187
8188  void removeXAttr(String src, XAttr xAttr, boolean logRetryCache)
8189      throws IOException {
8190    final String operationName = "removeXAttr";
8191    HdfsFileStatus auditStat = null;
8192    writeLock();
8193    try {
8194      checkOperation(OperationCategory.WRITE);
8195      checkNameNodeSafeMode("Cannot remove XAttr entry on " + src);
8196      auditStat = FSDirXAttrOp.removeXAttr(dir, src, xAttr, logRetryCache);
8197    } catch (AccessControlException e) {
8198      logAuditEvent(false, operationName, src);
8199      throw e;
8200    } finally {
8201      writeUnlock(operationName);
8202    }
8203    getEditLog().logSync();
8204    logAuditEvent(true, operationName, src, null, auditStat);
8205  }
8206
  /**
   * Check whether the current caller has the given access on a path.
   * Returns normally if access is allowed; otherwise throws.
   *
   * @param src path to check.
   * @param mode the access mode (read/write/execute) being verified.
   * @throws FileNotFoundException if the path does not resolve to an inode.
   * @throws AccessControlException if permission checking is enabled and
   *           the caller lacks the requested access.
   * @throws IOException if this NameNode cannot currently serve reads.
   */
  void checkAccess(String src, FsAction mode) throws IOException {
    final String operationName = "checkAccess";
    // Fail fast before taking the read lock if this NameNode cannot
    // currently serve read operations.
    checkOperation(OperationCategory.READ);
    FSPermissionChecker pc = getPermissionChecker();
    readLock();
    try {
      checkOperation(OperationCategory.READ);
      final INodesInPath iip = dir.resolvePath(pc, src);
      // Use the resolved path from here on (also in the audit message).
      src = iip.getPath();
      INode inode = iip.getLastINode();
      if (inode == null) {
        throw new FileNotFoundException("Path not found");
      }
      if (isPermissionEnabled) {
        dir.checkPathAccess(pc, iip, mode);
      }
    } catch (AccessControlException e) {
      // Audit the denied attempt before propagating to the caller.
      logAuditEvent(false, operationName, src);
      throw e;
    } finally {
      readUnlock(operationName);
    }
  }
8230
8231  /**
8232   * Default AuditLogger implementation; used when no access logger is
8233   * defined in the config file. It can also be explicitly listed in the
8234   * config file.
8235   */
8236  private static class DefaultAuditLogger extends HdfsAuditLogger {
8237
8238    private boolean logTokenTrackingId;
8239
8240    @Override
8241    public void initialize(Configuration conf) {
8242      logTokenTrackingId = conf.getBoolean(
8243          DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_KEY,
8244          DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_DEFAULT);
8245    }
8246
8247    @Override
8248    public void logAuditEvent(boolean succeeded, String userName,
8249        InetAddress addr, String cmd, String src, String dst,
8250        FileStatus status, UserGroupInformation ugi,
8251        DelegationTokenSecretManager dtSecretManager) {
8252      if (auditLog.isInfoEnabled()) {
8253        final StringBuilder sb = auditBuffer.get();
8254        sb.setLength(0);
8255        sb.append("allowed=").append(succeeded).append("\t");
8256        sb.append("ugi=").append(userName).append("\t");
8257        sb.append("ip=").append(addr).append("\t");
8258        sb.append("cmd=").append(cmd).append("\t");
8259        sb.append("src=").append(src).append("\t");
8260        sb.append("dst=").append(dst).append("\t");
8261        if (null == status) {
8262          sb.append("perm=null");
8263        } else {
8264          sb.append("perm=");
8265          sb.append(status.getOwner()).append(":");
8266          sb.append(status.getGroup()).append(":");
8267          sb.append(status.getPermission());
8268        }
8269        if (logTokenTrackingId) {
8270          sb.append("\t").append("trackingId=");
8271          String trackingId = null;
8272          if (ugi != null && dtSecretManager != null
8273              && ugi.getAuthenticationMethod() == AuthenticationMethod.TOKEN) {
8274            for (TokenIdentifier tid: ugi.getTokenIdentifiers()) {
8275              if (tid instanceof DelegationTokenIdentifier) {
8276                DelegationTokenIdentifier dtid =
8277                    (DelegationTokenIdentifier)tid;
8278                trackingId = dtSecretManager.getTokenTrackingId(dtid);
8279                break;
8280              }
8281            }
8282          }
8283          sb.append(trackingId);
8284        }
8285        sb.append("\t").append("proto=");
8286        sb.append(NamenodeWebHdfsMethods.isWebHdfsInvocation() ? "webhdfs" : "rpc");
8287        logAuditMessage(sb.toString());
8288      }
8289    }
8290
8291    public void logAuditMessage(String message) {
8292      auditLog.info(message);
8293    }
8294  }
8295
8296  private static void enableAsyncAuditLog() {
8297    if (!(auditLog instanceof Log4JLogger)) {
8298      LOG.warn("Log4j is required to enable async auditlog");
8299      return;
8300    }
8301    Logger logger = ((Log4JLogger)auditLog).getLogger();
8302    @SuppressWarnings("unchecked")
8303    List<Appender> appenders = Collections.list(logger.getAllAppenders());
8304    // failsafe against trying to async it more than once
8305    if (!appenders.isEmpty() && !(appenders.get(0) instanceof AsyncAppender)) {
8306      AsyncAppender asyncAppender = new AsyncAppender();
8307      // change logger to have an async appender containing all the
8308      // previously configured appenders
8309      for (Appender appender : appenders) {
8310        logger.removeAppender(appender);
8311        asyncAppender.addAppender(appender);
8312      }
8313      logger.addAppender(asyncAppender);        
8314    }
8315  }
8316
8317}
8318