/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.qjournal.server;

import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.net.URL;
import java.security.PrivilegedExceptionAction;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.TimeUnit;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.apache.hadoop.hdfs.qjournal.protocol.JournalNotFormattedException;
import org.apache.hadoop.hdfs.qjournal.protocol.JournalOutOfSyncException;
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocol;
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos;
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.NewEpochResponseProto;
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.PersistedRecoveryPaxosData;
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.PrepareRecoveryResponseProto;
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.SegmentStateProto;
import org.apache.hadoop.hdfs.qjournal.protocol.RequestInfo;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption;
import org.apache.hadoop.hdfs.server.common.StorageErrorReporter;
import org.apache.hadoop.hdfs.server.common.StorageInfo;
import org.apache.hadoop.hdfs.server.namenode.EditLogOutputStream;
import org.apache.hadoop.hdfs.server.namenode.FileJournalManager;
import org.apache.hadoop.hdfs.server.namenode.FileJournalManager.EditLogFile;
import org.apache.hadoop.hdfs.server.namenode.JournalManager;
import org.apache.hadoop.hdfs.server.namenode.TransferFsImage;
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
import org.apache.hadoop.hdfs.server.protocol.RemoteEditLog;
import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest;
import org.apache.hadoop.hdfs.util.AtomicFileOutputStream;
import org.apache.hadoop.hdfs.util.BestEffortLongFile;
import org.apache.hadoop.hdfs.util.PersistentLongFile;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.ipc.Server;
import org.apache.hadoop.security.SecurityUtil;
import org.apache.hadoop.security.UserGroupInformation;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Charsets;
import com.google.common.base.Preconditions;
import com.google.common.base.Stopwatch;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Range;
import com.google.common.collect.Ranges;
import com.google.protobuf.TextFormat;

/**
 * A JournalNode can manage journals for several clusters at once.
 * Each such journal is entirely independent despite being hosted by
 * the same JVM.
 */
public class Journal implements Closeable {
  static final Log LOG = LogFactory.getLog(Journal.class);


  // Current writing state
  private EditLogOutputStream curSegment;
  private long curSegmentTxId = HdfsConstants.INVALID_TXID;
  private long nextTxId = HdfsConstants.INVALID_TXID;
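  // Highest transaction ID that has been written to disk by this node,
  // either through journal() or by synchronizing a segment during recovery.
  // Used for lag metrics and for reporting the end of an in-progress segment.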
  private long highestWrittenTxId = 0;

  private final String journalId;

  private final JNStorage storage;

  /**
   * When a new writer comes along, it asks each node to promise
   * to ignore requests from any previous writer, as identified
   * by epoch number. In order to make such a promise, the epoch
   * number of that writer is stored persistently on disk.
   */
  private PersistentLongFile lastPromisedEpoch;

  /**
   * Each IPC that comes from a given client contains a serial number
   * which only increases from the client's perspective. Whenever
   * we switch epochs, we reset this back to -1. Whenever an IPC
   * comes from a client, we ensure that it is strictly higher
   * than any previous IPC. This guards against any bugs in the IPC
   * layer that would re-order IPCs or cause a stale retry from an old
   * request to resurface and confuse things.
   */
  private long currentEpochIpcSerial = -1;

  /**
   * The epoch number of the last writer to actually write a transaction.
   * This is used to differentiate log segments after a crash at the very
   * beginning of a segment. See the 'testNewerVersionOfSegmentWins'
   * test case.
   */
  private PersistentLongFile lastWriterEpoch;

  /**
   * Lower-bound on the last committed transaction ID. This is not
   * depended upon for correctness, but acts as a sanity check
   * during the recovery procedures, and as a visibility mark
   * for clients reading in-progress logs.
   */
  private BestEffortLongFile committedTxnId;

  public static final String LAST_PROMISED_FILENAME = "last-promised-epoch";
  public static final String LAST_WRITER_EPOCH = "last-writer-epoch";
  private static final String COMMITTED_TXID_FILENAME = "committed-txid";

  private final FileJournalManager fjm;

  private final JournalMetrics metrics;

  /**
   * Time threshold for sync calls, beyond which a warning should be logged to the console.
   */
  private static final int WARN_SYNC_MILLIS_THRESHOLD = 1000;

  Journal(Configuration conf, File logDir, String journalId,
      StartupOption startOpt, StorageErrorReporter errorReporter)
      throws IOException {
    storage = new JNStorage(conf, logDir, startOpt, errorReporter);
    this.journalId = journalId;

    refreshCachedData();

    this.fjm = storage.getJournalManager();

    this.metrics = JournalMetrics.create(this);

    EditLogFile latest = scanStorageForLatestEdits();
    if (latest != null) {
      highestWrittenTxId = latest.getLastTxId();
    }
  }

  /**
   * Reload any data that may have been cached. This is necessary
   * when we first load the Journal, but also after any formatting
   * operation, since the cached data is no longer relevant.
   */
  private synchronized void refreshCachedData() {
    IOUtils.closeStream(committedTxnId);

    File currentDir = storage.getSingularStorageDir().getCurrentDir();
    this.lastPromisedEpoch = new PersistentLongFile(
        new File(currentDir, LAST_PROMISED_FILENAME), 0);
    this.lastWriterEpoch = new PersistentLongFile(
        new File(currentDir, LAST_WRITER_EPOCH), 0);
    this.committedTxnId = new BestEffortLongFile(
        new File(currentDir, COMMITTED_TXID_FILENAME),
        HdfsConstants.INVALID_TXID);
  }

  /**
   * Scan the local storage directory, and return the segment containing
   * the highest transaction.
   * @return the EditLogFile with the highest transactions, or null
   * if no files exist.
   */
  private synchronized EditLogFile scanStorageForLatestEdits() throws IOException {
    if (!fjm.getStorageDirectory().getCurrentDir().exists()) {
      return null;
    }

    LOG.info("Scanning storage " + fjm);
    List<EditLogFile> files = fjm.getLogFiles(0);

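    // Walk backwards from the most recent segment, setting aside any empty
    // in-progress files until a segment that contains transactions is found.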
    while (!files.isEmpty()) {
      EditLogFile latestLog = files.remove(files.size() - 1);
      latestLog.scanLog();
      LOG.info("Latest log is " + latestLog);
      if (latestLog.getLastTxId() == HdfsConstants.INVALID_TXID) {
        // the log contains no transactions
        LOG.warn("Latest log " + latestLog + " has no transactions. " +
            "moving it aside and looking for previous log");
        latestLog.moveAsideEmptyFile();
      } else {
        return latestLog;
      }
    }

    LOG.info("No files in " + fjm);
    return null;
  }

  /**
   * Format the local storage with the given namespace.
   */
  void format(NamespaceInfo nsInfo) throws IOException {
    Preconditions.checkState(nsInfo.getNamespaceID() != 0,
        "can't format with uninitialized namespace info: %s",
        nsInfo);
    LOG.info("Formatting " + this + " with namespace info: " +
        nsInfo);
    storage.format(nsInfo);
    refreshCachedData();
  }

  /**
   * Unlock and release resources.
   */
  @Override // Closeable
  public void close() throws IOException {
    storage.close();
    IOUtils.closeStream(committedTxnId);
    IOUtils.closeStream(curSegment);
  }

  JNStorage getStorage() {
    return storage;
  }

  String getJournalId() {
    return journalId;
  }

  /**
   * @return the last epoch which this node has promised not to accept
   * any lower epoch, or 0 if no promises have been made.
   */
  synchronized long getLastPromisedEpoch() throws IOException {
    checkFormatted();
    return lastPromisedEpoch.get();
  }

  synchronized public long getLastWriterEpoch() throws IOException {
    checkFormatted();
    return lastWriterEpoch.get();
  }

  synchronized long getCommittedTxnIdForTests() throws IOException {
    return committedTxnId.get();
  }

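  /**
   * @return an estimate of how many quorum-committed transactions this node
   * has not yet written to disk, or 0 if nothing has been committed yet.
   */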
  synchronized long getCurrentLagTxns() throws IOException {
    long committed = committedTxnId.get();
    if (committed == 0) {
      return 0;
    }

    return Math.max(committed - highestWrittenTxId, 0L);
  }

  synchronized long getHighestWrittenTxId() {
    return highestWrittenTxId;
  }

  @VisibleForTesting
  JournalMetrics getMetricsForTests() {
    return metrics;
  }

  /**
   * Try to create a new epoch for this journal.
   * @param nsInfo the namespace, which is verified for consistency or used to
   * format, if the Journal has not yet been written to.
   * @param epoch the epoch to start
   * @return the status information necessary to begin recovery
   * @throws IOException if the node has already made a promise to another
   * writer with a higher epoch number, if the namespace is inconsistent,
   * or if a disk error occurs.
   */
  synchronized NewEpochResponseProto newEpoch(
      NamespaceInfo nsInfo, long epoch) throws IOException {

    checkFormatted();
    storage.checkConsistentNamespace(nsInfo);

    // Check that the new epoch being proposed is in fact newer than
    // any other that we've promised.
    if (epoch <= getLastPromisedEpoch()) {
      throw new IOException("Proposed epoch " + epoch + " <= last promise " +
          getLastPromisedEpoch());
    }

    updateLastPromisedEpoch(epoch);
    abortCurSegment();

    NewEpochResponseProto.Builder builder =
        NewEpochResponseProto.newBuilder();

    EditLogFile latestFile = scanStorageForLatestEdits();

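    // Report the first txid of the most recent segment (if any) so the new
    // writer knows which segment may need recovery.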
    if (latestFile != null) {
      builder.setLastSegmentTxId(latestFile.getFirstTxId());
    }

    return builder.build();
  }

  private void updateLastPromisedEpoch(long newEpoch) throws IOException {
    LOG.info("Updating lastPromisedEpoch from " + lastPromisedEpoch.get() +
        " to " + newEpoch + " for client " + Server.getRemoteIp());
    lastPromisedEpoch.set(newEpoch);

    // Since we have a new writer, reset the IPC serial - it will start
    // counting again from 0 for this writer.
    currentEpochIpcSerial = -1;
  }

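  /**
   * Abort the currently open log segment, if any, and reset the
   * current-segment bookkeeping so that no further writes are accepted
   * until a new segment is started.
   */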
  private void abortCurSegment() throws IOException {
    if (curSegment == null) {
      return;
    }

    curSegment.abort();
    curSegment = null;
    curSegmentTxId = HdfsConstants.INVALID_TXID;
  }

  /**
   * Write a batch of edits to the journal.
   * {@link QJournalProtocol#journal(RequestInfo, long, long, int, byte[])}
   */
  synchronized void journal(RequestInfo reqInfo,
      long segmentTxId, long firstTxnId,
      int numTxns, byte[] records) throws IOException {
    checkFormatted();
    checkWriteRequest(reqInfo);

    checkSync(curSegment != null,
        "Can't write, no segment open");

    if (curSegmentTxId != segmentTxId) {
      // Sanity check: it is possible that the writer will fail IPCs
      // on both the finalize() and then the start() of the next segment.
      // This could cause us to continue writing to an old segment
      // instead of rolling to a new one, which breaks one of the
      // invariants in the design. If it happens, abort the segment
      // and throw an exception.
      JournalOutOfSyncException e = new JournalOutOfSyncException(
          "Writer out of sync: it thinks it is writing segment " + segmentTxId
          + " but current segment is " + curSegmentTxId);
      abortCurSegment();
      throw e;
    }

    checkSync(nextTxId == firstTxnId,
        "Can't write txid " + firstTxnId + " expecting nextTxId=" + nextTxId);

    long lastTxnId = firstTxnId + numTxns - 1;
    if (LOG.isTraceEnabled()) {
      LOG.trace("Writing txid " + firstTxnId + "-" + lastTxnId);
    }

    // If the edit has already been marked as committed, we know
    // it has been fsynced on a quorum of other nodes, and we are
    // "catching up" with the rest. Hence we do not need to fsync.
    boolean isLagging = lastTxnId <= committedTxnId.get();
    boolean shouldFsync = !isLagging;

    curSegment.writeRaw(records, 0, records.length);
    curSegment.setReadyToFlush();
    Stopwatch sw = new Stopwatch();
    sw.start();
    curSegment.flush(shouldFsync);
    sw.stop();

    metrics.addSync(sw.elapsedTime(TimeUnit.MICROSECONDS));
    if (sw.elapsedTime(TimeUnit.MILLISECONDS) > WARN_SYNC_MILLIS_THRESHOLD) {
      LOG.warn("Sync of transaction range " + firstTxnId + "-" + lastTxnId +
          " took " + sw.elapsedTime(TimeUnit.MILLISECONDS) + "ms");
    }

    if (isLagging) {
      // This batch of edits has already been committed on a quorum of other
      // nodes. So, we are in "catch up" mode. This gets its own metric.
      metrics.batchesWrittenWhileLagging.incr(1);
    }

    metrics.batchesWritten.incr(1);
    metrics.bytesWritten.incr(records.length);
    metrics.txnsWritten.incr(numTxns);

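    // Advance the write bookkeeping only after the flush has succeeded.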
    highestWrittenTxId = lastTxnId;
    nextTxId = lastTxnId + 1;
  }

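  /**
   * Handle a heartbeat from the writer. No transactions are carried, but the
   * standard request checks (epoch promise, IPC ordering, committed txid
   * update) are still applied.
   */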
  public void heartbeat(RequestInfo reqInfo) throws IOException {
    checkRequest(reqInfo);
  }

  /**
   * Ensure that the given request is coming from the correct writer and in-order.
   * @param reqInfo the request info
   * @throws IOException if the request is invalid.
   */
  private synchronized void checkRequest(RequestInfo reqInfo) throws IOException {
    // Invariant 25 from ZAB paper
    if (reqInfo.getEpoch() < lastPromisedEpoch.get()) {
      throw new IOException("IPC's epoch " + reqInfo.getEpoch() +
          " is less than the last promised epoch " +
          lastPromisedEpoch.get());
    } else if (reqInfo.getEpoch() > lastPromisedEpoch.get()) {
      // A newer client has arrived. Fence any previous writers by updating
      // the promise.
      updateLastPromisedEpoch(reqInfo.getEpoch());
    }

    // Ensure that the IPCs are arriving in-order as expected.
    checkSync(reqInfo.getIpcSerialNumber() > currentEpochIpcSerial,
        "IPC serial %s from client %s was not higher than prior highest " +
        "IPC serial %s", reqInfo.getIpcSerialNumber(),
        Server.getRemoteIp(),
        currentEpochIpcSerial);
    currentEpochIpcSerial = reqInfo.getIpcSerialNumber();

    if (reqInfo.hasCommittedTxId()) {
      Preconditions.checkArgument(
          reqInfo.getCommittedTxId() >= committedTxnId.get(),
          "Client trying to move committed txid backward from " +
          committedTxnId.get() + " to " + reqInfo.getCommittedTxId());

      committedTxnId.set(reqInfo.getCommittedTxId());
    }
  }

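  /**
   * In addition to the checks performed by {@link #checkRequest(RequestInfo)},
   * ensure that the request's epoch matches the epoch of the writer that most
   * recently started a log segment on this node.
   */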
  private synchronized void checkWriteRequest(RequestInfo reqInfo) throws IOException {
    checkRequest(reqInfo);

    if (reqInfo.getEpoch() != lastWriterEpoch.get()) {
      throw new IOException("IPC's epoch " + reqInfo.getEpoch() +
          " is not the current writer epoch " +
          lastWriterEpoch.get());
    }
  }

  public synchronized boolean isFormatted() {
    return storage.isFormatted();
  }

  private void checkFormatted() throws JournalNotFormattedException {
    if (!isFormatted()) {
      throw new JournalNotFormattedException("Journal " +
          storage.getSingularStorageDir() + " not formatted");
    }
  }

  /**
   * @throws JournalOutOfSyncException if the given expression is not true.
   * The message of the exception is formatted using the 'msg' and
   * 'formatArgs' parameters.
   */
  private void checkSync(boolean expression, String msg,
      Object... formatArgs) throws JournalOutOfSyncException {
    if (!expression) {
      throw new JournalOutOfSyncException(String.format(msg, formatArgs));
    }
  }

  /**
   * @throws AssertionError if the given expression is not true.
   * The message of the exception is formatted using the 'msg' and
   * 'formatArgs' parameters.
   *
   * This should be used in preference to Java's built-in assert in
   * non-performance-critical paths, where a failure of this invariant
   * might cause the protocol to lose data.
   */
  private void alwaysAssert(boolean expression, String msg,
      Object... formatArgs) {
    if (!expression) {
      throw new AssertionError(String.format(msg, formatArgs));
    }
  }

  /**
   * Start a new segment at the given txid. The previous segment
   * must have already been finalized.
   */
  public synchronized void startLogSegment(RequestInfo reqInfo, long txid,
      int layoutVersion) throws IOException {
    assert fjm != null;
    checkFormatted();
    checkRequest(reqInfo);

    if (curSegment != null) {
      LOG.warn("Client is requesting a new log segment " + txid +
          " though we are already writing " + curSegment + ". " +
          "Aborting the current segment in order to begin the new one.");
      // The writer may have lost a connection to us and is now
      // re-connecting after the connection came back.
      // We should abort our own old segment.
      abortCurSegment();
    }

    // Paranoid sanity check: we should never overwrite a finalized log file.
    // Additionally, if it's in-progress, it should have at most 1 transaction.
    // This can happen if the writer crashes exactly at the start of a segment.
    EditLogFile existing = fjm.getLogFile(txid);
    if (existing != null) {
      if (!existing.isInProgress()) {
        throw new IllegalStateException("Already have a finalized segment " +
            existing + " beginning at " + txid);
      }

      // If it's in-progress, it should only contain one transaction,
      // because the "startLogSegment" transaction is written alone at the
      // start of each segment.
      existing.scanLog();
      if (existing.getLastTxId() != existing.getFirstTxId()) {
        throw new IllegalStateException("The log file " +
            existing + " seems to contain valid transactions");
      }
    }

    long curLastWriterEpoch = lastWriterEpoch.get();
    if (curLastWriterEpoch != reqInfo.getEpoch()) {
      LOG.info("Updating lastWriterEpoch from " + curLastWriterEpoch +
          " to " + reqInfo.getEpoch() + " for client " +
          Server.getRemoteIp());
      lastWriterEpoch.set(reqInfo.getEpoch());
    }

    // The fact that we are starting a segment at this txid indicates
    // that any previous recovery for this same segment was aborted.
    // Otherwise, no writer would have started writing. So, we can
    // remove the record of the older segment here.
    purgePaxosDecision(txid);

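    // Open the new segment and expect the next transaction to be its first.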
    curSegment = fjm.startLogSegment(txid, layoutVersion);
    curSegmentTxId = txid;
    nextTxId = txid;
  }

  /**
   * Finalize the log segment at the given transaction ID.
   */
  public synchronized void finalizeLogSegment(RequestInfo reqInfo, long startTxId,
      long endTxId) throws IOException {
    checkFormatted();
    checkRequest(reqInfo);

    boolean needsValidation = true;

    // Finalizing the log that the writer was just writing.
    if (startTxId == curSegmentTxId) {
      if (curSegment != null) {
        curSegment.close();
        curSegment = null;
        curSegmentTxId = HdfsConstants.INVALID_TXID;
      }

      checkSync(nextTxId == endTxId + 1,
          "Trying to finalize in-progress log segment %s to end at " +
          "txid %s but only written up to txid %s",
          startTxId, endTxId, nextTxId - 1);
      // No need to validate the edit log if the client is finalizing
      // the log segment that it was just writing to.
      needsValidation = false;
    }

    FileJournalManager.EditLogFile elf = fjm.getLogFile(startTxId);
    if (elf == null) {
      throw new JournalOutOfSyncException("No log file to finalize at " +
          "transaction ID " + startTxId);
    }

    if (elf.isInProgress()) {
      if (needsValidation) {
        LOG.info("Validating log segment " + elf.getFile() + " about to be " +
            "finalized");
        elf.scanLog();

        checkSync(elf.getLastTxId() == endTxId,
            "Trying to finalize in-progress log segment %s to end at " +
            "txid %s but log %s on disk only contains up to txid %s",
            startTxId, endTxId, elf.getFile(), elf.getLastTxId());
      }
      fjm.finalizeLogSegment(startTxId, endTxId);
    } else {
      Preconditions.checkArgument(endTxId == elf.getLastTxId(),
          "Trying to re-finalize already finalized log " +
          elf + " with different endTxId " + endTxId);
    }

    // Once logs are finalized, a different length will never be decided.
    // During recovery, we treat a finalized segment the same as an accepted
    // recovery. Thus, we no longer need to keep track of the previously-
    // accepted decision. The existence of the finalized log segment is enough.
    purgePaxosDecision(elf.getFirstTxId());
  }

  /**
   * @see JournalManager#purgeLogsOlderThan(long)
   */
  public synchronized void purgeLogsOlderThan(RequestInfo reqInfo,
      long minTxIdToKeep) throws IOException {
    checkFormatted();
    checkRequest(reqInfo);

    storage.purgeDataOlderThan(minTxIdToKeep);
  }

  /**
   * Remove the previously-recorded 'accepted recovery' information
   * for a given log segment, once it is no longer necessary.
   * @param segmentTxId the transaction ID to purge
   * @throws IOException if the file could not be deleted
   */
  private void purgePaxosDecision(long segmentTxId) throws IOException {
    File paxosFile = storage.getPaxosFile(segmentTxId);
    if (paxosFile.exists()) {
      if (!paxosFile.delete()) {
        throw new IOException("Unable to delete paxos file " + paxosFile);
      }
    }
  }

  /**
   * @see QJournalProtocol#getEditLogManifest(String, long, boolean)
   */
  public RemoteEditLogManifest getEditLogManifest(long sinceTxId,
      boolean inProgressOk) throws IOException {
    // No need to checkRequest() here - anyone may ask for the list
    // of segments.
    checkFormatted();

    List<RemoteEditLog> logs = fjm.getRemoteEditLogs(sinceTxId, inProgressOk);

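    // If an in-progress segment was returned, re-add it with its end txid set
    // to the highest txid actually written, so readers see how far the
    // in-progress segment really extends.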
    if (inProgressOk) {
      RemoteEditLog log = null;
      for (Iterator<RemoteEditLog> iter = logs.iterator(); iter.hasNext();) {
        log = iter.next();
        if (log.isInProgress()) {
          iter.remove();
          break;
        }
      }
      if (log != null && log.isInProgress()) {
        logs.add(new RemoteEditLog(log.getStartTxId(), getHighestWrittenTxId(),
            true));
      }
    }

    return new RemoteEditLogManifest(logs);
  }

  /**
   * @return the current state of the given segment, or null if the
   * segment does not exist.
   */
  @VisibleForTesting
  SegmentStateProto getSegmentInfo(long segmentTxId)
      throws IOException {
    EditLogFile elf = fjm.getLogFile(segmentTxId);
    if (elf == null) {
      return null;
    }
    if (elf.isInProgress()) {
      elf.scanLog();
    }
    if (elf.getLastTxId() == HdfsConstants.INVALID_TXID) {
      LOG.info("Edit log file " + elf + " appears to be empty. " +
          "Moving it aside...");
      elf.moveAsideEmptyFile();
      return null;
    }
    SegmentStateProto ret = SegmentStateProto.newBuilder()
        .setStartTxId(segmentTxId)
        .setEndTxId(elf.getLastTxId())
        .setIsInProgress(elf.isInProgress())
        .build();
    LOG.info("getSegmentInfo(" + segmentTxId + "): " + elf + " -> " +
        TextFormat.shortDebugString(ret));
    return ret;
  }

  /**
   * @see QJournalProtocol#prepareRecovery(RequestInfo, long)
   */
  public synchronized PrepareRecoveryResponseProto prepareRecovery(
      RequestInfo reqInfo, long segmentTxId) throws IOException {
    checkFormatted();
    checkRequest(reqInfo);

    abortCurSegment();

    PrepareRecoveryResponseProto.Builder builder =
        PrepareRecoveryResponseProto.newBuilder();

    PersistedRecoveryPaxosData previouslyAccepted = getPersistedPaxosData(segmentTxId);
    completeHalfDoneAcceptRecovery(previouslyAccepted);

    SegmentStateProto segInfo = getSegmentInfo(segmentTxId);
    boolean hasFinalizedSegment = segInfo != null && !segInfo.getIsInProgress();

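    // A previously accepted recovery proposal takes precedence over whatever
    // is on disk, unless the segment has since been finalized; a finalized
    // segment supersedes any earlier Paxos decision.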
    if (previouslyAccepted != null && !hasFinalizedSegment) {
      SegmentStateProto acceptedState = previouslyAccepted.getSegmentState();
      assert acceptedState.getEndTxId() == segInfo.getEndTxId() :
          "prev accepted: " + TextFormat.shortDebugString(previouslyAccepted) + "\n" +
          "on disk: " + TextFormat.shortDebugString(segInfo);

      builder.setAcceptedInEpoch(previouslyAccepted.getAcceptedInEpoch())
          .setSegmentState(previouslyAccepted.getSegmentState());
    } else {
      if (segInfo != null) {
        builder.setSegmentState(segInfo);
      }
    }

    builder.setLastWriterEpoch(lastWriterEpoch.get());
    if (committedTxnId.get() != HdfsConstants.INVALID_TXID) {
      builder.setLastCommittedTxId(committedTxnId.get());
    }

    PrepareRecoveryResponseProto resp = builder.build();
    LOG.info("Prepared recovery for segment " + segmentTxId + ": " +
        TextFormat.shortDebugString(resp));
    return resp;
  }

  /**
   * @see QJournalProtocol#acceptRecovery(RequestInfo, QJournalProtocolProtos.SegmentStateProto, URL)
   */
  public synchronized void acceptRecovery(RequestInfo reqInfo,
      SegmentStateProto segment, URL fromUrl)
      throws IOException {
    checkFormatted();
    checkRequest(reqInfo);

    abortCurSegment();

    long segmentTxId = segment.getStartTxId();

    // Basic sanity checks that the segment is well-formed and contains
    // at least one transaction.
    Preconditions.checkArgument(segment.getEndTxId() > 0 &&
        segment.getEndTxId() >= segmentTxId,
        "bad recovery state for segment %s: %s",
        segmentTxId, TextFormat.shortDebugString(segment));

    PersistedRecoveryPaxosData oldData = getPersistedPaxosData(segmentTxId);
    PersistedRecoveryPaxosData newData = PersistedRecoveryPaxosData.newBuilder()
        .setAcceptedInEpoch(reqInfo.getEpoch())
        .setSegmentState(segment)
        .build();

    // If we previously acted on acceptRecovery() from a higher-numbered writer,
    // this call is out of sync. We should never actually trigger this, since the
    // checkRequest() call above should filter non-increasing epoch numbers.
    if (oldData != null) {
      alwaysAssert(oldData.getAcceptedInEpoch() <= reqInfo.getEpoch(),
          "Bad paxos transition, out-of-order epochs.\nOld: %s\nNew: %s\n",
          oldData, newData);
    }

    File syncedFile = null;

    SegmentStateProto currentSegment = getSegmentInfo(segmentTxId);
    if (currentSegment == null ||
        currentSegment.getEndTxId() != segment.getEndTxId()) {
      if (currentSegment == null) {
        LOG.info("Synchronizing log " + TextFormat.shortDebugString(segment) +
            ": no current segment in place");

        // Update the highest txid for lag metrics
        highestWrittenTxId = Math.max(segment.getEndTxId(),
            highestWrittenTxId);
      } else {
        LOG.info("Synchronizing log " + TextFormat.shortDebugString(segment) +
            ": old segment " + TextFormat.shortDebugString(currentSegment) +
            " is not the right length");

        // Paranoid sanity check: if the new log is shorter than the log we
        // currently have, we should not end up discarding any transactions
        // which are already committed.
        if (txnRange(currentSegment).contains(committedTxnId.get()) &&
            !txnRange(segment).contains(committedTxnId.get())) {
          throw new AssertionError(
              "Cannot replace segment " +
              TextFormat.shortDebugString(currentSegment) +
              " with new segment " +
              TextFormat.shortDebugString(segment) +
              ": would discard already-committed txn " +
              committedTxnId.get());
        }

        // Another paranoid check: we should not be asked to synchronize a log
        // on top of a finalized segment.
        alwaysAssert(currentSegment.getIsInProgress(),
            "Should never be asked to synchronize a different log on top of an " +
            "already-finalized segment");

        // If we're shortening the log, update our highest txid
        // used for lag metrics.
        if (txnRange(currentSegment).contains(highestWrittenTxId)) {
          highestWrittenTxId = segment.getEndTxId();
        }
      }
      syncedFile = syncLog(reqInfo, segment, fromUrl);

    } else {
      LOG.info("Skipping download of log " +
          TextFormat.shortDebugString(segment) +
          ": already have up-to-date logs");
    }

    // This is one of the few places in the protocol where we have a single
    // RPC that results in two distinct actions:
    //
    // - 1) Downloads the new log segment data (above)
    // - 2) Records the new Paxos data about the synchronized segment (below)
    //
    // These need to be treated as a transaction from the perspective
    // of any external process. We do this by treating the persistPaxosData()
    // success as the "commit" of an atomic transaction. If we fail before
    // this point, the downloaded edit log will only exist at a temporary
    // path, and thus not change any externally visible state. If we fail
    // after this point, then any future prepareRecovery() call will see
    // the Paxos data, and by calling completeHalfDoneAcceptRecovery() will
    // roll forward the rename of the referenced log file.
    //
    // See also: HDFS-3955
    //
    // The fault points here are exercised by the randomized fault injection
    // test case to ensure that this atomic "transaction" operates correctly.
    JournalFaultInjector.get().beforePersistPaxosData();
    persistPaxosData(segmentTxId, newData);
    JournalFaultInjector.get().afterPersistPaxosData();

    if (syncedFile != null) {
      FileUtil.replaceFile(syncedFile,
          storage.getInProgressEditLog(segmentTxId));
    }

    LOG.info("Accepted recovery for segment " + segmentTxId + ": " +
        TextFormat.shortDebugString(newData));
  }

  private Range<Long> txnRange(SegmentStateProto seg) {
    Preconditions.checkArgument(seg.hasEndTxId(),
        "invalid segment: %s", seg);
    return Ranges.closed(seg.getStartTxId(), seg.getEndTxId());
  }

  /**
   * Synchronize a log segment from another JournalNode. The log is
   * downloaded from the provided URL into a temporary location on disk,
   * which is named based on the current request's epoch.
   *
   * @return the temporary location of the downloaded file
   */
  private File syncLog(RequestInfo reqInfo,
      final SegmentStateProto segment, final URL url) throws IOException {
    final File tmpFile = storage.getSyncLogTemporaryFile(
        segment.getStartTxId(), reqInfo.getEpoch());
    final List<File> localPaths = ImmutableList.of(tmpFile);

    LOG.info("Synchronizing log " +
        TextFormat.shortDebugString(segment) + " from " + url);
    SecurityUtil.doAsLoginUser(
        new PrivilegedExceptionAction<Void>() {
          @Override
          public Void run() throws IOException {
            // We may have lost our ticket since the last checkpoint, so log in
            // again just in case.
            if (UserGroupInformation.isSecurityEnabled()) {
              UserGroupInformation.getCurrentUser().checkTGTAndReloginFromKeytab();
            }

            boolean success = false;
            try {
              TransferFsImage.doGetUrl(url, localPaths, storage, true);
              assert tmpFile.exists();
              success = true;
            } finally {
              if (!success) {
                if (!tmpFile.delete()) {
                  LOG.warn("Failed to delete temporary file " + tmpFile);
                }
              }
            }
            return null;
          }
        });
    return tmpFile;
  }


  /**
   * In the case the node crashes in between downloading a log segment
   * and persisting the associated paxos recovery data, the log segment
   * will be left in its temporary location on disk. Given the paxos data,
   * we can check if this was indeed the case, and "roll forward"
   * the atomic operation.
   *
   * See the inline comments in
   * {@link #acceptRecovery(RequestInfo, SegmentStateProto, URL)} for more
   * details.
   *
   * @throws IOException if the temporary file is unable to be renamed into
   * place
   */
  private void completeHalfDoneAcceptRecovery(
      PersistedRecoveryPaxosData paxosData) throws IOException {
    if (paxosData == null) {
      return;
    }

    long segmentId = paxosData.getSegmentState().getStartTxId();
    long epoch = paxosData.getAcceptedInEpoch();

    File tmp = storage.getSyncLogTemporaryFile(segmentId, epoch);

    if (tmp.exists()) {
      File dst = storage.getInProgressEditLog(segmentId);
      LOG.info("Rolling forward previously half-completed synchronization: " +
          tmp + " -> " + dst);
      FileUtil.replaceFile(tmp, dst);
    }
  }

  /**
   * Retrieve the persisted data for recovering the given segment from disk.
   */
  private PersistedRecoveryPaxosData getPersistedPaxosData(long segmentTxId)
      throws IOException {
    File f = storage.getPaxosFile(segmentTxId);
    if (!f.exists()) {
      // Default instance has no fields filled in (they're optional)
      return null;
    }

    InputStream in = new FileInputStream(f);
    try {
      PersistedRecoveryPaxosData ret = PersistedRecoveryPaxosData.parseDelimitedFrom(in);
      Preconditions.checkState(ret != null &&
          ret.getSegmentState().getStartTxId() == segmentTxId,
          "Bad persisted data for segment %s: %s",
          segmentTxId, ret);
      return ret;
    } finally {
      IOUtils.closeStream(in);
    }
  }

  /**
   * Persist data for recovering the given segment from disk.
   */
  private void persistPaxosData(long segmentTxId,
      PersistedRecoveryPaxosData newData) throws IOException {
    File f = storage.getPaxosFile(segmentTxId);
    boolean success = false;
    AtomicFileOutputStream fos = new AtomicFileOutputStream(f);
    try {
      newData.writeDelimitedTo(fos);
      fos.write('\n');
      // Write human-readable data after the protobuf. This is only
      // to assist in debugging -- it's not parsed at all.
      OutputStreamWriter writer = new OutputStreamWriter(fos, Charsets.UTF_8);

      writer.write(String.valueOf(newData));
      writer.write('\n');
      writer.flush();

      fos.flush();
      success = true;
    } finally {
      if (success) {
        IOUtils.closeStream(fos);
      } else {
        fos.abort();
      }
    }
  }

  synchronized void discardSegments(long startTxId) throws IOException {
    storage.getJournalManager().discardSegments(startTxId);
    // All segments from startTxId onward have been discarded, so reset
    // committedTxnId accordingly.
    committedTxnId.set(startTxId - 1);
  }

  public synchronized void doPreUpgrade() throws IOException {
    // Do not hold file lock on committedTxnId, because the containing
    // directory will be renamed. It will be reopened lazily on next access.
    IOUtils.cleanup(LOG, committedTxnId);
    storage.getJournalManager().doPreUpgrade();
  }

  public synchronized void doUpgrade(StorageInfo sInfo) throws IOException {
    long oldCTime = storage.getCTime();
    storage.cTime = sInfo.cTime;
    int oldLV = storage.getLayoutVersion();
    storage.layoutVersion = sInfo.layoutVersion;
    LOG.info("Starting upgrade of edits directory " + storage.getRoot()
        + ".\n   old LV = " + oldLV
        + "; old CTime = " + oldCTime
        + ".\n   new LV = " + storage.getLayoutVersion()
        + "; new CTime = " + storage.getCTime());
    storage.getJournalManager().doUpgrade(storage);
    storage.createPaxosDir();

    // Copy over the contents of the epoch data files to the new dir.
    File currentDir = storage.getSingularStorageDir().getCurrentDir();
    File previousDir = storage.getSingularStorageDir().getPreviousDir();

    PersistentLongFile prevLastPromisedEpoch = new PersistentLongFile(
        new File(previousDir, LAST_PROMISED_FILENAME), 0);
    PersistentLongFile prevLastWriterEpoch = new PersistentLongFile(
        new File(previousDir, LAST_WRITER_EPOCH), 0);
    BestEffortLongFile prevCommittedTxnId = new BestEffortLongFile(
        new File(previousDir, COMMITTED_TXID_FILENAME),
        HdfsConstants.INVALID_TXID);

    lastPromisedEpoch = new PersistentLongFile(
        new File(currentDir, LAST_PROMISED_FILENAME), 0);
    lastWriterEpoch = new PersistentLongFile(
        new File(currentDir, LAST_WRITER_EPOCH), 0);
    committedTxnId = new BestEffortLongFile(
        new File(currentDir, COMMITTED_TXID_FILENAME),
        HdfsConstants.INVALID_TXID);

    try {
      lastPromisedEpoch.set(prevLastPromisedEpoch.get());
      lastWriterEpoch.set(prevLastWriterEpoch.get());
      committedTxnId.set(prevCommittedTxnId.get());
    } finally {
      IOUtils.cleanup(LOG, prevCommittedTxnId);
    }
  }

  public synchronized void doFinalize() throws IOException {
    LOG.info("Finalizing upgrade for journal "
        + storage.getRoot() + "."
        + (storage.getLayoutVersion() == 0 ? "" :
          "\n   cur LV = " + storage.getLayoutVersion()
          + "; cur CTime = " + storage.getCTime()));
    storage.getJournalManager().doFinalize();
  }

  public Boolean canRollBack(StorageInfo storage, StorageInfo prevStorage,
      int targetLayoutVersion) throws IOException {
    return this.storage.getJournalManager().canRollBack(storage, prevStorage,
        targetLayoutVersion);
  }

  public synchronized void doRollback() throws IOException {
    // Do not hold file lock on committedTxnId, because the containing
    // directory will be renamed. It will be reopened lazily on next access.
    IOUtils.cleanup(LOG, committedTxnId);
    storage.getJournalManager().doRollback();
  }

  public Long getJournalCTime() throws IOException {
    return storage.getJournalManager().getJournalCTime();
  }
}