001 /**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License. You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018 package org.apache.hadoop.hdfs.server.namenode;
019
020 import static org.apache.hadoop.util.ExitUtil.terminate;
021
022 import java.io.IOException;
023 import java.util.ArrayList;
024 import java.util.Collection;
025 import java.util.Collections;
026 import java.util.Comparator;
027 import java.util.LinkedList;
028 import java.util.List;
029 import java.util.PriorityQueue;
030 import java.util.SortedSet;
031 import java.util.concurrent.CopyOnWriteArrayList;
032
033 import org.apache.commons.logging.Log;
034 import org.apache.commons.logging.LogFactory;
035 import org.apache.hadoop.classification.InterfaceAudience;
036 import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
037 import org.apache.hadoop.hdfs.server.protocol.RemoteEditLog;
038 import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest;
039
040 import static org.apache.hadoop.util.ExitUtil.terminate;
041
042 import com.google.common.base.Preconditions;
043 import com.google.common.collect.ComparisonChain;
044 import com.google.common.collect.ImmutableList;
045 import com.google.common.collect.ImmutableListMultimap;
046 import com.google.common.collect.Lists;
047 import com.google.common.collect.Multimaps;
048 import com.google.common.collect.Sets;
049
050 /**
 * Manages a collection of Journals. Apart from getEditLogManifest, none of
 * the methods are synchronized; it is assumed that the FSEditLog methods
 * that use this class provide proper synchronization.
054 */
055 public class JournalSet implements JournalManager {
056
057 static final Log LOG = LogFactory.getLog(FSEditLog.class);
058
059 static final public Comparator<EditLogInputStream>
060 EDIT_LOG_INPUT_STREAM_COMPARATOR = new Comparator<EditLogInputStream>() {
061 @Override
062 public int compare(EditLogInputStream a, EditLogInputStream b) {
063 return ComparisonChain.start().
064 compare(a.getFirstTxId(), b.getFirstTxId()).
065 compare(b.getLastTxId(), a.getLastTxId()).
066 result();
067 }
068 };
069
070 /**
071 * Container for a JournalManager paired with its currently
072 * active stream.
073 *
074 * If a Journal gets disabled due to an error writing to its
075 * stream, then the stream will be aborted and set to null.
076 */
077 static class JournalAndStream implements CheckableNameNodeResource {
078 private final JournalManager journal;
079 private boolean disabled = false;
080 private EditLogOutputStream stream;
081 private boolean required = false;
082
083 public JournalAndStream(JournalManager manager, boolean required) {
084 this.journal = manager;
085 this.required = required;
086 }
087
088 public void startLogSegment(long txId) throws IOException {
089 Preconditions.checkState(stream == null);
090 disabled = false;
091 stream = journal.startLogSegment(txId);
092 }
093
094 /**
095 * Closes the stream, also sets it to null.
096 */
097 public void closeStream() throws IOException {
098 if (stream == null) return;
099 stream.close();
100 stream = null;
101 }
102
103 /**
104 * Close the Journal and Stream
105 */
106 public void close() throws IOException {
107 closeStream();
108
109 journal.close();
110 }
111
112 /**
113 * Aborts the stream, also sets it to null.
114 */
115 public void abort() {
116 if (stream == null) return;
117 try {
118 stream.abort();
119 } catch (IOException ioe) {
120 LOG.error("Unable to abort stream " + stream, ioe);
121 }
122 stream = null;
123 }
124
125 boolean isActive() {
126 return stream != null;
127 }
128
129 /**
130 * Should be used outside JournalSet only for testing.
131 */
132 EditLogOutputStream getCurrentStream() {
133 return stream;
134 }
135
136 @Override
137 public String toString() {
138 return "JournalAndStream(mgr=" + journal +
139 ", " + "stream=" + stream + ")";
140 }
141
142 void setCurrentStreamForTests(EditLogOutputStream stream) {
143 this.stream = stream;
144 }
145
146 JournalManager getManager() {
147 return journal;
148 }
149
150 boolean isDisabled() {
151 return disabled;
152 }
153
154 private void setDisabled(boolean disabled) {
155 this.disabled = disabled;
156 }
157
158 @Override
159 public boolean isResourceAvailable() {
160 return !isDisabled();
161 }
162
163 @Override
164 public boolean isRequired() {
165 return required;
166 }
167 }
168
// COW implementation is necessary since some users (eg the web ui) call
// getAllJournalStreams() and then iterate. Since this is rarely
// mutated, there is no performance concern.
// The reference itself is never reassigned, hence final.
private final List<JournalAndStream> journals =
    new CopyOnWriteArrayList<JournalSet.JournalAndStream>();
// Passed to NameNodeResourcePolicy.areResourcesAvailable together with
// the journal list.
final int minimumRedundantJournals;

/**
 * Creates an empty journal set.
 *
 * @param minimumRedundantResources value stored as
 *        {@link #minimumRedundantJournals}
 */
JournalSet(int minimumRedundantResources) {
  this.minimumRedundantJournals = minimumRedundantResources;
}
179
180 @Override
181 public void format(NamespaceInfo nsInfo) throws IOException {
182 // The iteration is done by FSEditLog itself
183 throw new UnsupportedOperationException();
184 }
185
186 @Override
187 public boolean hasSomeData() throws IOException {
188 // This is called individually on the underlying journals,
189 // not on the JournalSet.
190 throw new UnsupportedOperationException();
191 }
192
193
194 @Override
195 public EditLogOutputStream startLogSegment(final long txId) throws IOException {
196 mapJournalsAndReportErrors(new JournalClosure() {
197 @Override
198 public void apply(JournalAndStream jas) throws IOException {
199 jas.startLogSegment(txId);
200 }
201 }, "starting log segment " + txId);
202 return new JournalSetOutputStream();
203 }
204
205 @Override
206 public void finalizeLogSegment(final long firstTxId, final long lastTxId)
207 throws IOException {
208 mapJournalsAndReportErrors(new JournalClosure() {
209 @Override
210 public void apply(JournalAndStream jas) throws IOException {
211 if (jas.isActive()) {
212 jas.closeStream();
213 jas.getManager().finalizeLogSegment(firstTxId, lastTxId);
214 }
215 }
216 }, "finalize log segment " + firstTxId + ", " + lastTxId);
217 }
218
219 @Override
220 public void close() throws IOException {
221 mapJournalsAndReportErrors(new JournalClosure() {
222 @Override
223 public void apply(JournalAndStream jas) throws IOException {
224 jas.close();
225 }
226 }, "close journal");
227 }
228
229 /**
230 * In this function, we get a bunch of streams from all of our JournalManager
231 * objects. Then we add these to the collection one by one.
232 *
233 * @param streams The collection to add the streams to. It may or
234 * may not be sorted-- this is up to the caller.
235 * @param fromTxId The transaction ID to start looking for streams at
236 * @param inProgressOk Should we consider unfinalized streams?
237 * @param forReading Whether or not the caller intends to read from
238 * the returned streams.
239 */
240 @Override
241 public void selectInputStreams(Collection<EditLogInputStream> streams,
242 long fromTxId, boolean inProgressOk, boolean forReading) throws IOException {
243 final PriorityQueue<EditLogInputStream> allStreams =
244 new PriorityQueue<EditLogInputStream>(64,
245 EDIT_LOG_INPUT_STREAM_COMPARATOR);
246 for (JournalAndStream jas : journals) {
247 if (jas.isDisabled()) {
248 LOG.info("Skipping jas " + jas + " since it's disabled");
249 continue;
250 }
251 try {
252 jas.getManager().selectInputStreams(allStreams, fromTxId, inProgressOk,
253 forReading);
254 } catch (IOException ioe) {
255 LOG.warn("Unable to determine input streams from " + jas.getManager() +
256 ". Skipping.", ioe);
257 }
258 }
259 chainAndMakeRedundantStreams(streams, allStreams, fromTxId);
260 }
261
262 public static void chainAndMakeRedundantStreams(
263 Collection<EditLogInputStream> outStreams,
264 PriorityQueue<EditLogInputStream> allStreams, long fromTxId) {
265 // We want to group together all the streams that start on the same start
266 // transaction ID. To do this, we maintain an accumulator (acc) of all
267 // the streams we've seen at a given start transaction ID. When we see a
268 // higher start transaction ID, we select a stream from the accumulator and
269 // clear it. Then we begin accumulating streams with the new, higher start
270 // transaction ID.
271 LinkedList<EditLogInputStream> acc =
272 new LinkedList<EditLogInputStream>();
273 EditLogInputStream elis;
274 while ((elis = allStreams.poll()) != null) {
275 if (acc.isEmpty()) {
276 acc.add(elis);
277 } else {
278 long accFirstTxId = acc.get(0).getFirstTxId();
279 if (accFirstTxId == elis.getFirstTxId()) {
280 acc.add(elis);
281 } else if (accFirstTxId < elis.getFirstTxId()) {
282 outStreams.add(new RedundantEditLogInputStream(acc, fromTxId));
283 acc.clear();
284 acc.add(elis);
285 } else if (accFirstTxId > elis.getFirstTxId()) {
286 throw new RuntimeException("sorted set invariants violated! " +
287 "Got stream with first txid " + elis.getFirstTxId() +
288 ", but the last firstTxId was " + accFirstTxId);
289 }
290 }
291 }
292 if (!acc.isEmpty()) {
293 outStreams.add(new RedundantEditLogInputStream(acc, fromTxId));
294 acc.clear();
295 }
296 }
297
298 /**
299 * Returns true if there are no journals, all redundant journals are disabled,
300 * or any required journals are disabled.
301 *
302 * @return True if there no journals, all redundant journals are disabled,
303 * or any required journals are disabled.
304 */
305 public boolean isEmpty() {
306 return !NameNodeResourcePolicy.areResourcesAvailable(journals,
307 minimumRedundantJournals);
308 }
309
310 /**
311 * Called when some journals experience an error in some operation.
312 */
313 private void disableAndReportErrorOnJournals(List<JournalAndStream> badJournals) {
314 if (badJournals == null || badJournals.isEmpty()) {
315 return; // nothing to do
316 }
317
318 for (JournalAndStream j : badJournals) {
319 LOG.error("Disabling journal " + j);
320 j.abort();
321 j.setDisabled(true);
322 }
323 }
324
325 /**
326 * Implementations of this interface encapsulate operations that can be
327 * iteratively applied on all the journals. For example see
328 * {@link JournalSet#mapJournalsAndReportErrors}.
329 */
330 private interface JournalClosure {
331 /**
332 * The operation on JournalAndStream.
333 * @param jas Object on which operations are performed.
334 * @throws IOException
335 */
336 public void apply(JournalAndStream jas) throws IOException;
337 }
338
339 /**
340 * Apply the given operation across all of the journal managers, disabling
341 * any for which the closure throws an IOException.
342 * @param closure {@link JournalClosure} object encapsulating the operation.
343 * @param status message used for logging errors (e.g. "opening journal")
344 * @throws IOException If the operation fails on all the journals.
345 */
346 private void mapJournalsAndReportErrors(
347 JournalClosure closure, String status) throws IOException{
348
349 List<JournalAndStream> badJAS = Lists.newLinkedList();
350 for (JournalAndStream jas : journals) {
351 try {
352 closure.apply(jas);
353 } catch (Throwable t) {
354 if (jas.isRequired()) {
355 final String msg = "Error: " + status + " failed for required journal ("
356 + jas + ")";
357 LOG.fatal(msg, t);
358 // If we fail on *any* of the required journals, then we must not
359 // continue on any of the other journals. Abort them to ensure that
360 // retry behavior doesn't allow them to keep going in any way.
361 abortAllJournals();
362 // the current policy is to shutdown the NN on errors to shared edits
363 // dir. There are many code paths to shared edits failures - syncs,
364 // roll of edits etc. All of them go through this common function
365 // where the isRequired() check is made. Applying exit policy here
366 // to catch all code paths.
367 terminate(1, msg);
368 } else {
369 LOG.error("Error: " + status + " failed for (journal " + jas + ")", t);
370 badJAS.add(jas);
371 }
372 }
373 }
374 disableAndReportErrorOnJournals(badJAS);
375 if (!NameNodeResourcePolicy.areResourcesAvailable(journals,
376 minimumRedundantJournals)) {
377 String message = status + " failed for too many journals";
378 LOG.error("Error: " + message);
379 throw new IOException(message);
380 }
381 }
382
383 /**
384 * Abort all of the underlying streams.
385 */
386 private void abortAllJournals() {
387 for (JournalAndStream jas : journals) {
388 if (jas.isActive()) {
389 jas.abort();
390 }
391 }
392 }
393
394 /**
395 * An implementation of EditLogOutputStream that applies a requested method on
396 * all the journals that are currently active.
397 */
398 private class JournalSetOutputStream extends EditLogOutputStream {
399
400 JournalSetOutputStream() throws IOException {
401 super();
402 }
403
404 @Override
405 public void write(final FSEditLogOp op)
406 throws IOException {
407 mapJournalsAndReportErrors(new JournalClosure() {
408 @Override
409 public void apply(JournalAndStream jas) throws IOException {
410 if (jas.isActive()) {
411 jas.getCurrentStream().write(op);
412 }
413 }
414 }, "write op");
415 }
416
417 @Override
418 public void writeRaw(final byte[] data, final int offset, final int length)
419 throws IOException {
420 mapJournalsAndReportErrors(new JournalClosure() {
421 @Override
422 public void apply(JournalAndStream jas) throws IOException {
423 if (jas.isActive()) {
424 jas.getCurrentStream().writeRaw(data, offset, length);
425 }
426 }
427 }, "write bytes");
428 }
429
430 @Override
431 public void create() throws IOException {
432 mapJournalsAndReportErrors(new JournalClosure() {
433 @Override
434 public void apply(JournalAndStream jas) throws IOException {
435 if (jas.isActive()) {
436 jas.getCurrentStream().create();
437 }
438 }
439 }, "create");
440 }
441
442 @Override
443 public void close() throws IOException {
444 mapJournalsAndReportErrors(new JournalClosure() {
445 @Override
446 public void apply(JournalAndStream jas) throws IOException {
447 jas.closeStream();
448 }
449 }, "close");
450 }
451
452 @Override
453 public void abort() throws IOException {
454 mapJournalsAndReportErrors(new JournalClosure() {
455 @Override
456 public void apply(JournalAndStream jas) throws IOException {
457 jas.abort();
458 }
459 }, "abort");
460 }
461
462 @Override
463 public void setReadyToFlush() throws IOException {
464 mapJournalsAndReportErrors(new JournalClosure() {
465 @Override
466 public void apply(JournalAndStream jas) throws IOException {
467 if (jas.isActive()) {
468 jas.getCurrentStream().setReadyToFlush();
469 }
470 }
471 }, "setReadyToFlush");
472 }
473
474 @Override
475 protected void flushAndSync(final boolean durable) throws IOException {
476 mapJournalsAndReportErrors(new JournalClosure() {
477 @Override
478 public void apply(JournalAndStream jas) throws IOException {
479 if (jas.isActive()) {
480 jas.getCurrentStream().flushAndSync(durable);
481 }
482 }
483 }, "flushAndSync");
484 }
485
486 @Override
487 public void flush() throws IOException {
488 mapJournalsAndReportErrors(new JournalClosure() {
489 @Override
490 public void apply(JournalAndStream jas) throws IOException {
491 if (jas.isActive()) {
492 jas.getCurrentStream().flush();
493 }
494 }
495 }, "flush");
496 }
497
498 @Override
499 public boolean shouldForceSync() {
500 for (JournalAndStream js : journals) {
501 if (js.isActive() && js.getCurrentStream().shouldForceSync()) {
502 return true;
503 }
504 }
505 return false;
506 }
507
508 @Override
509 protected long getNumSync() {
510 for (JournalAndStream jas : journals) {
511 if (jas.isActive()) {
512 return jas.getCurrentStream().getNumSync();
513 }
514 }
515 return 0;
516 }
517 }
518
519 @Override
520 public void setOutputBufferCapacity(final int size) {
521 try {
522 mapJournalsAndReportErrors(new JournalClosure() {
523 @Override
524 public void apply(JournalAndStream jas) throws IOException {
525 jas.getManager().setOutputBufferCapacity(size);
526 }
527 }, "setOutputBufferCapacity");
528 } catch (IOException e) {
529 LOG.error("Error in setting outputbuffer capacity");
530 }
531 }
532
533 List<JournalAndStream> getAllJournalStreams() {
534 return journals;
535 }
536
537 List<JournalManager> getJournalManagers() {
538 List<JournalManager> jList = new ArrayList<JournalManager>();
539 for (JournalAndStream j : journals) {
540 jList.add(j.getManager());
541 }
542 return jList;
543 }
544
545 void add(JournalManager j, boolean required) {
546 JournalAndStream jas = new JournalAndStream(j, required);
547 journals.add(jas);
548 }
549
550 void remove(JournalManager j) {
551 JournalAndStream jasToRemove = null;
552 for (JournalAndStream jas: journals) {
553 if (jas.getManager().equals(j)) {
554 jasToRemove = jas;
555 break;
556 }
557 }
558 if (jasToRemove != null) {
559 jasToRemove.abort();
560 journals.remove(jasToRemove);
561 }
562 }
563
564 @Override
565 public void purgeLogsOlderThan(final long minTxIdToKeep) throws IOException {
566 mapJournalsAndReportErrors(new JournalClosure() {
567 @Override
568 public void apply(JournalAndStream jas) throws IOException {
569 jas.getManager().purgeLogsOlderThan(minTxIdToKeep);
570 }
571 }, "purgeLogsOlderThan " + minTxIdToKeep);
572 }
573
574 @Override
575 public void recoverUnfinalizedSegments() throws IOException {
576 mapJournalsAndReportErrors(new JournalClosure() {
577 @Override
578 public void apply(JournalAndStream jas) throws IOException {
579 jas.getManager().recoverUnfinalizedSegments();
580 }
581 }, "recoverUnfinalizedSegments");
582 }
583
584 /**
585 * Return a manifest of what finalized edit logs are available. All available
586 * edit logs are returned starting from the transaction id passed.
587 *
588 * @param fromTxId Starting transaction id to read the logs.
589 * @return RemoteEditLogManifest object.
590 */
591 public synchronized RemoteEditLogManifest getEditLogManifest(long fromTxId,
592 boolean forReading) {
593 // Collect RemoteEditLogs available from each FileJournalManager
594 List<RemoteEditLog> allLogs = Lists.newArrayList();
595 for (JournalAndStream j : journals) {
596 if (j.getManager() instanceof FileJournalManager) {
597 FileJournalManager fjm = (FileJournalManager)j.getManager();
598 try {
599 allLogs.addAll(fjm.getRemoteEditLogs(fromTxId, forReading, false));
600 } catch (Throwable t) {
601 LOG.warn("Cannot list edit logs in " + fjm, t);
602 }
603 }
604 }
605
606 // Group logs by their starting txid
607 ImmutableListMultimap<Long, RemoteEditLog> logsByStartTxId =
608 Multimaps.index(allLogs, RemoteEditLog.GET_START_TXID);
609 long curStartTxId = fromTxId;
610
611 List<RemoteEditLog> logs = Lists.newArrayList();
612 while (true) {
613 ImmutableList<RemoteEditLog> logGroup = logsByStartTxId.get(curStartTxId);
614 if (logGroup.isEmpty()) {
615 // we have a gap in logs - for example because we recovered some old
616 // storage directory with ancient logs. Clear out any logs we've
617 // accumulated so far, and then skip to the next segment of logs
618 // after the gap.
619 SortedSet<Long> startTxIds = Sets.newTreeSet(logsByStartTxId.keySet());
620 startTxIds = startTxIds.tailSet(curStartTxId);
621 if (startTxIds.isEmpty()) {
622 break;
623 } else {
624 if (LOG.isDebugEnabled()) {
625 LOG.debug("Found gap in logs at " + curStartTxId + ": " +
626 "not returning previous logs in manifest.");
627 }
628 logs.clear();
629 curStartTxId = startTxIds.first();
630 continue;
631 }
632 }
633
634 // Find the one that extends the farthest forward
635 RemoteEditLog bestLog = Collections.max(logGroup);
636 logs.add(bestLog);
637 // And then start looking from after that point
638 curStartTxId = bestLog.getEndTxId() + 1;
639 }
640 RemoteEditLogManifest ret = new RemoteEditLogManifest(logs);
641
642 if (LOG.isDebugEnabled()) {
643 LOG.debug("Generated manifest for logs since " + fromTxId + ":"
644 + ret);
645 }
646 return ret;
647 }
648
649 /**
650 * Add sync times to the buffer.
651 */
652 String getSyncTimes() {
653 StringBuilder buf = new StringBuilder();
654 for (JournalAndStream jas : journals) {
655 if (jas.isActive()) {
656 buf.append(jas.getCurrentStream().getTotalSyncTime());
657 buf.append(" ");
658 }
659 }
660 return buf.toString();
661 }
662 }