001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing,
013 * software distributed under the License is distributed on an
014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 * KIND, either express or implied.  See the License for the
016 * specific language governing permissions and limitations
017 * under the License.
018 */
019package org.apache.commons.compress.archivers.dump;
020
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayDeque;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Deque;
import java.util.HashMap;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Queue;
import java.util.Stack;

import org.apache.commons.compress.archivers.ArchiveException;
import org.apache.commons.compress.archivers.ArchiveInputStream;
import org.apache.commons.compress.archivers.zip.ZipEncoding;
import org.apache.commons.compress.archivers.zip.ZipEncodingHelper;
import org.apache.commons.compress.utils.IOUtils;
036
037/**
038 * The DumpArchiveInputStream reads a UNIX dump archive as an InputStream.
039 * Methods are provided to position at each successive entry in
 * the archive, and then read each entry as a normal input stream
041 * using read().
042 *
 * There doesn't seem to exist a hint on the encoding of string values
 * in any piece of documentation.  Given the main purpose of dump/restore
045 * is backing up a system it seems very likely the format uses the
046 * current default encoding of the system.
047 *
048 * @NotThreadSafe
049 */
050public class DumpArchiveInputStream extends ArchiveInputStream {
051    private final DumpArchiveSummary summary;
052    private DumpArchiveEntry active;
053    private boolean isClosed;
054    private boolean hasHitEOF;
055    private long entrySize;
056    private long entryOffset;
057    private int readIdx;
058    private final byte[] readBuf = new byte[DumpArchiveConstants.TP_SIZE];
059    private byte[] blockBuffer;
060    private int recordOffset;
061    private long filepos;
062    protected TapeInputStream raw;
063
064    // map of ino -> dirent entry. We can use this to reconstruct full paths.
065    private final Map<Integer, Dirent> names = new HashMap<>();
066
067    // map of ino -> (directory) entry when we're missing one or more elements in the path.
068    private final Map<Integer, DumpArchiveEntry> pending = new HashMap<>();
069
070    // queue of (directory) entries where we now have the full path.
071    private final Queue<DumpArchiveEntry> queue;
072
073    /**
074     * The encoding to use for file names and labels.
075     */
076    private final ZipEncoding zipEncoding;
077
078    // the provided encoding (for unit tests)
079    final String encoding;
080
081    /**
082     * Constructor using the platform's default encoding for file
083     * names.
084     *
085     * @param is stream to read from
086     * @throws ArchiveException on error
087     */
088    public DumpArchiveInputStream(final InputStream is) throws ArchiveException {
089        this(is, null);
090    }
091
092    /**
093     * Constructor.
094     *
095     * @param is stream to read from
096     * @param encoding the encoding to use for file names, use null
097     * for the platform's default encoding
098     * @since 1.6
099     * @throws ArchiveException on error
100     */
101    public DumpArchiveInputStream(final InputStream is, final String encoding)
102        throws ArchiveException {
103        this.raw = new TapeInputStream(is);
104        this.hasHitEOF = false;
105        this.encoding = encoding;
106        this.zipEncoding = ZipEncodingHelper.getZipEncoding(encoding);
107
108        try {
109            // read header, verify it's a dump archive.
110            final byte[] headerBytes = raw.readRecord();
111
112            if (!DumpArchiveUtil.verify(headerBytes)) {
113                throw new UnrecognizedFormatException();
114            }
115
116            // get summary information
117            summary = new DumpArchiveSummary(headerBytes, this.zipEncoding);
118
119            // reset buffer with actual block size.
120            raw.resetBlockSize(summary.getNTRec(), summary.isCompressed());
121
122            // allocate our read buffer.
123            blockBuffer = new byte[4 * DumpArchiveConstants.TP_SIZE];
124
125            // skip past CLRI and BITS segments since we don't handle them yet.
126            readCLRI();
127            readBITS();
128        } catch (final IOException ex) {
129            throw new ArchiveException(ex.getMessage(), ex);
130        }
131
132        // put in a dummy record for the root node.
133        final Dirent root = new Dirent(2, 2, 4, ".");
134        names.put(2, root);
135
136        // use priority based on queue to ensure parent directories are
137        // released first.
138        queue = new PriorityQueue<>(10,
139                (p, q) -> {
140                    if (p.getOriginalName() == null || q.getOriginalName() == null) {
141                        return Integer.MAX_VALUE;
142                    }
143
144                    return p.getOriginalName().compareTo(q.getOriginalName());
145                });
146    }
147
148    @Deprecated
149    @Override
150    public int getCount() {
151        return (int) getBytesRead();
152    }
153
154    @Override
155    public long getBytesRead() {
156        return raw.getBytesRead();
157    }
158
159    /**
160     * Return the archive summary information.
161     * @return the summary
162     */
163    public DumpArchiveSummary getSummary() {
164        return summary;
165    }
166
167    /**
168     * Read CLRI (deleted inode) segment.
169     */
170    private void readCLRI() throws IOException {
171        final byte[] buffer = raw.readRecord();
172
173        if (!DumpArchiveUtil.verify(buffer)) {
174            throw new InvalidFormatException();
175        }
176
177        active = DumpArchiveEntry.parse(buffer);
178
179        if (DumpArchiveConstants.SEGMENT_TYPE.CLRI != active.getHeaderType()) {
180            throw new InvalidFormatException();
181        }
182
183        // we don't do anything with this yet.
184        if (raw.skip((long) DumpArchiveConstants.TP_SIZE * active.getHeaderCount())
185            == -1) {
186            throw new EOFException();
187        }
188        readIdx = active.getHeaderCount();
189    }
190
191    /**
192     * Read BITS segment.
193     */
194    private void readBITS() throws IOException {
195        final byte[] buffer = raw.readRecord();
196
197        if (!DumpArchiveUtil.verify(buffer)) {
198            throw new InvalidFormatException();
199        }
200
201        active = DumpArchiveEntry.parse(buffer);
202
203        if (DumpArchiveConstants.SEGMENT_TYPE.BITS != active.getHeaderType()) {
204            throw new InvalidFormatException();
205        }
206
207        // we don't do anything with this yet.
208        if (raw.skip((long) DumpArchiveConstants.TP_SIZE * active.getHeaderCount())
209            == -1) {
210            throw new EOFException();
211        }
212        readIdx = active.getHeaderCount();
213    }
214
215    /**
216     * Read the next entry.
217     * @return the next entry
218     * @throws IOException on error
219     */
220    public DumpArchiveEntry getNextDumpEntry() throws IOException {
221        return getNextEntry();
222    }
223
224    @Override
225    public DumpArchiveEntry getNextEntry() throws IOException {
226        DumpArchiveEntry entry = null;
227        String path = null;
228
229        // is there anything in the queue?
230        if (!queue.isEmpty()) {
231            return queue.remove();
232        }
233
234        while (entry == null) {
235            if (hasHitEOF) {
236                return null;
237            }
238
239            // skip any remaining records in this segment for prior file.
240            // we might still have holes... easiest to do it
241            // block by block. We may want to revisit this if
242            // the unnecessary decompression time adds up.
243            while (readIdx < active.getHeaderCount()) {
244                if (!active.isSparseRecord(readIdx++)
245                    && raw.skip(DumpArchiveConstants.TP_SIZE) == -1) {
246                    throw new EOFException();
247                }
248            }
249
250            readIdx = 0;
251            filepos = raw.getBytesRead();
252
253            byte[] headerBytes = raw.readRecord();
254
255            if (!DumpArchiveUtil.verify(headerBytes)) {
256                throw new InvalidFormatException();
257            }
258
259            active = DumpArchiveEntry.parse(headerBytes);
260
261            // skip any remaining segments for prior file.
262            while (DumpArchiveConstants.SEGMENT_TYPE.ADDR == active.getHeaderType()) {
263                if (raw.skip((long) DumpArchiveConstants.TP_SIZE
264                             * (active.getHeaderCount()
265                                - active.getHeaderHoles())) == -1) {
266                    throw new EOFException();
267                }
268
269                filepos = raw.getBytesRead();
270                headerBytes = raw.readRecord();
271
272                if (!DumpArchiveUtil.verify(headerBytes)) {
273                    throw new InvalidFormatException();
274                }
275
276                active = DumpArchiveEntry.parse(headerBytes);
277            }
278
279            // check if this is an end-of-volume marker.
280            if (DumpArchiveConstants.SEGMENT_TYPE.END == active.getHeaderType()) {
281                hasHitEOF = true;
282
283                return null;
284            }
285
286            entry = active;
287
288            if (entry.isDirectory()) {
289                readDirectoryEntry(active);
290
291                // now we create an empty InputStream.
292                entryOffset = 0;
293                entrySize = 0;
294                readIdx = active.getHeaderCount();
295            } else {
296                entryOffset = 0;
297                entrySize = active.getEntrySize();
298                readIdx = 0;
299            }
300
301            recordOffset = readBuf.length;
302
303            path = getPath(entry);
304
305            if (path == null) {
306                entry = null;
307            }
308        }
309
310        entry.setName(path);
311        entry.setSimpleName(names.get(entry.getIno()).getName());
312        entry.setOffset(filepos);
313
314        return entry;
315    }
316
317    /**
318     * Read directory entry.
319     */
320    private void readDirectoryEntry(DumpArchiveEntry entry)
321        throws IOException {
322        long size = entry.getEntrySize();
323        boolean first = true;
324
325        while (first ||
326                DumpArchiveConstants.SEGMENT_TYPE.ADDR == entry.getHeaderType()) {
327            // read the header that we just peeked at.
328            if (!first) {
329                raw.readRecord();
330            }
331
332            if (!names.containsKey(entry.getIno()) &&
333                    DumpArchiveConstants.SEGMENT_TYPE.INODE == entry.getHeaderType()) {
334                pending.put(entry.getIno(), entry);
335            }
336
337            final int datalen = DumpArchiveConstants.TP_SIZE * entry.getHeaderCount();
338
339            if (blockBuffer.length < datalen) {
340                blockBuffer = IOUtils.readRange(raw, datalen);
341                if (blockBuffer.length != datalen) {
342                    throw new EOFException();
343                }
344            } else if (raw.read(blockBuffer, 0, datalen) != datalen) {
345                throw new EOFException();
346            }
347
348            int reclen = 0;
349
350            for (int i = 0; i < datalen - 8 && i < size - 8;
351                    i += reclen) {
352                final int ino = DumpArchiveUtil.convert32(blockBuffer, i);
353                reclen = DumpArchiveUtil.convert16(blockBuffer, i + 4);
354
355                final byte type = blockBuffer[i + 6];
356
357                final String name = DumpArchiveUtil.decode(zipEncoding, blockBuffer, i + 8, blockBuffer[i + 7]);
358
359                if (".".equals(name) || "..".equals(name)) {
360                    // do nothing...
361                    continue;
362                }
363
364                final Dirent d = new Dirent(ino, entry.getIno(), type, name);
365
366                /*
367                if ((type == 4) && names.containsKey(ino)) {
368                    System.out.println("we already have ino: " +
369                                       names.get(ino));
370                }
371                */
372
373                names.put(ino, d);
374
375                // check whether this allows us to fill anything in the pending list.
376                pending.forEach((k, v) -> {
377                    final String path = getPath(v);
378
379                    if (path != null) {
380                        v.setName(path);
381                        v.setSimpleName(names.get(k).getName());
382                        queue.add(v);
383                    }
384                });
385
386                // remove anything that we found. (We can't do it earlier
387                // because of concurrent modification exceptions.)
388                queue.forEach(e -> pending.remove(e.getIno()));
389            }
390
391            final byte[] peekBytes = raw.peek();
392
393            if (!DumpArchiveUtil.verify(peekBytes)) {
394                throw new InvalidFormatException();
395            }
396
397            entry = DumpArchiveEntry.parse(peekBytes);
398            first = false;
399            size -= DumpArchiveConstants.TP_SIZE;
400        }
401    }
402
403    /**
404     * Get full path for specified archive entry, or null if there's a gap.
405     *
406     * @param entry
407     * @return  full path for specified archive entry, or null if there's a gap.
408     */
409    private String getPath(final DumpArchiveEntry entry) {
410        // build the stack of elements. It's possible that we're
411        // still missing an intermediate value and if so we
412        final Stack<String> elements = new Stack<>();
413        Dirent dirent = null;
414
415        for (int i = entry.getIno();; i = dirent.getParentIno()) {
416            if (!names.containsKey(i)) {
417                elements.clear();
418                break;
419            }
420
421            dirent = names.get(i);
422            elements.push(dirent.getName());
423
424            if (dirent.getIno() == dirent.getParentIno()) {
425                break;
426            }
427        }
428
429        // if an element is missing defer the work and read next entry.
430        if (elements.isEmpty()) {
431            pending.put(entry.getIno(), entry);
432
433            return null;
434        }
435
436        // generate full path from stack of elements.
437        final StringBuilder sb = new StringBuilder(elements.pop());
438
439        while (!elements.isEmpty()) {
440            sb.append('/');
441            sb.append(elements.pop());
442        }
443
444        return sb.toString();
445    }
446
447    /**
448     * Reads bytes from the current dump archive entry.
449     *
450     * This method is aware of the boundaries of the current
451     * entry in the archive and will deal with them as if they
452     * were this stream's start and EOF.
453     *
454     * @param buf The buffer into which to place bytes read.
455     * @param off The offset at which to place bytes read.
456     * @param len The number of bytes to read.
457     * @return The number of bytes read, or -1 at EOF.
458     * @throws IOException on error
459     */
460    @Override
461    public int read(final byte[] buf, int off, int len) throws IOException {
462        if (len == 0) {
463            return 0;
464        }
465        int totalRead = 0;
466
467        if (hasHitEOF || isClosed || entryOffset >= entrySize) {
468            return -1;
469        }
470
471        if (active == null) {
472            throw new IllegalStateException("No current dump entry");
473        }
474
475        if (len + entryOffset > entrySize) {
476            len = (int) (entrySize - entryOffset);
477        }
478
479        while (len > 0) {
480            final int sz = Math.min(len, readBuf.length - recordOffset);
481
482            // copy any data we have
483            if (recordOffset + sz <= readBuf.length) {
484                System.arraycopy(readBuf, recordOffset, buf, off, sz);
485                totalRead += sz;
486                recordOffset += sz;
487                len -= sz;
488                off += sz;
489            }
490
491            // load next block if necessary.
492            if (len > 0) {
493                if (readIdx >= 512) {
494                    final byte[] headerBytes = raw.readRecord();
495
496                    if (!DumpArchiveUtil.verify(headerBytes)) {
497                        throw new InvalidFormatException();
498                    }
499
500                    active = DumpArchiveEntry.parse(headerBytes);
501                    readIdx = 0;
502                }
503
504                if (!active.isSparseRecord(readIdx++)) {
505                    final int r = raw.read(readBuf, 0, readBuf.length);
506                    if (r != readBuf.length) {
507                        throw new EOFException();
508                    }
509                } else {
510                    Arrays.fill(readBuf, (byte) 0);
511                }
512
513                recordOffset = 0;
514            }
515        }
516
517        entryOffset += totalRead;
518
519        return totalRead;
520    }
521
522    /**
523     * Closes the stream for this entry.
524     */
525    @Override
526    public void close() throws IOException {
527        if (!isClosed) {
528            isClosed = true;
529            raw.close();
530        }
531    }
532
533    /**
534     * Look at the first few bytes of the file to decide if it's a dump
535     * archive. With 32 bytes we can look at the magic value, with a full
536     * 1k we can verify the checksum.
537     * @param buffer data to match
538     * @param length length of data
539     * @return whether the buffer seems to contain dump data
540     */
541    public static boolean matches(final byte[] buffer, final int length) {
542        // do we have enough of the header?
543        if (length < 32) {
544            return false;
545        }
546
547        // this is the best test
548        if (length >= DumpArchiveConstants.TP_SIZE) {
549            return DumpArchiveUtil.verify(buffer);
550        }
551
552        // this will work in a pinch.
553        return DumpArchiveConstants.NFS_MAGIC == DumpArchiveUtil.convert32(buffer,
554            24);
555    }
556
557}