001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    package org.apache.hadoop.hdfs.util;
019    
020    import java.io.BufferedReader;
021    import java.io.File;
022    import java.io.FileInputStream;
023    import java.io.FileNotFoundException;
024    import java.io.IOException;
025    import java.io.InputStream;
026    import java.io.InputStreamReader;
027    import java.security.DigestInputStream;
028    import java.security.MessageDigest;
029    import java.util.regex.Matcher;
030    import java.util.regex.Pattern;
031    
032    import org.apache.commons.logging.Log;
033    import org.apache.commons.logging.LogFactory;
034    import org.apache.hadoop.io.IOUtils;
035    import org.apache.hadoop.io.MD5Hash;
036    import org.apache.hadoop.util.StringUtils;
037    
038    import com.google.common.base.Charsets;
039    
040    /**
041     * Static functions for dealing with files of the same format
042     * that the Unix "md5sum" utility writes.
043     */
044    public abstract class MD5FileUtils {
045      private static final Log LOG = LogFactory.getLog(
046          MD5FileUtils.class);
047    
048      public static final String MD5_SUFFIX = ".md5";
049      private static final Pattern LINE_REGEX =
050        Pattern.compile("([0-9a-f]{32}) [ \\*](.+)");
051      
052      /**
053       * Verify that the previously saved md5 for the given file matches
054       * expectedMd5.
055       * @throws IOException 
056       */
057      public static void verifySavedMD5(File dataFile, MD5Hash expectedMD5)
058          throws IOException {
059        MD5Hash storedHash = readStoredMd5ForFile(dataFile);
060        // Check the hash itself
061        if (!expectedMD5.equals(storedHash)) {
062          throw new IOException(
063              "File " + dataFile + " did not match stored MD5 checksum " +
064              " (stored: " + storedHash + ", computed: " + expectedMD5);
065        }
066      }
067      
068      /**
069       * Read the md5 file stored alongside the given data file
070       * and match the md5 file content.
071       * @param dataFile the file containing data
072       * @return a matcher with two matched groups
073       *   where group(1) is the md5 string and group(2) is the data file path.
074       */
075      private static Matcher readStoredMd5(File md5File) throws IOException {
076        BufferedReader reader =
077            new BufferedReader(new InputStreamReader(new FileInputStream(
078                md5File), Charsets.UTF_8));
079        String md5Line;
080        try {
081          md5Line = reader.readLine();
082          if (md5Line == null) { md5Line = ""; }
083          md5Line = md5Line.trim();
084        } catch (IOException ioe) {
085          throw new IOException("Error reading md5 file at " + md5File, ioe);
086        } finally {
087          IOUtils.cleanup(LOG, reader);
088        }
089        
090        Matcher matcher = LINE_REGEX.matcher(md5Line);
091        if (!matcher.matches()) {
092          throw new IOException("Invalid MD5 file " + md5File + ": the content \""
093              + md5Line + "\" does not match the expected pattern.");
094        }
095        return matcher;
096      }
097    
098      /**
099       * Read the md5 checksum stored alongside the given data file.
100       * @param dataFile the file containing data
101       * @return the checksum stored in dataFile.md5
102       */
103      public static MD5Hash readStoredMd5ForFile(File dataFile) throws IOException {
104        final File md5File = getDigestFileForFile(dataFile);
105        if (!md5File.exists()) {
106          return null;
107        }
108    
109        final Matcher matcher = readStoredMd5(md5File);
110        String storedHash = matcher.group(1);
111        File referencedFile = new File(matcher.group(2));
112    
113        // Sanity check: Make sure that the file referenced in the .md5 file at
114        // least has the same name as the file we expect
115        if (!referencedFile.getName().equals(dataFile.getName())) {
116          throw new IOException(
117              "MD5 file at " + md5File + " references file named " +
118              referencedFile.getName() + " but we expected it to reference " +
119              dataFile);
120        }
121        return new MD5Hash(storedHash);
122      }
123      
124      /**
125       * Read dataFile and compute its MD5 checksum.
126       */
127      public static MD5Hash computeMd5ForFile(File dataFile) throws IOException {
128        InputStream in = new FileInputStream(dataFile);
129        try {
130          MessageDigest digester = MD5Hash.getDigester();
131          DigestInputStream dis = new DigestInputStream(in, digester);
132          IOUtils.copyBytes(dis, new IOUtils.NullOutputStream(), 128*1024);
133          
134          return new MD5Hash(digester.digest());
135        } finally {
136          IOUtils.closeStream(in);
137        }
138      }
139    
140      /**
141       * Save the ".md5" file that lists the md5sum of another file.
142       * @param dataFile the original file whose md5 was computed
143       * @param digest the computed digest
144       * @throws IOException
145       */
146      public static void saveMD5File(File dataFile, MD5Hash digest)
147          throws IOException {
148        final String digestString = StringUtils.byteToHexString(digest.getDigest());
149        saveMD5File(dataFile, digestString);
150      }
151    
152      private static void saveMD5File(File dataFile, String digestString)
153          throws IOException {
154        File md5File = getDigestFileForFile(dataFile);
155        String md5Line = digestString + " *" + dataFile.getName() + "\n";
156    
157        AtomicFileOutputStream afos = new AtomicFileOutputStream(md5File);
158        afos.write(md5Line.getBytes(Charsets.UTF_8));
159        afos.close();
160    
161        if (LOG.isDebugEnabled()) {
162          LOG.debug("Saved MD5 " + digestString + " to " + md5File);
163        }
164      }
165    
166      public static void renameMD5File(File oldDataFile, File newDataFile)
167          throws IOException {
168        final File fromFile = getDigestFileForFile(oldDataFile);
169        if (!fromFile.exists()) {
170          throw new FileNotFoundException(fromFile + " does not exist.");
171        }
172    
173        final String digestString = readStoredMd5(fromFile).group(1);
174        saveMD5File(newDataFile, digestString);
175    
176        if (!fromFile.delete()) {
177          LOG.warn("deleting  " + fromFile.getAbsolutePath() + " FAILED");
178        }
179      }
180    
181      /**
182       * @return a reference to the file with .md5 suffix that will
183       * contain the md5 checksum for the given data file.
184       */
185      public static File getDigestFileForFile(File file) {
186        return new File(file.getParentFile(), file.getName() + MD5_SUFFIX);
187      }
188    }