001 /**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License. You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018 package org.apache.hadoop.hdfs.server.datanode;
019
020 import java.io.File;
021 import java.io.FileOutputStream;
022 import java.io.IOException;
023 import java.io.RandomAccessFile;
024
025 import org.apache.hadoop.hdfs.protocol.Block;
026 import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState;
027 import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeSpi;
028 import org.apache.hadoop.hdfs.server.datanode.fsdataset.ReplicaOutputStreams;
029 import org.apache.hadoop.io.IOUtils;
030 import org.apache.hadoop.util.DataChecksum;
031 import org.apache.hadoop.util.StringUtils;
032
033 /**
034 * This class defines a replica in a pipeline, which
035 * includes a persistent replica being written to by a dfs client or
036 * a temporary replica being replicated by a source datanode or
037 * being copied for balancing purposes.
038 *
039 * The base class implements a temporary replica
040 */
041 public class ReplicaInPipeline extends ReplicaInfo
042 implements ReplicaInPipelineInterface {
043 private long bytesAcked;
044 private long bytesOnDisk;
045 private byte[] lastChecksum;
046 private Thread writer;
047
048 /**
049 * Bytes reserved for this replica on the containing volume.
050 * Based off difference between the estimated maximum block length and
051 * the bytes already written to this block.
052 */
053 private long bytesReserved;
054
055 /**
056 * Constructor for a zero length replica
057 * @param blockId block id
058 * @param genStamp replica generation stamp
059 * @param vol volume where replica is located
060 * @param dir directory path where block and meta files are located
061 * @param bytesToReserve disk space to reserve for this replica, based on
062 * the estimated maximum block length.
063 */
064 public ReplicaInPipeline(long blockId, long genStamp,
065 FsVolumeSpi vol, File dir, long bytesToReserve) {
066 this(blockId, 0L, genStamp, vol, dir, Thread.currentThread(), bytesToReserve);
067 }
068
069 /**
070 * Constructor
071 * @param block a block
072 * @param vol volume where replica is located
073 * @param dir directory path where block and meta files are located
074 * @param writer a thread that is writing to this replica
075 */
076 ReplicaInPipeline(Block block,
077 FsVolumeSpi vol, File dir, Thread writer) {
078 this( block.getBlockId(), block.getNumBytes(), block.getGenerationStamp(),
079 vol, dir, writer, 0L);
080 }
081
082 /**
083 * Constructor
084 * @param blockId block id
085 * @param len replica length
086 * @param genStamp replica generation stamp
087 * @param vol volume where replica is located
088 * @param dir directory path where block and meta files are located
089 * @param writer a thread that is writing to this replica
090 * @param bytesToReserve disk space to reserve for this replica, based on
091 * the estimated maximum block length.
092 */
093 ReplicaInPipeline(long blockId, long len, long genStamp,
094 FsVolumeSpi vol, File dir, Thread writer, long bytesToReserve) {
095 super( blockId, len, genStamp, vol, dir);
096 this.bytesAcked = len;
097 this.bytesOnDisk = len;
098 this.writer = writer;
099 this.bytesReserved = bytesToReserve;
100 }
101
102 /**
103 * Copy constructor.
104 * @param from where to copy from
105 */
106 public ReplicaInPipeline(ReplicaInPipeline from) {
107 super(from);
108 this.bytesAcked = from.getBytesAcked();
109 this.bytesOnDisk = from.getBytesOnDisk();
110 this.writer = from.writer;
111 this.bytesReserved = from.bytesReserved;
112 }
113
114 @Override
115 public long getVisibleLength() {
116 return -1;
117 }
118
119 @Override //ReplicaInfo
120 public ReplicaState getState() {
121 return ReplicaState.TEMPORARY;
122 }
123
124 @Override // ReplicaInPipelineInterface
125 public long getBytesAcked() {
126 return bytesAcked;
127 }
128
129 @Override // ReplicaInPipelineInterface
130 public void setBytesAcked(long bytesAcked) {
131 long newBytesAcked = bytesAcked - this.bytesAcked;
132 this.bytesAcked = bytesAcked;
133
134 // Once bytes are ACK'ed we can release equivalent space from the
135 // volume's reservedForRbw count. We could have released it as soon
136 // as the write-to-disk completed but that would be inefficient.
137 getVolume().releaseReservedSpace(newBytesAcked);
138 bytesReserved -= newBytesAcked;
139 }
140
141 @Override // ReplicaInPipelineInterface
142 public long getBytesOnDisk() {
143 return bytesOnDisk;
144 }
145
146 @Override
147 public long getBytesReserved() {
148 return bytesReserved;
149 }
150
151 @Override // ReplicaInPipelineInterface
152 public synchronized void setLastChecksumAndDataLen(long dataLength, byte[] lastChecksum) {
153 this.bytesOnDisk = dataLength;
154 this.lastChecksum = lastChecksum;
155 }
156
157 @Override // ReplicaInPipelineInterface
158 public synchronized ChunkChecksum getLastChecksumAndDataLen() {
159 return new ChunkChecksum(getBytesOnDisk(), lastChecksum);
160 }
161
162 /**
163 * Set the thread that is writing to this replica
164 * @param writer a thread writing to this replica
165 */
166 public void setWriter(Thread writer) {
167 this.writer = writer;
168 }
169
170 @Override // Object
171 public boolean equals(Object o) {
172 return super.equals(o);
173 }
174
175 /**
176 * Interrupt the writing thread and wait until it dies
177 * @throws IOException the waiting is interrupted
178 */
179 public void stopWriter(long xceiverStopTimeout) throws IOException {
180 if (writer != null && writer != Thread.currentThread() && writer.isAlive()) {
181 writer.interrupt();
182 try {
183 writer.join(xceiverStopTimeout);
184 if (writer.isAlive()) {
185 final String msg = "Join on writer thread " + writer + " timed out";
186 DataNode.LOG.warn(msg + "\n" + StringUtils.getStackTrace(writer));
187 throw new IOException(msg);
188 }
189 } catch (InterruptedException e) {
190 throw new IOException("Waiting for writer thread is interrupted.");
191 }
192 }
193 }
194
195 @Override // Object
196 public int hashCode() {
197 return super.hashCode();
198 }
199
200 @Override // ReplicaInPipelineInterface
201 public ReplicaOutputStreams createStreams(boolean isCreate,
202 DataChecksum requestedChecksum) throws IOException {
203 File blockFile = getBlockFile();
204 File metaFile = getMetaFile();
205 if (DataNode.LOG.isDebugEnabled()) {
206 DataNode.LOG.debug("writeTo blockfile is " + blockFile +
207 " of size " + blockFile.length());
208 DataNode.LOG.debug("writeTo metafile is " + metaFile +
209 " of size " + metaFile.length());
210 }
211 long blockDiskSize = 0L;
212 long crcDiskSize = 0L;
213
214 // the checksum that should actually be used -- this
215 // may differ from requestedChecksum for appends.
216 final DataChecksum checksum;
217
218 RandomAccessFile metaRAF = new RandomAccessFile(metaFile, "rw");
219
220 if (!isCreate) {
221 // For append or recovery, we must enforce the existing checksum.
222 // Also, verify that the file has correct lengths, etc.
223 boolean checkedMeta = false;
224 try {
225 BlockMetadataHeader header = BlockMetadataHeader.readHeader(metaRAF);
226 checksum = header.getChecksum();
227
228 if (checksum.getBytesPerChecksum() !=
229 requestedChecksum.getBytesPerChecksum()) {
230 throw new IOException("Client requested checksum " +
231 requestedChecksum + " when appending to an existing block " +
232 "with different chunk size: " + checksum);
233 }
234
235 int bytesPerChunk = checksum.getBytesPerChecksum();
236 int checksumSize = checksum.getChecksumSize();
237
238 blockDiskSize = bytesOnDisk;
239 crcDiskSize = BlockMetadataHeader.getHeaderSize() +
240 (blockDiskSize+bytesPerChunk-1)/bytesPerChunk*checksumSize;
241 if (blockDiskSize>0 &&
242 (blockDiskSize>blockFile.length() || crcDiskSize>metaFile.length())) {
243 throw new IOException("Corrupted block: " + this);
244 }
245 checkedMeta = true;
246 } finally {
247 if (!checkedMeta) {
248 // clean up in case of exceptions.
249 IOUtils.closeStream(metaRAF);
250 }
251 }
252 } else {
253 // for create, we can use the requested checksum
254 checksum = requestedChecksum;
255 }
256
257 FileOutputStream blockOut = null;
258 FileOutputStream crcOut = null;
259 try {
260 blockOut = new FileOutputStream(
261 new RandomAccessFile( blockFile, "rw" ).getFD() );
262 crcOut = new FileOutputStream(metaRAF.getFD() );
263 if (!isCreate) {
264 blockOut.getChannel().position(blockDiskSize);
265 crcOut.getChannel().position(crcDiskSize);
266 }
267 return new ReplicaOutputStreams(blockOut, crcOut, checksum,
268 getVolume().isTransientStorage());
269 } catch (IOException e) {
270 IOUtils.closeStream(blockOut);
271 IOUtils.closeStream(metaRAF);
272 throw e;
273 }
274 }
275
276 @Override
277 public String toString() {
278 return super.toString()
279 + "\n bytesAcked=" + bytesAcked
280 + "\n bytesOnDisk=" + bytesOnDisk;
281 }
282 }