|
@@ -34,7 +34,9 @@ import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
|
|
|
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
|
|
|
import org.apache.hadoop.hdfs.protocol.datatransfer.PacketHeader;
|
|
|
import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsDatasetSpi;
|
|
|
+import org.apache.hadoop.hdfs.util.DataTransferThrottler;
|
|
|
import org.apache.hadoop.io.IOUtils;
|
|
|
+import org.apache.hadoop.io.nativeio.NativeIO;
|
|
|
import org.apache.hadoop.net.SocketOutputStream;
|
|
|
import org.apache.hadoop.util.DataChecksum;
|
|
|
import org.apache.hadoop.util.StringUtils;
|
|
@@ -56,8 +58,10 @@ public class RaidBlockSender implements java.io.Closeable {
|
|
|
private DataInputStream checksumIn; // checksum datastream
|
|
|
private DataChecksum checksum; // checksum stream
|
|
|
private long offset; // starting position to read
|
|
|
+ /** Initial position to read */
|
|
|
+ private long initialOffset;
|
|
|
private long endOffset; // ending position
|
|
|
- private int bytesPerChecksum; // chunk size
|
|
|
+ private int chunkSize; // chunk size
|
|
|
private int checksumSize; // checksum size
|
|
|
private boolean corruptChecksumOk; // if need to verify checksum
|
|
|
private boolean chunkOffsetOK; // if need to send chunk offset
|
|
@@ -74,6 +78,8 @@ public class RaidBlockSender implements java.io.Closeable {
|
|
|
* not sure if there will be much more improvement.
|
|
|
*/
|
|
|
private static final int MIN_BUFFER_WITH_TRANSFERTO = 64*1024;
|
|
|
+ private static final int TRANSFERTO_BUFFER_SIZE = Math.max(
|
|
|
+ HdfsConstants.IO_FILE_BUFFER_SIZE, MIN_BUFFER_WITH_TRANSFERTO);
|
|
|
private volatile ChunkChecksum lastChunkChecksum = null;
|
|
|
|
|
|
|
|
@@ -125,12 +131,13 @@ public class RaidBlockSender implements java.io.Closeable {
|
|
|
* is mostly corrupted. For now just truncate bytesPerchecksum to
|
|
|
* blockLength.
|
|
|
*/
|
|
|
- bytesPerChecksum = checksum.getBytesPerChecksum();
|
|
|
- if (bytesPerChecksum > 10*1024*1024 && bytesPerChecksum > replicaVisibleLength) {
|
|
|
+ int size = checksum.getBytesPerChecksum();
|
|
|
+ if (size > 10*1024*1024 && size > replicaVisibleLength) {
|
|
|
checksum = DataChecksum.newDataChecksum(checksum.getChecksumType(),
|
|
|
Math.max((int)replicaVisibleLength, 10*1024*1024));
|
|
|
- bytesPerChecksum = checksum.getBytesPerChecksum();
|
|
|
+ size = checksum.getBytesPerChecksum();
|
|
|
}
|
|
|
+ chunkSize = size;
|
|
|
checksumSize = checksum.getChecksumSize();
|
|
|
|
|
|
if (length < 0) {
|
|
@@ -147,12 +154,12 @@ public class RaidBlockSender implements java.io.Closeable {
|
|
|
throw new IOException(msg);
|
|
|
}
|
|
|
|
|
|
- offset = (startOffset - (startOffset % bytesPerChecksum));
|
|
|
+ offset = (startOffset - (startOffset % chunkSize));
|
|
|
if (length >= 0) {
|
|
|
// Make sure endOffset points to end of a checksumed chunk.
|
|
|
long tmpLen = startOffset + length;
|
|
|
- if (tmpLen % bytesPerChecksum != 0) {
|
|
|
- tmpLen += (bytesPerChecksum - tmpLen % bytesPerChecksum);
|
|
|
+ if (tmpLen % chunkSize != 0) {
|
|
|
+ tmpLen += (chunkSize - tmpLen % chunkSize);
|
|
|
}
|
|
|
if (tmpLen < endOffset) {
|
|
|
// will use on-disk checksum here since the end is a stable chunk
|
|
@@ -162,7 +169,7 @@ public class RaidBlockSender implements java.io.Closeable {
|
|
|
|
|
|
// seek to the right offsets
|
|
|
if (offset > 0) {
|
|
|
- long checksumSkip = (offset / bytesPerChecksum) * checksumSize;
|
|
|
+ long checksumSkip = (offset / chunkSize) * checksumSize;
|
|
|
// note blockInStream is seeked when created below
|
|
|
if (checksumSkip > 0) {
|
|
|
// Should we use seek() for checksum file as well?
|
|
@@ -178,7 +185,7 @@ public class RaidBlockSender implements java.io.Closeable {
|
|
|
throw ioe;
|
|
|
}
|
|
|
}
|
|
|
-
|
|
|
+
|
|
|
/**
|
|
|
* close opened files.
|
|
|
*/
|
|
@@ -227,57 +234,85 @@ public class RaidBlockSender implements java.io.Closeable {
|
|
|
// otherwise just return the same exception.
|
|
|
return ioe;
|
|
|
}
|
|
|
-
|
|
|
+
|
|
|
/**
|
|
|
- * Sends upto maxChunks chunks of data.
|
|
|
- *
|
|
|
- * When blockInPosition is >= 0, assumes 'out' is a
|
|
|
- * {@link SocketOutputStream} and tries
|
|
|
- * {@link SocketOutputStream#transferToFully(FileChannel, long, int)} to
|
|
|
- * send data (and updates blockInPosition).
|
|
|
+ * @param datalen Length of data
|
|
|
+ * @return number of chunks for data of given size
|
|
|
*/
|
|
|
- private int sendChunks(ByteBuffer pkt, int maxChunks, OutputStream out)
|
|
|
- throws IOException {
|
|
|
- // Sends multiple chunks in one packet with a single write().
|
|
|
-
|
|
|
- int len = (int) Math.min(endOffset - offset,
|
|
|
- (((long) bytesPerChecksum) * ((long) maxChunks)));
|
|
|
- int numChunks = (len + bytesPerChecksum - 1)/bytesPerChecksum;
|
|
|
- int packetLen = len + numChunks*checksumSize + 4;
|
|
|
- boolean lastDataPacket = offset + len == endOffset && len > 0;
|
|
|
+  private int numberOfChunks(long datalen) {
+    // Ceiling division: a trailing partial chunk still counts as one chunk.
+    final long roundedUp = datalen + chunkSize - 1;
+    return (int) (roundedUp / chunkSize);
+  }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * Write packet header into {@code pkt}
|
|
|
+ */
|
|
|
+ private void writePacketHeader(ByteBuffer pkt, int dataLen, int packetLen) {
|
|
|
pkt.clear();
|
|
|
-
|
|
|
-
|
|
|
- PacketHeader header = new PacketHeader(
|
|
|
- packetLen, offset, seqno, (len == 0), len);
|
|
|
+ PacketHeader header = new PacketHeader(packetLen, offset, seqno,
|
|
|
+ (dataLen == 0), dataLen, false);
|
|
|
header.putInBuffer(pkt);
|
|
|
+ }
|
|
|
+
|
|
|
+  /**
+   * Reads stored checksum bytes from the checksum stream into {@code buf}.
+   *
+   * Returns without reading when {@code checksumSize} is not positive or
+   * when {@code checksumIn} is null. If the read fails, the checksum stream
+   * is closed and set to null; then, when {@code corruptChecksumOk} is set,
+   * the checksum region of {@code buf} is zero-filled, otherwise the
+   * exception is rethrown.
+   *
+   * @param buf buffer to read the checksum into
+   * @param checksumOffset offset at which to write the checksum into buf
+   * @param checksumLen length of checksum to write
+   * @throws IOException if the read fails and {@code corruptChecksumOk} is
+   *           not set
+   */
+  private void readChecksum(byte[] buf, final int checksumOffset,
+      final int checksumLen) throws IOException {
+    // Either condition alone means there is nothing to read; testing them
+    // with "||" also guarantees checksumIn is non-null below.
+    if (checksumSize <= 0 || checksumIn == null) {
+      return;
+    }
+    try {
+      checksumIn.readFully(buf, checksumOffset, checksumLen);
+    } catch (IOException e) {
+      LOG.warn(" Could not read or failed to veirfy checksum for data"
+          + " at offset " + offset + " for block " + block, e);
+      IOUtils.closeStream(checksumIn);
+      checksumIn = null;
+      if (!corruptChecksumOk) {
+        throw e;
+      }
+      if (checksumLen > 0) {
+        // Zero the whole checksum region. Arrays.fill takes an exclusive end
+        // index, not a length, hence checksumOffset + checksumLen.
+        Arrays.fill(buf, checksumOffset, checksumOffset + checksumLen,
+            (byte) 0);
+      }
+    }
+  }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * Sends a packet with up to maxChunks chunks of data.
|
|
|
+ *
|
|
|
+ * @param pkt buffer used for writing packet data
|
|
|
+ * @param maxChunks maximum number of chunks to send
|
|
|
+ * @param out stream to send data to
|
|
|
+ * @param transferTo use transferTo to send data
|
|
|
+ * @param throttler used for throttling data transfer bandwidth
|
|
|
+ */
|
|
|
+ private int sendPacket(ByteBuffer pkt, int maxChunks, OutputStream out,
|
|
|
+ boolean transferTo, DataTransferThrottler throttler) throws IOException {
|
|
|
+ int dataLen = (int) Math.min(endOffset - offset,
|
|
|
+ (chunkSize * (long) maxChunks));
|
|
|
+
|
|
|
+ int numChunks = numberOfChunks(dataLen); // Number of chunks be sent in the packet
|
|
|
+ int checksumDataLen = numChunks * checksumSize;
|
|
|
+ int packetLen = dataLen + checksumDataLen + 4;
|
|
|
+ boolean lastDataPacket = offset + dataLen == endOffset && dataLen > 0;
|
|
|
+
|
|
|
+ writePacketHeader(pkt, dataLen, packetLen);
|
|
|
|
|
|
int checksumOff = pkt.position();
|
|
|
- int checksumLen = numChunks * checksumSize;
|
|
|
byte[] buf = pkt.array();
|
|
|
|
|
|
if (checksumSize > 0 && checksumIn != null) {
|
|
|
- try {
|
|
|
- checksumIn.readFully(buf, checksumOff, checksumLen);
|
|
|
- } catch (IOException e) {
|
|
|
- LOG.warn(" Could not read or failed to veirfy checksum for data" +
|
|
|
- " at offset " + offset + " for block " + block + " got : "
|
|
|
- + StringUtils.stringifyException(e));
|
|
|
- IOUtils.closeStream(checksumIn);
|
|
|
- checksumIn = null;
|
|
|
- if (corruptChecksumOk) {
|
|
|
- if (checksumOff < checksumLen) {
|
|
|
- // Just fill the array with zeros.
|
|
|
- Arrays.fill(buf, checksumOff, checksumLen, (byte) 0);
|
|
|
- }
|
|
|
- } else {
|
|
|
- throw e;
|
|
|
- }
|
|
|
- }
|
|
|
+ readChecksum(buf, checksumOff, checksumDataLen);
|
|
|
|
|
|
// write in progress that we need to use to get last checksum
|
|
|
if (lastDataPacket && lastChunkChecksum != null) {
|
|
|
- int start = checksumOff + checksumLen - checksumSize;
|
|
|
+ int start = checksumOff + checksumDataLen - checksumSize;
|
|
|
byte[] updatedChecksum = lastChunkChecksum.getChecksum();
|
|
|
|
|
|
if (updatedChecksum != null) {
|
|
@@ -286,61 +321,85 @@ public class RaidBlockSender implements java.io.Closeable {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- int dataOff = checksumOff + checksumLen;
|
|
|
-
|
|
|
- if (blockInPosition < 0) {
|
|
|
- //normal transfer
|
|
|
- IOUtils.readFully(blockIn, buf, dataOff, len);
|
|
|
+ int dataOff = checksumOff + checksumDataLen;
|
|
|
+ if (!transferTo) { // normal transfer
|
|
|
+ IOUtils.readFully(blockIn, buf, dataOff, dataLen);
|
|
|
|
|
|
if (verifyChecksum) {
|
|
|
- int dOff = dataOff;
|
|
|
- int cOff = checksumOff;
|
|
|
- int dLeft = len;
|
|
|
-
|
|
|
- for (int i=0; i<numChunks; i++) {
|
|
|
- checksum.reset();
|
|
|
- int dLen = Math.min(dLeft, bytesPerChecksum);
|
|
|
- checksum.update(buf, dOff, dLen);
|
|
|
- if (!checksum.compare(buf, cOff)) {
|
|
|
- long failedPos = offset + len -dLeft;
|
|
|
- throw new ChecksumException("Checksum failed at " +
|
|
|
- failedPos, failedPos);
|
|
|
- }
|
|
|
- dLeft -= dLen;
|
|
|
- dOff += dLen;
|
|
|
- cOff += checksumSize;
|
|
|
- }
|
|
|
+ verifyChecksum(buf, dataOff, dataLen, numChunks, checksumOff);
|
|
|
}
|
|
|
- //writing is done below (mainly to handle IOException)
|
|
|
}
|
|
|
|
|
|
try {
|
|
|
- if (blockInPosition >= 0) {
|
|
|
- //use transferTo(). Checks on out and blockIn are already done.
|
|
|
-
|
|
|
+ if (transferTo) {
|
|
|
SocketOutputStream sockOut = (SocketOutputStream)out;
|
|
|
- //first write the packet
|
|
|
- sockOut.write(buf, 0, dataOff);
|
|
|
+ sockOut.write(buf, 0, dataOff); // First write checksum
|
|
|
+
|
|
|
// no need to flush. since we know out is not a buffered stream.
|
|
|
-
|
|
|
sockOut.transferToFully(((FileInputStream)blockIn).getChannel(),
|
|
|
- blockInPosition, len);
|
|
|
-
|
|
|
- blockInPosition += len;
|
|
|
- } else {
|
|
|
+ blockInPosition, dataLen);
|
|
|
+ blockInPosition += dataLen;
|
|
|
+ } else {
|
|
|
// normal transfer
|
|
|
- out.write(buf, 0, dataOff + len);
|
|
|
+ out.write(buf, 0, dataOff + dataLen);
|
|
|
}
|
|
|
-
|
|
|
} catch (IOException e) {
|
|
|
- /* exception while writing to the client (well, with transferTo(),
|
|
|
- * it could also be while reading from the local file).
|
|
|
+ /* Exception while writing to the client. Connection closure from
|
|
|
+ * the other end is mostly the case and we do not care much about
|
|
|
+ * it. But other things can go wrong, especially in transferTo(),
|
|
|
+ * which we do not want to ignore.
|
|
|
+ *
|
|
|
+ * The message parsing below should not be considered as a good
|
|
|
+ * coding example. NEVER do it to drive a program logic. NEVER.
|
|
|
+ * It was done here because the NIO throws an IOException for EPIPE.
|
|
|
*/
|
|
|
+ String ioem = e.getMessage();
|
|
|
+ if (!ioem.startsWith("Broken pipe") && !ioem.startsWith("Connection reset")) {
|
|
|
+ LOG.error("BlockSender.sendChunks() exception: ", e);
|
|
|
+ }
|
|
|
throw ioeToSocketException(e);
|
|
|
}
|
|
|
|
|
|
- return len;
|
|
|
+ if (throttler != null) { // rebalancing so throttle
|
|
|
+ throttler.throttle(packetLen);
|
|
|
+ }
|
|
|
+
|
|
|
+ return dataLen;
|
|
|
}
|
|
|
+
|
|
|
+  /**
+   * Recomputes the checksum of each chunk of data held in {@code buf} and
+   * compares it against the checksum stored in the same buffer for that
+   * chunk.
+   *
+   * @param buf buffer that has checksum and data
+   * @param dataOffset position where data is written in the buf
+   * @param datalen length of data
+   * @param numChunks number of chunks corresponding to data
+   * @param checksumOffset offset where checksum is written in the buf
+   * @throws ChecksumException if a chunk's computed checksum does not match
+   *           the stored one; carries the position of the failing chunk
+   */
+  public void verifyChecksum(final byte[] buf, final int dataOffset,
+      final int datalen, final int numChunks, final int checksumOffset)
+      throws ChecksumException {
+    int remaining = datalen;
+    for (int chunk = 0; chunk < numChunks; chunk++) {
+      // Bytes already verified; also the chunk's offset within the data.
+      final int consumed = datalen - remaining;
+      checksum.reset();
+      // The final chunk may be shorter than chunkSize.
+      final int thisLen = Math.min(remaining, chunkSize);
+      checksum.update(buf, dataOffset + consumed, thisLen);
+      if (!checksum.compare(buf, checksumOffset + chunk * checksumSize)) {
+        final long failedPos = offset + consumed;
+        throw new ChecksumException("Checksum failed at " + failedPos,
+            failedPos);
+      }
+      remaining -= thisLen;
+    }
+  }
|
|
|
+
|
|
|
|
|
|
/**
|
|
|
* sendBlock() is used to read block and its metadata and stream the data to
|
|
@@ -356,79 +415,61 @@ public class RaidBlockSender implements java.io.Closeable {
|
|
|
*/
|
|
|
public long sendBlock(DataOutputStream out, OutputStream baseStream)
|
|
|
throws IOException {
|
|
|
- if( out == null ) {
|
|
|
+ if (out == null) {
|
|
|
throw new IOException( "out stream is null" );
|
|
|
}
|
|
|
-
|
|
|
- long initialOffset = offset;
|
|
|
+ initialOffset = offset;
|
|
|
long totalRead = 0;
|
|
|
OutputStream streamForSendChunks = out;
|
|
|
|
|
|
final long startTime = ClientTraceLog.isInfoEnabled() ? System.nanoTime() : 0;
|
|
|
try {
|
|
|
- try {
|
|
|
- checksum.writeHeader(out);
|
|
|
- if ( chunkOffsetOK ) {
|
|
|
- out.writeLong( offset );
|
|
|
- }
|
|
|
- out.flush();
|
|
|
- } catch (IOException e) { //socket error
|
|
|
- throw ioeToSocketException(e);
|
|
|
- }
|
|
|
-
|
|
|
int maxChunksPerPacket;
|
|
|
int pktSize = PacketHeader.PKT_HEADER_LEN;
|
|
|
-
|
|
|
- if (transferToAllowed && !verifyChecksum &&
|
|
|
- baseStream instanceof SocketOutputStream &&
|
|
|
- blockIn instanceof FileInputStream) {
|
|
|
-
|
|
|
+ boolean transferTo = transferToAllowed && !verifyChecksum
|
|
|
+ && baseStream instanceof SocketOutputStream
|
|
|
+ && blockIn instanceof FileInputStream;
|
|
|
+ if (transferTo) {
|
|
|
FileChannel fileChannel = ((FileInputStream)blockIn).getChannel();
|
|
|
-
|
|
|
- // blockInPosition also indicates sendChunks() uses transferTo.
|
|
|
blockInPosition = fileChannel.position();
|
|
|
streamForSendChunks = baseStream;
|
|
|
+ maxChunksPerPacket = numberOfChunks(TRANSFERTO_BUFFER_SIZE);
|
|
|
|
|
|
- // assure a mininum buffer size.
|
|
|
- maxChunksPerPacket = (Math.max(HdfsConstants.IO_FILE_BUFFER_SIZE,
|
|
|
- MIN_BUFFER_WITH_TRANSFERTO)
|
|
|
- + bytesPerChecksum - 1)/bytesPerChecksum;
|
|
|
-
|
|
|
- // allocate smaller buffer while using transferTo().
|
|
|
+ // Smaller packet size to only hold checksum when doing transferTo
|
|
|
pktSize += checksumSize * maxChunksPerPacket;
|
|
|
} else {
|
|
|
maxChunksPerPacket = Math.max(1,
|
|
|
- (HdfsConstants.IO_FILE_BUFFER_SIZE + bytesPerChecksum - 1)/bytesPerChecksum);
|
|
|
- pktSize += (bytesPerChecksum + checksumSize) * maxChunksPerPacket;
|
|
|
+ numberOfChunks(HdfsConstants.IO_FILE_BUFFER_SIZE));
|
|
|
+ // Packet size includes both checksum and data
|
|
|
+ pktSize += (chunkSize + checksumSize) * maxChunksPerPacket;
|
|
|
}
|
|
|
|
|
|
ByteBuffer pktBuf = ByteBuffer.allocate(pktSize);
|
|
|
|
|
|
while (endOffset > offset) {
|
|
|
- long len = sendChunks(pktBuf, maxChunksPerPacket,
|
|
|
- streamForSendChunks);
|
|
|
+ long len = sendPacket(pktBuf, maxChunksPerPacket, streamForSendChunks,
|
|
|
+ transferTo, null);
|
|
|
offset += len;
|
|
|
- totalRead += len + ((len + bytesPerChecksum - 1)/bytesPerChecksum*
|
|
|
- checksumSize);
|
|
|
+ totalRead += len + (numberOfChunks(len) * checksumSize);
|
|
|
seqno++;
|
|
|
}
|
|
|
try {
|
|
|
// send an empty packet to mark the end of the block
|
|
|
- sendChunks(pktBuf, maxChunksPerPacket, streamForSendChunks);
|
|
|
+ sendPacket(pktBuf, maxChunksPerPacket, streamForSendChunks, transferTo,
|
|
|
+ null);
|
|
|
out.flush();
|
|
|
} catch (IOException e) { //socket error
|
|
|
throw ioeToSocketException(e);
|
|
|
}
|
|
|
+ blockReadFully = true;
|
|
|
} finally {
|
|
|
if (clientTraceFmt != null) {
|
|
|
final long endTime = System.nanoTime();
|
|
|
- ClientTraceLog.info(String.format(clientTraceFmt, totalRead, initialOffset, endTime - startTime));
|
|
|
+ ClientTraceLog.info(String.format(clientTraceFmt, totalRead,
|
|
|
+ initialOffset, endTime - startTime));
|
|
|
}
|
|
|
close();
|
|
|
}
|
|
|
-
|
|
|
- blockReadFully = initialOffset == 0 && offset >= replicaVisibleLength;
|
|
|
-
|
|
|
return totalRead;
|
|
|
}
|
|
|
|
|
@@ -440,6 +481,13 @@ public class RaidBlockSender implements java.io.Closeable {
|
|
|
public InputStream createStream(long offset) throws IOException;
|
|
|
}
|
|
|
|
|
|
+  /**
+   * @return the {@link DataChecksum} held by this sender for the block
+   *         transfer.
+   */
+  public DataChecksum getChecksum() {
+    return this.checksum;
+  }
|
|
|
+
|
|
|
private static class BlockInputStreamFactory implements InputStreamFactory {
|
|
|
private final ExtendedBlock block;
|
|
|
private final FsDatasetSpi<?> data;
|