@@ -771,7 +771,7 @@ public class DFSInputStream extends FSInputStream
    * ChecksumFileSystem
    */
   private synchronized int readBuffer(ReaderStrategy reader, int len,
-      CorruptedBlocks corruptedBlocks)
+      CorruptedBlocks corruptedBlocks, final Map<InetSocketAddress, List<IOException>> exceptionMap)
       throws IOException {
     IOException ioe;
 
@@ -786,6 +786,7 @@ public class DFSInputStream extends FSInputStream
     while (true) {
       // retry as many times as seekToNewSource allows.
       try {
+        DFSClientFaultInjector.get().fetchFromDatanodeException();
         return reader.readFromBlock(blockReader, len);
       } catch (ChecksumException ce) {
         DFSClient.LOG.warn("Found Checksum error for "
@@ -796,11 +797,18 @@ public class DFSInputStream extends FSInputStream
         // we want to remember which block replicas we have tried
         corruptedBlocks.addCorruptedBlock(getCurrentBlock(), currentNode);
       } catch (IOException e) {
-        if (!retryCurrentNode) {
-          DFSClient.LOG.warn("Exception while reading from "
-              + getCurrentBlock() + " of " + src + " from "
-              + currentNode, e);
+        String msg = String.format("Failed to read block %s for file %s from datanode %s. "
+            + "Exception is %s. Retry with the current or next available datanode.",
+            getCurrentBlock().getBlockName(), src, currentNode.getXferAddr(), e);
+        DFSClient.LOG.warn(msg);
+
+        // Add the exception to exceptionMap for this datanode.
+        InetSocketAddress datanode = currentNode.getResolvedAddress();
+        if (!exceptionMap.containsKey(datanode)) {
+          exceptionMap.put(datanode, new LinkedList<IOException>());
         }
+        exceptionMap.get(datanode).add(e);
+
         ioe = e;
       }
       boolean sourceFound;
@@ -822,6 +830,29 @@ public class DFSInputStream extends FSInputStream
     }
   }
 
+  /**
+   * Send IOExceptions happened at each individual datanode to DFSClient.LOG for a failed read
+   * request. Used in both readWithStrategy() and pread(), to record the exceptions when a read
+   * request failed to be served.
+   * @param position offset in the file where we fail to read
+   * @param exceptionMap a map which stores the list of IOExceptions for each datanode
+   */
+  private void logDataNodeExceptionsOnReadError(long position, final Map<InetSocketAddress,
+      List<IOException>> exceptionMap) {
+    String msg = String.format("Failed to read from all available datanodes for file %s "
+        + "at position=%d after retrying.", src, position);
+    DFSClient.LOG.error(msg);
+    for (Map.Entry<InetSocketAddress, List<IOException>> dataNodeExceptions :
+        exceptionMap.entrySet()) {
+      List<IOException> exceptions = dataNodeExceptions.getValue();
+      for (IOException ex : exceptions) {
+        msg = String.format("Exception when fetching file %s at position=%d at datanode %s:", src,
+            position, dataNodeExceptions.getKey());
+        DFSClient.LOG.error(msg, ex);
+      }
+    }
+  }
+
   protected synchronized int readWithStrategy(ReaderStrategy strategy)
       throws IOException {
     dfsClient.checkOpen();
@@ -831,6 +862,9 @@ public class DFSInputStream extends FSInputStream
 
     int len = strategy.getTargetLength();
     CorruptedBlocks corruptedBlocks = new CorruptedBlocks();
+    // A map to record IOExceptions when fetching from each datanode. Key is the socketAddress of
+    // a datanode.
+    Map<InetSocketAddress, List<IOException>> exceptionMap = new HashMap<>();
     failures = 0;
 
     maybeRegisterBlockRefresh();
@@ -852,7 +886,7 @@ public class DFSInputStream extends FSInputStream
         }
       }
       long beginReadMS = Time.monotonicNow();
-      int result = readBuffer(strategy, realLen, corruptedBlocks);
+      int result = readBuffer(strategy, realLen, corruptedBlocks, exceptionMap);
       long readTimeMS = Time.monotonicNow() - beginReadMS;
       if (result >= 0) {
         pos += result;
@@ -880,6 +914,8 @@ public class DFSInputStream extends FSInputStream
         dfsClient.addNodeToDeadNodeDetector(this, currentNode);
       }
       if (--retries == 0) {
+        // Fail the request and log all exceptions
+        logDataNodeExceptionsOnReadError(pos, exceptionMap);
         throw e;
       }
     } finally {
@@ -1122,8 +1158,8 @@ public class DFSInputStream extends FSInputStream
     return errMsgr.toString();
   }
 
-  protected void fetchBlockByteRange(LocatedBlock block, long start, long end,
-      ByteBuffer buf, CorruptedBlocks corruptedBlocks)
+  protected void fetchBlockByteRange(LocatedBlock block, long start, long end, ByteBuffer buf,
+      CorruptedBlocks corruptedBlocks, final Map<InetSocketAddress, List<IOException>> exceptionMap)
       throws IOException {
     while (true) {
       DNAddrPair addressPair = chooseDataNode(block, null);
@@ -1131,7 +1167,7 @@ public class DFSInputStream extends FSInputStream
       block = addressPair.block;
       try {
         actualGetFromOneDataNode(addressPair, start, end, buf,
-            corruptedBlocks);
+            corruptedBlocks, exceptionMap);
         return;
       } catch (IOException e) {
         checkInterrupted(e); // check if the read has been interrupted
@@ -1142,15 +1178,15 @@ public class DFSInputStream extends FSInputStream
   }
 
   private Callable<ByteBuffer> getFromOneDataNode(final DNAddrPair datanode,
-      final LocatedBlock block, final long start, final long end,
+      final long start, final long end,
       final ByteBuffer bb,
       final CorruptedBlocks corruptedBlocks,
-      final int hedgedReadId) {
+      final Map<InetSocketAddress, List<IOException>> exceptionMap) {
     return new Callable<ByteBuffer>() {
       @Override
       public ByteBuffer call() throws Exception {
         DFSClientFaultInjector.get().sleepBeforeHedgedGet();
-        actualGetFromOneDataNode(datanode, start, end, bb, corruptedBlocks);
+        actualGetFromOneDataNode(datanode, start, end, bb, corruptedBlocks, exceptionMap);
         return bb;
       }
     };
@@ -1167,7 +1203,8 @@ public class DFSInputStream extends FSInputStream
    * block replica
    */
   void actualGetFromOneDataNode(final DNAddrPair datanode, final long startInBlk,
-      final long endInBlk, ByteBuffer buf, CorruptedBlocks corruptedBlocks)
+      final long endInBlk, ByteBuffer buf, CorruptedBlocks corruptedBlocks,
+      final Map<InetSocketAddress, List<IOException>> exceptionMap)
       throws IOException {
     DFSClientFaultInjector.get().startFetchFromDatanode();
     int refetchToken = 1; // only need to get a new access token once
@@ -1236,9 +1273,16 @@ public class DFSInputStream extends FSInputStream
             // ignore IOE, since we can retry it later in a loop
           }
         } else {
-          String msg = "Failed to connect to " + datanode.addr + " for file "
-              + src + " for block " + block.getBlock() + ":" + e;
-          DFSClient.LOG.warn("Connection failure: " + msg, e);
+          String msg = String.format("Failed to read block %s for file %s from datanode %s. "
+              + "Exception is %s. Retry with the next available datanode.",
+              block.getBlock().getBlockName(), src, datanode.addr, e);
+          DFSClient.LOG.warn(msg);
+
+          // Add the exception to the exceptionMap
+          if (!exceptionMap.containsKey(datanode.addr)) {
+            exceptionMap.put(datanode.addr, new LinkedList<IOException>());
+          }
+          exceptionMap.get(datanode.addr).add(e);
           addToLocalDeadNodes(datanode.info);
           dfsClient.addNodeToDeadNodeDetector(this, datanode.info);
           throw new IOException(msg);
@@ -1270,9 +1314,9 @@ public class DFSInputStream extends FSInputStream
    * 'hedged' read if the first read is taking longer than configured amount of
    * time. We then wait on which ever read returns first.
    */
-  private void hedgedFetchBlockByteRange(LocatedBlock block, long start,
-      long end, ByteBuffer buf, CorruptedBlocks corruptedBlocks)
-      throws IOException {
+  private void hedgedFetchBlockByteRange(LocatedBlock block, long start, long end, ByteBuffer buf,
+      CorruptedBlocks corruptedBlocks,
+      final Map<InetSocketAddress, List<IOException>> exceptionMap) throws IOException {
     final DfsClientConf conf = dfsClient.getConf();
     ArrayList<Future<ByteBuffer>> futures = new ArrayList<>();
     CompletionService<ByteBuffer> hedgedService =
@@ -1280,7 +1324,6 @@ public class DFSInputStream extends FSInputStream
     ArrayList<DatanodeInfo> ignored = new ArrayList<>();
     ByteBuffer bb;
     int len = (int) (end - start + 1);
-    int hedgedReadId = 0;
     while (true) {
       // see HDFS-6591, this metric is used to verify/catch unnecessary loops
       hedgedReadOpsLoopNumForTesting++;
@@ -1293,9 +1336,8 @@ public class DFSInputStream extends FSInputStream
         // Latest block, if refreshed internally
         block = chosenNode.block;
         bb = ByteBuffer.allocate(len);
-        Callable<ByteBuffer> getFromDataNodeCallable = getFromOneDataNode(
-            chosenNode, block, start, end, bb,
-            corruptedBlocks, hedgedReadId++);
+        Callable<ByteBuffer> getFromDataNodeCallable =
+            getFromOneDataNode(chosenNode, start, end, bb, corruptedBlocks, exceptionMap);
         Future<ByteBuffer> firstRequest = hedgedService
             .submit(getFromDataNodeCallable);
         futures.add(firstRequest);
@@ -1335,8 +1377,7 @@ public class DFSInputStream extends FSInputStream
           block = chosenNode.block;
           bb = ByteBuffer.allocate(len);
           Callable<ByteBuffer> getFromDataNodeCallable =
-              getFromOneDataNode(chosenNode, block, start, end, bb,
-                  corruptedBlocks, hedgedReadId++);
+              getFromOneDataNode(chosenNode, start, end, bb, corruptedBlocks, exceptionMap);
           Future<ByteBuffer> oneMoreRequest =
               hedgedService.submit(getFromDataNodeCallable);
           futures.add(oneMoreRequest);
@@ -1486,6 +1527,11 @@ public class DFSInputStream extends FSInputStream
     List<LocatedBlock> blockRange = getBlockRange(position, realLen);
     int remaining = realLen;
     CorruptedBlocks corruptedBlocks = new CorruptedBlocks();
+    // A map to record all IOExceptions happened at each datanode when fetching a block.
+    // In HDFS-17332, we worked on populating this map only for DFSInputStream, but not for
+    // DFSStripedInputStream. If you need the same function for DFSStripedInputStream, please
+    // work on it yourself (fetchBlockByteRange() in DFSStripedInputStream).
+    Map<InetSocketAddress, List<IOException>> exceptionMap = new HashMap<>();
     for (LocatedBlock blk : blockRange) {
       long targetStart = position - blk.getStartOffset();
       int bytesToRead = (int) Math.min(remaining,
@@ -1494,11 +1540,17 @@ public class DFSInputStream extends FSInputStream
       try {
         if (dfsClient.isHedgedReadsEnabled() && !blk.isStriped()) {
           hedgedFetchBlockByteRange(blk, targetStart,
-              targetEnd, buffer, corruptedBlocks);
+              targetEnd, buffer, corruptedBlocks, exceptionMap);
         } else {
           fetchBlockByteRange(blk, targetStart, targetEnd,
-              buffer, corruptedBlocks);
+              buffer, corruptedBlocks, exceptionMap);
         }
+      } catch (IOException e) {
+        // When we reach here, it means we fail to fetch the current block from all available
+        // datanodes. Send IOExceptions in exceptionMap to the log and rethrow the exception to
+        // fail this request.
+        logDataNodeExceptionsOnReadError(position, exceptionMap);
+        throw e;
       } finally {
         // Check and report if any block replicas are corrupted.
         // BlockMissingException may be caught if all block replicas are
@@ -1507,6 +1559,8 @@ public class DFSInputStream extends FSInputStream
             false);
       }
 
+      // Reset exceptionMap before fetching the next block.
+      exceptionMap.clear();
       remaining -= bytesToRead;
       position += bytesToRead;
     }
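The patch above threads a per-datanode `exceptionMap` through every non-striped read path (`readWithStrategy()`, `pread()`, and the hedged reads), buffering each failure and flushing them to the log only once the whole read has failed. Below is a minimal, self-contained sketch of that aggregation pattern; `ReplicaReader`, `readWithFallback`, and the node list are hypothetical stand-ins for illustration, not the HDFS client API.

```java
import java.io.IOException;
import java.net.InetSocketAddress;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

/** Sketch: retry a read across replicas, remembering every failure per node. */
public class PerNodeExceptionDemo {

  /** Hypothetical stand-in for a single-replica read attempt. */
  interface ReplicaReader {
    int read(InetSocketAddress node) throws IOException;
  }

  static int readWithFallback(List<InetSocketAddress> replicas, ReplicaReader reader)
      throws IOException {
    // Key: datanode address. Value: every IOException seen while reading from it.
    Map<InetSocketAddress, List<IOException>> exceptionMap = new HashMap<>();
    for (InetSocketAddress node : replicas) {
      try {
        return reader.read(node);  // success: the buffered exceptions are never logged
      } catch (IOException e) {
        // Same containsKey/put shape as the patch; computeIfAbsent would also work.
        if (!exceptionMap.containsKey(node)) {
          exceptionMap.put(node, new LinkedList<IOException>());
        }
        exceptionMap.get(node).add(e);
      }
    }
    // All replicas failed: now (and only now) surface every recorded exception.
    for (Map.Entry<InetSocketAddress, List<IOException>> entry : exceptionMap.entrySet()) {
      for (IOException ex : entry.getValue()) {
        System.err.println("Read failed at " + entry.getKey() + ": " + ex);
      }
    }
    throw new IOException("Failed to read from all " + replicas.size() + " replicas");
  }

  public static void main(String[] args) {
    List<InetSocketAddress> replicas = new ArrayList<>();
    replicas.add(InetSocketAddress.createUnresolved("dn1", 9866));
    replicas.add(InetSocketAddress.createUnresolved("dn2", 9866));
    try {
      readWithFallback(replicas, node -> {
        throw new IOException("connection refused: " + node);
      });
    } catch (IOException expected) {
      System.err.println(expected.getMessage());
    }
  }
}
```

Buffering exceptions this way keeps successful retries quiet; the complete per-node failure history is emitted only when the read actually fails, which is the behavior the `logDataNodeExceptionsOnReadError()` helper adds to `DFSInputStream`.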