|
@@ -46,12 +46,14 @@ import java.util.regex.Matcher;
|
|
|
import java.util.regex.Pattern;
|
|
|
|
|
|
import org.apache.hadoop.conf.Configuration;
|
|
|
+import org.apache.hadoop.hdfs.DFSConfigKeys;
|
|
|
import org.apache.hadoop.hdfs.NameNodeProxiesClient.ProxyAndInfo;
|
|
|
import org.apache.hadoop.hdfs.client.HdfsClientConfigKeys;
|
|
|
import org.apache.hadoop.hdfs.protocol.ClientProtocol;
|
|
|
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
|
|
|
import org.apache.hadoop.hdfs.server.federation.resolver.ActiveNamenodeResolver;
|
|
|
import org.apache.hadoop.hdfs.server.federation.resolver.FederationNamenodeContext;
|
|
|
+import org.apache.hadoop.hdfs.server.federation.resolver.FederationNamenodeServiceState;
|
|
|
import org.apache.hadoop.hdfs.server.federation.resolver.RemoteLocation;
|
|
|
import org.apache.hadoop.io.retry.RetryPolicies;
|
|
|
import org.apache.hadoop.io.retry.RetryPolicy;
|
|
@@ -123,10 +125,14 @@ public class RouterRpcClient {
|
|
|
this.connectionManager = new ConnectionManager(conf);
|
|
|
this.connectionManager.start();
|
|
|
|
|
|
+ int numThreads = conf.getInt(
|
|
|
+ DFSConfigKeys.DFS_ROUTER_CLIENT_THREADS_SIZE,
|
|
|
+ DFSConfigKeys.DFS_ROUTER_CLIENT_THREADS_SIZE_DEFAULT);
|
|
|
ThreadFactory threadFactory = new ThreadFactoryBuilder()
|
|
|
.setNameFormat("RPC Router Client-%d")
|
|
|
.build();
|
|
|
- this.executorService = Executors.newCachedThreadPool(threadFactory);
|
|
|
+ this.executorService = Executors.newFixedThreadPool(
|
|
|
+ numThreads, threadFactory);
|
|
|
|
|
|
this.rpcMonitor = monitor;
|
|
|
|
|
@@ -134,8 +140,8 @@ public class RouterRpcClient {
|
|
|
HdfsClientConfigKeys.Failover.MAX_ATTEMPTS_KEY,
|
|
|
HdfsClientConfigKeys.Failover.MAX_ATTEMPTS_DEFAULT);
|
|
|
int maxRetryAttempts = conf.getInt(
|
|
|
- HdfsClientConfigKeys.Retry.MAX_ATTEMPTS_KEY,
|
|
|
- HdfsClientConfigKeys.Retry.MAX_ATTEMPTS_DEFAULT);
|
|
|
+ DFSConfigKeys.DFS_ROUTER_CLIENT_MAX_ATTEMPTS,
|
|
|
+ DFSConfigKeys.DFS_ROUTER_CLIENT_MAX_ATTEMPTS_DEFAULT);
|
|
|
int failoverSleepBaseMillis = conf.getInt(
|
|
|
HdfsClientConfigKeys.Failover.SLEEPTIME_BASE_KEY,
|
|
|
HdfsClientConfigKeys.Failover.SLEEPTIME_BASE_DEFAULT);
|
|
@@ -274,11 +280,24 @@ public class RouterRpcClient {
|
|
|
*
|
|
|
* @param ioe IOException reported.
|
|
|
* @param retryCount Number of retries.
|
|
|
+ * @param nsId Nameservice ID.
|
|
|
* @return Retry decision.
|
|
|
- * @throws IOException Original exception if the retry policy generates one.
|
|
|
+ * @throws IOException Original exception if the retry policy generates one
|
|
|
+ * or IOException for no available namenodes.
|
|
|
*/
|
|
|
- private RetryDecision shouldRetry(final IOException ioe, final int retryCount)
|
|
|
- throws IOException {
|
|
|
+ private RetryDecision shouldRetry(final IOException ioe, final int retryCount,
|
|
|
+ final String nsId) throws IOException {
|
|
|
+ // check for the case of cluster unavailable state
|
|
|
+ if (isClusterUnAvailable(nsId)) {
|
|
|
+ // we allow to retry once if cluster is unavailable
|
|
|
+ if (retryCount == 0) {
|
|
|
+ return RetryDecision.RETRY;
|
|
|
+ } else {
|
|
|
+ throw new IOException("No namenode available under nameservice " + nsId,
|
|
|
+ ioe);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
try {
|
|
|
final RetryPolicy.RetryAction a =
|
|
|
this.retryPolicy.shouldRetry(ioe, retryCount, 0, true);
|
|
@@ -329,7 +348,7 @@ public class RouterRpcClient {
|
|
|
connection = this.getConnection(ugi, nsId, rpcAddress);
|
|
|
ProxyAndInfo<ClientProtocol> client = connection.getClient();
|
|
|
ClientProtocol proxy = client.getProxy();
|
|
|
- ret = invoke(0, method, proxy, params);
|
|
|
+ ret = invoke(nsId, 0, method, proxy, params);
|
|
|
if (failover) {
|
|
|
// Success on alternate server, update
|
|
|
InetSocketAddress address = client.getAddress();
|
|
@@ -400,6 +419,8 @@ public class RouterRpcClient {
|
|
|
* Re-throws exceptions generated by the remote RPC call as either
|
|
|
* RemoteException or IOException.
|
|
|
*
|
|
|
+ * @param nsId Identifier for the namespace
|
|
|
+ * @param retryCount Current retry times
|
|
|
* @param method Method to invoke
|
|
|
* @param obj Target object for the method
|
|
|
* @param params Variable parameters
|
|
@@ -407,8 +428,8 @@ public class RouterRpcClient {
|
|
|
* @throws IOException
|
|
|
* @throws InterruptedException
|
|
|
*/
|
|
|
- private Object invoke(int retryCount, final Method method, final Object obj,
|
|
|
- final Object... params) throws IOException {
|
|
|
+ private Object invoke(String nsId, int retryCount, final Method method,
|
|
|
+ final Object obj, final Object... params) throws IOException {
|
|
|
try {
|
|
|
return method.invoke(obj, params);
|
|
|
} catch (IllegalAccessException e) {
|
|
@@ -421,11 +442,16 @@ public class RouterRpcClient {
|
|
|
Throwable cause = e.getCause();
|
|
|
if (cause instanceof IOException) {
|
|
|
IOException ioe = (IOException) cause;
|
|
|
+
|
|
|
// Check if we should retry.
|
|
|
- RetryDecision decision = shouldRetry(ioe, retryCount);
|
|
|
+ RetryDecision decision = shouldRetry(ioe, retryCount, nsId);
|
|
|
if (decision == RetryDecision.RETRY) {
|
|
|
+ if (this.rpcMonitor != null) {
|
|
|
+ this.rpcMonitor.proxyOpRetries();
|
|
|
+ }
|
|
|
+
|
|
|
// retry
|
|
|
- return invoke(++retryCount, method, obj, params);
|
|
|
+ return invoke(nsId, ++retryCount, method, obj, params);
|
|
|
} else if (decision == RetryDecision.FAILOVER_AND_RETRY) {
|
|
|
// failover, invoker looks for standby exceptions for failover.
|
|
|
if (ioe instanceof StandbyException) {
|
|
@@ -447,6 +473,29 @@ public class RouterRpcClient {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+ /**
|
|
|
+ * Check if the cluster of given nameservice id is available.
|
|
|
+ * @param nsId nameservice ID.
|
|
|
+ * @return
|
|
|
+ * @throws IOException
|
|
|
+ */
|
|
|
+ private boolean isClusterUnAvailable(String nsId) throws IOException {
|
|
|
+ List<? extends FederationNamenodeContext> nnState = this.namenodeResolver
|
|
|
+ .getNamenodesForNameserviceId(nsId);
|
|
|
+
|
|
|
+ if (nnState != null) {
|
|
|
+ for (FederationNamenodeContext nnContext : nnState) {
|
|
|
+ // Once we find one NN is in active state, we assume this
|
|
|
+ // cluster is available.
|
|
|
+ if (nnContext.getState() == FederationNamenodeServiceState.ACTIVE) {
|
|
|
+ return false;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ return true;
|
|
|
+ }
|
|
|
+
|
|
|
/**
|
|
|
* Get a clean copy of the exception. Sometimes the exceptions returned by the
|
|
|
* server contain the full stack trace in the message.
|