Просмотр исходного кода

HDDS-806. Update Ratis to latest snapshot version in ozone. Contributed by Tsz Wo Nicholas Sze and Mukul Kumar Singh.

Shashikant Banerjee 6 лет назад
Родитель
Коммит
788cf061cd

+ 8 - 0
hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmConfigKeys.java

@@ -80,6 +80,14 @@ public final class ScmConfigKeys {
   public static final TimeDuration
       DFS_CONTAINER_RATIS_STATEMACHINEDATA_SYNC_TIMEOUT_DEFAULT =
       TimeDuration.valueOf(10, TimeUnit.SECONDS);
+  public static final String
+      DFS_CONTAINER_RATIS_STATEMACHINEDATA_SYNC_RETRIES =
+      "dfs.container.ratis.statemachinedata.sync.retries";
+  public static final int
+      DFS_CONTAINER_RATIS_STATEMACHINEDATA_SYNC_RETRIES_DEFAULT = -1;
+  public static final String DFS_CONTAINER_RATIS_LOG_QUEUE_SIZE =
+      "dfs.container.ratis.log.queue.size";
+  public static final int DFS_CONTAINER_RATIS_LOG_QUEUE_SIZE_DEFAULT = 128;
   public static final String DFS_RATIS_CLIENT_REQUEST_TIMEOUT_DURATION_KEY =
       "dfs.ratis.client.request.timeout.duration";
   public static final TimeDuration

+ 10 - 0
hadoop-hdds/common/src/main/java/org/apache/hadoop/ozone/OzoneConfigKeys.java

@@ -264,6 +264,16 @@ public final class OzoneConfigKeys {
   public static final TimeDuration
       DFS_RATIS_SERVER_RETRY_CACHE_TIMEOUT_DURATION_DEFAULT =
       ScmConfigKeys.DFS_RATIS_SERVER_RETRY_CACHE_TIMEOUT_DURATION_DEFAULT;
+  public static final String
+      DFS_CONTAINER_RATIS_STATEMACHINEDATA_SYNC_RETRIES =
+      ScmConfigKeys.DFS_CONTAINER_RATIS_STATEMACHINEDATA_SYNC_RETRIES;
+  public static final int
+      DFS_CONTAINER_RATIS_STATEMACHINEDATA_SYNC_RETRIES_DEFAULT =
+      ScmConfigKeys.DFS_CONTAINER_RATIS_STATEMACHINEDATA_SYNC_RETRIES_DEFAULT;
+  public static final String DFS_CONTAINER_RATIS_LOG_QUEUE_SIZE =
+      ScmConfigKeys.DFS_CONTAINER_RATIS_LOG_QUEUE_SIZE;
+  public static final int DFS_CONTAINER_RATIS_LOG_QUEUE_SIZE_DEFAULT =
+      ScmConfigKeys.DFS_CONTAINER_RATIS_LOG_QUEUE_SIZE_DEFAULT;
   public static final String DFS_RATIS_SERVER_REQUEST_TIMEOUT_DURATION_KEY =
       ScmConfigKeys.DFS_RATIS_SERVER_REQUEST_TIMEOUT_DURATION_KEY;
   public static final TimeDuration

+ 15 - 0
hadoop-hdds/common/src/main/resources/ozone-default.xml

@@ -59,6 +59,21 @@
     <description>Timeout for StateMachine data writes by Ratis.
     </description>
   </property>
+  <property>
+    <name>dfs.container.ratis.statemachinedata.sync.retries</name>
+    <value>-1</value>
+    <tag>OZONE, DEBUG, CONTAINER, RATIS</tag>
+    <description>Number of times the WriteStateMachineData op will be tried
+      before failing. If this value is -1, the op is retried indefinitely.
+    </description>
+  </property>
+  <property>
+    <name>dfs.container.ratis.log.queue.size</name>
+    <value>128</value>
+    <tag>OZONE, DEBUG, CONTAINER, RATIS</tag>
+    <description>Number of operations pending with Raft's Log Worker.
+    </description>
+  </property>
   <property>
     <name>dfs.container.ratis.datanode.storage.dir</name>
     <value/>

+ 21 - 27
hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java

@@ -22,7 +22,7 @@ import com.google.common.base.Preconditions;
 import org.apache.hadoop.hdds.HddsUtils;
 import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos;
 import org.apache.hadoop.ozone.container.common.helpers.BlockData;
-import org.apache.ratis.proto.RaftProtos.StateMachineEntryProto;
+import org.apache.ratis.proto.RaftProtos.RaftPeerRole;
 import org.apache.ratis.protocol.RaftGroup;
 import org.apache.ratis.protocol.RaftGroupId;
 import org.apache.ratis.server.RaftServer;
@@ -55,7 +55,6 @@ import org.apache.ratis.statemachine.StateMachineStorage;
 import org.apache.ratis.statemachine.TransactionContext;
 import org.apache.ratis.statemachine.impl.BaseStateMachine;
 import org.apache.ratis.statemachine.impl.SimpleStateMachineStorage;
-import org.apache.ratis.statemachine.impl.TransactionContextImpl;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -206,7 +205,6 @@ public class ContainerStateMachine extends BaseStateMachine {
     final ContainerCommandRequestProto proto =
         getRequestProto(request.getMessage().getContent());
     Preconditions.checkArgument(request.getRaftGroupId().equals(gid));
-    final StateMachineLogEntryProto log;
     if (proto.getCmdType() == Type.WriteChunk) {
       final WriteChunkRequestProto write = proto.getWriteChunk();
       // create the state machine data proto
@@ -236,33 +234,29 @@ public class ContainerStateMachine extends BaseStateMachine {
               .setWriteChunk(commitWriteChunkProto)
               .build();
 
-      log = createSMLogEntryProto(request,
-          commitContainerCommandProto.toByteString(),
-          dataContainerCommandProto.toByteString());
+      return TransactionContext.newBuilder()
+          .setClientRequest(request)
+          .setStateMachine(this)
+          .setServerRole(RaftPeerRole.LEADER)
+          .setStateMachineData(dataContainerCommandProto.toByteString())
+          .setLogData(commitContainerCommandProto.toByteString())
+          .build();
     } else if (proto.getCmdType() == Type.CreateContainer) {
-      log = createSMLogEntryProto(request,
-          request.getMessage().getContent(), request.getMessage().getContent());
+      return TransactionContext.newBuilder()
+          .setClientRequest(request)
+          .setStateMachine(this)
+          .setServerRole(RaftPeerRole.LEADER)
+          .setStateMachineData(request.getMessage().getContent())
+          .setLogData(request.getMessage().getContent())
+          .build();
     } else {
-      log = createSMLogEntryProto(request, request.getMessage().getContent(),
-          null);
+      return TransactionContext.newBuilder()
+          .setClientRequest(request)
+          .setStateMachine(this)
+          .setServerRole(RaftPeerRole.LEADER)
+          .setLogData(request.getMessage().getContent())
+          .build();
     }
-    return new TransactionContextImpl(this, request, log);
-  }
-
-  private StateMachineLogEntryProto createSMLogEntryProto(RaftClientRequest r,
-      ByteString logData, ByteString smData) {
-    StateMachineLogEntryProto.Builder builder =
-        StateMachineLogEntryProto.newBuilder();
-
-    builder.setCallId(r.getCallId())
-        .setClientId(r.getClientId().toByteString())
-        .setLogData(logData);
-
-    if (smData != null) {
-      builder.setStateMachineEntry(StateMachineEntryProto.newBuilder()
-          .setStateMachineData(smData).build());
-    }
-    return builder.build();
   }
 
   private ByteString getStateMachineData(StateMachineLogEntryProto entryProto) {

+ 11 - 0
hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/XceiverServerRatis.java

@@ -287,6 +287,17 @@ public final class XceiverServerRatis implements XceiverServerSpi {
       setAutoTriggerEnabled(properties, true);
     RaftServerConfigKeys.Snapshot.
       setAutoTriggerThreshold(properties, snapshotThreshold);
+    int logQueueSize =
+        conf.getInt(OzoneConfigKeys.DFS_CONTAINER_RATIS_LOG_QUEUE_SIZE,
+            OzoneConfigKeys.DFS_CONTAINER_RATIS_LOG_QUEUE_SIZE_DEFAULT);
+    RaftServerConfigKeys.Log.setQueueSize(properties, logQueueSize);
+
+    int numSyncRetries = conf.getInt(
+        OzoneConfigKeys.DFS_CONTAINER_RATIS_STATEMACHINEDATA_SYNC_RETRIES,
+        OzoneConfigKeys.
+            DFS_CONTAINER_RATIS_STATEMACHINEDATA_SYNC_RETRIES_DEFAULT);
+    RaftServerConfigKeys.Log.StateMachineData.setSyncTimeoutRetry(properties,
+        numSyncRetries);
 
     return properties;
   }

+ 1 - 1
hadoop-project/pom.xml

@@ -101,7 +101,7 @@
     <ldap-api.version>1.0.0-M33</ldap-api.version>
 
     <!-- Apache Ratis version -->
-    <ratis.version>0.3.0-1d2ebee-SNAPSHOT</ratis.version>
+    <ratis.version>0.3.0-1d07b18-SNAPSHOT</ratis.version>
     <jcache.version>1.0-alpha-1</jcache.version>
     <ehcache.version>3.3.1</ehcache.version>
     <hikari.version>2.4.12</hikari.version>