Forráskód Böngészése

ZOOKEEPER-3056: Fails to load database with missing snapshot file but with valid transaction log file.

ZOOKEEPER-2325 introduced a check on snapshot and transaction log files during recovery, which will treat empty or missing snapshot files with the presence of transaction log files illegal state, and abort start up process.

For old versions of ZooKeeper, it's possible to have valid transaction log files but no snapshot. For example, if snap count set too low, or server crashes before the first snapshot was taken. For new versions of ZooKeeper with ZOOKEEPER-2325, this is not a problem as ZooKeeper will make sure the presence of at least one snapshot file (e.g. on a fresh start with empty local data, ZooKeeper will create one snapshot file as part of the start up.).

So we need provide a smooth upgrade path for users with old version of ZooKeeper. This patch introduces a system property flag, once set, will skip the empty snapshot validation during start up. The default value of this flag is false, as we don't want to disable the features introduced in ZOOKEEPER-2325. We also want the flag to be explicitly set by admin, as ZooKeeper itself is not able to tell when to trust missing snapshot or not.

Author: Michael Han <lhan@twitter.com>

Reviewers: Enrico Olivelli <eolivelli@apache.org>, Brian Nixon <nixon@fb.com>

Closes #1069 from hanm/ZOOKEEPER-3056
Michael Han 5 éve
szülő
commit
68c21988d5

+ 14 - 0
zookeeper-docs/src/main/resources/markdown/zookeeperAdmin.md

@@ -915,6 +915,20 @@ property, when available, is noted below.
     ZooKeeper when loading database from disk, and syncing with leader.
     By default, this feautre is disabled, set "true" to enable it.
 
+* *snapshot.trust.empty* :
+    (Java system property only: **zookeeper.snapshot.trust.empty**)
+    **New in 3.5.6:**
+    This property controls whether or not ZooKeeper should treat missing
+    snapshot files as a fatal state that can't be recovered from.
+    Set to true to allow ZooKeeper servers recover without snapshot
+    files. This should only be set during upgrading from old versions of
+    ZooKeeper (3.4.x, pre 3.5.3) where ZooKeeper might only have transaction
+    log files but without presence of snapshot files. If the value is set
+    during upgrade, we recommend to set the value back to false after upgrading
+    and restart ZooKeeper process so ZooKeeper can continue normal data
+    consistency check during recovery process.
+    Default value is false.
+
 <a name="sc_clusterOptions"></a>
 
 #### Cluster Options

+ 16 - 2
zookeeper-server/src/main/java/org/apache/zookeeper/server/persistence/FileTxnSnapLog.java

@@ -59,6 +59,7 @@ public class FileTxnSnapLog {
     TxnLog txnLog;
     SnapShot snapLog;
     private final boolean autoCreateDB;
+    private final boolean trustEmptySnapshot;
     public static final int VERSION = 2;
     public static final String version = "version-";
 
@@ -72,6 +73,10 @@ public class FileTxnSnapLog {
 
     private static final String ZOOKEEPER_DB_AUTOCREATE_DEFAULT = "true";
 
+    public static final String ZOOKEEPER_SNAPSHOT_TRUST_EMPTY = "zookeeper.snapshot.trust.empty";
+
+    private static final String EMPTY_SNAPSHOT_WARNING = "No snapshot found, but there are log entries. ";
+
     /**
      * This listener helps
      * the external apis calling
@@ -102,6 +107,9 @@ public class FileTxnSnapLog {
         boolean enableAutocreate = Boolean.parseBoolean(
             System.getProperty(ZOOKEEPER_DATADIR_AUTOCREATE, ZOOKEEPER_DATADIR_AUTOCREATE_DEFAULT));
 
+        trustEmptySnapshot = Boolean.getBoolean(ZOOKEEPER_SNAPSHOT_TRUST_EMPTY);
+        LOG.info(ZOOKEEPER_SNAPSHOT_TRUST_EMPTY + " : " + trustEmptySnapshot);
+
         if (!this.dataDir.exists()) {
             if (!enableAutocreate) {
                 throw new DatadirException(String.format(
@@ -232,11 +240,18 @@ public class FileTxnSnapLog {
         } else {
             trustEmptyDB = autoCreateDB;
         }
+
         if (-1L == deserializeResult) {
             /* this means that we couldn't find any snapshot, so we need to
              * initialize an empty database (reported in ZOOKEEPER-2325) */
             if (txnLog.getLastLoggedZxid() != -1) {
-                throw new IOException("No snapshot found, but there are log entries. " + "Something is broken!");
+                // ZOOKEEPER-3056: provides an escape hatch for users upgrading
+                // from old versions of zookeeper (3.4.x, pre 3.5.3).
+                if (!trustEmptySnapshot) {
+                    throw new IOException(EMPTY_SNAPSHOT_WARNING + "Something is broken!");
+                } else {
+                    LOG.warn(EMPTY_SNAPSHOT_WARNING + "This should only be allowed during upgrading.");
+                }
             }
 
             if (trustEmptyDB) {
@@ -600,5 +615,4 @@ public class FileTxnSnapLog {
     public long getTotalLogSize() {
         return txnLog.getTotalLogSize();
     }
-
 }

+ 23 - 5
zookeeper-server/src/test/java/org/apache/zookeeper/test/EmptiedSnapshotRecoveryTest.java

@@ -39,7 +39,7 @@ import org.apache.zookeeper.server.persistence.FileTxnSnapLog;
 import org.junit.Test;
 
 /** If snapshots are corrupted to the empty file or deleted, Zookeeper should
- *  not proceed to read its transactiong log files
+ *  not proceed to read its transaction log files
  *  Test that zxid == -1 in the presence of emptied/deleted snapshots
  */
 public class EmptiedSnapshotRecoveryTest extends ZKTestCase implements Watcher {
@@ -50,7 +50,7 @@ public class EmptiedSnapshotRecoveryTest extends ZKTestCase implements Watcher {
     private static final int N_TRANSACTIONS = 150;
     private static final int SNAP_COUNT = 100;
 
-    public void runTest(boolean leaveEmptyFile) throws Exception {
+    public void runTest(boolean leaveEmptyFile, boolean trustEmptySnap) throws Exception {
         File tmpSnapDir = ClientBase.createTmpDir();
         File tmpLogDir = ClientBase.createTmpDir();
         ClientBase.setupTestEnv();
@@ -92,15 +92,28 @@ public class EmptiedSnapshotRecoveryTest extends ZKTestCase implements Watcher {
             }
         }
 
+        if (trustEmptySnap) {
+            System.setProperty(FileTxnSnapLog.ZOOKEEPER_SNAPSHOT_TRUST_EMPTY, "true");
+        }
         // start server again with corrupted database
         zks = new ZooKeeperServer(tmpSnapDir, tmpLogDir, 3000);
         try {
             zks.startdata();
             zxid = zks.getZKDatabase().loadDataBase();
-            fail("Should have gotten exception for corrupted database");
+            if (!trustEmptySnap) {
+                fail("Should have gotten exception for corrupted database");
+            }
         } catch (IOException e) {
             // expected behavior
+            if (trustEmptySnap) {
+                fail("Should not get exception for empty database");
+            }
+        } finally {
+            if (trustEmptySnap) {
+                System.clearProperty(FileTxnSnapLog.ZOOKEEPER_SNAPSHOT_TRUST_EMPTY);
+            }
         }
+
         zks.shutdown();
     }
 
@@ -110,7 +123,7 @@ public class EmptiedSnapshotRecoveryTest extends ZKTestCase implements Watcher {
      */
     @Test
     public void testRestoreWithEmptySnapFiles() throws Exception {
-        runTest(true);
+        runTest(true, false);
     }
 
     /**
@@ -119,7 +132,12 @@ public class EmptiedSnapshotRecoveryTest extends ZKTestCase implements Watcher {
      */
     @Test
     public void testRestoreWithNoSnapFiles() throws Exception {
-        runTest(false);
+        runTest(false, false);
+    }
+
+    @Test
+    public void testRestoreWithTrustedEmptySnapFiles() throws Exception {
+        runTest(false, true);
     }
 
     public void process(WatchedEvent event) {