Browse Source

ZOOKEEPER-1333. NPE in FileTxnSnapLog when restarting a cluster. (Patrick Hunt via mahadev)

git-svn-id: https://svn.apache.org/repos/asf/zookeeper/trunk@1221868 13f79535-47bb-0310-9956-ffa450edef68
Mahadev Konar 13 years ago
parent
commit
c85f61aee2

+ 3 - 0
CHANGES.txt

@@ -81,6 +81,9 @@ BUGFIXES:
 
   ZOOKEEPER-1323. c client doesn't compile on freebsd
   (michi mutsuzaki via phunt)
+
+  ZOOKEEPER-1333. NPE in FileTxnSnapLog when restarting a cluster.
+  (Patrick Hunt via mahadev)
  
 IMPROVEMENTS:
 

+ 11 - 9
src/java/main/org/apache/zookeeper/server/DataTree.java

@@ -20,6 +20,7 @@ package org.apache.zookeeper.server;
 
 import java.io.IOException;
 import java.io.PrintWriter;
+import java.nio.ByteBuffer;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
@@ -30,21 +31,17 @@ import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.ConcurrentHashMap;
 
-import java.nio.ByteBuffer;
-
 import org.apache.jute.Index;
 import org.apache.jute.InputArchive;
 import org.apache.jute.OutputArchive;
 import org.apache.jute.Record;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 import org.apache.zookeeper.KeeperException;
+import org.apache.zookeeper.KeeperException.Code;
+import org.apache.zookeeper.KeeperException.NoNodeException;
 import org.apache.zookeeper.Quotas;
 import org.apache.zookeeper.StatsTrack;
 import org.apache.zookeeper.WatchedEvent;
 import org.apache.zookeeper.Watcher;
-import org.apache.zookeeper.KeeperException.Code;
-import org.apache.zookeeper.KeeperException.NoNodeException;
 import org.apache.zookeeper.Watcher.Event;
 import org.apache.zookeeper.Watcher.Event.EventType;
 import org.apache.zookeeper.Watcher.Event.KeeperState;
@@ -54,15 +51,17 @@ import org.apache.zookeeper.common.PathTrie;
 import org.apache.zookeeper.data.ACL;
 import org.apache.zookeeper.data.Stat;
 import org.apache.zookeeper.data.StatPersisted;
+import org.apache.zookeeper.txn.CheckVersionTxn;
 import org.apache.zookeeper.txn.CreateTxn;
 import org.apache.zookeeper.txn.DeleteTxn;
 import org.apache.zookeeper.txn.ErrorTxn;
+import org.apache.zookeeper.txn.MultiTxn;
 import org.apache.zookeeper.txn.SetACLTxn;
 import org.apache.zookeeper.txn.SetDataTxn;
-import org.apache.zookeeper.txn.CheckVersionTxn;
 import org.apache.zookeeper.txn.Txn;
-import org.apache.zookeeper.txn.MultiTxn;
 import org.apache.zookeeper.txn.TxnHeader;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 /**
  * This class maintains the tree data structure. It doesn't have any networking
@@ -757,6 +756,7 @@ public class DataTree {
                     debug = "Set data transaction for "
                             + setDataTxn.getPath()
                             + " to new value=" + Arrays.toString(setDataTxn.getData());
+                    rc.path = setDataTxn.getPath();
                     rc.stat = setData(setDataTxn.getPath(), setDataTxn
                             .getData(), setDataTxn.getVersion(), header
                             .getZxid(), header.getTime());
@@ -765,6 +765,7 @@ public class DataTree {
                     SetACLTxn setACLTxn = (SetACLTxn) txn;
                     debug = "Set ACL transaction for "
                             + setACLTxn.getPath();
+                    rc.path = setACLTxn.getPath();
                     rc.stat = setACL(setACLTxn.getPath(), setACLTxn.getAcl(),
                             setACLTxn.getVersion());
                     break;
@@ -870,7 +871,8 @@ public class DataTree {
         if (rc.zxid > lastProcessedZxid) {
             lastProcessedZxid = rc.zxid;
         }
-        /**
+
+        /*
          * Snapshots are taken lazily. It can happen that the child
          * znodes of a parent are created after the parent
          * is serialized. Therefore, while replaying logs during restore, a

+ 12 - 11
src/java/main/org/apache/zookeeper/server/persistence/FileTxnSnapLog.java

@@ -34,7 +34,6 @@ import org.apache.zookeeper.server.Request;
 import org.apache.zookeeper.server.ZooTrace;
 import org.apache.zookeeper.server.persistence.TxnLog.TxnIterator;
 import org.apache.zookeeper.txn.CreateSessionTxn;
-import org.apache.zookeeper.txn.CreateTxn;
 import org.apache.zookeeper.txn.TxnHeader;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -150,7 +149,7 @@ public class FileTxnSnapLog {
                 processTransaction(hdr,dt,sessions, itr.getTxn());
             } catch(KeeperException.NoNodeException e) {
                throw new IOException("Failed to process transaction type: " +
-                     hdr.getType() + " error: " + e.getMessage());
+                     hdr.getType() + " error: " + e.getMessage(), e);
             }
             listener.onTxnLoaded(hdr, itr.getTxn());
             if (!itr.next())
@@ -197,20 +196,22 @@ public class FileTxnSnapLog {
             rc = dt.processTxn(hdr, txn);
         }
 
-              
-        if(rc.err !=  Code.OK.intValue()) {          
-            if(rc.err == Code.NONODE.intValue()) {
+        /**
+         * This should never happen. A NONODE can never show up in the 
+         * transaction logs. This is more indicative of a corrupt transaction
+         * log. Refer ZOOKEEPER-1333 for more info.
+         */
+        if (rc.err != Code.OK.intValue()) {          
+            if (hdr.getType() == OpCode.create && rc.err == Code.NONODE.intValue()) {
                 int lastSlash = rc.path.lastIndexOf('/');
                 String parentName = rc.path.substring(0, lastSlash);
-                LOG.error("Failed to set parent cversion for: " +
-                        parentName);
-                  throw new KeeperException.NoNodeException(parentName);
-            }
-            else {
+                LOG.error("Parent {} missing for {}", parentName, rc.path);
+                throw new KeeperException.NoNodeException(parentName);
+            } else {
                 LOG.debug("Ignoring processTxn failure hdr: " + hdr.getType() +
                         " : error: " + rc.err);
             }
-        }      
+        }
     }
 
     /**

+ 86 - 20
src/java/test/org/apache/zookeeper/test/LoadFromLogTest.java

@@ -20,42 +20,41 @@ package org.apache.zookeeper.test;
 
 import java.io.ByteArrayOutputStream;
 import java.io.File;
+import java.io.FileInputStream;
+import java.nio.ByteBuffer;
 import java.util.ArrayList;
 import java.util.List;
 
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
+import org.apache.jute.BinaryInputArchive;
+import org.apache.jute.BinaryOutputArchive;
+import org.apache.jute.Record;
 import org.apache.zookeeper.CreateMode;
-import org.apache.zookeeper.MultiTransactionRecord;
+import org.apache.zookeeper.KeeperException.NoNodeException;
 import org.apache.zookeeper.PortAssignment;
 import org.apache.zookeeper.WatchedEvent;
 import org.apache.zookeeper.Watcher;
 import org.apache.zookeeper.ZKTestCase;
-import org.apache.zookeeper.ZooKeeper;
 import org.apache.zookeeper.ZooDefs.Ids;
+import org.apache.zookeeper.ZooDefs.OpCode;
+import org.apache.zookeeper.ZooKeeper;
+import org.apache.zookeeper.server.DataNode;
+import org.apache.zookeeper.server.DataTree;
 import org.apache.zookeeper.server.ServerCnxnFactory;
 import org.apache.zookeeper.server.SyncRequestProcessor;
 import org.apache.zookeeper.server.ZooKeeperServer;
-import org.apache.zookeeper.server.persistence.FileTxnSnapLog;
+import org.apache.zookeeper.server.persistence.FileHeader;
 import org.apache.zookeeper.server.persistence.FileTxnLog;
+import org.apache.zookeeper.server.persistence.FileTxnSnapLog;
 import org.apache.zookeeper.server.persistence.TxnLog.TxnIterator;
-import org.apache.zookeeper.txn.TxnHeader;
-import org.junit.Assert;
-import org.junit.Test;
-import org.apache.zookeeper.server.DataTree;
-import org.apache.zookeeper.server.DataNode;
 import org.apache.zookeeper.txn.CreateTxn;
 import org.apache.zookeeper.txn.DeleteTxn;
 import org.apache.zookeeper.txn.MultiTxn;
 import org.apache.zookeeper.txn.Txn;
-import org.apache.zookeeper.ZooDefs.OpCode;
-import org.apache.jute.BinaryOutputArchive;
-import org.apache.jute.Record;
-import java.io.FileInputStream;
-import java.nio.ByteBuffer;
-
-import org.apache.jute.BinaryInputArchive;
-import org.apache.zookeeper.server.persistence.FileHeader;
+import org.apache.zookeeper.txn.TxnHeader;
+import org.junit.Assert;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 public class LoadFromLogTest extends ZKTestCase implements  Watcher {
     private static String HOSTPORT = "127.0.0.1:" + PortAssignment.unique();
@@ -100,8 +99,8 @@ public class LoadFromLogTest extends ZKTestCase implements  Watcher {
                 ClientBase.waitForServerDown(HOSTPORT, CONNECTION_TIMEOUT));
 
         // now verify that the FileTxnLog reads every transaction only once
-	File logDir = new File(tmpDir, FileTxnSnapLog.version + FileTxnSnapLog.VERSION);
-	FileTxnLog txnLog = new FileTxnLog(logDir);
+        File logDir = new File(tmpDir, FileTxnSnapLog.version + FileTxnSnapLog.VERSION);
+        FileTxnLog txnLog = new FileTxnLog(logDir);
 
         TxnIterator itr = txnLog.read(0);
         long expectedZxid = 0;
@@ -368,4 +367,71 @@ public class LoadFromLogTest extends ZKTestCase implements  Watcher {
 				(children.length == NUM_MESSAGES));
 		f.shutdown();
 	}
+    
+    /**
+     * Test we can restore a snapshot that has errors and data ahead of the zxid
+     * of the snapshot file. 
+     */
+    @Test
+    public void testRestoreWithTransactionErrors() throws Exception {
+        // setup a single server cluster
+        File tmpDir = ClientBase.createTmpDir();
+        ClientBase.setupTestEnv();
+        ZooKeeperServer zks = new ZooKeeperServer(tmpDir, tmpDir, 3000);
+        SyncRequestProcessor.setSnapCount(10000);
+        final int PORT = Integer.parseInt(HOSTPORT.split(":")[1]);
+        ServerCnxnFactory f = ServerCnxnFactory.createFactory(PORT, -1);
+        f.startup(zks);
+        Assert.assertTrue("waiting for server being up ", ClientBase
+                .waitForServerUp(HOSTPORT, CONNECTION_TIMEOUT));
+        ZooKeeper zk = new ZooKeeper(HOSTPORT, CONNECTION_TIMEOUT, this);
+
+        long start = System.currentTimeMillis();
+        while (!connected) {
+            long end = System.currentTimeMillis();
+            if (end - start > 5000) {
+                Assert.assertTrue("Could not connect with server in 5 seconds",
+                        false);
+            }
+            try {
+                Thread.sleep(200);
+            } catch (Exception e) {
+                LOG.warn("Intrrupted");
+            }
+
+        }
+        // generate some transactions
+        try {
+            for (int i = 0; i < NUM_MESSAGES; i++) {
+                try {
+                    zk.create("/invaliddir/test-", new byte[0],
+                            Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT_SEQUENTIAL);
+                } catch(NoNodeException e) {
+                    //Expected
+                }
+            }
+        } finally {
+            zk.close();
+        }
+
+        // force the zxid to be behind the content
+        zks.getZKDatabase().setlastProcessedZxid(
+                zks.getZKDatabase().getDataTreeLastProcessedZxid() - 10);
+        LOG.info("Set lastProcessedZxid to "
+                + zks.getZKDatabase().getDataTreeLastProcessedZxid());
+        
+        // Force snapshot and restore
+        zks.takeSnapshot();
+        zks.shutdown();
+        f.shutdown();
+
+        zks = new ZooKeeperServer(tmpDir, tmpDir, 3000);
+        SyncRequestProcessor.setSnapCount(10000);
+        f = ServerCnxnFactory.createFactory(PORT, -1);
+        f.startup(zks);
+        Assert.assertTrue("waiting for server being up ", ClientBase
+                .waitForServerUp(HOSTPORT, CONNECTION_TIMEOUT));
+        
+        f.shutdown();
+    }
 }