Prechádzať zdrojové kódy

ZOOKEEPER-1181. Fix problems with Kerberos TGT renewal. (Eugene Koontz via mahadev)

git-svn-id: https://svn.apache.org/repos/asf/zookeeper/trunk@1188033 13f79535-47bb-0310-9956-ffa450edef68
Mahadev Konar pred 13 rokmi
rodič
commit
a89d9b66ce
2 zmenené súbory, 245 pridaní a 160 odobraní
  1. 3 0
      CHANGES.txt
  2. 242 160
      src/java/main/org/apache/zookeeper/Login.java

+ 3 - 0
CHANGES.txt

@@ -395,6 +395,9 @@ BUGFIXES:
   ZOOKEEPER-1185. Send AuthFailed event to client if SASL authentication fails.
   (Eugene Kuntz via mahadev)
 
+  ZOOKEEPER-1181. Fix problems with Kerberos TGT renewal.
+  (Eugene Koontz via mahadev)
+
 IMPROVEMENTS:
   ZOOKEEPER-724. Improve junit test integration - log harness information 
   (phunt via mahadev)

+ 242 - 160
src/java/main/org/apache/zookeeper/Login.java

@@ -36,8 +36,8 @@ import org.apache.log4j.Logger;
 
 import javax.security.auth.kerberos.KerberosTicket;
 import javax.security.auth.Subject;
-import java.io.IOException;
 import java.util.Date;
+import java.util.Random;
 import java.util.Set;
 
 public class Login {
@@ -49,6 +49,11 @@ public class Login {
     // and try to renew the ticket.
     private static final float TICKET_RENEW_WINDOW = 0.80f;
 
+    /**
+     * Percentage of random jitter added to the renewal time
+     */
+    private static final float TICKET_RENEW_JITTER = 0.05f;
+
     // Regardless of TICKET_RENEW_WINDOW setting above and the ticket expiry time,
     // thread will not sleep between refresh attempts any less than 1 minute (60*1000 milliseconds = 1 minute).
     // Change the '1' to e.g. 5, to change this to 5 minutes.
@@ -58,6 +63,18 @@ public class Login {
     private Thread t = null;
     private boolean isKrbTicket = false;
     private boolean isUsingTicketCache = false;
+    private boolean isUsingKeytab = false;
+
+    /** Random number generator */
+    private static Random rng = new Random();
+
+    private LoginContext login = null;
+    private String loginContextName = null;
+    private String keytabFile = null;
+    private String principal = null;
+
+    private long lastLogin = 0;
+
     /**
      * LoginThread constructor. The constructor starts the thread used
      * to periodically re-login to the Kerberos Ticket Granting Server.
@@ -71,129 +88,170 @@ public class Login {
      *               Thrown if authentication fails.
      */
     public Login(final String loginContextName, CallbackHandler callbackHandler)
-      throws LoginException {
+            throws LoginException {
         this.callbackHandler = callbackHandler;
-        final LoginContext loginContext = login(loginContextName);
-        subject = loginContext.getSubject();
+        login = login(loginContextName);
+        this.loginContextName = loginContextName;
+        subject = login.getSubject();
         isKrbTicket = !subject.getPrivateCredentials(KerberosTicket.class).isEmpty();
         AppConfigurationEntry entries[] = Configuration.getConfiguration().getAppConfigurationEntry(loginContextName);
         for (AppConfigurationEntry entry: entries) {
+            // there will only be a single entry, so this for() loop will only be iterated through once.
             if (entry.getOptions().get("useTicketCache") != null) {
                 String val = (String)entry.getOptions().get("useTicketCache");
                 if (val.equals("true")) {
                     isUsingTicketCache = true;
                 }
-                break;
             }
+            if (entry.getOptions().get("keyTab") != null) {
+                keytabFile = (String)entry.getOptions().get("keyTab");
+                isUsingKeytab = true;
+            }
+            if (entry.getOptions().get("principal") != null) {
+                principal = (String)entry.getOptions().get("principal");
+            }
+            break;
         }
-        if (isKrbTicket && isUsingTicketCache) {
-            // Refresh the Ticket Granting Ticket (TGT) cache periodically. How often to refresh is determined by the
-            // TGT's existing expiry date and the configured MIN_TIME_BEFORE_RELOGIN. For testing and development,
-            // you can decrease the interval of expiration of tickets (for example, to 3 minutes) by running :
-            //  "modprinc -maxlife 3mins <principal>" in kadmin.
-            t = new Thread(new Runnable() {
-                public void run() {
-                    LOG.info("TGT refresh thread started.");
-                    while (true) {  // renewal thread's main loop. if it exits from here, thread will exit.
-                        KerberosTicket tgt = getTGT();
-                        long now = System.currentTimeMillis();
-                        long nextRefresh;
-                        Date nextRefreshDate;
-                        if (tgt == null) {
-                            nextRefresh = now + MIN_TIME_BEFORE_RELOGIN;
-                            nextRefreshDate = new Date(nextRefresh);
-                            LOG.warn("No TGT found: will try again at " + nextRefreshDate);
-                        }
-                        else {
-                            // determine how long to sleep from looking at ticket's expiry.
-                            // We must not allow the ticket to expire, but we should take into consideration
-                            // MIN_TIME_BEFORE_RELOGIN. Will not sleep less than MIN_TIME_BEFORE_RELOGIN, except when
-                            // unless it would cause ticket expiration.
-                            nextRefresh = getRefreshTime(tgt);
-                            long expiry = tgt.getEndTime().getTime();
 
-                            if ((nextRefresh > expiry) ||
-                              ((now + MIN_TIME_BEFORE_RELOGIN) > expiry)) {
-                                // expiry is before next scheduled refresh).
-                                LOG.info("refreshing now because expiry is before next scheduled refresh time.");
-                                nextRefresh = now;
-                            }
-                            else {
-                                if (nextRefresh < (now + MIN_TIME_BEFORE_RELOGIN)) {
-                                    // next scheduled refresh is sooner than (now + MIN_TIME_BEFORE_LOGIN).
-                                    Date until = new Date(nextRefresh);
-                                    Date newuntil = new Date(now + MIN_TIME_BEFORE_RELOGIN);
-                                    LOG.warn("TGT refresh thread time adjusted from : " + until + " to : " + newuntil + " since "
-                                      + "the former is sooner than the minimum refresh interval ("
-                                      + MIN_TIME_BEFORE_RELOGIN / 1000 + " seconds) from now.");
-                                }
-                                nextRefresh = Math.max(nextRefresh, now + MIN_TIME_BEFORE_RELOGIN);
-                            }
-                            nextRefreshDate = new Date(nextRefresh);
-                            if (nextRefresh > expiry) {
-                                Date expiryDate = new Date(expiry);
-                                LOG.error("next refresh: " + nextRefreshDate + " is later than expiry " + expiryDate
-                                  + ". This may indicated a clock skew problem. Check that this host and the KDC's "
-                                  + "hosts' clocks are in sync.");
-                                return;
-                            }
-                        }
+        if (!isKrbTicket) {
+            // if no TGT, do not bother with ticket management.
+            return;
+        }
 
-                        if (now < nextRefresh) {
-                            Date until = new Date(nextRefresh);
-                            LOG.info("TGT refresh thread sleeping until: " + until.toString());
-                            try {
-                                Thread.sleep(nextRefresh - now);
-                            }
-                            catch (InterruptedException ie) {
-                                LOG.warn("TGT renewal thread has been interrupted and will exit.");
-                                break;
+        // Refresh the Ticket Granting Ticket (TGT) periodically. How often to refresh is determined by the
+        // TGT's existing expiry date and the configured MIN_TIME_BEFORE_RELOGIN. For testing and development,
+        // you can decrease the interval of expiration of tickets (for example, to 3 minutes) by running :
+        //  "modprinc -maxlife 3mins <principal>" in kadmin.
+        t = new Thread(new Runnable() {
+            public void run() {
+                LOG.info("TGT refresh thread started.");
+                while (true) {  // renewal thread's main loop. if it exits from here, thread will exit.
+                    KerberosTicket tgt = getTGT();
+                    long now = System.currentTimeMillis();
+                    long nextRefresh;
+                    Date nextRefreshDate;
+                    if (tgt == null) {
+                        nextRefresh = now + MIN_TIME_BEFORE_RELOGIN;
+                        nextRefreshDate = new Date(nextRefresh);
+                        LOG.warn("No TGT found: will try again at " + nextRefreshDate);
+                    } else {
+                        nextRefresh = getRefreshTime(tgt);
+                        long expiry = tgt.getEndTime().getTime();
+                        Date expiryDate = new Date(expiry);
+                        if ((isUsingTicketCache) && (tgt.getEndTime().equals(tgt.getRenewTill()))) {
+                            LOG.error("The TGT cannot be renewed beyond the next expiry date: " + expiryDate + "." +
+                                    "This process will not be able to authenticate new SASL connections after that " +
+                                    "time (for example, it will not be authenticate a new connection with a Zookeeper " +
+                                    "Quorum member).  Ask your system administrator to either increase the " +
+                                    "'renew until' time by doing : 'modprinc -maxrenewlife " + principal + "' within " +
+                                    "kadmin, or instead, to generate a keytab for " + principal + ". Because the TGT's " +
+                                    "expiry cannot be further extended by refreshing, exiting refresh thread now.");
+                            return;
+                        }
+                        // determine how long to sleep from looking at ticket's expiry.
+                        // We should not allow the ticket to expire, but we should take into consideration
+                        // MIN_TIME_BEFORE_RELOGIN. Will not sleep less than MIN_TIME_BEFORE_RELOGIN, unless doing so
+                        // would cause ticket expiration.
+                        if ((nextRefresh > expiry) ||
+                                ((now + MIN_TIME_BEFORE_RELOGIN) > expiry)) {
+                            // expiry is before next scheduled refresh).
+                            LOG.info("refreshing now because expiry is before next scheduled refresh time.");
+                            nextRefresh = now;
+                        } else {
+                            if (nextRefresh < (now + MIN_TIME_BEFORE_RELOGIN)) {
+                                // next scheduled refresh is sooner than (now + MIN_TIME_BEFORE_LOGIN).
+                                Date until = new Date(nextRefresh);
+                                Date newuntil = new Date(now + MIN_TIME_BEFORE_RELOGIN);
+                                LOG.warn("TGT refresh thread time adjusted from : " + until + " to : " + newuntil + " since "
+                                        + "the former is sooner than the minimum refresh interval ("
+                                        + MIN_TIME_BEFORE_RELOGIN / 1000 + " seconds) from now.");
                             }
+                            nextRefresh = Math.max(nextRefresh, now + MIN_TIME_BEFORE_RELOGIN);
                         }
-                        else {
-                            LOG.error("nextRefresh:" + nextRefreshDate + " is in the past: exiting refresh thread. Check"
-                              + " clock sync between this host and KDC - (KDC's clock is likely ahead of this host)."
-                              + " Manual intervention will be required for this client to successfully authenticate.");
-                            // TODO: if we have a keytab, we can use that to re-initialize and avoid the need for
-                            // manual intervention.
+                        nextRefreshDate = new Date(nextRefresh);
+                        if (nextRefresh > expiry) {
+                            LOG.error("next refresh: " + nextRefreshDate + " is later than expiry " + expiryDate
+                                    + ". This may indicate a clock skew problem. Check that this host and the KDC's "
+                                    + "hosts' clocks are in sync. Exiting refresh thread.");
                             return;
                         }
-
+                    }
+                    if (now < nextRefresh) {
+                        Date until = new Date(nextRefresh);
+                        LOG.info("TGT refresh sleeping until: " + until.toString());
+                        try {
+                            Thread.sleep(nextRefresh - now);
+                        } catch (InterruptedException ie) {
+                            LOG.warn("TGT renewal thread has been interrupted and will exit.");
+                            break;
+                        }
+                    }
+                    else {
+                        LOG.error("nextRefresh:" + nextRefreshDate + " is in the past: exiting refresh thread. Check"
+                                + " clock sync between this host and KDC - (KDC's clock is likely ahead of this host)."
+                                + " Manual intervention will be required for this client to successfully authenticate."
+                                + " Exiting refresh thread.");
+                        return;
+                    }
+                    if (isUsingTicketCache) {
                         String cmd = "/usr/bin/kinit";
                         if (System.getProperty("zookeeper.kinit") != null) {
                             cmd = System.getProperty("zookeeper.kinit");
                         }
                         String kinitArgs = "-R";
-                        try {
-                            Shell.execCommand(cmd,kinitArgs);
-                        }
-                        catch (Shell.ExitCodeException e) {
-                            LOG.error("Could not renew TGT due to problem running shell command: '" + cmd
-                              + " " + kinitArgs + "'" + "; exception was:" + e + ". Will try shell command again at: "
-                              + nextRefreshDate);
-                        }
-                        catch (IOException e) {
-                            LOG.error("Could not renew TGT due to problem running shell command: '" + cmd
-                              + " " + kinitArgs + "'; exception was:" + e + ". Will try shell command again at: "
-                              + nextRefreshDate);
-                        }
-                        try {
-                            reloginFromTicketCache(loginContextName, loginContext);
-                            LOG.debug("renewed TGT successfully.");
+                        int retry = 1;
+                        while (retry >= 0) {
+                            try {
+                                LOG.debug("running ticket cache refresh command: " + cmd + " " + kinitArgs);
+                                Shell.execCommand(cmd, kinitArgs);
+                                break;
+                            } catch (Exception e) {
+                                if (retry > 0) {
+                                    --retry;
+                                    // sleep for 10 seconds
+                                    try {
+                                        Thread.sleep(10 * 1000);
+                                    } catch (InterruptedException ie) {
+                                        LOG.error("Interrupted while renewing TGT, exiting Login thread");
+                                        return;
+                                    }
+                                } else {
+                                    LOG.warn("Could not renew TGT due to problem running shell command: '" + cmd
+                                            + " " + kinitArgs + "'" + "; exception was:" + e + ". Exiting refresh thread.",e);
+                                    return;
+                                }
+                            }
                         }
-                        catch (LoginException e) {
-                            LOG.error("Could not renew TGT due to LoginException: " + e + "."
-                              + " Will try again at: "
-                              + nextRefreshDate);
+                    }
+                    try {
+                        int retry = 1;
+                        while (retry >= 0) {
+                            try {
+                                reLogin();
+                                break;
+                            } catch (LoginException le) {
+                                if (retry > 0) {
+                                    --retry;
+                                    // sleep for 10 seconds.
+                                    try {
+                                        Thread.sleep(10 * 1000);
+                                    } catch (InterruptedException e) {
+                                        LOG.error("Interrupted during login retry after LoginException:", le);
+                                        throw le;
+                                    }
+                                } else {
+                                    LOG.error("Could not refresh TGT for principal: " + principal + ".", le);
+                                }
+                            }
                         }
+                    } catch (LoginException le) {
+                        LOG.error("Failed to refresh TGT: refresh thread exiting now.",le);
+                        break;
                     }
                 }
-            });
-            t.setDaemon(true);
-        }
-        else {
-            LOG.error("Not using Ticket Granting Ticket cache: will not start a TGT renewal thread.");
-        }
+            }
+        });
+        t.setDaemon(true);
     }
 
     public void startThreadIfNeeded() {
@@ -203,11 +261,25 @@ public class Login {
         }
     }
 
+    public void shutdown() {
+        if ((t != null) && (t.isAlive())) {
+            t.interrupt();
+            try {
+                t.join();
+            } catch (InterruptedException e) {
+                LOG.warn("error while waiting for Login thread to shutdown: " + e);
+            }
+        }
+    }
+
+    public Subject getSubject() {
+        return subject;
+    }
 
     private synchronized LoginContext login(final String loginContextName) throws LoginException {
         if (loginContextName == null) {
             throw new LoginException("loginContext name (JAAS file section header) was null. " +
-              "Please check your java.security.login.auth.config setting.");
+                    "Please check your java.security.login.auth.config setting.");
         }
         LoginContext loginContext = new LoginContext(loginContextName,callbackHandler);
         loginContext.login();
@@ -215,17 +287,14 @@ public class Login {
         return loginContext;
     }
 
-    public Subject getSubject() {
-        return subject;
-    }
-
     // c.f. org.apache.hadoop.security.UserGroupInformation.
     private long getRefreshTime(KerberosTicket tgt) {
         long start = tgt.getStartTime().getTime();
         long expires = tgt.getEndTime().getTime();
-        LOG.info("TGT valid starting at: " + tgt.getStartTime().toString());
-        LOG.info("TGT expires: " + tgt.getEndTime().toString());
-        long proposedRefresh = start + (long) ((expires - start) * TICKET_RENEW_WINDOW);
+        LOG.info("TGT valid starting at:        " + tgt.getStartTime().toString());
+        LOG.info("TGT expires:                  " + tgt.getEndTime().toString());
+        long proposedRefresh = start + (long) ((expires - start) *
+                (TICKET_RENEW_WINDOW + (TICKET_RENEW_JITTER * rng.nextDouble())));
         if (proposedRefresh > expires) {
             // proposedRefresh is too far in the future: it's after ticket expires: simply return now.
             return System.currentTimeMillis();
@@ -247,67 +316,80 @@ public class Login {
         return null;
     }
 
-    // TODO : refactor this with login() to maximize code-sharing.
-    public synchronized void reloginFromTicketCache(final String loginContextName, LoginContext loginContext)
-        throws LoginException {
-        if (!(isKrbTicket && isUsingTicketCache)) {
+    private boolean hasSufficientTimeElapsed() {
+        long now = System.currentTimeMillis();
+        if (now - getLastLogin() < MIN_TIME_BEFORE_RELOGIN ) {
+            LOG.warn("Not attempting to re-login since the last re-login was " +
+                    "attempted less than " + (MIN_TIME_BEFORE_RELOGIN/1000) + " seconds"+
+                    " before.");
+            return false;
+        }
+        // register most recent relogin attempt
+        setLastLogin(now);
+        return true;
+    }
+
+    /**
+     * Returns login object
+     * @return login
+     */
+    private LoginContext getLogin() {
+        return login;
+    }
+
+    /**
+     * Set the login object
+     * @param login
+     */
+    private void setLogin(LoginContext login) {
+        this.login = login;
+    }
+
+    /**
+     * Set the last login time.
+     * @param time the number of milliseconds since the beginning of time
+     */
+    private void setLastLogin(long time) {
+        lastLogin = time;
+    }
+
+    /**
+     * Get the time of the last login.
+     * @return the number of milliseconds since the beginning of time.
+     */
+    private long getLastLogin() {
+        return lastLogin;
+    }
+
+    /**
+     * Re-login a principal. This method assumes that {@link #login(String)} has happened already.
+     * @throws javax.security.auth.login.LoginException on a failure
+     */
+    // c.f. HADOOP-6559
+    private synchronized void reLogin()
+            throws LoginException {
+        if (!isKrbTicket) {
             return;
         }
-        if (loginContext == null) {
+        LoginContext login = getLogin();
+        if (login  == null) {
             throw new LoginException("login must be done first");
         }
-        String principalName = getPrincipalName();
-        try {
-            LOG.info("Logging out " + principalName);
-            //clear up the Kerberos state. But the tokens are not cleared! As per
+        if (!hasSufficientTimeElapsed()) {
+            return;
+        }
+        LOG.info("Initiating logout for " + principal);
+        synchronized (Login.class) {
+            //clear up the kerberos state. But the tokens are not cleared! As per
             //the Java kerberos login module code, only the kerberos credentials
-            //are cleared.
-            loginContext.logout();
+            //are cleared
+            login.logout();
             //login and also update the subject field of this instance to
             //have the new credentials (pass it to the LoginContext constructor)
-            if (loginContextName == null) {
-                throw new LoginException("loginContext name (JAAS file section header) was null. " +
-                  "Please check your java.security.login.auth.config setting.");
-            }
-            if (subject == null) {
-                throw new LoginException("login subject was null.");
-            }
-            LOG.info("Logging in " + principalName);
-            loginContext.login();
-            if (principalName.equals("(no principal name)")) {
-                // try again to get the principal name, in case the ticket cache was manually refreshed.
-                principalName = getPrincipalName();
-            }
-            LOG.info("Login successful for " + principalName);
-        } catch (LoginException le) {
-            throw new LoginException("Login failure for " + principalName);
-        }
-    }
-
-    private String getPrincipalName() {
-        try {
-            return getSubject().getPrincipals(KerberosPrincipal.class).toArray()[0].toString();
-        }
-        catch (NullPointerException e) {
-            LOG.warn("could not display principal name because login was null or login's subject was null: returning '(no principal found)'.");
+            login = new LoginContext(loginContextName, getSubject());
+            LOG.info("Initiating re-login for " + principal);
+            login.login();
+            setLogin(login);
         }
-        catch (ArrayIndexOutOfBoundsException e) {
-            LOG.warn("could not display principal name because login's subject had no principals: returning '(no principal found)'.");
-        }
-        return "(no principal found)";
     }
-
-    public void shutdown() {
-        if ((t != null) && (t.isAlive())) {
-            t.interrupt();
-            try {
-                t.join();
-            }
-            catch (InterruptedException e) {
-                LOG.error("error while waiting for Login thread to shutdown: " + e);
-            }
-        }
-    }
-
 }
-