|
@@ -0,0 +1,1098 @@
|
|
|
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
|
|
|
+<html>
|
|
|
+<head>
|
|
|
+<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
|
|
|
+<meta content="Apache Forrest" name="Generator">
|
|
|
+<meta name="Forrest-version" content="0.9">
|
|
|
+<meta name="Forrest-skin-name" content="pelt">
|
|
|
+<title>ZooKeeper Dynamic Reconfiguration</title>
|
|
|
+<link type="text/css" href="skin/basic.css" rel="stylesheet">
|
|
|
+<link media="screen" type="text/css" href="skin/screen.css" rel="stylesheet">
|
|
|
+<link media="print" type="text/css" href="skin/print.css" rel="stylesheet">
|
|
|
+<link type="text/css" href="skin/profile.css" rel="stylesheet">
|
|
|
+<script src="skin/getBlank.js" language="javascript" type="text/javascript"></script><script src="skin/getMenu.js" language="javascript" type="text/javascript"></script><script src="skin/fontsize.js" language="javascript" type="text/javascript"></script>
|
|
|
+<link rel="shortcut icon" href="images/favicon.ico">
|
|
|
+</head>
|
|
|
+<body onload="init()">
|
|
|
+<script type="text/javascript">ndeSetTextSize();</script>
|
|
|
+<div id="top">
|
|
|
+<!--+
|
|
|
+ |breadtrail
|
|
|
+ +-->
|
|
|
+<div class="breadtrail">
|
|
|
+<a href="http://www.apache.org/">Apache</a> > <a href="http://zookeeper.apache.org/">ZooKeeper</a> > <a href="http://zookeeper.apache.org/">ZooKeeper</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
|
|
|
+</div>
|
|
|
+<!--+
|
|
|
+ |header
|
|
|
+ +-->
|
|
|
+<div class="header">
|
|
|
+<!--+
|
|
|
+ |start group logo
|
|
|
+ +-->
|
|
|
+<div class="grouplogo">
|
|
|
+<a href="http://hadoop.apache.org/"><img class="logoImage" alt="Hadoop" src="images/hadoop-logo.jpg" title="Apache Hadoop"></a>
|
|
|
+</div>
|
|
|
+<!--+
|
|
|
+ |end group logo
|
|
|
+ +-->
|
|
|
+<!--+
|
|
|
+ |start Project Logo
|
|
|
+ +-->
|
|
|
+<div class="projectlogo">
|
|
|
+<a href="http://zookeeper.apache.org/"><img class="logoImage" alt="ZooKeeper" src="images/zookeeper_small.gif" title="ZooKeeper: distributed coordination"></a>
|
|
|
+</div>
|
|
|
+<!--+
|
|
|
+ |end Project Logo
|
|
|
+ +-->
|
|
|
+<!--+
|
|
|
+ |start Search
|
|
|
+ +-->
|
|
|
+<div class="searchbox">
|
|
|
+<form action="http://www.google.com/search" method="get" class="roundtopsmall">
|
|
|
+<input value="zookeeper.apache.org" name="sitesearch" type="hidden"><input onFocus="getBlank (this, 'Search the site with google');" size="25" name="q" id="query" type="text" value="Search the site with google">
|
|
|
+ <input name="Search" value="Search" type="submit">
|
|
|
+</form>
|
|
|
+</div>
|
|
|
+<!--+
|
|
|
+ |end search
|
|
|
+ +-->
|
|
|
+<!--+
|
|
|
+ |start Tabs
|
|
|
+ +-->
|
|
|
+<ul id="tabs">
|
|
|
+<li>
|
|
|
+<a class="unselected" href="http://zookeeper.apache.org/">Project</a>
|
|
|
+</li>
|
|
|
+<li>
|
|
|
+<a class="unselected" href="https://cwiki.apache.org/confluence/display/ZOOKEEPER/">Wiki</a>
|
|
|
+</li>
|
|
|
+<li class="current">
|
|
|
+<a class="selected" href="index.html">ZooKeeper 3.4 Documentation</a>
|
|
|
+</li>
|
|
|
+</ul>
|
|
|
+<!--+
|
|
|
+ |end Tabs
|
|
|
+ +-->
|
|
|
+</div>
|
|
|
+</div>
|
|
|
+<div id="main">
|
|
|
+<div id="publishedStrip">
|
|
|
+<!--+
|
|
|
+ |start Subtabs
|
|
|
+ +-->
|
|
|
+<div id="level2tabs"></div>
|
|
|
+<!--+
|
|
|
+ |end Endtabs
|
|
|
+ +-->
|
|
|
+<script type="text/javascript"><!--
|
|
|
+document.write("Last Published: " + document.lastModified);
|
|
|
+// --></script>
|
|
|
+</div>
|
|
|
+<!--+
|
|
|
+ |breadtrail
|
|
|
+ +-->
|
|
|
+<div class="breadtrail">
|
|
|
+
|
|
|
+
|
|
|
+ </div>
|
|
|
+<!--+
|
|
|
+ |start Menu, mainarea
|
|
|
+ +-->
|
|
|
+<!--+
|
|
|
+ |start Menu
|
|
|
+ +-->
|
|
|
+<div id="menu">
|
|
|
+<div onclick="SwitchMenu('menu_1.1', 'skin/')" id="menu_1.1Title" class="menutitle">Overview</div>
|
|
|
+<div id="menu_1.1" class="menuitemgroup">
|
|
|
+<div class="menuitem">
|
|
|
+<a href="index.html">Welcome</a>
|
|
|
+</div>
|
|
|
+<div class="menuitem">
|
|
|
+<a href="zookeeperOver.html">Overview</a>
|
|
|
+</div>
|
|
|
+<div class="menuitem">
|
|
|
+<a href="zookeeperStarted.html">Getting Started</a>
|
|
|
+</div>
|
|
|
+<div class="menuitem">
|
|
|
+<a href="releasenotes.html">Release Notes</a>
|
|
|
+</div>
|
|
|
+</div>
|
|
|
+<div onclick="SwitchMenu('menu_1.2', 'skin/')" id="menu_1.2Title" class="menutitle">Developer</div>
|
|
|
+<div id="menu_1.2" class="menuitemgroup">
|
|
|
+<div class="menuitem">
|
|
|
+<a href="api/index.html">API Docs</a>
|
|
|
+</div>
|
|
|
+<div class="menuitem">
|
|
|
+<a href="zookeeperProgrammers.html">Programmer's Guide</a>
|
|
|
+</div>
|
|
|
+<div class="menuitem">
|
|
|
+<a href="javaExample.html">Java Example</a>
|
|
|
+</div>
|
|
|
+<div class="menuitem">
|
|
|
+<a href="zookeeperTutorial.html">Barrier and Queue Tutorial</a>
|
|
|
+</div>
|
|
|
+<div class="menuitem">
|
|
|
+<a href="recipes.html">Recipes</a>
|
|
|
+</div>
|
|
|
+</div>
|
|
|
+<div onclick="SwitchMenu('menu_selected_1.3', 'skin/')" id="menu_selected_1.3Title" class="menutitle" style="background-image: url('skin/images/chapter_open.gif');">Admin & Ops</div>
|
|
|
+<div id="menu_selected_1.3" class="selectedmenuitemgroup" style="display: block;">
|
|
|
+<div class="menuitem">
|
|
|
+<a href="zookeeperAdmin.html">Administrator's Guide</a>
|
|
|
+</div>
|
|
|
+<div class="menuitem">
|
|
|
+<a href="zookeeperQuotas.html">Quota Guide</a>
|
|
|
+</div>
|
|
|
+<div class="menuitem">
|
|
|
+<a href="zookeeperJMX.html">JMX</a>
|
|
|
+</div>
|
|
|
+<div class="menuitem">
|
|
|
+<a href="zookeeperObservers.html">Observers Guide</a>
|
|
|
+</div>
|
|
|
+<div class="menupage">
|
|
|
+<div class="menupagetitle">Dynamic Reconfiguration</div>
|
|
|
+</div>
|
|
|
+</div>
|
|
|
+<div onclick="SwitchMenu('menu_1.4', 'skin/')" id="menu_1.4Title" class="menutitle">Contributor</div>
|
|
|
+<div id="menu_1.4" class="menuitemgroup">
|
|
|
+<div class="menuitem">
|
|
|
+<a href="zookeeperInternals.html">ZooKeeper Internals</a>
|
|
|
+</div>
|
|
|
+</div>
|
|
|
+<div onclick="SwitchMenu('menu_1.5', 'skin/')" id="menu_1.5Title" class="menutitle">Miscellaneous</div>
|
|
|
+<div id="menu_1.5" class="menuitemgroup">
|
|
|
+<div class="menuitem">
|
|
|
+<a href="https://cwiki.apache.org/confluence/display/ZOOKEEPER">Wiki</a>
|
|
|
+</div>
|
|
|
+<div class="menuitem">
|
|
|
+<a href="https://cwiki.apache.org/confluence/display/ZOOKEEPER/FAQ">FAQ</a>
|
|
|
+</div>
|
|
|
+<div class="menuitem">
|
|
|
+<a href="http://zookeeper.apache.org/mailing_lists.html">Mailing Lists</a>
|
|
|
+</div>
|
|
|
+</div>
|
|
|
+<div id="credit"></div>
|
|
|
+<div id="roundbottom">
|
|
|
+<img style="display: none" class="corner" height="15" width="15" alt="" src="skin/images/rc-b-l-15-1body-2menu-3menu.png"></div>
|
|
|
+<!--+
|
|
|
+ |alternative credits
|
|
|
+ +-->
|
|
|
+<div id="credit2"></div>
|
|
|
+</div>
|
|
|
+<!--+
|
|
|
+ |end Menu
|
|
|
+ +-->
|
|
|
+<!--+
|
|
|
+ |start content
|
|
|
+ +-->
|
|
|
+<div id="content">
|
|
|
+<div title="Portable Document Format" class="pdflink">
|
|
|
+<a class="dida" href="zookeeperReconfig.pdf"><img alt="PDF -icon" src="skin/images/pdfdoc.gif" class="skin"><br>
|
|
|
+ PDF</a>
|
|
|
+</div>
|
|
|
+<h1>ZooKeeper Dynamic Reconfiguration</h1>
|
|
|
+<div id="front-matter">
|
|
|
+<div id="minitoc-area">
|
|
|
+<ul class="minitoc">
|
|
|
+<li>
|
|
|
+<a href="#ch_reconfig_intro">Overview</a>
|
|
|
+</li>
|
|
|
+<li>
|
|
|
+<a href="#ch_reconfig_format">Changes to Configuration Format</a>
|
|
|
+<ul class="minitoc">
|
|
|
+<li>
|
|
|
+<a href="#sc_reconfig_clientport">Specifying the client port</a>
|
|
|
+</li>
|
|
|
+<li>
|
|
|
+<a href="#sc_reconfig_standaloneEnabled">The standaloneEnabled flag</a>
|
|
|
+</li>
|
|
|
+<li>
|
|
|
+<a href="#sc_reconfig_file">Dynamic configuration file</a>
|
|
|
+</li>
|
|
|
+<li>
|
|
|
+<a href="#sc_reconfig_backward">Backward compatibility</a>
|
|
|
+</li>
|
|
|
+</ul>
|
|
|
+</li>
|
|
|
+<li>
|
|
|
+<a href="#ch_reconfig_upgrade">Upgrading to 3.5.0</a>
|
|
|
+</li>
|
|
|
+<li>
|
|
|
+<a href="#ch_reconfig_dyn">Dynamic Reconfiguration of the ZooKeeper Ensemble</a>
|
|
|
+<ul class="minitoc">
|
|
|
+<li>
|
|
|
+<a href="#sc_reconfig_retrieving">Retrieving the current dynamic configuration</a>
|
|
|
+</li>
|
|
|
+<li>
|
|
|
+<a href="#sc_reconfig_modifying">Modifying the current dynamic configuration</a>
|
|
|
+<ul class="minitoc">
|
|
|
+<li>
|
|
|
+<a href="#sc_reconfig_general">General</a>
|
|
|
+</li>
|
|
|
+<li>
|
|
|
+<a href="#sc_reconfig_incremental">Incremental mode</a>
|
|
|
+</li>
|
|
|
+<li>
|
|
|
+<a href="#sc_reconfig_nonincremental">Non-incremental mode</a>
|
|
|
+</li>
|
|
|
+<li>
|
|
|
+<a href="#sc_reconfig_conditional">Conditional reconfig</a>
|
|
|
+</li>
|
|
|
+<li>
|
|
|
+<a href="#sc_reconfig_errors">Error conditions</a>
|
|
|
+</li>
|
|
|
+<li>
|
|
|
+<a href="#sc_reconfig_additional">Additional comments</a>
|
|
|
+</li>
|
|
|
+</ul>
|
|
|
+</li>
|
|
|
+</ul>
|
|
|
+</li>
|
|
|
+<li>
|
|
|
+<a href="#ch_reconfig_rebalancing">Rebalancing Client Connections</a>
|
|
|
+</li>
|
|
|
+</ul>
|
|
|
+</div>
|
|
|
+</div>
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+<a name="ch_reconfig_intro"></a>
|
|
|
+<h2 class="h3">Overview</h2>
|
|
|
+<div class="section">
|
|
|
+<p>Prior to the 3.5.0 release, the membership and all other configuration
|
|
|
+ parameters of Zookeeper were static - loaded during boot and immutable at
|
|
|
+ runtime. Operators resorted to “rolling restarts” - a manually intensive
|
|
|
+ and error-prone method of changing the configuration that has caused data
|
|
|
+ loss and inconsistency in production.</p>
|
|
|
+<p>Starting with 3.5.0, “rolling restarts” are no longer needed!
|
|
|
+ ZooKeeper comes with full support for automated configuration changes: the
|
|
|
+ set of Zookeeper servers, their roles (participant / observer), all ports,
|
|
|
+ and even the quorum system can be changed dynamically, without service
|
|
|
+ interruption and while maintaining data consistency. Reconfigurations are
|
|
|
+ performed immediately, just like other operations in ZooKeeper. Multiple
|
|
|
+ changes can be done using a single reconfiguration command. The dynamic
|
|
|
+ reconfiguration functionality does not limit operation concurrency, does
|
|
|
+ not require client operations to be stopped during reconfigurations, has a
|
|
|
+ very simple interface for administrators and no added complexity to other
|
|
|
+ client operations.</p>
|
|
|
+<p>New client-side features allow clients to find out about configuration
|
|
|
+ changes and to update the connection string (list of servers and their
|
|
|
+ client ports) stored in their ZooKeeper handle. A probabilistic algorithm
|
|
|
+ is used to rebalance clients across the new configuration servers while
|
|
|
+ keeping the extent of client migrations proportional to the change in
|
|
|
+ ensemble membership.</p>
|
|
|
+<p>This document provides the administrator manual for reconfiguration.
|
|
|
+ For a detailed description of the reconfiguration algorithms, performance
|
|
|
+ measurements, and more, please see our paper:</p>
|
|
|
+<dl>
|
|
|
+
|
|
|
+<dt>
|
|
|
+<term>Shraer, A., Reed, B., Malkhi, D., Junqueira, F. Dynamic
|
|
|
+ Reconfiguration of Primary/Backup Clusters. In <em>USENIX Annual
|
|
|
+ Technical Conference (ATC) </em>(2012), 425-437</term>
|
|
|
+</dt>
|
|
|
+<dd>
|
|
|
+<p>Links: <a href="https://www.usenix.org/system/files/conference/atc12/atc12-final74.pdf">paper (pdf)</a>, <a href="https://www.usenix.org/sites/default/files/conference/protected-files/shraer_atc12_slides.pdf">slides (pdf)</a>, <a href="https://www.usenix.org/conference/atc12/technical-sessions/presentation/shraer">video</a>, <a href="http://www.slideshare.net/Hadoop_Summit/dynamic-reconfiguration-of-zookeeper">hadoop summit slides</a>
|
|
|
+</p>
|
|
|
+</dd>
|
|
|
+
|
|
|
+</dl>
|
|
|
+</div>
|
|
|
+
|
|
|
+<a name="ch_reconfig_format"></a>
|
|
|
+<h2 class="h3">Changes to Configuration Format</h2>
|
|
|
+<div class="section">
|
|
|
+<a name="sc_reconfig_clientport"></a>
|
|
|
+<h3 class="h4">Specifying the client port</h3>
|
|
|
+<p>A client port of a server is the port on which the server accepts
|
|
|
+ client connection requests. Starting with 3.5.0 the
|
|
|
+ <em>clientPort</em> and <em>clientPortAddress
|
|
|
+ </em> configuration parameters should no longer be used. Instead,
|
|
|
+ this information is now part of the server keyword specification, which
|
|
|
+ becomes as follows:</p>
|
|
|
+<p>
|
|
|
+<span class="codefrag computeroutput">server.<positive id> = <address1>:<port1>:<port2>[:role];[<client port address>:]<client port></span>
|
|
|
+</p>
|
|
|
+<p>The client port specification is to the right of the semicolon. The
|
|
|
+ client port address is optional, and if not specified it defaults to
|
|
|
+ "0.0.0.0". As usual, role is also optional, it can be
|
|
|
+ <em>participant</em> or <em>observer</em>
|
|
|
+ (<em>participant</em> by default).</p>
|
|
|
+<p> Examples of legal server statements: </p>
|
|
|
+<ul>
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>
|
|
|
+<span class="codefrag computeroutput">server.5 = 125.23.63.23:1234:1235;1236</span>
|
|
|
+</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>
|
|
|
+<span class="codefrag computeroutput">server.5 = 125.23.63.23:1234:1235:participant;1236</span>
|
|
|
+</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>
|
|
|
+<span class="codefrag computeroutput">server.5 = 125.23.63.23:1234:1235:observer;1236</span>
|
|
|
+</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>
|
|
|
+<span class="codefrag computeroutput">server.5 = 125.23.63.23:1234:1235;125.23.63.24:1236</span>
|
|
|
+</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>
|
|
|
+<span class="codefrag computeroutput">server.5 = 125.23.63.23:1234:1235:participant;125.23.63.23:1236</span>
|
|
|
+</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+</ul>
|
|
|
+<a name="sc_reconfig_standaloneEnabled"></a>
|
|
|
+<h3 class="h4">The standaloneEnabled flag</h3>
|
|
|
+<p>Prior to 3.5.0, one could run ZooKeeper in Standalone mode or in a
|
|
|
+ Distributed mode. These are separate implementation stacks, and
|
|
|
+ switching between them during run time is not possible. By default (for
|
|
|
+ backward compatibility) <em>standaloneEnabled</em> is set to
|
|
|
+ <em>true</em>. The consequence of using this default is that
|
|
|
+ if started with a single server the ensemble will not be allowed to
|
|
|
+ grow, and if started with more than one server it will not be allowed to
|
|
|
+ shrink to contain fewer than two participants.</p>
|
|
|
+<p>Setting the flag to <em>false</em> instructs the system
|
|
|
+ to run the Distributed software stack even if there is only a single
|
|
|
+ participant in the ensemble. To achieve this the (static) configuration
|
|
|
+ file should contain:</p>
|
|
|
+<p>
|
|
|
+<span class="codefrag computeroutput">standaloneEnabled=false</span>
|
|
|
+</p>
|
|
|
+<p>With this setting it is possible to start a ZooKeeper ensemble
|
|
|
+ containing a single participant and to dynamically grow it by adding
|
|
|
+ more servers. Similarly, it is possible to shrink an ensemble so that
|
|
|
+ just a single participant remains, by removing servers.</p>
|
|
|
+<p>Since running the Distributed mode allows more flexibility, we
|
|
|
+ recommend setting the flag to <em>false</em>. We expect that
|
|
|
+ the legacy Standalone mode will be deprecated in the future.</p>
|
|
|
+<a name="sc_reconfig_file"></a>
|
|
|
+<h3 class="h4">Dynamic configuration file</h3>
|
|
|
+<p>Starting with 3.5.0 we're distinguishing between dynamic
|
|
|
+ configuration parameters, which can be changed during runtime, and
|
|
|
+ static configuration parameters, which are read from a configuration
|
|
|
+ file when a server boots and don't change during its execution. For now,
|
|
|
+ the following configuration keywords are considered part of the dynamic
|
|
|
+ configuration: <em>server</em>, <em>group</em>
|
|
|
+ and <em>weight</em>.</p>
|
|
|
+<p>Dynamic configuration parameters are stored in a separate file on
|
|
|
+ the server (which we call the dynamic configuration file). This file is
|
|
|
+ linked from the static config file using the new
|
|
|
+ <em>dynamicConfigFile</em> keyword.</p>
|
|
|
+<p>
|
|
|
+<strong>Example</strong>
|
|
|
+</p>
|
|
|
+<div class="note example">
|
|
|
+<div class="label">zoo_replicated1.cfg</div>
|
|
|
+<div class="content">
|
|
|
+
|
|
|
+<title>zoo_replicated1.cfg</title>
|
|
|
+
|
|
|
+<pre class="code">tickTime=2000
|
|
|
+dataDir=/zookeeper/data/zookeeper1
|
|
|
+initLimit=5
|
|
|
+syncLimit=2
|
|
|
+dynamicConfigFile=/zookeeper/conf/zoo_replicated1.cfg.dynamic</pre>
|
|
|
+
|
|
|
+</div>
|
|
|
+</div>
|
|
|
+<div class="note example">
|
|
|
+<div class="label">zoo_replicated1.cfg.dynamic</div>
|
|
|
+<div class="content">
|
|
|
+
|
|
|
+<title>zoo_replicated1.cfg.dynamic</title>
|
|
|
+
|
|
|
+<pre class="code">server.1=125.23.63.23:2780:2783:participant;2791
|
|
|
+server.2=125.23.63.24:2781:2784:participant;2792
|
|
|
+server.3=125.23.63.25:2782:2785:participant;2793</pre>
|
|
|
+
|
|
|
+</div>
|
|
|
+</div>
|
|
|
+<p>When the ensemble configuration changes, the static configuration
|
|
|
+ parameters remain the same. The dynamic parameters are pushed by
|
|
|
+ ZooKeeper and overwrite the dynamic configuration files on all servers.
|
|
|
+ Thus, the dynamic configuration files on the different servers are
|
|
|
+ usually identical (they can only differ momentarily when a
|
|
|
+ reconfiguration is in progress, or if a new configuration hasn't
|
|
|
+ propagated yet to some of the servers). Once created, the dynamic
|
|
|
+ configuration file should not be manually altered. Changes are only made
|
|
|
+ through the new reconfiguration commands outlined below. Note that
|
|
|
+ changing the config of an offline cluster could result in an
|
|
|
+ inconsistency with respect to configuration information stored in the
|
|
|
+ ZooKeeper log (and the special configuration znode, populated from the
|
|
|
+ log) and is therefore highly discouraged.</p>
|
|
|
+<p>
|
|
|
+<strong>Example 2</strong>
|
|
|
+</p>
|
|
|
+<p>Users may prefer to initially specify a single configuration file.
|
|
|
+ The following is thus also legal:</p>
|
|
|
+<div class="note example">
|
|
|
+<div class="label">zoo_replicated1.cfg</div>
|
|
|
+<div class="content">
|
|
|
+
|
|
|
+<title>zoo_replicated1.cfg</title>
|
|
|
+
|
|
|
+<pre class="code">tickTime=2000
|
|
|
+dataDir=/zookeeper/data/zookeeper1
|
|
|
+initLimit=5
|
|
|
+syncLimit=2
|
|
|
+clientPort=<strong>2791</strong> // note that this line is now redundant and therefore not recommended
|
|
|
+server.1=125.23.63.23:2780:2783:participant;<strong>2791</strong>
|
|
|
+server.2=125.23.63.24:2781:2784:participant;2792
|
|
|
+server.3=125.23.63.25:2782:2785:participant;2793</pre>
|
|
|
+
|
|
|
+</div>
|
|
|
+</div>
|
|
|
+<p>The configuration files on each server will be automatically split
|
|
|
+ into dynamic and static files, if they are not already in this format.
|
|
|
+ So the configuration file above will be automatically transformed into
|
|
|
+ the two files in Example 1. Note that the clientPort and
|
|
|
+ clientPortAddress lines (if specified) will be automatically removed
|
|
|
+ during this process, if they are redundant (as in the example above).
|
|
|
+ The original static configuration file is backed up (in a .bak
|
|
|
+ file).</p>
|
|
|
+<a name="sc_reconfig_backward"></a>
|
|
|
+<h3 class="h4">Backward compatibility</h3>
|
|
|
+<p>We still support the old configuration format. For example, the
|
|
|
+ following configuration file is acceptable (but not recommended):</p>
|
|
|
+<div class="note example">
|
|
|
+<div class="label">zoo_replicated1.cfg</div>
|
|
|
+<div class="content">
|
|
|
+
|
|
|
+<title>zoo_replicated1.cfg</title>
|
|
|
+
|
|
|
+<pre class="code">tickTime=2000
|
|
|
+dataDir=/zookeeper/data/zookeeper1
|
|
|
+initLimit=5
|
|
|
+syncLimit=2
|
|
|
+clientPort=2791
|
|
|
+server.1=125.23.63.23:2780:2783:participant
|
|
|
+server.2=125.23.63.24:2781:2784:participant
|
|
|
+server.3=125.23.63.25:2782:2785:participant</pre>
|
|
|
+
|
|
|
+</div>
|
|
|
+</div>
|
|
|
+<p>During boot, a dynamic configuration file is created and contains
|
|
|
+ the dynamic part of the configuration as explained earlier. In this
|
|
|
+ case, however, the line "clientPort=2791" will remain in the static
|
|
|
+ configuration file of server 1 since it is not redundant -- it was not
|
|
|
+ specified as part of the "server.1=..." using the format explained in
|
|
|
+ the section <a href="#ch_reconfig_format">Changes to Configuration Format</a>. If a reconfiguration
|
|
|
+ is invoked that sets the client port of server 1, we remove
|
|
|
+ "clientPort=2791" from the static configuration file (the dynamic file
|
|
|
+ now contains this information as part of the specification of server
|
|
|
+ 1).</p>
|
|
|
+</div>
|
|
|
+
|
|
|
+<a name="ch_reconfig_upgrade"></a>
|
|
|
+<h2 class="h3">Upgrading to 3.5.0</h2>
|
|
|
+<div class="section">
|
|
|
+<p>Upgrading a running ZooKeeper ensemble to 3.5.0 should be done only
|
|
|
+ after upgrading your ensemble to the 3.4.6 release. Note that this is only
|
|
|
+ necessary for rolling upgrades (if you're fine with shutting down the
|
|
|
+ system completely, you don't have to go through 3.4.6). If you attempt a
|
|
|
+ rolling upgrade without going through 3.4.6 (for example from 3.4.5), you
|
|
|
+ may get the following error:</p>
|
|
|
+<pre class="code">2013-01-30 11:32:10,663 [myid:2] - INFO [localhost/127.0.0.1:2784:QuorumCnxManager$Listener@498] - Received connection request /127.0.0.1:60876
|
|
|
+2013-01-30 11:32:10,663 [myid:2] - WARN [localhost/127.0.0.1:2784:QuorumCnxManager@349] - Invalid server id: -65536</pre>
|
|
|
+<p>During a rolling upgrade, each server is taken down in turn and
|
|
|
+ rebooted with the new 3.5.0 binaries. Before starting the server with
|
|
|
+ 3.5.0 binaries, we highly recommend updating the configuration file so
|
|
|
+ that all server statements "server.x=..." contain client ports (see the
|
|
|
+ section <a href="#sc_reconfig_clientport">Specifying the client port</a>). As explained earlier
|
|
|
+ you may leave the configuration in a single file, as well as leave the
|
|
|
+ clientPort/clientPortAddress statements (although if you specify client
|
|
|
+ ports in the new format, these statements are now redundant).</p>
|
|
|
+</div>
|
|
|
+
|
|
|
+<a name="ch_reconfig_dyn"></a>
|
|
|
+<h2 class="h3">Dynamic Reconfiguration of the ZooKeeper Ensemble</h2>
|
|
|
+<div class="section">
|
|
|
+<p>The ZooKeeper Java and C API were extended with getConfig and reconfig
|
|
|
+ commands that facilitate reconfiguration. Both commands have a synchronous
|
|
|
+ (blocking) variant and an asynchronous one. We demonstrate these commands
|
|
|
+ here using the Java CLI, but note that you can similarly use the C CLI or
|
|
|
+ invoke the commands directly from a program just like any other ZooKeeper
|
|
|
+ command.</p>
|
|
|
+<a name="sc_reconfig_retrieving"></a>
|
|
|
+<h3 class="h4">Retrieving the current dynamic configuration</h3>
|
|
|
+<p>The dynamic configuration is stored in a special znode
|
|
|
+ ZooDefs.CONFIG_NODE = /zookeeper/config. The new
|
|
|
+ <span class="codefrag command">config</span> CLI command reads this znode (currently it is
|
|
|
+ simply a wrapper to <span class="codefrag command">get /zookeeper/config</span>). As with
|
|
|
+ normal reads, to retrieve the latest committed value you should do a
|
|
|
+ <span class="codefrag command">sync</span> first.</p>
|
|
|
+<pre class="code">[zk: 127.0.0.1:2791(CONNECTED) 3] config
|
|
|
+server.1=localhost:2780:2783:participant;localhost:2791
|
|
|
+server.2=localhost:2781:2784:participant;localhost:2792
|
|
|
+server.3=localhost:2782:2785:participant;localhost:2793
|
|
|
+<strong>version=400000003</strong>
|
|
|
+</pre>
|
|
|
+<p>Notice the last line of the output. This is the configuration
|
|
|
+ version. The version equals the zxid of the reconfiguration command
|
|
|
+ which created this configuration. The version of the first established
|
|
|
+ configuration equals the zxid of the NEWLEADER message sent by the
|
|
|
+ first successfully established leader. When a configuration is written
|
|
|
+ to a dynamic configuration file, the version automatically becomes part
|
|
|
+ of the filename and the static configuration file is updated with the
|
|
|
+ path to the new dynamic configuration file. Configuration files
|
|
|
+ corresponding to earlier versions are retained for backup
|
|
|
+ purposes.</p>
|
|
|
+<p>During boot time the version (if it exists) is extracted from the
|
|
|
+ filename. The version should never be altered manually by users or the
|
|
|
+ system administrator. It is used by the system to know which
|
|
|
+ configuration is most up-to-date. Manipulating it manually can result in
|
|
|
+ data loss and inconsistency.</p>
|
|
|
+<p>Just like a <span class="codefrag command">get</span> command, the
|
|
|
+ <span class="codefrag command">config</span> CLI command accepts the <span class="codefrag option">-w</span>
|
|
|
+ flag for setting a watch on the znode, and <span class="codefrag option">-s</span> flag for
|
|
|
+ displaying the Stats of the znode. It additionally accepts a new flag
|
|
|
+ <span class="codefrag option">-c</span> which outputs only the version and the client
|
|
|
+ connection string corresponding to the current configuration. For
|
|
|
+ example, for the configuration above we would get:</p>
|
|
|
+<pre class="code">[zk: 127.0.0.1:2791(CONNECTED) 17] config -c
|
|
|
+400000003 localhost:2791,localhost:2793,localhost:2792</pre>
|
|
|
+<p>Note that when using the API directly, this command is called
|
|
|
+ <span class="codefrag command">getConfig</span>.</p>
|
|
|
+<p>As any read command it returns the configuration known to the
|
|
|
+ follower to which your client is connected, which may be slightly
|
|
|
+ out-of-date. One can use the <span class="codefrag command">sync</span> command for
|
|
|
+ stronger guarantees. For example using the Java API:</p>
|
|
|
+<pre class="code">zk.sync(ZooDefs.CONFIG_NODE, void_callback, context);
|
|
|
+zk.getConfig(watcher, callback, context);</pre>
|
|
|
+<p>Note: in 3.5.0 it doesn't really matter which path is passed to the
|
|
|
+ <span class="codefrag command">sync() </span> command as all the server's state is brought
|
|
|
+ up to date with the leader (so one could use a different path instead of
|
|
|
+ ZooDefs.CONFIG_NODE). However, this may change in the future.</p>
|
|
|
+<a name="sc_reconfig_modifying"></a>
|
|
|
+<h3 class="h4">Modifying the current dynamic configuration</h3>
|
|
|
+<p>Modifying the configuration is done through the
|
|
|
+ <span class="codefrag command">reconfig</span> command. There are two modes of
|
|
|
+ reconfiguration: incremental and non-incremental (bulk). The
|
|
|
+ non-incremental simply specifies the new dynamic configuration of the
|
|
|
+ system. The incremental specifies changes to the current configuration.
|
|
|
+ The <span class="codefrag command">reconfig</span> command returns the new
|
|
|
+ configuration.</p>
|
|
|
+<p>A few examples are in: <span class="codefrag filename">ReconfigTest.java</span>,
|
|
|
+ <span class="codefrag filename">ReconfigRecoveryTest.java</span> and
|
|
|
+ <span class="codefrag filename">TestReconfigServer.cc</span>.</p>
|
|
|
+<a name="sc_reconfig_general"></a>
|
|
|
+<h4>General</h4>
|
|
|
+<p>
|
|
|
+<strong>Removing servers:</strong> Any server can
|
|
|
+ be removed, including the leader (although removing the leader will
|
|
|
+ result in a short unavailability, see Figures 6 and 8 in the <a href="https://www.usenix.org/conference/usenixfederatedconferencesweek/dynamic-recon%EF%AC%81guration-primarybackup-clusters">paper</a>). The server will not be shut-down automatically.
|
|
|
+ Instead, it becomes a "non-voting follower". This is somewhat similar
|
|
|
+ to an observer in that its votes don't count towards the Quorum of
|
|
|
+ votes necessary to commit operations. However, unlike a non-voting
|
|
|
+ follower, an observer doesn't actually see any operation proposals and
|
|
|
+ does not ACK them. Thus a non-voting follower has a more significant
|
|
|
+ negative effect on system throughput compared to an observer.
|
|
|
+ Non-voting follower mode should only be used as a temporary mode,
|
|
|
+ before shutting the server down, or adding it as a follower or as an
|
|
|
+ observer to the ensemble. We do not shut the server down automatically
|
|
|
+ for two main reasons. The first reason is that we do not want all the
|
|
|
+ clients connected to this server to be immediately disconnected,
|
|
|
+ causing a flood of connection requests to other servers. Instead, it
|
|
|
+ is better if each client decides when to migrate independently. The
|
|
|
+ second reason is that removing a server may sometimes (rarely) be
|
|
|
+ necessary in order to change it from "observer" to "participant" (this
|
|
|
+ is explained in the section <a href="#sc_reconfig_additional">Additional comments</a>).</p>
|
|
|
+<p>Note that the new configuration should have some minimal number of
|
|
|
+ participants in order to be considered legal. If the proposed change
|
|
|
+ would leave the cluster with less than 2 participants and standalone
|
|
|
+ mode is enabled (standaloneEnabled=true, see the section <a href="#sc_reconfig_standaloneEnabled">The standaloneEnabled flag</a>), the reconfig will not be
|
|
|
+ processed (BadArgumentsException). If standalone mode is disabled
|
|
|
+ (standaloneEnabled=false) then it's legal to remain with 1 or more
|
|
|
+ participants.</p>
|
|
|
+<p>
|
|
|
+<strong>Adding servers:</strong> Before a
|
|
|
+ reconfiguration is invoked, the administrator must make sure that a
|
|
|
+ quorum (majority) of participants from the new configuration are
|
|
|
+ already connected and synced with the current leader. To achieve this
|
|
|
+ we need to connect a new joining server to the leader before it is
|
|
|
+ officially part of the ensemble. This is done by starting the joining
|
|
|
+ server using an initial list of servers which is technically not a
|
|
|
+ legal configuration of the system but (a) contains the joiner, and (b)
|
|
|
+ gives sufficient information to the joiner in order for it to find and
|
|
|
+ connect to the current leader. We list a few different options of
|
|
|
+ doing this safely.</p>
|
|
|
+<ol>
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>Initial configuration of joiners is comprised of servers in
|
|
|
+ the last committed configuration and one or more joiners, where
|
|
|
+ <strong>joiners are listed as observers.</strong>
|
|
|
+ For example, if servers D and E are added at the same time to (A,
|
|
|
+ B, C) and server C is being removed, the initial configuration of
|
|
|
+ D could be (A, B, C, D) or (A, B, C, D, E), where D and E are
|
|
|
+ listed as observers. Similarly, the configuration of E could be
|
|
|
+ (A, B, C, E) or (A, B, C, D, E), where D and E are listed as
|
|
|
+ observers. <strong>Note that listing the joiners as
|
|
|
+ observers will not actually make them observers - it will only
|
|
|
+ prevent them from accidentally forming a quorum with other
|
|
|
+ joiners.</strong> Instead, they will contact the servers in the
|
|
|
+ current configuration and adopt the last committed configuration
|
|
|
+ (A, B, C), where the joiners are absent. Configuration files of
|
|
|
+ joiners are backed up and replaced automatically as this happens.
|
|
|
+ After connecting to the current leader, joiners become non-voting
|
|
|
+ followers until the system is reconfigured and they are added to
|
|
|
+ the ensemble (as participant or observer, as appropriate).</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>Initial configuration of each joiner is comprised of servers
|
|
|
+ in the last committed configuration + <strong>the
|
|
|
+ joiner itself, listed as a participant.</strong> For example, to
|
|
|
+ add a new server D to a configuration consisting of servers (A, B,
|
|
|
+ C), the administrator can start D using an initial configuration
|
|
|
+ file consisting of servers (A, B, C, D). If both D and E are added
|
|
|
+ at the same time to (A, B, C), the initial configuration of D
|
|
|
+ could be (A, B, C, D) and the configuration of E could be (A, B,
|
|
|
+ C, E). Similarly, if D is added and C is removed at the same time,
|
|
|
+ the initial configuration of D could be (A, B, C, D). Never list
|
|
|
+ more than one joiner as participant in the initial configuration
|
|
|
+ (see warning below).</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>Whether listing the joiner as an observer or as participant,
|
|
|
+ it is also fine not to list all the current configuration servers,
|
|
|
+ as long as the current leader is in the list. For example, when
|
|
|
+ adding D we could start D with a configuration file consisting of
|
|
|
+ just (A, D) if A is the current leader. However, this is more
|
|
|
+ fragile since if A fails before D officially joins the ensemble, D
|
|
|
+ doesn’t know anyone else and therefore the administrator will have
|
|
|
+ to intervene and restart D with another server list.</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+</ol>
|
|
|
+<div class="note">
|
|
|
+<div class="label">Warning</div>
|
|
|
+<div class="content">
|
|
|
+
|
|
|
+<h5>Warning</h5>
|
|
|
+
|
|
|
+<p>Never specify more than one joining server in the same initial
|
|
|
+ configuration as participants. Currently, the joining servers don’t
|
|
|
+ know that they are joining an existing ensemble; if multiple joiners
|
|
|
+ are listed as participants they may form an independent quorum
|
|
|
+ creating a split-brain situation such as processing operations
|
|
|
+ independently from your main ensemble. It is OK to list multiple
|
|
|
+ joiners as observers in an initial config.</p>
|
|
|
+
|
|
|
+</div>
|
|
|
+</div>
|
|
|
+<p>Finally, note that once connected to the leader, a joiner adopts
|
|
|
+ the last committed configuration, in which it is absent (the initial
|
|
|
+ config of the joiner is backed up before being rewritten). If the
|
|
|
+ joiner restarts in this state, it will not be able to boot since it is
|
|
|
+ absent from its configuration file. In order to start it you’ll once
|
|
|
+ again have to specify an initial configuration.</p>
|
|
|
+<p>
|
|
|
+<strong>Modifying server parameters:</strong> One
|
|
|
+ can modify any of the ports of a server, or its role
|
|
|
+ (participant/observer) by adding it to the ensemble with different
|
|
|
+ parameters. This works in both the incremental and the bulk
|
|
|
+ reconfiguration modes. It is not necessary to remove the server and
|
|
|
+ then add it back; just specify the new parameters as if the server is
|
|
|
+ not yet in the system. The server will detect the configuration change
|
|
|
+ and perform the necessary adjustments. See an example in the section
|
|
|
+ <a href="#sc_reconfig_incremental">Incremental mode</a> and an exception to this
|
|
|
+ rule in the section <a href="#sc_reconfig_additional">Additional comments</a>.</p>
|
|
|
+<p>It is also possible to change the Quorum System used by the
|
|
|
+ ensemble (for example, change the Majority Quorum System to a
|
|
|
+ Hierarchical Quorum System on the fly). This, however, is only allowed
|
|
|
+ using the bulk (non-incremental) reconfiguration mode. In general,
|
|
|
+ incremental reconfiguration only works with the Majority Quorum
|
|
|
+ System. Bulk reconfiguration works with both Hierarchical and Majority
|
|
|
+ Quorum Systems.</p>
|
|
|
+<p>
|
|
|
+<strong>Performance Impact:</strong> There is
|
|
|
+ practically no performance impact when removing a follower, since it
|
|
|
+ is not being automatically shut down (the effect of removal is that
|
|
|
+ the server's votes are no longer being counted). When adding a server,
|
|
|
+ there is no leader change and no noticeable performance disruption.
|
|
|
+ For details and graphs please see Figures 6, 7 and 8 in the <a href="https://www.usenix.org/conference/usenixfederatedconferencesweek/dynamic-recon%EF%AC%81guration-primarybackup-clusters">paper</a>.</p>
|
|
|
+<p>The most significant disruption will happen when a leader change
|
|
|
+ is caused, in one of the following cases:</p>
|
|
|
+<ol>
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>Leader is removed from the ensemble.</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>Leader's role is changed from participant to observer.</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>The port used by the leader to send transactions to others
|
|
|
+ (quorum port) is modified.</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+</ol>
|
|
|
+<p>In these cases we perform a leader hand-off where the old leader
|
|
|
+ nominates a new leader. The resulting unavailability is usually
|
|
|
+ shorter than when a leader crashes since detecting leader failure is
|
|
|
+ unnecessary and electing a new leader can usually be avoided during a
|
|
|
+ hand-off (see Figures 6 and 8 in the <a href="https://www.usenix.org/conference/usenixfederatedconferencesweek/dynamic-recon%EF%AC%81guration-primarybackup-clusters">paper</a>).</p>
|
|
|
+<p>When the client port of a server is modified, it does not drop
|
|
|
+ existing client connections. New connections to the server will have
|
|
|
+ to use the new client port.</p>
|
|
|
+<p>
|
|
|
+<strong>Progress guarantees:</strong> Up to the
|
|
|
+ invocation of the reconfig operation, a quorum of the old
|
|
|
+ configuration is required to be available and connected for ZooKeeper
|
|
|
+ to be able to make progress. Once reconfig is invoked, a quorum of
|
|
|
+ both the old and of the new configurations must be available. The
|
|
|
+ final transition happens once (a) the new configuration is activated,
|
|
|
+ and (b) all operations scheduled before the new configuration is
|
|
|
+ activated by the leader are committed. Once (a) and (b) happen, only a
|
|
|
+ quorum of the new configuration is required. Note, however, that
|
|
|
+ neither (a) nor (b) are visible to a client. Specifically, when a
|
|
|
+ reconfiguration operation commits, it only means that an activation
|
|
|
+ message was sent out by the leader. It does not necessarily mean that
|
|
|
+ a quorum of the new configuration got this message (which is required
|
|
|
+ in order to activate it) or that (b) has happened. If one wants to
|
|
|
+ make sure that both (a) and (b) has already occurred (for example, in
|
|
|
+ order to know that it is safe to shut down old servers that were
|
|
|
+ removed), one can simply invoke an update
|
|
|
+ (<span class="codefrag command">set-data</span>, or some other quorum operation, but not
|
|
|
+ a <span class="codefrag command">sync</span>) and wait for it to commit. An alternative
|
|
|
+ way to achieve this was to introduce another round to the
|
|
|
+ reconfiguration protocol (which, for simplicity and compatibility with
|
|
|
+ Zab, we decided to avoid).</p>
|
|
|
+<a name="sc_reconfig_incremental"></a>
|
|
|
+<h4>Incremental mode</h4>
|
|
|
+<p>The incremental mode allows adding and removing servers to the
|
|
|
+ current configuration. Multiple changes are allowed. For
|
|
|
+ example:</p>
|
|
|
+<p>
|
|
|
+<span class="codefrag userinput">> reconfig -remove 3 -add
|
|
|
+ server.5=125.23.63.23:1234:1235;1236</span>
|
|
|
+</p>
|
|
|
+<p>Both the add and the remove options get a list of comma separated
|
|
|
+ arguments (no spaces):</p>
|
|
|
+<p>
|
|
|
+<span class="codefrag userinput">> reconfig -remove 3,4 -add
|
|
|
+ server.5=localhost:2111:2112;2113,6=localhost:2114:2115:observer;2116</span>
|
|
|
+</p>
|
|
|
+<p>The format of the server statement is exactly the same as
|
|
|
+ described in the section <a href="#sc_reconfig_clientport">Specifying the client port</a> and
|
|
|
+ includes the client port. Notice that here instead of "server.5=" you
|
|
|
+ can just say "5=". In the example above, if server 5 is already in the
|
|
|
+ system, but has different ports or is not an observer, it is updated
|
|
|
+ and once the configuration commits becomes an observer and starts
|
|
|
+ using these new ports. This is an easy way to turn participants into
|
|
|
+ observers and vice versa, or change any of their ports, without
|
|
|
+ rebooting the server.</p>
|
|
|
+<p>ZooKeeper supports two types of Quorum Systems – the simple
|
|
|
+ Majority system (where the leader commits operations after receiving
|
|
|
+ ACKs from a majority of voters) and a more complex Hierarchical
|
|
|
+ system, where votes of different servers have different weights and
|
|
|
+ servers are divided into voting groups. Currently, incremental
|
|
|
+ reconfiguration is allowed only if the last proposed configuration
|
|
|
+ known to the leader uses a Majority Quorum System
|
|
|
+ (BadArgumentsException is thrown otherwise).</p>
|
|
|
+<p>Incremental mode - examples using the Java API:</p>
|
|
|
+<pre class="code">List<String> leavingServers = new ArrayList<String>();
|
|
|
+leavingServers.add("1");
|
|
|
+leavingServers.add("2");
|
|
|
+byte[] config = zk.reconfig(null, leavingServers, null, -1, new Stat());</pre>
|
|
|
+<pre class="code">List<String> leavingServers = new ArrayList<String>();
|
|
|
+List<String> joiningServers = new ArrayList<String>();
|
|
|
+leavingServers.add("1");
|
|
|
+joiningServers.add("server.4=localhost:1234:1235;1236");
|
|
|
+byte[] config = zk.reconfig(joiningServers, leavingServers, null, -1, new Stat());
|
|
|
+
|
|
|
+String configStr = new String(config);
|
|
|
+System.out.println(configStr);</pre>
|
|
|
+<p>There is also an asynchronous API, and an API accepting comma
|
|
|
+ separated Strings instead of List<String>. See
|
|
|
+ src/java/main/org/apache/zookeeper/ZooKeeper.java.</p>
|
|
|
+<a name="sc_reconfig_nonincremental"></a>
|
|
|
+<h4>Non-incremental mode</h4>
|
|
|
+<p>The second mode of reconfiguration is non-incremental, whereby a
|
|
|
+ client gives a complete specification of the new dynamic system
|
|
|
+ configuration. The new configuration can either be given in place or
|
|
|
+ read from a file:</p>
|
|
|
+<p>
|
|
|
+<span class="codefrag userinput">> reconfig -file newconfig.cfg
|
|
|
+ </span>//newconfig.cfg is a dynamic config file, see <a href="#sc_reconfig_file">Dynamic configuration file</a>
|
|
|
+</p>
|
|
|
+<p>
|
|
|
+<span class="codefrag userinput">> reconfig -members
|
|
|
+ server.1=125.23.63.23:2780:2783:participant;2791,server.2=125.23.63.24:2781:2784:participant;2792,server.3=125.23.63.25:2782:2785:participant;2793</span>
|
|
|
+</p>
|
|
|
+<p>The new configuration may use a different Quorum System. For
|
|
|
+ example, you may specify a Hierarchical Quorum System even if the
|
|
|
+ current ensemble uses a Majority Quorum System.</p>
|
|
|
+<p>Bulk mode - example using the Java API:</p>
|
|
|
+<pre class="code">ArrayList<String> newMembers = new ArrayList<String>();
|
|
|
+newMembers.add("server.1=1111:1234:1235;1236");
|
|
|
+newMembers.add("server.2=1112:1237:1238;1239");
|
|
|
+newMembers.add("server.3=1114:1240:1241:observer;1242");
|
|
|
+
|
|
|
+byte[] config = zk.reconfig(null, null, newMembers, -1, new Stat());
|
|
|
+
|
|
|
+String configStr = new String(config);
|
|
|
+System.out.println(configStr);</pre>
|
|
|
+<p>There is also an asynchronous API, and an API accepting comma
|
|
|
+ separated String containing the new members instead of
|
|
|
+ List<String>. See
|
|
|
+ src/java/main/org/apache/zookeeper/ZooKeeper.java.</p>
|
|
|
+<a name="sc_reconfig_conditional"></a>
|
|
|
+<h4>Conditional reconfig</h4>
|
|
|
+<p>Sometimes (especially in non-incremental mode) a new proposed
|
|
|
+ configuration depends on what the client "believes" to be the current
|
|
|
+ configuration, and should be applied only to that configuration.
|
|
|
+ Specifically, the <span class="codefrag command">reconfig</span> succeeds only if the
|
|
|
+ last configuration at the leader has the specified version.</p>
|
|
|
+<p>
|
|
|
+<span class="codefrag userinput">> reconfig -file <filename> -v <version></span>
|
|
|
+</p>
|
|
|
+<p>In the previously listed Java examples, instead of -1 one could
|
|
|
+ specify a configuration version to condition the
|
|
|
+ reconfiguration.</p>
|
|
|
+<a name="sc_reconfig_errors"></a>
|
|
|
+<h4>Error conditions</h4>
|
|
|
+<p>In addition to normal ZooKeeper error conditions, a
|
|
|
+ reconfiguration may fail for the following reasons:</p>
|
|
|
+<ol>
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>another reconfig is currently in progress
|
|
|
+ (ReconfigInProgress)</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>the proposed change would leave the cluster with fewer than 2
|
|
|
+ participants, in case standalone mode is enabled, or, if
|
|
|
+ standalone mode is disabled then it's legal to remain with 1 or
|
|
|
+ more participants (BadArgumentsException)</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>no quorum of the new configuration was connected and
|
|
|
+ up-to-date with the leader when the reconfiguration processing
|
|
|
+ began (NewConfigNoQuorum)</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>
|
|
|
+<span class="codefrag userinput">-v x</span> was specified, but the version
|
|
|
+ <span class="codefrag userinput">y</span> of the latest configuration is not
|
|
|
+ <span class="codefrag userinput">x</span> (BadVersionException)</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>an incremental reconfiguration was requested but the last
|
|
|
+ configuration at the leader uses a Quorum System which is
|
|
|
+ different from the Majority system (BadArgumentsException)</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>syntax error (BadArgumentsException)</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>I/O exception when reading the configuration from a file
|
|
|
+ (BadArgumentsException)</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+</ol>
|
|
|
+<p>Most of these are illustrated by test-cases in
|
|
|
+ <span class="codefrag filename">ReconfigFailureCases.java</span>.</p>
|
|
|
+<a name="sc_reconfig_additional"></a>
|
|
|
+<h4>Additional comments</h4>
|
|
|
+<p>
|
|
|
+<strong>Liveness:</strong> To better understand
|
|
|
+ the difference between incremental and non-incremental
|
|
|
+ reconfiguration, suppose that client C1 adds server D to the system
|
|
|
+ while a different client C2 adds server E. With the non-incremental
|
|
|
+ mode, each client would first invoke <span class="codefrag command">config</span> to find
|
|
|
+ out the current configuration, and then locally create a new list of
|
|
|
+ servers by adding its own suggested server. The new configuration can
|
|
|
+ then be submitted using the non-incremental
|
|
|
+ <span class="codefrag command">reconfig</span> command. After both reconfigurations
|
|
|
+ complete, only one of E or D will be added (not both), depending on
|
|
|
+ which client's request arrives second to the leader, overwriting the
|
|
|
+ previous configuration. The other client can repeat the process until
|
|
|
+ its change takes effect. This method guarantees system-wide progress
|
|
|
+ (i.e., for one of the clients), but does not ensure that every client
|
|
|
+ succeeds. To have more control, C2 may request to only execute the
|
|
|
+ reconfiguration in case the version of the current configuration
|
|
|
+ hasn't changed, as explained in the section <a href="#sc_reconfig_conditional">Conditional reconfig</a>. In this way it may avoid blindly
|
|
|
+ overwriting the configuration of C1 if C1's configuration reached the
|
|
|
+ leader first.</p>
|
|
|
+<p>With incremental reconfiguration, both changes will take effect as
|
|
|
+ they are simply applied by the leader one after the other to the
|
|
|
+ current configuration, whatever that is (assuming that the second
|
|
|
+ reconfig request reaches the leader after it sends a commit message
|
|
|
+ for the first reconfig request -- currently the leader will refuse to
|
|
|
+ propose a reconfiguration if another one is already pending). Since
|
|
|
+ both clients are guaranteed to make progress, this method guarantees
|
|
|
+ stronger liveness. In practice, multiple concurrent reconfigurations
|
|
|
+ are probably rare. Non-incremental reconfiguration is currently the
|
|
|
+ only way to dynamically change the Quorum System. Incremental
|
|
|
+ configuration is currently only allowed with the Majority Quorum
|
|
|
+ System.</p>
|
|
|
+<p>
|
|
|
+<strong>Changing an observer into a
|
|
|
+ follower:</strong> Clearly, changing a server that participates in
|
|
|
+ voting into an observer may fail if error (2) occurs, i.e., if fewer
|
|
|
+ than the minimal allowed number of participants would remain. However,
|
|
|
+ converting an observer into a participant may sometimes fail for a
|
|
|
+ more subtle reason: Suppose, for example, that the current
|
|
|
+ configuration is (A, B, C, D), where A is the leader, B and C are
|
|
|
+ followers and D is an observer. In addition, suppose that B has
|
|
|
+ crashed. If a reconfiguration is submitted where D is said to become a
|
|
|
+ follower, it will fail with error (3) since in this configuration, a
|
|
|
+ majority of voters in the new configuration (any 3 voters), must be
|
|
|
+ connected and up-to-date with the leader. An observer cannot
|
|
|
+ acknowledge the history prefix sent during reconfiguration, and
|
|
|
+ therefore it does not count towards these 3 required servers and the
|
|
|
+ reconfiguration will be aborted. In case this happens, a client can
|
|
|
+ achieve the same task by two reconfig commands: first invoke a
|
|
|
+ reconfig to remove D from the configuration and then invoke a second
|
|
|
+ command to add it back as a participant (follower). During the
|
|
|
+ intermediate state D is a non-voting follower and can ACK the state
|
|
|
+ transfer performed during the second reconfig command.</p>
|
|
|
+</div>
|
|
|
+
|
|
|
+<a name="ch_reconfig_rebalancing"></a>
|
|
|
+<h2 class="h3">Rebalancing Client Connections</h2>
|
|
|
+<div class="section">
|
|
|
+<p>When a ZooKeeper cluster is started, if each client is given the same
|
|
|
+ connection string (list of servers), the client will randomly choose a
|
|
|
+ server in the list to connect to, which makes the expected number of
|
|
|
+ client connections per server the same for each of the servers. We
|
|
|
+ implemented a method that preserves this property when the set of servers
|
|
|
+ changes through reconfiguration. See Sections 4 and 5.1 in the <a href="https://www.usenix.org/conference/usenixfederatedconferencesweek/dynamic-recon%EF%AC%81guration-primarybackup-clusters">paper</a>.</p>
|
|
|
+<p>In order for the method to work, all clients must subscribe to
|
|
|
+ configuration changes (by setting a watch on /zookeeper/config either
|
|
|
+ directly or through the <span class="codefrag command">getConfig</span> API command). When
|
|
|
+ the watch is triggered, the client should read the new configuration by
|
|
|
+ invoking <span class="codefrag command">sync</span> and <span class="codefrag command">getConfig</span> and if
|
|
|
+ the configuration is indeed new invoke the
|
|
|
+ <span class="codefrag command">updateServerList</span> API command. To avoid mass client
|
|
|
+ migration at the same time, it is better to have each client sleep a
|
|
|
+ random short period of time before invoking
|
|
|
+ <span class="codefrag command">updateServerList</span>.</p>
|
|
|
+<p>A few examples can be found in:
|
|
|
+ <span class="codefrag filename">StaticHostProviderTest.java</span> and
|
|
|
+ <span class="codefrag filename">TestReconfig.cc</span>
|
|
|
+</p>
|
|
|
+<p>Example (this is not a recipe, but a simplified example just to
|
|
|
+ explain the general idea):</p>
|
|
|
+<pre class="code">
|
|
|
+public void process(WatchedEvent event) {
|
|
|
+ synchronized (this) {
|
|
|
+ if (event.getType() == EventType.None) {
|
|
|
+ connected = (event.getState() == KeeperState.SyncConnected);
|
|
|
+ notifyAll();
|
|
|
+ } else if (event.getPath()!=null && event.getPath().equals(ZooDefs.CONFIG_NODE)) {
|
|
|
+ // in prod code never block the event thread!
|
|
|
+ zk.sync(ZooDefs.CONFIG_NODE, this, null);
|
|
|
+ zk.getConfig(this, this, null);
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+public void processResult(int rc, String path, Object ctx, byte[] data, Stat stat) {
|
|
|
+ if (path!=null && path.equals(ZooDefs.CONFIG_NODE)) {
|
|
|
+ String config[] = ConfigUtils.getClientConfigStr(new String(data)).split(" "); // similar to config -c
|
|
|
+ long version = Long.parseLong(config[0], 16);
|
|
|
+ if (this.configVersion == null){
|
|
|
+ this.configVersion = version;
|
|
|
+ } else if (version > this.configVersion) {
|
|
|
+ hostList = config[1];
|
|
|
+ try {
|
|
|
+ // the following command is not blocking but may cause the client to close the socket and
|
|
|
+ // migrate to a different server. In practice its better to wait a short period of time, chosen
|
|
|
+ // randomly, so that different clients migrate at different times
|
|
|
+ zk.updateServerList(hostList);
|
|
|
+ } catch (IOException e) {
|
|
|
+ System.err.println("Error updating server list");
|
|
|
+ e.printStackTrace();
|
|
|
+ }
|
|
|
+ this.configVersion = version;
|
|
|
+} } }</pre>
|
|
|
+</div>
|
|
|
+
|
|
|
+<p align="right">
|
|
|
+<font size="-2"></font>
|
|
|
+</p>
|
|
|
+</div>
|
|
|
+<!--+
|
|
|
+ |end content
|
|
|
+ +-->
|
|
|
+<div class="clearboth"> </div>
|
|
|
+</div>
|
|
|
+<div id="footer">
|
|
|
+<!--+
|
|
|
+ |start bottomstrip
|
|
|
+ +-->
|
|
|
+<div class="lastmodified">
|
|
|
+<script type="text/javascript"><!--
|
|
|
+document.write("Last Published: " + document.lastModified);
|
|
|
+// --></script>
|
|
|
+</div>
|
|
|
+<div class="copyright">
|
|
|
+ Copyright ©
|
|
|
+ 2008-2013 <a href="http://www.apache.org/licenses/">The Apache Software Foundation.</a>
|
|
|
+</div>
|
|
|
+<!--+
|
|
|
+ |end bottomstrip
|
|
|
+ +-->
|
|
|
+</div>
|
|
|
+</body>
|
|
|
+</html>
|