zookeeperReconfig.html 56 KB


  1. <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
  2. <html>
  3. <head>
  4. <META http-equiv="Content-Type" content="text/html; charset=UTF-8">
  5. <meta content="Apache Forrest" name="Generator">
  6. <meta name="Forrest-version" content="0.9">
  7. <meta name="Forrest-skin-name" content="pelt">
  8. <title>ZooKeeper Dynamic Reconfiguration</title>
  9. <link type="text/css" href="skin/basic.css" rel="stylesheet">
  10. <link media="screen" type="text/css" href="skin/screen.css" rel="stylesheet">
  11. <link media="print" type="text/css" href="skin/print.css" rel="stylesheet">
  12. <link type="text/css" href="skin/profile.css" rel="stylesheet">
  13. <script src="skin/getBlank.js" language="javascript" type="text/javascript"></script><script src="skin/getMenu.js" language="javascript" type="text/javascript"></script><script src="skin/fontsize.js" language="javascript" type="text/javascript"></script>
  14. <link rel="shortcut icon" href="images/favicon.ico">
  15. </head>
  16. <body onload="init()">
  17. <script type="text/javascript">ndeSetTextSize();</script>
  18. <div id="top">
  19. <!--+
  20. |breadtrail
  21. +-->
  22. <div class="breadtrail">
  23. <a href="http://www.apache.org/">Apache</a> &gt; <a href="http://zookeeper.apache.org/">ZooKeeper</a> &gt; <a href="http://zookeeper.apache.org/">ZooKeeper</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
  24. </div>
  25. <!--+
  26. |header
  27. +-->
  28. <div class="header">
  29. <!--+
  30. |start group logo
  31. +-->
  32. <div class="grouplogo">
  33. <a href="http://hadoop.apache.org/"><img class="logoImage" alt="Hadoop" src="images/hadoop-logo.jpg" title="Apache Hadoop"></a>
  34. </div>
  35. <!--+
  36. |end group logo
  37. +-->
  38. <!--+
  39. |start Project Logo
  40. +-->
  41. <div class="projectlogo">
  42. <a href="http://zookeeper.apache.org/"><img class="logoImage" alt="ZooKeeper" src="images/zookeeper_small.gif" title="ZooKeeper: distributed coordination"></a>
  43. </div>
  44. <!--+
  45. |end Project Logo
  46. +-->
  47. <!--+
  48. |start Search
  49. +-->
  50. <div class="searchbox">
  51. <form action="http://www.google.com/search" method="get" class="roundtopsmall">
  52. <input value="zookeeper.apache.org" name="sitesearch" type="hidden"><input onFocus="getBlank (this, 'Search the site with google');" size="25" name="q" id="query" type="text" value="Search the site with google">&nbsp;
  53. <input name="Search" value="Search" type="submit">
  54. </form>
  55. </div>
  56. <!--+
  57. |end search
  58. +-->
  59. <!--+
  60. |start Tabs
  61. +-->
  62. <ul id="tabs">
  63. <li>
  64. <a class="unselected" href="http://zookeeper.apache.org/">Project</a>
  65. </li>
  66. <li>
  67. <a class="unselected" href="https://cwiki.apache.org/confluence/display/ZOOKEEPER/">Wiki</a>
  68. </li>
  69. <li class="current">
  70. <a class="selected" href="index.html">ZooKeeper 3.6 Documentation</a>
  71. </li>
  72. </ul>
  73. <!--+
  74. |end Tabs
  75. +-->
  76. </div>
  77. </div>
  78. <div id="main">
  79. <div id="publishedStrip">
  80. <!--+
  81. |start Subtabs
  82. +-->
  83. <div id="level2tabs"></div>
  84. <!--+
  85. |end Endtabs
  86. +-->
  87. <script type="text/javascript"><!--
  88. document.write("Last Published: " + document.lastModified);
  89. // --></script>
  90. </div>
  91. <!--+
  92. |breadtrail
  93. +-->
  94. <div class="breadtrail">
  95. &nbsp;
  96. </div>
  97. <!--+
  98. |start Menu, mainarea
  99. +-->
  100. <!--+
  101. |start Menu
  102. +-->
  103. <div id="menu">
  104. <div onclick="SwitchMenu('menu_1.1', 'skin/')" id="menu_1.1Title" class="menutitle">Overview</div>
  105. <div id="menu_1.1" class="menuitemgroup">
  106. <div class="menuitem">
  107. <a href="index.html">Welcome</a>
  108. </div>
  109. <div class="menuitem">
  110. <a href="zookeeperOver.html">Overview</a>
  111. </div>
  112. <div class="menuitem">
  113. <a href="zookeeperStarted.html">Getting Started</a>
  114. </div>
  115. <div class="menuitem">
  116. <a href="releasenotes.html">Release Notes</a>
  117. </div>
  118. </div>
  119. <div onclick="SwitchMenu('menu_1.2', 'skin/')" id="menu_1.2Title" class="menutitle">Developer</div>
  120. <div id="menu_1.2" class="menuitemgroup">
  121. <div class="menuitem">
  122. <a href="api/index.html">API Docs</a>
  123. </div>
  124. <div class="menuitem">
  125. <a href="zookeeperProgrammers.html">Programmer's Guide</a>
  126. </div>
  127. <div class="menuitem">
  128. <a href="javaExample.html">Java Example</a>
  129. </div>
  130. <div class="menuitem">
  131. <a href="zookeeperTutorial.html">Barrier and Queue Tutorial</a>
  132. </div>
  133. <div class="menuitem">
  134. <a href="recipes.html">Recipes</a>
  135. </div>
  136. </div>
  137. <div onclick="SwitchMenu('menu_selected_1.3', 'skin/')" id="menu_selected_1.3Title" class="menutitle" style="background-image: url('skin/images/chapter_open.gif');">Admin &amp; Ops</div>
  138. <div id="menu_selected_1.3" class="selectedmenuitemgroup" style="display: block;">
  139. <div class="menuitem">
  140. <a href="zookeeperAdmin.html">Administrator's Guide</a>
  141. </div>
  142. <div class="menuitem">
  143. <a href="zookeeperQuotas.html">Quota Guide</a>
  144. </div>
  145. <div class="menuitem">
  146. <a href="zookeeperJMX.html">JMX</a>
  147. </div>
  148. <div class="menuitem">
  149. <a href="zookeeperObservers.html">Observers Guide</a>
  150. </div>
  151. <div class="menupage">
  152. <div class="menupagetitle">Dynamic Reconfiguration</div>
  153. </div>
  154. </div>
  155. <div onclick="SwitchMenu('menu_1.4', 'skin/')" id="menu_1.4Title" class="menutitle">Contributor</div>
  156. <div id="menu_1.4" class="menuitemgroup">
  157. <div class="menuitem">
  158. <a href="zookeeperInternals.html">ZooKeeper Internals</a>
  159. </div>
  160. </div>
  161. <div onclick="SwitchMenu('menu_1.5', 'skin/')" id="menu_1.5Title" class="menutitle">Miscellaneous</div>
  162. <div id="menu_1.5" class="menuitemgroup">
  163. <div class="menuitem">
  164. <a href="https://cwiki.apache.org/confluence/display/ZOOKEEPER">Wiki</a>
  165. </div>
  166. <div class="menuitem">
  167. <a href="https://cwiki.apache.org/confluence/display/ZOOKEEPER/FAQ">FAQ</a>
  168. </div>
  169. <div class="menuitem">
  170. <a href="http://zookeeper.apache.org/mailing_lists.html">Mailing Lists</a>
  171. </div>
  172. </div>
  173. <div id="credit"></div>
  174. <div id="roundbottom">
  175. <img style="display: none" class="corner" height="15" width="15" alt="" src="skin/images/rc-b-l-15-1body-2menu-3menu.png"></div>
  176. <!--+
  177. |alternative credits
  178. +-->
  179. <div id="credit2"></div>
  180. </div>
  181. <!--+
  182. |end Menu
  183. +-->
  184. <!--+
  185. |start content
  186. +-->
  187. <div id="content">
  188. <div title="Portable Document Format" class="pdflink">
  189. <a class="dida" href="zookeeperReconfig.pdf"><img alt="PDF -icon" src="skin/images/pdfdoc.gif" class="skin"><br>
  190. PDF</a>
  191. </div>
  192. <h1>ZooKeeper Dynamic Reconfiguration</h1>
  193. <div id="front-matter">
  194. <div id="minitoc-area">
  195. <ul class="minitoc">
  196. <li>
  197. <a href="#ch_reconfig_intro">Overview</a>
  198. </li>
  199. <li>
  200. <a href="#ch_reconfig_format">Changes to Configuration Format</a>
  201. <ul class="minitoc">
  202. <li>
  203. <a href="#sc_reconfig_clientport">Specifying the client port</a>
  204. </li>
  205. <li>
  206. <a href="#sc_reconfig_standaloneEnabled">The standaloneEnabled flag</a>
  207. </li>
  208. <li>
  209. <a href="#sc_reconfig_reconfigEnabled">The reconfigEnabled flag</a>
  210. </li>
  211. <li>
  212. <a href="#sc_reconfig_file">Dynamic configuration file</a>
  213. </li>
  214. <li>
  215. <a href="#sc_reconfig_backward">Backward compatibility</a>
  216. </li>
  217. </ul>
  218. </li>
  219. <li>
  220. <a href="#ch_reconfig_upgrade">Upgrading to 3.5.0</a>
  221. </li>
  222. <li>
  223. <a href="#ch_reconfig_dyn">Dynamic Reconfiguration of the ZooKeeper Ensemble</a>
  224. <ul class="minitoc">
  225. <li>
  226. <a href="#ch_reconfig_api">API</a>
  227. </li>
  228. <li>
  229. <a href="#sc_reconfig_access_control">Security</a>
  230. </li>
  231. <li>
  232. <a href="#sc_reconfig_retrieving">Retrieving the current dynamic configuration</a>
  233. </li>
  234. <li>
  235. <a href="#sc_reconfig_modifying">Modifying the current dynamic configuration</a>
  236. <ul class="minitoc">
  237. <li>
  238. <a href="#sc_reconfig_general">General</a>
  239. </li>
  240. <li>
  241. <a href="#sc_reconfig_incremental">Incremental mode</a>
  242. </li>
  243. <li>
  244. <a href="#sc_reconfig_nonincremental">Non-incremental mode</a>
  245. </li>
  246. <li>
  247. <a href="#sc_reconfig_conditional">Conditional reconfig</a>
  248. </li>
  249. <li>
  250. <a href="#sc_reconfig_errors">Error conditions</a>
  251. </li>
  252. <li>
  253. <a href="#sc_reconfig_additional">Additional comments</a>
  254. </li>
  255. </ul>
  256. </li>
  257. </ul>
  258. </li>
  259. <li>
  260. <a href="#ch_reconfig_rebalancing">Rebalancing Client Connections</a>
  261. </li>
  262. </ul>
  263. </div>
  264. </div>
  265. <a name="ch_reconfig_intro"></a>
  266. <h2 class="h3">Overview</h2>
  267. <div class="section">
  268. <p>Prior to the 3.5.0 release, the membership and all other configuration
  269. parameters of ZooKeeper were static - loaded during boot and immutable at
  270. runtime. Operators resorted to &ldquo;rolling restarts&rdquo; - a manually intensive
  271. and error-prone method of changing the configuration that has caused data
  272. loss and inconsistency in production.</p>
  273. <p>Starting with 3.5.0, &ldquo;rolling restarts&rdquo; are no longer needed!
  274. ZooKeeper comes with full support for automated configuration changes: the
  275. set of ZooKeeper servers, their roles (participant / observer), all ports,
  276. and even the quorum system can be changed dynamically, without service
  277. interruption and while maintaining data consistency. Reconfigurations are
  278. performed immediately, just like other operations in ZooKeeper. Multiple
  279. changes can be done using a single reconfiguration command. The dynamic
  280. reconfiguration functionality does not limit operation concurrency, does
  281. not require client operations to be stopped during reconfigurations, has a
  282. very simple interface for administrators and no added complexity to other
  283. client operations.</p>
  284. <p>New client-side features allow clients to find out about configuration
  285. changes and to update the connection string (list of servers and their
  286. client ports) stored in their ZooKeeper handle. A probabilistic algorithm
  287. is used to rebalance clients across the new configuration servers while
  288. keeping the extent of client migrations proportional to the change in
  289. ensemble membership.</p>
  290. <p>This document provides the administrator manual for reconfiguration.
  291. For a detailed description of the reconfiguration algorithms, performance
  292. measurements, and more, please see our paper:</p>
  293. <dl>
  294. <dt>
  295. <term>Shraer, A., Reed, B., Malkhi, D., Junqueira, F. Dynamic
  296. Reconfiguration of Primary/Backup Clusters. In <em>USENIX Annual
  297. Technical Conference (ATC) </em>(2012), 425-437</term>
  298. </dt>
  299. <dd>
  300. <p>Links: <a href="https://www.usenix.org/system/files/conference/atc12/atc12-final74.pdf">paper (pdf)</a>, <a href="https://www.usenix.org/sites/default/files/conference/protected-files/shraer_atc12_slides.pdf">slides (pdf)</a>, <a href="https://www.usenix.org/conference/atc12/technical-sessions/presentation/shraer">video</a>, <a href="http://www.slideshare.net/Hadoop_Summit/dynamic-reconfiguration-of-zookeeper">hadoop summit slides</a>
  301. </p>
  302. </dd>
  303. </dl>
  304. <p>
  305. <strong>Note:</strong> Starting with 3.5.3, the dynamic reconfiguration
  306. feature is disabled by default, and has to be explicitly turned on via the
  307. <a href="zookeeperAdmin.html#sc_advancedConfiguration">
  308. reconfigEnabled </a> configuration option.
  309. </p>
  310. </div>
  311. <a name="ch_reconfig_format"></a>
  312. <h2 class="h3">Changes to Configuration Format</h2>
  313. <div class="section">
  314. <a name="sc_reconfig_clientport"></a>
  315. <h3 class="h4">Specifying the client port</h3>
  316. <p>A client port of a server is the port on which the server accepts
  317. client connection requests. Starting with 3.5.0 the
  318. <em>clientPort</em> and <em>clientPortAddress
  319. </em> configuration parameters should no longer be used. Instead,
  320. this information is now part of the server keyword specification, which
  321. becomes as follows:</p>
  322. <p>
  323. <span class="codefrag computeroutput">server.&lt;positive id&gt; = &lt;address1&gt;:&lt;port1&gt;:&lt;port2&gt;[:role];[&lt;client port address&gt;:]&lt;client port&gt;</span>
  324. </p>
  325. <p>The client port specification is to the right of the semicolon. The
  326. client port address is optional, and if not specified it defaults to
  327. "0.0.0.0". As usual, role is also optional, it can be
  328. <em>participant</em> or <em>observer</em>
  329. (<em>participant</em> by default).</p>
  330. <p> Examples of legal server statements: </p>
  331. <ul>
  332. <li>
  333. <p>
  334. <span class="codefrag computeroutput">server.5 = 125.23.63.23:1234:1235;1236</span>
  335. </p>
  336. </li>
  337. <li>
  338. <p>
  339. <span class="codefrag computeroutput">server.5 = 125.23.63.23:1234:1235:participant;1236</span>
  340. </p>
  341. </li>
  342. <li>
  343. <p>
  344. <span class="codefrag computeroutput">server.5 = 125.23.63.23:1234:1235:observer;1236</span>
  345. </p>
  346. </li>
  347. <li>
  348. <p>
  349. <span class="codefrag computeroutput">server.5 = 125.23.63.23:1234:1235;125.23.63.24:1236</span>
  350. </p>
  351. </li>
  352. <li>
  353. <p>
  354. <span class="codefrag computeroutput">server.5 = 125.23.63.23:1234:1235:participant;125.23.63.23:1236</span>
  355. </p>
  356. </li>
  357. </ul>
  358. <a name="sc_reconfig_standaloneEnabled"></a>
  359. <h3 class="h4">The standaloneEnabled flag</h3>
  360. <p>Prior to 3.5.0, one could run ZooKeeper in Standalone mode or in a
  361. Distributed mode. These are separate implementation stacks, and
  362. switching between them during run time is not possible. By default (for
  363. backward compatibility) <em>standaloneEnabled</em> is set to
  364. <em>true</em>. The consequence of using this default is that
  365. if started with a single server the ensemble will not be allowed to
  366. grow, and if started with more than one server it will not be allowed to
  367. shrink to contain fewer than two participants.</p>
  368. <p>Setting the flag to <em>false</em> instructs the system
  369. to run the Distributed software stack even if there is only a single
  370. participant in the ensemble. To achieve this the (static) configuration
  371. file should contain:</p>
  372. <p>
  373. <span class="codefrag computeroutput">standaloneEnabled=false</span>
  374. </p>
  375. <p>With this setting it is possible to start a ZooKeeper ensemble
  376. containing a single participant and to dynamically grow it by adding
  377. more servers. Similarly, it is possible to shrink an ensemble so that
  378. just a single participant remains, by removing servers.</p>
  379. <p>Since running the Distributed mode allows more flexibility, we
  380. recommend setting the flag to <em>false</em>. We expect that
  381. the legacy Standalone mode will be deprecated in the future.</p>
  382. <a name="sc_reconfig_reconfigEnabled"></a>
  383. <h3 class="h4">The reconfigEnabled flag</h3>
  384. <p>Starting with 3.5.0 and prior to 3.5.3, there was no way to disable the
  385. dynamic reconfiguration feature. We would like to offer the option of
  386. disabling the reconfiguration feature because, with reconfiguration enabled,
  387. there is a security concern that a malicious actor can make arbitrary changes
  388. to the configuration of a ZooKeeper ensemble, including adding a compromised
  389. server to the ensemble. We prefer to leave it to the discretion of the user to
  390. decide whether to enable it or not and to make sure that the appropriate security
  391. measures are in place. So in 3.5.3 the <a href="zookeeperAdmin.html#sc_advancedConfiguration">
  392. reconfigEnabled </a> configuration option is introduced
  393. such that the reconfiguration feature can be completely disabled and any attempts
  394. to reconfigure a cluster through reconfig API with or without authentication
  395. will fail by default, unless <strong>reconfigEnabled</strong> is set to
  396. <strong>true</strong>.
  397. </p>
  398. <p>To set the option to true, the configuration file (zoo.cfg) should contain:</p>
  399. <p>
  400. <span class="codefrag computeroutput">reconfigEnabled=true</span>
  401. </p>
  402. <a name="sc_reconfig_file"></a>
  403. <h3 class="h4">Dynamic configuration file</h3>
  404. <p>Starting with 3.5.0 we're distinguishing between dynamic
  405. configuration parameters, which can be changed during runtime, and
  406. static configuration parameters, which are read from a configuration
  407. file when a server boots and don't change during its execution. For now,
  408. the following configuration keywords are considered part of the dynamic
  409. configuration: <em>server</em>, <em>group</em>
  410. and <em>weight</em>.</p>
  411. <p>Dynamic configuration parameters are stored in a separate file on
  412. the server (which we call the dynamic configuration file). This file is
  413. linked from the static config file using the new
  414. <em>dynamicConfigFile</em> keyword.</p>
  415. <p>
  416. <strong>Example 1</strong>
  417. </p>
  418. <div class="note example">
  419. <div class="label">zoo_replicated1.cfg</div>
  420. <div class="content">
  421. <title>zoo_replicated1.cfg</title>
  422. <pre class="code">tickTime=2000
  423. dataDir=/zookeeper/data/zookeeper1
  424. initLimit=5
  425. syncLimit=2
  426. dynamicConfigFile=/zookeeper/conf/zoo_replicated1.cfg.dynamic</pre>
  427. </div>
  428. </div>
  429. <div class="note example">
  430. <div class="label">zoo_replicated1.cfg.dynamic</div>
  431. <div class="content">
  432. <title>zoo_replicated1.cfg.dynamic</title>
  433. <pre class="code">server.1=125.23.63.23:2780:2783:participant;2791
  434. server.2=125.23.63.24:2781:2784:participant;2792
  435. server.3=125.23.63.25:2782:2785:participant;2793</pre>
  436. </div>
  437. </div>
  438. <p>When the ensemble configuration changes, the static configuration
  439. parameters remain the same. The dynamic parameters are pushed by
  440. ZooKeeper and overwrite the dynamic configuration files on all servers.
  441. Thus, the dynamic configuration files on the different servers are
  442. usually identical (they can only differ momentarily when a
  443. reconfiguration is in progress, or if a new configuration hasn't
  444. propagated yet to some of the servers). Once created, the dynamic
  445. configuration file should not be manually altered. Changes are only made
  446. through the new reconfiguration commands outlined below. Note that
  447. changing the config of an offline cluster could result in an
  448. inconsistency with respect to configuration information stored in the
  449. ZooKeeper log (and the special configuration znode, populated from the
  450. log) and is therefore highly discouraged.</p>
  451. <p>
  452. <strong>Example 2</strong>
  453. </p>
  454. <p>Users may prefer to initially specify a single configuration file.
  455. The following is thus also legal:</p>
  456. <div class="note example">
  457. <div class="label">zoo_replicated1.cfg</div>
  458. <div class="content">
  459. <title>zoo_replicated1.cfg</title>
  460. <pre class="code">tickTime=2000
  461. dataDir=/zookeeper/data/zookeeper1
  462. initLimit=5
  463. syncLimit=2
  464. clientPort=<strong>2791</strong> // note that this line is now redundant and therefore not recommended
  465. server.1=125.23.63.23:2780:2783:participant;<strong>2791</strong>
  466. server.2=125.23.63.24:2781:2784:participant;2792
  467. server.3=125.23.63.25:2782:2785:participant;2793</pre>
  468. </div>
  469. </div>
  470. <p>The configuration files on each server will be automatically split
  471. into dynamic and static files, if they are not already in this format.
  472. So the configuration file above will be automatically transformed into
  473. the two files in Example 1. Note that the clientPort and
  474. clientPortAddress lines (if specified) will be automatically removed
  475. during this process, if they are redundant (as in the example above).
  476. The original static configuration file is backed up (in a .bak
  477. file).</p>
  478. <a name="sc_reconfig_backward"></a>
  479. <h3 class="h4">Backward compatibility</h3>
  480. <p>We still support the old configuration format. For example, the
  481. following configuration file is acceptable (but not recommended):</p>
  482. <div class="note example">
  483. <div class="label">zoo_replicated1.cfg</div>
  484. <div class="content">
  485. <title>zoo_replicated1.cfg</title>
  486. <pre class="code">tickTime=2000
  487. dataDir=/zookeeper/data/zookeeper1
  488. initLimit=5
  489. syncLimit=2
  490. clientPort=2791
  491. server.1=125.23.63.23:2780:2783:participant
  492. server.2=125.23.63.24:2781:2784:participant
  493. server.3=125.23.63.25:2782:2785:participant</pre>
  494. </div>
  495. </div>
  496. <p>During boot, a dynamic configuration file is created and contains
  497. the dynamic part of the configuration as explained earlier. In this
  498. case, however, the line "clientPort=2791" will remain in the static
  499. configuration file of server 1 since it is not redundant -- it was not
  500. specified as part of the "server.1=..." using the format explained in
  501. the section <a href="#ch_reconfig_format">Changes to Configuration Format</a>. If a reconfiguration
  502. is invoked that sets the client port of server 1, we remove
  503. "clientPort=2791" from the static configuration file (the dynamic file
  504. now contains this information as part of the specification of server
  505. 1).</p>
  506. </div>
  507. <a name="ch_reconfig_upgrade"></a>
  508. <h2 class="h3">Upgrading to 3.5.0</h2>
  509. <div class="section">
  510. <p>Upgrading a running ZooKeeper ensemble to 3.5.0 should be done only
  511. after upgrading your ensemble to the 3.4.6 release. Note that this is only
  512. necessary for rolling upgrades (if you're fine with shutting down the
  513. system completely, you don't have to go through 3.4.6). If you attempt a
  514. rolling upgrade without going through 3.4.6 (for example from 3.4.5), you
  515. may get the following error:</p>
  516. <pre class="code">2013-01-30 11:32:10,663 [myid:2] - INFO [localhost/127.0.0.1:2784:QuorumCnxManager$Listener@498] - Received connection request /127.0.0.1:60876
  517. 2013-01-30 11:32:10,663 [myid:2] - WARN [localhost/127.0.0.1:2784:QuorumCnxManager@349] - Invalid server id: -65536</pre>
  518. <p>During a rolling upgrade, each server is taken down in turn and
  519. rebooted with the new 3.5.0 binaries. Before starting the server with
  520. 3.5.0 binaries, we highly recommend updating the configuration file so
  521. that all server statements "server.x=..." contain client ports (see the
  522. section <a href="#sc_reconfig_clientport">Specifying the client port</a>). As explained earlier
  523. you may leave the configuration in a single file, as well as leave the
  524. clientPort/clientPortAddress statements (although if you specify client
  525. ports in the new format, these statements are now redundant).</p>
  526. </div>
  527. <a name="ch_reconfig_dyn"></a>
  528. <h2 class="h3">Dynamic Reconfiguration of the ZooKeeper Ensemble</h2>
  529. <div class="section">
  530. <p>The ZooKeeper Java and C APIs were extended with getConfig and reconfig
  531. commands that facilitate reconfiguration. Both commands have a synchronous
  532. (blocking) variant and an asynchronous one. We demonstrate these commands
  533. here using the Java CLI, but note that you can similarly use the C CLI or
  534. invoke the commands directly from a program just like any other ZooKeeper
  535. command.</p>
  536. <a name="ch_reconfig_api"></a>
  537. <h3 class="h4">API</h3>
  538. <p>There are two sets of APIs for both the Java and C clients.
  539. </p>
  540. <dl>
  541. <dt>
  542. <term>
  543. <strong>Reconfiguration API</strong>
  544. </term>
  545. </dt>
  546. <dd>
  547. <p>Reconfiguration API is used to reconfigure the ZooKeeper cluster.
  548. Starting with 3.5.3, reconfiguration Java APIs are moved into ZooKeeperAdmin class
  549. from ZooKeeper class, and use of this API requires ACL setup and user
  550. authentication (see <a href="#sc_reconfig_access_control">Security</a> for more information).
  551. </p>
  552. </dd>
  553. <dt>
  554. <term>
  555. <strong>Get Configuration API</strong>
  556. </term>
  557. </dt>
  558. <dd>
  559. <p>Get configuration APIs are used to retrieve ZooKeeper cluster configuration information
  560. stored in /zookeeper/config znode. Use of this API does not require specific setup or authentication,
  561. because /zookeeper/config is readable to any users.</p>
  562. </dd>
  563. </dl>
  564. <a name="sc_reconfig_access_control"></a>
  565. <h3 class="h4">Security</h3>
  566. <p>Prior to <strong>3.5.3</strong>, there was no enforced security mechanism
  567. over reconfig, so any ZooKeeper client that could connect to a ZooKeeper server ensemble
  568. had the ability to change the state of the ZooKeeper cluster via reconfig.
  569. It was thus possible for a malicious client to tamper with the ensemble membership,
  570. e.g., add a compromised server, or remove legitimate servers.
  571. Cases like these could be security vulnerabilities on a case by case basis.
  572. </p>
  573. <p>To address this security concern, we introduced access control over reconfig
  574. starting from <strong>3.5.3</strong> such that only a specific set of users
  575. can use reconfig commands or APIs, and these users need to be configured explicitly. In addition,
  576. the setup of ZooKeeper cluster must enable authentication so ZooKeeper clients can be authenticated.
  577. </p>
  578. <p>
  579. We also provide an escape hatch for users who operate and interact with a ZooKeeper ensemble in a secured
  580. environment (e.g., behind a company firewall). For those users who want to use the reconfiguration feature but
  581. don't want the overhead of configuring an explicit list of authorized users for reconfig access checks,
  582. they can set <a href="zookeeperAdmin.html#sc_authOptions">"skipACL"</a> to "yes" which will
  583. skip ACL check and allow any user to reconfigure cluster.
  584. </p>
  585. <p>
  586. Overall, ZooKeeper provides flexible configuration options for the reconfiguration feature
  587. that allow users to choose based on their security requirements.
  588. We leave it to the discretion of the user to ensure that appropriate security measures are in place.
  589. </p>
  590. <dl>
  591. <dt>
  592. <term>
  593. <strong>Access Control</strong>
  594. </term>
  595. </dt>
  596. <dd>
  597. <p>The dynamic configuration is stored in a special znode
  598. ZooDefs.CONFIG_NODE = /zookeeper/config. This node by default is read only
  599. for all users, except the super user and users that are explicitly configured for write
  600. access.
  601. </p>
  602. <p>Clients that need to use reconfig commands or reconfig API should be configured as users
  603. that have write access to CONFIG_NODE. By default, only the super user has full control including
  604. write access to CONFIG_NODE. Additional users can be granted write access through superuser
  605. by setting an ACL that has write permission associated with specified user.
  606. </p>
  607. <p>A few examples of how to set up ACLs and use the reconfiguration API with authentication can be found in
  608. ReconfigExceptionTest.java and TestReconfigServer.cc.</p>
  609. </dd>
  610. <dt>
  611. <term>
  612. <strong>Authentication</strong>
  613. </term>
  614. </dt>
  615. <dd>
  616. <p>Authentication of users is orthogonal to the access control and is delegated to
  617. the existing authentication mechanisms supported by ZooKeeper's pluggable authentication schemes.
  618. See <a href="https://cwiki.apache.org/confluence/display/ZOOKEEPER/Zookeeper+and+SASL">ZooKeeper and SASL</a> for more details on this topic.
  619. </p>
  620. </dd>
  621. <dt>
  622. <term>
  623. <strong>Disable ACL check</strong>
  624. </term>
  625. </dt>
  626. <dd>
  627. <p>
  628. ZooKeeper supports the <a href="zookeeperAdmin.html#sc_authOptions">"skipACL"</a> option such that the ACL
  629. check will be completely skipped if skipACL is set to "yes". In such cases any unauthenticated
  630. user can use the reconfig API.
  631. </p>
  632. </dd>
  633. </dl>
  634. <a name="sc_reconfig_retrieving"></a>
  635. <h3 class="h4">Retrieving the current dynamic configuration</h3>
  636. <p>The dynamic configuration is stored in a special znode
  637. ZooDefs.CONFIG_NODE = /zookeeper/config. The new
  638. <span class="codefrag command">config</span> CLI command reads this znode (currently it is
  639. simply a wrapper to <span class="codefrag command">get /zookeeper/config</span>). As with
  640. normal reads, to retrieve the latest committed value you should do a
  641. <span class="codefrag command">sync</span> first.</p>
  642. <pre class="code">[zk: 127.0.0.1:2791(CONNECTED) 3] config
  643. server.1=localhost:2780:2783:participant;localhost:2791
  644. server.2=localhost:2781:2784:participant;localhost:2792
  645. server.3=localhost:2782:2785:participant;localhost:2793
  646. <strong>version=400000003</strong>
  647. </pre>
  648. <p>Notice the last line of the output. This is the configuration
  649. version. The version is equal to the zxid of the reconfiguration command
  650. which created this configuration. The version of the first established
  651. configuration is equal to the zxid of the NEWLEADER message sent by the
  652. first successfully established leader. When a configuration is written
  653. to a dynamic configuration file, the version automatically becomes part
  654. of the filename and the static configuration file is updated with the
  655. path to the new dynamic configuration file. Configuration files
  656. corresponding to earlier versions are retained for backup
  657. purposes.</p>
  658. <p>During boot time the version (if it exists) is extracted from the
  659. filename. The version should never be altered manually by users or the
  660. system administrator. It is used by the system to know which
  661. configuration is most up-to-date. Manipulating it manually can result in
  662. data loss and inconsistency.</p>
  663. <p>Just like a <span class="codefrag command">get</span> command, the
  664. <span class="codefrag command">config</span> CLI command accepts the <span class="codefrag option">-w</span>
  665. flag for setting a watch on the znode, and <span class="codefrag option">-s</span> flag for
  666. displaying the Stats of the znode. It additionally accepts a new flag
  667. <span class="codefrag option">-c</span> which outputs only the version and the client
  668. connection string corresponding to the current configuration. For
  669. example, for the configuration above we would get:</p>
  670. <pre class="code">[zk: 127.0.0.1:2791(CONNECTED) 17] config -c
  671. 400000003 localhost:2791,localhost:2793,localhost:2792</pre>
  672. <p>Note that when using the API directly, this command is called
  673. <span class="codefrag command">getConfig</span>.</p>
  674. <p>Like any read command, it returns the configuration known to the
  675. follower to which your client is connected, which may be slightly
  676. out-of-date. One can use the <span class="codefrag command">sync</span> command for
  677. stronger guarantees. For example using the Java API:</p>
  678. <pre class="code">zk.sync(ZooDefs.CONFIG_NODE, void_callback, context);
  679. zk.getConfig(watcher, callback, context);</pre>
  680. <p>Note: in 3.5.0 it doesn't really matter which path is passed to the
  681. <span class="codefrag command">sync()</span> command as all the server's state is brought
  682. up to date with the leader (so one could use a different path instead of
  683. ZooDefs.CONFIG_NODE). However, this may change in the future.</p>
  684. <a name="sc_reconfig_modifying"></a>
  685. <h3 class="h4">Modifying the current dynamic configuration</h3>
  686. <p>Modifying the configuration is done through the
  687. <span class="codefrag command">reconfig</span> command. There are two modes of
  688. reconfiguration: incremental and non-incremental (bulk). The
  689. non-incremental simply specifies the new dynamic configuration of the
  690. system. The incremental specifies changes to the current configuration.
  691. The <span class="codefrag command">reconfig</span> command returns the new
  692. configuration.</p>
  693. <p>A few examples are in: <span class="codefrag filename">ReconfigTest.java</span>,
  694. <span class="codefrag filename">ReconfigRecoveryTest.java</span> and
  695. <span class="codefrag filename">TestReconfigServer.cc</span>.</p>
  696. <a name="sc_reconfig_general"></a>
  697. <h4>General</h4>
  698. <p>
  699. <strong>Removing servers:</strong> Any server can
  700. be removed, including the leader (although removing the leader will
  701. result in a short unavailability, see Figures 6 and 8 in the <a href="https://www.usenix.org/conference/usenixfederatedconferencesweek/dynamic-recon%EF%AC%81guration-primarybackup-clusters">paper</a>). The server will not be shut-down automatically.
  702. Instead, it becomes a "non-voting follower". This is somewhat similar
  703. to an observer in that its votes don't count towards the Quorum of
  704. votes necessary to commit operations. However, unlike a non-voting
  705. follower, an observer doesn't actually see any operation proposals and
  706. does not ACK them. Thus a non-voting follower has a more significant
  707. negative effect on system throughput compared to an observer.
  708. Non-voting follower mode should only be used as a temporary mode,
  709. before shutting the server down, or adding it as a follower or as an
  710. observer to the ensemble. We do not shut the server down automatically
  711. for two main reasons. The first reason is that we do not want all the
  712. clients connected to this server to be immediately disconnected,
  713. causing a flood of connection requests to other servers. Instead, it
  714. is better if each client decides when to migrate independently. The
  715. second reason is that removing a server may sometimes (rarely) be
  716. necessary in order to change it from "observer" to "participant" (this
  717. is explained in the section <a href="#sc_reconfig_additional">Additional comments</a>).</p>
  718. <p>Note that the new configuration should have some minimal number of
  719. participants in order to be considered legal. If the proposed change
  720. would leave the cluster with fewer than 2 participants and standalone
  721. mode is enabled (standaloneEnabled=true, see the section <a href="#sc_reconfig_standaloneEnabled">The standaloneEnabled flag</a>), the reconfig will not be
  722. processed (BadArgumentsException). If standalone mode is disabled
  723. (standaloneEnabled=false) then it's legal to remain with 1 or more
  724. participants.</p>
  725. <p>
  726. <strong>Adding servers:</strong> Before a
  727. reconfiguration is invoked, the administrator must make sure that a
  728. quorum (majority) of participants from the new configuration are
  729. already connected and synced with the current leader. To achieve this
  730. we need to connect a new joining server to the leader before it is
  731. officially part of the ensemble. This is done by starting the joining
  732. server using an initial list of servers which is technically not a
  733. legal configuration of the system but (a) contains the joiner, and (b)
  734. gives sufficient information to the joiner in order for it to find and
  735. connect to the current leader. We list a few different options of
  736. doing this safely.</p>
  737. <ol>
  738. <li>
  739. <p>Initial configuration of joiners is comprised of servers in
  740. the last committed configuration and one or more joiners, where
  741. <strong>joiners are listed as observers.</strong>
  742. For example, if servers D and E are added at the same time to (A,
  743. B, C) and server C is being removed, the initial configuration of
  744. D could be (A, B, C, D) or (A, B, C, D, E), where D and E are
  745. listed as observers. Similarly, the configuration of E could be
  746. (A, B, C, E) or (A, B, C, D, E), where D and E are listed as
  747. observers. <strong>Note that listing the joiners as
  748. observers will not actually make them observers - it will only
  749. prevent them from accidentally forming a quorum with other
  750. joiners.</strong> Instead, they will contact the servers in the
  751. current configuration and adopt the last committed configuration
  752. (A, B, C), where the joiners are absent. Configuration files of
  753. joiners are backed up and replaced automatically as this happens.
  754. After connecting to the current leader, joiners become non-voting
  755. followers until the system is reconfigured and they are added to
  756. the ensemble (as participant or observer, as appropriate).</p>
  757. </li>
  758. <li>
  759. <p>Initial configuration of each joiner is comprised of servers
  760. in the last committed configuration + <strong>the
  761. joiner itself, listed as a participant.</strong> For example, to
  762. add a new server D to a configuration consisting of servers (A, B,
  763. C), the administrator can start D using an initial configuration
  764. file consisting of servers (A, B, C, D). If both D and E are added
  765. at the same time to (A, B, C), the initial configuration of D
  766. could be (A, B, C, D) and the configuration of E could be (A, B,
  767. C, E). Similarly, if D is added and C is removed at the same time,
  768. the initial configuration of D could be (A, B, C, D). Never list
  769. more than one joiner as participant in the initial configuration
  770. (see warning below).</p>
  771. </li>
  772. <li>
  773. <p>Whether listing the joiner as an observer or as participant,
  774. it is also fine not to list all the current configuration servers,
  775. as long as the current leader is in the list. For example, when
  776. adding D we could start D with a configuration file consisting of
  777. just (A, D) if A is the current leader. However, this is more
  778. fragile since if A fails before D officially joins the ensemble, D
  779. doesn&rsquo;t know anyone else and therefore the administrator will have
  780. to intervene and restart D with another server list.</p>
  781. </li>
  782. </ol>
  783. <div class="note">
  784. <div class="label">Warning</div>
  785. <div class="content">
  786. <title>Warning</title>
  787. <p>Never specify more than one joining server in the same initial
  788. configuration as participants. Currently, the joining servers don&rsquo;t
  789. know that they are joining an existing ensemble; if multiple joiners
  790. are listed as participants they may form an independent quorum
  791. creating a split-brain situation such as processing operations
  792. independently from your main ensemble. It is OK to list multiple
  793. joiners as observers in an initial config.</p>
  794. </div>
  795. </div>
  796. <p>Finally, note that once connected to the leader, a joiner adopts
  797. the last committed configuration, in which it is absent (the initial
  798. config of the joiner is backed up before being rewritten). If the
  799. joiner restarts in this state, it will not be able to boot since it is
  800. absent from its configuration file. In order to start it you&rsquo;ll once
  801. again have to specify an initial configuration.</p>
  802. <p>
  803. <strong>Modifying server parameters:</strong> One
  804. can modify any of the ports of a server, or its role
  805. (participant/observer) by adding it to the ensemble with different
  806. parameters. This works in both the incremental and the bulk
  807. reconfiguration modes. It is not necessary to remove the server and
  808. then add it back; just specify the new parameters as if the server is
  809. not yet in the system. The server will detect the configuration change
  810. and perform the necessary adjustments. See an example in the section
  811. <a href="#sc_reconfig_incremental">Incremental mode</a> and an exception to this
  812. rule in the section <a href="#sc_reconfig_additional">Additional comments</a>.</p>
  813. <p>It is also possible to change the Quorum System used by the
  814. ensemble (for example, change the Majority Quorum System to a
  815. Hierarchical Quorum System on the fly). This, however, is only allowed
  816. using the bulk (non-incremental) reconfiguration mode. In general,
  817. incremental reconfiguration only works with the Majority Quorum
  818. System. Bulk reconfiguration works with both Hierarchical and Majority
  819. Quorum Systems.</p>
  820. <p>
  821. <strong>Performance Impact:</strong> There is
  822. practically no performance impact when removing a follower, since it
  823. is not being automatically shut down (the effect of removal is that
  824. the server's votes are no longer being counted). When adding a server,
  825. there is no leader change and no noticeable performance disruption.
  826. For details and graphs please see Figures 6, 7 and 8 in the <a href="https://www.usenix.org/conference/usenixfederatedconferencesweek/dynamic-recon%EF%AC%81guration-primarybackup-clusters">paper</a>.</p>
  827. <p>The most significant disruption will happen when a leader change
  828. is caused, in one of the following cases:</p>
  829. <ol>
  830. <li>
  831. <p>Leader is removed from the ensemble.</p>
  832. </li>
  833. <li>
  834. <p>Leader's role is changed from participant to observer.</p>
  835. </li>
  836. <li>
  837. <p>The port used by the leader to send transactions to others
  838. (quorum port) is modified.</p>
  839. </li>
  840. </ol>
  841. <p>In these cases we perform a leader hand-off where the old leader
  842. nominates a new leader. The resulting unavailability is usually
  843. shorter than when a leader crashes since detecting leader failure is
  844. unnecessary and electing a new leader can usually be avoided during a
  845. hand-off (see Figures 6 and 8 in the <a href="https://www.usenix.org/conference/usenixfederatedconferencesweek/dynamic-recon%EF%AC%81guration-primarybackup-clusters">paper</a>).</p>
  846. <p>When the client port of a server is modified, it does not drop
  847. existing client connections. New connections to the server will have
  848. to use the new client port.</p>
  849. <p>
  850. <strong>Progress guarantees:</strong> Up to the
  851. invocation of the reconfig operation, a quorum of the old
  852. configuration is required to be available and connected for ZooKeeper
  853. to be able to make progress. Once reconfig is invoked, a quorum of
  854. both the old and of the new configurations must be available. The
  855. final transition happens once (a) the new configuration is activated,
  856. and (b) all operations scheduled before the new configuration is
  857. activated by the leader are committed. Once (a) and (b) happen, only a
  858. quorum of the new configuration is required. Note, however, that
  859. neither (a) nor (b) are visible to a client. Specifically, when a
  860. reconfiguration operation commits, it only means that an activation
  861. message was sent out by the leader. It does not necessarily mean that
  862. a quorum of the new configuration got this message (which is required
  863. in order to activate it) or that (b) has happened. If one wants to
  864. make sure that both (a) and (b) have already occurred (for example, in
  865. order to know that it is safe to shut down old servers that were
  866. removed), one can simply invoke an update
  867. (<span class="codefrag command">set-data</span>, or some other quorum operation, but not
  868. a <span class="codefrag command">sync</span>) and wait for it to commit. An alternative
  869. way to achieve this would have been to introduce another round to the
  870. reconfiguration protocol (which, for simplicity and compatibility with
  871. Zab, we decided to avoid).</p>
  872. <a name="sc_reconfig_incremental"></a>
  873. <h4>Incremental mode</h4>
  874. <p>The incremental mode allows adding and removing servers to the
  875. current configuration. Multiple changes are allowed. For
  876. example:</p>
  877. <p>
  878. <span class="codefrag userinput">&gt; reconfig -remove 3 -add
  879. server.5=125.23.63.23:1234:1235;1236</span>
  880. </p>
  881. <p>Both the add and the remove options get a list of comma separated
  882. arguments (no spaces):</p>
  883. <p>
  884. <span class="codefrag userinput">&gt; reconfig -remove 3,4 -add
  885. server.5=localhost:2111:2112;2113,6=localhost:2114:2115:observer;2116</span>
  886. </p>
  887. <p>The format of the server statement is exactly the same as
  888. described in the section <a href="#sc_reconfig_clientport">Specifying the client port</a> and
  889. includes the client port. Notice that here instead of "server.5=" you
  890. can just say "5=". In the example above, if server 6 is already in the
  891. system, but has different ports or is not an observer, it is updated
  892. and once the configuration commits becomes an observer and starts
  893. using these new ports. This is an easy way to turn participants into
  894. observers and vice versa or change any of their ports, without
  895. rebooting the server.</p>
  896. <p>ZooKeeper supports two types of Quorum Systems &ndash; the simple
  897. Majority system (where the leader commits operations after receiving
  898. ACKs from a majority of voters) and a more complex Hierarchical
  899. system, where votes of different servers have different weights and
  900. servers are divided into voting groups. Currently, incremental
  901. reconfiguration is allowed only if the last proposed configuration
  902. known to the leader uses a Majority Quorum System
  903. (BadArgumentsException is thrown otherwise).</p>
  904. <p>Incremental mode - examples using the Java API:</p>
  905. <pre class="code">List&lt;String&gt; leavingServers = new ArrayList&lt;String&gt;();
  906. leavingServers.add("1");
  907. leavingServers.add("2");
  908. byte[] config = zk.reconfig(null, leavingServers, null, -1, new Stat());</pre>
  909. <pre class="code">List&lt;String&gt; leavingServers = new ArrayList&lt;String&gt;();
  910. List&lt;String&gt; joiningServers = new ArrayList&lt;String&gt;();
  911. leavingServers.add("1");
  912. joiningServers.add("server.4=localhost:1234:1235;1236");
  913. byte[] config = zk.reconfig(joiningServers, leavingServers, null, -1, new Stat());
  914. String configStr = new String(config);
  915. System.out.println(configStr);</pre>
  916. <p>There is also an asynchronous API, and an API accepting comma
  917. separated Strings instead of List&lt;String&gt;. See
  918. src/java/main/org/apache/zookeeper/ZooKeeper.java.</p>
  919. <a name="sc_reconfig_nonincremental"></a>
  920. <h4>Non-incremental mode</h4>
  921. <p>The second mode of reconfiguration is non-incremental, whereby a
  922. client gives a complete specification of the new dynamic system
  923. configuration. The new configuration can either be given in place or
  924. read from a file:</p>
  925. <p>
  926. <span class="codefrag userinput">&gt; reconfig -file newconfig.cfg
  927. </span>//newconfig.cfg is a dynamic config file, see <a href="#sc_reconfig_file">Dynamic configuration file</a>
  928. </p>
  929. <p>
  930. <span class="codefrag userinput">&gt; reconfig -members
  931. server.1=125.23.63.23:2780:2783:participant;2791,server.2=125.23.63.24:2781:2784:participant;2792,server.3=125.23.63.25:2782:2785:participant;2793</span>
  932. </p>
  933. <p>The new configuration may use a different Quorum System. For
  934. example, you may specify a Hierarchical Quorum System even if the
  935. current ensemble uses a Majority Quorum System.</p>
  936. <p>Bulk mode - example using the Java API:</p>
  937. <pre class="code">ArrayList&lt;String&gt; newMembers = new ArrayList&lt;String&gt;();
  938. newMembers.add("server.1=1111:1234:1235;1236");
  939. newMembers.add("server.2=1112:1237:1238;1239");
  940. newMembers.add("server.3=1114:1240:1241:observer;1242");
  941. byte[] config = zk.reconfig(null, null, newMembers, -1, new Stat());
  942. String configStr = new String(config);
  943. System.out.println(configStr);</pre>
  944. <p>There is also an asynchronous API, and an API accepting comma
  945. separated String containing the new members instead of
  946. List&lt;String&gt;. See
  947. src/java/main/org/apache/zookeeper/ZooKeeper.java.</p>
  948. <a name="sc_reconfig_conditional"></a>
  949. <h4>Conditional reconfig</h4>
  950. <p>Sometimes (especially in non-incremental mode) a new proposed
  951. configuration depends on what the client "believes" to be the current
  952. configuration, and should be applied only to that configuration.
  953. Specifically, the <span class="codefrag command">reconfig</span> succeeds only if the
  954. last configuration at the leader has the specified version.</p>
  955. <p>
  956. <span class="codefrag userinput">&gt; reconfig -file &lt;filename&gt; -v &lt;version&gt;</span>
  957. </p>
  958. <p>In the previously listed Java examples, instead of -1 one could
  959. specify a configuration version to condition the
  960. reconfiguration.</p>
  961. <a name="sc_reconfig_errors"></a>
  962. <h4>Error conditions</h4>
  963. <p>In addition to normal ZooKeeper error conditions, a
  964. reconfiguration may fail for the following reasons:</p>
  965. <ol>
  966. <li>
  967. <p>another reconfig is currently in progress
  968. (ReconfigInProgress)</p>
  969. </li>
  970. <li>
  971. <p>the proposed change would leave the cluster with fewer than 2
  972. participants while standalone mode is enabled (if standalone
  973. mode is disabled, it is legal to remain with 1 or more
  974. participants) (BadArgumentsException)</p>
  975. </li>
  976. <li>
  977. <p>no quorum of the new configuration was connected and
  978. up-to-date with the leader when the reconfiguration processing
  979. began (NewConfigNoQuorum)</p>
  980. </li>
  981. <li>
  982. <p>
  983. <span class="codefrag userinput">-v x</span> was specified, but the version
  984. <span class="codefrag userinput">y</span> of the latest configuration is not
  985. <span class="codefrag userinput">x</span> (BadVersionException)</p>
  986. </li>
  987. <li>
  988. <p>an incremental reconfiguration was requested but the last
  989. configuration at the leader uses a Quorum System which is
  990. different from the Majority system (BadArgumentsException)</p>
  991. </li>
  992. <li>
  993. <p>syntax error (BadArgumentsException)</p>
  994. </li>
  995. <li>
  996. <p>I/O exception when reading the configuration from a file
  997. (BadArgumentsException)</p>
  998. </li>
  999. </ol>
  1000. <p>Most of these are illustrated by test-cases in
  1001. <span class="codefrag filename">ReconfigFailureCases.java</span>.</p>
  1002. <a name="sc_reconfig_additional"></a>
  1003. <h4>Additional comments</h4>
  1004. <p>
  1005. <strong>Liveness:</strong> To better understand
  1006. the difference between incremental and non-incremental
  1007. reconfiguration, suppose that client C1 adds server D to the system
  1008. while a different client C2 adds server E. With the non-incremental
  1009. mode, each client would first invoke <span class="codefrag command">config</span> to find
  1010. out the current configuration, and then locally create a new list of
  1011. servers by adding its own suggested server. The new configuration can
  1012. then be submitted using the non-incremental
  1013. <span class="codefrag command">reconfig</span> command. After both reconfigurations
  1014. complete, only one of E or D will be added (not both), depending on
  1015. which client's request arrives second to the leader, overwriting the
  1016. previous configuration. The other client can repeat the process until
  1017. its change takes effect. This method guarantees system-wide progress
  1018. (i.e., for one of the clients), but does not ensure that every client
  1019. succeeds. To have more control C2 may request to only execute the
  1020. reconfiguration in case the version of the current configuration
  1021. hasn't changed, as explained in the section <a href="#sc_reconfig_conditional">Conditional reconfig</a>. In this way it may avoid blindly
  1022. overwriting the configuration of C1 if C1's configuration reached the
  1023. leader first.</p>
  1024. <p>With incremental reconfiguration, both changes will take effect as
  1025. they are simply applied by the leader one after the other to the
  1026. current configuration, whatever that is (assuming that the second
  1027. reconfig request reaches the leader after it sends a commit message
  1028. for the first reconfig request -- currently the leader will refuse to
  1029. propose a reconfiguration if another one is already pending). Since
  1030. both clients are guaranteed to make progress, this method guarantees
  1031. stronger liveness. In practice, multiple concurrent reconfigurations
  1032. are probably rare. Non-incremental reconfiguration is currently the
  1033. only way to dynamically change the Quorum System. Incremental
  1034. reconfiguration is currently only allowed with the Majority Quorum
  1035. System.</p>
  1036. <p>
  1037. <strong>Changing an observer into a
  1038. follower:</strong> Clearly, changing a server that participates in
  1039. voting into an observer may fail if error (2) occurs, i.e., if fewer
  1040. than the minimal allowed number of participants would remain. However,
  1041. converting an observer into a participant may sometimes fail for a
  1042. more subtle reason: Suppose, for example, that the current
  1043. configuration is (A, B, C, D), where A is the leader, B and C are
  1044. followers and D is an observer. In addition, suppose that B has
  1045. crashed. If a reconfiguration is submitted where D is said to become a
  1046. follower, it will fail with error (3) since, in this case, a
  1047. majority of voters in the new configuration (any 3 voters) must be
  1048. connected and up-to-date with the leader. An observer cannot
  1049. acknowledge the history prefix sent during reconfiguration, and
  1050. therefore it does not count towards these 3 required servers and the
  1051. reconfiguration will be aborted. In case this happens, a client can
  1052. achieve the same task by two reconfig commands: first invoke a
  1053. reconfig to remove D from the configuration and then invoke a second
  1054. command to add it back as a participant (follower). During the
  1055. intermediate state D is a non-voting follower and can ACK the state
  1056. transfer performed during the second reconfig command.</p>
  1057. </div>
  1058. <a name="ch_reconfig_rebalancing"></a>
  1059. <h2 class="h3">Rebalancing Client Connections</h2>
  1060. <div class="section">
  1061. <p>When a ZooKeeper cluster is started, if each client is given the same
  1062. connection string (list of servers), the client will randomly choose a
  1063. server in the list to connect to, which makes the expected number of
  1064. client connections per server the same for each of the servers. We
  1065. implemented a method that preserves this property when the set of servers
  1066. changes through reconfiguration. See Sections 4 and 5.1 in the <a href="https://www.usenix.org/conference/usenixfederatedconferencesweek/dynamic-recon%EF%AC%81guration-primarybackup-clusters">paper</a>.</p>
  1067. <p>In order for the method to work, all clients must subscribe to
  1068. configuration changes (by setting a watch on /zookeeper/config either
  1069. directly or through the <span class="codefrag command">getConfig</span> API command). When
  1070. the watch is triggered, the client should read the new configuration by
  1071. invoking <span class="codefrag command">sync</span> and <span class="codefrag command">getConfig</span> and if
  1072. the configuration is indeed new invoke the
  1073. <span class="codefrag command">updateServerList</span> API command. To avoid mass client
  1074. migration at the same time, it is better to have each client sleep a
  1075. random short period of time before invoking
  1076. <span class="codefrag command">updateServerList</span>.</p>
  1077. <p>A few examples can be found in:
  1078. <span class="codefrag filename">StaticHostProviderTest.java</span> and
  1079. <span class="codefrag filename">TestReconfig.cc</span>.
  1080. </p>
  1081. <p>Example (this is not a recipe, but a simplified example just to
  1082. explain the general idea):</p>
  1083. <pre class="code">
  1084. public void process(WatchedEvent event) {
  1085. synchronized (this) {
  1086. if (event.getType() == EventType.None) {
  1087. connected = (event.getState() == KeeperState.SyncConnected);
  1088. notifyAll();
  1089. } else if (event.getPath()!=null &amp;&amp; event.getPath().equals(ZooDefs.CONFIG_NODE)) {
  1090. // in prod code never block the event thread!
  1091. zk.sync(ZooDefs.CONFIG_NODE, this, null);
  1092. zk.getConfig(this, this, null);
  1093. }
  1094. }
  1095. }
  1096. public void processResult(int rc, String path, Object ctx, byte[] data, Stat stat) {
  1097. if (path!=null &amp;&amp; path.equals(ZooDefs.CONFIG_NODE)) {
  1098. String config[] = ConfigUtils.getClientConfigStr(new String(data)).split(" "); // similar to config -c
  1099. long version = Long.parseLong(config[0], 16);
  1100. if (this.configVersion == null){
  1101. this.configVersion = version;
  1102. } else if (version &gt; this.configVersion) {
  1103. hostList = config[1];
  1104. try {
  1105. // the following command is not blocking but may cause the client to close the socket and
  1106. // migrate to a different server. In practice it's better to wait a short period of time, chosen
  1107. // randomly, so that different clients migrate at different times
  1108. zk.updateServerList(hostList);
  1109. } catch (IOException e) {
  1110. System.err.println("Error updating server list");
  1111. e.printStackTrace();
  1112. }
  1113. this.configVersion = version;
  1114. } } }</pre>
  1115. </div>
  1116. <p align="right">
  1117. <font size="-2"></font>
  1118. </p>
  1119. </div>
  1120. <!--+
  1121. |end content
  1122. +-->
  1123. <div class="clearboth">&nbsp;</div>
  1124. </div>
  1125. <div id="footer">
  1126. <!--+
  1127. |start bottomstrip
  1128. +-->
  1129. <div class="lastmodified">
  1130. <script type="text/javascript"><!--
  1131. document.write("Last Published: " + document.lastModified);
  1132. // --></script>
  1133. </div>
  1134. <div class="copyright">
  1135. Copyright &copy;
  1136. 2008-2013 <a href="http://www.apache.org/licenses/">The Apache Software Foundation.</a>
  1137. </div>
  1138. <!--+
  1139. |end bottomstrip
  1140. +-->
  1141. </div>
  1142. </body>
  1143. </html>