recipes.html 32 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021
  1. <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
  2. <html>
  3. <head>
  4. <META http-equiv="Content-Type" content="text/html; charset=UTF-8">
  5. <meta content="Apache Forrest" name="Generator">
  6. <meta name="Forrest-version" content="0.9">
  7. <meta name="Forrest-skin-name" content="pelt">
  8. <title>ZooKeeper Recipes and Solutions</title>
  9. <link type="text/css" href="skin/basic.css" rel="stylesheet">
  10. <link media="screen" type="text/css" href="skin/screen.css" rel="stylesheet">
  11. <link media="print" type="text/css" href="skin/print.css" rel="stylesheet">
  12. <link type="text/css" href="skin/profile.css" rel="stylesheet">
  13. <script src="skin/getBlank.js" language="javascript" type="text/javascript"></script><script src="skin/getMenu.js" language="javascript" type="text/javascript"></script><script src="skin/fontsize.js" language="javascript" type="text/javascript"></script>
  14. <link rel="shortcut icon" href="images/favicon.ico">
  15. </head>
  16. <body onload="init()">
  17. <script type="text/javascript">ndeSetTextSize();</script>
  18. <div id="top">
  19. <!--+
  20. |breadtrail
  21. +-->
  22. <div class="breadtrail">
  23. <a href="http://www.apache.org/">Apache</a> &gt; <a href="http://zookeeper.apache.org/">ZooKeeper</a> &gt; <a href="http://zookeeper.apache.org/">ZooKeeper</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
  24. </div>
  25. <!--+
  26. |header
  27. +-->
  28. <div class="header">
  29. <!--+
  30. |start group logo
  31. +-->
  32. <div class="grouplogo">
  33. <a href="http://hadoop.apache.org/"><img class="logoImage" alt="Hadoop" src="images/hadoop-logo.jpg" title="Apache Hadoop"></a>
  34. </div>
  35. <!--+
  36. |end group logo
  37. +-->
  38. <!--+
  39. |start Project Logo
  40. +-->
  41. <div class="projectlogo">
  42. <a href="http://zookeeper.apache.org/"><img class="logoImage" alt="ZooKeeper" src="images/zookeeper_small.gif" title="ZooKeeper: distributed coordination"></a>
  43. </div>
  44. <!--+
  45. |end Project Logo
  46. +-->
  47. <!--+
  48. |start Search
  49. +-->
  50. <div class="searchbox">
  51. <form action="http://www.google.com/search" method="get" class="roundtopsmall">
  52. <input value="zookeeper.apache.org" name="sitesearch" type="hidden"><input onFocus="getBlank (this, 'Search the site with google');" size="25" name="q" id="query" type="text" value="Search the site with google">&nbsp;
  53. <input name="Search" value="Search" type="submit">
  54. </form>
  55. </div>
  56. <!--+
  57. |end search
  58. +-->
  59. <!--+
  60. |start Tabs
  61. +-->
  62. <ul id="tabs">
  63. <li>
  64. <a class="unselected" href="http://zookeeper.apache.org/">Project</a>
  65. </li>
  66. <li>
  67. <a class="unselected" href="https://cwiki.apache.org/confluence/display/ZOOKEEPER/">Wiki</a>
  68. </li>
  69. <li class="current">
  70. <a class="selected" href="index.html">ZooKeeper 3.4 Documentation</a>
  71. </li>
  72. </ul>
  73. <!--+
  74. |end Tabs
  75. +-->
  76. </div>
  77. </div>
  78. <div id="main">
  79. <div id="publishedStrip">
  80. <!--+
  81. |start Subtabs
  82. +-->
  83. <div id="level2tabs"></div>
  84. <!--+
  85. |end Endtabs
  86. +-->
  87. <script type="text/javascript"><!--
  88. document.write("Last Published: " + document.lastModified);
  89. // --></script>
  90. </div>
  91. <!--+
  92. |breadtrail
  93. +-->
  94. <div class="breadtrail">
  95. &nbsp;
  96. </div>
  97. <!--+
  98. |start Menu, mainarea
  99. +-->
  100. <!--+
  101. |start Menu
  102. +-->
  103. <div id="menu">
  104. <div onclick="SwitchMenu('menu_1.1', 'skin/')" id="menu_1.1Title" class="menutitle">Overview</div>
  105. <div id="menu_1.1" class="menuitemgroup">
  106. <div class="menuitem">
  107. <a href="index.html">Welcome</a>
  108. </div>
  109. <div class="menuitem">
  110. <a href="zookeeperOver.html">Overview</a>
  111. </div>
  112. <div class="menuitem">
  113. <a href="zookeeperStarted.html">Getting Started</a>
  114. </div>
  115. <div class="menuitem">
  116. <a href="releasenotes.html">Release Notes</a>
  117. </div>
  118. </div>
  119. <div onclick="SwitchMenu('menu_selected_1.2', 'skin/')" id="menu_selected_1.2Title" class="menutitle" style="background-image: url('skin/images/chapter_open.gif');">Developer</div>
  120. <div id="menu_selected_1.2" class="selectedmenuitemgroup" style="display: block;">
  121. <div class="menuitem">
  122. <a href="api/index.html">API Docs</a>
  123. </div>
  124. <div class="menuitem">
  125. <a href="zookeeperProgrammers.html">Programmer's Guide</a>
  126. </div>
  127. <div class="menuitem">
  128. <a href="javaExample.html">Java Example</a>
  129. </div>
  130. <div class="menuitem">
  131. <a href="zookeeperTutorial.html">Barrier and Queue Tutorial</a>
  132. </div>
  133. <div class="menupage">
  134. <div class="menupagetitle">Recipes</div>
  135. </div>
  136. </div>
  137. <div onclick="SwitchMenu('menu_1.3', 'skin/')" id="menu_1.3Title" class="menutitle">Admin &amp; Ops</div>
  138. <div id="menu_1.3" class="menuitemgroup">
  139. <div class="menuitem">
  140. <a href="zookeeperAdmin.html">Administrator's Guide</a>
  141. </div>
  142. <div class="menuitem">
  143. <a href="zookeeperQuotas.html">Quota Guide</a>
  144. </div>
  145. <div class="menuitem">
  146. <a href="zookeeperJMX.html">JMX</a>
  147. </div>
  148. <div class="menuitem">
  149. <a href="zookeeperObservers.html">Observers Guide</a>
  150. </div>
  151. </div>
  152. <div onclick="SwitchMenu('menu_1.4', 'skin/')" id="menu_1.4Title" class="menutitle">Contributor</div>
  153. <div id="menu_1.4" class="menuitemgroup">
  154. <div class="menuitem">
  155. <a href="zookeeperInternals.html">ZooKeeper Internals</a>
  156. </div>
  157. </div>
  158. <div onclick="SwitchMenu('menu_1.5', 'skin/')" id="menu_1.5Title" class="menutitle">Miscellaneous</div>
  159. <div id="menu_1.5" class="menuitemgroup">
  160. <div class="menuitem">
  161. <a href="https://cwiki.apache.org/confluence/display/ZOOKEEPER">Wiki</a>
  162. </div>
  163. <div class="menuitem">
  164. <a href="https://cwiki.apache.org/confluence/display/ZOOKEEPER/FAQ">FAQ</a>
  165. </div>
  166. <div class="menuitem">
  167. <a href="http://zookeeper.apache.org/mailing_lists.html">Mailing Lists</a>
  168. </div>
  169. </div>
  170. <div id="credit"></div>
  171. <div id="roundbottom">
  172. <img style="display: none" class="corner" height="15" width="15" alt="" src="skin/images/rc-b-l-15-1body-2menu-3menu.png"></div>
  173. <!--+
  174. |alternative credits
  175. +-->
  176. <div id="credit2"></div>
  177. </div>
  178. <!--+
  179. |end Menu
  180. +-->
  181. <!--+
  182. |start content
  183. +-->
  184. <div id="content">
  185. <div title="Portable Document Format" class="pdflink">
  186. <a class="dida" href="recipes.pdf"><img alt="PDF -icon" src="skin/images/pdfdoc.gif" class="skin"><br>
  187. PDF</a>
  188. </div>
  189. <h1>ZooKeeper Recipes and Solutions</h1>
  190. <div id="front-matter">
  191. <div id="minitoc-area">
  192. <ul class="minitoc">
  193. <li>
  194. <a href="#ch_recipes">A Guide to Creating Higher-level Constructs with ZooKeeper</a>
  195. <ul class="minitoc">
  196. <li>
  197. <a href="#sc_recipes_errorHandlingNote">Important Note About Error Handling</a>
  198. </li>
  199. <li>
  200. <a href="#sc_outOfTheBox">Out of the Box Applications: Name Service, Configuration, Group
  201. Membership</a>
  202. </li>
  203. <li>
  204. <a href="#sc_recipes_eventHandles">Barriers</a>
  205. <ul class="minitoc">
  206. <li>
  207. <a href="#sc_doubleBarriers">Double Barriers</a>
  208. </li>
  209. </ul>
  210. </li>
  211. <li>
  212. <a href="#sc_recipes_Queues">Queues</a>
  213. <ul class="minitoc">
  214. <li>
  215. <a href="#sc_recipes_priorityQueues">Priority Queues</a>
  216. </li>
  217. </ul>
  218. </li>
  219. <li>
  220. <a href="#sc_recipes_Locks">Locks</a>
  221. <ul class="minitoc">
  222. <li>
  223. <a href="#sc_recipes_GuidNote">Recoverable Errors and the GUID</a>
  224. </li>
  225. <li>
  226. <a href="#Shared+Locks">Shared Locks</a>
  227. </li>
  228. <li>
  229. <a href="#sc_recoverableSharedLocks">Recoverable Shared Locks</a>
  230. </li>
  231. </ul>
  232. </li>
  233. <li>
  234. <a href="#sc_recipes_twoPhasedCommit">Two-phased Commit</a>
  235. </li>
  236. <li>
  237. <a href="#sc_leaderElection">Leader Election</a>
  238. </li>
  239. </ul>
  240. </li>
  241. </ul>
  242. </div>
  243. </div>
  244. <a name="ch_recipes"></a>
  245. <h2 class="h3">A Guide to Creating Higher-level Constructs with ZooKeeper</h2>
  246. <div class="section">
  247. <p>In this article, you'll find guidelines for using
  248. ZooKeeper to implement higher order functions. All of them are conventions
  249. implemented at the client and do not require special support from
  250. ZooKeeper. Hopefully the community will capture these conventions in client-side libraries
  251. to ease their use and to encourage standardization.</p>
  252. <p>One of the most interesting things about ZooKeeper is that even
  253. though ZooKeeper uses <em>asynchronous</em> notifications, you
  254. can use it to build <em>synchronous</em> consistency
  255. primitives, such as queues and locks. As you will see, this is possible
  256. because ZooKeeper imposes an overall order on updates, and has mechanisms
  257. to expose this ordering.</p>
  258. <p>Note that the recipes below attempt to employ best practices. In
  259. particular, they avoid polling, timers or anything else that would result
  260. in a "herd effect", causing bursts of traffic and limiting
  261. scalability.</p>
  262. <p>There are many useful functions that can be imagined that aren't
  263. included here - revocable read-write priority locks, as just one example.
  264. And some of the constructs mentioned here - locks, in particular -
  265. illustrate certain points, even though you may find other constructs, such
  266. as event handles or queues, a more practical means of performing the same
  267. function. In general, the examples in this section are designed to
  268. stimulate thought.</p>
  269. <a name="sc_recipes_errorHandlingNote"></a>
  270. <h3 class="h4">Important Note About Error Handling</h3>
  271. <p>When implementing the recipes you must handle recoverable exceptions
  272. (see the <a href="https://cwiki.apache.org/confluence/display/ZOOKEEPER/FAQ">FAQ</a>). In
  273. particular, several of the recipes employ sequential ephemeral
  274. nodes. When creating a sequential ephemeral node there is an error case in
  275. which the create() succeeds on the server but the server crashes before
  276. returning the name of the node to the client. When the client reconnects its
  277. session is still valid and, thus, the node is not removed. The implication is
  278. that it is difficult for the client to know if its node was created or not. The
  279. recipes below include measures to handle this.</p>
  280. <a name="sc_outOfTheBox"></a>
  281. <h3 class="h4">Out of the Box Applications: Name Service, Configuration, Group
  282. Membership</h3>
  283. <p>Name service and configuration are two of the primary applications
  284. of ZooKeeper. These two functions are provided directly by the ZooKeeper
  285. API.</p>
  286. <p>Another function directly provided by ZooKeeper is <em>group
  287. membership</em>. The group is represented by a node. Members of the
  288. group create ephemeral nodes under the group node. Nodes of the members
  289. that fail abnormally will be removed automatically when ZooKeeper detects
  290. the failure.</p>
  291. <a name="sc_recipes_eventHandles"></a>
  292. <h3 class="h4">Barriers</h3>
  293. <p>Distributed systems use <em>barriers</em>
  294. to block processing of a set of nodes until a condition is met
  295. at which time all the nodes are allowed to proceed. Barriers are
  296. implemented in ZooKeeper by designating a barrier node. The
  297. barrier is in place if the barrier node exists. Here's the
  298. pseudo code:</p>
  299. <ol>
  300. <li>
  301. <p>Client calls the ZooKeeper API's <strong>exists()</strong> function on the barrier node, with
  302. <em>watch</em> set to true.</p>
  303. </li>
  304. <li>
  305. <p>If <strong>exists()</strong> returns false, the
  306. barrier is gone and the client proceeds</p>
  307. </li>
  308. <li>
  309. <p>Else, if <strong>exists()</strong> returns true,
  310. the clients wait for a watch event from ZooKeeper for the barrier
  311. node.</p>
  312. </li>
  313. <li>
  314. <p>When the watch event is triggered, the client reissues the
  315. <strong>exists( )</strong> call, again waiting until
  316. the barrier node is removed.</p>
  317. </li>
  318. </ol>
  319. <a name="sc_doubleBarriers"></a>
  320. <h4>Double Barriers</h4>
  321. <p>Double barriers enable clients to synchronize the beginning and
  322. the end of a computation. When enough processes have joined the barrier,
  323. processes start their computation and leave the barrier once they have
  324. finished. This recipe shows how to use a ZooKeeper node as a
  325. barrier.</p>
  326. <p>The pseudo code in this recipe represents the barrier node as
  327. <em>b</em>. Every client process <em>p</em>
  328. registers with the barrier node on entry and unregisters when it is
  329. ready to leave. A node registers with the barrier node via the <strong>Enter</strong> procedure below; it waits until
  330. <em>x</em> client processes register before proceeding with
  331. the computation. (The <em>x</em> here is up to you to
  332. determine for your system.)</p>
  333. <table class="ForrestTable" cellspacing="1" cellpadding="4">
  334. <tr>
  335. <td><strong>Enter</strong></td>
  336. <td><strong>Leave</strong></td>
  337. </tr>
  338. <tr>
  339. <td>
  340. <ol>
  341. <li>
  342. <p>Create a name <em><em>n</em> =
  343. <em>b</em>+&ldquo;/&rdquo;+<em>p</em></em>
  344. </p>
  345. </li>
  346. <li>
  347. <p>Set watch: <strong>exists(<em>b</em> + &lsquo;&lsquo;/ready&rsquo;&rsquo;,
  348. true)</strong>
  349. </p>
  350. </li>
  351. <li>
  352. <p>Create child: <strong>create(
  353. <em>n</em>, EPHEMERAL)</strong>
  354. </p>
  355. </li>
  356. <li>
  357. <p>
  358. <strong>L = getChildren(b,
  359. false)</strong>
  360. </p>
  361. </li>
  362. <li>
  363. <p>if fewer children in L than<em>
  364. x</em>, wait for watch event</p>
  365. </li>
  366. <li>
  367. <p>else <strong>create(b + &lsquo;&lsquo;/ready&rsquo;&rsquo;,
  368. REGULAR)</strong>
  369. </p>
  370. </li>
  371. </ol>
  372. </td>
  373. <td>
  374. <ol>
  375. <li>
  376. <p>
  377. <strong>L = getChildren(b,
  378. false)</strong>
  379. </p>
  380. </li>
  381. <li>
  382. <p>if no children, exit</p>
  383. </li>
  384. <li>
  385. <p>if <em>p</em> is only process node in
  386. L, delete(n) and exit</p>
  387. </li>
  388. <li>
  389. <p>if <em>p</em> is the lowest process
  390. node in L, wait on highest process node in L</p>
  391. </li>
  392. <li>
  393. <p>else <strong>delete(<em>n</em>) </strong>if
  394. still exists and wait on lowest process node in L</p>
  395. </li>
  396. <li>
  397. <p>goto 1</p>
  398. </li>
  399. </ol>
  400. </td>
  401. </tr>
  402. </table>
  403. <p>On entering, all processes watch on a ready node and
  404. create an ephemeral node as a child of the barrier node. Each process
  405. but the last enters the barrier and waits for the ready node to appear
  406. at line 5. The process that creates the xth node, the last process, will
  407. see x nodes in the list of children and create the ready node, waking up
  408. the other processes. Note that waiting processes wake up only when it is
  409. time to exit, so waiting is efficient.
  410. </p>
  411. <p>On exit, you can't use a flag such as <em>ready</em>
  412. because you are watching for process nodes to go away. By using
  413. ephemeral nodes, processes that fail after the barrier has been entered
  414. do not prevent correct processes from finishing. When processes are
  415. ready to leave, they need to delete their process nodes and wait for all
  416. other processes to do the same.</p>
  417. <p>Processes exit when there are no process nodes left as children of
  418. <em>b</em>. However, as an efficiency, you can use the
  419. lowest process node as the ready flag. All other processes that are
  420. ready to exit watch for the lowest existing process node to go away, and
  421. the owner of the lowest process watches for any other process node
  422. (picking the highest for simplicity) to go away. This means that only a
  423. single process wakes up on each node deletion except for the last node,
  424. which wakes up everyone when it is removed.</p>
  425. <a name="sc_recipes_Queues"></a>
  426. <h3 class="h4">Queues</h3>
  427. <p>Distributed queues are a common data structure. To implement a
  428. distributed queue in ZooKeeper, first designate a znode to hold the queue,
  429. the queue node. The distributed clients put something into the queue by
  430. calling create() with a pathname ending in "queue-", with the
  431. <em>sequence</em> and <em>ephemeral</em> flags in
  432. the create() call set to true. Because the <em>sequence</em>
  433. flag is set, the new pathnames will have the form
  434. _path-to-queue-node_/queue-X, where X is a monotonically increasing number. A
  435. client that wants to remove an element from the queue calls ZooKeeper's <strong>getChildren( )</strong> function, with
  436. <em>watch</em> set to true on the queue node, and begins
  437. processing nodes with the lowest number. The client does not need to issue
  438. another <strong>getChildren( )</strong> until it exhausts
  439. the list obtained from the first <strong>getChildren(
  440. )</strong> call. If there are no children in the queue node, the
  441. reader waits for a watch notification to check the queue again.</p>
  442. <div class="note">
  443. <div class="label">Note</div>
  444. <div class="content">
  445. <p>There now exists a Queue implementation in ZooKeeper
  446. recipes directory. This is distributed with the release --
  447. src/recipes/queue directory of the release artifact.
  448. </p>
  449. </div>
  450. </div>
  451. <a name="sc_recipes_priorityQueues"></a>
  452. <h4>Priority Queues</h4>
  453. <p>To implement a priority queue, you need only make two simple
  454. changes to the generic <a href="#sc_recipes_Queues">queue
  455. recipe</a> . First, to add to a queue, the pathname ends with
  456. "queue-YY" where YY is the priority of the element with lower numbers
  457. representing higher priority (just like UNIX). Second, when removing
  458. from the queue, a client uses an up-to-date children list meaning that
  459. the client will invalidate previously obtained children lists if a watch
  460. notification triggers for the queue node.</p>
  461. <a name="sc_recipes_Locks"></a>
  462. <h3 class="h4">Locks</h3>
  463. <p>Fully distributed locks that are globally synchronous, meaning at
  464. any snapshot in time no two clients think they hold the same lock. These
  465. can be implemented using ZooKeeper. As with priority queues, first define
  466. a lock node.</p>
  467. <div class="note">
  468. <div class="label">Note</div>
  469. <div class="content">
  470. <p>There now exists a Lock implementation in ZooKeeper
  471. recipes directory. This is distributed with the release --
  472. src/recipes/lock directory of the release artifact.
  473. </p>
  474. </div>
  475. </div>
  476. <p>Clients wishing to obtain a lock do the following:</p>
  477. <ol>
  478. <li>
  479. <p>Call <strong>create( )</strong> with a pathname
  480. of "_locknode_/guid-lock-" and the <em>sequence</em> and
  481. <em>ephemeral</em> flags set. The <em>guid</em>
  482. is needed in case the create() result is missed. See the note below.</p>
  483. </li>
  484. <li>
  485. <p>Call <strong>getChildren( )</strong> on the lock
  486. node <em>without</em> setting the watch flag (this is
  487. important to avoid the herd effect).</p>
  488. </li>
  489. <li>
  490. <p>If the pathname created in step <strong>1</strong> has the lowest sequence number suffix, the
  491. client has the lock and the client exits the protocol.</p>
  492. </li>
  493. <li>
  494. <p>The client calls <strong>exists( )</strong> with
  495. the watch flag set on the path in the lock directory with the next
  496. lowest sequence number.</p>
  497. </li>
  498. <li>
  499. <p>if <strong>exists( )</strong> returns false, go
  500. to step <strong>2</strong>. Otherwise, wait for a
  501. notification for the pathname from the previous step before going to
  502. step <strong>2</strong>.</p>
  503. </li>
  504. </ol>
  505. <p>The unlock protocol is very simple: clients wishing to release a
  506. lock simply delete the node they created in step 1.</p>
  507. <p>Here are a few things to notice:</p>
  508. <ul>
  509. <li>
  510. <p>The removal of a node will only cause one client to wake up
  511. since each node is watched by exactly one client. In this way, you
  512. avoid the herd effect.</p>
  513. </li>
  514. </ul>
  515. <ul>
  516. <li>
  517. <p>There is no polling or timeouts.</p>
  518. </li>
  519. </ul>
  520. <ul>
  521. <li>
  522. <p>Because of the way you implement locking, it is easy to see the
  523. amount of lock contention, break locks, debug locking problems,
  524. etc.</p>
  525. </li>
  526. </ul>
  527. <a name="sc_recipes_GuidNote"></a>
  528. <h4>Recoverable Errors and the GUID</h4>
  529. <ul>
  530. <li>
  531. <p>If a recoverable error occurs calling <strong>create()</strong> the
  532. client should call <strong>getChildren()</strong> and check for a node
  533. containing the <em>guid</em> used in the path name.
  534. This handles the case (noted <a href="#sc_recipes_errorHandlingNote">above</a>) of
  535. the create() succeeding on the server but the server crashing before returning the name
  536. of the new node.</p>
  537. </li>
  538. </ul>
  539. <a name="Shared+Locks"></a>
  540. <h4>Shared Locks</h4>
  541. <p>You can implement shared locks with a few changes to the lock
  542. protocol:</p>
  543. <table class="ForrestTable" cellspacing="1" cellpadding="4">
  544. <tr>
  545. <td><strong>Obtaining a read
  546. lock:</strong></td>
  547. <td><strong>Obtaining a write
  548. lock:</strong></td>
  549. </tr>
  550. <tr>
  551. <td>
  552. <ol>
  553. <li>
  554. <p>Call <strong>create( )</strong> to
  555. create a node with pathname
  556. "<span class="codefrag filename">guid-/read-</span>". This is the
  557. lock node used later in the protocol. Make sure to set both
  558. the <em>sequence</em> and
  559. <em>ephemeral</em> flags.</p>
  560. </li>
  561. <li>
  562. <p>Call <strong>getChildren( )</strong>
  563. on the lock node <em>without</em> setting the
  564. <em>watch</em> flag - this is important, as it
  565. avoids the herd effect.</p>
  566. </li>
  567. <li>
  568. <p>If there are no children with a pathname starting
  569. with "<span class="codefrag filename">write-</span>" and having a lower
  570. sequence number than the node created in step <strong>1</strong>, the client has the lock and can
  571. exit the protocol. </p>
  572. </li>
  573. <li>
  574. <p>Otherwise, call <strong>exists(
  575. )</strong>, with <em>watch</em> flag, set on
  576. the node in lock directory with pathname starting with
  577. "<span class="codefrag filename">write-</span>" having the next lowest
  578. sequence number.</p>
  579. </li>
  580. <li>
  581. <p>If <strong>exists( )</strong>
  582. returns <em>false</em>, goto step <strong>2</strong>.</p>
  583. </li>
  584. <li>
  585. <p>Otherwise, wait for a notification for the pathname
  586. from the previous step before going to step <strong>2</strong>
  587. </p>
  588. </li>
  589. </ol>
  590. </td>
  591. <td>
  592. <ol>
  593. <li>
  594. <p>Call <strong>create( )</strong> to
  595. create a node with pathname
  596. "<span class="codefrag filename">guid-/write-</span>". This is the
  597. lock node spoken of later in the protocol. Make sure to
  598. set both <em>sequence</em> and
  599. <em>ephemeral</em> flags.</p>
  600. </li>
  601. <li>
  602. <p>Call <strong>getChildren( )
  603. </strong> on the lock node <em>without</em>
  604. setting the <em>watch</em> flag - this is
  605. important, as it avoids the herd effect.</p>
  606. </li>
  607. <li>
  608. <p>If there are no children with a lower sequence
  609. number than the node created in step <strong>1</strong>, the client has the lock and the
  610. client exits the protocol.</p>
  611. </li>
  612. <li>
  613. <p>Call <strong>exists( ),</strong>
  614. with <em>watch</em> flag set, on the node with
  615. the pathname that has the next lowest sequence
  616. number.</p>
  617. </li>
  618. <li>
  619. <p>If <strong>exists( )</strong>
  620. returns <em>false</em>, goto step <strong>2</strong>. Otherwise, wait for a
  621. notification for the pathname from the previous step
  622. before going to step <strong>2</strong>.</p>
  623. </li>
  624. </ol>
  625. </td>
  626. </tr>
  627. </table>
  628. <p>Notes:</p>
  629. <ul>
  630. <li>
  631. <p>It might appear that this recipe creates a herd effect:
  632. when there is a large group of clients waiting for a read
  633. lock, and all getting notified more or less simultaneously
  634. when the "<span class="codefrag filename">write-</span>" node with the lowest
  635. sequence number is deleted. In fact, that's valid behavior:
  636. as all those waiting reader clients should be released since
  637. they have the lock. The herd effect refers to releasing a
  638. "herd" when in fact only a single or a small number of
  639. machines can proceed.</p>
  640. </li>
  641. </ul>
  642. <ul>
  643. <li>
  644. <p>See the <a href="#sc_recipes_GuidNote">note for Locks</a> on how to use the guid in the node.</p>
  645. </li>
  646. </ul>
  647. <a name="sc_recoverableSharedLocks"></a>
  648. <h4>Recoverable Shared Locks</h4>
  649. <p>With minor modifications to the Shared Lock protocol, you can make
  650. shared locks revocable:</p>
  651. <p>In step <strong>1</strong>, of both obtain reader
  652. and writer lock protocols, call <strong>getData(
  653. )</strong> with <em>watch</em> set, immediately after the
  654. call to <strong>create( )</strong>. If the client
  655. subsequently receives notification for the node it created in step
  656. <strong>1</strong>, it does another <strong>getData( )</strong> on that node, with
  657. <em>watch</em> set and looks for the string "unlock", which
  658. signals to the client that it must release the lock. This is because,
  659. according to this shared lock protocol, you can request the client with
  660. the lock give up the lock by calling <strong>setData()
  661. </strong> on the lock node, writing "unlock" to that node.</p>
  662. <p>Note that this protocol requires the lock holder to consent to
  663. releasing the lock. Such consent is important, especially if the lock
  664. holder needs to do some processing before releasing the lock. Of course
  665. you can always implement <em>Revocable Shared Locks with Freaking
  666. Laser Beams</em> by stipulating in your protocol that the revoker
  667. is allowed to delete the lock node if after some length of time the lock
  668. isn't deleted by the lock holder.</p>
  669. <a name="sc_recipes_twoPhasedCommit"></a>
  670. <h3 class="h4">Two-phased Commit</h3>
  671. <p>A two-phase commit protocol is an algorithm that lets all clients in
  672. a distributed system agree either to commit a transaction or abort.</p>
  673. <p>In ZooKeeper, you can implement a two-phased commit by having a
  674. coordinator create a transaction node, say "/app/Tx", and one child node
  675. per participating site, say "/app/Tx/s_i". When the coordinator creates the
  676. child node, it leaves the content undefined. Once each site involved in
  677. the transaction receives the transaction from the coordinator, the site
  678. reads each child node and sets a watch. Each site then processes the query
  679. and votes "commit" or "abort" by writing to its respective node. Once the
  680. write completes, the other sites are notified, and as soon as all sites
  681. have all votes, they can decide either "abort" or "commit". Note that a
  682. node can decide "abort" earlier if some site votes for "abort".</p>
  683. <p>An interesting aspect of this implementation is that the only role
  684. of the coordinator is to decide upon the group of sites, to create the
  685. ZooKeeper nodes, and to propagate the transaction to the corresponding
  686. sites. In fact, even propagating the transaction can be done through
  687. ZooKeeper by writing it in the transaction node.</p>
  688. <p>There are two important drawbacks of the approach described above.
  689. One is the message complexity, which is O(n&sup2;). The second is the
  690. impossibility of detecting failures of sites through ephemeral nodes. To
  691. detect the failure of a site using ephemeral nodes, it is necessary that
  692. the site create the node.</p>
  693. <p>To solve the first problem, you can have only the coordinator
  694. notified of changes to the transaction nodes, and then notify the sites
  695. once the coordinator reaches a decision. Note that this approach is scalable,
  696. but it is slower too, as it requires all communication to go through the
  697. coordinator.</p>
  698. <p>To address the second problem, you can have the coordinator
  699. propagate the transaction to the sites, and have each site create its
  700. own ephemeral node.</p>
  701. <a name="sc_leaderElection"></a>
  702. <h3 class="h4">Leader Election</h3>
  703. <p>A simple way of doing leader election with ZooKeeper is to use the
  704. <strong>SEQUENCE|EPHEMERAL</strong> flags when creating
  705. znodes that represent "proposals" of clients. The idea is to have a znode,
  706. say "/election", such that each client creates a child znode "/election/guid-n_"
  707. with both flags SEQUENCE|EPHEMERAL. With the sequence flag, ZooKeeper
  708. automatically appends a sequence number that is greater than any one
  709. previously appended to a child of "/election". The process that created
  710. the znode with the smallest appended sequence number is the leader.
  711. </p>
  712. <p>That's not all, though. It is important to watch for failures of the
  713. leader, so that a new client arises as the new leader in the case the
  714. current leader fails. A trivial solution is to have all application
  715. processes watching upon the current smallest znode, and checking if they
  716. are the new leader when the smallest znode goes away (note that the
  717. smallest znode will go away if the leader fails because the node is
  718. ephemeral). But this causes a herd effect: upon failure of the current
  719. leader, all other processes receive a notification, and execute
  720. getChildren on "/election" to obtain the current list of children of
  721. "/election". If the number of clients is large, it causes a spike in the
  722. number of operations that ZooKeeper servers have to process. To avoid the
  723. herd effect, it is sufficient to watch for the next znode down on the
  724. sequence of znodes. If a client receives a notification that the znode it
  725. is watching is gone, then it becomes the new leader in the case that there
  726. is no smaller znode. Note that this avoids the herd effect by not having
  727. all clients watching the same znode. </p>
  728. <p>Here's the pseudo code:</p>
  729. <p>Let ELECTION be a path of choice of the application. To volunteer to
  730. be a leader: </p>
  731. <ol>
  732. <li>
  733. <p>Create znode z with path "ELECTION/guid-n_" with both SEQUENCE and
  734. EPHEMERAL flags;</p>
  735. </li>
  736. <li>
  737. <p>Let C be the children of "ELECTION", and i be the sequence
  738. number of z;</p>
  739. </li>
  740. <li>
  741. <p>Watch for changes on "ELECTION/guid-n_j", where j is the largest
  742. sequence number such that j &lt; i and n_j is a znode in C;</p>
  743. </li>
  744. </ol>
  745. <p>Upon receiving a notification of znode deletion: </p>
  746. <ol>
  747. <li>
  748. <p>Let C be the new set of children of ELECTION; </p>
  749. </li>
  750. <li>
  751. <p>If z is the smallest node in C, then execute leader
  752. procedure;</p>
  753. </li>
  754. <li>
  755. <p>Otherwise, watch for changes on "ELECTION/guid-n_j", where j is the
  756. largest sequence number such that j &lt; i and n_j is a znode in C;
  757. </p>
  758. </li>
  759. </ol>
  760. <p>Notes:</p>
  761. <ul>
  762. <li>
  763. <p>Note that the znode having no preceding znode on the list of
  764. children does not imply that the creator of this znode is aware that it is
  765. the current leader. Applications may consider creating a separate znode
  766. to acknowledge that the leader has executed the leader procedure. </p>
  767. </li>
  768. </ul>
  769. <ul>
  770. <li>
  771. <p>See the <a href="#sc_recipes_GuidNote">note for Locks</a> on how to use the guid in the node.</p>
  772. </li>
  773. </ul>
  774. </div>
  775. <p align="right">
  776. <font size="-2"></font>
  777. </p>
  778. </div>
  779. <!--+
  780. |end content
  781. +-->
  782. <div class="clearboth">&nbsp;</div>
  783. </div>
  784. <div id="footer">
  785. <!--+
  786. |start bottomstrip
  787. +-->
  788. <div class="lastmodified">
  789. <script type="text/javascript"><!--
  790. document.write("Last Published: " + document.lastModified);
  791. // --></script>
  792. </div>
  793. <div class="copyright">
  794. Copyright &copy;
  795. 2008-2013 <a href="http://www.apache.org/licenses/">The Apache Software Foundation.</a>
  796. </div>
  797. <!--+
  798. |end bottomstrip
  799. +-->
  800. </div>
  801. </body>
  802. </html>