zookeeperInternals.html 24 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748
  1. <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
  2. <html>
  3. <head>
  4. <META http-equiv="Content-Type" content="text/html; charset=UTF-8">
  5. <meta content="Apache Forrest" name="Generator">
  6. <meta name="Forrest-version" content="0.8">
  7. <meta name="Forrest-skin-name" content="pelt">
  8. <title>ZooKeeper Internals</title>
  9. <link type="text/css" href="skin/basic.css" rel="stylesheet">
  10. <link media="screen" type="text/css" href="skin/screen.css" rel="stylesheet">
  11. <link media="print" type="text/css" href="skin/print.css" rel="stylesheet">
  12. <link type="text/css" href="skin/profile.css" rel="stylesheet">
  13. <script src="skin/getBlank.js" language="javascript" type="text/javascript"></script><script src="skin/getMenu.js" language="javascript" type="text/javascript"></script><script src="skin/fontsize.js" language="javascript" type="text/javascript"></script>
  14. <link rel="shortcut icon" href="images/favicon.ico">
  15. </head>
  16. <body onload="init()">
  17. <script type="text/javascript">ndeSetTextSize();</script>
  18. <div id="top">
  19. <!--+
  20. |breadtrail
  21. +-->
  22. <div class="breadtrail">
  23. <a href="http://www.apache.org/">Apache</a> &gt; <a href="http://hadoop.apache.org/">Hadoop</a> &gt; <a href="http://hadoop.apache.org/zookeeper/">ZooKeeper</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
  24. </div>
  25. <!--+
  26. |header
  27. +-->
  28. <div class="header">
  29. <!--+
  30. |start group logo
  31. +-->
  32. <div class="grouplogo">
  33. <a href="http://hadoop.apache.org/"><img class="logoImage" alt="Hadoop" src="images/hadoop-logo.jpg" title="Apache Hadoop"></a>
  34. </div>
  35. <!--+
  36. |end group logo
  37. +-->
  38. <!--+
  39. |start Project Logo
  40. +-->
  41. <div class="projectlogo">
  42. <a href="http://hadoop.apache.org/zookeeper/"><img class="logoImage" alt="ZooKeeper" src="images/zookeeper_small.gif" title="ZooKeeper: distributed coordination"></a>
  43. </div>
  44. <!--+
  45. |end Project Logo
  46. +-->
  47. <!--+
  48. |start Search
  49. +-->
  50. <div class="searchbox">
  51. <form action="http://www.google.com/search" method="get" class="roundtopsmall">
  52. <input value="hadoop.apache.org" name="sitesearch" type="hidden"><input onFocus="getBlank (this, 'Search the site with google');" size="25" name="q" id="query" type="text" value="Search the site with google">&nbsp;
  53. <input name="Search" value="Search" type="submit">
  54. </form>
  55. </div>
  56. <!--+
  57. |end search
  58. +-->
  59. <!--+
  60. |start Tabs
  61. +-->
  62. <ul id="tabs">
  63. <li>
  64. <a class="unselected" href="http://hadoop.apache.org/zookeeper/">Project</a>
  65. </li>
  66. <li>
  67. <a class="unselected" href="http://wiki.apache.org/hadoop/ZooKeeper">Wiki</a>
  68. </li>
  69. <li class="current">
  70. <a class="selected" href="index.html">ZooKeeper 3.1 Documentation</a>
  71. </li>
  72. </ul>
  73. <!--+
  74. |end Tabs
  75. +-->
  76. </div>
  77. </div>
  78. <div id="main">
  79. <div id="publishedStrip">
  80. <!--+
  81. |start Subtabs
  82. +-->
  83. <div id="level2tabs"></div>
  84. <!--+
  85. |end Endtabs
  86. +-->
  87. <script type="text/javascript"><!--
  88. document.write("Last Published: " + document.lastModified);
  89. // --></script>
  90. </div>
  91. <!--+
  92. |breadtrail
  93. +-->
  94. <div class="breadtrail">
  95. &nbsp;
  96. </div>
  97. <!--+
  98. |start Menu, mainarea
  99. +-->
  100. <!--+
  101. |start Menu
  102. +-->
  103. <div id="menu">
  104. <div onclick="SwitchMenu('menu_1.1', 'skin/')" id="menu_1.1Title" class="menutitle">Overview</div>
  105. <div id="menu_1.1" class="menuitemgroup">
  106. <div class="menuitem">
  107. <a href="index.html">Welcome</a>
  108. </div>
  109. <div class="menuitem">
  110. <a href="zookeeperOver.html">Overview</a>
  111. </div>
  112. <div class="menuitem">
  113. <a href="zookeeperStarted.html">Getting Started</a>
  114. </div>
  115. <div class="menuitem">
  116. <a href="releasenotes.html">Release Notes</a>
  117. </div>
  118. </div>
  119. <div onclick="SwitchMenu('menu_1.2', 'skin/')" id="menu_1.2Title" class="menutitle">Developer</div>
  120. <div id="menu_1.2" class="menuitemgroup">
  121. <div class="menuitem">
  122. <a href="api/index.html">API Docs</a>
  123. </div>
  124. <div class="menuitem">
  125. <a href="zookeeperProgrammers.html">Programmer's Guide</a>
  126. </div>
  127. <div class="menuitem">
  128. <a href="javaExample.html">Java Example</a>
  129. </div>
  130. <div class="menuitem">
  131. <a href="zookeeperTutorial.html">Barrier and Queue Tutorial</a>
  132. </div>
  133. <div class="menuitem">
  134. <a href="recipes.html">Recipes</a>
  135. </div>
  136. </div>
  137. <div onclick="SwitchMenu('menu_1.3', 'skin/')" id="menu_1.3Title" class="menutitle">Admin &amp; Ops</div>
  138. <div id="menu_1.3" class="menuitemgroup">
  139. <div class="menuitem">
  140. <a href="zookeeperAdmin.html">Administrator's Guide</a>
  141. </div>
  142. <div class="menuitem">
  143. <a href="zookeeperQuotas.html">Quota Guide</a>
  144. </div>
  145. <div class="menuitem">
  146. <a href="zookeeperJMX.html">JMX</a>
  147. </div>
  148. </div>
  149. <div onclick="SwitchMenu('menu_selected_1.4', 'skin/')" id="menu_selected_1.4Title" class="menutitle" style="background-image: url('skin/images/chapter_open.gif');">Contributor</div>
  150. <div id="menu_selected_1.4" class="selectedmenuitemgroup" style="display: block;">
  151. <div class="menupage">
  152. <div class="menupagetitle">ZooKeeper Internals</div>
  153. </div>
  154. </div>
  155. <div onclick="SwitchMenu('menu_1.5', 'skin/')" id="menu_1.5Title" class="menutitle">Miscellaneous</div>
  156. <div id="menu_1.5" class="menuitemgroup">
  157. <div class="menuitem">
  158. <a href="http://wiki.apache.org/hadoop/ZooKeeper">Wiki</a>
  159. </div>
  160. <div class="menuitem">
  161. <a href="http://wiki.apache.org/hadoop/ZooKeeper/FAQ">FAQ</a>
  162. </div>
  163. <div class="menuitem">
  164. <a href="http://hadoop.apache.org/zookeeper/mailing_lists.html">Mailing Lists</a>
  165. </div>
  166. </div>
  167. <div id="credit"></div>
  168. <div id="roundbottom">
  169. <img style="display: none" class="corner" height="15" width="15" alt="" src="skin/images/rc-b-l-15-1body-2menu-3menu.png"></div>
  170. <!--+
  171. |alternative credits
  172. +-->
  173. <div id="credit2"></div>
  174. </div>
  175. <!--+
  176. |end Menu
  177. +-->
  178. <!--+
  179. |start content
  180. +-->
  181. <div id="content">
  182. <div title="Portable Document Format" class="pdflink">
  183. <a class="dida" href="zookeeperInternals.pdf"><img alt="PDF -icon" src="skin/images/pdfdoc.gif" class="skin"><br>
  184. PDF</a>
  185. </div>
  186. <h1>ZooKeeper Internals</h1>
  187. <div id="minitoc-area">
  188. <ul class="minitoc">
  189. <li>
  190. <a href="#ch_Introduction">Introduction</a>
  191. </li>
  192. <li>
  193. <a href="#sc_atomicBroadcast">Atomic Broadcast</a>
  194. <ul class="minitoc">
  195. <li>
  196. <a href="#sc_guaranteesPropertiesDefinitions">Guarantees, Properties, and Definitions</a>
  197. </li>
  198. <li>
  199. <a href="#sc_leaderElection">Leader Activation</a>
  200. </li>
  201. <li>
  202. <a href="#sc_activeMessaging">Active Messaging</a>
  203. </li>
  204. <li>
  205. <a href="#sc_summary">Summary</a>
  206. </li>
  207. <li>
  208. <a href="#sc_comparisons">Comparisons</a>
  209. </li>
  210. </ul>
  211. </li>
  212. <li>
  213. <a href="#sc_logging">Logging</a>
  214. <ul class="minitoc">
  215. <li>
  216. <a href="#sc_developerGuidelines">Developer Guidelines</a>
  217. <ul class="minitoc">
  218. <li>
  219. <a href="#sc_rightLevel">Logging at the Right Level</a>
  220. </li>
  221. <li>
  222. <a href="#sc_log4jIdioms">Use of Standard log4j Idioms</a>
  223. </li>
  224. </ul>
  225. </li>
  226. </ul>
  227. </li>
  228. </ul>
  229. </div>
  230. <a name="N10009"></a><a name="ch_Introduction"></a>
  231. <h2 class="h3">Introduction</h2>
  232. <div class="section">
  233. <p>This document contains information on the inner workings of ZooKeeper.
  234. So far, it discusses these topics:
  235. </p>
  236. <ul>
  237. <li>
  238. <p>
  239. <a href="#sc_atomicBroadcast">Atomic Broadcast</a>
  240. </p>
  241. </li>
  242. <li>
  243. <p>
  244. <a href="#sc_logging">Logging</a>
  245. </p>
  246. </li>
  247. </ul>
  248. </div>
  249. <a name="N10022"></a><a name="sc_atomicBroadcast"></a>
  250. <h2 class="h3">Atomic Broadcast</h2>
  251. <div class="section">
  252. <p>
  253. At the heart of ZooKeeper is an atomic messaging system that keeps all of the servers in sync.</p>
  254. <a name="N1002B"></a><a name="sc_guaranteesPropertiesDefinitions"></a>
  255. <h3 class="h4">Guarantees, Properties, and Definitions</h3>
  256. <p>
  257. The specific guarantees provided by the messaging system used by ZooKeeper are the following:</p>
  258. <dl>
  259. <dt>
  260. <term>
  261. <em>Reliable delivery</em>
  262. </term>
  263. </dt>
  264. <dd>
  265. <p>If a message, m, is delivered
  266. by one server, it will be eventually delivered by all servers.</p>
  267. </dd>
  268. <dt>
  269. <term>
  270. <em>Total order</em>
  271. </term>
  272. </dt>
  273. <dd>
  274. <p> If a message a is
  275. delivered before message b by one server, a will be delivered before b by all
  276. servers. If a and b are delivered messages, either a will be delivered before b
  277. or b will be delivered before a.</p>
  278. </dd>
  279. <dt>
  280. <term>
  281. <em>Causal order</em>
  282. </term>
  283. </dt>
  284. <dd>
  285. <p>
  286. If a message b is sent after a message a has been delivered by the sender of b,
  287. a must be ordered before b. If a sender sends c after sending b, c must be ordered after b.
  288. </p>
  289. </dd>
  290. </dl>
  291. <p>
  292. The ZooKeeper messaging system also needs to be efficient, reliable, and easy to
  293. implement and maintain. We make heavy use of messaging, so we need the system to
  294. be able to handle thousands of requests per second. Although we can require at
  295. least k+1 correct servers to send new messages, we must be able to recover from
  296. correlated failures such as power outages. When we implemented the system we had
  297. little time and few engineering resources, so we needed a protocol that is
  298. accessible to engineers and is easy to implement. We found that our protocol
  299. satisfied all of these goals.
  300. </p>
  301. <p>
  302. Our protocol assumes that we can construct point-to-point FIFO channels between
  303. the servers. While similar services usually assume message delivery that can
  304. lose or reorder messages, our assumption of FIFO channels is very practical
  305. given that we use TCP for communication. Specifically we rely on the following property of TCP:</p>
  306. <dl>
  307. <dt>
  308. <term>
  309. <em>Ordered delivery</em>
  310. </term>
  311. </dt>
  312. <dd>
  313. <p>Data is delivered in the same order it is sent and a message m is
  314. delivered only after all messages sent before m have been delivered.
  315. (The corollary to this is that if message m is lost all messages after m will be lost.)</p>
  316. </dd>
  317. <dt>
  318. <term>
  319. <em>No message after close</em>
  320. </term>
  321. </dt>
  322. <dd>
  323. <p>Once a FIFO channel is closed, no messages will be received from it.</p>
  324. </dd>
  325. </dl>
  326. <p>
  327. FLP proved that consensus cannot be achieved in asynchronous distributed systems
  328. if failures are possible. To ensure we achieve consensus in the presence of failures
  329. we use timeouts. However, we rely on timeouts for liveness not for correctness. So,
  330. if timeouts stop working (clocks malfunction for example) the messaging system may
  331. hang, but it will not violate its guarantees.</p>
  332. <p>When describing the ZooKeeper messaging protocol we will talk of packets,
  333. proposals, and messages:</p>
  334. <dl>
  335. <dt>
  336. <term>
  337. <em>Packet</em>
  338. </term>
  339. </dt>
  340. <dd>
  341. <p>a sequence of bytes sent through a FIFO channel</p>
  342. </dd>
  343. <dt>
  344. <term>
  345. <em>Proposal</em>
  346. </term>
  347. </dt>
  348. <dd>
  349. <p>a unit of agreement. Proposals are agreed upon by exchanging packets
  350. with a quorum of ZooKeeper servers. Most proposals contain messages, however the
  351. NEW_LEADER proposal is an example of a proposal that does not correspond to a message.</p>
  352. </dd>
  353. <dt>
  354. <term>
  355. <em>Message</em>
  356. </term>
  357. </dt>
  358. <dd>
  359. <p>a sequence of bytes to be atomically broadcast to all ZooKeeper
  360. servers. A message is put into a proposal and agreed upon before it is delivered.</p>
  361. </dd>
  362. </dl>
  363. <p>
  364. As stated above, ZooKeeper guarantees a total order of messages, and it also
  365. guarantees a total order of proposals. ZooKeeper exposes the total ordering using
  366. a ZooKeeper transaction id (<em>zxid</em>). Each proposal is stamped with a zxid when
  367. it is proposed, and the zxid exactly reflects the total ordering. Proposals are sent to all
  368. ZooKeeper servers and committed when a quorum of them acknowledge the proposal.
  369. If a proposal contains a message, the message will be delivered when the proposal
  370. is committed. Acknowledgement means the server has recorded the proposal to persistent storage.
  371. Our quorums have the requirement that any pair of quorums must have at least one server
  372. in common. We ensure this by requiring that all quorums have size (<em>n/2+1</em>) where
  373. n is the number of servers that make up a ZooKeeper service.
  374. </p>
  375. <p>
  376. The zxid has two parts: the epoch and a counter. In our implementation the zxid
  377. is a 64-bit number. We use the high order 32-bits for the epoch and the low order
  378. 32-bits for the counter. Because it has two parts, we represent the zxid both as a
  379. number and as a pair of integers, (<em>epoch, count</em>). The epoch number represents a
  380. change in leadership. Each time a new leader comes into power it will have its
  381. own epoch number. We have a simple algorithm to assign a unique zxid to a proposal:
  382. the leader simply increments the zxid to obtain a unique zxid for each proposal.
  383. <em>Leadership activation will ensure that only one leader uses a given epoch, so our
  384. simple algorithm guarantees that every proposal will have a unique id.</em>
  385. </p>
  386. <p>
  387. ZooKeeper messaging consists of two phases:</p>
  388. <dl>
  389. <dt>
  390. <term>
  391. <em>Leader activation</em>
  392. </term>
  393. </dt>
  394. <dd>
  395. <p>In this phase a leader establishes the correct state of the system
  396. and gets ready to start making proposals.</p>
  397. </dd>
  398. <dt>
  399. <term>
  400. <em>Active messaging</em>
  401. </term>
  402. </dt>
  403. <dd>
  404. <p>In this phase a leader accepts messages to propose and coordinates message delivery.</p>
  405. </dd>
  406. </dl>
  407. <p>
  408. ZooKeeper is a holistic protocol. We do not focus on individual proposals, rather
  409. look at the stream of proposals as a whole. Our strict ordering allows us to do this
  410. efficiently and greatly simplifies our protocol. Leadership activation embodies
  411. this holistic concept. A leader becomes active only when a quorum of followers
  412. (The leader counts as a follower as well. You can always vote for yourself.) has synced
  413. up with the leader, i.e., they have the same state. This state consists of all of the
  414. proposals that the leader believes have been committed and the proposal to follow
  415. the leader, the NEW_LEADER proposal. (Hopefully you are thinking to
  416. yourself, <em>Does the set of proposals that the leader believes has been committed
  417. include all the proposals that really have been committed?</em> The answer is <em>yes</em>.
  418. Below, we make clear why.)
  419. </p>
  420. <a name="N100B9"></a><a name="sc_leaderElection"></a>
  421. <h3 class="h4">Leader Activation</h3>
  422. <p>
  423. Leader activation includes leader election. We currently have two leader election
  424. algorithms in ZooKeeper: LeaderElection and FastLeaderElection (AuthFastLeaderElection
  425. is a variant of FastLeaderElection that uses UDP and allows servers to perform a simple
  426. form of authentication to avoid IP spoofing). ZooKeeper messaging doesn't care about the
  427. exact method of electing a leader as long as the following holds:
  428. </p>
  429. <ul>
  430. <li>
  431. <p>The leader has seen the highest zxid of all the followers.</p>
  432. </li>
  433. <li>
  434. <p>A quorum of servers have committed to following the leader.</p>
  435. </li>
  436. </ul>
  437. <p>
  438. Of these two requirements only the first, the highest zxid among the followers
  439. needs to hold for correct operation. The second requirement, a quorum of followers,
  440. just needs to hold with high probability. We are going to recheck the second requirement,
  441. so if a failure happens during or after the leader election and quorum is lost,
  442. we will recover by abandoning leader activation and running another election.
  443. </p>
  444. <p>
  445. After leader election a single server will be designated as a leader and start
  446. waiting for followers to connect. The rest of the servers will try to connect to
  447. the leader. The leader will sync up with followers by sending any proposals they
  448. are missing, or if a follower is missing too many proposals, it will send a full
  449. snapshot of the state to the follower.
  450. </p>
  451. <p>
  452. There is a corner case in which a follower that has proposals, U, not seen
  453. by a leader arrives. Proposals are seen in order, so the proposals of U will have zxids
  454. higher than zxids seen by the leader. The follower must have arrived after the
  455. leader election, otherwise the follower would have been elected leader given that
  456. it has seen a higher zxid. Since committed proposals must be seen by a quorum of
  457. servers, and a quorum of servers that elected the leader did not see U, the proposals
  458. of U have not been committed, so they can be discarded. When the follower connects
  459. to the leader, the leader will tell the follower to discard U.
  460. </p>
  461. <p>
  462. A new leader establishes a zxid to start using for new proposals by getting the
  463. epoch, e, of the highest zxid it has seen and setting the next zxid to use to be
  464. (e+1, 0). After the leader syncs with a follower, it will propose a NEW_LEADER
  465. proposal. Once the NEW_LEADER proposal has been committed, the leader will activate
  466. and start receiving and issuing proposals.
  467. </p>
  468. <p>
  469. It all sounds complicated but here are the basic rules of operation during leader
  470. activation:
  471. </p>
  472. <ul>
  473. <li>
  474. <p>A follower will ACK the NEW_LEADER proposal after it has synced with the leader.</p>
  475. </li>
  476. <li>
  477. <p>A follower will only ACK a NEW_LEADER proposal with a given zxid from a single server.</p>
  478. </li>
  479. <li>
  480. <p>A new leader will COMMIT the NEW_LEADER proposal when a quorum of followers have ACKed it.</p>
  481. </li>
  482. <li>
  483. <p>A follower will commit any state it received from the leader when the NEW_LEADER proposal is COMMITTED.</p>
  484. </li>
  485. <li>
  486. <p>A new leader will not accept new proposals until the NEW_LEADER proposal has been COMMITTED.</p>
  487. </li>
  488. </ul>
  489. <p>
  490. If leader election terminates erroneously, we don't have a problem since the
  491. NEW_LEADER proposal will not be committed since the leader will not have quorum.
  492. When this happens, the leader and any remaining followers will timeout and go back
  493. to leader election.
  494. </p>
  495. <a name="N100F7"></a><a name="sc_activeMessaging"></a>
  496. <h3 class="h4">Active Messaging</h3>
  497. <p>
  498. Leader Activation does all the heavy lifting. Once the leader is coronated he can
  499. start blasting out proposals. As long as he remains the leader no other leader can
  500. emerge since no other leader will be able to get a quorum of followers. If a new
  501. leader does emerge,
  502. it means that the leader has lost quorum, and the new leader will clean up any
  503. mess left over during her leadership activation.
  504. </p>
  505. <p>ZooKeeper messaging operates similarly to a classic two-phase commit.</p>
  506. <img alt="" src="images/2pc.png"><p>
  507. All communication channels are FIFO, so everything is done in order. Specifically
  508. the following operating constraints are observed:</p>
  509. <ul>
  510. <li>
  511. <p>The leader sends proposals to all followers using
  512. the same order. Moreover, this order follows the order in which requests have been
  513. received. Because we use FIFO channels this means that followers also receive proposals in order.
  514. </p>
  515. </li>
  516. <li>
  517. <p>Followers process messages in the order they are received. This
  518. means that messages will be ACKed in order and the leader will receive ACKs from
  519. followers in order, due to the FIFO channels. It also means that if message <em>m</em>
  520. has been written to non-volatile storage, all messages that were proposed before
  521. <em>m</em> have been written to non-volatile storage.</p>
  522. </li>
  523. <li>
  524. <p>The leader will issue a COMMIT to all followers as soon as a
  525. quorum of followers have ACKed a message. Since messages are ACKed in order,
  526. COMMITs will be sent by the leader, and received by the followers, in order.</p>
  527. </li>
  528. <li>
  529. <p>COMMITs are processed in order. Followers deliver a proposal's
  530. message when that proposal is committed.</p>
  531. </li>
  532. </ul>
  533. <a name="N1011E"></a><a name="sc_summary"></a>
  534. <h3 class="h4">Summary</h3>
  535. <p>So there you go. Why does it work? Specifically, why does the set of proposals
  536. believed by a new leader always contain any proposal that has actually been committed?
  537. First, all proposals have a unique zxid, so unlike other protocols, we never have
  538. to worry about two different values being proposed for the same zxid; followers
  539. (a leader is also a follower) see and record proposals in order; proposals are
  540. committed in order; there is only one active leader at a time since followers only
  541. follow a single leader at a time; a new leader has seen all committed proposals
  542. from the previous epoch since it has seen the highest zxid from a quorum of servers;
  543. any uncommitted proposals from a previous epoch seen by a new leader will be committed
  544. by that leader before it becomes active.</p>
  545. <a name="N10127"></a><a name="sc_comparisons"></a>
  546. <h3 class="h4">Comparisons</h3>
  547. <p>
  548. Isn't this just Multi-Paxos? No, Multi-Paxos requires some way of assuring that
  549. there is only a single coordinator. We do not count on such assurances. Instead
  550. we use the leader activation to recover from leadership change or old leaders
  551. believing they are still active.
  552. </p>
  553. <p>
  554. Isn't this just Paxos? Your active messaging phase looks just like phase 2 of Paxos?
  555. Actually, to us active messaging looks just like 2 phase commit without the need to
  556. handle aborts. Active messaging is different from both in the sense that it has
  557. cross proposal ordering requirements. If we do not maintain strict FIFO ordering of
  558. all packets, it all falls apart. Also, our leader activation phase is different from
  559. both of them. In particular, our use of epochs allows us to skip blocks of uncommitted
  560. proposals and to not worry about duplicate proposals for a given zxid.
  561. </p>
  562. </div>
  563. <a name="N10134"></a><a name="sc_logging"></a>
  564. <h2 class="h3">Logging</h2>
  565. <div class="section">
  566. <p>
  567. ZooKeeper uses
  568. <a href="http://logging.apache.org/log4j">log4j</a>
  569. version 1.2 as its logging infrastructure. For information on configuring log4j for
  570. ZooKeeper, see the <a href="zookeeperAdmin.html#sc_logging">Logging</a> section
  571. of the <a href="zookeeperAdmin.html">ZooKeeper Administrator's Guide.</a>
  572. </p>
  573. <a name="N10149"></a><a name="sc_developerGuidelines"></a>
  574. <h3 class="h4">Developer Guidelines</h3>
  575. <p>Please follow these guidelines when submitting code. Patch reviewers will look for the following:</p>
  576. <a name="N10151"></a><a name="sc_rightLevel"></a>
  577. <h4>Logging at the Right Level</h4>
  578. <p>
  579. There are <a href="http://logging.apache.org/log4j/1.2/apidocs/org/apache/log4j/Level.html#FATAL">6 levels of logging in log4j</a>.
  580. It's important to pick the right one. In order of higher to lower severity:</p>
  581. <ol>
  582. <li>
  583. <p> FATAL level designates very severe error events that will presumably lead the application to abort</p>
  584. </li>
  585. <li>
  586. <p>ERROR level designates error events that might still allow the application to continue running.</p>
  587. </li>
  588. <li>
  589. <p>WARN level designates potentially harmful situations.</p>
  590. </li>
  591. <li>
  592. <p>INFO level designates informational messages that highlight the progress of the application at coarse-grained level.</p>
  593. </li>
  594. <li>
  595. <p>DEBUG Level designates fine-grained informational events that are most useful to debug an application.</p>
  596. </li>
  597. <li>
  598. <p>TRACE Level designates finer-grained informational events than the DEBUG.</p>
  599. </li>
  600. </ol>
  601. <p>
  602. ZooKeeper is typically run in production such that log messages of INFO level
  603. severity and higher (more severe) are output to the log.</p>
  604. <a name="N1017C"></a><a name="sc_log4jIdioms"></a>
  605. <h4>Use of Standard log4j Idioms</h4>
  606. <p>
  607. <em>Static Message Logging</em>
  608. </p>
  609. <pre class="code">
  610. LOG.debug("process completed successfully!");
  611. </pre>
  612. <p>However when creating a message from a number of components (string
  613. concatenation), the log call should be wrapped with an "isXEnabled()" call. This
  614. eliminates the string concatenation overhead when debug level logging is not enabled.
  615. </p>
  616. <pre class="code">
  617. if (LOG.isDebugEnabled()) {
  618. LOG.debug("got " + count + " messages in " + time + " minutes");
  619. }
  620. </pre>
  621. <p>
  622. <em>Naming</em>
  623. </p>
  624. <p>
  625. Loggers should be named after the class in which they are used. (See the
  626. <a href="http://logging.apache.org/log4j/1.2/faq.html#2.4">log4j faq</a>
  627. for reasons why this is a good idea.)
  628. </p>
  629. <pre class="code">
  630. public class Foo {
  631. private static final Logger LOG = Logger.getLogger(Foo.class);
  632. ....
  633. public Foo() {
  634. LOG.info("constructing Foo");
  635. </pre>
  636. <p>
  637. <em>Exception handling</em>
  638. </p>
  639. <pre class="code">
  640. try {
  641. // code
  642. } catch (XYZException e) {
  643. // do this
  644. LOG.error("Something bad happened", e);
  645. // don't do this (generally)
  646. // LOG.error(e);
  647. // why? because "don't do" case hides the stack trace
  648. // continue process here as you need... recover or (re)throw
  649. }
  650. </pre>
  651. </div>
  652. <p align="right">
  653. <font size="-2"></font>
  654. </p>
  655. </div>
  656. <!--+
  657. |end content
  658. +-->
  659. <div class="clearboth">&nbsp;</div>
  660. </div>
  661. <div id="footer">
  662. <!--+
  663. |start bottomstrip
  664. +-->
  665. <div class="lastmodified">
  666. <script type="text/javascript"><!--
  667. document.write("Last Published: " + document.lastModified);
  668. // --></script>
  669. </div>
  670. <div class="copyright">
  671. Copyright &copy;
  672. 2008 <a href="http://www.apache.org/licenses/">The Apache Software Foundation.</a>
  673. </div>
  674. <!--+
  675. |end bottomstrip
  676. +-->
  677. </div>
  678. </body>
  679. </html>