|
@@ -0,0 +1,1540 @@
|
|
|
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
|
|
|
+<html>
|
|
|
+<head>
|
|
|
+<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
|
|
|
+<meta content="Apache Forrest" name="Generator">
|
|
|
+<meta name="Forrest-version" content="0.8">
|
|
|
+<meta name="Forrest-skin-name" content="pelt">
|
|
|
+<title>ZooKeeper Programmer's Guide</title>
|
|
|
+<link type="text/css" href="skin/basic.css" rel="stylesheet">
|
|
|
+<link media="screen" type="text/css" href="skin/screen.css" rel="stylesheet">
|
|
|
+<link media="print" type="text/css" href="skin/print.css" rel="stylesheet">
|
|
|
+<link type="text/css" href="skin/profile.css" rel="stylesheet">
|
|
|
+<script src="skin/getBlank.js" language="javascript" type="text/javascript"></script><script src="skin/getMenu.js" language="javascript" type="text/javascript"></script><script src="skin/fontsize.js" language="javascript" type="text/javascript"></script>
|
|
|
+<link rel="shortcut icon" href="images/favicon.ico">
|
|
|
+</head>
|
|
|
+<body onload="init()">
|
|
|
+<script type="text/javascript">ndeSetTextSize();</script>
|
|
|
+<div id="top">
|
|
|
+<!--+
|
|
|
+ |breadtrail
|
|
|
+ +-->
|
|
|
+<div class="breadtrail">
|
|
|
+<a href="http://www.apache.org/">Apache</a> > <a href="http://hadoop.apache.org/">Hadoop</a> > <a href="http://hadoop.apache.org/zookeeper/">ZooKeeper</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
|
|
|
+</div>
|
|
|
+<!--+
|
|
|
+ |header
|
|
|
+ +-->
|
|
|
+<div class="header">
|
|
|
+<!--+
|
|
|
+ |start group logo
|
|
|
+ +-->
|
|
|
+<div class="grouplogo">
|
|
|
+<a href="http://hadoop.apache.org/"><img class="logoImage" alt="Hadoop" src="images/hadoop-logo.jpg" title="Apache Hadoop"></a>
|
|
|
+</div>
|
|
|
+<!--+
|
|
|
+ |end group logo
|
|
|
+ +-->
|
|
|
+<!--+
|
|
|
+ |start Project Logo
|
|
|
+ +-->
|
|
|
+<div class="projectlogo">
|
|
|
+<a href="http://hadoop.apache.org/zookeeper/"><img class="logoImage" alt="ZooKeeper" src="images/zookeeper_small.gif" title="The Hadoop database"></a>
|
|
|
+</div>
|
|
|
+<!--+
|
|
|
+ |end Project Logo
|
|
|
+ +-->
|
|
|
+<!--+
|
|
|
+ |start Search
|
|
|
+ +-->
|
|
|
+<div class="searchbox">
|
|
|
+<form action="http://www.google.com/search" method="get" class="roundtopsmall">
|
|
|
+<input value="hadoop.apache.org" name="sitesearch" type="hidden"><input onFocus="getBlank (this, 'Search the site with google');" size="25" name="q" id="query" type="text" value="Search the site with google">
|
|
|
+ <input name="Search" value="Search" type="submit">
|
|
|
+</form>
|
|
|
+</div>
|
|
|
+<!--+
|
|
|
+ |end search
|
|
|
+ +-->
|
|
|
+<!--+
|
|
|
+ |start Tabs
|
|
|
+ +-->
|
|
|
+<ul id="tabs">
|
|
|
+<li>
|
|
|
+<a class="unselected" href="http://hadoop.apache.org/zookeeper/">Project</a>
|
|
|
+</li>
|
|
|
+<li>
|
|
|
+<a class="unselected" href="http://wiki.apache.org/hadoop/ZooKeeper">Wiki</a>
|
|
|
+</li>
|
|
|
+<li class="current">
|
|
|
+<a class="selected" href="index.html">ZooKeeper Documentation</a>
|
|
|
+</li>
|
|
|
+</ul>
|
|
|
+<!--+
|
|
|
+ |end Tabs
|
|
|
+ +-->
|
|
|
+</div>
|
|
|
+</div>
|
|
|
+<div id="main">
|
|
|
+<div id="publishedStrip">
|
|
|
+<!--+
|
|
|
+ |start Subtabs
|
|
|
+ +-->
|
|
|
+<div id="level2tabs"></div>
|
|
|
+<!--+
|
|
|
+ |end Endtabs
|
|
|
+ +-->
|
|
|
+<script type="text/javascript"><!--
|
|
|
+document.write("Last Published: " + document.lastModified);
|
|
|
+// --></script>
|
|
|
+</div>
|
|
|
+<!--+
|
|
|
+ |breadtrail
|
|
|
+ +-->
|
|
|
+<div class="breadtrail">
|
|
|
+
|
|
|
+
|
|
|
+ </div>
|
|
|
+<!--+
|
|
|
+ |start Menu, mainarea
|
|
|
+ +-->
|
|
|
+<!--+
|
|
|
+ |start Menu
|
|
|
+ +-->
|
|
|
+<div id="menu">
|
|
|
+<div onclick="SwitchMenu('menu_selected_1.1', 'skin/')" id="menu_selected_1.1Title" class="menutitle" style="background-image: url('skin/images/chapter_open.gif');">Documentation</div>
|
|
|
+<div id="menu_selected_1.1" class="selectedmenuitemgroup" style="display: block;">
|
|
|
+<div class="menuitem">
|
|
|
+<a href="index.html">Welcome</a>
|
|
|
+</div>
|
|
|
+<div class="menuitem">
|
|
|
+<a href="zookeeperOver.html">ZooKeeper Overview</a>
|
|
|
+</div>
|
|
|
+<div class="menuitem">
|
|
|
+<a href="zookeeperStarted.html">Getting Started</a>
|
|
|
+</div>
|
|
|
+<div class="menupage">
|
|
|
+<div class="menupagetitle">Programmer's Guide</div>
|
|
|
+</div>
|
|
|
+<div class="menuitem">
|
|
|
+<a href="recipes.html">Recipes</a>
|
|
|
+</div>
|
|
|
+<div class="menuitem">
|
|
|
+<a href="zookeeperAdmin.html">Administrator's Guide</a>
|
|
|
+</div>
|
|
|
+<div class="menuitem">
|
|
|
+<a href="api/index.html">API Docs</a>
|
|
|
+</div>
|
|
|
+<div class="menuitem">
|
|
|
+<a href="http://wiki.apache.org/hadoop/ZooKeeper">Wiki</a>
|
|
|
+</div>
|
|
|
+<div class="menuitem">
|
|
|
+<a href="http://wiki.apache.org/hadoop/ZooKeeper/FAQ">FAQ</a>
|
|
|
+</div>
|
|
|
+<div class="menuitem">
|
|
|
+<a href="http://hadoop.apache.org/zookeeper/mailing_lists.html">Mailing Lists</a>
|
|
|
+</div>
|
|
|
+<div class="menuitem">
|
|
|
+<a href="zookeeperOtherInfo.html">Other Info</a>
|
|
|
+</div>
|
|
|
+</div>
|
|
|
+<div id="credit"></div>
|
|
|
+<div id="roundbottom">
|
|
|
+<img style="display: none" class="corner" height="15" width="15" alt="" src="skin/images/rc-b-l-15-1body-2menu-3menu.png"></div>
|
|
|
+<!--+
|
|
|
+ |alternative credits
|
|
|
+ +-->
|
|
|
+<div id="credit2"></div>
|
|
|
+</div>
|
|
|
+<!--+
|
|
|
+ |end Menu
|
|
|
+ +-->
|
|
|
+<!--+
|
|
|
+ |start content
|
|
|
+ +-->
|
|
|
+<div id="content">
|
|
|
+<div title="Portable Document Format" class="pdflink">
|
|
|
+<a class="dida" href="zookeeperProgrammers.pdf"><img alt="PDF -icon" src="skin/images/pdfdoc.gif" class="skin"><br>
|
|
|
+ PDF</a>
|
|
|
+</div>
|
|
|
+<div id="minitoc-area">
|
|
|
+<ul class="minitoc">
|
|
|
+<li>
|
|
|
+<a href="#The+ZooKeeper+Data+Model">The ZooKeeper Data Model</a>
|
|
|
+<ul class="minitoc">
|
|
|
+<li>
|
|
|
+<a href="#sc_zkDataModel_znodes">ZNodes</a>
|
|
|
+<ul class="minitoc">
|
|
|
+<li>
|
|
|
+<a href="#sc_zkDataMode_watches">Watches</a>
|
|
|
+</li>
|
|
|
+<li>
|
|
|
+<a href="#Data+Access">Data Access</a>
|
|
|
+</li>
|
|
|
+<li>
|
|
|
+<a href="#Ephemeral+Nodes">Ephemeral Nodes</a>
|
|
|
+</li>
|
|
|
+<li>
|
|
|
+<a href="#Unique+Naming">Unique Naming</a>
|
|
|
+</li>
|
|
|
+</ul>
|
|
|
+</li>
|
|
|
+<li>
|
|
|
+<a href="#sc_timeInZk">Time in ZooKeeper</a>
|
|
|
+</li>
|
|
|
+<li>
|
|
|
+<a href="#sc_zkStatStructure">ZooKeeper Stat Structure</a>
|
|
|
+</li>
|
|
|
+</ul>
|
|
|
+</li>
|
|
|
+<li>
|
|
|
+<a href="#ZooKeeper+Sessions">ZooKeeper Sessions</a>
|
|
|
+</li>
|
|
|
+<li>
|
|
|
+<a href="#ZooKeeper+Watches">ZooKeeper Watches</a>
|
|
|
+<ul class="minitoc">
|
|
|
+<li>
|
|
|
+<a href="#sc_WatchGuarantees">What ZooKeeper Guarantees about Watches</a>
|
|
|
+</li>
|
|
|
+<li>
|
|
|
+<a href="#sc_WatchRememberThese">Things to Remember about Watches</a>
|
|
|
+</li>
|
|
|
+</ul>
|
|
|
+</li>
|
|
|
+<li>
|
|
|
+<a href="#Consistency+Guarantees">Consistency Guarantees</a>
|
|
|
+</li>
|
|
|
+<li>
|
|
|
+<a href="#Bindings">Bindings</a>
|
|
|
+<ul class="minitoc">
|
|
|
+<li>
|
|
|
+<a href="#Java+Binding">Java Binding</a>
|
|
|
+</li>
|
|
|
+<li>
|
|
|
+<a href="#C+Binding">C Binding</a>
|
|
|
+<ul class="minitoc">
|
|
|
+<li>
|
|
|
+<a href="#Installation">Installation</a>
|
|
|
+</li>
|
|
|
+<li>
|
|
|
+<a href="#Using+the+Client">Using the Client</a>
|
|
|
+</li>
|
|
|
+</ul>
|
|
|
+</li>
|
|
|
+</ul>
|
|
|
+</li>
|
|
|
+<li>
|
|
|
+<a href="#Building+Blocks%3A+A+Guide+to+ZooKeeper+Operations">Building Blocks: A Guide to ZooKeeper Operations</a>
|
|
|
+</li>
|
|
|
+<li>
|
|
|
+<a href="#Program+Structure%2C+with+Simple+Example">Program Structure, with Simple Example</a>
|
|
|
+</li>
|
|
|
+<li>
|
|
|
+<a href="#Gotchas%3A+Common+Problems+and+Troubleshooting">Gotchas: Common Problems and Troubleshooting</a>
|
|
|
+</li>
|
|
|
+</ul>
|
|
|
+</div>
|
|
|
+
|
|
|
+<title>ZooKeeper Programmer's Guide</title>
|
|
|
+
|
|
|
+
|
|
|
+<subtitle>Developing Distributed Applications that use ZooKeeper</subtitle>
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+<a name="_introduction"></a>
|
|
|
+<preface id="_introduction">
|
|
|
+
|
|
|
+<title>Introduction</title>
|
|
|
+
|
|
|
+
|
|
|
+<p>This document is a guide for developers wishing to create
|
|
|
+ distributed applications that take advantage of ZooKeeper's coordination
|
|
|
+ services. It contains conceptual and practical information.</p>
|
|
|
+
|
|
|
+
|
|
|
+<p>The first four chapters of this guide present higher level
|
|
|
+ discussions of various ZooKeeper concepts. These are necessary both for an
|
|
|
+ understanding of how ZooKeeper works as well as how to work with it. It does
|
|
|
+ not contain source code, but it does assume a familiarity with the
|
|
|
+ problems associated with distributed computing. The chapters in this first
|
|
|
+ group are:</p>
|
|
|
+
|
|
|
+
|
|
|
+<ul>
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>
|
|
|
+<a href="#ch_zkDataModel">The ZooKeeper Data Model</a>
|
|
|
+</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>
|
|
|
+<a href="#ch_zkSessions">ZooKeeper Sessions</a>
|
|
|
+</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>
|
|
|
+<a href="#ch_zkWatches">ZooKeeper Watches</a>
|
|
|
+</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>
|
|
|
+<a href="#ch_zkGuarantees">Consistency Guarantees</a>
|
|
|
+</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+</ul>
|
|
|
+
|
|
|
+
|
|
|
+<p>The next four chapters of this guide provide practical programming
|
|
|
+ information. These are:</p>
|
|
|
+
|
|
|
+
|
|
|
+<ul>
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>
|
|
|
+<a href="#ch_guideToZkOperations">Building Blocks: A Guide to ZooKeeper Operations</a>
|
|
|
+</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>
|
|
|
+<a href="#ch_bindings">Bindings</a>
|
|
|
+</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>
|
|
|
+<a href="#ch_programStructureWithExample">Program Structure, with Simple Example</a>
|
|
|
+
|
|
|
+<remark>[tbd]</remark>
|
|
|
+</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>
|
|
|
+<a href="#ch_gotchas">Gotchas: Common Problems and Troubleshooting</a>
|
|
|
+</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+</ul>
|
|
|
+
|
|
|
+
|
|
|
+<p>The book concludes with an <a href="#apx_linksToOtherInfo">appendix</a> containing links to other
|
|
|
+ useful, ZooKeeper-related information.</p>
|
|
|
+
|
|
|
+
|
|
|
+<p>Most of the information in this document is written to be accessible as
|
|
|
+ stand-alone reference material. However, before starting your first
|
|
|
+ ZooKeeper application, you should probably at least read the chapters on
|
|
|
+ the <a href="#ch_zkDataModel">ZooKeeper Data Model</a> and <a href="#ch_guideToZkOperations">ZooKeeper Basic Operations</a>. Also,
|
|
|
+ the <a href="#ch_programStructureWithExample">Simple Programming
|
|
|
+ Example</a>
|
|
|
+<remark>[tbd]</remark> is helpful for understanding the basic
|
|
|
+ structure of a ZooKeeper client application.</p>
|
|
|
+
|
|
|
+</preface>
|
|
|
+
|
|
|
+
|
|
|
+<a name="N1007F"></a><a name="The+ZooKeeper+Data+Model"></a>
|
|
|
+<h2 class="h3">The ZooKeeper Data Model</h2>
|
|
|
+<div class="section">
|
|
|
+<p>ZooKeeper has a hierarchical name space, much like a distributed file
|
|
|
+ system. The only difference is that each node in the namespace can have
|
|
|
+ data associated with it as well as children. It is like having a file
|
|
|
+ system that allows a file to also be a directory. Paths to nodes are
|
|
|
+ always expressed as canonical, absolute, slash-separated paths; there are
|
|
|
+ no relative references. Any Unicode character can be used in a path subject
|
|
|
+ to the following constraints:</p>
|
|
|
+<ul>
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>The null character (\u0000) cannot be part of a path name. (This
|
|
|
+ causes problems with the C binding.)</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>The following characters can't be used because they don't
|
|
|
+ display well, or render in confusing ways: \u0001 - \u0019 and \u007F
|
|
|
+ - \u009F.</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>The following characters are not allowed because <remark>[tbd:
|
|
|
+ do we need reasons?]</remark>: \ud800 - \uF8FF, \uFFF0 - \uFFFF, \uXFFFE -
|
|
|
+ \uXFFFF (where X is a digit 1 - E), \uF0000 - \uFFFFF.</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>The "." character can be used as part of another name, but "."
|
|
|
+ and ".." cannot alone make up the whole name of a path location,
|
|
|
+ because ZooKeeper doesn't use relative paths. The following would be
|
|
|
+ invalid: "/a/b/./c" or "/a/b/../c".</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>The token "zookeeper" is reserved.</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+</ul>
|
|
|
+<a name="N100AC"></a><a name="sc_zkDataModel_znodes"></a>
|
|
|
+<h3 class="h4">ZNodes</h3>
|
|
|
+<p>Every node in a ZooKeeper tree is referred to as a
|
|
|
+ <em>znode</em>. Znodes maintain a stat structure that
|
|
|
+ includes version numbers for data changes and ACL changes. The stat
|
|
|
+ structure also has timestamps. The version number, together with the
|
|
|
+ timestamp, allows ZooKeeper to validate the cache and to coordinate
|
|
|
+ updates. Each time a znode's data changes, the version number increases.
|
|
|
+ For instance, whenever a client retrieves data, it also receives the
|
|
|
+ version of the data. And when a client performs an update or a delete,
|
|
|
+ it must supply the version of the data of the znode it is changing. If
|
|
|
+ the version it supplies doesn't match the actual version of the data,
|
|
|
+ the update will fail. (This behavior can be overridden. For more
|
|
|
+ information see... <remark>[tbd... reference here to the section
|
|
|
+ describing the special version number -1]</remark>)
|
|
|
+</p>
|
|
|
+<div class="note">
|
|
|
+<div class="label">Note</div>
|
|
|
+<div class="content">
|
|
|
+
|
|
|
+<p>In distributed application engineering, the word
|
|
|
+ <em>node</em> can refer to a generic host machine, a
|
|
|
+ server, a member of quorums, a client process, etc. In the ZooKeeper
|
|
|
+ documentation, <em>znodes</em> refer to the data nodes.
|
|
|
+ <em>Servers</em> refer to machines that make up the
|
|
|
+ ZooKeeper service; <em>quorum peers</em> refer to the
|
|
|
+ servers that make up a quorum; client refers to any host or process
|
|
|
+ which uses a ZooKeeper service.</p>
|
|
|
+
|
|
|
+</div>
|
|
|
+</div>
|
|
|
+<p>Znodes are the main entity that a programmer accesses. They have
|
|
|
+ several characteristics that are worth mentioning here.</p>
|
|
|
+<a name="N100CF"></a><a name="sc_zkDataMode_watches"></a>
|
|
|
+<h4>Watches</h4>
|
|
|
+<p>Clients can set watches on znodes. Changes to that znode trigger
|
|
|
+ the watch and then clear the watch. When a watch triggers, ZooKeeper
|
|
|
+ sends the client a notification. More information about watches can be
|
|
|
+ found in the section
|
|
|
+ <a href="#ZooKeeper+Watches">
|
|
|
+ ZooKeeper Watches</a>.
|
|
|
+ <remark>[tbd: fix this link] [tbd: Ben there is note from to emphasize
|
|
|
+ that "it is queued". What is "it" and is what we have here
|
|
|
+ sufficient?]</remark>
|
|
|
+</p>
|
|
|
+<a name="N100DF"></a><a name="Data+Access"></a>
|
|
|
+<h4>Data Access</h4>
|
|
|
+<p>The data stored at each znode in a namespace is read and written
|
|
|
+ atomically. Reads get all the data bytes associated with a znode and a
|
|
|
+ write replaces all the data. Each node has an Access Control List
|
|
|
+ (ACL) that restricts who can do what.</p>
|
|
|
+<a name="N100E9"></a><a name="Ephemeral+Nodes"></a>
|
|
|
+<h4>Ephemeral Nodes</h4>
|
|
|
+<p>ZooKeeper also has the notion of ephemeral nodes. These znodes
|
|
|
+ exist as long as the session that created the znode is active. When
|
|
|
+ the session ends the znode is deleted. Because of this behavior
|
|
|
+ ephemeral znodes are not allowed to have children.</p>
|
|
|
+<a name="N100F3"></a><a name="Unique+Naming"></a>
|
|
|
+<h4>Unique Naming</h4>
|
|
|
+<p>Finally, when you create a znode, you can request that ZooKeeper
|
|
|
+ append a monotonically increasing counter to the path name
|
|
|
+ of the znode to be requested. This counter is unique to the parent
|
|
|
+ znode.</p>
|
|
|
+<a name="N100FE"></a><a name="sc_timeInZk"></a>
|
|
|
+<h3 class="h4">Time in ZooKeeper</h3>
|
|
|
+<p>ZooKeeper tracks time in multiple ways:</p>
|
|
|
+<ul>
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>
|
|
|
+<strong>Zxid</strong>
|
|
|
+</p>
|
|
|
+
|
|
|
+
|
|
|
+<p>Every change to the ZooKeeper state receives a stamp in the
|
|
|
+ form of a <em>zxid</em> (ZooKeeper Transaction Id).
|
|
|
+ This exposes the total ordering of all changes to ZooKeeper. Each
|
|
|
+ change will have a unique zxid and if zxid1 is smaller than zxid2
|
|
|
+ then zxid1 happened before zxid2.</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>
|
|
|
+<strong>Version numbers</strong>
|
|
|
+</p>
|
|
|
+
|
|
|
+
|
|
|
+<p>Every change to a node will cause an increase to one of the
|
|
|
+ version numbers of that node. The three version numbers are version
|
|
|
+ (number of changes to the data of a znode), cversion (number of
|
|
|
+ changes to the children of a znode), and aversion (number of changes
|
|
|
+ to the ACL of a znode).</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>
|
|
|
+<strong>Ticks</strong>
|
|
|
+</p>
|
|
|
+
|
|
|
+
|
|
|
+<p>When using multi-server ZooKeeper, servers use ticks to define
|
|
|
+ timing of events such as status uploads, session timeouts,
|
|
|
+ connection timeouts between peers, etc. The tick time is only
|
|
|
+ indirectly exposed through the minimum session timeout (2 times the
|
|
|
+ tick time); if a client requests a session timeout less than the
|
|
|
+ minimum session timeout, the server will tell the client that the
|
|
|
+ session timeout is actually the minimum session timeout.</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>
|
|
|
+<strong>Real time</strong>
|
|
|
+</p>
|
|
|
+
|
|
|
+
|
|
|
+<p>ZooKeeper doesn't use real time, or clock time, at all except
|
|
|
+ to put timestamps into the stat structure on znode creation and
|
|
|
+ znode modification.</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+</ul>
|
|
|
+<a name="N10136"></a><a name="sc_zkStatStructure"></a>
|
|
|
+<h3 class="h4">ZooKeeper Stat Structure</h3>
|
|
|
+<p>The Stat structure for each znode in ZooKeeper is made up of the
|
|
|
+ following fields:</p>
|
|
|
+<ul>
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>
|
|
|
+<strong>czxid</strong>
|
|
|
+</p>
|
|
|
+
|
|
|
+
|
|
|
+<p>The zxid of the change that caused this znode to be
|
|
|
+ created.</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>
|
|
|
+<strong>mzxid</strong>
|
|
|
+</p>
|
|
|
+
|
|
|
+
|
|
|
+<p>The zxid of the change that last modified this znode.</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>
|
|
|
+<strong>ctime</strong>
|
|
|
+</p>
|
|
|
+
|
|
|
+
|
|
|
+<p>The time in milliseconds from epoch when this znode was
|
|
|
+ created.</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>
|
|
|
+<strong>mtime</strong>
|
|
|
+</p>
|
|
|
+
|
|
|
+
|
|
|
+<p>The time in milliseconds from epoch when this znode was last
|
|
|
+ modified.</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>
|
|
|
+<strong>version</strong>
|
|
|
+</p>
|
|
|
+
|
|
|
+
|
|
|
+<p>The number of changes to the data of this znode.</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>
|
|
|
+<strong>cversion</strong>
|
|
|
+</p>
|
|
|
+
|
|
|
+
|
|
|
+<p>The number of changes to the children of this znode.</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>
|
|
|
+<strong>aversion</strong>
|
|
|
+</p>
|
|
|
+
|
|
|
+
|
|
|
+<p>The number of changes to the ACL of this znode.</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>
|
|
|
+<strong>ephemeralOwner</strong>
|
|
|
+</p>
|
|
|
+
|
|
|
+
|
|
|
+<p>The session id of the owner of this znode if the znode is an
|
|
|
+ ephemeral node. If it is not an ephemeral node, it will be
|
|
|
+ zero.</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+</ul>
|
|
|
+</div>
|
|
|
+
|
|
|
+
|
|
|
+<a name="N10194"></a><a name="ZooKeeper+Sessions"></a>
|
|
|
+<h2 class="h3">ZooKeeper Sessions</h2>
|
|
|
+<div class="section">
|
|
|
+<p>When a client gets a handle to the ZooKeeper service, ZooKeeper
|
|
|
+ creates a ZooKeeper session, represented as a 64-bit number, that it
|
|
|
+ assigns to the client. If the client connects to a different ZooKeeper
|
|
|
+ server, it will send the session id as a part of the connection handshake.
|
|
|
+ As a security measure, the server creates a password for the session id
|
|
|
+ that any ZooKeeper server can validate. <remark>[tbd: note from Ben:
|
|
|
+ "perhaps capability is a better word." need clarification on that.]
|
|
|
+ </remark>The password is sent to the client with the session id when the
|
|
|
+ client establishes the session. The client sends this password with the
|
|
|
+ session id whenever it reestablishes the session with a new server.</p>
|
|
|
+<p>One of the parameters to the ZooKeeper client library call to create
|
|
|
+ a ZooKeeper session is the session timeout in milliseconds. The client
|
|
|
+ sends a requested timeout, the server responds with the timeout that it
|
|
|
+ can give the client. The current implementation requires that the timeout
|
|
|
+ be between 2 times the tickTime (as set in the server configuration) and
|
|
|
+ 60 seconds.</p>
|
|
|
+<p>The session is kept alive by requests sent by the client. If the
|
|
|
+ session is idle for a period of time that would timeout the session, the
|
|
|
+ client will send a PING request to keep the session alive. This PING
|
|
|
+ request not only allows the ZooKeeper server to know that the client is
|
|
|
+ still active, but it also allows the client to verify that its connection
|
|
|
+ to the ZooKeeper server is still active. The timing of the PING is
|
|
|
+ conservative enough to ensure reasonable time to detect a dead connection
|
|
|
+ and reconnect to a new server.</p>
|
|
|
+</div>
|
|
|
+
|
|
|
+
|
|
|
+<a name="N101A7"></a><a name="ZooKeeper+Watches"></a>
|
|
|
+<h2 class="h3">ZooKeeper Watches</h2>
|
|
|
+<div class="section">
|
|
|
+<p>All of the read operations in ZooKeeper - <strong>getData()</strong>, <strong>getChildren()</strong>, and <strong>exists()</strong> - have the option of setting a watch as a
|
|
|
+ side effect. Here is ZooKeeper's definition of a watch: a watch event is
|
|
|
+ a one-time trigger, sent to the client that set the watch, which occurs when
|
|
|
+ the data for which the watch was set changes. There are three key points
|
|
|
+ to consider in this definition of a watch:</p>
|
|
|
+<ul>
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>
|
|
|
+<strong>One-time trigger</strong>
|
|
|
+</p>
|
|
|
+
|
|
|
+
|
|
|
+<p>One watch event will be sent to the client when the data has changed.
|
|
|
+ For example, if a client does a getData("/znode1", true) and later the
|
|
|
+ data for /znode1 is changed or deleted, the client will get a watch
|
|
|
+ event for /znode1. If /znode1 changes again, no watch event will be
|
|
|
+ sent unless the client has done another read that sets a new
|
|
|
+ watch.</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>
|
|
|
+<strong>Sent to the client</strong>
|
|
|
+</p>
|
|
|
+
|
|
|
+
|
|
|
+<p>This implies that an event is on the way to the client, but may
|
|
|
+ not reach the client before the successful return code to the change
|
|
|
+ operation reaches the client that initiated the change. Watches are
|
|
|
+ sent asynchronously to watchers. ZooKeeper provides an ordering
|
|
|
+ guarantee: a client will never see a change for which it has set a
|
|
|
+ watch until it first sees the watch event. Network delays or other
|
|
|
+ factors may cause different clients to see watches and return codes
|
|
|
+ from updates at different times. The key point is that everything seen
|
|
|
+ by the different clients will have a consistent order.</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>
|
|
|
+<strong>The data for which the watch was
|
|
|
+ set</strong>
|
|
|
+</p>
|
|
|
+
|
|
|
+
|
|
|
+<p>This refers to the different ways a node can change. ZooKeeper
|
|
|
+ maintains two lists of watches: data watches and child watches.
|
|
|
+ getData() and exists() set data watches. getChildren() sets child
|
|
|
+ watches. Thus, setData() will trigger data watches for the znode being
|
|
|
+ set (assuming the set is successful). A successful create() will
|
|
|
+ trigger a data watch for the znode being created and a child watch for
|
|
|
+ the parent znode. A successful delete() will trigger both a data watch
|
|
|
+ and a child watch (since there can be no more children) for a znode
|
|
|
+ being deleted as well as a child watch for the parent znode.</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+</ul>
|
|
|
+<p>Watches are maintained locally at the ZooKeeper server to which the
|
|
|
+ client is connected. This allows watches to be lightweight to set,
|
|
|
+ maintain, and dispatch. It also means if a client connects to a different
|
|
|
+ server, the new server is not going to know about its watches. So, when a
|
|
|
+ client gets a disconnect event, it must consider that an implicit trigger
|
|
|
+ of all watches. When a client reconnects to a new server, the client
|
|
|
+ should re-set any watches that it is still interested in.</p>
|
|
|
+<a name="N101DD"></a><a name="sc_WatchGuarantees"></a>
|
|
|
+<h3 class="h4">What ZooKeeper Guarantees about Watches</h3>
|
|
|
+<p>With regard to watches, ZooKeeper maintains these
|
|
|
+ guarantees:</p>
|
|
|
+<ul>
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>Watches are ordered with respect to other events, other
|
|
|
+ watches, and asynchronous replies. The ZooKeeper client libraries
|
|
|
+ ensure that everything is dispatched in order.</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+</ul>
|
|
|
+<ul>
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>A client will see a watch event for a znode it is watching
|
|
|
+ before seeing the new data that corresponds to that znode.</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+</ul>
|
|
|
+<ul>
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>The order of watch events from ZooKeeper corresponds to the
|
|
|
+ order of the updates as seen by the ZooKeeper service.</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+</ul>
|
|
|
+<a name="N10202"></a><a name="sc_WatchRememberThese"></a>
|
|
|
+<h3 class="h4">Things to Remember about Watches</h3>
|
|
|
+<ul>
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>Watches are one-time triggers; if you get a watch event and
|
|
|
+ you want to get notified of future changes, you must set another
|
|
|
+ watch.</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+</ul>
|
|
|
+<ul>
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>Because watches are one-time triggers and there is latency
|
|
|
+ between getting the event and sending a new request to get a watch,
|
|
|
+ you cannot reliably see every change that happens to a node in
|
|
|
+ ZooKeeper. Be prepared to handle the case where the znode changes
|
|
|
+ multiple times between getting the event and setting the watch
|
|
|
+ again. (You may not care, but at least realize it may
|
|
|
+ happen.)</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+</ul>
|
|
|
+<ul>
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>When you disconnect from a server (for example, when the
|
|
|
+ server fails), all of the watches you have registered are lost, so
|
|
|
+ you should treat this case as if all your watches were
|
|
|
+ triggered.</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+</ul>
|
|
|
+</div>
|
|
|
+
|
|
|
+
|
|
|
+<a name="N10225"></a><a name="Consistency+Guarantees"></a>
|
|
|
+<h2 class="h3">Consistency Guarantees</h2>
|
|
|
+<div class="section">
|
|
|
+<p>ZooKeeper is a high performance, scalable service. Both read and
|
|
|
+ write operations are designed to be fast, though reads are faster than
|
|
|
+ writes. The reason for this is that in the case of reads, ZooKeeper can
|
|
|
+ serve older data, which in turn is due to ZooKeeper's consistency
|
|
|
+ guarantees:</p>
|
|
|
+<dl>
|
|
|
+
|
|
|
+<dt>
|
|
|
+<term>Sequential Consistency</term>
|
|
|
+</dt>
|
|
|
+<dd>
|
|
|
+<p>Updates from a client will be applied in the order that they
|
|
|
+ were sent.</p>
|
|
|
+</dd>
|
|
|
+
|
|
|
+
|
|
|
+<dt>
|
|
|
+<term>Atomicity</term>
|
|
|
+</dt>
|
|
|
+<dd>
|
|
|
+<p>Updates either succeed or fail -- there are no partial
|
|
|
+ results.</p>
|
|
|
+</dd>
|
|
|
+
|
|
|
+
|
|
|
+<dt>
|
|
|
+<term>Single System Image</term>
|
|
|
+</dt>
|
|
|
+<dd>
|
|
|
+<p>A client will see the same view of the service regardless of
|
|
|
+ the server that it connects to.</p>
|
|
|
+</dd>
|
|
|
+
|
|
|
+
|
|
|
+<dt>
|
|
|
+<term>Reliability</term>
|
|
|
+</dt>
|
|
|
+<dd>
|
|
|
+<p>Once an update has been applied, it will persist from that
|
|
|
+ time forward until a client overwrites the update. This guarantee
|
|
|
+ has two corollaries:</p>
|
|
|
+<ol>
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>If a client gets a successful return code, the update will
|
|
|
+ have been applied. On some failures (communication errors,
|
|
|
+ timeouts, etc) the client will not know if the update has
|
|
|
+ been applied or not. We take steps to minimize the failures, but the
|
|
|
+ guarantee is present only with successful return codes.
|
|
|
+ (This is called the <em>monotonicity condition</em> in Paxos.)</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>Any updates that are seen by the client, through a read
|
|
|
+ request or successful update, will never be rolled back when
|
|
|
+ recovering from server failures.</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+</ol>
|
|
|
+</dd>
|
|
|
+
|
|
|
+
|
|
|
+<dt>
|
|
|
+<term>Timeliness</term>
|
|
|
+</dt>
|
|
|
+<dd>
|
|
|
+<p>The client's view of the system is guaranteed to be up-to-date
|
|
|
+ within a certain time bound. (On the order of tens of seconds.)
|
|
|
+ Either system changes will be seen by a client within this bound, or
|
|
|
+ the client will detect a service outage.</p>
|
|
|
+</dd>
|
|
|
+
|
|
|
+</dl>
|
|
|
+<p>Using these consistency guarantees it is easy to build higher level
|
|
|
+ functions such as leader election, barriers, queues, and read/write
|
|
|
+ revocable locks solely at the ZooKeeper client (no additions needed to
|
|
|
+ ZooKeeper). See <a href="recipes.html">Recipes and Solutions</a>
|
|
|
+ for more details.</p>
|
|
|
+<p>
|
|
|
+<div class="note">
|
|
|
+<div class="label">Note</div>
|
|
|
+<div class="content">
|
|
|
+
|
|
|
+<p>Sometimes developers mistakenly assume one other guarantee that
|
|
|
+ ZooKeeper does <em>not</em> in fact make. This is:</p>
|
|
|
+
|
|
|
+
|
|
|
+<dl>
|
|
|
+
|
|
|
+<dt>
|
|
|
+<term>Simultaneously Consistent Cross-Client Views</term>
|
|
|
+</dt>
|
|
|
+<dd>
|
|
|
+<p>ZooKeeper does not guarantee that at every instant in
|
|
|
+ time, two different clients will have identical views of
|
|
|
+ ZooKeeper data. Due to factors like network delays, one client
|
|
|
+ may perform an update before another client gets notified of the
|
|
|
+ change. Consider the scenario of two clients, A and B. If client
|
|
|
+ A sets the value of a znode /a from 0 to 1, then tells client B
|
|
|
+ to read /a, client B may read the old value of 0, depending on
|
|
|
+ which server in the ZooKeeper quorum it is connected to. If it
|
|
|
+ is important that Client A and Client B read the same value,
|
|
|
+ Client B should call the <strong>sync()</strong> method from the ZooKeeper API
|
|
|
+ before it performs its read.</p>
|
|
|
+<p>So, ZooKeeper by itself doesn't guarantee instantaneous,
|
|
|
+ atomic, synchronization across its quorum, but ZooKeeper
|
|
|
+ primitives can be used to construct higher level functions that
|
|
|
+ provide complete client synchronization. (For more information,
|
|
|
+ see the <a href="recipes.html#sc_recipes_Locks">Locks</a>
|
|
|
+
|
|
|
+<remark>[tbd: fix final link target]</remark> in <a href="recipes.html">Zookeeper Recipes</a>.
|
|
|
+ <remark>[tbd: fix final link target]</remark>).</p>
|
|
|
+</dd>
|
|
|
+
|
|
|
+</dl>
|
|
|
+
|
|
|
+</div>
|
|
|
+</div>
|
|
|
+</p>
|
|
|
+</div>
|
|
|
+
|
|
|
+
|
|
|
+<a name="N10291"></a><a name="Bindings"></a>
|
|
|
+<h2 class="h3">Bindings</h2>
|
|
|
+<div class="section">
|
|
|
+<p>The ZooKeeper client libraries come in two languages: Java and C.
|
|
|
+ The following sections describe these.</p>
|
|
|
+<a name="N1029A"></a><a name="Java+Binding"></a>
|
|
|
+<h3 class="h4">Java Binding</h3>
|
|
|
+<p>There are two packages that make up the ZooKeeper Java binding:
|
|
|
+ <strong>org.apache.zookeeper</strong> and <strong>org.apache.zookeeper.data</strong>. The rest of the
|
|
|
+ packages that make up ZooKeeper are used internally or are part of the
|
|
|
+ server implementation. The <strong>org.apache.zookeeper.data</strong> package is made up of
|
|
|
+ generated classes that are used simply as containers.</p>
|
|
|
+<p>The main class used by a ZooKeeper Java client is the <strong>ZooKeeper</strong> class. Its two constructors differ only
|
|
|
+ by an optional session id and password. ZooKeeper supports session
|
|
|
+ recovery across instances of a process. A Java program may save its
|
|
|
+ session id and password to stable storage, restart, and recover the
|
|
|
+ session that was used by the earlier instance of the program.</p>
|
|
|
+<p>When a ZooKeeper object is created, two threads are created as
|
|
|
+ well: an IO thread and an event thread. All IO happens on the IO thread
|
|
|
+ (using Java NIO). All event callbacks happen on the event thread.
|
|
|
+ Session maintenance such as reconnecting to ZooKeeper servers and
|
|
|
+ maintaining heartbeat is done on the IO thread. Responses for
|
|
|
+ synchronous methods are also processed in the IO thread. All responses
|
|
|
+ to asynchronous methods and watch events are processed on the event
|
|
|
+ thread. There are a few things to notice that result from this
|
|
|
+ design:</p>
|
|
|
+<ul>
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>All completions for asynchronous calls and watcher callbacks
|
|
|
+ will be made in order, one at a time. The caller can do any
|
|
|
+ processing they wish, but no other callbacks will be processed
|
|
|
+ during that time.</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>Callbacks do not block the processing of the IO thread or the
|
|
|
+ processing of the synchronous calls.</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>Synchronous calls may not return in the correct order. For
|
|
|
+ example, assume a client does the following processing: issues an
|
|
|
+ asynchronous read of node <strong>/a</strong> with
|
|
|
+ <em>watch</em> set to true, and then in the completion
|
|
|
+ callback of the read it does a synchronous read of <strong>/a</strong>. (Maybe not good practice, but not illegal
|
|
|
+ either, and it makes for a simple example.)</p>
|
|
|
+
|
|
|
+
|
|
|
+<p>Note that if there is a change to <strong>/a</strong> between the asynchronous read and the
|
|
|
+ synchronous read, the client library will receive the watch event
|
|
|
+ saying <strong>/a</strong> changed before the
|
|
|
+ response for the synchronous read, but because the completion
|
|
|
+ callback is blocking the event queue, the synchronous read will
|
|
|
+ return with the new value of <strong>/a</strong>
|
|
|
+ before the watch event is processed.</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+</ul>
|
|
|
+<p>Finally, the rules associated with shutdown are straightforward:
|
|
|
+ once a ZooKeeper object is closed or receives a fatal event
|
|
|
+ (SESSION_EXPIRED and AUTH_FAILED), the ZooKeeper object becomes invalid,
|
|
|
+ the two threads shut down, and any further ZooKeeper calls throw
|
|
|
+ errors.</p>
|
|
|
+<a name="N102E3"></a><a name="C+Binding"></a>
|
|
|
+<h3 class="h4">C Binding</h3>
|
|
|
+<p>The C binding has a single-threaded and multi-threaded library.
|
|
|
+ The multi-threaded library is easiest to use and is most similar to the
|
|
|
+ Java API. This library will create an IO thread and an event dispatch
|
|
|
+ thread for handling connection maintenance and callbacks. The
|
|
|
+ single-threaded library allows ZooKeeper to be used in event driven
|
|
|
+ applications by exposing the event loop used in the multi-threaded
|
|
|
+ library.</p>
|
|
|
+<p>The package includes two shared libraries: zookeeper_st and
|
|
|
+ zookeeper_mt. The former only provides the asynchronous APIs and
|
|
|
+ callbacks for integrating into the application's event loop. The only
|
|
|
+ reason this library exists is to support the platforms where a
|
|
|
+ <em>pthread</em> library is not available or is unstable
|
|
|
+ (e.g. FreeBSD 4.x). In all other cases, application developers should
|
|
|
+ link with zookeeper_mt, as it includes support for both Sync and Async
|
|
|
+ API.</p>
|
|
|
+<a name="N102F2"></a><a name="Installation"></a>
|
|
|
+<h4>Installation</h4>
|
|
|
+<p>If you're building the client from a check-out from the Apache
|
|
|
+ repository, follow the steps outlined below. If you're building from a
|
|
|
+ project source package downloaded from Apache, skip to step <strong>3</strong>.</p>
|
|
|
+<ol>
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>Run <span class="codefrag command">ant compile_just</span> from the zookeeper
|
|
|
+ top level directory (<span class="codefrag filename">.../trunk/zookeeper</span>).
|
|
|
+ This will create a directory named "generated" under
|
|
|
+ <span class="codefrag filename">zookeeper/c</span>.</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>Change directory to <span class="codefrag filename">zookeeper/c</span> and
|
|
|
+ run <span class="codefrag command">autoreconf -i</span> to bootstrap <strong>autoconf</strong>, <strong>automake</strong> and <strong>libtool</strong>. Make sure you have <strong>autoconf version 2.59</strong> or greater installed.
|
|
|
+ Skip to step<strong> 4</strong>.</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>If you are building from a project source package,
|
|
|
+ unzip/untar the source tarball and cd to the<span class="codefrag filename">
|
|
|
+ zookeeper-x.x.x/</span> directory.</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>Run <span class="codefrag command">./configure &lt;your-options&gt;</span> to
|
|
|
+ generate the makefile. Here are some of the options the <strong>configure</strong> utility supports that can be
|
|
|
+ useful in this step:</p>
|
|
|
+
|
|
|
+
|
|
|
+<ul>
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>
|
|
|
+<span class="codefrag command">--enable-debug</span>
|
|
|
+</p>
|
|
|
+
|
|
|
+
|
|
|
+<p>Enables optimization and enables debug info compiler
|
|
|
+ options. (Disabled by default.)</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>
|
|
|
+<span class="codefrag command">--without-syncapi </span>
|
|
|
+</p>
|
|
|
+
|
|
|
+
|
|
|
+<p>Disables Sync API support; zookeeper_mt library won't be
|
|
|
+ built. (Enabled by default.)</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>
|
|
|
+<span class="codefrag command">--disable-static </span>
|
|
|
+</p>
|
|
|
+
|
|
|
+
|
|
|
+<p>Do not build static libraries. (Enabled by
|
|
|
+ default.)</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>
|
|
|
+<span class="codefrag command">--disable-shared</span>
|
|
|
+</p>
|
|
|
+
|
|
|
+
|
|
|
+<p>Do not build shared libraries. (Enabled by
|
|
|
+ default.)</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+</ul>
|
|
|
+
|
|
|
+
|
|
|
+<div class="note">
|
|
|
+<div class="label">Note</div>
|
|
|
+<div class="content">
|
|
|
+
|
|
|
+<p>See INSTALL for general information about running
|
|
|
+ <strong>configure</strong>. <remark>[tbd: what
|
|
|
+ is INSTALL? a directory? a file?]</remark>
|
|
|
+</p>
|
|
|
+
|
|
|
+</div>
|
|
|
+</div>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>Run <span class="codefrag command">make</span> or <span class="codefrag command">make
|
|
|
+ install</span> to build the libraries and install them.</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>To generate doxygen documentation for the ZooKeeper API, run
|
|
|
+ <span class="codefrag command">make doxygen-doc</span>. All documentation will be
|
|
|
+ placed in a new subfolder named docs. By default, this command
|
|
|
+ only generates HTML. For information on other document formats,
|
|
|
+ run <span class="codefrag command">./configure --help</span>
|
|
|
+</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+</ol>
|
|
|
+<a name="N1039D"></a><a name="Using+the+Client"></a>
|
|
|
+<h4>Using the Client</h4>
|
|
|
+<p>You can test your client by running a zookeeper server (see
|
|
|
+ instructions on the project wiki page on how to run it) and connecting
|
|
|
+ to it using one of the cli applications that were built as part of the
|
|
|
+ installation procedure. cli_mt (multithreaded, built against
|
|
|
+ zookeeper_mt library) is shown in this example, but you could also use
|
|
|
+ cli_st (singlethreaded, built against zookeeper_st library):</p>
|
|
|
+<p>
|
|
|
+<pre class="code">$ cli_mt zookeeper_host:9876</pre>This
|
|
|
+ is a client application that gives you a shell for executing simple
|
|
|
+ zookeeper commands. Once successfully started and connected to the
|
|
|
+ server it displays a shell prompt. You can now enter zookeeper
|
|
|
+ commands. For example, to create a node:</p>
|
|
|
+<pre class="code">> create /my_new_node</pre>
|
|
|
+<p>To verify that the node's been created:</p>
+<pre class="code">&gt; ls /</pre>
|
|
|
+<p>You should see a list of nodes that are children of the root node
|
|
|
+ "/". <remark>[tbd: document all the cli commands (I think this is
|
|
|
+ Ben's tbd? It's from sourceforge)]</remark>
|
|
|
+</p>
|
|
|
+<p>In order to be able to use the ZooKeeper API in your application
|
|
|
+ you have to remember to</p>
|
|
|
+<ol>
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>Include zookeeper header: #include
|
|
|
+ &lt;zookeeper/zookeeper.h&gt;</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>If you are building a multithreaded client, compile with
|
|
|
+ -DTHREADED compiler flag to enable the multi-threaded version of
|
|
|
+ the library, and then link against the
|
|
|
+ <span class="codefrag varname">zookeeper_mt</span> library. If you are building a
|
|
|
+ single-threaded client, do not compile with -DTHREADED, and be
|
|
|
+ sure to link against the<span class="codefrag varname"> zookeeper_st
|
|
|
+ </span>library.</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+</ol>
|
|
|
+<p>Refer to <a href="#Program+Structure%2C+with+Simple+Example">Program Structure, with Simple Example</a> for examples of usage in Java and C.
|
|
|
+ <remark>[tbd: some kind of short tutorial would be helpful, I guess
|
|
|
+ (ben's tbd?) ][tbd: whatever the case, make sure that link points to something.]</remark>
|
|
|
+</p>
|
|
|
+</div>
|
|
|
+
|
|
|
+
|
|
|
+<a name="N103DC"></a><a name="Building+Blocks%3A+A+Guide+to+ZooKeeper+Operations"></a>
|
|
|
+<h2 class="h3">Building Blocks: A Guide to ZooKeeper Operations</h2>
|
|
|
+<div class="section">
|
|
|
+<p>
|
|
|
+<remark>[Engineering input needed. This is a new section. The below
|
|
|
+ is just placeholder, and was actually copied from the overview book. There
|
|
|
+ should probably be a subsection on each of those operations, with a little
|
|
|
+ bit of illustrative code for each op.] </remark>
|
|
|
+</p>
|
|
|
+<p>One of the design goals of ZooKeeper is to provide a very simple
|
|
|
+ programming interface. As a result, it supports only these
|
|
|
+ operations:</p>
|
|
|
+<dl>
|
|
|
+
|
|
|
+<dt>
|
|
|
+<term>create</term>
|
|
|
+</dt>
|
|
|
+<dd>
|
|
|
+<p>creates a node at a location in the tree</p>
|
|
|
+</dd>
|
|
|
+
|
|
|
+
|
|
|
+<dt>
|
|
|
+<term>delete</term>
|
|
|
+</dt>
|
|
|
+<dd>
|
|
|
+<p>deletes a node</p>
|
|
|
+</dd>
|
|
|
+
|
|
|
+
|
|
|
+<dt>
|
|
|
+<term>exists</term>
|
|
|
+</dt>
|
|
|
+<dd>
|
|
|
+<p>tests if a node exists at a location</p>
|
|
|
+</dd>
|
|
|
+
|
|
|
+
|
|
|
+<dt>
|
|
|
+<term>get data</term>
|
|
|
+</dt>
|
|
|
+<dd>
|
|
|
+<p>reads the data from a node</p>
|
|
|
+</dd>
|
|
|
+
|
|
|
+
|
|
|
+<dt>
|
|
|
+<term>set data</term>
|
|
|
+</dt>
|
|
|
+<dd>
|
|
|
+<p>writes data to a node</p>
|
|
|
+</dd>
|
|
|
+
|
|
|
+
|
|
|
+<dt>
|
|
|
+<term>get children</term>
|
|
|
+</dt>
|
|
|
+<dd>
|
|
|
+<p>retrieves a list of children of a node</p>
|
|
|
+</dd>
|
|
|
+
|
|
|
+
|
|
|
+<dt>
|
|
|
+<term>sync</term>
|
|
|
+</dt>
|
|
|
+<dd>
|
|
|
+<p>waits for data to be propagated.</p>
|
|
|
+</dd>
|
|
|
+
|
|
|
+</dl>
|
|
|
+</div>
|
|
|
+
|
|
|
+
|
|
|
+<a name="N1041E"></a><a name="Program+Structure%2C+with+Simple+Example"></a>
|
|
|
+<h2 class="h3">Program Structure, with Simple Example</h2>
|
|
|
+<div class="section">
|
|
|
+<p>
|
|
|
+<remark>[tbd]</remark>
|
|
|
+</p>
|
|
|
+</div>
|
|
|
+
|
|
|
+
|
|
|
+<a name="N10429"></a><a name="Gotchas%3A+Common+Problems+and+Troubleshooting"></a>
|
|
|
+<h2 class="h3">Gotchas: Common Problems and Troubleshooting</h2>
|
|
|
+<div class="section">
|
|
|
+<p>So now you know ZooKeeper. It's fast, simple, your application
|
|
|
+ works, but wait ... something's wrong. Here are some pitfalls that
|
|
|
+ ZooKeeper users fall into:</p>
|
|
|
+<ol>
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>If you are using watches, you must look for the connected watch
|
|
|
+ event. When a ZooKeeper client disconnects from a server, all the
|
|
|
+ watches are removed, so a client must treat the disconnect event as an
|
|
|
+ implicit trigger of watches. The easiest way to deal with this is to
|
|
|
+ act like the connected watch event is a watch trigger for all your
|
|
|
+ watches. The connected event makes a better trigger than the
|
|
|
+ disconnected event because you can access ZooKeeper and reestablish
|
|
|
+ watches when you are connected.</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>You must test ZooKeeper server failures. The ZooKeeper service
|
|
|
+ can survive failures as long as a majority of servers are active. The
|
|
|
+ question to ask is: can your application handle it? In the real world
|
|
|
+ a client's connection to ZooKeeper can break. (ZooKeeper server
|
|
|
+ failures and network partitions are common reasons for connection
|
|
|
+ loss.) The ZooKeeper client library takes care of recovering your
|
|
|
+ connection and letting you know what happened, but you must make sure
|
|
|
+ that you recover your state and any outstanding requests that failed.
|
|
|
+ Find out if you got it right in the test lab, not in production - test
|
|
|
+ with a ZooKeeper service made up of several servers and subject
|
|
|
+ them to reboots.</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>The list of ZooKeeper servers used by the client must match the
|
|
|
+ list of ZooKeeper servers that each ZooKeeper server has. Things can
|
|
|
+ work, although not optimally, if the client list is a subset of the
|
|
|
+ real list of ZooKeeper servers, but not if the client lists ZooKeeper
|
|
|
+ servers not in the ZooKeeper cluster.</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>Be careful where you put that transaction log. The most
|
|
|
+ performance-critical part of ZooKeeper is the transaction log.
|
|
|
+ ZooKeeper must sync transactions to media before it returns a
|
|
|
+ response. A dedicated transaction log device is key to consistent good
|
|
|
+ performance. Putting the log on a busy device will adversely affect
|
|
|
+ performance. If you only have one storage device, put trace files on
|
|
|
+ NFS and increase the snapshotCount; it doesn't eliminate the problem,
|
|
|
+ but it can mitigate it.</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+
|
|
|
+<li>
|
|
|
+
|
|
|
+<p>Set your Java max heap size correctly. It is very important to
|
|
|
+ <em>avoid swapping.</em> Going to disk unnecessarily will
|
|
|
+ almost certainly degrade your performance unacceptably. Remember, in
|
|
|
+ ZooKeeper, everything is ordered, so if one request hits the disk, all
|
|
|
+ other queued requests hit the disk.</p>
|
|
|
+
|
|
|
+
|
|
|
+<p>To avoid swapping, try to set the heapsize to the amount of
|
|
|
+ physical memory you have, minus the amount needed by the OS and cache.
|
|
|
+ The best way to determine an optimal heap size for your configurations
|
|
|
+ is to <em>run load tests</em>. If for some reason you
|
|
|
+ can't, be conservative in your estimates and choose a number well
|
|
|
+ below the limit that would cause your machine to swap. For example, on
|
|
|
+ a 4G machine, a 3G heap is a conservative estimate to start
|
|
|
+ with.</p>
|
|
|
+
|
|
|
+</li>
|
|
|
+
|
|
|
+</ol>
|
|
|
+</div>
|
|
|
+
|
|
|
+
|
|
|
+<a name="apx_linksToOtherInfo"></a>
|
|
|
+<appendix id="apx_linksToOtherInfo">
|
|
|
+
|
|
|
+<h2 class="h3">Links to Other Information</h2>
|
|
|
+
|
|
|
+
|
|
|
+<p>Outside the formal documentation, there are several other sources of
|
|
|
+ information for ZooKeeper developers.</p>
|
|
|
+
|
|
|
+
|
|
|
+<dl>
|
|
|
+
|
|
|
+<dt>
|
|
|
+<term>ZooKeeper Whitepaper <remark>[tbd: find url]</remark>
|
|
|
+</term>
|
|
|
+</dt>
|
|
|
+<dd>
|
|
|
+<p>The definitive discussion of ZooKeeper design and performance,
|
|
|
+ by Yahoo! Research</p>
|
|
|
+</dd>
|
|
|
+
|
|
|
+
|
|
|
+<dt>
|
|
|
+<term>API Reference <remark>[tbd: find url]</remark>
|
|
|
+</term>
|
|
|
+</dt>
|
|
|
+<dd>
|
|
|
+<p>The complete reference to the ZooKeeper API</p>
|
|
|
+</dd>
|
|
|
+
|
|
|
+
|
|
|
+<dt>
|
|
|
+<term>
|
|
|
+<a href="http://us.dl1.yimg.com/download.yahoo.com/dl/ydn/zookeeper.m4v">Zookeeper
|
|
|
+ Talk at the Hadoop Summit 2008</a>
|
|
|
+</term>
|
|
|
+</dt>
|
|
|
+<dd>
|
|
|
+<p>A video introduction to ZooKeeper, by Benjamin Reed of Yahoo!
|
|
|
+ Research</p>
|
|
|
+</dd>
|
|
|
+
|
|
|
+
|
|
|
+<dt>
|
|
|
+<term>
|
|
|
+<a href="http://wiki.apache.org/hadoop/ZooKeeper/Tutorial">Barrier and
|
|
|
+ Queue Tutorial</a>
|
|
|
+</term>
|
|
|
+</dt>
|
|
|
+<dd>
|
|
|
+<p>The excellent Java tutorial by Flavio Junqueira, implementing
|
|
|
+ simple barriers and producer-consumer queues using ZooKeeper.</p>
|
|
|
+</dd>
|
|
|
+
|
|
|
+
|
|
|
+<dt>
|
|
|
+<term>
|
|
|
+<a href="http://wiki.apache.org/hadoop/ZooKeeper/ZooKeeperArticles">ZooKeeper
|
|
|
+ - A Reliable, Scalable Distributed Coordination System</a>
|
|
|
+</term>
|
|
|
+</dt>
|
|
|
+<dd>
|
|
|
+<p>An article by Todd Hoff (07/15/2008)</p>
|
|
|
+</dd>
|
|
|
+
|
|
|
+
|
|
|
+<dt>
|
|
|
+<term>
|
|
|
+<a href="recipes.html">Zookeeper Recipes [tbd: fix
|
|
|
+ linkend for apache site]</a>
|
|
|
+</term>
|
|
|
+</dt>
|
|
|
+<dd>
|
|
|
+<p>Pseudo-level discussion of the implementation of various
|
|
|
+ synchronization solutions with ZooKeeper: Event Handles, Queues,
|
|
|
+ Locks, and Two-phase Commits.</p>
|
|
|
+</dd>
|
|
|
+
|
|
|
+
|
|
|
+<dt>
|
|
|
+<term>
|
|
|
+<remark>[tbd]</remark>
|
|
|
+</term>
|
|
|
+</dt>
|
|
|
+<dd>
|
|
|
+<p>Whatever good sources anyone can think of...</p>
|
|
|
+</dd>
|
|
|
+
|
|
|
+</dl>
|
|
|
+
|
|
|
+</appendix>
|
|
|
+
|
|
|
+<p align="right">
|
|
|
+<font size="-2"></font>
|
|
|
+</p>
|
|
|
+</div>
|
|
|
+<!--+
|
|
|
+ |end content
|
|
|
+ +-->
|
|
|
+<div class="clearboth"> </div>
|
|
|
+</div>
|
|
|
+<div id="footer">
|
|
|
+<!--+
|
|
|
+ |start bottomstrip
|
|
|
+ +-->
|
|
|
+<div class="lastmodified">
|
|
|
+<script type="text/javascript"><!--
|
|
|
+document.write("Last Published: " + document.lastModified);
|
|
|
+// --></script>
|
|
|
+</div>
|
|
|
+<div class="copyright">
|
|
|
+ Copyright ©
|
|
|
+ 2008 <a href="http://www.apache.org/licenses/">The Apache Software Foundation.</a>
|
|
|
+</div>
|
|
|
+<!--+
|
|
|
+ |end bottomstrip
|
|
|
+ +-->
|
|
|
+</div>
|
|
|
+</body>
|
|
|
+</html>
|