DataTree.java 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563
  1. /**
  2. * Licensed to the Apache Software Foundation (ASF) under one
  3. * or more contributor license agreements. See the NOTICE file
  4. * distributed with this work for additional information
  5. * regarding copyright ownership. The ASF licenses this file
  6. * to you under the Apache License, Version 2.0 (the
  7. * "License"); you may not use this file except in compliance
  8. * with the License. You may obtain a copy of the License at
  9. *
  10. * http://www.apache.org/licenses/LICENSE-2.0
  11. *
  12. * Unless required by applicable law or agreed to in writing, software
  13. * distributed under the License is distributed on an "AS IS" BASIS,
  14. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. * See the License for the specific language governing permissions and
  16. * limitations under the License.
  17. */
  18. package org.apache.zookeeper.server;
  19. import java.io.IOException;
  20. import java.util.ArrayList;
  21. import java.util.Collection;
  22. import java.util.HashSet;
  23. import java.util.List;
  24. import java.util.Set;
  25. import java.util.concurrent.ConcurrentHashMap;
  26. import org.apache.jute.InputArchive;
  27. import org.apache.jute.OutputArchive;
  28. import org.apache.jute.Record;
  29. import org.apache.log4j.Logger;
  30. import org.apache.zookeeper.KeeperException;
  31. import org.apache.zookeeper.Watcher;
  32. import org.apache.zookeeper.KeeperException.Code;
  33. import org.apache.zookeeper.Watcher.Event;
  34. import org.apache.zookeeper.ZooDefs.OpCode;
  35. import org.apache.zookeeper.data.ACL;
  36. import org.apache.zookeeper.data.Stat;
  37. import org.apache.zookeeper.txn.CreateTxn;
  38. import org.apache.zookeeper.txn.DeleteTxn;
  39. import org.apache.zookeeper.txn.ErrorTxn;
  40. import org.apache.zookeeper.txn.SetACLTxn;
  41. import org.apache.zookeeper.txn.SetDataTxn;
  42. import org.apache.zookeeper.txn.TxnHeader;
  43. /**
  44. * This class maintains the tree data structure. It doesn't have any networking
  45. * or client connection code in it so that it can be tested in a stand alone
  46. * way.
  47. * <p>
  48. * The tree maintains two parallel data structures: a hashtable that maps from
  49. * full paths to DataNodes and a tree of DataNodes. All accesses to a path is
  50. * through the hashtable. The tree is traversed only when serializing to disk.
  51. */
  52. public class DataTree {
  53. private static final Logger LOG = Logger.getLogger(DataTree.class);
  54. /**
  55. * This hashtable provides a fast lookup to the datanodes. The tree is the
  56. * source of truth and is where all the locking occurs
  57. */
  58. private ConcurrentHashMap<String, DataNode> nodes = new ConcurrentHashMap<String, DataNode>();
  59. private WatchManager dataWatches = new WatchManager();
  60. private WatchManager childWatches = new WatchManager();
  61. /**
  62. * This hashtable lists the paths of the ephemeral nodes of a session.
  63. */
  64. private ConcurrentHashMap<Long, HashSet<String>> ephemerals = new ConcurrentHashMap<Long, HashSet<String>>();
  65. /** A debug string * */
  66. private String debug = "debug";
  67. public HashSet<String> getEphemerals(long sessionId) {
  68. HashSet<String> retv = ephemerals.get(sessionId);
  69. if (retv == null) {
  70. return new HashSet<String>();
  71. }
  72. HashSet<String> cloned = null;
  73. synchronized(retv) {
  74. cloned = (HashSet<String>) retv.clone();
  75. }
  76. return cloned;
  77. }
  78. public Collection<Long> getSessions() {
  79. return ephemerals.keySet();
  80. }
  81. public DataNode getNode(String path) {
  82. return nodes.get(path);
  83. }
  84. public int getNodeCount(){
  85. return nodes.size();
  86. }
  87. public int getWatchCount(){
  88. return dataWatches.size()+childWatches.size();
  89. }
  90. /**
  91. * This is a pointer to the root of the DataTree. It is the source of truth,
  92. * but we usually use the nodes hashmap to find nodes in the tree.
  93. */
  94. private DataNode root = new DataNode(null, new byte[0], null, new Stat());
  95. public DataTree() {
  96. /* Rather than fight it, let root have an alias */
  97. nodes.put("", root);
  98. nodes.put("/", root);
  99. }
  100. static public void copyStat(Stat from, Stat to) {
  101. to.setAversion(from.getAversion());
  102. to.setCtime(from.getCtime());
  103. to.setCversion(from.getCversion());
  104. to.setCzxid(from.getCzxid());
  105. to.setMtime(from.getMtime());
  106. to.setMzxid(from.getMzxid());
  107. to.setVersion(from.getVersion());
  108. to.setEphemeralOwner(from.getEphemeralOwner());
  109. }
  110. // public void remooveInterest(String path, Watcher nw) {
  111. // DataNode n = nodes.get(path);
  112. // if (n == null) {
  113. // synchronized (nonExistentWatches) {
  114. // HashSet<Watcher> list = nonExistentWatches.get(path);
  115. // if (list != null) {
  116. // list.remove(nw);
  117. // }
  118. // }
  119. // }
  120. // synchronized (n) {
  121. // n.dataWatchers.remove(nw);
  122. // n.childWatchers.remove(nw);
  123. // }
  124. // }
  125. /**
  126. * @param path
  127. * @param data
  128. * @param acl
  129. * @param ephemeralOwner
  130. * the session id that owns this node. -1 indicates this is
  131. * not an ephemeral node.
  132. * @param zxid
  133. * @param time
  134. * @return
  135. * @throws KeeperException
  136. */
  137. public String createNode(String path, byte data[], List<ACL> acl,
  138. long ephemeralOwner, long zxid, long time) throws KeeperException.NoNodeException, KeeperException.NodeExistsException {
  139. int lastSlash = path.lastIndexOf('/');
  140. String parentName = path.substring(0, lastSlash);
  141. String childName = path.substring(lastSlash + 1);
  142. Stat stat = new Stat();
  143. stat.setCtime(time);
  144. stat.setMtime(time);
  145. stat.setCzxid(zxid);
  146. stat.setMzxid(zxid);
  147. stat.setVersion(0);
  148. stat.setAversion(0);
  149. stat.setEphemeralOwner(ephemeralOwner);
  150. DataNode parent = nodes.get(parentName);
  151. if (parent == null) {
  152. throw new KeeperException.NoNodeException();
  153. }
  154. synchronized (parent) {
  155. if (parent.children.contains(childName)) {
  156. throw new KeeperException.NodeExistsException();
  157. }
  158. int cver = parent.stat.getCversion();
  159. cver++;
  160. parent.stat.setCversion(cver);
  161. DataNode child = new DataNode(parent, data, acl, stat);
  162. parent.children.add(childName);
  163. nodes.put(path, child);
  164. if (ephemeralOwner != 0) {
  165. HashSet<String> list = ephemerals.get(ephemeralOwner);
  166. if (list == null) {
  167. list = new HashSet<String>();
  168. ephemerals.put(ephemeralOwner, list);
  169. }
  170. synchronized(list) {
  171. list.add(path);
  172. }
  173. }
  174. }
  175. dataWatches.triggerWatch(path, Event.EventNodeCreated);
  176. childWatches.triggerWatch(parentName.equals("")?"/":parentName, Event.EventNodeChildrenChanged);
  177. return path;
  178. }
  179. public void deleteNode(String path) throws KeeperException.NoNodeException {
  180. int lastSlash = path.lastIndexOf('/');
  181. String parentName = path.substring(0, lastSlash);
  182. String childName = path.substring(lastSlash + 1);
  183. DataNode node = nodes.get(path);
  184. if (node == null) {
  185. throw new KeeperException.NoNodeException();
  186. }
  187. nodes.remove(path);
  188. DataNode parent = nodes.get(parentName);
  189. if (parent == null) {
  190. throw new KeeperException.NoNodeException();
  191. }
  192. synchronized (parent) {
  193. parent.children.remove(childName);
  194. parent.stat.setCversion(parent.stat.getCversion() + 1);
  195. long eowner = node.stat.getEphemeralOwner();
  196. if (eowner != 0) {
  197. HashSet<String> nodes = ephemerals.get(eowner);
  198. if (nodes != null) {
  199. synchronized(nodes) {
  200. nodes.remove(path);
  201. }
  202. }
  203. }
  204. node.parent = null;
  205. }
  206. ZooTrace.logTraceMessage(LOG,
  207. ZooTrace.EVENT_DELIVERY_TRACE_MASK,
  208. "dataWatches.triggerWatch " + path);
  209. ZooTrace.logTraceMessage(LOG,
  210. ZooTrace.EVENT_DELIVERY_TRACE_MASK,
  211. "childWatches.triggerWatch " + parentName);
  212. Set<Watcher> processed =
  213. dataWatches.triggerWatch(path, Event.EventNodeDeleted);
  214. childWatches.triggerWatch(path, Event.EventNodeDeleted, processed);
  215. childWatches.triggerWatch(parentName.equals("")?"/":parentName, Event.EventNodeChildrenChanged);
  216. }
  217. public Stat setData(String path, byte data[], int version, long zxid,
  218. long time) throws KeeperException.NoNodeException {
  219. Stat s = new Stat();
  220. DataNode n = nodes.get(path);
  221. if (n == null) {
  222. throw new KeeperException.NoNodeException();
  223. }
  224. synchronized (n) {
  225. n.data = data;
  226. n.stat.setMtime(time);
  227. n.stat.setMzxid(zxid);
  228. n.stat.setVersion(version);
  229. copyStat(n.stat, s);
  230. }
  231. dataWatches.triggerWatch(path, Event.EventNodeDataChanged);
  232. return s;
  233. }
  234. public byte[] getData(String path, Stat stat, Watcher watcher) throws KeeperException.NoNodeException {
  235. DataNode n = nodes.get(path);
  236. if (n == null) {
  237. throw new KeeperException.NoNodeException();
  238. }
  239. synchronized (n) {
  240. copyStat(n.stat, stat);
  241. if (watcher != null) {
  242. dataWatches.addWatch(path, watcher);
  243. }
  244. return n.data;
  245. }
  246. }
  247. public Stat statNode(String path, Watcher watcher) throws KeeperException.NoNodeException {
  248. Stat stat = new Stat();
  249. DataNode n = nodes.get(path);
  250. if (watcher != null) {
  251. dataWatches.addWatch(path, watcher);
  252. }
  253. if (n == null) {
  254. throw new KeeperException.NoNodeException();
  255. }
  256. synchronized (n) {
  257. copyStat(n.stat, stat);
  258. return stat;
  259. }
  260. }
  261. public ArrayList<String> getChildren(String path, Stat stat, Watcher watcher) throws KeeperException.NoNodeException {
  262. DataNode n = nodes.get(path);
  263. if (n == null) {
  264. throw new KeeperException.NoNodeException();
  265. }
  266. synchronized (n) {
  267. ArrayList<String> children = new ArrayList<String>();
  268. children.addAll(n.children);
  269. if (watcher != null) {
  270. childWatches.addWatch(path, watcher);
  271. }
  272. return children;
  273. }
  274. }
  275. public Stat setACL(String path, List<ACL> acl, int version) throws KeeperException.NoNodeException {
  276. Stat stat = new Stat();
  277. DataNode n = nodes.get(path);
  278. if (n == null) {
  279. throw new KeeperException.NoNodeException();
  280. }
  281. synchronized (n) {
  282. n.stat.setAversion(version);
  283. n.acl = acl;
  284. copyStat(n.stat, stat);
  285. return stat;
  286. }
  287. }
  288. @SuppressWarnings("unchecked")
  289. public List<ACL> getACL(String path, Stat stat) throws KeeperException.NoNodeException {
  290. DataNode n = nodes.get(path);
  291. if (n == null) {
  292. throw new KeeperException.NoNodeException();
  293. }
  294. synchronized (n) {
  295. copyStat(n.stat, stat);
  296. return new ArrayList<ACL>(n.acl);
  297. }
  298. }
  299. static public class ProcessTxnResult {
  300. public long clientId;
  301. public int cxid;
  302. public long zxid;
  303. public int err;
  304. public int type;
  305. public String path;
  306. public Stat stat;
  307. /**
  308. * Equality is defined as the clientId and the cxid being the same. This
  309. * allows us to use hash tables to track completion of transactions.
  310. *
  311. * @see java.lang.Object#equals(java.lang.Object)
  312. */
  313. @Override
  314. public boolean equals(Object o) {
  315. if (o instanceof ProcessTxnResult) {
  316. ProcessTxnResult other = (ProcessTxnResult) o;
  317. return other.clientId == clientId && other.cxid == cxid;
  318. }
  319. return false;
  320. }
  321. /**
  322. * See equals() to find the rational for how this hashcode is generated.
  323. *
  324. * @see ProcessTxnResult#equals(Object)
  325. * @see java.lang.Object#hashCode()
  326. */
  327. @Override
  328. public int hashCode() {
  329. return (int) ((clientId ^ cxid) % Integer.MAX_VALUE);
  330. }
  331. }
  332. public volatile long lastProcessedZxid = 0;
  333. @SuppressWarnings("unchecked")
  334. public ProcessTxnResult processTxn(TxnHeader header, Record txn) {
  335. ProcessTxnResult rc = new ProcessTxnResult();
  336. try {
  337. rc.clientId = header.getClientId();
  338. rc.cxid = header.getCxid();
  339. rc.zxid = header.getZxid();
  340. rc.type = header.getType();
  341. rc.err = 0;
  342. if (rc.zxid > lastProcessedZxid) {
  343. lastProcessedZxid = rc.zxid;
  344. }
  345. switch (header.getType()) {
  346. case OpCode.create:
  347. CreateTxn createTxn = (CreateTxn) txn;
  348. debug = "Create transaction for " + createTxn.getPath();
  349. createNode(createTxn.getPath(), createTxn.getData(), createTxn
  350. .getAcl(), createTxn.getEphemeral() ? header
  351. .getClientId() : 0, header.getZxid(), header.getTime());
  352. rc.path = createTxn.getPath();
  353. break;
  354. case OpCode.delete:
  355. DeleteTxn deleteTxn = (DeleteTxn) txn;
  356. debug = "Delete transaction for " + deleteTxn.getPath();
  357. deleteNode(deleteTxn.getPath());
  358. break;
  359. case OpCode.setData:
  360. SetDataTxn setDataTxn = (SetDataTxn) txn;
  361. debug = "Set data for transaction for " + setDataTxn.getPath();
  362. rc.stat = setData(setDataTxn.getPath(), setDataTxn.getData(),
  363. setDataTxn.getVersion(), header.getZxid(), header
  364. .getTime());
  365. break;
  366. case OpCode.setACL:
  367. SetACLTxn setACLTxn = (SetACLTxn) txn;
  368. rc.stat = setACL(setACLTxn.getPath(), setACLTxn.getAcl(),
  369. setACLTxn.getVersion());
  370. break;
  371. case OpCode.closeSession:
  372. killSession(header.getClientId());
  373. break;
  374. case OpCode.error:
  375. ErrorTxn errTxn = (ErrorTxn) txn;
  376. rc.err = errTxn.getErr();
  377. break;
  378. }
  379. } catch (KeeperException e) {
  380. // These are expected errors since we take a lazy snapshot
  381. if (initialized
  382. || (e.getCode() != Code.NoNode && e.getCode() != Code.NodeExists)) {
  383. LOG.warn(debug);
  384. LOG.error("FIXMSG",e);
  385. }
  386. }
  387. return rc;
  388. }
  389. void killSession(long session) {
  390. // the list is already removed from the ephemerals
  391. // so we do not have to worry about synchronyzing on
  392. // the list. This is only called from FinalRequestProcessor
  393. // so there is no need for synchornization. The list is not
  394. // changed here. Only create and delete change the list which
  395. // are again called from FinalRequestProcessor in sequence.
  396. HashSet<String> list = ephemerals.remove(session);
  397. if (list != null) {
  398. for (String path : list) {
  399. try {
  400. deleteNode(path);
  401. ZooTrace.logTraceMessage(LOG,
  402. ZooTrace.SESSION_TRACE_MASK,
  403. "Deleting ephemeral node "
  404. + path + " for session 0x"
  405. + Long.toHexString(session));
  406. } catch (KeeperException e) {
  407. LOG.error("FIXMSG",e);
  408. }
  409. }
  410. }
  411. }
  412. /**
  413. * this method uses a stringbuilder to create a new
  414. * path for children. This is faster than string
  415. * appends ( str1 + str2).
  416. * @param oa OutputArchive to write to.
  417. * @param path a string builder.
  418. * @throws IOException
  419. * @throws InterruptedException
  420. */
  421. void serializeNode(OutputArchive oa, StringBuilder path)
  422. throws IOException, InterruptedException {
  423. String pathString = path.toString();
  424. DataNode node = getNode(pathString);
  425. if (node == null) {
  426. return;
  427. }
  428. String children[] = null;
  429. synchronized (node) {
  430. scount++;
  431. oa.writeString(pathString, "path");
  432. oa.writeRecord(node, "node");
  433. children = node.children.toArray(new String[node.children.size()]);
  434. }
  435. path.append('/');
  436. int off = path.length();
  437. if (children != null) {
  438. for (String child : children) {
  439. //since this is single buffer being resused
  440. // we need
  441. // to truncate the previous bytes of string.
  442. path.delete(off, Integer.MAX_VALUE);
  443. path.append(child);
  444. serializeNode(oa, path);
  445. }
  446. }
  447. }
  448. int scount;
  449. public boolean initialized = false;
  450. public void serialize(OutputArchive oa, String tag) throws IOException,
  451. InterruptedException {
  452. scount = 0;
  453. serializeNode(oa, new StringBuilder(""));
  454. // / marks end of stream
  455. // we need to check if clear had been called in between the snapshot.
  456. if (root != null) {
  457. oa.writeString("/", "path");
  458. }
  459. }
  460. public void deserialize(InputArchive ia, String tag) throws IOException {
  461. nodes.clear();
  462. String path = ia.readString("path");
  463. while (!path.equals("/")) {
  464. DataNode node = new DataNode();
  465. ia.readRecord(node, "node");
  466. nodes.put(path, node);
  467. int lastSlash = path.lastIndexOf('/');
  468. if (lastSlash == -1) {
  469. root = node;
  470. } else {
  471. String parentPath = path.substring(0, lastSlash);
  472. node.parent = nodes.get(parentPath);
  473. node.parent.children.add(path.substring(lastSlash + 1));
  474. long eowner = node.stat.getEphemeralOwner();
  475. if (eowner != 0) {
  476. HashSet<String> list = ephemerals.get(eowner);
  477. if (list == null) {
  478. list = new HashSet<String>();
  479. ephemerals.put(eowner, list);
  480. }
  481. list.add(path);
  482. }
  483. }
  484. path = ia.readString("path");
  485. }
  486. nodes.put("/", root);
  487. }
  488. public String dumpEphemerals() {
  489. Set<Long> keys = ephemerals.keySet();
  490. StringBuffer sb = new StringBuffer("Sessions with Ephemerals ("
  491. + keys.size() + "):\n");
  492. for (long k : keys) {
  493. sb.append("0x" + Long.toHexString(k));
  494. sb.append(":\n");
  495. HashSet<String> tmp = ephemerals.get(k);
  496. synchronized(tmp) {
  497. for (String path : tmp) {
  498. sb.append("\t" + path + "\n");
  499. }
  500. }
  501. }
  502. return sb.toString();
  503. }
  504. public void removeCnxn(Watcher watcher) {
  505. dataWatches.removeWatcher(watcher);
  506. childWatches.removeWatcher(watcher);
  507. }
  508. public void clear() {
  509. root = null;
  510. nodes.clear();
  511. ephemerals.clear();
  512. // dataWatches = null;
  513. // childWatches = null;
  514. }
  515. }