rpc_engine.h 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354
  1. /**
  2. * Licensed to the Apache Software Foundation (ASF) under one
  3. * or more contributor license agreements. See the NOTICE file
  4. * distributed with this work for additional information
  5. * regarding copyright ownership. The ASF licenses this file
  6. * to you under the Apache License, Version 2.0 (the
  7. * "License"); you may not use this file except in compliance
  8. * with the License. You may obtain a copy of the License at
  9. *
  10. * http://www.apache.org/licenses/LICENSE-2.0
  11. *
  12. * Unless required by applicable law or agreed to in writing, software
  13. * distributed under the License is distributed on an "AS IS" BASIS,
  14. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. * See the License for the specific language governing permissions and
  16. * limitations under the License.
  17. */
  18. #ifndef LIB_RPC_RPC_ENGINE_H_
  19. #define LIB_RPC_RPC_ENGINE_H_
  20. #include "hdfspp/options.h"
  21. #include "hdfspp/status.h"
  22. #include "common/auth_info.h"
  23. #include "common/retry_policy.h"
  24. #include "common/libhdfs_events_impl.h"
  25. #include "common/new_delete.h"
  26. #include <google/protobuf/message_lite.h>
  27. #include <google/protobuf/io/coded_stream.h>
  28. #include <google/protobuf/io/zero_copy_stream_impl_lite.h>
  29. #include <asio/ip/tcp.hpp>
  30. #include <asio/deadline_timer.hpp>
  31. #include <atomic>
  32. #include <memory>
  33. #include <unordered_map>
  34. #include <vector>
  35. #include <mutex>
  36. namespace hdfs {
  37. /*
  38. * NOTE ABOUT LOCKING MODELS
  39. *
  40. * To prevent deadlocks, anything that might acquire multiple locks must
  41. * acquire the lock on the RpcEngine first, then the RpcConnection. Callbacks
  42. * will never be called while holding any locks, so the components are free
  43. * to take locks when servicing a callback.
  44. *
  45. * An RpcRequest or RpcConnection should never call any methods on the RpcEngine
  46. * except for those that are exposed through the LockFreeRpcEngine interface.
  47. */
  48. typedef const std::function<void(const Status &)> RpcCallback;
  49. class LockFreeRpcEngine;
  50. class RpcConnection;
  51. class SaslProtocol;
  52. /*
  53. * Internal bookkeeping for an outstanding request from the consumer.
  54. *
  55. * Threading model: not thread-safe; should only be accessed from a single
  56. * thread at a time
  57. */
  58. class Request {
  59. public:
  60. MEMCHECKED_CLASS(Request)
  61. typedef std::function<void(::google::protobuf::io::CodedInputStream *is,
  62. const Status &status)> Handler;
  63. Request(LockFreeRpcEngine *engine, const std::string &method_name, int call_id,
  64. const std::string &request, Handler &&callback);
  65. Request(LockFreeRpcEngine *engine, const std::string &method_name, int call_id,
  66. const ::google::protobuf::MessageLite *request, Handler &&callback);
  67. // Null request (with no actual message) used to track the state of an
  68. // initial Connect call
  69. Request(LockFreeRpcEngine *engine, Handler &&handler);
  70. int call_id() const { return call_id_; }
  71. std::string method_name() const { return method_name_; }
  72. ::asio::deadline_timer &timer() { return timer_; }
  73. int IncrementRetryCount() { return retry_count_++; }
  74. void GetPacket(std::string *res) const;
  75. void OnResponseArrived(::google::protobuf::io::CodedInputStream *is,
  76. const Status &status);
  77. private:
  78. LockFreeRpcEngine *const engine_;
  79. const std::string method_name_;
  80. const int call_id_;
  81. ::asio::deadline_timer timer_;
  82. std::string payload_;
  83. const Handler handler_;
  84. int retry_count_;
  85. };
  86. /*
  87. * Encapsulates a persistent connection to the NameNode, and the sending of
  88. * RPC requests and evaluating their responses.
  89. *
  90. * Can have multiple RPC requests in-flight simultaneously, but they are
  91. * evaluated in-order on the server side in a blocking manner.
  92. *
  93. * Threading model: public interface is thread-safe
  94. * All handlers passed in to method calls will be called from an asio thread,
  95. * and will not be holding any internal RpcConnection locks.
  96. */
  97. class RpcConnection : public std::enable_shared_from_this<RpcConnection> {
  98. public:
  99. MEMCHECKED_CLASS(RpcConnection)
  100. RpcConnection(LockFreeRpcEngine *engine);
  101. virtual ~RpcConnection();
  102. // Note that a single server can have multiple endpoints - especially both
  103. // an ipv4 and ipv6 endpoint
  104. virtual void Connect(const std::vector<::asio::ip::tcp::endpoint> &server,
  105. const AuthInfo & auth_info,
  106. RpcCallback &handler) = 0;
  107. virtual void ConnectAndFlush(const std::vector<::asio::ip::tcp::endpoint> &server) = 0;
  108. virtual void Disconnect() = 0;
  109. void StartReading();
  110. void AsyncRpc(const std::string &method_name,
  111. const ::google::protobuf::MessageLite *req,
  112. std::shared_ptr<::google::protobuf::MessageLite> resp,
  113. const RpcCallback &handler);
  114. void AsyncRpc(const std::vector<std::shared_ptr<Request> > & requests);
  115. // Enqueue requests before the connection is connected. Will be flushed
  116. // on connect
  117. void PreEnqueueRequests(std::vector<std::shared_ptr<Request>> requests);
  118. void SetEventHandlers(std::shared_ptr<LibhdfsEvents> event_handlers);
  119. void SetClusterName(std::string cluster_name);
  120. LockFreeRpcEngine *engine() { return engine_; }
  121. ::asio::io_service &io_service();
  122. protected:
  123. struct Response {
  124. enum ResponseState {
  125. kReadLength,
  126. kReadContent,
  127. kParseResponse,
  128. } state_;
  129. unsigned length_;
  130. std::vector<char> data_;
  131. std::unique_ptr<::google::protobuf::io::ArrayInputStream> ar;
  132. std::unique_ptr<::google::protobuf::io::CodedInputStream> in;
  133. Response() : state_(kReadLength), length_(0) {}
  134. };
  135. // Initial handshaking protocol: connect->handshake-->(auth)?-->context->connected
  136. virtual void SendHandshake(RpcCallback &handler) = 0;
  137. void HandshakeComplete(const Status &s);
  138. void AuthComplete(const Status &s, const AuthInfo & new_auth_info);
  139. void AuthComplete_locked(const Status &s, const AuthInfo & new_auth_info);
  140. virtual void SendContext(RpcCallback &handler) = 0;
  141. void ContextComplete(const Status &s);
  142. virtual void OnSendCompleted(const ::asio::error_code &ec,
  143. size_t transferred) = 0;
  144. virtual void OnRecvCompleted(const ::asio::error_code &ec,
  145. size_t transferred) = 0;
  146. virtual void FlushPendingRequests()=0; // Synchronously write the next request
  147. void AsyncRpc_locked(
  148. const std::string &method_name,
  149. const ::google::protobuf::MessageLite *req,
  150. std::shared_ptr<::google::protobuf::MessageLite> resp,
  151. const RpcCallback &handler);
  152. void SendRpcRequests(const std::vector<std::shared_ptr<Request> > & requests);
  153. void AsyncFlushPendingRequests(); // Queue requests to be flushed at a later time
  154. std::shared_ptr<std::string> PrepareHandshakePacket();
  155. std::shared_ptr<std::string> PrepareContextPacket();
  156. static std::string SerializeRpcRequest(
  157. const std::string &method_name,
  158. const ::google::protobuf::MessageLite *req);
  159. void HandleRpcResponse(std::shared_ptr<Response> response);
  160. void HandleRpcTimeout(std::shared_ptr<Request> req,
  161. const ::asio::error_code &ec);
  162. void CommsError(const Status &status);
  163. void ClearAndDisconnect(const ::asio::error_code &ec);
  164. std::shared_ptr<Request> RemoveFromRunningQueue(int call_id);
  165. LockFreeRpcEngine *const engine_;
  166. std::shared_ptr<Response> current_response_state_;
  167. AuthInfo auth_info_;
  168. // Connection can have deferred connection, especially when we're pausing
  169. // during retry
  170. enum ConnectedState {
  171. kNotYetConnected,
  172. kConnecting,
  173. kHandshaking,
  174. kAuthenticating,
  175. kConnected,
  176. kDisconnected
  177. };
  178. static std::string ToString(ConnectedState connected);
  179. ConnectedState connected_;
  180. // State machine for performing a SASL handshake
  181. std::shared_ptr<SaslProtocol> sasl_protocol_;
  182. // The request being sent over the wire; will also be in requests_on_fly_
  183. std::shared_ptr<Request> request_over_the_wire_;
  184. // Requests to be sent over the wire
  185. std::vector<std::shared_ptr<Request>> pending_requests_;
  186. // Requests to be sent over the wire during authentication; not retried if
  187. // there is a connection error
  188. std::vector<std::shared_ptr<Request>> auth_requests_;
  189. // Requests that are waiting for responses
  190. typedef std::unordered_map<int, std::shared_ptr<Request>> RequestOnFlyMap;
  191. RequestOnFlyMap requests_on_fly_;
  192. std::shared_ptr<LibhdfsEvents> event_handlers_;
  193. std::string cluster_name_;
  194. // Lock for mutable parts of this class that need to be thread safe
  195. std::mutex connection_state_lock_;
  196. friend class SaslProtocol;
  197. };
  198. /*
  199. * These methods of the RpcEngine will never acquire locks, and are safe for
  200. * RpcConnections to call while holding a ConnectionLock.
  201. */
  202. class LockFreeRpcEngine {
  203. public:
  204. MEMCHECKED_CLASS(LockFreeRpcEngine)
  205. /* Enqueues a CommsError without acquiring a lock*/
  206. virtual void AsyncRpcCommsError(const Status &status,
  207. std::shared_ptr<RpcConnection> failedConnection,
  208. std::vector<std::shared_ptr<Request>> pendingRequests) = 0;
  209. virtual const RetryPolicy * retry_policy() const = 0;
  210. virtual int NextCallId() = 0;
  211. virtual const std::string &client_name() const = 0;
  212. virtual const std::string &user_name() const = 0;
  213. virtual const std::string &protocol_name() const = 0;
  214. virtual int protocol_version() const = 0;
  215. virtual ::asio::io_service &io_service() = 0;
  216. virtual const Options &options() const = 0;
  217. };
  218. /*
  219. * An engine for reliable communication with a NameNode. Handles connection,
  220. * retry, and (someday) failover of the requested messages.
  221. *
  222. * Threading model: thread-safe. All callbacks will be called back from
  223. * an asio pool and will not hold any internal locks
  224. */
  225. class RpcEngine : public LockFreeRpcEngine {
  226. public:
  227. MEMCHECKED_CLASS(RpcEngine)
  228. enum { kRpcVersion = 9 };
  229. enum {
  230. kCallIdAuthorizationFailed = -1,
  231. kCallIdInvalid = -2,
  232. kCallIdConnectionContext = -3,
  233. kCallIdPing = -4,
  234. kCallIdSasl = -33
  235. };
  236. RpcEngine(::asio::io_service *io_service, const Options &options,
  237. const std::string &client_name, const std::string &user_name,
  238. const char *protocol_name, int protocol_version);
  239. void Connect(const std::string & cluster_name,
  240. const std::vector<::asio::ip::tcp::endpoint> &server,
  241. RpcCallback &handler);
  242. void AsyncRpc(const std::string &method_name,
  243. const ::google::protobuf::MessageLite *req,
  244. const std::shared_ptr<::google::protobuf::MessageLite> &resp,
  245. const std::function<void(const Status &)> &handler);
  246. Status Rpc(const std::string &method_name,
  247. const ::google::protobuf::MessageLite *req,
  248. const std::shared_ptr<::google::protobuf::MessageLite> &resp);
  249. void Start();
  250. void Shutdown();
  251. /* Enqueues a CommsError without acquiring a lock*/
  252. void AsyncRpcCommsError(const Status &status,
  253. std::shared_ptr<RpcConnection> failedConnection,
  254. std::vector<std::shared_ptr<Request>> pendingRequests) override;
  255. void RpcCommsError(const Status &status,
  256. std::shared_ptr<RpcConnection> failedConnection,
  257. std::vector<std::shared_ptr<Request>> pendingRequests);
  258. const RetryPolicy * retry_policy() const override { return retry_policy_.get(); }
  259. int NextCallId() override { return ++call_id_; }
  260. void TEST_SetRpcConnection(std::shared_ptr<RpcConnection> conn);
  261. const std::string &client_name() const override { return client_name_; }
  262. const std::string &user_name() const override { return auth_info_.getUser(); }
  263. const std::string &protocol_name() const override { return protocol_name_; }
  264. int protocol_version() const override { return protocol_version_; }
  265. ::asio::io_service &io_service() override { return *io_service_; }
  266. const Options &options() const override { return options_; }
  267. static std::string GetRandomClientName();
  268. void SetFsEventCallback(fs_event_callback callback);
  269. protected:
  270. std::shared_ptr<RpcConnection> conn_;
  271. std::shared_ptr<RpcConnection> InitializeConnection();
  272. virtual std::shared_ptr<RpcConnection> NewConnection();
  273. virtual std::unique_ptr<const RetryPolicy> MakeRetryPolicy(const Options &options);
  274. // Remember all of the last endpoints in case we need to reconnect and retry
  275. std::vector<::asio::ip::tcp::endpoint> last_endpoints_;
  276. private:
  277. ::asio::io_service * const io_service_;
  278. const Options options_;
  279. const std::string client_name_;
  280. const std::string protocol_name_;
  281. const int protocol_version_;
  282. const std::unique_ptr<const RetryPolicy> retry_policy_; //null --> no retry
  283. AuthInfo auth_info_;
  284. std::string cluster_name_;
  285. std::atomic_int call_id_;
  286. ::asio::deadline_timer retry_timer;
  287. std::shared_ptr<LibhdfsEvents> event_handlers_;
  288. std::mutex engine_state_lock_;
  289. };
  290. }
  291. #endif