// csvarchive.cc
  1. /**
  2. * Licensed to the Apache Software Foundation (ASF) under one
  3. * or more contributor license agreements. See the NOTICE file
  4. * distributed with this work for additional information
  5. * regarding copyright ownership. The ASF licenses this file
  6. * to you under the Apache License, Version 2.0 (the
  7. * "License"); you may not use this file except in compliance
  8. * with the License. You may obtain a copy of the License at
  9. *
  10. * http://www.apache.org/licenses/LICENSE-2.0
  11. *
  12. * Unless required by applicable law or agreed to in writing, software
  13. * distributed under the License is distributed on an "AS IS" BASIS,
  14. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. * See the License for the specific language governing permissions and
  16. * limitations under the License.
  17. */
  18. #include "csvarchive.hh"
  19. #include <stdlib.h>
  20. using namespace hadoop;
  21. static std::string readUptoTerminator(PushBackInStream& stream)
  22. {
  23. std::string s;
  24. while (1) {
  25. char c;
  26. if (1 != stream.read(&c, 1)) {
  27. throw new IOException("Error in deserialization.");
  28. }
  29. if (c == ',' || c == '\n' || c == '}') {
  30. if (c != ',') {
  31. stream.pushBack(c);
  32. }
  33. break;
  34. }
  35. s.push_back(c);
  36. }
  37. return s;
  38. }
  39. void hadoop::ICsvArchive::deserialize(int8_t& t, const char* tag)
  40. {
  41. std::string s = readUptoTerminator(stream);
  42. t = (int8_t) strtol(s.c_str(), NULL, 10);
  43. }
  44. void hadoop::ICsvArchive::deserialize(bool& t, const char* tag)
  45. {
  46. std::string s = readUptoTerminator(stream);
  47. t = (s == "T") ? true : false;
  48. }
  49. void hadoop::ICsvArchive::deserialize(int32_t& t, const char* tag)
  50. {
  51. std::string s = readUptoTerminator(stream);
  52. t = strtol(s.c_str(), NULL, 10);
  53. }
  54. void hadoop::ICsvArchive::deserialize(int64_t& t, const char* tag)
  55. {
  56. std::string s = readUptoTerminator(stream);
  57. t = strtoll(s.c_str(), NULL, 10);
  58. }
  59. void hadoop::ICsvArchive::deserialize(float& t, const char* tag)
  60. {
  61. std::string s = readUptoTerminator(stream);
  62. t = strtof(s.c_str(), NULL);
  63. }
  64. void hadoop::ICsvArchive::deserialize(double& t, const char* tag)
  65. {
  66. std::string s = readUptoTerminator(stream);
  67. t = strtod(s.c_str(), NULL);
  68. }
  69. static void replaceAll(std::string s, const char *src, char c)
  70. {
  71. std::string::size_type pos = 0;
  72. while (pos != std::string::npos) {
  73. pos = s.find(src);
  74. if (pos != std::string::npos) {
  75. s.replace(pos, strlen(src), 1, c);
  76. }
  77. }
  78. }
  79. void hadoop::ICsvArchive::deserialize(std::string& t, const char* tag)
  80. {
  81. t = readUptoTerminator(stream);
  82. if (t[0] != '\'') {
  83. throw new IOException("Errror deserializing string.");
  84. }
  85. t.erase(0, 1); /// erase first character
  86. replaceAll(t, "%0D", 0x0D);
  87. replaceAll(t, "%0A", 0x0A);
  88. replaceAll(t, "%7D", 0x7D);
  89. replaceAll(t, "%00", 0x00);
  90. replaceAll(t, "%2C", 0x2C);
  91. replaceAll(t, "%25", 0x25);
  92. }
  93. void hadoop::ICsvArchive::deserialize(std::string& t, size_t& len, const char* tag)
  94. {
  95. std::string s = readUptoTerminator(stream);
  96. if (s[0] != '#') {
  97. throw new IOException("Errror deserializing buffer.");
  98. }
  99. s.erase(0, 1); /// erase first character
  100. len = s.length();
  101. if (len%2 == 1) { // len is guaranteed to be even
  102. throw new IOException("Errror deserializing buffer.");
  103. }
  104. len >> 1;
  105. for (size_t idx = 0; idx < len; idx++) {
  106. char buf[3];
  107. buf[0] = s[2*idx];
  108. buf[1] = s[2*idx+1];
  109. buf[2] = '\0';
  110. int i;
  111. if (1 != sscanf(buf, "%2x", &i)) {
  112. throw new IOException("Errror deserializing buffer.");
  113. }
  114. t.push_back((char) i);
  115. }
  116. len = t.length();
  117. }
  118. void hadoop::ICsvArchive::startRecord(Record& s, const char* tag)
  119. {
  120. if (tag != NULL) {
  121. char mark[2];
  122. if (2 != stream.read(mark, 2)) {
  123. throw new IOException("Error deserializing record.");
  124. }
  125. if (mark[0] != 's' || mark[1] != '{') {
  126. throw new IOException("Error deserializing record.");
  127. }
  128. }
  129. }
  130. void hadoop::ICsvArchive::endRecord(Record& s, const char* tag)
  131. {
  132. char mark;
  133. if (1 != stream.read(&mark, 1)) {
  134. throw new IOException("Error deserializing record.");
  135. }
  136. if (tag == NULL) {
  137. if (mark != '\n') {
  138. throw new IOException("Error deserializing record.");
  139. }
  140. } else if (mark != '}') {
  141. throw new IOException("Error deserializing record.");
  142. } else {
  143. readUptoTerminator(stream);
  144. }
  145. }
  146. Index* hadoop::ICsvArchive::startVector(const char* tag)
  147. {
  148. char mark[2];
  149. if (2 != stream.read(mark, 2)) {
  150. throw new IOException("Error deserializing vector.");
  151. }
  152. if (mark[0] != 'v' || mark[1] != '{') {
  153. throw new IOException("Error deserializing vector.");
  154. }
  155. return new CsvIndex(stream);
  156. }
  157. void hadoop::ICsvArchive::endVector(Index* idx, const char* tag)
  158. {
  159. delete idx;
  160. char mark;
  161. if (1 != stream.read(&mark, 1)) {
  162. throw new IOException("Error deserializing vector.");
  163. }
  164. if (mark != '}') {
  165. throw new IOException("Error deserializing vector.");
  166. }
  167. readUptoTerminator(stream);
  168. }
  169. Index* hadoop::ICsvArchive::startMap(const char* tag)
  170. {
  171. char mark[2];
  172. if (2 != stream.read(mark, 2)) {
  173. throw new IOException("Error deserializing map.");
  174. }
  175. if (mark[0] != 'm' || mark[1] != '{') {
  176. throw new IOException("Error deserializing map.");
  177. }
  178. return new CsvIndex(stream);
  179. }
  180. void hadoop::ICsvArchive::endMap(Index* idx, const char* tag)
  181. {
  182. delete idx;
  183. char mark;
  184. if (1 != stream.read(&mark, 1)) {
  185. throw new IOException("Error deserializing map.");
  186. }
  187. if (mark != '}') {
  188. throw new IOException("Error deserializing map.");
  189. }
  190. readUptoTerminator(stream);
  191. }
// Empty destructor: this archive holds no resources that need explicit
// release here (the underlying stream's ownership is not managed by this
// class — NOTE(review): confirmed only by the absence of cleanup code).
hadoop::ICsvArchive::~ICsvArchive()
{
}
  195. void hadoop::OCsvArchive::serialize(int8_t t, const char* tag)
  196. {
  197. printCommaUnlessFirst();
  198. char sval[5];
  199. sprintf(sval, "%d", t);
  200. stream.write(sval, strlen(sval));
  201. }
  202. void hadoop::OCsvArchive::serialize(bool t, const char* tag)
  203. {
  204. printCommaUnlessFirst();
  205. const char *sval = t ? "T" : "F";
  206. stream.write(sval,1);
  207. }
  208. void hadoop::OCsvArchive::serialize(int32_t t, const char* tag)
  209. {
  210. printCommaUnlessFirst();
  211. char sval[128];
  212. sprintf(sval, "%d", t);
  213. stream.write(sval, strlen(sval));
  214. }
  215. void hadoop::OCsvArchive::serialize(int64_t t, const char* tag)
  216. {
  217. printCommaUnlessFirst();
  218. char sval[128];
  219. sprintf(sval, "%lld", t);
  220. stream.write(sval, strlen(sval));
  221. }
  222. void hadoop::OCsvArchive::serialize(float t, const char* tag)
  223. {
  224. printCommaUnlessFirst();
  225. char sval[128];
  226. sprintf(sval, "%f", t);
  227. stream.write(sval, strlen(sval));
  228. }
  229. void hadoop::OCsvArchive::serialize(double t, const char* tag)
  230. {
  231. printCommaUnlessFirst();
  232. char sval[128];
  233. sprintf(sval, "%lf", t);
  234. stream.write(sval, strlen(sval));
  235. }
  236. void hadoop::OCsvArchive::serialize(const std::string& t, const char* tag)
  237. {
  238. printCommaUnlessFirst();
  239. stream.write("'",1);
  240. int len = t.length();
  241. for (int idx = 0; idx < len; idx++) {
  242. char c = t[idx];
  243. switch(c) {
  244. case '\0':
  245. stream.write("%00",3);
  246. break;
  247. case 0x0A:
  248. stream.write("%0A",3);
  249. break;
  250. case 0x0D:
  251. stream.write("%0D",3);
  252. break;
  253. case 0x25:
  254. stream.write("%25",3);
  255. break;
  256. case 0x2C:
  257. stream.write("%2C",3);
  258. break;
  259. case 0x7D:
  260. stream.write("%7D",3);
  261. break;
  262. default:
  263. stream.write(&c,1);
  264. break;
  265. }
  266. }
  267. }
  268. void hadoop::OCsvArchive::serialize(const std::string& t, size_t len, const char* tag)
  269. {
  270. printCommaUnlessFirst();
  271. stream.write("#",1);
  272. for(int idx = 0; idx < len; idx++) {
  273. uint8_t b = t[idx];
  274. char sval[3];
  275. sprintf(sval,"%2x",b);
  276. stream.write(sval, 2);
  277. }
  278. }
  279. void hadoop::OCsvArchive::startRecord(const Record& s, const char* tag)
  280. {
  281. printCommaUnlessFirst();
  282. if (tag != NULL && strlen(tag) != 0) {
  283. stream.write("s{",2);
  284. }
  285. isFirst = true;
  286. }
  287. void hadoop::OCsvArchive::endRecord(const Record& s, const char* tag)
  288. {
  289. if (tag == NULL || strlen(tag) == 0) {
  290. stream.write("\n",1);
  291. isFirst = true;
  292. } else {
  293. stream.write("}",1);
  294. isFirst = false;
  295. }
  296. }
  297. void hadoop::OCsvArchive::startVector(size_t len, const char* tag)
  298. {
  299. printCommaUnlessFirst();
  300. stream.write("v{",2);
  301. isFirst = true;
  302. }
  303. void hadoop::OCsvArchive::endVector(size_t len, const char* tag)
  304. {
  305. stream.write("}",1);
  306. isFirst = false;
  307. }
  308. void hadoop::OCsvArchive::startMap(size_t len, const char* tag)
  309. {
  310. printCommaUnlessFirst();
  311. stream.write("m{",2);
  312. isFirst = true;
  313. }
  314. void hadoop::OCsvArchive::endMap(size_t len, const char* tag)
  315. {
  316. stream.write("}",1);
  317. isFirst = false;
  318. }
// Empty destructor: no cleanup is performed here (nothing in this file
// shows this class owning the output stream it writes to).
hadoop::OCsvArchive::~OCsvArchive()
{
}