csvarchive.cc 8.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368
  1. /**
  2. * Licensed to the Apache Software Foundation (ASF) under one
  3. * or more contributor license agreements. See the NOTICE file
  4. * distributed with this work for additional information
  5. * regarding copyright ownership. The ASF licenses this file
  6. * to you under the Apache License, Version 2.0 (the
  7. * "License"); you may not use this file except in compliance
  8. * with the License. You may obtain a copy of the License at
  9. *
  10. * http://www.apache.org/licenses/LICENSE-2.0
  11. *
  12. * Unless required by applicable law or agreed to in writing, software
  13. * distributed under the License is distributed on an "AS IS" BASIS,
  14. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. * See the License for the specific language governing permissions and
  16. * limitations under the License.
  17. */
  18. #include "csvarchive.hh"
  19. #include <stdlib.h>
  20. using namespace hadoop;
  21. static std::string readUptoTerminator(PushBackInStream& stream)
  22. {
  23. std::string s;
  24. while (1) {
  25. char c;
  26. if (1 != stream.read(&c, 1)) {
  27. throw new IOException("Error in deserialization.");
  28. }
  29. if (c == ',' || c == '\n' || c == '}') {
  30. if (c != ',') {
  31. stream.pushBack(c);
  32. }
  33. break;
  34. }
  35. s.push_back(c);
  36. }
  37. return s;
  38. }
  39. void hadoop::ICsvArchive::deserialize(int8_t& t, const char* tag)
  40. {
  41. std::string s = readUptoTerminator(stream);
  42. t = (int8_t) strtol(s.c_str(), NULL, 10);
  43. }
  44. void hadoop::ICsvArchive::deserialize(bool& t, const char* tag)
  45. {
  46. std::string s = readUptoTerminator(stream);
  47. t = (s == "T") ? true : false;
  48. }
  49. void hadoop::ICsvArchive::deserialize(int32_t& t, const char* tag)
  50. {
  51. std::string s = readUptoTerminator(stream);
  52. t = strtol(s.c_str(), NULL, 10);
  53. }
  54. void hadoop::ICsvArchive::deserialize(int64_t& t, const char* tag)
  55. {
  56. std::string s = readUptoTerminator(stream);
  57. t = strtoll(s.c_str(), NULL, 10);
  58. }
  59. void hadoop::ICsvArchive::deserialize(float& t, const char* tag)
  60. {
  61. std::string s = readUptoTerminator(stream);
  62. t = strtof(s.c_str(), NULL);
  63. }
  64. void hadoop::ICsvArchive::deserialize(double& t, const char* tag)
  65. {
  66. std::string s = readUptoTerminator(stream);
  67. t = strtod(s.c_str(), NULL);
  68. }
  69. void hadoop::ICsvArchive::deserialize(std::string& t, const char* tag)
  70. {
  71. std::string temp = readUptoTerminator(stream);
  72. if (temp[0] != '\'') {
  73. throw new IOException("Errror deserializing string.");
  74. }
  75. t.clear();
  76. // skip first character, replace escaped characters
  77. int len = temp.length();
  78. for (int i = 1; i < len; i++) {
  79. char c = temp.at(i);
  80. if (c == '%') {
  81. // since we escape '%', there have to be at least two chars following a '%'
  82. char ch1 = temp.at(i+1);
  83. char ch2 = temp.at(i+2);
  84. i += 2;
  85. if (ch1 == '0' && ch2 == '0') {
  86. t.append(1, '\0');
  87. } else if (ch1 == '0' && ch2 == 'A') {
  88. t.append(1, '\n');
  89. } else if (ch1 == '0' && ch2 == 'D') {
  90. t.append(1, '\r');
  91. } else if (ch1 == '2' && ch2 == 'C') {
  92. t.append(1, ',');
  93. } else if (ch1 == '7' && ch2 == 'D') {
  94. t.append(1, '}');
  95. } else if (ch1 == '2' && ch2 == '5') {
  96. t.append(1, '%');
  97. } else {
  98. throw new IOException("Error deserializing string.");
  99. }
  100. }
  101. else {
  102. t.append(1, c);
  103. }
  104. }
  105. }
  106. void hadoop::ICsvArchive::deserialize(std::string& t, size_t& len, const char* tag)
  107. {
  108. std::string s = readUptoTerminator(stream);
  109. if (s[0] != '#') {
  110. throw new IOException("Errror deserializing buffer.");
  111. }
  112. s.erase(0, 1); /// erase first character
  113. len = s.length();
  114. if (len%2 == 1) { // len is guaranteed to be even
  115. throw new IOException("Errror deserializing buffer.");
  116. }
  117. len = len >> 1;
  118. for (size_t idx = 0; idx < len; idx++) {
  119. char buf[3];
  120. buf[0] = s[2*idx];
  121. buf[1] = s[2*idx+1];
  122. buf[2] = '\0';
  123. int i;
  124. if (1 != sscanf(buf, "%2x", &i)) {
  125. throw new IOException("Errror deserializing buffer.");
  126. }
  127. t.push_back((char) i);
  128. }
  129. len = t.length();
  130. }
  131. void hadoop::ICsvArchive::startRecord(Record& s, const char* tag)
  132. {
  133. if (tag != NULL) {
  134. char mark[2];
  135. if (2 != stream.read(mark, 2)) {
  136. throw new IOException("Error deserializing record.");
  137. }
  138. if (mark[0] != 's' || mark[1] != '{') {
  139. throw new IOException("Error deserializing record.");
  140. }
  141. }
  142. }
  143. void hadoop::ICsvArchive::endRecord(Record& s, const char* tag)
  144. {
  145. char mark;
  146. if (1 != stream.read(&mark, 1)) {
  147. throw new IOException("Error deserializing record.");
  148. }
  149. if (tag == NULL) {
  150. if (mark != '\n') {
  151. throw new IOException("Error deserializing record.");
  152. }
  153. } else if (mark != '}') {
  154. throw new IOException("Error deserializing record.");
  155. } else {
  156. readUptoTerminator(stream);
  157. }
  158. }
  159. Index* hadoop::ICsvArchive::startVector(const char* tag)
  160. {
  161. char mark[2];
  162. if (2 != stream.read(mark, 2)) {
  163. throw new IOException("Error deserializing vector.");
  164. }
  165. if (mark[0] != 'v' || mark[1] != '{') {
  166. throw new IOException("Error deserializing vector.");
  167. }
  168. return new CsvIndex(stream);
  169. }
  170. void hadoop::ICsvArchive::endVector(Index* idx, const char* tag)
  171. {
  172. delete idx;
  173. char mark;
  174. if (1 != stream.read(&mark, 1)) {
  175. throw new IOException("Error deserializing vector.");
  176. }
  177. if (mark != '}') {
  178. throw new IOException("Error deserializing vector.");
  179. }
  180. readUptoTerminator(stream);
  181. }
  182. Index* hadoop::ICsvArchive::startMap(const char* tag)
  183. {
  184. char mark[2];
  185. if (2 != stream.read(mark, 2)) {
  186. throw new IOException("Error deserializing map.");
  187. }
  188. if (mark[0] != 'm' || mark[1] != '{') {
  189. throw new IOException("Error deserializing map.");
  190. }
  191. return new CsvIndex(stream);
  192. }
  193. void hadoop::ICsvArchive::endMap(Index* idx, const char* tag)
  194. {
  195. delete idx;
  196. char mark;
  197. if (1 != stream.read(&mark, 1)) {
  198. throw new IOException("Error deserializing map.");
  199. }
  200. if (mark != '}') {
  201. throw new IOException("Error deserializing map.");
  202. }
  203. readUptoTerminator(stream);
  204. }
// Trivial destructor: nothing is released here explicitly.
hadoop::ICsvArchive::~ICsvArchive()
{
}
  208. void hadoop::OCsvArchive::serialize(int8_t t, const char* tag)
  209. {
  210. printCommaUnlessFirst();
  211. char sval[5];
  212. sprintf(sval, "%d", t);
  213. stream.write(sval, strlen(sval));
  214. }
  215. void hadoop::OCsvArchive::serialize(bool t, const char* tag)
  216. {
  217. printCommaUnlessFirst();
  218. const char *sval = t ? "T" : "F";
  219. stream.write(sval,1);
  220. }
  221. void hadoop::OCsvArchive::serialize(int32_t t, const char* tag)
  222. {
  223. printCommaUnlessFirst();
  224. char sval[128];
  225. sprintf(sval, "%d", t);
  226. stream.write(sval, strlen(sval));
  227. }
  228. void hadoop::OCsvArchive::serialize(int64_t t, const char* tag)
  229. {
  230. printCommaUnlessFirst();
  231. char sval[128];
  232. sprintf(sval, "%lld", t);
  233. stream.write(sval, strlen(sval));
  234. }
  235. void hadoop::OCsvArchive::serialize(float t, const char* tag)
  236. {
  237. printCommaUnlessFirst();
  238. char sval[128];
  239. sprintf(sval, "%f", t);
  240. stream.write(sval, strlen(sval));
  241. }
  242. void hadoop::OCsvArchive::serialize(double t, const char* tag)
  243. {
  244. printCommaUnlessFirst();
  245. char sval[128];
  246. sprintf(sval, "%lf", t);
  247. stream.write(sval, strlen(sval));
  248. }
  249. void hadoop::OCsvArchive::serialize(const std::string& t, const char* tag)
  250. {
  251. printCommaUnlessFirst();
  252. stream.write("'",1);
  253. int len = t.length();
  254. for (int idx = 0; idx < len; idx++) {
  255. char c = t[idx];
  256. switch(c) {
  257. case '\0':
  258. stream.write("%00",3);
  259. break;
  260. case 0x0A:
  261. stream.write("%0A",3);
  262. break;
  263. case 0x0D:
  264. stream.write("%0D",3);
  265. break;
  266. case 0x25:
  267. stream.write("%25",3);
  268. break;
  269. case 0x2C:
  270. stream.write("%2C",3);
  271. break;
  272. case 0x7D:
  273. stream.write("%7D",3);
  274. break;
  275. default:
  276. stream.write(&c,1);
  277. break;
  278. }
  279. }
  280. }
  281. void hadoop::OCsvArchive::serialize(const std::string& t, size_t len, const char* tag)
  282. {
  283. printCommaUnlessFirst();
  284. stream.write("#",1);
  285. for(size_t idx = 0; idx < len; idx++) {
  286. uint8_t b = t[idx];
  287. char sval[3];
  288. sprintf(sval,"%2x",b);
  289. stream.write(sval, 2);
  290. }
  291. }
  292. void hadoop::OCsvArchive::startRecord(const Record& s, const char* tag)
  293. {
  294. printCommaUnlessFirst();
  295. if (tag != NULL && strlen(tag) != 0) {
  296. stream.write("s{",2);
  297. }
  298. isFirst = true;
  299. }
  300. void hadoop::OCsvArchive::endRecord(const Record& s, const char* tag)
  301. {
  302. if (tag == NULL || strlen(tag) == 0) {
  303. stream.write("\n",1);
  304. isFirst = true;
  305. } else {
  306. stream.write("}",1);
  307. isFirst = false;
  308. }
  309. }
  310. void hadoop::OCsvArchive::startVector(size_t len, const char* tag)
  311. {
  312. printCommaUnlessFirst();
  313. stream.write("v{",2);
  314. isFirst = true;
  315. }
  316. void hadoop::OCsvArchive::endVector(size_t len, const char* tag)
  317. {
  318. stream.write("}",1);
  319. isFirst = false;
  320. }
  321. void hadoop::OCsvArchive::startMap(size_t len, const char* tag)
  322. {
  323. printCommaUnlessFirst();
  324. stream.write("m{",2);
  325. isFirst = true;
  326. }
  327. void hadoop::OCsvArchive::endMap(size_t len, const char* tag)
  328. {
  329. stream.write("}",1);
  330. isFirst = false;
  331. }
// Trivial destructor: nothing is released here explicitly.
hadoop::OCsvArchive::~OCsvArchive()
{
}