determine-flaky-tests-hadoop.py 8.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245
  1. #!/usr/bin/env python
  2. #
  3. # Licensed to the Apache Software Foundation (ASF) under one
  4. # or more contributor license agreements. See the NOTICE file
  5. # distributed with this work for additional information
  6. # regarding copyright ownership. The ASF licenses this file
  7. # to you under the Apache License, Version 2.0 (the
  8. # "License"); you may not use this file except in compliance
  9. # with the License. You may obtain a copy of the License at
  10. #
  11. # http://www.apache.org/licenses/LICENSE-2.0
  12. #
  13. # Unless required by applicable law or agreed to in writing, software
  14. # distributed under the License is distributed on an "AS IS" BASIS,
  15. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16. # See the License for the specific language governing permissions and
  17. # limitations under the License.
  18. #
  19. # Given a jenkins test job, this script examines all runs of the job done
  20. # within specified period of time (number of days prior to the execution
  21. # time of this script), and reports all failed tests.
  22. #
  23. # The output of this script includes a section for each run that has failed
  24. # tests, with each failed test name listed.
  25. #
  26. # More importantly, at the end, it outputs a summary section that lists all
  27. # failed tests within all examined runs, indicates in how many runs each test
  28. # failed, and sorts the failed tests by that number of runs.
  29. #
  30. # This way, when we see failed tests in a PreCommit build, we can quickly tell
  31. # whether a failed test is a new failure, or whether it has failed before and
  32. # how often, to get an idea of whether it may just be a flaky test.
  33. #
  34. # Of course, to be 100% sure about the reason of a test failure, closer look
  35. # at the failed test for the specific run is necessary.
  36. #
  37. import sys
  38. import platform
  39. sysversion = sys.hexversion
  40. onward30 = False
  41. if sysversion < 0x020600F0:
  42. sys.exit("Minimum supported python version is 2.6, the current version is " +
  43. "Python" + platform.python_version())
  44. if sysversion == 0x030000F0:
  45. sys.exit("There is a known bug with Python" + platform.python_version() +
  46. ", please try a different version");
  47. if sysversion < 0x03000000:
  48. import urllib2
  49. else:
  50. onward30 = True
  51. import urllib.request
  52. import datetime
  53. import json as simplejson
  54. import logging
  55. from optparse import OptionParser
  56. import time
  57. # Configuration
  58. DEFAULT_JENKINS_URL = "https://builds.apache.org"
  59. DEFAULT_JOB_NAME = "Hadoop-Common-trunk"
  60. DEFAULT_NUM_PREVIOUS_DAYS = 14
  61. DEFAULT_TOP_NUM_FAILED_TEST = -1
  62. SECONDS_PER_DAY = 86400
  63. # total number of runs to examine
  64. numRunsToExamine = 0
  65. #summary mode
  66. summary_mode = False
  67. #total number of errors
  68. error_count = 0
  69. """ Parse arguments """
  70. def parse_args():
  71. parser = OptionParser()
  72. parser.add_option("-J", "--jenkins-url", type="string",
  73. dest="jenkins_url", help="Jenkins URL",
  74. default=DEFAULT_JENKINS_URL)
  75. parser.add_option("-j", "--job-name", type="string",
  76. dest="job_name", help="Job name to look at",
  77. default=DEFAULT_JOB_NAME)
  78. parser.add_option("-n", "--num-days", type="int",
  79. dest="num_prev_days", help="Number of days to examine",
  80. default=DEFAULT_NUM_PREVIOUS_DAYS)
  81. parser.add_option("-t", "--top", type="int",
  82. dest="num_failed_tests",
  83. help="Summary Mode, only show top number of failed tests",
  84. default=DEFAULT_TOP_NUM_FAILED_TEST)
  85. (options, args) = parser.parse_args()
  86. if args:
  87. parser.error("unexpected arguments: " + repr(args))
  88. return options
  89. """ Load data from specified url """
  90. def load_url_data(url):
  91. if onward30:
  92. ourl = urllib.request.urlopen(url)
  93. codec = ourl.info().get_param('charset')
  94. content = ourl.read().decode(codec)
  95. data = simplejson.loads(content, strict=False)
  96. else:
  97. ourl = urllib2.urlopen(url)
  98. data = simplejson.load(ourl, strict=False)
  99. return data
  100. """ List all builds of the target project. """
  101. def list_builds(jenkins_url, job_name):
  102. global summary_mode
  103. url = "%(jenkins)s/job/%(job_name)s/api/json?tree=builds[url,result,timestamp]" % dict(
  104. jenkins=jenkins_url,
  105. job_name=job_name)
  106. try:
  107. data = load_url_data(url)
  108. except:
  109. if not summary_mode:
  110. logging.error("Could not fetch: %s" % url)
  111. error_count += 1
  112. raise
  113. return data['builds']
  114. """ Find the names of any tests which failed in the given build output URL. """
  115. def find_failing_tests(testReportApiJson, jobConsoleOutput):
  116. global summary_mode
  117. global error_count
  118. ret = set()
  119. try:
  120. data = load_url_data(testReportApiJson)
  121. except:
  122. if not summary_mode:
  123. logging.error(" Could not open testReport, check " +
  124. jobConsoleOutput + " for why it was reported failed")
  125. error_count += 1
  126. return ret
  127. for suite in data['suites']:
  128. for cs in suite['cases']:
  129. status = cs['status']
  130. errDetails = cs['errorDetails']
  131. if (status == 'REGRESSION' or status == 'FAILED' or (errDetails is not None)):
  132. ret.add(cs['className'] + "." + cs['name'])
  133. if len(ret) == 0 and (not summary_mode):
  134. logging.info(" No failed tests in testReport, check " +
  135. jobConsoleOutput + " for why it was reported failed.")
  136. return ret
  137. """ Iterate runs of specfied job within num_prev_days and collect results """
  138. def find_flaky_tests(jenkins_url, job_name, num_prev_days):
  139. global numRunsToExamine
  140. global summary_mode
  141. all_failing = dict()
  142. # First list all builds
  143. builds = list_builds(jenkins_url, job_name)
  144. # Select only those in the last N days
  145. min_time = int(time.time()) - SECONDS_PER_DAY * num_prev_days
  146. builds = [b for b in builds if (int(b['timestamp']) / 1000) > min_time]
  147. # Filter out only those that failed
  148. failing_build_urls = [(b['url'] , b['timestamp']) for b in builds
  149. if (b['result'] in ('UNSTABLE', 'FAILURE'))]
  150. tnum = len(builds)
  151. num = len(failing_build_urls)
  152. numRunsToExamine = tnum
  153. if not summary_mode:
  154. logging.info(" THERE ARE " + str(num) + " builds (out of " + str(tnum)
  155. + ") that have failed tests in the past " + str(num_prev_days) + " days"
  156. + ((".", ", as listed below:\n")[num > 0]))
  157. for failed_build_with_time in failing_build_urls:
  158. failed_build = failed_build_with_time[0];
  159. jobConsoleOutput = failed_build + "Console";
  160. testReport = failed_build + "testReport";
  161. testReportApiJson = testReport + "/api/json";
  162. ts = float(failed_build_with_time[1]) / 1000.
  163. st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
  164. if not summary_mode:
  165. logging.info("===>%s" % str(testReport) + " (" + st + ")")
  166. failing = find_failing_tests(testReportApiJson, jobConsoleOutput)
  167. if failing:
  168. for ftest in failing:
  169. if not summary_mode:
  170. logging.info(" Failed test: %s" % ftest)
  171. all_failing[ftest] = all_failing.get(ftest,0)+1
  172. return all_failing
  173. def main():
  174. global numRunsToExamine
  175. global summary_mode
  176. logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)
  177. # set up logger to write to stdout
  178. soh = logging.StreamHandler(sys.stdout)
  179. soh.setLevel(logging.INFO)
  180. logger = logging.getLogger()
  181. logger.removeHandler(logger.handlers[0])
  182. logger.addHandler(soh)
  183. opts = parse_args()
  184. logging.info("****Recently FAILED builds in url: " + opts.jenkins_url
  185. + "/job/" + opts.job_name + "")
  186. if opts.num_failed_tests != -1:
  187. summary_mode = True
  188. all_failing = find_flaky_tests(opts.jenkins_url, opts.job_name,
  189. opts.num_prev_days)
  190. if len(all_failing) == 0:
  191. raise SystemExit(0)
  192. if summary_mode and opts.num_failed_tests < len(all_failing):
  193. logging.info("\nAmong " + str(numRunsToExamine) +
  194. " runs examined, top " + str(opts.num_failed_tests) +
  195. " failed tests <#failedRuns: testName>:")
  196. else:
  197. logging.info("\nAmong " + str(numRunsToExamine) +
  198. " runs examined, all failed tests <#failedRuns: testName>:")
  199. # print summary section: all failed tests sorted by how many times they failed
  200. line_count = 0
  201. for tn in sorted(all_failing, key=all_failing.get, reverse=True):
  202. logging.info(" " + str(all_failing[tn])+ ": " + tn)
  203. if summary_mode:
  204. line_count += 1
  205. if line_count == opts.num_failed_tests:
  206. break
  207. if summary_mode and error_count > 0:
  208. logging.info("\n" + str(error_count) + " errors found, you may "
  209. + "re-run in non summary mode to see error details.");
  210. if __name__ == "__main__":
  211. main()