فهرست منبع

HADOOP-11045. Introducing a tool to detect flaky tests of hadoop jenkins testing job. Contributed by Yongjun Zhang and Todd Lipcon.

Tsuyoshi Ozawa 10 سال پیش
والد
کامیت
80705e034b
2 فایل‌های تغییر یافته به همراه 207 افزوده شده و 0 حذف شده
  1. 204 0
      dev-support/determine-flaky-tests-hadoop.py
  2. 3 0
      hadoop-common-project/hadoop-common/CHANGES.txt

+ 204 - 0
dev-support/determine-flaky-tests-hadoop.py

@@ -0,0 +1,204 @@
+#!/usr/bin/env python
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Given a Jenkins test job, this script examines all runs of the job done
+# within a specified period of time (a number of days prior to the execution
+# time of this script), and reports all failed tests.
+#
+# The output of this script includes a section for each run that has failed
+# tests, with each failed test name listed.
+#
+# More importantly, at the end, it outputs a summary section that lists all
+# failed tests across the examined runs, shows in how many runs each test
+# failed, and sorts the tests by that count (most frequent failures first).
+#
+# This way, when we see failed tests in a PreCommit build, we can quickly tell
+# whether a failed test is a new failure or has failed before, and how often,
+# which gives an idea of whether it may just be a flaky test.
+#
+# Of course, to be 100% sure about the reason for a test failure, a closer look
+# at the failed test in the specific run is necessary.
+#
+import sys
+import platform
+sysversion = sys.hexversion
+onward30 = False
+if sysversion < 0x020600F0:
+  sys.exit("Minimum supported python version is 2.6, the current version is " +
+      "Python" + platform.python_version())
+
+if sysversion == 0x030000F0:
+  sys.exit("There is a known bug with Python" + platform.python_version() +
+      ", please try a different version");
+
+if sysversion < 0x03000000:
+  import urllib2
+else:
+  onward30 = True
+  import urllib.request
+
+import datetime
+import json as simplejson
+import logging
+from optparse import OptionParser
+import time
+
+# Configuration
+DEFAULT_JENKINS_URL = "https://builds.apache.org"
+DEFAULT_JOB_NAME = "Hadoop-Common-trunk"
+DEFAULT_NUM_PREVIOUS_DAYS = 14
+
+SECONDS_PER_DAY = 86400
+
+# total number of runs to examine
+numRunsToExamine = 0
+
+""" Parse arguments """
+def parse_args():
+  parser = OptionParser()
+  parser.add_option("-J", "--jenkins-url", type="string",
+                    dest="jenkins_url", help="Jenkins URL",
+                    default=DEFAULT_JENKINS_URL)
+  parser.add_option("-j", "--job-name", type="string",
+                    dest="job_name", help="Job name to look at",
+                    default=DEFAULT_JOB_NAME)
+  parser.add_option("-n", "--num-days", type="int",
+                    dest="num_prev_days", help="Number of days to examine",
+                    default=DEFAULT_NUM_PREVIOUS_DAYS)
+
+  (options, args) = parser.parse_args()
+  if args:
+    parser.error("unexpected arguments: " + repr(args))
+  return options
+
+""" Load data from specified url """
+def load_url_data(url):
+  if onward30:
+    ourl = urllib.request.urlopen(url)
+    codec = ourl.info().get_param('charset')
+    content = ourl.read().decode(codec)
+    data = simplejson.loads(content)
+  else:
+    ourl = urllib2.urlopen(url)
+    data = simplejson.load(ourl)
+  return data
+ 
+""" List all builds of the target project. """
+def list_builds(jenkins_url, job_name):
+  url = "%(jenkins)s/job/%(job_name)s/api/json?tree=builds[url,result,timestamp]" % dict(
+      jenkins=jenkins_url,
+      job_name=job_name)
+
+  try:
+    data = load_url_data(url)
+
+  except:
+    logging.error("Could not fetch: %s" % url)
+    raise
+  return data['builds']
+
+""" Find the names of any tests which failed in the given build output URL. """
+def find_failing_tests(testReportApiJson, jobConsoleOutput):
+  ret = set()
+  try:
+    data = load_url_data(testReportApiJson)
+
+  except:
+    logging.error("    Could not open testReport, check " +
+        jobConsoleOutput + " for why it was reported failed")
+    return ret
+
+  for suite in data['suites']:
+    for cs in suite['cases']:
+      status = cs['status']
+      errDetails = cs['errorDetails']
+      if (status == 'REGRESSION' or status == 'FAILED' or (errDetails is not None)):
+        ret.add(cs['className'] + "." + cs['name'])
+
+  if len(ret) == 0:
+    logging.info("    No failed tests in testReport, check " +
+        jobConsoleOutput + " for why it was reported failed.")
+  return ret
+
+""" Iterate runs of specified job within num_prev_days and collect results """
+def find_flaky_tests(jenkins_url, job_name, num_prev_days):
+  global numRunsToExamine
+  all_failing = dict()
+  # First list all builds
+  builds = list_builds(jenkins_url, job_name)
+
+  # Select only those in the last N days
+  min_time = int(time.time()) - SECONDS_PER_DAY * num_prev_days
+  builds = [b for b in builds if (int(b['timestamp']) / 1000) > min_time]
+
+  # Filter out only those that failed
+  failing_build_urls = [(b['url'] , b['timestamp']) for b in builds
+      if (b['result'] in ('UNSTABLE', 'FAILURE'))]
+
+  tnum = len(builds)
+  num = len(failing_build_urls)
+  numRunsToExamine = tnum
+  logging.info("    THERE ARE " + str(num) + " builds (out of " + str(tnum)
+      + ") that have failed tests in the past " + str(num_prev_days) + " days"
+      + ((".", ", as listed below:\n")[num > 0]))
+
+  for failed_build_with_time in failing_build_urls:
+    failed_build = failed_build_with_time[0];
+    jobConsoleOutput = failed_build + "Console";
+    testReport = failed_build + "testReport";
+    testReportApiJson = testReport + "/api/json";
+
+    ts = float(failed_build_with_time[1]) / 1000.
+    st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
+    logging.info("===>%s" % str(testReport) + " (" + st + ")")
+    failing = find_failing_tests(testReportApiJson, jobConsoleOutput)
+    if failing:
+      for ftest in failing:
+        logging.info("    Failed test: %s" % ftest)
+        all_failing[ftest] = all_failing.get(ftest,0)+1
+
+  return all_failing
+
+def main():
+  global numRunsToExamine
+  logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)
+
+  # set up logger to write to stdout
+  soh = logging.StreamHandler(sys.stdout)
+  soh.setLevel(logging.INFO)
+  logger = logging.getLogger()
+  logger.removeHandler(logger.handlers[0])
+  logger.addHandler(soh)
+
+  opts = parse_args()
+  logging.info("****Recently FAILED builds in url: " + opts.jenkins_url
+      + "/job/" + opts.job_name + "")
+
+  all_failing = find_flaky_tests(opts.jenkins_url, opts.job_name,
+      opts.num_prev_days)
+  if len(all_failing) == 0:
+    raise SystemExit(0)
+  logging.info("\nAmong " + str(numRunsToExamine) + " runs examined, all failed "
+      + "tests <#failedRuns: testName>:")
+
+  # print summary section: all failed tests sorted by how many times they failed
+  for tn in sorted(all_failing, key=all_failing.get, reverse=True):
+    logging.info("    " + str(all_failing[tn])+ ": " + tn)
+
+if __name__ == "__main__":
+  main()

+ 3 - 0
hadoop-common-project/hadoop-common/CHANGES.txt

@@ -386,6 +386,9 @@ Release 2.7.0 - UNRELEASED
     HADOOP-11490. Expose truncate API via FileSystem and shell command.
     (Milan Desai via shv)
 
+    HADOOP-11045. Introducing a tool to detect flaky tests of hadoop jenkins testing
+    job. (Yongjun Zhang and Todd Lipcon via ozawa)
+
   IMPROVEMENTS
 
     HADOOP-11483. HardLink.java should use the jdk7 createLink method (aajisaka)