11 years ago · 80705e034b
--- a/dev-support/determine-flaky-tests-hadoop.py
+++ b/dev-support/determine-flaky-tests-hadoop.py
@@ -0,0 +1,204 @@
 
				+#!/usr/bin/env python
			
 
				+#
			
 
				+# Licensed to the Apache Software Foundation (ASF) under one
			
 
				+# or more contributor license agreements.  See the NOTICE file
			
 
				+# distributed with this work for additional information
			
 
				+# regarding copyright ownership.  The ASF licenses this file
			
 
				+# to you under the Apache License, Version 2.0 (the
			
 
				+# "License"); you may not use this file except in compliance
			
 
				+# with the License.  You may obtain a copy of the License at
			
 
				+#
			
 
				+#     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+#
			
 
				+# Unless required by applicable law or agreed to in writing, software
			
 
				+# distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+# See the License for the specific language governing permissions and
			
 
				+# limitations under the License.
			
 
				+#
			
 
				+# Given a jenkins test job, this script examines all runs of the job done
			
 
				+# within a specified period of time (number of days prior to the execution
			
 
				+# time of this script), and reports all failed tests.
			
 
				+#
			
 
				+# The output of this script includes a section for each run that has failed
			
 
				+# tests, with each failed test name listed.
			
 
				+#
			
 
				+# More importantly, at the end, it outputs a summary section that lists all
			
 
				+# tests that failed within the examined runs, shows in how many runs each test
			
 
				+# failed, and sorts the failed tests by the number of runs each one failed in.
			
 
				+#
			
 
				+# This way, when we see failed tests in a PreCommit build, we can quickly tell
			
 
				+# whether a failed test is a new failure, or it failed before and how often it
			
 
				+# failed, which gives an idea of whether it may just be a flaky test.
			
 
				+#
			
 
				+# Of course, to be 100% sure about the reason for a test failure, a closer look
			
 
				+# at the failed test for the specific run is necessary.
			
 
				+#
			
 
				+import sys
			
 
				+import platform
			
 
				+sysversion = sys.hexversion
			
 
				+onward30 = False
			
 
				+if sysversion < 0x020600F0:
			
 
				+  sys.exit("Minimum supported python version is 2.6, the current version is " +
			
 
				+      "Python" + platform.python_version())
			
 
				+
			
 
				+if sysversion == 0x030000F0:
			
 
				+  sys.exit("There is a known bug with Python" + platform.python_version() +
			
 
				+      ", please try a different version");
			
 
				+
			
 
				+if sysversion < 0x03000000:
			
 
				+  import urllib2
			
 
				+else:
			
 
				+  onward30 = True
			
 
				+  import urllib.request
			
 
				+
			
 
				+import datetime
			
 
				+import json as simplejson
			
 
				+import logging
			
 
				+from optparse import OptionParser
			
 
				+import time
			
 
				+
			
 
				+# Configuration
			
 
				+DEFAULT_JENKINS_URL = "https://builds.apache.org"
			
 
				+DEFAULT_JOB_NAME = "Hadoop-Common-trunk"
			
 
				+DEFAULT_NUM_PREVIOUS_DAYS = 14
			
 
				+
			
 
				+SECONDS_PER_DAY = 86400
			
 
				+
			
 
				+# total number of runs to examine
			
 
				+numRunsToExamine = 0
			
 
				+
			
 
				+""" Parse arguments """
			
 
				+def parse_args():
			
 
				+  parser = OptionParser()
			
 
				+  parser.add_option("-J", "--jenkins-url", type="string",
			
 
				+                    dest="jenkins_url", help="Jenkins URL",
			
 
				+                    default=DEFAULT_JENKINS_URL)
			
 
				+  parser.add_option("-j", "--job-name", type="string",
			
 
				+                    dest="job_name", help="Job name to look at",
			
 
				+                    default=DEFAULT_JOB_NAME)
			
 
				+  parser.add_option("-n", "--num-days", type="int",
			
 
				+                    dest="num_prev_days", help="Number of days to examine",
			
 
				+                    default=DEFAULT_NUM_PREVIOUS_DAYS)
			
 
				+
			
 
				+  (options, args) = parser.parse_args()
			
 
				+  if args:
			
 
				+    parser.error("unexpected arguments: " + repr(args))
			
 
				+  return options
			
 
				+
			
 
				+""" Load data from specified url """
			
 
				+def load_url_data(url):
			
 
				+  if onward30:
			
 
				+    ourl = urllib.request.urlopen(url)
			
 
				+    codec = ourl.info().get_param('charset')
			
 
				+    content = ourl.read().decode(codec)
			
 
				+    data = simplejson.loads(content)
			
 
				+  else:
			
 
				+    ourl = urllib2.urlopen(url)
			
 
				+    data = simplejson.load(ourl)
			
 
				+  return data
			
 
				+ 
			
 
				+""" List all builds of the target project. """
			
 
				+def list_builds(jenkins_url, job_name):
			
 
				+  url = "%(jenkins)s/job/%(job_name)s/api/json?tree=builds[url,result,timestamp]" % dict(
			
 
				+      jenkins=jenkins_url,
			
 
				+      job_name=job_name)
			
 
				+
			
 
				+  try:
			
 
				+    data = load_url_data(url)
			
 
				+
			
 
				+  except:
			
 
				+    logging.error("Could not fetch: %s" % url)
			
 
				+    raise
			
 
				+  return data['builds']
			
 
				+
			
 
				+""" Find the names of any tests which failed in the given build output URL. """
			
 
				+def find_failing_tests(testReportApiJson, jobConsoleOutput):
			
 
				+  ret = set()
			
 
				+  try:
			
 
				+    data = load_url_data(testReportApiJson)
			
 
				+
			
 
				+  except:
			
 
				+    logging.error("    Could not open testReport, check " +
			
 
				+        jobConsoleOutput + " for why it was reported failed")
			
 
				+    return ret
			
 
				+
			
 
				+  for suite in data['suites']:
			
 
				+    for cs in suite['cases']:
			
 
				+      status = cs['status']
			
 
				+      errDetails = cs['errorDetails']
			
 
				+      if (status == 'REGRESSION' or status == 'FAILED' or (errDetails is not None)):
			
 
				+        ret.add(cs['className'] + "." + cs['name'])
			
 
				+
			
 
				+  if len(ret) == 0:
			
 
				+    logging.info("    No failed tests in testReport, check " +
			
 
				+        jobConsoleOutput + " for why it was reported failed.")
			
 
				+  return ret
			
 
				+
			
 
				+""" Iterate runs of specified job within num_prev_days and collect results """
			
 
				+def find_flaky_tests(jenkins_url, job_name, num_prev_days):
			
 
				+  global numRunsToExamine
			
 
				+  all_failing = dict()
			
 
				+  # First list all builds
			
 
				+  builds = list_builds(jenkins_url, job_name)
			
 
				+
			
 
				+  # Select only those in the last N days
			
 
				+  min_time = int(time.time()) - SECONDS_PER_DAY * num_prev_days
			
 
				+  builds = [b for b in builds if (int(b['timestamp']) / 1000) > min_time]
			
 
				+
			
 
				+  # Filter out only those that failed
			
 
				+  failing_build_urls = [(b['url'] , b['timestamp']) for b in builds
			
 
				+      if (b['result'] in ('UNSTABLE', 'FAILURE'))]
			
 
				+
			
 
				+  tnum = len(builds)
			
 
				+  num = len(failing_build_urls)
			
 
				+  numRunsToExamine = tnum
			
 
				+  logging.info("    THERE ARE " + str(num) + " builds (out of " + str(tnum)
			
 
				+      + ") that have failed tests in the past " + str(num_prev_days) + " days"
			
 
				+      + ((".", ", as listed below:\n")[num > 0]))
			
 
				+
			
 
				+  for failed_build_with_time in failing_build_urls:
			
 
				+    failed_build = failed_build_with_time[0];
			
 
				+    jobConsoleOutput = failed_build + "Console";
			
 
				+    testReport = failed_build + "testReport";
			
 
				+    testReportApiJson = testReport + "/api/json";
			
 
				+
			
 
				+    ts = float(failed_build_with_time[1]) / 1000.
			
 
				+    st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
			
 
				+    logging.info("===>%s" % str(testReport) + " (" + st + ")")
			
 
				+    failing = find_failing_tests(testReportApiJson, jobConsoleOutput)
			
 
				+    if failing:
			
 
				+      for ftest in failing:
			
 
				+        logging.info("    Failed test: %s" % ftest)
			
 
				+        all_failing[ftest] = all_failing.get(ftest,0)+1
			
 
				+
			
 
				+  return all_failing
			
 
				+
			
 
				+def main():
			
 
				+  global numRunsToExamine
			
 
				+  logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)
			
 
				+
			
 
				+  # set up logger to write to stdout
			
 
				+  soh = logging.StreamHandler(sys.stdout)
			
 
				+  soh.setLevel(logging.INFO)
			
 
				+  logger = logging.getLogger()
			
 
				+  logger.removeHandler(logger.handlers[0])
			
 
				+  logger.addHandler(soh)
			
 
				+
			
 
				+  opts = parse_args()
			
 
				+  logging.info("****Recently FAILED builds in url: " + opts.jenkins_url
			
 
				+      + "/job/" + opts.job_name + "")
			
 
				+
			
 
				+  all_failing = find_flaky_tests(opts.jenkins_url, opts.job_name,
			
 
				+      opts.num_prev_days)
			
 
				+  if len(all_failing) == 0:
			
 
				+    raise SystemExit(0)
			
 
				+  logging.info("\nAmong " + str(numRunsToExamine) + " runs examined, all failed "
			
 
				+      + "tests <#failedRuns: testName>:")
			
 
				+
			
 
				+  # print summary section: all failed tests sorted by how many times they failed
			
 
				+  for tn in sorted(all_failing, key=all_failing.get, reverse=True):
			
 
				+    logging.info("    " + str(all_failing[tn])+ ": " + tn)
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+  main()
			
--- a/hadoop-common-project/hadoop-common/CHANGES.txt
+++ b/hadoop-common-project/hadoop-common/CHANGES.txt
@@ -386,6 +386,9 @@ Release 2.7.0 - UNRELEASED
 
				     HADOOP-11490. Expose truncate API via FileSystem and shell command.
			
 
				     (Milan Desai via shv)
			
 
				 
			
 
				+    HADOOP-11045. Introducing a tool to detect flaky tests of hadoop jenkins testing
			
 
				+    job. (Yongjun Zhang and Todd Lipcon via ozawa)
			
 
				+
			
 
				   IMPROVEMENTS
			
 
				 
			
 
				     HADOOP-11483. HardLink.java should use the jdk7 createLink method (aajisaka)