#!/usr/bin/env bash
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This script is used for fetching the standard Hadoop metrics which the
# Dynamometer NameNode generates during its execution. These metrics are
# uploaded onto HDFS when the Dynamometer application completes. This script
# will download them locally and parse out the specified metric for the given
# time period. This is useful to, for example, isolate only the metrics
# produced during the workload replay portion of a job. For this, specify
# startTimeMs as the start time of the workload job (which it logs during
# execution) and periodMinutes as the period (in minutes) of the replay.
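
# Example invocation (the application ID and timestamp shown are hypothetical;
# substitute the values from your own Dynamometer run):
#
#   ./parse-metrics.sh application_1234567890123_0001 files-created.csv \
#       1583360400000 30 FilesCreated NameNodeActivity true
#
# This would write CSV pairs of (seconds_since_start,per_second_rate) for the
# FilesCreated counter over the 30 minutes following startTimeMs.
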
if [ $# -lt 5 ]; then
  echo "Usage:"
  echo "./parse-metrics.sh applicationID outputFileName startTimeMs periodMinutes metricName [ context ] [ isCounter ]"
  echo "If no file namenode_metrics_{applicationID} is present in the working directory,"
  echo "attempts to download one from HDFS for applicationID. Filters values"
  echo "for the specified metric during the range"
  echo "(startTimeMs, startTimeMs + periodMinutes), optionally filtering on the context as well"
  echo "(which is just applied as a regex search across the metric line output),"
  echo "and outputs CSV pairs of (seconds_since_start_time,value)."
  echo "If isCounter is true, treats the metric as a counter and outputs per-second rate values."
  exit 1
fi

appId="$1"
output="$2"
start_ts="$3"
period_minutes="$4"
metric="$5"
context="$6"
is_counter="$7"

localFile="namenode_metrics_$appId"
if [ ! -f "$localFile" ]; then
  remoteFile=".dynamometer/$appId/namenode_metrics"
  echo "Downloading file from HDFS: $remoteFile"
  if ! hdfs dfs -copyToLocal "$remoteFile" "$localFile"; then
    echo "Unable to download metrics file from HDFS; exiting" >&2
    exit 1
  fi
fi
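
# If the download fails, the metrics file may not have been uploaded for this
# run. A quick way to check what the application staged on HDFS, for example:
#
#   hdfs dfs -ls ".dynamometer/$appId"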

# The awk script below scans each metrics line, whose first field is a
# millisecond timestamp, for an occurrence of " <metric>=<value>".
read -d '' -r awk_script <<'EOF'
BEGIN {
  # Match " metric=value" where value may use scientific notation (e.g. 1.5E7)
  metric_regex="[[:space:]]"metric"=([[:digit:].E]+)";
  end_ts=start_ts+(period_minutes*60*1000)
  last_val=0
  last_ts=start_ts
}
# Before the window starts, keep tracking the most recent value so the first
# in-window rate for a counter has a baseline to diff against.
is_counter == "true" && $0 ~ metric_regex && $0 ~ context && $1 < start_ts {
  match($0, metric_regex, val_arr);
  last_val=val_arr[1]
  last_ts=$1
}
$0 ~ metric_regex && $0 ~ context && $1 >= start_ts && $1 <= end_ts {
  # gawk's three-argument match() captures the value into val_arr[1]
  match($0, metric_regex, val_arr);
  val=val_arr[1]
  if (is_counter == "true") {
    # Convert the monotonic counter into a per-second rate over the interval
    # since the previous sample.
    tmp=val
    val=val-last_val
    val=val/(($1-last_ts)/1000)
    last_ts=$1
    last_val=tmp
  }
  printf("%.0f,%.6f\n", ($1-start_ts)/1000, val)
}
EOF
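
# As an illustration (the exact line format is an assumption based on the
# standard Hadoop file metrics sink; fields vary with configuration), an input
# line such as
#
#   1583360405000 namenode.NameNodeActivity: Context=dfs, ..., FilesCreated=42
#
# falling 5 seconds into the window would yield the CSV row "5,42.000000", or
# a per-second rate in place of the raw value when isCounter is true.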

gawk -v metric="$metric" -v context="$context" -v start_ts="$start_ts" \
  -v period_minutes="$period_minutes" -v is_counter="$is_counter" -v OFS="," \
  "$awk_script" "$localFile" > "$output"