Jelajahi Sumber

HDFS-4235. When outputting XML, OfflineEditsViewer can't handle some edits containing non-ASCII strings. Contributed by Colin Patrick McCabe.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1449984 13f79535-47bb-0310-9956-ffa450edef68
Aaron Myers 12 tahun lalu
induk
melakukan
7e2d98da40

+ 3 - 0
hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt

@@ -333,6 +333,9 @@ Release 2.0.4-beta - UNRELEASED
     HDFS-4482. ReplicationMonitor thread can exit with NPE due to the race 
     between delete and replication of same file. (umamahesh)
 
+    HDFS-4235. When outputting XML, OfflineEditsViewer can't handle some edits
+    containing non-ASCII strings. (Colin Patrick McCabe via atm)
+
 Release 2.0.3-alpha - 2013-02-06
 
   INCOMPATIBLE CHANGES

+ 2 - 1
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/offlineEditsViewer/OfflineEditsXmlLoader.java

@@ -26,6 +26,7 @@ import java.util.Stack;
 
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.hdfs.util.XMLUtils;
 import org.apache.hadoop.hdfs.util.XMLUtils.InvalidXmlException;
 import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp;
 import org.apache.hadoop.hdfs.server.namenode.FSEditLogOpCodes;
@@ -176,7 +177,7 @@ class OfflineEditsXmlLoader
   
   @Override
   public void endElement (String uri, String name, String qName) {
-    String str = cbuf.toString().trim();
+    String str = XMLUtils.unmangleXmlString(cbuf.toString()).trim();
     cbuf = new StringBuffer();
     switch (state) {
     case EXPECT_EDITS_TAG:

+ 137 - 1
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/util/XMLUtils.java

@@ -46,6 +46,140 @@ public class XMLUtils {
     }
   }
   
+  /**
+   * Exception that reflects a string that cannot be unmangled.
+   */
+  public static class UnmanglingError extends RuntimeException {
+    private static final long serialVersionUID = 1L;
+    
+    public UnmanglingError(String str, Exception e) {
+      super(str, e);
+    }
+    
+    public UnmanglingError(String str) {
+      super(str);
+    }
+  }
+  
+
+  /**
+   * Given a code point, determine if it should be mangled before being
+   * represented in an XML document.
+   * 
+   * Any code point that isn't valid in XML must be mangled.
+   * See http://en.wikipedia.org/wiki/Valid_characters_in_XML for a
+   * quick reference, or the w3 standard for the authoritative reference.
+   * 
+   * @param cp      The code point
+   * @return        True if the code point should be mangled
+   */
+  private static boolean codePointMustBeMangled(int cp) {
+    if (cp < 0x20) {
+      return ((cp != 0x9) && (cp != 0xa) && (cp != 0xd));
+    } else if ((0xd7ff < cp) && (cp < 0xe000)) {
+      return true;
+    } else if ((cp == 0xfffe) || (cp == 0xffff)) {
+      return true;
+    } else if (cp == 0x5c) {
+      // we mangle backslash to simplify decoding... it's
+      // easier if backslashes always begin mangled sequences. 
+      return true;
+    }
+    return false;
+  }
+
+  private static int NUM_SLASH_POSITIONS = 4;
+
+  private static String mangleCodePoint(int cp) {
+    return String.format("\\%0" + NUM_SLASH_POSITIONS + "x;", cp);
+  }
+
+  /**
+   * Mangle a string so that it can be represented in an XML document.
+   * 
+   * There are three kinds of code points in XML:
+   * - Those that can be represented normally,
+   * - Those that have to be escaped (for example, & must be represented 
+   *     as &amp;)
+   * - Those that cannot be represented at all in XML.
+   *
+   * The built-in SAX functions will handle the first two types for us just
+   * fine.  However, sometimes we come across a code point of the third type.
+   * In this case, we have to mangle the string in order to represent it at
+   * all.  We also mangle backslash to avoid confusing a backslash in the
+   * string with part our escape sequence.
+   * 
+   * The encoding used here is as follows: an illegal code point is
+   * represented as '\ABCD;', where ABCD is the hexadecimal value of 
+   * the code point.
+   *
+   * @param str     The input string.
+   *
+   * @return        The mangled string.
+   */
+  public static String mangleXmlString(String str) {
+    final StringBuilder bld = new StringBuilder();
+    final int length = str.length();
+    for (int offset = 0; offset < length; ) {
+       final int cp = str.codePointAt(offset);
+       final int len = Character.charCount(cp);
+       if (codePointMustBeMangled(cp)) {
+         bld.append(mangleCodePoint(cp));
+       } else {
+         for (int i = 0; i < len; i++) {
+           bld.append(str.charAt(offset + i));
+         }
+       }
+       offset += len;
+    }
+    return bld.toString();
+  }
+
+  /**
+   * Demangle a string from an XML document.
+   * See {@link #mangleXmlString(String)} for a description of the mangling
+   * format.
+   *
+   * @param str    The string to be demangled.
+   * 
+   * @return       The unmangled string
+   * @throws       UnmanglingError if the input is malformed.
+   */
+  public static String unmangleXmlString(String str)
+        throws UnmanglingError {
+    int slashPosition = -1;
+    String escapedCp = "";
+    StringBuilder bld = new StringBuilder();
+    for (int i = 0; i < str.length(); i++) {
+      char ch = str.charAt(i);
+      if ((slashPosition >= 0) && (slashPosition < NUM_SLASH_POSITIONS)) {
+        escapedCp += ch;
+        ++slashPosition;
+      } else if (slashPosition == NUM_SLASH_POSITIONS) {
+        if (ch != ';') {
+          throw new UnmanglingError("unterminated code point escape: " +
+              "expected semicolon at end.");
+        }
+        try {
+          bld.appendCodePoint(Integer.parseInt(escapedCp, 16));
+        } catch (NumberFormatException e) {
+          throw new UnmanglingError("error parsing unmangling escape code", e);
+        }
+        escapedCp = "";
+        slashPosition = -1;
+      } else if (ch == '\\') {
+        slashPosition = 0;
+      } else {
+        bld.append(ch);
+      }
+    }
+    if (slashPosition != -1) {
+      throw new UnmanglingError("unterminated code point escape: string " +
+          "broke off in the middle");
+    }
+    return bld.toString();
+  }
+  
   /**
    * Add a SAX tag with a string inside.
    *
@@ -56,7 +190,7 @@ public class XMLUtils {
   public static void addSaxString(ContentHandler contentHandler,
       String tag, String val) throws SAXException {
     contentHandler.startElement("", "", tag, new AttributesImpl());
-    char c[] = val.toString().toCharArray();
+    char c[] = mangleXmlString(val).toCharArray();
     contentHandler.characters(c, 0, c.length);
     contentHandler.endElement("", "", tag);
   }
@@ -67,6 +201,8 @@ public class XMLUtils {
    */
   static public class Stanza {
     private TreeMap<String, LinkedList <Stanza > > subtrees;
+
+    /** The unmangled value of this stanza. */
     private String value;
     
     public Stanza() {

+ 1 - 1
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/OfflineEditsViewerHelper.java

@@ -143,7 +143,7 @@ public class OfflineEditsViewerHelper {
       (DistributedFileSystem)cluster.getFileSystem();
     FileContext fc = FileContext.getFileContext(cluster.getURI(0), config);
     // OP_ADD 0, OP_SET_GENSTAMP 10
-    Path pathFileCreate = new Path("/file_create");
+    Path pathFileCreate = new Path("/file_create_u\1F431");
     FSDataOutputStream s = dfs.create(pathFileCreate);
     // OP_CLOSE 9
     s.close();

+ 70 - 0
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/util/TestXMLUtils.java

@@ -0,0 +1,70 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.util;
+
+import junit.framework.Assert;
+
+import org.apache.hadoop.hdfs.util.XMLUtils.UnmanglingError;
+import org.junit.Test;
+
+public class TestXMLUtils {
+  private static void testRoundTrip(String str, String expectedMangled) {
+    String mangled = XMLUtils.mangleXmlString(str);
+    Assert.assertEquals(mangled, expectedMangled);
+    String unmangled = XMLUtils.unmangleXmlString(mangled);
+    Assert.assertEquals(unmangled, str);
+  }
+
+  @Test
+  public void testMangleEmptyString() throws Exception {
+    testRoundTrip("", "");
+  }
+
+  @Test
+  public void testMangleVanillaString() throws Exception {
+    testRoundTrip("abcdef", "abcdef");
+  }
+
+  @Test
+  public void testMangleStringWithBackSlash() throws Exception {
+    testRoundTrip("a\\bcdef", "a\\005c;bcdef");
+    testRoundTrip("\\\\", "\\005c;\\005c;");
+  }  
+
+  @Test
+  public void testMangleStringWithForbiddenCodePoint() throws Exception {
+    testRoundTrip("a\u0001bcdef", "a\\0001;bcdef");
+    testRoundTrip("a\u0002\ud800bcdef", "a\\0002;\\d800;bcdef");
+  }
+
+  @Test
+  public void testInvalidSequence() throws Exception {
+    try {
+      XMLUtils.unmangleXmlString("\\000g;foo");
+      Assert.fail("expected an unmangling error");
+    } catch (UnmanglingError e) {
+      // pass through
+    }
+    try {
+      XMLUtils.unmangleXmlString("\\0");
+      Assert.fail("expected an unmangling error");
+    } catch (UnmanglingError e) {
+      // pass through
+    }
+  }
+}