|
@@ -46,6 +46,140 @@ public class XMLUtils {
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
|
|
+ /**
|
|
|
|
+ * Exception that reflects a string that cannot be unmangled.
|
|
|
|
+ */
|
|
|
|
+ public static class UnmanglingError extends RuntimeException {
|
|
|
|
+ private static final long serialVersionUID = 1L;
|
|
|
|
+
|
|
|
|
+ public UnmanglingError(String str, Exception e) {
|
|
|
|
+ super(str, e);
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ public UnmanglingError(String str) {
|
|
|
|
+ super(str);
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+ /**
|
|
|
|
+ * Given a code point, determine if it should be mangled before being
|
|
|
|
+ * represented in an XML document.
|
|
|
|
+ *
|
|
|
|
+ * Any code point that isn't valid in XML must be mangled.
|
|
|
|
+ * See http://en.wikipedia.org/wiki/Valid_characters_in_XML for a
|
|
|
|
+ * quick reference, or the w3 standard for the authoritative reference.
|
|
|
|
+ *
|
|
|
|
+ * @param cp The code point
|
|
|
|
+ * @return True if the code point should be mangled
|
|
|
|
+ */
|
|
|
|
+ private static boolean codePointMustBeMangled(int cp) {
|
|
|
|
+ if (cp < 0x20) {
|
|
|
|
+ return ((cp != 0x9) && (cp != 0xa) && (cp != 0xd));
|
|
|
|
+ } else if ((0xd7ff < cp) && (cp < 0xe000)) {
|
|
|
|
+ return true;
|
|
|
|
+ } else if ((cp == 0xfffe) || (cp == 0xffff)) {
|
|
|
|
+ return true;
|
|
|
|
+ } else if (cp == 0x5c) {
|
|
|
|
+ // we mangle backslash to simplify decoding... it's
|
|
|
|
+ // easier if backslashes always begin mangled sequences.
|
|
|
|
+ return true;
|
|
|
|
+ }
|
|
|
|
+ return false;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ private static int NUM_SLASH_POSITIONS = 4;
|
|
|
|
+
|
|
|
|
+ private static String mangleCodePoint(int cp) {
|
|
|
|
+ return String.format("\\%0" + NUM_SLASH_POSITIONS + "x;", cp);
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ /**
|
|
|
|
+ * Mangle a string so that it can be represented in an XML document.
|
|
|
|
+ *
|
|
|
|
+ * There are three kinds of code points in XML:
|
|
|
|
+ * - Those that can be represented normally,
|
|
|
|
+ * - Those that have to be escaped (for example, & must be represented
|
|
|
|
+ * as &)
|
|
|
|
+ * - Those that cannot be represented at all in XML.
|
|
|
|
+ *
|
|
|
|
+ * The built-in SAX functions will handle the first two types for us just
|
|
|
|
+ * fine. However, sometimes we come across a code point of the third type.
|
|
|
|
+ * In this case, we have to mangle the string in order to represent it at
|
|
|
|
+ * all. We also mangle backslash to avoid confusing a backslash in the
|
|
|
|
+ * string with part our escape sequence.
|
|
|
|
+ *
|
|
|
|
+ * The encoding used here is as follows: an illegal code point is
|
|
|
|
+ * represented as '\ABCD;', where ABCD is the hexadecimal value of
|
|
|
|
+ * the code point.
|
|
|
|
+ *
|
|
|
|
+ * @param str The input string.
|
|
|
|
+ *
|
|
|
|
+ * @return The mangled string.
|
|
|
|
+ */
|
|
|
|
+ public static String mangleXmlString(String str) {
|
|
|
|
+ final StringBuilder bld = new StringBuilder();
|
|
|
|
+ final int length = str.length();
|
|
|
|
+ for (int offset = 0; offset < length; ) {
|
|
|
|
+ final int cp = str.codePointAt(offset);
|
|
|
|
+ final int len = Character.charCount(cp);
|
|
|
|
+ if (codePointMustBeMangled(cp)) {
|
|
|
|
+ bld.append(mangleCodePoint(cp));
|
|
|
|
+ } else {
|
|
|
|
+ for (int i = 0; i < len; i++) {
|
|
|
|
+ bld.append(str.charAt(offset + i));
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ offset += len;
|
|
|
|
+ }
|
|
|
|
+ return bld.toString();
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ /**
|
|
|
|
+ * Demangle a string from an XML document.
|
|
|
|
+ * See {@link #mangleXmlString(String)} for a description of the mangling
|
|
|
|
+ * format.
|
|
|
|
+ *
|
|
|
|
+ * @param str The string to be demangled.
|
|
|
|
+ *
|
|
|
|
+ * @return The unmangled string
|
|
|
|
+ * @throws UnmanglingError if the input is malformed.
|
|
|
|
+ */
|
|
|
|
+ public static String unmangleXmlString(String str)
|
|
|
|
+ throws UnmanglingError {
|
|
|
|
+ int slashPosition = -1;
|
|
|
|
+ String escapedCp = "";
|
|
|
|
+ StringBuilder bld = new StringBuilder();
|
|
|
|
+ for (int i = 0; i < str.length(); i++) {
|
|
|
|
+ char ch = str.charAt(i);
|
|
|
|
+ if ((slashPosition >= 0) && (slashPosition < NUM_SLASH_POSITIONS)) {
|
|
|
|
+ escapedCp += ch;
|
|
|
|
+ ++slashPosition;
|
|
|
|
+ } else if (slashPosition == NUM_SLASH_POSITIONS) {
|
|
|
|
+ if (ch != ';') {
|
|
|
|
+ throw new UnmanglingError("unterminated code point escape: " +
|
|
|
|
+ "expected semicolon at end.");
|
|
|
|
+ }
|
|
|
|
+ try {
|
|
|
|
+ bld.appendCodePoint(Integer.parseInt(escapedCp, 16));
|
|
|
|
+ } catch (NumberFormatException e) {
|
|
|
|
+ throw new UnmanglingError("error parsing unmangling escape code", e);
|
|
|
|
+ }
|
|
|
|
+ escapedCp = "";
|
|
|
|
+ slashPosition = -1;
|
|
|
|
+ } else if (ch == '\\') {
|
|
|
|
+ slashPosition = 0;
|
|
|
|
+ } else {
|
|
|
|
+ bld.append(ch);
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ if (slashPosition != -1) {
|
|
|
|
+ throw new UnmanglingError("unterminated code point escape: string " +
|
|
|
|
+ "broke off in the middle");
|
|
|
|
+ }
|
|
|
|
+ return bld.toString();
|
|
|
|
+ }
|
|
|
|
+
|
|
/**
|
|
/**
|
|
* Add a SAX tag with a string inside.
|
|
* Add a SAX tag with a string inside.
|
|
*
|
|
*
|
|
@@ -56,7 +190,7 @@ public class XMLUtils {
|
|
public static void addSaxString(ContentHandler contentHandler,
|
|
public static void addSaxString(ContentHandler contentHandler,
|
|
String tag, String val) throws SAXException {
|
|
String tag, String val) throws SAXException {
|
|
contentHandler.startElement("", "", tag, new AttributesImpl());
|
|
contentHandler.startElement("", "", tag, new AttributesImpl());
|
|
- char c[] = val.toString().toCharArray();
|
|
|
|
|
|
+ char c[] = mangleXmlString(val).toCharArray();
|
|
contentHandler.characters(c, 0, c.length);
|
|
contentHandler.characters(c, 0, c.length);
|
|
contentHandler.endElement("", "", tag);
|
|
contentHandler.endElement("", "", tag);
|
|
}
|
|
}
|
|
@@ -67,6 +201,8 @@ public class XMLUtils {
|
|
*/
|
|
*/
|
|
static public class Stanza {
|
|
static public class Stanza {
|
|
private TreeMap<String, LinkedList <Stanza > > subtrees;
|
|
private TreeMap<String, LinkedList <Stanza > > subtrees;
|
|
|
|
+
|
|
|
|
+ /** The unmangled value of this stanza. */
|
|
private String value;
|
|
private String value;
|
|
|
|
|
|
public Stanza() {
|
|
public Stanza() {
|