001 /**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License. You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019 package org.apache.hadoop.hdfs.util;
020
021 import org.apache.hadoop.classification.InterfaceAudience;
022 import org.apache.hadoop.classification.InterfaceStability;
023 import org.xml.sax.ContentHandler;
024 import org.xml.sax.SAXException;
025 import org.xml.sax.helpers.AttributesImpl;
026
027 import java.util.LinkedList;
028 import java.util.List;
029 import java.util.Map;
030 import java.util.TreeMap;
031
032 /**
033 * General xml utilities.
034 *
035 */
036 @InterfaceAudience.Private
037 @InterfaceStability.Unstable
public class XMLUtils {
  /**
   * Exception that reflects an invalid XML document.
   */
  public static class InvalidXmlException extends RuntimeException {
    private static final long serialVersionUID = 1L;

    /**
     * @param s description of what is wrong with the document
     */
    public InvalidXmlException(String s) {
      super(s);
    }
  }

  /**
   * Exception that reflects a string that cannot be unmangled.
   */
  public static class UnmanglingError extends RuntimeException {
    private static final long serialVersionUID = 1L;

    /**
     * @param str description of the problem
     * @param e   the underlying cause
     */
    public UnmanglingError(String str, Exception e) {
      super(str, e);
    }

    /**
     * @param str description of the problem
     */
    public UnmanglingError(String str) {
      super(str);
    }
  }

  /**
   * Number of hexadecimal digits that follow the backslash in a mangled
   * code point.
   */
  private static final int NUM_SLASH_POSITIONS = 4;

  /**
   * Format string producing a mangled code point: a backslash, the code
   * point as zero-padded lower-case hex, and a terminating semicolon.
   */
  private static final String MANGLE_FORMAT =
      "\\%0" + NUM_SLASH_POSITIONS + "x;";

  /**
   * Given a code point, determine if it should be mangled before being
   * represented in an XML document.
   *
   * Any code point that isn't valid in XML must be mangled.
   * See http://en.wikipedia.org/wiki/Valid_characters_in_XML for a
   * quick reference, or the w3 standard for the authoritative reference.
   *
   * @param cp The code point
   * @return True if the code point should be mangled
   */
  private static boolean codePointMustBeMangled(int cp) {
    if (cp < 0x20) {
      // Of the C0 controls only tab, line feed and carriage return are legal.
      return (cp != 0x9) && (cp != 0xa) && (cp != 0xd);
    }
    if ((0xd7ff < cp) && (cp < 0xe000)) {
      // Surrogate range: reached only for unpaired surrogates, because
      // String.codePointAt combines well-formed pairs.
      return true;
    }
    if ((cp == 0xfffe) || (cp == 0xffff)) {
      return true;
    }
    // We mangle backslash to simplify decoding... it's easier if
    // backslashes always begin mangled sequences.
    return cp == 0x5c;
  }

  /**
   * Render one code point in its mangled form.
   *
   * Every code point selected by {@link #codePointMustBeMangled(int)} is
   * at most 0xffff, so it always fits in NUM_SLASH_POSITIONS hex digits.
   *
   * @param cp The code point to mangle
   * @return The mangled representation, for example "\005c;" for backslash
   */
  private static String mangleCodePoint(int cp) {
    return String.format(MANGLE_FORMAT, cp);
  }

  /**
   * Mangle a string so that it can be represented in an XML document.
   *
   * There are three kinds of code points in XML:
   * - Those that can be represented normally,
   * - Those that have to be escaped (for example, {@code &} must be
   *   represented as {@code &amp;})
   * - Those that cannot be represented at all in XML.
   *
   * The built-in SAX functions will handle the first two types for us just
   * fine.  However, sometimes we come across a code point of the third type.
   * In this case, we have to mangle the string in order to represent it at
   * all.  We also mangle backslash to avoid confusing a backslash in the
   * string with part of our escape sequence.
   *
   * The encoding used here is as follows: an illegal code point is
   * represented as '\ABCD;', where ABCD is the hexadecimal value of
   * the code point.
   *
   * @param str     The input string.
   *
   * @return        The mangled string.
   */
  public static String mangleXmlString(String str) {
    final int length = str.length();
    final StringBuilder bld = new StringBuilder(length);
    int offset = 0;
    while (offset < length) {
      final int cp = str.codePointAt(offset);
      if (codePointMustBeMangled(cp)) {
        bld.append(mangleCodePoint(cp));
      } else {
        // Re-emits one or two chars, exactly as they appeared in the input.
        bld.appendCodePoint(cp);
      }
      offset += Character.charCount(cp);
    }
    return bld.toString();
  }

  /**
   * Demangle a string from an XML document.
   * See {@link #mangleXmlString(String)} for a description of the mangling
   * format.
   *
   * @param str    The string to be demangled.
   *
   * @return       The unmangled string
   * @throws       UnmanglingError if the input is malformed.
   */
  public static String unmangleXmlString(String str)
        throws UnmanglingError {
    final int length = str.length();
    final StringBuilder bld = new StringBuilder(length);
    int i = 0;
    while (i < length) {
      final char ch = str.charAt(i);
      if (ch != '\\') {
        bld.append(ch);
        i++;
        continue;
      }
      // Escape layout: '\' at i, hex digits at i+1 .. i+NUM_SLASH_POSITIONS,
      // ';' immediately after the digits.
      final int terminator = i + 1 + NUM_SLASH_POSITIONS;
      if (terminator >= length) {
        throw new UnmanglingError("unterminated code point escape: string " +
            "broke off in the middle");
      }
      if (str.charAt(terminator) != ';') {
        throw new UnmanglingError("unterminated code point escape: " +
            "expected semicolon at end.");
      }
      final String escapedCp = str.substring(i + 1, terminator);
      try {
        bld.appendCodePoint(Integer.parseInt(escapedCp, 16));
      } catch (IllegalArgumentException e) {
        // Covers NumberFormatException (a subclass) from parseInt, and the
        // IllegalArgumentException appendCodePoint raises when a signed
        // escape such as "-001" parses to a negative value.
        throw new UnmanglingError("error parsing unmangling escape code", e);
      }
      i = terminator + 1;
    }
    return bld.toString();
  }

  /**
   * Add a SAX tag with a string inside.
   *
   * The string is mangled with {@link #mangleXmlString(String)} before it
   * is handed to the content handler.
   *
   * @param contentHandler     the SAX content handler
   * @param tag                the element tag to use
   * @param val                the string to put inside the tag
   * @throws SAXException      if the content handler reports an error
   */
  public static void addSaxString(ContentHandler contentHandler,
      String tag, String val) throws SAXException {
    contentHandler.startElement("", "", tag, new AttributesImpl());
    final char[] c = mangleXmlString(val).toCharArray();
    contentHandler.characters(c, 0, c.length);
    contentHandler.endElement("", "", tag);
  }

  /**
   * Represents a bag of key-value pairs encountered during parsing an XML
   * file.
   */
  public static class Stanza {
    /** Child stanzas grouped by name; names iterate in sorted order. */
    private final TreeMap<String, LinkedList<Stanza>> subtrees;

    /** The unmangled value of this stanza. */
    private String value;

    /**
     * Create a stanza with no children and an empty value.
     */
    public Stanza() {
      subtrees = new TreeMap<String, LinkedList<Stanza>>();
      value = "";
    }

    /**
     * Set the value of this stanza.
     *
     * @param value the new value
     */
    public void setValue(String value) {
      this.value = value;
    }

    /**
     * @return the value of this stanza
     */
    public String getValue() {
      return this.value;
    }

    /**
     * Discover if a stanza has a given entry.
     *
     * @param name        entry to look for
     *
     * @return            true if the entry was found
     */
    public boolean hasChildren(String name) {
      return subtrees.containsKey(name);
    }

    /**
     * Pull an entry from a stanza.
     *
     * @param name        entry to look for
     *
     * @return            the entry
     * @throws InvalidXmlException if there is no entry with that name
     */
    public List<Stanza> getChildren(String name) throws InvalidXmlException {
      final LinkedList<Stanza> children = subtrees.get(name);
      if (children == null) {
        throw new InvalidXmlException("no entry found for " + name);
      }
      return children;
    }

    /**
     * Pull a string entry from a stanza.
     *
     * @param name        entry to look for
     *
     * @return            the entry
     * @throws InvalidXmlException if the entry is missing, or if more than
     *                    one entry has that name
     */
    public String getValue(String name) throws InvalidXmlException {
      final String ret = getValueOrNull(name);
      if (ret == null) {
        throw new InvalidXmlException("no entry found for " + name);
      }
      return ret;
    }

    /**
     * Pull a string entry from a stanza, or null.
     *
     * @param name        entry to look for
     *
     * @return            the entry, or null if it was not found.
     * @throws InvalidXmlException if the name does not map to exactly one
     *                    entry
     */
    public String getValueOrNull(String name) throws InvalidXmlException {
      final LinkedList<Stanza> l = subtrees.get(name);
      if (l == null) {
        return null;
      }
      if (l.size() != 1) {
        throw new InvalidXmlException("More than one value found for " + name);
      }
      return l.get(0).getValue();
    }

    /**
     * Add an entry to a stanza.
     *
     * @param name        name of the entry to add
     * @param child       the entry to add
     */
    public void addChild(String name, Stanza child) {
      LinkedList<Stanza> l = subtrees.get(name);
      if (l == null) {
        l = new LinkedList<Stanza>();
        subtrees.put(name, l);
      }
      l.add(child);
    }

    /**
     * Convert a stanza to a human-readable string.
     *
     * The result is the quoted value (omitted when null or empty) followed
     * by each child rendered as {@code <name>} plus the child's own string
     * form, all wrapped in braces.
     */
    @Override
    public String toString() {
      final StringBuilder bld = new StringBuilder();
      bld.append("{");
      // setValue accepts null, so guard against it here.
      if ((value != null) && !value.isEmpty()) {
        bld.append("\"").append(value).append("\"");
      }
      String prefix = "";
      for (Map.Entry<String, LinkedList<Stanza>> entry :
           subtrees.entrySet()) {
        final String key = entry.getKey();
        for (Stanza child : entry.getValue()) {
          bld.append(prefix);
          bld.append("<").append(key).append(">");
          bld.append(child.toString());
          prefix = ", ";
        }
      }
      bld.append("}");
      return bld.toString();
    }
  }
}