001 /*
002 * Copyright 2005 John G. Wilson
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 * http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 *
016 */
017
018 package groovy.util;
019
020 import groovy.util.slurpersupport.GPathResult;
021 import groovy.util.slurpersupport.Node;
022 import groovy.util.slurpersupport.NodeChild;
023 import groovy.xml.FactorySupport;
024
025 import java.io.File;
026 import java.io.FileInputStream;
027 import java.io.IOException;
028 import java.io.InputStream;
029 import java.io.Reader;
030 import java.io.StringReader;
031 import java.net.URL;
032 import java.util.HashMap;
033 import java.util.Hashtable;
034 import java.util.Map;
035 import java.util.Stack;
036
037 import javax.xml.parsers.ParserConfigurationException;
038 import javax.xml.parsers.SAXParser;
039 import javax.xml.parsers.SAXParserFactory;
040
041 import org.xml.sax.Attributes;
042 import org.xml.sax.DTDHandler;
043 import org.xml.sax.EntityResolver;
044 import org.xml.sax.ErrorHandler;
045 import org.xml.sax.InputSource;
046 import org.xml.sax.SAXException;
047 import org.xml.sax.SAXNotRecognizedException;
048 import org.xml.sax.SAXNotSupportedException;
049 import org.xml.sax.XMLReader;
050 import org.xml.sax.helpers.DefaultHandler;
051
052 /**
053 * @author John Wilson
054 *
055 */
056
057 public class XmlSlurper extends DefaultHandler {
058 private final XMLReader reader;
059 private Node currentNode = null;
060 private final Stack stack = new Stack();
061 private final StringBuffer charBuffer = new StringBuffer();
062 private final Map namespaceTagHints = new Hashtable();
063 private boolean keepWhitespace = false;
064
065 public XmlSlurper() throws ParserConfigurationException, SAXException {
066 this(false, true);
067 }
068
069 public XmlSlurper(final boolean validating, final boolean namespaceAware) throws ParserConfigurationException, SAXException {
070 SAXParserFactory factory = FactorySupport.createSaxParserFactory();
071 factory.setNamespaceAware(namespaceAware);
072 factory.setValidating(validating);
073 this.reader = factory.newSAXParser().getXMLReader();
074 }
075
076 public XmlSlurper(final XMLReader reader) {
077 this.reader = reader;
078 }
079
080 public XmlSlurper(final SAXParser parser) throws SAXException {
081 this(parser.getXMLReader());
082 }
083
084 /**
085 * @param keepWhitespace
086 *
087 * If true then whitespace before elements is kept.
088 * The deafult is to discard the whitespace.
089 */
090 public void setKeepWhitespace(boolean keepWhitespace) {
091 this.keepWhitespace = keepWhitespace;
092 }
093
094 /**
095 * @return The GPathResult instance created by consuming a stream of SAX events
096 * Note if one of the parse methods has been called then this returns null
097 * Note if this is called more than once all calls after the first will return null
098 *
099 */
100 public GPathResult getDocument() {
101 try {
102 return new NodeChild(this.currentNode, null, this.namespaceTagHints);
103 } finally {
104 this.currentNode = null;
105 }
106 }
107
108 /**
109 * Parse the content of the specified input source into a GPathResult object
110 *
111 * @param input
112 * @return An object which supports GPath expressions
113 * @throws IOException
114 * @throws SAXException
115 */
116 public GPathResult parse(final InputSource input) throws IOException, SAXException {
117 this.reader.setContentHandler(this);
118 this.reader.parse(input);
119
120 return getDocument();
121
122 }
123
124 /**
125 * Parses the content of the given file as XML turning it into a GPathResult object
126 *
127 * @param file
128 * @return An object which supports GPath expressions
129 * @throws IOException
130 * @throws SAXException
131 */
132 public GPathResult parse(final File file) throws IOException, SAXException {
133 final InputSource input = new InputSource(new FileInputStream(file));
134
135 input.setSystemId("file://" + file.getAbsolutePath());
136
137 return parse(input);
138
139 }
140
141 /**
142 * Parse the content of the specified input stream into an GPathResult Object.
143 * Note that using this method will not provide the parser with any URI
144 * for which to find DTDs etc
145 *
146 * @param input
147 * @return An object which supports GPath expressions
148 * @throws IOException
149 * @throws SAXException
150 */
151 public GPathResult parse(final InputStream input) throws IOException, SAXException {
152 return parse(new InputSource(input));
153 }
154
155 /**
156 * Parse the content of the specified reader into a GPathResult Object.
157 * Note that using this method will not provide the parser with any URI
158 * for which to find DTDs etc
159 *
160 * @param in
161 * @return An object which supports GPath expressions
162 * @throws IOException
163 * @throws SAXException
164 */
165 public GPathResult parse(final Reader in) throws IOException, SAXException {
166 return parse(new InputSource(in));
167 }
168
169 /**
170 * Parse the content of the specified URI into a GPathResult Object
171 *
172 * @param uri
173 * @return An object which supports GPath expressions
174 * @throws IOException
175 * @throws SAXException
176 */
177 public GPathResult parse(final String uri) throws IOException, SAXException {
178 return parse(new InputSource(uri));
179 }
180
181 /**
182 * A helper method to parse the given text as XML
183 *
184 * @param text
185 * @return An object which supports GPath expressions
186 */
187 public GPathResult parseText(final String text) throws IOException, SAXException {
188 return parse(new StringReader(text));
189 }
190
191 // Delegated XMLReader methods
192 //------------------------------------------------------------------------
193
194 /* (non-Javadoc)
195 * @see org.xml.sax.XMLReader#getDTDHandler()
196 */
197 public DTDHandler getDTDHandler() {
198 return this.reader.getDTDHandler();
199 }
200
201 /* (non-Javadoc)
202 * @see org.xml.sax.XMLReader#getEntityResolver()
203 */
204 public EntityResolver getEntityResolver() {
205 return this.reader.getEntityResolver();
206 }
207
208 /* (non-Javadoc)
209 * @see org.xml.sax.XMLReader#getErrorHandler()
210 */
211 public ErrorHandler getErrorHandler() {
212 return this.reader.getErrorHandler();
213 }
214
215 /* (non-Javadoc)
216 * @see org.xml.sax.XMLReader#getFeature(java.lang.String)
217 */
218 public boolean getFeature(final String uri) throws SAXNotRecognizedException, SAXNotSupportedException {
219 return this.reader.getFeature(uri);
220 }
221
222 /* (non-Javadoc)
223 * @see org.xml.sax.XMLReader#getProperty(java.lang.String)
224 */
225 public Object getProperty(final String uri) throws SAXNotRecognizedException, SAXNotSupportedException {
226 return this.reader.getProperty(uri);
227 }
228
229 /* (non-Javadoc)
230 * @see org.xml.sax.XMLReader#setDTDHandler(org.xml.sax.DTDHandler)
231 */
232 public void setDTDHandler(final DTDHandler dtdHandler) {
233 this.reader.setDTDHandler(dtdHandler);
234 }
235
236 /* (non-Javadoc)
237 * @see org.xml.sax.XMLReader#setEntityResolver(org.xml.sax.EntityResolver)
238 */
239 public void setEntityResolver(final EntityResolver entityResolver) {
240 this.reader.setEntityResolver(entityResolver);
241 }
242
243 /**
244 * Resolves entities against using the suppied URL as the base for relative URLs
245 *
246 * @param base
247 * The URL used to resolve relative URLs
248 */
249 public void setEntityBaseUrl(final URL base) {
250 this.reader.setEntityResolver(new EntityResolver() {
251 public InputSource resolveEntity(final String publicId, final String systemId) throws IOException {
252 return new InputSource(new URL(base, systemId).openStream());
253 }
254 });
255 }
256
257 /* (non-Javadoc)
258 * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler)
259 */
260 public void setErrorHandler(final ErrorHandler errorHandler) {
261 this.reader.setErrorHandler(errorHandler);
262 }
263
264 /* (non-Javadoc)
265 * @see org.xml.sax.XMLReader#setFeature(java.lang.String, boolean)
266 */
267 public void setFeature(final String uri, final boolean value) throws SAXNotRecognizedException, SAXNotSupportedException {
268 this.reader.setFeature(uri, value);
269 }
270
271 /* (non-Javadoc)
272 * @see org.xml.sax.XMLReader#setProperty(java.lang.String, java.lang.Object)
273 */
274 public void setProperty(final String uri, final Object value) throws SAXNotRecognizedException, SAXNotSupportedException {
275 this.reader.setProperty(uri, value);
276 }
277
278
279 // ContentHandler interface
280 //-------------------------------------------------------------------------
281
282 /* (non-Javadoc)
283 * @see org.xml.sax.ContentHandler#startDocument()
284 */
285 public void startDocument() throws SAXException {
286 this.currentNode = null;
287 this.charBuffer.setLength(0);
288 }
289
290 /* (non-Javadoc)
291 * @see org.xml.sax.helpers.DefaultHandler#startPrefixMapping(java.lang.String, java.lang.String)
292 */
293 public void startPrefixMapping(final String tag, final String uri) throws SAXException {
294 this.namespaceTagHints.put(tag, uri);
295 }
296
297 /* (non-Javadoc)
298 * @see org.xml.sax.ContentHandler#startElement(java.lang.String, java.lang.String, java.lang.String, org.xml.sax.Attributes)
299 */
300 public void startElement(final String namespaceURI, final String localName, final String qName, final Attributes atts) throws SAXException {
301 addCdata();
302
303 final Map attributes = new HashMap();
304 final Map attributeNamespaces = new HashMap();
305
306 for (int i = atts.getLength() - 1; i != -1; i--) {
307 if (atts.getURI(i).length() == 0) {
308 attributes.put(atts.getQName(i), atts.getValue(i));
309 } else {
310 attributes.put(atts.getLocalName(i), atts.getValue(i));
311 attributeNamespaces.put(atts.getLocalName(i), atts.getURI(i));
312 }
313
314 }
315
316 final Node newElement;
317
318 if (namespaceURI.length() == 0){
319 newElement = new Node(this.currentNode, qName, attributes, attributeNamespaces, namespaceURI);
320 } else {
321 newElement = new Node(this.currentNode, localName, attributes, attributeNamespaces, namespaceURI);
322 }
323
324 if (this.currentNode != null) {
325 this.currentNode.addChild(newElement);
326 }
327
328 this.stack.push(this.currentNode);
329 this.currentNode = newElement;
330 }
331
332 /* (non-Javadoc)
333 * @see org.xml.sax.ContentHandler#characters(char[], int, int)
334 */
335 public void characters(final char[] ch, final int start, final int length) throws SAXException {
336 this.charBuffer.append(ch, start, length);
337 }
338
339 /* (non-Javadoc)
340 * @see org.xml.sax.ContentHandler#endElement(java.lang.String, java.lang.String, java.lang.String)
341 */
342 public void endElement(final String namespaceURI, final String localName, final String qName) throws SAXException {
343 addCdata();
344
345 final Object oldCurrentNode = this.stack.pop();
346
347 if (oldCurrentNode != null) {
348 this.currentNode = (Node)oldCurrentNode;
349 }
350 }
351
352 /* (non-Javadoc)
353 * @see org.xml.sax.ContentHandler#endDocument()
354 */
355 public void endDocument() throws SAXException {
356 }
357
358 // Implementation methods
359 //-------------------------------------------------------------------------
360
361 /**
362 *
363 */
364 private void addCdata() {
365 if (this.charBuffer.length() != 0) {
366 //
367 // This element is preceeded by CDATA if keepWhitespace is false (the default setting) and
368 // it's not whitespace add it to the body
369 // Note that, according to the XML spec, we should preserve the CDATA if it's all whitespace
370 // but for the sort of work I'm doing ignoring the whitespace is preferable
371 //
372 final String cdata = this.charBuffer.toString();
373
374 this.charBuffer.setLength(0);
375 if (this.keepWhitespace || cdata.trim().length() != 0) {
376 this.currentNode.addChild(cdata);
377 }
378 }
379 }
380 }