Example usage for org.apache.commons.io.input XmlStreamReader XmlStreamReader

List of usage examples for org.apache.commons.io.input XmlStreamReader XmlStreamReader

Introduction

On this page you can find example usages of the org.apache.commons.io.input.XmlStreamReader constructor XmlStreamReader.

Prototype

public XmlStreamReader(InputStream is, String httpContentType, boolean lenient, String defaultEncoding)
        throws IOException 

Source Link

Document

Creates a Reader using an InputStream and the associated content-type header.

Usage

From source file:net.yacy.document.parser.GenericXMLParser.java

/**
 * Parses an XML stream with a SAX parser and wraps the result in a single-element {@link Document} array.
 *
 * <p>The in-memory text buffer is capped at 25% of the memory reported as available; when that cap is
 * reached (the buffer reports an overflow) a {@link Parser.Failure} is thrown instead of returning a
 * truncated document.</p>
 *
 * @param location the URL of the parsed resource, also attached to any reported failure
 * @param mimeType the content type handed to {@link XmlStreamReader} as a charset detection hint
 * @param charset the default encoding handed to {@link XmlStreamReader} when detection finds nothing better
 * @param scraper not used by this implementation
 * @param timezoneOffset not used by this implementation
 * @param source the stream to read the XML content from
 * @return an array holding exactly one document built from the parsed content
 * @throws Failure when the buffer overflows or when any other error occurs while parsing
 */
@Override
public Document[] parse(final DigestURL location, final String mimeType, final String charset,
        final VocabularyScraper scraper, final int timezoneOffset, final InputStream source) throws Failure {

    /* Limit the size of the in-memory buffer to at most 25% of the available memory :
     * because some room is needed, and before being garbage collected the buffer will be converted to a String, then to a byte array. 
     * Eventual stricter limits should be handled by the caller (see for example crawler.[protocol].maxFileSize configuration setting). */
    final long availableMemory = MemoryControl.available();
    final long maxBytes = (long) (availableMemory * 0.25);
    /* Divide while still a long, then clamp : narrowing maxBytes to int before the division
     * would silently truncate any value above Integer.MAX_VALUE and produce a wrong (possibly negative) limit. */
    final int maxChars = (int) Math.min(Integer.MAX_VALUE, maxBytes / Character.BYTES);

    try (/* Automatically closed by this try-with-resources statement*/ CharBuffer writer = new CharBuffer(
            maxChars);) {

        /* Use commons-io XmlStreamReader advanced rules to help with charset detection when source contains no BOM or XML declaration
         * (detection algorithm notably also include ContentType transmitted by HTTP headers, here eventually present as mimeType and charset parameters),  */
        final XmlStreamReader reader = new XmlStreamReader(source, mimeType, true, charset);
        final InputSource saxSource = new InputSource(reader);
        final String detectedCharset = reader.getEncoding();

        final List<AnchorURL> detectedURLs = new ArrayList<>();

        final GenericXMLContentHandler saxHandler = new GenericXMLContentHandler(writer, detectedURLs);
        final SAXParser saxParser = getParser();
        saxParser.parse(saxSource, saxHandler);

        if (writer.isOverflow()) {
            throw new Parser.Failure("Not enough Memory available for generic the XML parser : "
                    + Formatter.bytesToString(availableMemory), location);
        }

        /* create the parsed document */
        final byte[] contentBytes = UTF8.getBytes(writer.toString());
        return new Document[] { new Document(location, mimeType, detectedCharset, this, null, null, null, null,
                "", null, null, 0.0d, 0.0d, contentBytes, detectedURLs, null, null, false, new Date()) };
    } catch (Parser.Failure e) {
        /* Already a meaningful parser failure (buffer overflow) : do not wrap it as an unexpected error */
        throw e;
    } catch (final Exception e) {
        throw new Parser.Failure("Unexpected error while parsing XML file. " + e.getMessage(), location);
    }

}

From source file:net.yacy.document.parser.GenericXMLParser.java

/**
 * {@inheritDoc}
 *
 * <p>The source is read through a {@link StrictLimitInputStream} bounded by {@code maxBytes}, and the
 * content handler is given the {@code maxLinks} limit. When parsing stops because one of these limits is
 * hit, the content gathered so far is still returned and the document is flagged as partially parsed.</p>
 *
 * @param maxLinks the links limit handed to the content handler
 * @param maxBytes the maximum number of content bytes to process. Be careful with too small values : 
 *    a Failure exception can eventually be thrown when maxBytes value is so small that the parser can even not fill its buffers on input stream and parse the document declaration.
 * @return an array holding exactly one document, possibly flagged as partially parsed
 * @throws Failure when the in-memory buffer overflows or when any other error occurs while parsing
 */
@Override
public Document[] parseWithLimits(DigestURL location, String mimeType, String charsetName,
        VocabularyScraper scraper, int timezoneOffset, InputStream source, int maxLinks, long maxBytes)
        throws Failure, InterruptedException, UnsupportedOperationException {
    /* Limit the size of the in-memory buffer to at most 25% of the available memory :
     * because some room is needed, and before being garbage collected the buffer will be converted to a String, then to a byte array. 
     * Eventual stricter limits should be handled by the caller (see for example crawler.[protocol].maxFileSize configuration setting). */
    final long availableMemory = MemoryControl.available();
    final long maxTextBytes = (long) (availableMemory * 0.25);
    /* Divide while still a long, then clamp : narrowing maxTextBytes to int before the division
     * would silently truncate any value above Integer.MAX_VALUE and produce a wrong (possibly negative) limit. */
    final int maxChars = (int) Math.min(Integer.MAX_VALUE, maxTextBytes / Character.BYTES);

    try (/* Automatically closed by this try-with-resources statement*/ CharBuffer writer = new CharBuffer(
            maxChars);) {

        final Set<AnchorURL> detectedURLs = new HashSet<>();
        final GenericXMLContentHandler saxHandler = new GenericXMLContentHandler(writer, detectedURLs,
                maxLinks);

        StrictLimitInputStream limitedSource = new StrictLimitInputStream(source, maxBytes);

        /* Use commons-io XmlStreamReader advanced rules to help with charset detection when source contains no BOM or XML declaration
         * (detection algorithm notably also include ContentType transmitted by HTTP headers, here eventually present as mimeType and charset parameters),  */
        final XmlStreamReader reader = new XmlStreamReader(limitedSource, mimeType, true, charsetName);
        final InputSource saxSource = new InputSource(reader);
        final String detectedCharset = reader.getEncoding();

        final SAXParser saxParser = getParser();
        boolean limitExceeded = false;
        try {
            saxParser.parse(saxSource, saxHandler);
        } catch (SAXException e) {
            if (!(e.getCause() instanceof SizeLimitExceededException)) {
                /* Only transmit to upper layer exceptions that are not caused by the maxLinks limit being reached */
                throw e;
            }
            limitExceeded = true;
        } catch (StreamLimitException e) {
            /* The maxBytes limit was reached on the input stream : keep what has been parsed so far */
            limitExceeded = true;
        }

        if (writer.isOverflow()) {
            throw new Parser.Failure("Not enough Memory available for generic the XML parser : "
                    + Formatter.bytesToString(availableMemory), location);
        }

        /* Create the parsed document with eventually only partial part of the text and links */
        final byte[] contentBytes = UTF8.getBytes(writer.toString());
        Document[] docs = new Document[] {
                new Document(location, mimeType, detectedCharset, this, null, null, null, null, "", null, null,
                        0.0d, 0.0d, contentBytes, detectedURLs, null, null, false, new Date()) };
        docs[0].setPartiallyParsed(limitExceeded);
        return docs;
    } catch (Parser.Failure e) {
        /* Already a meaningful parser failure (buffer overflow) : do not wrap it as an unexpected error,
         * consistently with the parse() method */
        throw e;
    } catch (final Exception e) {
        throw new Parser.Failure("Unexpected error while parsing XML file. " + e.getMessage(), location);
    }
}

From source file:org.cee.net.impl.XmlStreamReaderFactory.java

/**
 * Wraps the given stream in a lenient {@link XmlStreamReader} and pairs it with the encoding the reader reports.
 *
 * @param inputStream the stream to read XML content from
 * @param contentTypeHint the content type passed to the reader as its HTTP content-type argument
 * @param characterEncodingHint the default encoding passed to the reader; "UTF-8" is used when null
 * @return a reader source holding the created reader and the value of its {@code getEncoding()}
 * @throws IOException when the reader cannot be created on the stream
 */
@Override
public ReaderSource createReader(InputStream inputStream, String contentTypeHint, String characterEncodingHint)
        throws IOException {
    /* Fall back to UTF-8 when the caller supplies no encoding hint */
    final String defaultEncoding = (characterEncodingHint != null) ? characterEncodingHint : "UTF-8";
    final XmlStreamReader xmlReader = new XmlStreamReader(inputStream, contentTypeHint, true, defaultEncoding);
    final String resolvedEncoding = xmlReader.getEncoding();
    return new ReaderSource(xmlReader, resolvedEncoding);
}