Example usage for javax.xml.stream XMLInputFactory createXMLEventReader

List of usage examples for javax.xml.stream XMLInputFactory createXMLEventReader

Introduction

In this page you can find the example usage for javax.xml.stream XMLInputFactory createXMLEventReader.

Prototype

public abstract XMLEventReader createXMLEventReader(String systemId, java.io.InputStream stream)
        throws XMLStreamException;

Source Link

Document

Create a new XMLEventReader from a java.io.InputStream

Usage

From source file:Main.java

public static void main(String[] args) throws Exception {
    String filename = "yourXML.xml";

    XMLInputFactory factory = XMLInputFactory.newInstance();
    System.out.println("FACTORY: " + factory);

    XMLEventReader r = factory.createXMLEventReader(filename, new FileInputStream(filename));

    while (r.hasNext()) {
        XMLEvent e = r.nextEvent();
        System.out.println(e.toString());
    }/*w  w w  . j ava 2 s. c om*/
}

From source file:Main.java

private static XMLEventReader getXMLEventReader(String filename) throws Exception {
    XMLInputFactory xmlif = null;
    XMLEventReader xmlr = null;/*from   w  w  w  .  j  a v  a 2  s  .  c o m*/
    xmlif = XMLInputFactory.newInstance();
    xmlif.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, Boolean.TRUE);
    xmlif.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
    xmlif.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, Boolean.TRUE);
    xmlif.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);

    FileInputStream fis = new FileInputStream(filename);
    xmlr = xmlif.createXMLEventReader(filename, fis);

    return xmlr;
}

From source file:ValidateStax.java

private static XMLEventReader getXMLEventReader(String filename) {
    XMLInputFactory xmlif = null;
    XMLEventReader xmlr = null;/* w  ww.  ja va 2s.  c o  m*/
    try {
        xmlif = XMLInputFactory.newInstance();
        xmlif.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, Boolean.TRUE);
        xmlif.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
        xmlif.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, Boolean.TRUE);
        xmlif.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);

        FileInputStream fis = new FileInputStream(filename);
        xmlr = xmlif.createXMLEventReader(filename, fis);
    } catch (Exception ex) {
        ex.printStackTrace();
    }
    return xmlr;
}

From source file:StreamSrcStAXRst.java

private static XMLEventReader getXMLEventReader(String filename) {

    XMLInputFactory xmlif = null;
    XMLEventReader xmlr = null;//from   ww  w .  ja va 2  s  . co m
    try {
        xmlif = XMLInputFactory.newInstance();
        xmlif.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, Boolean.TRUE);
        xmlif.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
        xmlif.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, Boolean.TRUE);
        xmlif.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);

        FileInputStream fis = new FileInputStream(filename);
        xmlr = xmlif.createXMLEventReader(filename, fis);
    } catch (Exception ex) {
        ex.printStackTrace();
    }
    return xmlr;
}

From source file:com.vistatec.ocelot.xliff.okapi.OkapiXLIFFFactory.java

@Override
public XLIFFVersion detectXLIFFVersion(File detectVersion) throws IOException, XMLStreamException {
    try (BOMInputStream bomInputStream = new BOMInputStream(new FileInputStream(detectVersion),
            ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_32BE,
            ByteOrderMark.UTF_32LE)) {//w ww . j a v  a 2 s .  c  o  m
        String bom = "UTF-8";
        if (bomInputStream.hasBOM()) {
            bom = bomInputStream.getBOMCharsetName();
        }

        XMLInputFactory xml = XMLInputFactory.newInstance();
        XMLEventReader reader = xml.createXMLEventReader(bomInputStream, bom);
        while (reader.hasNext()) {
            XMLEvent event = reader.nextEvent();
            switch (event.getEventType()) {
            case XMLEvent.START_ELEMENT:
                StartElement startElement = (StartElement) event;
                String localPart = startElement.getName().getLocalPart();
                if (localPart.equals("xliff")) {
                    @SuppressWarnings("unchecked")
                    Iterator<Attribute> attrs = startElement.getAttributes();
                    while (attrs.hasNext()) {
                        Attribute attr = attrs.next();
                        if (isXliffVersionAttributeName(attr.getName())) {
                            String value = attr.getValue();
                            reader.close();
                            if ("2.0".equals(value)) {
                                return XLIFFVersion.XLIFF20;
                            } else {
                                return XLIFFVersion.XLIFF12;
                            }
                        }
                    }
                }
                break;

            default:
                break;
            }
        }
        throw new IllegalStateException("Could not detect XLIFF version");
    }
}

From source file:eionet.webq.converter.JsonXMLBidirectionalConverter.java

/**
 * Template for conversion.//  www  .j av  a  2  s.com
 *
 * @param inputFactory input factory.
 * @param outputFactory output factory.
 * @param source source to convert.
 * @return conversion result as byte array.
 */
private byte[] convert(XMLInputFactory inputFactory, XMLOutputFactory outputFactory, byte[] source) {
    InputStream input = new ByteArrayInputStream(source);
    ByteArrayOutputStream output = new ByteArrayOutputStream();
    try {
        XMLEventReader reader = inputFactory.createXMLEventReader(input, "utf-8");
        XMLEventWriter writer = outputFactory.createXMLEventWriter(output, "utf-8");
        writer = new PrettyXMLEventWriter(writer);
        writer.add(reader);
        closeQuietly(reader, writer);
        return output.toByteArray();
    } catch (XMLStreamException e) {
        throw new RuntimeException(e);
    } finally {
        IOUtils.closeQuietly(output);
        IOUtils.closeQuietly(input);
    }
}

From source file:com.act.lcms.MzMLParser.java

public Iterator<S> getIterator(String inputFile)
        throws ParserConfigurationException, IOException, XMLStreamException {
    DocumentBuilderFactory docFactory = mkDocBuilderFactory();
    DocumentBuilder docBuilder = docFactory.newDocumentBuilder();

    final XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance();
    final XMLOutputFactory xmlOutputFactory = XMLOutputFactory.newInstance();

    return new Iterator<S>() {
        boolean inEntry = false;

        XMLEventReader xr = xmlInputFactory.createXMLEventReader(new FileInputStream(inputFile), "utf-8");
        // TODO: is the use of the XML version/encoding tag definitely necessary?
        StringWriter w = new StringWriter().append(XML_PREAMBLE).append("\n");
        XMLEventWriter xw = xmlOutputFactory.createXMLEventWriter(w);

        S next = null;//from  w w w .  j a  v  a 2 s .com

        /* Because we're handling the XML as a stream, we can only determine whether we have another Spectrum to return
         * by attempting to parse the next one.  `this.next()` reads
         */
        private S getNextSpectrum() {
            S spectrum = null;
            if (xr == null || !xr.hasNext()) {
                return null;
            }

            try {
                while (xr.hasNext()) {
                    XMLEvent e = xr.nextEvent();
                    if (!inEntry && e.isStartElement()
                            && e.asStartElement().getName().getLocalPart().equals((SPECTRUM_OBJECT_TAG))) {
                        xw.add(e);
                        inEntry = true;
                    } else if (e.isEndElement()
                            && e.asEndElement().getName().getLocalPart().equals(SPECTRUM_OBJECT_TAG)) {
                        xw.add(e);
                        xw.flush();
                        /* TODO: the XMLOutputFactory docs don't make it clear if/how events can be written directly into a new
                         * document structure, so we incur the cost of extracting each spectrum entry, serializing it, and
                         * re-reading it into its own document so it can be handled by XPath.  Master this strange corner of the
                         * Java ecosystem and get rid of <></>his doc -> string -> doc conversion. */
                        Document doc = docBuilder.parse(new ReaderInputStream(new StringReader(w.toString())));
                        spectrum = handleSpectrumEntry(doc);
                        xw.close();
                        /* Note: this can also be accomplished with `w.getBuffer().setLength(0);`, but using a new event writer
                         * seems safer. */
                        w = new StringWriter();
                        w.append(XML_PREAMBLE).append("\n");
                        xw = xmlOutputFactory.createXMLEventWriter(w);
                        inEntry = false;
                        // Don't stop parsing if handleSpectrumEntry didn't like this spectrum document.
                        if (spectrum != null) {
                            break;
                        }
                    } else if (inEntry) {
                        // Add this element if we're in an entry
                        xw.add(e);
                    }
                }

                // We've reached the end of the document; close the reader to show that we're done.
                if (!xr.hasNext()) {
                    xr.close();
                    xr = null;
                }
            } catch (Exception e) {
                // TODO: do better.  We seem to run into this sort of thing with Iterators a lot...
                throw new RuntimeException(e);
            }

            return spectrum;
        }

        private S tryParseNext() {
            // Fail the attempt if the reader is closed.
            if (xr == null || !xr.hasNext()) {
                return null;
            }

            // No checks on whether we already have a spectrum stored: we expect the callers to do that.
            return getNextSpectrum();
        }

        @Override
        public boolean hasNext() {
            // Prime the pump if the iterator doesn't have a value stored yet.
            if (this.next == null) {
                this.next = tryParseNext();
            }

            // If we have an entry waiting, return true; otherwise read the next entry and return true if successful.
            return this.next != null;
        }

        @Override
        public S next() {
            // Prime the pump like we do in hasNext().
            if (this.next == null) {
                this.next = tryParseNext();
            }

            // Take available spectrum and return it.
            S res = this.next;
            /* Advance to the next element immediately, making next() do the heavy lifting most of the time.  Otherwise,
             * the parsing will resume on hasNext(), which seems like it ought to be a light-weight operation. */
            this.next = tryParseNext();

            return res;
        }

    };
}

From source file:de.dfki.km.leech.parser.wikipedia.WikipediaDumpParser.java

public MultiValueHashMap<String, String> getPageTitle2Redirects(InputStream sWikipediaDump)
        throws FileNotFoundException, XMLStreamException {
    // <text xml:space="preserve">#REDIRECT [[Autopoiesis]]</text>
    // <text xml:space="preserve">#REDIRECT:[[Hans Leo Haler]]</text>
    // <text xml:space="preserve">#redirect [[Weier Hai]]</text>
    // #weiterleitung
    // <page>
    // <title>Autopoiesis</title>

    Logger.getLogger(WikipediaDumpParser.class.getName()).info("will collect redirects from wikipedia dump...");

    MultiValueHashMap<String, String> hsPageTitle2Redirects = new MultiValueBalancedTreeMap<String, String>();

    String strCurrentTitle = "";
    XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance();

    XMLEventReader xmlEventReader = xmlInputFactory.createXMLEventReader(sWikipediaDump, "Utf-8");
    int iTitlesRead = 0;
    while (xmlEventReader.hasNext()) {
        XMLEvent xmlEvent = xmlEventReader.nextEvent();

        if (!xmlEvent.isStartElement())
            continue;
        // wenn wir einen Title haben, dann merken wir uns den, falls wir ihn brauchen
        if (xmlEvent.asStartElement().getName().getLocalPart().equals("title")) {
            strCurrentTitle = readNextCharEventsText(xmlEventReader);

            iTitlesRead++;//from w ww . ja va  2  s . co m
            if (iTitlesRead % 200000 == 0)
                Logger.getLogger(WikipediaDumpParser.class.getName())
                        .info("read doc #" + StringUtils.beautifyNumber(iTitlesRead));

            continue;
        }

        if (!xmlEvent.asStartElement().getName().getLocalPart().equals("text"))
            continue;

        // jetzt haben wir ein text-tag. Wir schauen, ob jetzt ein redirect kommt
        // entweder kommt ein charEvent oder ein EndEvent. Leere Texte gibts wohl auch
        XMLEvent nextEvent = xmlEventReader.peek();

        if (!nextEvent.isCharacters())
            continue;

        String strCharEventData = readNextCharEventsText(xmlEventReader);
        if (strCharEventData == null)
            continue;

        strCharEventData = strCharEventData.trim();

        boolean bRedirect = false;

        if (strCharEventData.length() >= 9 && strCharEventData.substring(0, 9).equalsIgnoreCase("#redirect"))
            bRedirect = true;
        if (!bRedirect && strCharEventData.length() >= 8
                && strCharEventData.substring(0, 8).equalsIgnoreCase("redirect")
                && !strCharEventData.contains("\n"))
            bRedirect = true;
        if (!bRedirect && strCharEventData.length() >= 14
                && strCharEventData.substring(0, 14).equalsIgnoreCase("#weiterleitung"))
            bRedirect = true;
        if (!bRedirect && strCharEventData.length() >= 13
                && strCharEventData.substring(0, 13).equalsIgnoreCase("weiterleitung")
                && !strCharEventData.contains("\n"))
            bRedirect = true;

        if (!bRedirect)
            continue;

        // wir haben einen redirect - der wird in unsere Datenstruktur eingetragen
        int iStart = strCharEventData.indexOf("[[");
        int iEnd = strCharEventData.indexOf("]]");
        if (iStart < 0 || iEnd < 0)
            continue;
        if (iEnd <= iStart)
            continue;
        if ((iStart + 2) > strCharEventData.length() || iEnd > strCharEventData.length())
            continue;

        String strRedirectTarget = strCharEventData.substring(iStart + 2, iEnd).trim();
        hsPageTitle2Redirects.add(strRedirectTarget, strCurrentTitle);

        // if("Venceslav Konstantinov".equalsIgnoreCase(strCurrentTitle) || "Venceslav Konstantinov".equalsIgnoreCase(strRedirectTarget))
        // System.out.println("redirect found: (" + hsPageTitle2Redirects.keySize() + ") " + strCurrentTitle + " => '" + strRedirectTarget + "'");

    }

    Logger.getLogger(WikipediaDumpParser.class.getName())
            .info("Redirects found: " + StringUtils.beautifyNumber(hsPageTitle2Redirects.valueSize()));

    return hsPageTitle2Redirects;

}

From source file:de.dfki.km.leech.parser.wikipedia.WikipediaDumpParser.java

@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {

    try {// w w  w  . j a v a2 s  .  c  om

        // wir iterieren schn ber die page-Eintrge. Darin gibt es dann title, timestamp, <contributor> => <username> und text. den text mssen
        // wir noch bereinigen. dazu nehmen wir eine Vorverarbeitung mit bliki - dazu mssen wir aber selbst nochmal den String vorbereiten und
        // nachbereinigen. Leider.

        WikipediaDumpParserConfig wikipediaDumpParserConfig = context.get(WikipediaDumpParserConfig.class);

        if (wikipediaDumpParserConfig == null) {
            Logger.getLogger(WikipediaDumpParser.class.getName())
                    .info("No wikipedia parser config found. Will take the default one.");
            wikipediaDumpParserConfig = new WikipediaDumpParserConfig();
        }

        TikaInputStream tikaStream = TikaInputStream.get(stream);

        File fWikipediaDumpFile4Stream = tikaStream.getFile();

        MultiValueHashMap<String, String> hsPageTitle2Redirects = new MultiValueHashMap<String, String>();
        if (wikipediaDumpParserConfig.determinePageRedirects)
            hsPageTitle2Redirects = getPageTitle2Redirects(new FileInputStream(fWikipediaDumpFile4Stream));

        HashSet<String> hsRedirectPageTitles = new HashSet<String>(hsPageTitle2Redirects.values());

        String strCleanedText = "";
        String strBaseURL = null;

        XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance();
        XMLEventReader xmlEventReader = xmlInputFactory
                .createXMLEventReader(new FileInputStream(fWikipediaDumpFile4Stream), "Utf-8");
        while (xmlEventReader.hasNext()) {

            XMLEvent xmlEvent = xmlEventReader.nextEvent();

            if (xmlEvent.isEndElement() && xmlEvent.asEndElement().getName().getLocalPart().equals("page")) {
                if (metadata.size() == 0)
                    continue;

                // den mimetype wollen wir auch noch in den Metadaten haben
                metadata.add(Metadata.CONTENT_TYPE, "application/wikipedia+xml");

                XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
                xhtml.startDocument();

                xhtml.startElement("p");
                xhtml.characters(strCleanedText.toCharArray(), 0, strCleanedText.length());
                xhtml.endElement("p");

                xhtml.endDocument();

            }

            if (!xmlEvent.isStartElement())
                continue;

            // ##### die siteinfo

            if (strBaseURL == null && xmlEvent.asStartElement().getName().getLocalPart().equals("base")) {
                // http://de.wikipedia.org/wiki/Wikipedia:Hauptseite =>http://de.wikipedia.org/wiki/
                strBaseURL = readNextCharEventsText(xmlEventReader);
                strBaseURL = strBaseURL.substring(0, strBaseURL.lastIndexOf("/") + 1);
            }

            // ##### die page

            if (xmlEvent.asStartElement().getName().getLocalPart().equals("page")) {
                for (String strKey : metadata.names())
                    metadata.remove(strKey);
            }

            // ##### der Title

            if (xmlEvent.asStartElement().getName().getLocalPart().equals("title")) {
                // wir merken uns immer den aktuellen Titel
                String strCurrentTitle = readNextCharEventsText(xmlEventReader);

                if (strCurrentTitle.equalsIgnoreCase("DuckDuckGo")) {
                    int fasd = 8;
                }

                if (strCurrentTitle.toLowerCase().contains("duck")
                        && strCurrentTitle.toLowerCase().contains("go")) {
                    int is = 666;
                }

                // wenn der Titel eine redirect-Page ist, dann tragen wir die ganze Page aus der EventQueue aus, springen an das endPage, und
                // haben somit diese Seite ignoriert. Ferner ignorieren wir auch spezielle wikipedia-Seiten
                String strSmallTitle = strCurrentTitle.trim().toLowerCase();
                if (hsRedirectPageTitles.contains(strCurrentTitle)
                        || hsRedirectPageTitles.contains(strSmallTitle)
                        || hsRedirectPageTitles.contains(strCurrentTitle.trim())
                        || strSmallTitle.startsWith("category:") || strSmallTitle.startsWith("kategorie:")
                        || strSmallTitle.startsWith("vorlage:") || strSmallTitle.startsWith("template:")
                        || strSmallTitle.startsWith("hilfe:") || strSmallTitle.startsWith("help:")
                        || strSmallTitle.startsWith("wikipedia:") || strSmallTitle.startsWith("portal:")
                        || strSmallTitle.startsWith("mediawiki:")) {

                    while (true) {
                        XMLEvent nextXmlEvent = xmlEventReader.nextEvent();
                        if (nextXmlEvent.isEndElement()
                                && nextXmlEvent.asEndElement().getName().getLocalPart().equals("page"))
                            break;
                    }
                } else {
                    metadata.add(Metadata.TITLE, strCurrentTitle);
                    metadata.add(Metadata.SOURCE, strBaseURL + strCurrentTitle);

                    for (String strRedirect : hsPageTitle2Redirects.get(strCurrentTitle)) {
                        // wir ignorieren Titel, die sich lediglich durch gro/kleinschreibung unterscheiden
                        if (!StringUtils.containsIgnoreCase(strRedirect, metadata.getValues(Metadata.TITLE)))
                            metadata.add(Metadata.TITLE, strRedirect);
                    }
                }

                continue;
            }

            // ##### der text
            if (xmlEvent.asStartElement().getName().getLocalPart().equals("text")) {
                String strText = readNextCharEventsText(xmlEventReader);

                if (wikipediaDumpParserConfig.parseLinksAndCategories)
                    parseLinksAndCategories(strText, strBaseURL, metadata, handler);
                if (wikipediaDumpParserConfig.parseInfoBoxes)
                    parseInfoBox(strText, metadata, handler);
                if (wikipediaDumpParserConfig.parseGeoCoordinates)
                    parseGeoCoordinates(strText, metadata);

                // aufgrund einiger Defizite in dem verwendeten cleaner mssen wir hier leider noch zu-und nacharbeiten
                strText = strText.replaceAll("==\n", "==\n\n");
                strText = strText.replaceAll("\n==", "\n\n==");

                strCleanedText = m_wikiModel.render(new PlainTextConverter(), strText);

                strCleanedText = strCleanedText.replaceAll("\\{\\{", " ");
                strCleanedText = strCleanedText.replaceAll("\\}\\}", " ");

                strCleanedText = StringEscapeUtils.unescapeHtml4(strCleanedText);

                continue;
            }

            // ##### der timestamp
            if (xmlEvent.asStartElement().getName().getLocalPart().equals("timestamp")) {
                String strTimestamp = readNextCharEventsText(xmlEventReader);

                metadata.add(Metadata.MODIFIED, strTimestamp);

                continue;
            }

            // ##### der username
            if (xmlEvent.asStartElement().getName().getLocalPart().equals("username")) {
                String strUsername = readNextCharEventsText(xmlEventReader);

                metadata.add(Metadata.CREATOR, strUsername);

                continue;
            }

        }

    } catch (Exception e) {
        Logger.getLogger(WikipediaDumpParser.class.getName()).log(Level.SEVERE, "Error", e);
    }

}

From source file:ca.phon.session.io.xml.v12.XMLSessionReader_v12.java

@Override
public boolean canRead(File file) throws IOException {
    // open file and make sure the first
    // element is 'session' with the correct version
    boolean canRead = false;

    // use StAX to read only first element
    // create StAX reader
    XMLInputFactory factory = XMLInputFactory.newInstance();
    XMLEventReader reader = null;
    try (FileInputStream source = new FileInputStream(file)) {
        //BufferedReader in = new BufferedReader(new InputStreamReader(source, "UTF-8"));
        XMLEventReader xmlReader = factory.createXMLEventReader(source, "UTF-8");
        reader = factory.createFilteredReader(xmlReader, new XMLWhitespaceFilter());

        XMLEvent evt;/*from  w  w  w  .jav  a2  s  . com*/
        while (!(evt = reader.nextEvent()).isStartElement())
            ;
        canRead = evt.asStartElement().getName().getLocalPart().equals("session")
                && evt.asStartElement().getAttributeByName(new QName("version")).getValue().equals("PB1.2");
    } catch (XMLStreamException e) {
        throw new IOException(e);
    }

    return canRead;
}