Example usage for javax.xml.stream.events Characters getLocation

List of usage examples for javax.xml.stream.events Characters getLocation

Introduction

On this page you can find example usages of the getLocation() method of javax.xml.stream.events.Characters.

Prototype

javax.xml.stream.Location getLocation();

Source Link

Document

Return the location of this event.

Usage

From source file:edu.jhu.hlt.concrete.ingesters.bolt.BoltForumPostIngester.java

/**
 * Reads the leading events of a document from {@code rdr} and builds the
 * "headline" {@link Section}.
 *
 * <p>Exactly seven events are consumed: three leading events that are
 * discarded, the headline start tag, the headline text, the headline end
 * element and one trailing whitespace event.
 *
 * @param rdr     event reader positioned at the very start of the document
 * @param content full document text; only read when
 *                {@code STRIP_WHITESPACE_OFF_HEADLINE} is set, to look up the
 *                headline text by offset
 * @return a section of kind {@code "headline"} whose number list is {@code [0]}
 * @throws XMLStreamException if the reader fails to produce an event
 * @throws ConcreteException  declared for callers; not raised directly here
 */
private static Section handleHeadline(final XMLEventReader rdr, final String content)
        throws XMLStreamException, ConcreteException {
    // Discard, in order: the document start event, the document ID block,
    // and the whitespace block that follows it.
    for (int skipped = 0; skipped < 3; skipped++) {
        rdr.nextEvent();
    }

    // Headline start tag. It is only inspected for debug output.
    final StartElement headlineStart = rdr.nextEvent().asStartElement();
    final String headlineTag = headlineStart.getName().getLocalPart();
    LOGGER.debug("QN: {}", headlineTag);
    final int headlineTagOffset = headlineStart.getLocation().getCharacterOffset();
    LOGGER.debug("HL part offset: {}", headlineTagOffset);

    // Headline text. The reported offset plus the data length give the
    // raw (untrimmed) span of the headline.
    final Characters headlineChars = rdr.nextEvent().asCharacters();
    final int begin = headlineChars.getLocation().getCharacterOffset();
    final int end = begin + headlineChars.getData().length();

    // Discard the headline end element, then the whitespace after it.
    rdr.nextEvent();
    rdr.nextEvent();

    // Build the span, optionally shrinking it by the padding that
    // trimSpacing reports (key is added to the start, value is taken off
    // the end).
    final TextSpan span;
    if (!STRIP_WHITESPACE_OFF_HEADLINE) {
        span = new TextSpan(begin, end);
    } else {
        final String rawHeadline = content.substring(begin, end);
        final SimpleImmutableEntry<Integer, Integer> padding = trimSpacing(rawHeadline);
        span = new TextSpan(begin + padding.getKey(), end - padding.getValue());
    }
    assert span.getStart() <= span.getEnding() : "ts=" + span;

    final List<Integer> numbering = new ArrayList<>();
    numbering.add(0);

    final Section headline = new Section();
    headline.setKind("headline");
    headline.setTextSpan(span);
    headline.setNumberList(numbering);
    return headline;
}

From source file:edu.jhu.hlt.concrete.ingesters.webposts.WebPostIngester.java

/**
 * Walks the fixed-layout preamble of a document, copying its metadata onto
 * {@code cptr}, and builds the "headline" {@link Section}.
 *
 * <p>The events are consumed strictly by position. Along the way the docid
 * text is stored as the communication ID, the doctype text as its type, and
 * the datetime text (parsed with {@code this.dtf}, converted to UTC) as its
 * start time in whole seconds.
 *
 * @param rdr     event reader positioned at the very start of the document
 * @param content full document text, used to look up the headline by offset
 * @param cptr    communication that receives the ID, type and start time
 * @return a section of kind {@code "headline"} whose number list is {@code [0]}
 * @throws XMLStreamException if the reader fails to produce an event
 * @throws ConcreteException  declared for callers; not raised directly here
 */
private Section handleBeginning(final XMLEventReader rdr, final String content, final Communication cptr)
        throws XMLStreamException, ConcreteException {
    // Discard, in order: document start event, document block, whitespace
    // block, docid start tag.
    for (int i = 0; i < 4; i++) {
        rdr.nextEvent();
    }

    // Docid text becomes the communication ID.
    final String docId = rdr.nextEvent().asCharacters().getData().trim();
    cptr.setId(docId);

    // Discard, in order: docid end element, whitespace, doctype start tag.
    for (int i = 0; i < 3; i++) {
        rdr.nextEvent();
    }

    // Doctype text becomes the communication type.
    final String docType = rdr.nextEvent().asCharacters().getData().trim();
    cptr.setType(docType);

    // Discard, in order: doctype end element, whitespace, datetime start tag.
    for (int i = 0; i < 3; i++) {
        rdr.nextEvent();
    }

    // Datetime text is parsed, moved to UTC and stored as epoch seconds.
    final String rawDateTime = rdr.nextEvent().asCharacters().getData().trim();
    final DateTime utc = this.dtf.parseDateTime(rawDateTime).toDateTime(DateTimeZone.UTC);
    LOGGER.debug("Got DateTime: {}", utc.toString());
    cptr.setStartTime(utc.getMillis() / 1000);

    // Discard, in order: datetime end element, whitespace, body start tag,
    // whitespace.
    for (int i = 0; i < 4; i++) {
        rdr.nextEvent();
    }

    // Headline start tag. It is only inspected for debug output.
    final StartElement headlineStart = rdr.nextEvent().asStartElement();
    final String headlineTag = headlineStart.getName().getLocalPart();
    LOGGER.debug("QN: {}", headlineTag);

    // Headline text. The reported offset plus the data length give the
    // raw (untrimmed) span of the headline.
    final Characters headlineChars = rdr.nextEvent().asCharacters();
    final int begin = headlineChars.getLocation().getCharacterOffset();
    final int end = begin + headlineChars.getData().length();

    // Shrink the span by the padding trimSpacing reports (key is added to
    // the start, value is taken off the end).
    final SimpleImmutableEntry<Integer, Integer> padding = trimSpacing(content.substring(begin, end));
    final TextSpan span = new TextSpan(begin + padding.getKey(), end - padding.getValue());

    final List<Integer> numbering = new ArrayList<>();
    numbering.add(0);

    final Section headline = new Section();
    headline.setKind("headline");
    headline.setTextSpan(span);
    headline.setNumberList(numbering);
    return headline;
}

From source file:edu.jhu.hlt.concrete.ingesters.bolt.BoltForumPostIngester.java

/**
 * Builds a {@link Communication} from the XML file at {@code path}.
 *
 * <p>The file is read twice: once in full as UTF-8 to populate the
 * communication text, and once through an {@link XMLEventReader} whose event
 * character offsets are used as indices into that text. The headline becomes
 * the first section; each non-whitespace character run encountered afterwards
 * becomes a "post" section, and start elements carrying both an
 * {@code author} and a {@code datetime} attribute additionally yield
 * "author" and "datetime" sections.
 *
 * @param path location of the file to ingest
 * @return the populated communication
 * @throws IngestException if nothing exists at {@code path}, if it is not a
 *         regular file, if reading fails, or if XML parsing or offset-based
 *         substring extraction fails
 */
@Override
public Communication fromCharacterBasedFile(final Path path) throws IngestException {
    if (!Files.exists(path))
        throw new IngestException("No file at: " + path.toString());

    AnalyticUUIDGeneratorFactory f = new AnalyticUUIDGeneratorFactory();
    AnalyticUUIDGenerator gen = f.create();
    Communication c = new Communication();
    c.setUuid(gen.next());
    c.setType(this.getKind());
    c.setMetadata(TooledMetadataConverter.convert(this));

    try {
        // The communication ID is the file name up to its first dot.
        ExistingNonDirectoryFile ef = new ExistingNonDirectoryFile(path);
        c.setId(ef.getName().split("\\.")[0]);
    } catch (NoSuchFileException | NotFileException e) {
        // might throw if path is a directory.
        throw new IngestException(path.toString() + " is not a file, or is a directory.");
    }

    // First pass: slurp the whole file so offsets can be resolved to text.
    String content;
    try (InputStream is = Files.newInputStream(path);
            BufferedInputStream bin = new BufferedInputStream(is, 1024 * 8 * 8);) {
        content = IOUtils.toString(bin, StandardCharsets.UTF_8);
        c.setText(content);
    } catch (IOException e) {
        throw new IngestException(e);
    }

    // Second pass: stream XML events over the same bytes.
    try (InputStream is = Files.newInputStream(path);
            BufferedInputStream bin = new BufferedInputStream(is, 1024 * 8 * 8);
            BufferedReader reader = new BufferedReader(new InputStreamReader(bin, StandardCharsets.UTF_8));) {
        XMLEventReader rdr = null;
        try {
            rdr = inF.createXMLEventReader(reader);

            // Below method moves the reader
            // to the first post element.
            Section headline = handleHeadline(rdr, content);
            headline.setUuid(gen.next());
            c.addToSectionList(headline);
            int start = headline.getTextSpan().getStart();
            int ending = headline.getTextSpan().getEnding();
            if (ending < start)
                ending = start; // @tongfei: handle empty headlines
            String htxt = c.getText().substring(start, ending);
            LOGGER.debug("headline text: {}", htxt);

            // Section indices.
            int sectNumber = 1;
            int subSect = 0;

            // Move iterator to post start element.
            this.iterateToPosts(rdr);

            // Offset pointer.
            int currOff = -1;

            SectionFactory sf = new SectionFactory(gen);

            // First post element.
            while (rdr.hasNext()) {
                XMLEvent nextEvent = rdr.nextEvent();
                currOff = nextEvent.getLocation().getCharacterOffset();
                if (currOff > 0) {
                    int currOffPlus = currOff + 20;
                    // Clamp the window start at 0. Without the clamp, an
                    // offset below 20 gives a negative substring index and
                    // the resulting exception would abort ingestion over
                    // what is only a debug message.
                    int currOffLess = Math.max(0, currOff - 20);
                    LOGGER.debug("Offset: {}", currOff);
                    if (currOffPlus < content.length())
                        LOGGER.debug("Surrounding text: {}", content.substring(currOffLess, currOffPlus));
                }

                // First: see if document is going to end.
                // If yes: exit.
                if (nextEvent.isEndDocument())
                    break;

                // Check if start element.
                if (nextEvent.isStartElement()) {
                    StartElement se = nextEvent.asStartElement();
                    QName name = se.getName();
                    final String localName = name.getLocalPart();
                    LOGGER.debug("Hit start element: {}", localName);

                    //region
                    // Add sections for authors and datetimes for each bolt post
                    // by Tongfei Chen
                    Attribute attrAuthor = se.getAttributeByName(QName.valueOf("author"));
                    Attribute attrDateTime = se.getAttributeByName(QName.valueOf("datetime"));

                    if (attrAuthor != null && attrDateTime != null) {

                        // NOTE(review): the spans below are derived purely by
                        // adding literal prefix lengths to the attribute's
                        // reported offset, i.e. they assume the tag is laid
                        // out as <post author="..." datetime="...">. The
                        // datetime step does not visibly add the closing
                        // quote of the author value or the opening quote of
                        // the datetime value - confirm against real data and
                        // the parser's offset semantics.
                        int loc = attrAuthor.getLocation().getCharacterOffset();

                        int sectAuthorBeginningOffset = loc + "<post author=\"".length();

                        Section sectAuthor = sf.fromTextSpan(new TextSpan(sectAuthorBeginningOffset,
                                sectAuthorBeginningOffset + attrAuthor.getValue().length()), "author");
                        c.addToSectionList(sectAuthor);

                        int sectDateTimeBeginningOffset = sectAuthorBeginningOffset
                                + attrAuthor.getValue().length() + " datetime=".length();

                        Section sectDateTime = sf.fromTextSpan(
                                new TextSpan(sectDateTimeBeginningOffset,
                                        sectDateTimeBeginningOffset + attrDateTime.getValue().length()),
                                "datetime");
                        c.addToSectionList(sectDateTime);
                    }
                    //endregion

                    // Move past quotes, images, and links.
                    if (localName.equals(QUOTE_LOCAL_NAME)) {
                        this.handleQuote(rdr);
                    } else if (localName.equals(IMG_LOCAL_NAME)) {
                        this.handleImg(rdr);
                    } else if (localName.equals(LINK_LOCAL_NAME)) {
                        this.handleLink(rdr);
                    }

                    // not a start element
                } else if (nextEvent.isCharacters()) {
                    Characters chars = nextEvent.asCharacters();
                    int coff = chars.getLocation().getCharacterOffset();
                    if (!chars.isWhiteSpace()) {
                        // content to be captured
                        String fpContent = chars.getData();
                        LOGGER.debug("Character offset: {}", coff);
                        LOGGER.debug("Character based data: {}", fpContent);

                        // Shrink the span by the padding trimSpacing reports
                        // (key is added to the start, value is taken off the end).
                        SimpleImmutableEntry<Integer, Integer> pads = trimSpacing(fpContent);
                        final int tsb = currOff + pads.getKey();
                        final int tse = currOff + fpContent.length() - pads.getValue();
                        final String subs = content.substring(tsb, tse);
                        // Skip spans that contain only space separators and newlines.
                        if (subs.replaceAll("\\p{Zs}", "").replaceAll("\\n", "").isEmpty()) {
                            LOGGER.info("Found empty section: skipping.");
                            continue;
                        }

                        LOGGER.debug("Section text: {}", subs);
                        TextSpan ts = new TextSpan(tsb, tse);

                        // Number list is [post index, index of this run within the post].
                        Section s = sf.fromTextSpan(ts, "post");
                        List<Integer> intList = new ArrayList<>();
                        intList.add(sectNumber);
                        intList.add(subSect);
                        s.setNumberList(intList);
                        c.addToSectionList(s);

                        subSect++;
                    }
                } else if (nextEvent.isEndElement()) {
                    EndElement ee = nextEvent.asEndElement();
                    currOff = ee.getLocation().getCharacterOffset();
                    QName name = ee.getName();
                    String localName = name.getLocalPart();
                    LOGGER.debug("Hit end element: {}", localName);
                    // Closing a post starts a new top-level section number.
                    if (localName.equalsIgnoreCase(POST_LOCAL_NAME)) {
                        sectNumber++;
                        subSect = 0;
                    }
                }
            }
            return c;
        } catch (XMLStreamException | ConcreteException | StringIndexOutOfBoundsException x) {
            throw new IngestException(x);
        } finally {
            if (rdr != null)
                try {
                    rdr.close();
                } catch (XMLStreamException e) {
                    // not likely.
                    LOGGER.info("Error closing XMLReader.", e);
                }
        }
    } catch (IOException e) {
        throw new IngestException(e);
    }
}