List of usage examples for javax.xml.stream.events.Characters.getLocation()
javax.xml.stream.Location getLocation();
From source file:edu.jhu.hlt.concrete.ingesters.bolt.BoltForumPostIngester.java
private static Section handleHeadline(final XMLEventReader rdr, final String content) throws XMLStreamException, ConcreteException { // The first type is always a document start event. Skip it. rdr.nextEvent();// w w w . j a va 2 s. c o m // The second type is a document ID block. Skip it. rdr.nextEvent(); // The third type is a whitespace block. Skip it. rdr.nextEvent(); // The next type is a headline start tag. XMLEvent hl = rdr.nextEvent(); StartElement hlse = hl.asStartElement(); QName hlqn = hlse.getName(); final String hlPart = hlqn.getLocalPart(); LOGGER.debug("QN: {}", hlPart); int hlPartOff = hlse.getLocation().getCharacterOffset(); LOGGER.debug("HL part offset: {}", hlPartOff); // Text of the headline. This would be useful for purely getting // the content, but for offsets, it's not that useful. Characters cc = rdr.nextEvent().asCharacters(); int charOff = cc.getLocation().getCharacterOffset(); int clen = cc.getData().length(); // The next part is the headline end element. Skip. rdr.nextEvent(); // Whitespace. Skip. rdr.nextEvent(); // Reader is now pointing at the first post. // Construct section, text span, etc. final int charOffPlusLen = charOff + clen; // Strip whitespace off TextSpan ts; if (STRIP_WHITESPACE_OFF_HEADLINE) { final String hlText = content.substring(charOff, charOffPlusLen); SimpleImmutableEntry<Integer, Integer> pads = trimSpacing(hlText); ts = new TextSpan(charOff + pads.getKey(), charOffPlusLen - pads.getValue()); } else { ts = new TextSpan(charOff, charOffPlusLen); } assert ts.getStart() <= ts.getEnding() : "ts=" + ts; Section s = new Section(); s.setKind("headline"); s.setTextSpan(ts); List<Integer> intList = new ArrayList<>(); intList.add(0); s.setNumberList(intList); return s; }
From source file:edu.jhu.hlt.concrete.ingesters.webposts.WebPostIngester.java
/**
 * Walks the fixed preamble of a web-post document, copying the document ID,
 * document type and timestamp onto {@code cptr}, and returns the "headline"
 * {@link Section}.
 *
 * <p>The expected event order is: document start, document block, whitespace,
 * docid (start/text/end), whitespace, doctype (start/text/end), whitespace,
 * datetime (start/text/end), whitespace, body start, whitespace, headline
 * start, headline text. The reader is left positioned just after the
 * headline text.
 *
 * @param rdr     event reader positioned at the very beginning of the document
 * @param content full document text; the headline span is sliced from it
 * @param cptr    communication whose id, type and start time are set here
 * @return a section of kind {@code "headline"} with number list {@code [0]}
 * @throws XMLStreamException if the reader fails to produce an event
 * @throws ConcreteException  declared by the signature; propagated from callees
 */
private Section handleBeginning(final XMLEventReader rdr, final String content, final Communication cptr)
        throws XMLStreamException, ConcreteException {
    // Discard: document start, document block, whitespace, docid start tag.
    for (int skipped = 0; skipped < 4; skipped++) {
        rdr.nextEvent();
    }

    // Docid text becomes the communication ID.
    final String documentId = rdr.nextEvent().asCharacters().getData().trim();
    cptr.setId(documentId);

    // Discard: docid end tag, whitespace, doctype start tag.
    for (int skipped = 0; skipped < 3; skipped++) {
        rdr.nextEvent();
    }

    // Doctype text becomes the communication type.
    final String documentType = rdr.nextEvent().asCharacters().getData().trim();
    cptr.setType(documentType);

    // Discard: doctype end tag, whitespace, datetime start tag.
    for (int skipped = 0; skipped < 3; skipped++) {
        rdr.nextEvent();
    }

    // Datetime text: parsed, converted to UTC and stored as seconds
    // (milliseconds divided by 1000).
    final String rawTimestamp = rdr.nextEvent().asCharacters().getData().trim();
    final DateTime utcTimestamp = this.dtf.parseDateTime(rawTimestamp).toDateTime(DateTimeZone.UTC);
    LOGGER.debug("Got DateTime: {}", utcTimestamp.toString());
    cptr.setStartTime(utcTimestamp.getMillis() / 1000);

    // Discard: datetime end tag, whitespace, body start tag, whitespace.
    for (int skipped = 0; skipped < 4; skipped++) {
        rdr.nextEvent();
    }

    // Headline start tag; only used for diagnostics.
    final StartElement headlineStart = rdr.nextEvent().asStartElement();
    LOGGER.debug("QN: {}", headlineStart.getName().getLocalPart());

    // Headline text: offset from the event location, length from the data.
    final Characters headlineChars = rdr.nextEvent().asCharacters();
    final int spanBegin = headlineChars.getLocation().getCharacterOffset();
    final int spanEnd = spanBegin + headlineChars.getData().length();

    // Shrink the span by the leading/trailing padding reported by trimSpacing.
    final SimpleImmutableEntry<Integer, Integer> padding =
            trimSpacing(content.substring(spanBegin, spanEnd));
    final TextSpan span = new TextSpan(spanBegin + padding.getKey(), spanEnd - padding.getValue());

    final List<Integer> numbering = new ArrayList<>();
    numbering.add(0);

    final Section headline = new Section();
    headline.setKind("headline");
    headline.setTextSpan(span);
    headline.setNumberList(numbering);
    return headline;
}
From source file:edu.jhu.hlt.concrete.ingesters.bolt.BoltForumPostIngester.java
@Override public Communication fromCharacterBasedFile(final Path path) throws IngestException { if (!Files.exists(path)) throw new IngestException("No file at: " + path.toString()); AnalyticUUIDGeneratorFactory f = new AnalyticUUIDGeneratorFactory(); AnalyticUUIDGenerator gen = f.create(); Communication c = new Communication(); c.setUuid(gen.next());//from w ww . java2 s. c o m c.setType(this.getKind()); c.setMetadata(TooledMetadataConverter.convert(this)); try { ExistingNonDirectoryFile ef = new ExistingNonDirectoryFile(path); c.setId(ef.getName().split("\\.")[0]); } catch (NoSuchFileException | NotFileException e) { // might throw if path is a directory. throw new IngestException(path.toString() + " is not a file, or is a directory."); } String content; try (InputStream is = Files.newInputStream(path); BufferedInputStream bin = new BufferedInputStream(is, 1024 * 8 * 8);) { content = IOUtils.toString(bin, StandardCharsets.UTF_8); c.setText(content); } catch (IOException e) { throw new IngestException(e); } try (InputStream is = Files.newInputStream(path); BufferedInputStream bin = new BufferedInputStream(is, 1024 * 8 * 8); BufferedReader reader = new BufferedReader(new InputStreamReader(bin, StandardCharsets.UTF_8));) { XMLEventReader rdr = null; try { rdr = inF.createXMLEventReader(reader); // Below method moves the reader // to the first post element. Section headline = handleHeadline(rdr, content); headline.setUuid(gen.next()); c.addToSectionList(headline); int start = headline.getTextSpan().getStart(); int ending = headline.getTextSpan().getEnding(); if (ending < start) ending = start; // @tongfei: handle empty headlines String htxt = c.getText().substring(start, ending); LOGGER.debug("headline text: {}", htxt); // Section indices. int sectNumber = 1; int subSect = 0; // Move iterator to post start element. this.iterateToPosts(rdr); // Offset pointer. int currOff = -1; SectionFactory sf = new SectionFactory(gen); // First post element. 
while (rdr.hasNext()) { XMLEvent nextEvent = rdr.nextEvent(); currOff = nextEvent.getLocation().getCharacterOffset(); if (currOff > 0) { int currOffPlus = currOff + 20; int currOffLess = currOff - 20; LOGGER.debug("Offset: {}", currOff); if (currOffPlus < content.length()) LOGGER.debug("Surrounding text: {}", content.substring(currOffLess, currOffPlus)); } // First: see if document is going to end. // If yes: exit. if (nextEvent.isEndDocument()) break; // XMLEvent peeker = rdr.peek(); // Check if start element. if (nextEvent.isStartElement()) { StartElement se = nextEvent.asStartElement(); QName name = se.getName(); final String localName = name.getLocalPart(); LOGGER.debug("Hit start element: {}", localName); //region // Add sections for authors and datetimes for each bolt post // by Tongfei Chen Attribute attrAuthor = se.getAttributeByName(QName.valueOf("author")); Attribute attrDateTime = se.getAttributeByName(QName.valueOf("datetime")); if (attrAuthor != null && attrDateTime != null) { int loc = attrAuthor.getLocation().getCharacterOffset(); int sectAuthorBeginningOffset = loc + "<post author=\"".length(); Section sectAuthor = sf.fromTextSpan(new TextSpan(sectAuthorBeginningOffset, sectAuthorBeginningOffset + attrAuthor.getValue().length()), "author"); c.addToSectionList(sectAuthor); int sectDateTimeBeginningOffset = sectAuthorBeginningOffset + attrAuthor.getValue().length() + " datetime=".length(); Section sectDateTime = sf.fromTextSpan( new TextSpan(sectDateTimeBeginningOffset, sectDateTimeBeginningOffset + attrDateTime.getValue().length()), "datetime"); c.addToSectionList(sectDateTime); } //endregion // Move past quotes, images, and links. 
if (localName.equals(QUOTE_LOCAL_NAME)) { this.handleQuote(rdr); } else if (localName.equals(IMG_LOCAL_NAME)) { this.handleImg(rdr); } else if (localName.equals(LINK_LOCAL_NAME)) { this.handleLink(rdr); } // not a start element } else if (nextEvent.isCharacters()) { Characters chars = nextEvent.asCharacters(); int coff = chars.getLocation().getCharacterOffset(); if (!chars.isWhiteSpace()) { // content to be captured String fpContent = chars.getData(); LOGGER.debug("Character offset: {}", coff); LOGGER.debug("Character based data: {}", fpContent); // LOGGER.debug("Character data via offset diff: {}", content.substring(coff - fpContent.length(), coff)); SimpleImmutableEntry<Integer, Integer> pads = trimSpacing(fpContent); final int tsb = currOff + pads.getKey(); final int tse = currOff + fpContent.length() - pads.getValue(); final String subs = content.substring(tsb, tse); if (subs.replaceAll("\\p{Zs}", "").replaceAll("\\n", "").isEmpty()) { LOGGER.info("Found empty section: skipping."); continue; } LOGGER.debug("Section text: {}", subs); TextSpan ts = new TextSpan(tsb, tse); Section s = sf.fromTextSpan(ts, "post"); List<Integer> intList = new ArrayList<>(); intList.add(sectNumber); intList.add(subSect); s.setNumberList(intList); c.addToSectionList(s); subSect++; } } else if (nextEvent.isEndElement()) { EndElement ee = nextEvent.asEndElement(); currOff = ee.getLocation().getCharacterOffset(); QName name = ee.getName(); String localName = name.getLocalPart(); LOGGER.debug("Hit end element: {}", localName); if (localName.equalsIgnoreCase(POST_LOCAL_NAME)) { sectNumber++; subSect = 0; } } } return c; } catch (XMLStreamException | ConcreteException | StringIndexOutOfBoundsException x) { throw new IngestException(x); } finally { if (rdr != null) try { rdr.close(); } catch (XMLStreamException e) { // not likely. LOGGER.info("Error closing XMLReader.", e); } } } catch (IOException e) { throw new IngestException(e); } }