List of usage examples for org.dom4j.io SAXWriter write
public void write(ProcessingInstruction pi) throws SAXException
From source file:bixo.examples.webmining.AnalyzeHtml.java
License:Apache License
@SuppressWarnings("rawtypes") @Override//from www. ja v a 2 s .c o m protected void process(ParsedDatum datum, Document doc, TupleEntryCollector collector, FlowProcess process) throws Exception { SimpleBodyContentHandler bodyContentHandler = new SimpleBodyContentHandler(); SAXWriter writer = new SAXWriter(bodyContentHandler); writer.write(doc); float pageScore = getScore(bodyContentHandler.toString()); // Get the outlinks. Outlink[] outlinks = getOutlinks(doc); // Extract all of the images, and use them as page results. PageResult[] pageResults = extractImages(datum.getUrl(), doc, outlinks); _result.setUrl(datum.getUrl()); _result.setPageScore(pageScore); _result.setOutlinks(outlinks); _result.setPageResults(pageResults); collector.add(BixoPlatform.clone(_result.getTuple(), process)); }
From source file:com.alibaba.citrus.springext.util.DomUtil.java
License:Open Source License
/** W3C element??SAX */ public static void convertElement(Element element, ContentHandler contentHandler) throws SAXException { SAXWriter writer = new SAXWriter(contentHandler); if (contentHandler instanceof ErrorHandler) { writer.setErrorHandler((ErrorHandler) contentHandler); }//from w w w .j a va 2 s . c o m if (contentHandler instanceof LexicalHandler) { writer.setLexicalHandler((LexicalHandler) contentHandler); } writer.write(convertElement(element)); }
From source file:com.christophermrossi.jpt.HTMLFragment.java
License:Open Source License
public void toXhtml(ContentHandler contentHandler, LexicalHandler lexicalHandler) throws PageTemplateException, SAXException { if (dom == null) { parseFragment();//from w w w . j ava2 s . co m } SAXWriter writer = new SAXWriter(contentHandler, lexicalHandler); for (Iterator i = dom.nodeIterator(); i.hasNext();) { Node node = (Node) i.next(); writer.write(node); } }
From source file:com.finderbots.miner.AnalyzeHtml.java
License:Apache License
@Override protected void process(ParsedDatum datum, Document doc, TupleEntryCollector collector) throws Exception { SimpleBodyContentHandler bodyContentHandler = new SimpleBodyContentHandler(); SAXWriter writer = new SAXWriter(bodyContentHandler); writer.write(doc); //float pageScore = getScore(bodyContentHandler.toString()); float pageScore = (float) 1.0; // Get the outlinks. Outlink[] outlinks = getOutlinks(doc); // Extract all of the images, and use them as page results. PageResult[] pageResults = extractImages(datum.getUrl(), doc, outlinks); _result.setUrl(datum.getUrl());//from w ww . ja v a 2 s. c o m _result.setPageScore(pageScore); _result.setOutlinks(outlinks); _result.setPageResults(pageResults); collector.add(_result.getTuple()); }
From source file:com.finderbots.miner2.pinterest.AnalyzeHtml.java
License:Apache License
@Override protected void process(ParsedDatum datum, Document doc, TupleEntryCollector collector) throws Exception { if (_urlsToMineFilter == null || !_urlsToMineFilter.isRemove(datum.getUrl().toString())) { SimpleBodyContentHandler bodyContentHandler = new SimpleBodyContentHandler(); SAXWriter writer = new SAXWriter(bodyContentHandler); writer.write(doc); //float pageScore = getScore(bodyContentHandler.toString()); float pageScore = (float) 1.0; // Get the outlinks. Outlink[] outlinks = getOutlinks(doc); BooleanPreference[] pageResults = getPrefs(datum.getUrl().toString(), doc); // Extract all of the images, and use them as page results. //PageResult[] pageResults = extractImages(datum.getUrl(), doc, outlinks); _result.setUrl(datum.getUrl());/* ww w . jav a 2 s . c o m*/ _result.setPageScore(pageScore); _result.setOutlinks(outlinks); _result.setPageResults(pageResults); collector.add(_result.getTuple()); } }
From source file:com.finderbots.miner2.tomatoes.MineRTCriticsPreferences.java
License:Apache License
@Override protected void process(ParsedDatum datum, Document doc, TupleEntryCollector collector) throws Exception { LOGGER.debug(this.getClass().toString() + " Got datum for url: " + datum.getUrl()); if (_urlsToMineFilter == null || !_urlsToMineFilter.isRemove(datum.getUrl().toString())) { // currently mines all pages so the fields in the tuple/datum must // ALL be set every time. Either set for prefs OR media pages, not both // todo: split into two datum types and mine separately? SimpleBodyContentHandler bodyContentHandler = new SimpleBodyContentHandler(); SAXWriter writer = new SAXWriter(bodyContentHandler); writer.write(doc); // Mine that data. String url = datum.getUrl(); _result.setUrl(url);/*from w ww . j av a 2s. com*/ if (url.contains("/critic/")) {// mining a critic page _result.setItemId(""); _result.setItemName(""); _result.setPosterImageUrl(""); MultiValuePreference[] prefs = minePrefs(url, doc); _result.setPrefs(prefs); } else if (url.contains("/m/")) {//mining a media page _result.setItemId(mineItemId(url)); _result.setItemName(mineItemName(doc)); _result.setPosterImageUrl(minePosterImageUrl(doc)); _result.setPrefs(new MultiValuePreference[0]); } else {// not a page to mine, should be filtered out so throw an exception? //throw new Exception("Got a page that should not be mined: "+url); LOGGER.info("URLs to mine not working, getting urls that we don't mine like: " + url); } collector.add(_result.getTuple()); } }
From source file:com.webslingerz.jpt.HTMLFragment.java
License:Open Source License
/**
 * Writes every top-level node of this fragment to the supplied SAX handlers.
 *
 * <p>{@code parseFragment()} is invoked first when {@code dom} is still {@code null}
 * (presumably it populates {@code dom} -- confirm against its definition).
 *
 * @param contentHandler receives the content events
 * @param lexicalHandler receives the lexical events
 * @throws PageTemplateException declared for the parse step
 * @throws SAXException          if writing a node fails
 */
public void toXhtml(ContentHandler contentHandler, LexicalHandler lexicalHandler)
        throws PageTemplateException, SAXException {
    if (dom == null) {
        parseFragment();
    }

    SAXWriter saxWriter = new SAXWriter(contentHandler, lexicalHandler);
    Iterator<Node> nodes = dom.nodeIterator();
    while (nodes.hasNext()) {
        saxWriter.write(nodes.next());
    }
}
From source file:de.tudarmstadt.ukp.clarin.webanno.tei.TeiReader.java
License:Apache License
@Override public void getNext(CAS aCAS) throws IOException, CollectionException { initCas(aCAS, currentResource);//w ww . j ava 2 s.co m InputStream is = null; try { JCas jcas = aCAS.getJCas(); // Create handler Handler handler = newSaxHandler(); handler.setJCas(jcas); handler.setLogger(getLogger()); // Parse TEI text SAXWriter writer = new SAXWriter(handler); writer.write(currentTeiElement); handler.endDocument(); } catch (CASException e) { throw new CollectionException(e); } catch (SAXException e) { throw new IOException(e); } catch (Exception e) { throw new IOException("This is not a valid WebAnno CPH TEI file"); } finally { closeQuietly(is); } // Move currentTeiElement to the next text nextTeiElement(); }
From source file:de.tudarmstadt.ukp.dkpro.core.io.tei.TeiReader.java
License:Apache License
@Override public void getNext(CAS aCAS) throws IOException, CollectionException { initCas(aCAS, currentResource);//w w w . j a v a 2 s .co m // Set up language if (getConfigParameterValue(PARAM_LANGUAGE) != null) { aCAS.setDocumentLanguage((String) getConfigParameterValue(PARAM_LANGUAGE)); } // Configure mapping only now, because now the language is set in the CAS try { posMappingProvider.configure(aCAS); } catch (AnalysisEngineProcessException e1) { throw new IOException(e1); } InputStream is = null; try { JCas jcas = aCAS.getJCas(); // Create handler Handler handler = newSaxHandler(); handler.setJCas(jcas); handler.setLogger(getLogger()); // Parse TEI text SAXWriter writer = new SAXWriter(handler); writer.write(currentTeiElement); handler.endDocument(); } catch (CASException e) { throw new CollectionException(e); } catch (SAXException e) { throw new IOException(e); } finally { closeQuietly(is); } // Move currentTeiElement to the next text nextTeiElement(); }
From source file:edu.umd.cs.buildServer.inspection.FindBugsDocumentBuilder.java
License:Apache License
@Override protected void documentFinished() { try {/*from w ww . ja va 2s . c o m*/ // Generate a BugCollection from the dom4j tree SAXBugCollectionHandler handler = new SAXBugCollectionHandler(bugCollection); SAXWriter saxWriter = new SAXWriter(handler); saxWriter.write(getDocument()); } catch (SAXException e) { getLog().info("Couldn't generate BugCollection from findbugs XML output", e); } }