List of usage examples for org.xml.sax.ContentHandler.toString()
public String toString()
From source file:uib.scratch.WriteIndex.java
/** * @param args//from ww w . ja va 2s.c o m * @throws java.io.IOException * @throws org.xml.sax.SAXException */ public static void main(String[] args) throws IOException, SAXException { File docs = new File("documents"); File indexDir = new File(INDEX_DIRECTORY); //Directory directory = FSDirectory.open(indexDir); Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30); //IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_30, analyzer); //IndexWriter writer = new IndexWriter(directory, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED); IndexWriter writer = new IndexWriter(FSDirectory.open(indexDir), analyzer, true, IndexWriter.MaxFieldLength.LIMITED); System.out.println(indexDir); writer.deleteAll(); for (File file : docs.listFiles()) { Metadata metadata = new Metadata(); ContentHandler handler = new BodyContentHandler(); ParseContext context = new ParseContext(); Parser parser = new AutoDetectParser(); InputStream stream = new FileInputStream(file); try { parser.parse(stream, handler, metadata, context); } catch (TikaException e) { } catch (IOException e) { e.printStackTrace(); } finally { stream.close(); } String text = handler.toString(); String fileName = file.getName(); Document doc = new Document(); doc.add(new Field("file", fileName, Store.YES, Index.NO)); for (String key : metadata.names()) { String name = key.toLowerCase(); String value = metadata.get(key); if (StringUtils.isEmpty(value)) { continue; } if ("keywords".equalsIgnoreCase(key)) { for (String keyword : value.split(",?(\\s+)")) { doc.add(new Field(name, keyword, Store.YES, Index.NOT_ANALYZED)); } } else if ("title".equalsIgnoreCase(key)) { doc.add(new Field(name, value, Store.YES, Index.ANALYZED)); } else { doc.add(new Field(name, fileName, Store.YES, Index.NOT_ANALYZED)); } } doc.add(new Field("text", text, Store.NO, Index.ANALYZED)); writer.addDocument(doc); } writer.commit(); //.deleteUnusedFiles(); System.out.println(writer.maxDoc() + " documents written"); }
From source file:us.colloquy.sandbox.TestExtractor.java
@Test public void getContent() throws TikaException, SAXException, IOException { File file = new File("temp/OEBPS/Text/0001_1006_2002.xhtml"); InputStream input = new ByteArrayInputStream(FileUtils.readFileToByteArray(file)); ContentHandler text = new BodyContentHandler();//<co id="html.text.co"/> LinkContentHandler links = new LinkContentHandler();//<co id="html.link.co"/> ContentHandler handler = new TeeContentHandler(links, text);//<co id="html.merge"/> Metadata metadata = new Metadata(); Parser parser = new XMLParser(); ParseContext context = new ParseContext(); parser.parse(input, handler, metadata, context);//<co id="html.parse"/> listAvailableMetaDataFields(metadata); System.out.println("Title: " + metadata.get(Metadata.TITLE)); // System.out.println("Body: " + text.toString()); String[] contentArray = text.toString().split("\n"); for (String line : contentArray) { System.out.println(line); }//w ww . ja v a 2 s .co m // System.out.println("Links: " + links.getLinks()); }
From source file:us.colloquy.sandbox.TestExtractor.java
@Test public void parseOnePartToHTML() throws IOException, SAXException, TikaException { // Only get things under html -> body -> div (class=header) XPathParser xhtmlParser = new XPathParser("", XHTMLContentHandler.XHTML); Matcher divContentMatcher = xhtmlParser.parse("*"); ContentHandler handler = new MatchingContentHandler(new ToXMLContentHandler(), divContentMatcher); AutoDetectParser parser = new AutoDetectParser(); Metadata metadata = new Metadata(); File file = new File("temp/OEBPS/Text/0001_1006_2002.xhtml"); try (InputStream stream = new ByteArrayInputStream(FileUtils.readFileToByteArray(file))) { parser.parse(stream, handler, metadata); System.out.println(handler.toString()); }//from w w w. j a v a 2 s.com }