List of usage examples for org.xml.sax.ContentHandler#toString()
public String toString()
From source file:de.mpg.escidoc.services.extraction.ExtractionChain.java
public ExtractionResult doExtract(String infileName, String outfileName) { File outfile = new File(outfileName); Date stepStart = new Date(); Date current;//w w w. j a v a 2 s. com logger.info("Extracting PDF content ----------------------------------------"); logger.info("Infile: " + infileName); logger.info("Outfile: " + outfileName); logger.info(stepStart + " -- started"); // xPDF try { logger.info("Extracting with xPDF"); StringBuffer command = new StringBuffer(2048); command.append(System.getProperty("os.name").contains("Windows") ? pdftotext + " -enc UTF-8 " : "/usr/bin/pdftotext -enc UTF-8 "); command.append(infileName); command.append(" "); command.append(outfileName); Process proc = Runtime.getRuntime().exec(command.toString()); StreamGobbler inputGobbler = new StreamGobbler(proc.getInputStream(), "xPDF"); StreamGobbler errorGobbler = new StreamGobbler(proc.getErrorStream(), "xPDF"); inputGobbler.start(); errorGobbler.start(); int exitCode = proc.waitFor(); if (proc.exitValue() == 0) { if (verbose) { BufferedReader bufferedReader = new BufferedReader( new InputStreamReader(new FileInputStream(outfile), "UTF-8")); String line; while ((line = bufferedReader.readLine()) != null) { logger.info(line); } bufferedReader.close(); } current = new Date(); logger.info(current + " -- finished successfully"); logger.info("Extraction took " + (current.getTime() - stepStart.getTime())); return ExtractionResult.OK; } } catch (Exception e) { logger.warn("Error extracting PDF with xPDF:"); logger.warn(e.getStackTrace()); } current = new Date(); logger.info(current + " -- finished unsuccessfully"); logger.info("Extraction attempt took " + (current.getTime() - stepStart.getTime())); // PDFBox try { logger.info("Extracting with PDFBox"); stepStart = new Date(); StringBuffer command = new StringBuffer(1024); command.append(System.getProperty("os.name").contains("Windows") ? 
"java -Dfile.encoding=UTF-8 -jar " + pdfboxAppJar + " ExtractText " : "/usr/bin/java -Dfile.encoding=UTF-8 -jar " + pdfboxAppJar + " ExtractText "); command.append(infileName); command.append(" "); command.append(outfileName); Process proc = Runtime.getRuntime().exec(command.toString()); StreamGobbler inputGobbler = new StreamGobbler(proc.getInputStream(), "PDFBox"); StreamGobbler errorGobbler = new StreamGobbler(proc.getErrorStream(), "PDFBox"); inputGobbler.start(); errorGobbler.start(); int exitCode = proc.waitFor(); if (exitCode == 0) { if (verbose) { BufferedReader bufferedReader = new BufferedReader( new InputStreamReader(new FileInputStream(outfile), "UTF-8")); String line; while ((line = bufferedReader.readLine()) != null) { logger.info(line); } bufferedReader.close(); } current = new Date(); logger.info(current + " -- finished successfully"); logger.info("Extraction took " + (current.getTime() - stepStart.getTime())); return ExtractionResult.OK; } } catch (Exception e) { logger.warn("Error extracting PDF with PDFBox:"); logger.warn(e.getStackTrace()); } current = new Date(); logger.info(current + " -- finished unsuccessfully"); logger.info("Extraction attempt took " + (current.getTime() - stepStart.getTime())); // iText try { logger.info("Extracting with iText"); stepStart = new Date(); PdfReader reader = new PdfReader(infileName); int numberOfPages = reader.getNumberOfPages(); outputStreamWriter = new OutputStreamWriter(new FileOutputStream(outfile), "UTF-8"); for (int i = 0; i < numberOfPages; i++) { outputStreamWriter.write(PdfTextExtractor.getTextFromPage(reader, i + 1)); } if (verbose) { BufferedReader bufferedReader = new BufferedReader( new InputStreamReader(new FileInputStream(outfile), "UTF-8")); String line; while ((line = bufferedReader.readLine()) != null) { logger.info(line); } bufferedReader.close(); } current = new Date(); logger.info(current + " -- finished successfully"); logger.info("Extraction took " + (current.getTime() - 
stepStart.getTime())); return ExtractionResult.OK; } catch (Exception e) { logger.warn("Error extracting PDF with iText:", e); } // tika InputStream stream = null; try { logger.info("Extracting with Tika"); stepStart = new Date(); stream = TikaInputStream.get(new File(infileName)); ContentHandler handler = new BodyContentHandler(TIKA_CONTENT_SIZE); new AutoDetectParser().parse(stream, handler, new Metadata(), new ParseContext()); String content = handler.toString(); FileUtils.writeStringToFile(outfile, content); stream.close(); if (verbose) { BufferedReader bufferedReader = new BufferedReader( new InputStreamReader(new FileInputStream(outfile), "UTF-8")); String line; while ((line = bufferedReader.readLine()) != null) { logger.info(line); } bufferedReader.close(); } current = new Date(); logger.info(current + " -- finished successfully"); logger.info("Extraction took " + (current.getTime() - stepStart.getTime())); return ExtractionResult.OK; } catch (Exception e) { logger.warn("Error extracting Tika:", e); try { stream.close(); } catch (IOException e1) { e1.printStackTrace(); } } current = new Date(); logger.warn(current + " -- finished unsuccessfully"); logger.info("Extraction attempt took " + (current.getTime() - stepStart.getTime())); logger.info("... giving up"); return ExtractionResult.FAILURE; }
From source file:com.bah.applefox.main.plugins.fulltextindex.FTLoader.java
/** * This method is used to add all information parsed by tika into the * Accumulo table/*from w ww.ja v a 2 s .co m*/ * * @param url * - the URL of the page that has been parsed * @param tikaParsed * - all of the engrams from the page * @throws TikaException * @throws SAXException */ private static boolean addToDataBaseTable(String url) { try { // Connect to the data table BatchWriter writer = AccumuloUtils.connectBatchWrite(dTable); // Let the user know the url is being added System.out.println("Adding " + url + " with prefix " + longSuffix); // Get the input stream (in case it is not an html document InputStream urlInput = new URL(url).openStream(); // Set the page contents (used for filtering if it is an html // document) String pageContents = getPageContents(new URL(url)); // If the document is HTML if (exDivs.size() != 0 && pageContents.toLowerCase().contains("<html>")) { // Filter out some divs (especially generic headers/footers, // etc.) pageContents = DivsFilter.filterDivs(pageContents, exDivs); urlInput = new ByteArrayInputStream(pageContents.getBytes()); } // Parse with tika Parser parser = new AutoDetectParser(); Metadata metadata = new Metadata(); ParseContext context = new ParseContext(); ContentHandler handler = new BodyContentHandler(); parser.parse(urlInput, handler, metadata, context); // Get the keywords of the page and its title String keywords = metadata.get("keywords"); String title = metadata.get("title"); if (title == null) { WebPageCrawl p; try { p = new WebPageCrawl(url, "", Collections.<String>emptySet()); } catch (PageCrawlException e) { log.info(e); return false; } title = p.getTitle(); } // If there are keywords, delimit the commas, otherwise make it a // blank screen (not null) if (keywords != null) { keywords = keywords.replaceAll(",", "[ ]"); } else { keywords = ""; } // Make everything lower case for ease of search String plainText = handler.toString().toLowerCase(); // Split it into <Key,Value> pairs of NGrams, with the Value 
being // the count of the NGram on the page HashMap<String, Integer> tikaParsed = IngestUtils .collectTerms(IngestUtils.createNGrams(plainText, maxNGrams)); // A counter for the final number of words Integer totalWords = 0; // A HashMap for the final NGrams HashMap<String, Integer> finalParsed = new HashMap<String, Integer>(); for (String i : tikaParsed.keySet()) { int freq = tikaParsed.get(i); totalWords += freq; // erase stop words if (stopWords != null && !stopWords.contains(i)) { finalParsed.put(i, tikaParsed.get(i)); } else if (stopWords == null) { finalParsed.put(i, tikaParsed.get(i)); } } System.out.println("Tika Parsed: " + finalParsed.keySet().size()); System.out.println("Starting"); int counter = 0; String namedURL = url + "[ ]" + title + "[ ]" + keywords; for (String row : finalParsed.keySet()) { row = row + " " + longSuffix; for (String CQ : finalParsed.keySet()) { String groupedVal = new String(); Integer wc = finalParsed.get(CQ); double freq = wc.doubleValue() / totalWords.doubleValue(); groupedVal = wc + "," + freq; Value val = new Value(IngestUtils.serialize(groupedVal)); Mutation m = new Mutation(row); m.put(namedURL, CQ, new Date().getTime(), val); writer.addMutation(m); counter++; } } System.out.println("Wrote " + counter + " Key-Value pairs to Accumulo."); writer.close(); System.out.println("Stopped writing"); } catch (AccumuloException e) { if (e.getMessage() != null) { log.error(e.getMessage()); } else { log.error(e.getStackTrace()); } return false; } catch (AccumuloSecurityException e) { if (e.getMessage() != null) { log.error(e.getMessage()); } else { log.error(e.getStackTrace()); } return false; } catch (TableNotFoundException e) { if (e.getMessage() != null) { log.error(e.getMessage()); } else { log.error(e.getStackTrace()); } return false; } catch (TableExistsException e) { if (e.getMessage() != null) { log.error(e.getMessage()); } else { log.error(e.getStackTrace()); } return false; } catch (MalformedURLException e) { if (e.getMessage() 
!= null) { log.error(e.getMessage()); } else { log.error(e.getStackTrace()); } } catch (IOException e) { if (e.getMessage() != null) { log.error(e.getMessage()); } else { log.error(e.getStackTrace()); } return false; } catch (SAXException e) { if (e.getMessage() != null) { log.error(e.getMessage()); } else { log.error(e.getStackTrace()); } return false; } catch (TikaException e) { if (e.getMessage() != null) { log.error(e.getMessage()); } else { log.error(e.getStackTrace()); } return false; } return true; }
From source file:de.u808.simpleinquest.indexer.impl.DefaultIndexer.java
/**
 * Builds a Lucene document for a file: path, MD5 of the path as id, last-modified time
 * (minute resolution) and the text content extracted by Tika.
 *
 * @param file the file to index
 * @return the document; {@code null} if the file is not readable; a document without any
 *         fields if Tika reports that the file is encrypted
 * @throws FileNotFoundException declared for callers; failures raised inside this method
 *         are wrapped in {@link IndexerException}
 * @throws IndexerException if parsing or field creation fails for any other reason
 */
public Document indexFile(File file) throws FileNotFoundException, IndexerException {
    Document document = null;
    if (file.canRead()) {
        InputStream input = null;
        try {
            document = new Document();
            input = new FileInputStream(file);
            Metadata metadata = new Metadata();
            ContentHandler handler = new BodyContentHandler();
            new AutoDetectParser().parse(input, handler, metadata);

            document.add(new Field(Indexer.PATH_FIELD_NAME, file.getPath(), Field.Store.YES,
                    Field.Index.UN_TOKENIZED));
            document.add(new Field(Indexer.ID_FIELD_NAME, MD5Util.getInstance().getMD5Hex(file.getPath()),
                    Field.Store.YES, Field.Index.UN_TOKENIZED));
            document.add(new Field(Indexer.MODIFIED_FIELD_NAME,
                    DateTools.timeToString(file.lastModified(), DateTools.Resolution.MINUTE),
                    Field.Store.YES, Field.Index.UN_TOKENIZED));
            document.add(new Field(Indexer.CONTENT_FIELD_NAME, new StringReader(handler.toString())));
        } catch (Exception e) {
            if (e instanceof TikaException) {
                Throwable t = e.getCause();
                if (t != null && t.getMessage() != null) {
                    // Encrypted documents are skipped rather than reported as errors.
                    if (t.getMessage().startsWith("Error decrypting document")) {
                        log.debug("Cant index encrypted document.");
                        return document;
                    }
                }
            }
            throw new IndexerException(e.getMessage(), e);
        } finally {
            // Release the file handle on every path, including the early return above.
            if (input != null) {
                try {
                    input.close();
                } catch (java.io.IOException closeFailure) {
                    log.debug("Cant close file: " + file.getName());
                }
            }
        }
    } else {
        log.debug("Cant read file: " + file.getName());
    }
    return document;
}
From source file:uk.ac.kcl.itemProcessors.TikaDocumentItemProcessor.java
@Override public Document process(final Document doc) throws Exception { LOG.debug("starting " + this.getClass().getSimpleName() + " on doc " + doc.getDocName()); long startTime = System.currentTimeMillis(); ContentHandler handler; if (keepTags) { handler = new ToXMLContentHandler(); } else {//from ww w . j a v a2 s . c o m handler = new BodyContentHandler(); } Metadata metadata = new Metadata(); String contentType = "TL_CONTENT_TYPE_UNKNOWN"; try (InputStream stream = new ByteArrayInputStream(doc.getBinaryContent())) { ParseContext context = new ParseContext(); context.set(TikaConfig.class, config); parser.parse(stream, handler, metadata, context); Set<String> metaKeys = new HashSet<String>(Arrays.asList(metadata.names())); extractOCRMetadata(doc, metaKeys, metadata); contentType = extractContentTypeMetadata(doc, metaKeys, metadata); extractPageCountMetadata(doc, metaKeys, metadata); addField(doc, handler.toString()); } catch (Exception ex) { addField(doc, ex.getMessage()); } long endTime = System.currentTimeMillis(); LOG.debug("{};Content-Type:{};Time:{} ms", this.getClass().getSimpleName(), contentType, endTime - startTime); LOG.debug("finished " + this.getClass().getSimpleName() + " on doc " + doc.getDocName()); return doc; }
From source file:cx.fbn.nevernote.threads.IndexRunner.java
private void indexResourceRTF(Resource r) { Data d = r.getData();//www. jav a 2s.co m for (int i = 0; i < 20 && d.getSize() == 0; i++) d = r.getData(); if (d.getSize() == 0) return; QTemporaryFile f = writeResource(d); if (!keepRunning) { return; } InputStream input; try { input = new FileInputStream(new File(f.fileName())); ContentHandler textHandler = new BodyContentHandler(-1); Metadata metadata = new Metadata(); RTFParser parser = new RTFParser(); ParseContext context = new ParseContext(); parser.parse(input, textHandler, metadata, context); String[] result = textHandler.toString().split(regex); for (int i = 0; i < result.length && keepRunning; i++) { addToIndex(r.getNoteGuid(), result[i], "RESOURCE"); } input.close(); f.close(); } catch (java.lang.ClassCastException e) { logger.log(logger.LOW, "Cast exception: " + e.getMessage()); } catch (FileNotFoundException e) { logger.log(logger.LOW, "FileNotFound exception: " + e.getMessage()); } catch (IOException e) { logger.log(logger.LOW, "IO exception: " + e.getMessage()); } catch (SAXException e) { logger.log(logger.LOW, "SAX exception: " + e.getMessage()); } catch (TikaException e) { logger.log(logger.LOW, "Tika exception: " + e.getMessage()); } catch (Exception e) { logger.log(logger.LOW, "Unknown exception: " + e.getMessage()); } catch (java.lang.NoSuchMethodError e) { logger.log(logger.LOW, "NoSuchMethod error: " + e.getMessage()); } catch (Error e) { logger.log(logger.LOW, "Unknown error: " + e.getMessage()); } }
From source file:cx.fbn.nevernote.threads.IndexRunner.java
/**
 * Extracts the text of a PDF resource with Tika, splits it with {@code regex} and adds
 * every piece to the index under the resource's note guid.
 *
 * <p>All failures are logged at LOW level and never propagated.
 *
 * @param r the resource whose data is indexed
 */
private void indexResourcePDF(Resource r) {
    Data d = r.getData();
    // Re-read the data up to 20 times while it still reports a size of 0.
    for (int i = 0; i < 20 && d.getSize() == 0; i++) {
        d = r.getData();
    }
    if (d.getSize() == 0) {
        return;
    }

    QTemporaryFile f = writeResource(d);
    InputStream input = null;
    try {
        if (!keepRunning) {
            return;
        }
        input = new FileInputStream(new File(f.fileName()));
        ContentHandler textHandler = new BodyContentHandler(-1);
        Metadata metadata = new Metadata();
        PDFParser parser = new PDFParser();
        ParseContext context = new ParseContext();
        parser.parse(input, textHandler, metadata, context);
        String[] result = textHandler.toString().split(regex);
        for (int i = 0; i < result.length && keepRunning; i++) {
            if (interrupt) {
                processInterrupt();
            }
            addToIndex(r.getNoteGuid(), result[i], "RESOURCE");
        }
    } catch (java.lang.ClassCastException e) {
        logger.log(logger.LOW, "Cast exception: " + e.getMessage());
    } catch (FileNotFoundException e) {
        logger.log(logger.LOW, "FileNotFound exception: " + e.getMessage());
    } catch (IOException e) {
        logger.log(logger.LOW, "IO exception: " + e.getMessage());
    } catch (SAXException e) {
        logger.log(logger.LOW, "SAX exception: " + e.getMessage());
    } catch (TikaException e) {
        logger.log(logger.LOW, "Tika exception: " + e.getMessage());
    } catch (Exception e) {
        logger.log(logger.LOW, "Unknown exception: " + e.getMessage());
    } catch (java.lang.NoSuchMethodError e) {
        logger.log(logger.LOW, "NoSuchMethod error: " + e.getMessage());
    } catch (Error e) {
        logger.log(logger.LOW, "Unknown error: " + e.getMessage());
    } finally {
        // Release the stream and the temporary file on every path, not only on success.
        try {
            if (input != null) {
                input.close();
            }
        } catch (IOException e) {
            logger.log(logger.LOW, "IO exception: " + e.getMessage());
        }
        try {
            if (f != null) {
                f.close();
            }
        } catch (RuntimeException e) {
            logger.log(logger.LOW, "Unknown exception: " + e.getMessage());
        }
    }
}
From source file:cx.fbn.nevernote.threads.IndexRunner.java
/**
 * Extracts the text of an OOXML resource with Tika, splits it with {@code regex} and adds
 * every piece to the index under the resource's note guid.
 *
 * <p>All failures are logged at LOW level and never propagated.
 *
 * @param r the resource whose data is indexed
 */
private void indexResourceOOXML(Resource r) {
    Data d = r.getData();
    // Re-read the data up to 20 times while it still reports a size of 0.
    for (int i = 0; i < 20 && d.getSize() == 0; i++) {
        d = r.getData();
    }
    if (d.getSize() == 0) {
        return;
    }

    QTemporaryFile f = writeResource(d);
    InputStream input = null;
    try {
        if (!keepRunning) {
            return;
        }
        input = new FileInputStream(new File(f.fileName()));
        ContentHandler textHandler = new BodyContentHandler(-1);
        Metadata metadata = new Metadata();
        OOXMLParser parser = new OOXMLParser();
        ParseContext context = new ParseContext();
        parser.parse(input, textHandler, metadata, context);
        String[] result = textHandler.toString().split(regex);
        for (int i = 0; i < result.length && keepRunning; i++) {
            if (interrupt) {
                processInterrupt();
            }
            addToIndex(r.getNoteGuid(), result[i], "RESOURCE");
        }
    } catch (java.lang.ClassCastException e) {
        logger.log(logger.LOW, "Cast exception: " + e.getMessage());
    } catch (FileNotFoundException e) {
        logger.log(logger.LOW, "FileNotFound exception: " + e.getMessage());
    } catch (IOException e) {
        logger.log(logger.LOW, "IO exception: " + e.getMessage());
    } catch (SAXException e) {
        logger.log(logger.LOW, "SAX exception: " + e.getMessage());
    } catch (TikaException e) {
        logger.log(logger.LOW, "Tika exception: " + e.getMessage());
    } catch (Exception e) {
        logger.log(logger.LOW, "Unknown exception: " + e.getMessage());
    } catch (java.lang.NoSuchMethodError e) {
        logger.log(logger.LOW, "NoSuchMethod error: " + e.getMessage());
    } catch (Error e) {
        logger.log(logger.LOW, "Unknown error: " + e.getMessage());
    } finally {
        // Release the stream and the temporary file on every path, not only on success.
        try {
            if (input != null) {
                input.close();
            }
        } catch (IOException e) {
            logger.log(logger.LOW, "IO exception: " + e.getMessage());
        }
        try {
            if (f != null) {
                f.close();
            }
        } catch (RuntimeException e) {
            logger.log(logger.LOW, "Unknown exception: " + e.getMessage());
        }
    }
}
From source file:cx.fbn.nevernote.threads.IndexRunner.java
/**
 * Extracts the text of an Office resource with Tika's {@code OfficeParser}, splits it with
 * {@code regex} and adds every piece to the index under the resource's note guid.
 *
 * <p>All failures are logged at LOW level and never propagated.
 *
 * @param r the resource whose data is indexed
 */
private void indexResourceOffice(Resource r) {
    Data d = r.getData();
    // Re-read the data up to 20 times while it still reports a size of 0.
    for (int i = 0; i < 20 && d.getSize() == 0; i++) {
        d = r.getData();
    }
    if (d.getSize() == 0) {
        return;
    }

    QTemporaryFile f = writeResource(d);
    InputStream input = null;
    try {
        if (!keepRunning) {
            return;
        }
        input = new FileInputStream(new File(f.fileName()));
        ContentHandler textHandler = new BodyContentHandler(-1);
        Metadata metadata = new Metadata();
        OfficeParser parser = new OfficeParser();
        ParseContext context = new ParseContext();
        parser.parse(input, textHandler, metadata, context);
        String[] result = textHandler.toString().split(regex);
        for (int i = 0; i < result.length && keepRunning; i++) {
            if (interrupt) {
                processInterrupt();
            }
            addToIndex(r.getNoteGuid(), result[i], "RESOURCE");
        }
    } catch (java.lang.ClassCastException e) {
        logger.log(logger.LOW, "Cast exception: " + e.getMessage());
    } catch (FileNotFoundException e) {
        logger.log(logger.LOW, "FileNotFound exception: " + e.getMessage());
    } catch (IOException e) {
        logger.log(logger.LOW, "IO exception: " + e.getMessage());
    } catch (SAXException e) {
        logger.log(logger.LOW, "SAX exception: " + e.getMessage());
    } catch (TikaException e) {
        logger.log(logger.LOW, "Tika exception: " + e.getMessage());
    } catch (Exception e) {
        logger.log(logger.LOW, "Unknown exception: " + e.getMessage());
    } catch (java.lang.NoSuchMethodError e) {
        logger.log(logger.LOW, "NoSuchMethod error: " + e.getMessage());
    } catch (Error e) {
        logger.log(logger.LOW, "Unknown error: " + e.getMessage());
    } finally {
        // Release the stream and the temporary file on every path, not only on success.
        try {
            if (input != null) {
                input.close();
            }
        } catch (IOException e) {
            logger.log(logger.LOW, "IO exception: " + e.getMessage());
        }
        try {
            if (f != null) {
                f.close();
            }
        } catch (RuntimeException e) {
            logger.log(logger.LOW, "Unknown exception: " + e.getMessage());
        }
    }
}
From source file:cx.fbn.nevernote.threads.IndexRunner.java
private void indexResourceODF(Resource r) { Data d = r.getData();//ww w . j a v a2 s. com for (int i = 0; i < 20 && d.getSize() == 0; i++) d = r.getData(); if (d.getSize() == 0) return; QTemporaryFile f = writeResource(d); if (!keepRunning) { return; } InputStream input; try { input = new FileInputStream(new File(f.fileName())); ContentHandler textHandler = new BodyContentHandler(-1); Metadata metadata = new Metadata(); OpenDocumentParser parser = new OpenDocumentParser(); ParseContext context = new ParseContext(); parser.parse(input, textHandler, metadata, context); String[] result = textHandler.toString().split(regex); for (int i = 0; i < result.length && keepRunning; i++) { if (interrupt) { processInterrupt(); } addToIndex(r.getNoteGuid(), result[i], "RESOURCE"); } input.close(); f.close(); } catch (java.lang.ClassCastException e) { logger.log(logger.LOW, "Cast exception: " + e.getMessage()); } catch (FileNotFoundException e) { logger.log(logger.LOW, "FileNotFound exception: " + e.getMessage()); } catch (IOException e) { logger.log(logger.LOW, "IO exception: " + e.getMessage()); } catch (SAXException e) { logger.log(logger.LOW, "SAX exception: " + e.getMessage()); } catch (TikaException e) { logger.log(logger.LOW, "Tika exception: " + e.getMessage()); } catch (Exception e) { logger.log(logger.LOW, "Unknown exception: " + e.getMessage()); } catch (java.lang.NoSuchMethodError e) { logger.log(logger.LOW, "NoSuchMethod error: " + e.getMessage()); } catch (Error e) { logger.log(logger.LOW, "Unknown error: " + e.getMessage()); } }
From source file:it.polito.tellmefirst.web.rest.clients.ClientEpub.java
private String autoParseAll(File file) { InputStream is = null;/*from www. ja v a 2 s .co m*/ String textBody = ""; try { InputStream input = new FileInputStream(file); ContentHandler text = new BodyContentHandler(10 * 1024 * 1024); LinkContentHandler links = new LinkContentHandler(); ContentHandler handler = new TeeContentHandler(links, text); Metadata metadata = new Metadata(); EpubParser parser2 = new EpubParser(); ParseContext context = new ParseContext(); parser2.parse(input, handler, metadata, context); textBody = text.toString().replaceAll(">[\\s]*?<", "><").toLowerCase().replaceAll("\\d+.*", ""); ; // Remove the Project Gutenberg meta information from the text textBody = textBody.split("end of the project gutenberg ebook")[0].toLowerCase(); LOG.debug("Body: " + textBody); //all text in one } catch (Exception el) { el.printStackTrace(); } finally { if (is != null) IOUtils.closeQuietly(is); } return textBody; }