Usage examples for org.xml.sax.ContentHandler.toString()
public String toString()
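The ContentHandler interface itself does not declare toString(); it inherits it from Object. The examples below work because implementations such as Apache Tika's BodyContentHandler override toString() to return the text collected during parsing. Before the real-world examples, here is a minimal self-contained sketch of that pattern; it assumes Tika (tika-core and tika-parsers) is on the classpath, and the class name and the file sample.pdf are placeholders:

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;

public class ContentHandlerToString {
    public static void main(String[] args) throws Exception {
        // BodyContentHandler accumulates the document body as plain text;
        // its toString() returns whatever has been collected so far.
        ContentHandler handler = new BodyContentHandler();
        try (InputStream stream = Files.newInputStream(Paths.get("sample.pdf"))) {
            new AutoDetectParser().parse(stream, handler, new Metadata(), new ParseContext());
        }
        System.out.println(handler.toString());
    }
}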
From source file:WriteIndex.java
/**
 * Parses each file in the "documents" directory with Tika and indexes the
 * extracted text and metadata into a Lucene index.
 *
 * @param args unused
 */
public static void main(String[] args) throws IOException {
    File docs = new File("documents");
    File indexDir = new File(INDEX_DIRECTORY);
    Directory directory = FSDirectory.open(indexDir);
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_35);
    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_35, analyzer);
    IndexWriter writer = new IndexWriter(directory, conf);
    writer.deleteAll();
    for (File file : docs.listFiles()) {
        Metadata metadata = new Metadata();
        ContentHandler handler = new BodyContentHandler();
        ParseContext context = new ParseContext();
        Parser parser = new AutoDetectParser();
        InputStream stream = new FileInputStream(file);
        try {
            parser.parse(stream, handler, metadata, context);
        } catch (TikaException e) {
            e.printStackTrace();
        } catch (SAXException e) {
            e.printStackTrace();
        } finally {
            stream.close();
        }
        // toString() on BodyContentHandler returns the extracted body text
        String text = handler.toString();
        String fileName = file.getName();
        Document doc = new Document();
        doc.add(new Field("file", fileName, Store.YES, Index.NO));
        for (String key : metadata.names()) {
            String name = key.toLowerCase();
            String value = metadata.get(key);
            if (StringUtils.isBlank(value)) {
                continue;
            }
            if ("keywords".equalsIgnoreCase(key)) {
                for (String keyword : value.split(",?(\\s+)")) {
                    doc.add(new Field(name, keyword, Store.YES, Index.NOT_ANALYZED));
                }
            } else if ("title".equalsIgnoreCase(key)) {
                doc.add(new Field(name, value, Store.YES, Index.ANALYZED));
            } else {
                // index the raw metadata value without tokenizing it
                doc.add(new Field(name, value, Store.YES, Index.NOT_ANALYZED));
            }
        }
        doc.add(new Field("text", text, Store.NO, Index.ANALYZED));
        writer.addDocument(doc);
    }
    writer.commit();
    writer.deleteUnusedFiles();
    System.out.println(writer.maxDoc() + " documents written");
    writer.close();
}
From source file:lucene_3_tika.MyFirstTika.java
public static String parseUsingComponents(String filename, TikaConfig tikaConfig, Metadata metadata)
        throws Exception {
    MimeTypes mimeRegistry = tikaConfig.getMimeRepository();

    System.out.println("Examining: [" + filename + "]");

    metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
    System.out.println("The MIME type (based on filename) is: [" + mimeRegistry.detect(null, metadata) + "]");

    InputStream stream = TikaInputStream.get(new File(filename));
    System.out.println("The MIME type (based on MAGIC) is: [" + mimeRegistry.detect(stream, metadata) + "]");

    stream = TikaInputStream.get(new File(filename));
    Detector detector = tikaConfig.getDetector();
    System.out.println("The MIME type (based on the Detector interface) is: ["
            + detector.detect(stream, metadata) + "]");

    LanguageIdentifier lang = new LanguageIdentifier(
            new LanguageProfile(FileUtils.readFileToString(new File(filename), UTF_8)));
    System.out.println("The language of this content is: [" + lang.getLanguage() + "]");

    // Get a non-detecting parser that handles all the types it can
    Parser parser = tikaConfig.getParser();

    // Tell it what we think the content is
    MediaType type = detector.detect(stream, metadata);
    metadata.set(Metadata.CONTENT_TYPE, type.toString());

    // Have the file parsed to get the content and metadata
    ContentHandler handler = new BodyContentHandler();
    parser.parse(stream, handler, metadata, new ParseContext());

    return handler.toString();
}
From source file:lucene_3_tika.MyFirstTika.java
public static String parseUsingAutoDetect(String filename, TikaConfig tikaConfig, Metadata metadata)
        throws Exception {
    System.out.println("Handling using AutoDetectParser: [" + filename + "]");

    AutoDetectParser parser = new AutoDetectParser(tikaConfig);
    ContentHandler handler = new BodyContentHandler();
    TikaInputStream stream = TikaInputStream.get(new File(filename), metadata);
    parser.parse(stream, handler, metadata, new ParseContext());
    return handler.toString();
}
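Worth noting: the TikaInputStream.get(File, Metadata) overload used here also records the file's name and length on the supplied Metadata object, giving AutoDetectParser extra hints for type detection before the content is read.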
From source file:com.sustainalytics.crawlerfilter.PDFtoText.java
public static String extractTikaText(String file) {
    InputStream is = null;
    // Create the handler before the try block so the final toString() cannot
    // hit a null reference if opening the file fails.
    ContentHandler contenthandler = new BodyContentHandler(-1);
    try {
        is = new FileInputStream(file);
        Metadata metadata = new Metadata();
        PDFParser pdfparser = new PDFParser();
        pdfparser.parse(is, contenthandler, metadata, new ParseContext());
        logger.info("PDF text extracted from " + file + "\n");
    } catch (Exception e) {
        logger.info("Error in parsing with Apache Tika parser\n");
    } finally {
        if (is != null) {
            try {
                is.close();
            } catch (IOException e) {
                logger.info("Error in closing file with Apache Tika\n");
            }
        }
    }
    return contenthandler.toString();
}
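A note on the constructor argument: BodyContentHandler's default write limit is 100,000 characters, so passing -1 as above disables the limit and lets toString() return the full extracted text even for large PDFs.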
From source file:com.sustainalytics.crawlerfilter.PDFtoTextBatch.java

The extractTikaText method in this file is identical to the PDFtoText.java example above.
From source file:com.zimbra.cs.service.FeedManager.java
@VisibleForTesting
static final String stripXML(String title) {
    if (title == null) {
        return "";
    } else if (title.indexOf('<') == -1 && title.indexOf('&') == -1) {
        return title;
    }
    org.xml.sax.XMLReader parser = new org.cyberneko.html.parsers.SAXParser();
    org.xml.sax.ContentHandler handler = new UnescapedContent();
    parser.setContentHandler(handler);
    try {
        parser.parse(new org.xml.sax.InputSource(new StringReader(title)));
        return handler.toString();
    } catch (Exception e) {
        return title;
    }
}
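Unlike the Tika examples above, this one drives NekoHTML's SAX parser with Zimbra's UnescapedContent handler. Since ContentHandler only inherits toString() from Object, the call can yield the stripped text only because UnescapedContent evidently overrides toString() to return the character data it accumulated during parsing.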
From source file:com.sustainalytics.crawlerfilter.PDFTitleGeneration.java
public static String extractTikaText(String file) {
    InputStream is = null;
    // Create the handler before the try block so the final toString() cannot
    // hit a null reference if opening the file fails.
    ContentHandler contenthandler = new BodyContentHandler();
    try {
        is = new FileInputStream(file);
        Metadata metadata = new Metadata();
        PDFParser pdfparser = new PDFParser();
        pdfparser.parse(is, contenthandler, metadata, new ParseContext());
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        if (is != null) {
            try {
                is.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return contenthandler.toString();
}
From source file:fr.paris.lutece.plugins.document.service.search.DocumentIndexer.java
/**
 * Builds a document which will be used by Lucene during the indexing of the
 * pages of the site with the following fields: summary, uid, url, contents,
 * title and description.
 *
 * @param document the document to index
 * @param strUrl the url of the document
 * @param strRole the lutece role of the page associated with the document
 * @param strPortletDocumentId the document id concatenated with the portlet id,
 *            separated by an ampersand
 * @return the built Document
 * @throws IOException the IO exception
 * @throws InterruptedException the InterruptedException
 */
public static org.apache.lucene.document.Document getDocument(Document document, String strUrl, String strRole,
        String strPortletDocumentId) throws IOException, InterruptedException {
    // make a new, empty document
    org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document();

    FieldType ft = new FieldType(StringField.TYPE_STORED);
    ft.setOmitNorms(false);

    // Add the url as a field named "url". Use an UnIndexed field, so
    // that the url is just stored with the document, but is not searchable.
    doc.add(new Field(SearchItem.FIELD_URL, strUrl, ft));

    // Add the PortletDocumentId as a field named "document_portlet_id".
    doc.add(new Field(SearchItem.FIELD_DOCUMENT_PORTLET_ID, strPortletDocumentId, ft));

    // Add the last modified date of the file as a field named "modified".
    // Use a field that is indexed (i.e. searchable), but don't tokenize
    // the field into words.
    String strDate = DateTools.dateToString(document.getDateModification(), DateTools.Resolution.DAY);
    doc.add(new Field(SearchItem.FIELD_DATE, strDate, ft));

    // Add the uid as a field, so that the index can be incrementally maintained.
    // This field is not stored with the document; it is indexed, but it is not
    // tokenized prior to indexing.
    String strIdDocument = String.valueOf(document.getId());
    doc.add(new Field(SearchItem.FIELD_UID, strIdDocument + "_" + DocumentIndexer.SHORT_NAME, ft));

    String strContentToIndex = getContentToIndex(document);
    ContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();

    try {
        new HtmlParser().parse(new ByteArrayInputStream(strContentToIndex.getBytes()), handler, metadata,
                new ParseContext());
    } catch (SAXException e) {
        throw new AppException("Error during document parsing.");
    } catch (TikaException e) {
        throw new AppException("Error during document parsing.");
    }

    // The content of the article is recovered from the parser, because the parser
    // has replaced encoded characters (such as &eacute;) with the corresponding
    // special character (such as é).
    String strContent = handler.toString();

    // Add the tag-stripped contents as a Reader-valued Text field so it will
    // get tokenized and indexed.
    doc.add(new Field(SearchItem.FIELD_CONTENTS, strContent, TextField.TYPE_NOT_STORED));

    // Add the title as a separate Text field, so that it can be searched
    // separately.
    FieldType ft2 = new FieldType(TextField.TYPE_STORED);
    ft2.setOmitNorms(true);
    doc.add(new Field(SearchItem.FIELD_TITLE, document.getTitle(), ft2));

    doc.add(new Field(SearchItem.FIELD_TYPE, document.getType(), ft));
    doc.add(new Field(SearchItem.FIELD_ROLE, strRole, ft));

    // add metadata (mapped to summary)
    doc.add(new Field(SearchItem.FIELD_METADATA, document.getSummary(), TextField.TYPE_NOT_STORED));
    doc.add(new StoredField(SearchItem.FIELD_SUMMARY, document.getSummary()));

    // return the document
    return doc;
}
From source file:fr.paris.lutece.plugins.calendar.modules.document.service.search.DocumentCalendarIndexer.java
/**
 * Builds a document which will be used by Lucene during the indexing of the
 * pages of the site with the following fields: summary, uid, url, contents,
 * title and description.
 *
 * @param document the document to index
 * @param strRole the lutece role of the page associated with the document
 * @param occurrence the event occurrence to index
 * @param strAgenda the agenda code
 * @param strOccurrenceUrl the url of the occurrence
 * @return the built Document
 * @throws IOException the IO exception
 * @throws InterruptedException the InterruptedException
 */
public static org.apache.lucene.document.Document getDocument(
        fr.paris.lutece.plugins.document.business.Document document, String strRole, Event occurrence,
        String strAgenda, String strOccurrenceUrl) throws IOException, InterruptedException {
    FieldType ft = new FieldType(StringField.TYPE_STORED);
    ft.setOmitNorms(false);

    FieldType ftNotStored = new FieldType(StringField.TYPE_STORED);
    ftNotStored.setOmitNorms(false);

    FieldType ftNo = new FieldType(StringField.TYPE_STORED);
    ftNo.setIndexed(false);
    ftNo.setTokenized(false);
    ftNo.setOmitNorms(false);

    // make a new, empty document
    org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document();

    doc.add(new Field(Constants.FIELD_CALENDAR_ID, strAgenda + "_" + CALENDAR_SHORT_NAME, ftNotStored));

    // Add the last modified date of the file as a field named "modified".
    // Use a field that is indexed (i.e. searchable), but don't tokenize
    // the field into words.
    String strDate = Utils.getDate(occurrence.getDate());
    doc.add(new Field(SearchItem.FIELD_DATE, strDate, ft));

    // Add the url as a field named "url". Use an UnIndexed field, so
    // that the url is just stored with the question/answer, but is not searchable.
    doc.add(new Field(SearchItem.FIELD_URL, strOccurrenceUrl, ft));

    // Add the uid as a field, so that the index can be incrementally maintained.
    // This field is not stored with the document; it is indexed, but it is not
    // tokenized prior to indexing.
    String strOccurrenceId = String.valueOf(occurrence.getId());
    doc.add(new Field(SearchItem.FIELD_UID, strOccurrenceId + "_" + PROPERTY_DOCUMENT_SHORT_NAME, ft));

    String strContentToIndex = getContentToIndex(document);
    ContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();

    try {
        new HtmlParser().parse(new ByteArrayInputStream(strContentToIndex.getBytes()), handler, metadata,
                new ParseContext());
    } catch (SAXException e) {
        throw new AppException("Error during page parsing.");
    } catch (TikaException e) {
        throw new AppException("Error during page parsing.");
    }

    // The content of the article is recovered from the parser, because the parser
    // has replaced encoded characters (such as &eacute;) with the corresponding
    // special character (such as é).
    StringBuilder sb = new StringBuilder(handler.toString());

    // Add the tag-stripped contents as a Reader-valued Text field so it will
    // get tokenized and indexed.
    doc.add(new Field(SearchItem.FIELD_CONTENTS, sb.toString(), TextField.TYPE_NOT_STORED));

    // Add the title as a separate Text field, so that it can be searched
    // separately.
    doc.add(new Field(SearchItem.FIELD_TITLE, document.getTitle(), ftNo));

    doc.add(new Field(SearchItem.FIELD_TYPE, CalendarPlugin.PLUGIN_NAME, TextField.TYPE_STORED));
    doc.add(new Field(SearchItem.FIELD_ROLE, strRole, ft));

    // return the document
    return doc;
}
From source file:br.ufrgs.inf.dsmoura.repository.controller.solr.SolrConversionUtil.java
private static void addFileNotNull(SolrInputDocument doc, Artifactable file) {
    if (file == null || file.getFile() == null) {
        return;
    }
    // write limit of 10 * 1024 * 1024 characters for the extracted text
    ContentHandler textHandler = new BodyContentHandler(10 * 1024 * 1024);
    Metadata metadata = new Metadata();
    ParseContext context = new ParseContext();
    InputStream input = new ByteArrayInputStream(file.getFile());
    try {
        new AutoDetectParser().parse(input, textHandler, metadata, context);
    } catch (Exception e) {
        logger.error("File parsing failed: " + file.getName(), e);
        return;
    }
    doc.addField(SolrField.ARTIFACT_TEXT.getName(), textHandler.toString());
    logger.info(SolrField.ARTIFACT_TEXT.getName() + " : " + textHandler.toString());
}
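When a parse would exceed the write limit passed to BodyContentHandler, Tika aborts with a SAXException, which the catch-all above treats as a parse failure; sizing the limit to the largest artifact you expect to index is therefore part of the design of this method.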