Example usage for org.xml.sax ContentHandler toString

List of usage examples for org.xml.sax ContentHandler toString

Introduction

On this page you can find usage examples for org.xml.sax.ContentHandler.toString().

Prototype

public String toString() 

Document

Returns a string representation of the object.
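
Below is a minimal sketch of the pattern shared by the examples on this page, assuming Apache Tika is on the classpath; the class name and the file path "sample.pdf" are placeholders, not taken from the examples. A file is parsed into a BodyContentHandler, and calling toString() on the handler returns the extracted plain text.

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;

public class ContentHandlerToStringExample {
    public static void main(String[] args) throws Exception {
        // BodyContentHandler accumulates the body text reported by the parser
        ContentHandler handler = new BodyContentHandler();
        // "sample.pdf" is a placeholder path
        try (InputStream stream = Files.newInputStream(Paths.get("sample.pdf"))) {
            new AutoDetectParser().parse(stream, handler, new Metadata(), new ParseContext());
        }
        // toString() returns the plain text collected during parsing
        System.out.println(handler.toString());
    }
}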

Usage

From source file:WriteIndex.java

/**
 * @param args
 */
public static void main(String[] args) throws IOException {

    File docs = new File("documents");
    File indexDir = new File(INDEX_DIRECTORY);

    Directory directory = FSDirectory.open(indexDir);

    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_35);
    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_35, analyzer);
    IndexWriter writer = new IndexWriter(directory, conf);
    writer.deleteAll();

    for (File file : docs.listFiles()) {
        Metadata metadata = new Metadata();
        ContentHandler handler = new BodyContentHandler();
        ParseContext context = new ParseContext();
        Parser parser = new AutoDetectParser();
        InputStream stream = new FileInputStream(file);
        try {
            parser.parse(stream, handler, metadata, context);
        } catch (TikaException e) {
            e.printStackTrace();
        } catch (SAXException e) {
            e.printStackTrace();
        } finally {
            stream.close();
        }

        // toString() returns the plain text that the BodyContentHandler collected during parsing
        String text = handler.toString();
        String fileName = file.getName();

        Document doc = new Document();
        doc.add(new Field("file", fileName, Store.YES, Index.NO));

        for (String key : metadata.names()) {
            String name = key.toLowerCase();
            String value = metadata.get(key);

            if (StringUtils.isBlank(value)) {
                continue;
            }

            if ("keywords".equalsIgnoreCase(key)) {
                for (String keyword : value.split(",?(\\s+)")) {
                    doc.add(new Field(name, keyword, Store.YES, Index.NOT_ANALYZED));
                }
            } else if ("title".equalsIgnoreCase(key)) {
                doc.add(new Field(name, value, Store.YES, Index.ANALYZED));
            } else {
                doc.add(new Field(name, value, Store.YES, Index.NOT_ANALYZED));
            }
        }
        doc.add(new Field("text", text, Store.NO, Index.ANALYZED));
        writer.addDocument(doc);

    }

    writer.commit();
    writer.deleteUnusedFiles();

    System.out.println(writer.maxDoc() + " documents written");

    writer.close();
}

From source file:lucene_3_tika.MyFirstTika.java

public static String parseUsingComponents(String filename, TikaConfig tikaConfig, Metadata metadata)
        throws Exception {
    MimeTypes mimeRegistry = tikaConfig.getMimeRepository();

    System.out.println("Examining: [" + filename + "]");

    metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
    System.out.println("The MIME type (based on filename) is: [" + mimeRegistry.detect(null, metadata) + "]");

    InputStream stream = TikaInputStream.get(new File(filename));
    System.out.println("The MIME type (based on MAGIC) is: [" + mimeRegistry.detect(stream, metadata) + "]");

    stream = TikaInputStream.get(new File(filename));
    Detector detector = tikaConfig.getDetector();
    System.out.println(
            "The MIME type (based on the Detector interface) is: [" + detector.detect(stream, metadata) + "]");

    LanguageIdentifier lang = new LanguageIdentifier(
            new LanguageProfile(FileUtils.readFileToString(new File(filename), UTF_8)));

    System.out.println("The language of this content is: [" + lang.getLanguage() + "]");

    // Get a non-detecting parser that handles all the types it can
    Parser parser = tikaConfig.getParser();
    // Tell it what we think the content is
    MediaType type = detector.detect(stream, metadata);
    metadata.set(Metadata.CONTENT_TYPE, type.toString());
    // Have the file parsed to get the content and metadata
    ContentHandler handler = new BodyContentHandler();
    parser.parse(stream, handler, metadata, new ParseContext());

    return handler.toString();
}

From source file:lucene_3_tika.MyFirstTika.java

public static String parseUsingAutoDetect(String filename, TikaConfig tikaConfig, Metadata metadata)
        throws Exception {
    System.out.println("Handling using AutoDetectParser: [" + filename + "]");

    AutoDetectParser parser = new AutoDetectParser(tikaConfig);
    ContentHandler handler = new BodyContentHandler();
    TikaInputStream stream = TikaInputStream.get(new File(filename), metadata);
    parser.parse(stream, handler, metadata, new ParseContext());
    return handler.toString();
}

From source file:com.sustainalytics.crawlerfilter.PDFtoText.java

public static String extractTikaText(String file) {
    InputStream is = null;
    ContentHandler contenthandler = null;
    try {
        // create the handler before opening the stream so the final toString() call never sees a null handler
        contenthandler = new BodyContentHandler(-1);
        is = new FileInputStream(file);
        Metadata metadata = new Metadata();
        PDFParser pdfparser = new PDFParser();
        pdfparser.parse(is, contenthandler, metadata, new ParseContext());
        logger.info("PDF text extracted from " + file + "\n");

    } catch (Exception e) {
        logger.info("Error in parsing with Apache Tika parser\n");
    } finally {
        if (is != null)
            try {
                is.close();
            } catch (IOException e) {
                logger.info("Error in closing file with Apache Tika\n");
            }
    }
    return contenthandler.toString();

}

From source file:com.sustainalytics.crawlerfilter.PDFtoTextBatch.java

public static String extractTikaText(String file) {
    InputStream is = null;
    ContentHandler contenthandler = null;
    try {
        contenthandler = new BodyContentHandler(-1);
        is = new FileInputStream(file);
        Metadata metadata = new Metadata();
        PDFParser pdfparser = new PDFParser();
        pdfparser.parse(is, contenthandler, metadata, new ParseContext());
        logger.info("PDF text extracted from " + file + "\n");

    } catch (Exception e) {
        logger.info("Error in parsing with Apache Tika parser\n");
    } finally {
        if (is != null)
            try {
                is.close();
            } catch (IOException e) {
                logger.info("Error in closing file with Apache Tika\n");
            }
    }
    return contenthandler.toString();
}

From source file:com.zimbra.cs.service.FeedManager.java

@VisibleForTesting
static final String stripXML(String title) {
    if (title == null) {
        return "";
    } else if (title.indexOf('<') == -1 && title.indexOf('&') == -1) {
        return title;
    }

    org.xml.sax.XMLReader parser = new org.cyberneko.html.parsers.SAXParser();
    org.xml.sax.ContentHandler handler = new UnescapedContent();
    parser.setContentHandler(handler);
    try {
        parser.parse(new org.xml.sax.InputSource(new StringReader(title)));
        return handler.toString();
    } catch (Exception e) {
        return title;
    }
}

From source file:com.sustainalytics.crawlerfilter.PDFTitleGeneration.java

public static String extractTikaText(String file) {
    InputStream is = null;
    ContentHandler contenthandler = null;
    try {
        contenthandler = new BodyContentHandler();
        is = new FileInputStream(file);
        Metadata metadata = new Metadata();
        PDFParser pdfparser = new PDFParser();
        pdfparser.parse(is, contenthandler, metadata, new ParseContext());

    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        if (is != null)
            try {
                is.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
    }
    return contenthandler.toString();
}

From source file:fr.paris.lutece.plugins.document.service.search.DocumentIndexer.java

/**
 * Builds a document which will be used by Lucene during the indexing of the
 * pages of the site with the following
 * fields : summary, uid, url, contents, title and description.
 *
 * @param document the document to index
 * @param strUrl the url of the document
 * @param strRole the lutece role of the page associated with the document
 * @param strPortletDocumentId the document id concatenated to the portlet id
 *            with a & in the middle
 * @return the built Document
 * @throws IOException The IO Exception
 * @throws InterruptedException The InterruptedException
 */
public static org.apache.lucene.document.Document getDocument(Document document, String strUrl, String strRole,
        String strPortletDocumentId) throws IOException, InterruptedException {
    // make a new, empty document
    org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document();

    FieldType ft = new FieldType(StringField.TYPE_STORED);
    ft.setOmitNorms(false);

    // Add the url as a field named "url".  Use an UnIndexed field, so
    // that the url is just stored with the document, but is not searchable.
    doc.add(new Field(SearchItem.FIELD_URL, strUrl, ft));

    // Add the PortletDocumentId as a field named "document_portlet_id".  
    doc.add(new Field(SearchItem.FIELD_DOCUMENT_PORTLET_ID, strPortletDocumentId, ft));

    // Add the last modified date of the file a field named "modified".
    // Use a field that is indexed (i.e. searchable), but don't tokenize
    // the field into words.
    String strDate = DateTools.dateToString(document.getDateModification(), DateTools.Resolution.DAY);
    doc.add(new Field(SearchItem.FIELD_DATE, strDate, ft));

    // Add the uid as a field, so that index can be incrementally maintained.
    // This field is not stored with document, it is indexed, but it is not
    // tokenized prior to indexing.
    String strIdDocument = String.valueOf(document.getId());
    doc.add(new Field(SearchItem.FIELD_UID, strIdDocument + "_" + DocumentIndexer.SHORT_NAME, ft));

    String strContentToIndex = getContentToIndex(document);
    ContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();

    try {
        new HtmlParser().parse(new ByteArrayInputStream(strContentToIndex.getBytes()), handler, metadata,
                new ParseContext());
    } catch (SAXException e) {
        throw new AppException("Error during document parsing.");
    } catch (TikaException e) {
        throw new AppException("Error during document parsing.");
    }

    // The content of the article is recovered from the parser, which has replaced
    // encoded characters (such as &eacute;) with the corresponding special characters.
    String strContent = handler.toString();

    // Add the tag-stripped contents as a Reader-valued Text field so it will
    // get tokenized and indexed.
    doc.add(new Field(SearchItem.FIELD_CONTENTS, strContent, TextField.TYPE_NOT_STORED));

    // Add the title as a separate Text field, so that it can be searched
    // separately.
    FieldType ft2 = new FieldType(TextField.TYPE_STORED);
    ft2.setOmitNorms(true);
    doc.add(new Field(SearchItem.FIELD_TITLE, document.getTitle(), ft2));

    doc.add(new Field(SearchItem.FIELD_TYPE, document.getType(), ft));

    doc.add(new Field(SearchItem.FIELD_ROLE, strRole, ft));

    // add metadata (mapped to summary)
    doc.add(new Field(SearchItem.FIELD_METADATA, document.getSummary(), TextField.TYPE_NOT_STORED));
    doc.add(new StoredField(SearchItem.FIELD_SUMMARY, document.getSummary()));

    // return the document
    return doc;
}

From source file:fr.paris.lutece.plugins.calendar.modules.document.service.search.DocumentCalendarIndexer.java

/**
 * Builds a document which will be used by Lucene during the indexing of the
 * pages of the site with the following
 * fields : summary, uid, url, contents, title and description.
 *
 * @param document the document to index
 * @param strRole the lutece role of the page associated with the document
 * @param occurrence the event occurrence to index
 * @param strAgenda the code of the agenda containing the occurrence
 * @param strOccurrenceUrl the url of the occurrence
 * @return the built Document
 * @throws IOException The IO Exception
 * @throws InterruptedException The InterruptedException
 */
public static org.apache.lucene.document.Document getDocument(
        fr.paris.lutece.plugins.document.business.Document document, String strRole, Event occurrence,
        String strAgenda, String strOccurrenceUrl) throws IOException, InterruptedException {
    FieldType ft = new FieldType(StringField.TYPE_STORED);
    ft.setOmitNorms(false);

    FieldType ftNotStored = new FieldType(StringField.TYPE_STORED);
    ftNotStored.setOmitNorms(false);

    FieldType ftNo = new FieldType(StringField.TYPE_STORED);
    ftNo.setIndexed(false);
    ftNo.setTokenized(false);
    ftNo.setOmitNorms(false);

    // make a new, empty document
    org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document();

    doc.add(new Field(Constants.FIELD_CALENDAR_ID, strAgenda + "_" + CALENDAR_SHORT_NAME, ftNotStored));

    // Add the last modified date of the file a field named "modified".
    // Use a field that is indexed (i.e. searchable), but don't tokenize
    // the field into words.
    String strDate = Utils.getDate(occurrence.getDate());
    doc.add(new Field(SearchItem.FIELD_DATE, strDate, ft));

    // Add the url as a field named "url".  Use an UnIndexed field, so
    // that the url is just stored with the document, but is not searchable.
    doc.add(new Field(SearchItem.FIELD_URL, strOccurrenceUrl, ft));

    // Add the uid as a field, so that index can be incrementally maintained.
    // This field is not stored with document, it is indexed, but it is not
    // tokenized prior to indexing.
    String strOccurrenceId = String.valueOf(occurrence.getId());
    doc.add(new Field(SearchItem.FIELD_UID, strOccurrenceId + "_" + PROPERTY_DOCUMENT_SHORT_NAME, ft));

    String strContentToIndex = getContentToIndex(document);
    ContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    try {
        new HtmlParser().parse(new ByteArrayInputStream(strContentToIndex.getBytes()), handler, metadata,
                new ParseContext());
    } catch (SAXException e) {
        throw new AppException("Error during page parsing.");
    } catch (TikaException e) {
        throw new AppException("Error during page parsing.");
    }

    // The content of the article is recovered from the parser, which has replaced
    // encoded characters (such as &eacute;) with the corresponding special characters.
    StringBuilder sb = new StringBuilder(handler.toString());

    // Add the tag-stripped contents as a Reader-valued Text field so it will
    // get tokenized and indexed.
    doc.add(new Field(SearchItem.FIELD_CONTENTS, sb.toString(), TextField.TYPE_NOT_STORED));

    // Add the title as a separate Text field, so that it can be searched
    // separately.
    doc.add(new Field(SearchItem.FIELD_TITLE, document.getTitle(), ftNo));

    doc.add(new Field(SearchItem.FIELD_TYPE, CalendarPlugin.PLUGIN_NAME, TextField.TYPE_STORED));

    doc.add(new Field(SearchItem.FIELD_ROLE, strRole, ft));

    // return the document
    return doc;
}

From source file:br.ufrgs.inf.dsmoura.repository.controller.solr.SolrConversionUtil.java

private static void addFileNotNull(SolrInputDocument doc, Artifactable file) {
    if (file == null || file.getFile() == null) {
        return;
    }
    ContentHandler textHandler = new BodyContentHandler(10 * 1024 * 1024);
    Metadata metadata = new Metadata();
    ParseContext context = new ParseContext();

    InputStream input = new ByteArrayInputStream(file.getFile());
    try {
        new AutoDetectParser().parse(input, textHandler, metadata, context);
    } catch (Exception e) {
        logger.error(("File parsing failed: " + file.getName()), e);
        return;
    }
    doc.addField(SolrField.ARTIFACT_TEXT.getName(), textHandler.toString());
    logger.info(SolrField.ARTIFACT_TEXT.getName() + " : " + textHandler.toString());
}