Usage examples for org.xml.sax.ContentHandler.toString()
public String toString()
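The ContentHandler interface itself does not declare toString(); it inherits it from Object. The examples below work because implementations such as Apache Tika's BodyContentHandler override toString() to return the text collected during parsing. Before the real-world examples, here is a minimal self-contained sketch of that pattern; it assumes Tika (tika-core and tika-parsers) is on the classpath, and the class name and the file sample.pdf are placeholders:

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;

public class ContentHandlerToString {
    public static void main(String[] args) throws Exception {
        // BodyContentHandler accumulates the document body as plain text;
        // its toString() returns whatever has been collected so far.
        ContentHandler handler = new BodyContentHandler();
        try (InputStream stream = Files.newInputStream(Paths.get("sample.pdf"))) {
            new AutoDetectParser().parse(stream, handler, new Metadata(), new ParseContext());
        }
        System.out.println(handler.toString());
    }
}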
From source file:WriteIndex.java
/**
 * Parses each file in the "documents" directory with Tika and indexes the
 * extracted text and metadata into a Lucene index.
 *
 * @param args unused
 */
public static void main(String[] args) throws IOException {
    File docs = new File("documents");
    File indexDir = new File(INDEX_DIRECTORY);
    Directory directory = FSDirectory.open(indexDir);
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_35);
    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_35, analyzer);
    IndexWriter writer = new IndexWriter(directory, conf);
    writer.deleteAll();
    for (File file : docs.listFiles()) {
        Metadata metadata = new Metadata();
        ContentHandler handler = new BodyContentHandler();
        ParseContext context = new ParseContext();
        Parser parser = new AutoDetectParser();
        InputStream stream = new FileInputStream(file);
        try {
            parser.parse(stream, handler, metadata, context);
        } catch (TikaException e) {
            e.printStackTrace();
        } catch (SAXException e) {
            e.printStackTrace();
        } finally {
            stream.close();
        }
        // toString() on BodyContentHandler returns the extracted body text
        String text = handler.toString();
        String fileName = file.getName();
        Document doc = new Document();
        doc.add(new Field("file", fileName, Store.YES, Index.NO));
        for (String key : metadata.names()) {
            String name = key.toLowerCase();
            String value = metadata.get(key);
            if (StringUtils.isBlank(value)) {
                continue;
            }
            if ("keywords".equalsIgnoreCase(key)) {
                for (String keyword : value.split(",?(\\s+)")) {
                    doc.add(new Field(name, keyword, Store.YES, Index.NOT_ANALYZED));
                }
            } else if ("title".equalsIgnoreCase(key)) {
                doc.add(new Field(name, value, Store.YES, Index.ANALYZED));
            } else {
                // index the raw metadata value without tokenizing it
                doc.add(new Field(name, value, Store.YES, Index.NOT_ANALYZED));
            }
        }
        doc.add(new Field("text", text, Store.NO, Index.ANALYZED));
        writer.addDocument(doc);
    }
    writer.commit();
    writer.deleteUnusedFiles();
    System.out.println(writer.maxDoc() + " documents written");
    writer.close();
}
From source file:lucene_3_tika.MyFirstTika.java
public static String parseUsingComponents(String filename, TikaConfig tikaConfig, Metadata metadata)
        throws Exception {
    MimeTypes mimeRegistry = tikaConfig.getMimeRepository();

    System.out.println("Examining: [" + filename + "]");

    metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
    System.out.println("The MIME type (based on filename) is: [" + mimeRegistry.detect(null, metadata) + "]");

    InputStream stream = TikaInputStream.get(new File(filename));
    System.out.println("The MIME type (based on MAGIC) is: [" + mimeRegistry.detect(stream, metadata) + "]");

    stream = TikaInputStream.get(new File(filename));
    Detector detector = tikaConfig.getDetector();
    System.out.println("The MIME type (based on the Detector interface) is: ["
            + detector.detect(stream, metadata) + "]");

    LanguageIdentifier lang = new LanguageIdentifier(
            new LanguageProfile(FileUtils.readFileToString(new File(filename), UTF_8)));
    System.out.println("The language of this content is: [" + lang.getLanguage() + "]");

    // Get a non-detecting parser that handles all the types it can
    Parser parser = tikaConfig.getParser();

    // Tell it what we think the content is
    MediaType type = detector.detect(stream, metadata);
    metadata.set(Metadata.CONTENT_TYPE, type.toString());

    // Have the file parsed to get the content and metadata
    ContentHandler handler = new BodyContentHandler();
    parser.parse(stream, handler, metadata, new ParseContext());

    return handler.toString();
}
From source file:lucene_3_tika.MyFirstTika.java
public static String parseUsingAutoDetect(String filename, TikaConfig tikaConfig, Metadata metadata)
        throws Exception {
    System.out.println("Handling using AutoDetectParser: [" + filename + "]");

    AutoDetectParser parser = new AutoDetectParser(tikaConfig);
    ContentHandler handler = new BodyContentHandler();
    TikaInputStream stream = TikaInputStream.get(new File(filename), metadata);
    parser.parse(stream, handler, metadata, new ParseContext());
    return handler.toString();
}
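Worth noting: the TikaInputStream.get(File, Metadata) overload used here also records the file's name and length on the supplied Metadata object, giving AutoDetectParser extra hints for type detection before the content is read.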
From source file:com.sustainalytics.crawlerfilter.PDFtoText.java
public static String extractTikaText(String file) {
    InputStream is = null;
    // Create the handler before the try block so the final toString() cannot
    // hit a null reference if opening the file fails.
    ContentHandler contenthandler = new BodyContentHandler(-1);
    try {
        is = new FileInputStream(file);
        Metadata metadata = new Metadata();
        PDFParser pdfparser = new PDFParser();
        pdfparser.parse(is, contenthandler, metadata, new ParseContext());
        logger.info("PDF text extracted from " + file + "\n");
    } catch (Exception e) {
        logger.info("Error in parsing with Apache Tika parser\n");
    } finally {
        if (is != null) {
            try {
                is.close();
            } catch (IOException e) {
                logger.info("Error in closing file with Apache Tika\n");
            }
        }
    }
    return contenthandler.toString();
}
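A note on the constructor argument: BodyContentHandler's default write limit is 100,000 characters, so passing -1 as above disables the limit and lets toString() return the full extracted text even for large PDFs.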
From source file:com.sustainalytics.crawlerfilter.PDFtoTextBatch.java

The extractTikaText method in this file is identical to the PDFtoText.java example above.
From source file:com.zimbra.cs.service.FeedManager.java
@VisibleForTesting
static final String stripXML(String title) {
    if (title == null) {
        return "";
    } else if (title.indexOf('<') == -1 && title.indexOf('&') == -1) {
        return title;
    }
    org.xml.sax.XMLReader parser = new org.cyberneko.html.parsers.SAXParser();
    org.xml.sax.ContentHandler handler = new UnescapedContent();
    parser.setContentHandler(handler);
    try {
        parser.parse(new org.xml.sax.InputSource(new StringReader(title)));
        return handler.toString();
    } catch (Exception e) {
        return title;
    }
}
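Unlike the Tika examples above, this one drives NekoHTML's SAX parser with Zimbra's UnescapedContent handler. Since ContentHandler only inherits toString() from Object, the call can yield the stripped text only because UnescapedContent evidently overrides toString() to return the character data it accumulated during parsing.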
From source file:com.sustainalytics.crawlerfilter.PDFTitleGeneration.java
public static String extractTikaText(String file) {
    InputStream is = null;
    // Create the handler before the try block so the final toString() cannot
    // hit a null reference if opening the file fails.
    ContentHandler contenthandler = new BodyContentHandler();
    try {
        is = new FileInputStream(file);
        Metadata metadata = new Metadata();
        PDFParser pdfparser = new PDFParser();
        pdfparser.parse(is, contenthandler, metadata, new ParseContext());
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        if (is != null) {
            try {
                is.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return contenthandler.toString();
}
From source file:fr.paris.lutece.plugins.document.service.search.DocumentIndexer.java
/**
 * Builds a document which will be used by Lucene during the indexing of the
 * pages of the site with the following fields: summary, uid, url, contents,
 * title and description.
 *
 * @param document the document to index
 * @param strUrl the url of the document
 * @param strRole the lutece role of the page associated with the document
 * @param strPortletDocumentId the document id concatenated with the portlet id,
 *            separated by an ampersand
 * @return the built Document
 * @throws IOException the IO exception
 * @throws InterruptedException the InterruptedException
 */
public static org.apache.lucene.document.Document getDocument(Document document, String strUrl, String strRole,
        String strPortletDocumentId) throws IOException, InterruptedException {
    // make a new, empty document
    org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document();

    FieldType ft = new FieldType(StringField.TYPE_STORED);
    ft.setOmitNorms(false);

    // Add the url as a field named "url". Use an UnIndexed field, so
    // that the url is just stored with the document, but is not searchable.
    doc.add(new Field(SearchItem.FIELD_URL, strUrl, ft));

    // Add the PortletDocumentId as a field named "document_portlet_id".
    doc.add(new Field(SearchItem.FIELD_DOCUMENT_PORTLET_ID, strPortletDocumentId, ft));

    // Add the last modified date of the file as a field named "modified".
    // Use a field that is indexed (i.e. searchable), but don't tokenize
    // the field into words.
    String strDate = DateTools.dateToString(document.getDateModification(), DateTools.Resolution.DAY);
    doc.add(new Field(SearchItem.FIELD_DATE, strDate, ft));

    // Add the uid as a field, so that the index can be incrementally maintained.
    // This field is not stored with the document; it is indexed, but it is not
    // tokenized prior to indexing.
    String strIdDocument = String.valueOf(document.getId());
    doc.add(new Field(SearchItem.FIELD_UID, strIdDocument + "_" + DocumentIndexer.SHORT_NAME, ft));

    String strContentToIndex = getContentToIndex(document);
    ContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();

    try {
        new HtmlParser().parse(new ByteArrayInputStream(strContentToIndex.getBytes()), handler, metadata,
                new ParseContext());
    } catch (SAXException e) {
        throw new AppException("Error during document parsing.");
    } catch (TikaException e) {
        throw new AppException("Error during document parsing.");
    }

    // The content of the article is recovered from the parser, because the parser
    // has replaced encoded characters (such as &eacute;) with the corresponding
    // special character (such as é).
    String strContent = handler.toString();

    // Add the tag-stripped contents as a Reader-valued Text field so it will
    // get tokenized and indexed.
    doc.add(new Field(SearchItem.FIELD_CONTENTS, strContent, TextField.TYPE_NOT_STORED));

    // Add the title as a separate Text field, so that it can be searched
    // separately.
    FieldType ft2 = new FieldType(TextField.TYPE_STORED);
    ft2.setOmitNorms(true);
    doc.add(new Field(SearchItem.FIELD_TITLE, document.getTitle(), ft2));

    doc.add(new Field(SearchItem.FIELD_TYPE, document.getType(), ft));
    doc.add(new Field(SearchItem.FIELD_ROLE, strRole, ft));

    // add metadata (mapped to summary)
    doc.add(new Field(SearchItem.FIELD_METADATA, document.getSummary(), TextField.TYPE_NOT_STORED));
    doc.add(new StoredField(SearchItem.FIELD_SUMMARY, document.getSummary()));

    // return the document
    return doc;
}
From source file:fr.paris.lutece.plugins.calendar.modules.document.service.search.DocumentCalendarIndexer.java
/**
 * Builds a document which will be used by Lucene during the indexing of the
 * pages of the site with the following fields: summary, uid, url, contents,
 * title and description.
 *
 * @param document the document to index
 * @param strRole the lutece role of the page associated with the document
 * @param occurrence the event occurrence to index
 * @param strAgenda the agenda code
 * @param strOccurrenceUrl the url of the occurrence
 * @return the built Document
 * @throws IOException the IO exception
 * @throws InterruptedException the InterruptedException
 */
public static org.apache.lucene.document.Document getDocument(
        fr.paris.lutece.plugins.document.business.Document document, String strRole, Event occurrence,
        String strAgenda, String strOccurrenceUrl) throws IOException, InterruptedException {
    FieldType ft = new FieldType(StringField.TYPE_STORED);
    ft.setOmitNorms(false);

    FieldType ftNotStored = new FieldType(StringField.TYPE_STORED);
    ftNotStored.setOmitNorms(false);

    FieldType ftNo = new FieldType(StringField.TYPE_STORED);
    ftNo.setIndexed(false);
    ftNo.setTokenized(false);
    ftNo.setOmitNorms(false);

    // make a new, empty document
    org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document();

    doc.add(new Field(Constants.FIELD_CALENDAR_ID, strAgenda + "_" + CALENDAR_SHORT_NAME, ftNotStored));

    // Add the last modified date of the file as a field named "modified".
    // Use a field that is indexed (i.e. searchable), but don't tokenize
    // the field into words.
    String strDate = Utils.getDate(occurrence.getDate());
    doc.add(new Field(SearchItem.FIELD_DATE, strDate, ft));

    // Add the url as a field named "url". Use an UnIndexed field, so
    // that the url is just stored with the question/answer, but is not searchable.
    doc.add(new Field(SearchItem.FIELD_URL, strOccurrenceUrl, ft));

    // Add the uid as a field, so that the index can be incrementally maintained.
    // This field is not stored with the document; it is indexed, but it is not
    // tokenized prior to indexing.
    String strOccurrenceId = String.valueOf(occurrence.getId());
    doc.add(new Field(SearchItem.FIELD_UID, strOccurrenceId + "_" + PROPERTY_DOCUMENT_SHORT_NAME, ft));

    String strContentToIndex = getContentToIndex(document);
    ContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();

    try {
        new HtmlParser().parse(new ByteArrayInputStream(strContentToIndex.getBytes()), handler, metadata,
                new ParseContext());
    } catch (SAXException e) {
        throw new AppException("Error during page parsing.");
    } catch (TikaException e) {
        throw new AppException("Error during page parsing.");
    }

    // The content of the article is recovered from the parser, because the parser
    // has replaced encoded characters (such as &eacute;) with the corresponding
    // special character (such as é).
    StringBuilder sb = new StringBuilder(handler.toString());

    // Add the tag-stripped contents as a Reader-valued Text field so it will
    // get tokenized and indexed.
    doc.add(new Field(SearchItem.FIELD_CONTENTS, sb.toString(), TextField.TYPE_NOT_STORED));

    // Add the title as a separate Text field, so that it can be searched
    // separately.
    doc.add(new Field(SearchItem.FIELD_TITLE, document.getTitle(), ftNo));

    doc.add(new Field(SearchItem.FIELD_TYPE, CalendarPlugin.PLUGIN_NAME, TextField.TYPE_STORED));
    doc.add(new Field(SearchItem.FIELD_ROLE, strRole, ft));

    // return the document
    return doc;
}
From source file:br.ufrgs.inf.dsmoura.repository.controller.solr.SolrConversionUtil.java
private static void addFileNotNull(SolrInputDocument doc, Artifactable file) {
    if (file == null || file.getFile() == null) {
        return;
    }
    // write limit of 10 * 1024 * 1024 characters for the extracted text
    ContentHandler textHandler = new BodyContentHandler(10 * 1024 * 1024);
    Metadata metadata = new Metadata();
    ParseContext context = new ParseContext();
    InputStream input = new ByteArrayInputStream(file.getFile());
    try {
        new AutoDetectParser().parse(input, textHandler, metadata, context);
    } catch (Exception e) {
        logger.error("File parsing failed: " + file.getName(), e);
        return;
    }
    doc.addField(SolrField.ARTIFACT_TEXT.getName(), textHandler.toString());
    logger.info(SolrField.ARTIFACT_TEXT.getName() + " : " + textHandler.toString());
}
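When a parse would exceed the write limit passed to BodyContentHandler, Tika aborts with a SAXException, which the catch-all above treats as a parse failure; sizing the limit to the largest artifact you expect to index is therefore part of the design of this method.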