Example usage for org.apache.poi.hpsf SummaryInformation getSubject

List of usage examples for org.apache.poi.hpsf SummaryInformation getSubject

Introduction

On this page you can find example usages of org.apache.poi.hpsf.SummaryInformation.getSubject().

Prototype

public String getSubject() 

Source Link

Document

Returns the subject (or null).

Usage

From source file:com.duroty.lucene.parser.utils.POIFSListener.java

License:Apache License

/**
 * Reads author, title, keywords and subject from the property set carried by
 * the given reader event and stores them in the matching fields of this
 * listener.
 *
 * <p>If the stream yields a property set that is not a
 * {@link SummaryInformation}, the fields are left untouched. The checked
 * exceptions raised while creating the property set are printed with
 * {@code printStackTrace()} and are not rethrown.
 *
 * @param readerEvent the event whose stream is parsed as a property set
 */
public void processPOIFSReaderEvent(POIFSReaderEvent readerEvent) {
    try {
        org.apache.poi.hpsf.PropertySet propertySet = PropertySetFactory.create(readerEvent.getStream());

        // The factory is not limited to SummaryInformation; casting blindly
        // would fail with a ClassCastException for any other property set.
        if (!(propertySet instanceof SummaryInformation)) {
            return;
        }

        SummaryInformation info = (SummaryInformation) propertySet;
        this.author = info.getAuthor();
        this.title = info.getTitle();
        this.keywords = info.getKeywords();
        this.subject = info.getSubject();
    } catch (NoPropertySetStreamException e) {
        // Best effort: report the failure and keep the current field values.
        e.printStackTrace();
    } catch (MarkUnsupportedException e) {
        // Best effort: report the failure and keep the current field values.
        e.printStackTrace();
    } catch (UnexpectedPropertySetTypeException e) {
        // Best effort: report the failure and keep the current field values.
        e.printStackTrace();
    } catch (IOException e) {
        // Best effort: report the failure and keep the current field values.
        e.printStackTrace();
    }
}

From source file:com.jaeksoft.searchlib.parser.DocParser.java

License:Open Source License

/**
 * Fills {@code result} from a document opened with a {@link WordExtractor}.
 *
 * <p>When summary information is available, its title, author and subject
 * are added. Each paragraph is then split on line feeds and every fragment is
 * added as content after going through
 * {@code StringUtils.replaceConsecutiveSpaces} with a single-space
 * replacement. The extractor is handed to {@code IOUtils.close} in all cases.
 *
 * @param result      the item receiving the extracted fields
 * @param inputStream the stream the extractor is built from
 * @throws IOException declared for failures raised during extraction
 */
private void currentWordExtraction(ParserResultItem result, InputStream inputStream) throws IOException {
    WordExtractor extractor = null;
    try {
        extractor = new WordExtractor(inputStream);

        final SummaryInformation summary = extractor.getSummaryInformation();
        if (summary != null) {
            result.addField(ParserFieldEnum.title, summary.getTitle());
            result.addField(ParserFieldEnum.author, summary.getAuthor());
            result.addField(ParserFieldEnum.subject, summary.getSubject());
        }

        for (String paragraph : extractor.getParagraphText()) {
            for (String line : paragraph.split("\\n")) {
                final String normalized = StringUtils.replaceConsecutiveSpaces(line, " ");
                result.addField(ParserFieldEnum.content, normalized);
            }
        }
    } finally {
        IOUtils.close(extractor);
    }
}

From source file:com.jaeksoft.searchlib.parser.DocParser.java

License:Open Source License

/**
 * Fills {@code result} from a document opened with a {@link Word6Extractor}.
 *
 * <p>When summary information is available, its title, author and subject
 * are added. The extractor's full text is split on line feeds and every
 * fragment is added as content after going through
 * {@code StringUtils.replaceConsecutiveSpaces} with a single-space
 * replacement. The extractor is handed to {@code IOUtils.close} in all cases.
 *
 * @param result      the item receiving the extracted fields
 * @param inputStream the stream the extractor is built from
 * @throws IOException declared for failures raised during extraction
 */
private void oldWordExtraction(ParserResultItem result, InputStream inputStream) throws IOException {
    Word6Extractor extractor = null;
    try {
        extractor = new Word6Extractor(inputStream);

        final SummaryInformation summary = extractor.getSummaryInformation();
        if (summary != null) {
            result.addField(ParserFieldEnum.title, summary.getTitle());
            result.addField(ParserFieldEnum.author, summary.getAuthor());
            result.addField(ParserFieldEnum.subject, summary.getSubject());
        }

        for (String line : extractor.getText().split("\\n")) {
            final String normalized = StringUtils.replaceConsecutiveSpaces(line, " ");
            result.addField(ParserFieldEnum.content, normalized);
        }
    } finally {
        IOUtils.close(extractor);
    }
}

From source file:com.jaeksoft.searchlib.parser.PublisherParser.java

License:Open Source License

/**
 * Parses the stream with a {@link PublisherTextExtractor} and records the
 * outcome in a new parser result item.
 *
 * <p>Title, author and subject are added when summary information exists.
 * The extractor's text is added as content after going through
 * {@code StringUtils.replaceConsecutiveSpaces} with a single-space
 * replacement, and language detection is then requested on the content field.
 *
 * @param streamLimiter supplies the input stream to parse
 * @param lang          not read by this implementation
 * @throws IOException declared for failures raised during extraction
 */
@Override
protected void parseContent(StreamLimiter streamLimiter, LanguageEnum lang) throws IOException {
    PublisherTextExtractor publisher = null;
    try {
        publisher = new PublisherTextExtractor(streamLimiter.getNewInputStream());

        final SummaryInformation summary = publisher.getSummaryInformation();
        final ParserResultItem item = getNewParserResultItem();
        if (summary != null) {
            item.addField(ParserFieldEnum.title, summary.getTitle());
            item.addField(ParserFieldEnum.author, summary.getAuthor());
            item.addField(ParserFieldEnum.subject, summary.getSubject());
        }

        final String normalizedText = StringUtils.replaceConsecutiveSpaces(publisher.getText(), " ");
        item.addField(ParserFieldEnum.content, normalizedText);
        item.langDetection(10000, ParserFieldEnum.content);
    } finally {
        IOUtils.close(publisher);
    }
}

From source file:com.jaeksoft.searchlib.parser.VisioParser.java

License:Open Source License

/**
 * Parses the stream with a {@link VisioTextExtractor} and records the outcome
 * in a new parser result item.
 *
 * <p>Title, author and subject are added when summary information exists.
 * If the extractor returns a non-null text array, each entry is added as
 * content after going through {@code StringUtils.replaceConsecutiveSpaces}
 * with a single-space replacement, and language detection is requested on the
 * content field. With a null array neither step happens.
 *
 * @param streamLimiter supplies the input stream to parse
 * @param lang          not read by this implementation
 * @throws IOException declared for failures raised during extraction
 */
@Override
protected void parseContent(StreamLimiter streamLimiter, LanguageEnum lang) throws IOException {
    VisioTextExtractor visio = null;
    try {
        visio = new VisioTextExtractor(streamLimiter.getNewInputStream());

        final SummaryInformation summary = visio.getSummaryInformation();
        final ParserResultItem item = getNewParserResultItem();
        if (summary != null) {
            item.addField(ParserFieldEnum.title, summary.getTitle());
            item.addField(ParserFieldEnum.author, summary.getAuthor());
            item.addField(ParserFieldEnum.subject, summary.getSubject());
        }

        final String[] allText = visio.getAllText();
        if (allText != null) {
            for (String chunk : allText) {
                item.addField(ParserFieldEnum.content, StringUtils.replaceConsecutiveSpaces(chunk, " "));
            }
            item.langDetection(10000, ParserFieldEnum.content);
        }
    } finally {
        IOUtils.close(visio);
    }
}

From source file:com.jaeksoft.searchlib.parser.XlsParser.java

License:Open Source License

/**
 * Loads the stream as an {@link HSSFWorkbook}, runs an {@link ExcelExtractor}
 * over it and records the outcome in a new parser result item.
 *
 * <p>Title, author and subject are added when summary information exists.
 * The extractor's text is added as content after going through
 * {@code StringUtils.replaceConsecutiveSpaces} with a single-space
 * replacement, and language detection is then requested on the content field.
 *
 * @param streamLimiter supplies the input stream to parse
 * @param lang          not read by this implementation
 * @throws IOException declared for failures raised while loading or extracting
 */
@Override
protected void parseContent(StreamLimiter streamLimiter, LanguageEnum lang) throws IOException {
    final HSSFWorkbook book = new HSSFWorkbook(streamLimiter.getNewInputStream());
    ExcelExtractor extractor = null;
    try {
        extractor = new ExcelExtractor(book);
        final ParserResultItem item = getNewParserResultItem();

        final SummaryInformation summary = extractor.getSummaryInformation();
        if (summary != null) {
            item.addField(ParserFieldEnum.title, summary.getTitle());
            item.addField(ParserFieldEnum.author, summary.getAuthor());
            item.addField(ParserFieldEnum.subject, summary.getSubject());
        }

        final String normalizedText = StringUtils.replaceConsecutiveSpaces(extractor.getText(), " ");
        item.addField(ParserFieldEnum.content, normalizedText);
        item.langDetection(10000, ParserFieldEnum.content);
    } finally {
        IOUtils.close(extractor);
    }
}

From source file:com.openkm.util.metadata.MetadataExtractor.java

License:Open Source License

/**
 * Extract metadata from Office Word/*www . ja va 2s.  c o m*/
 */
public static OfficeMetadata officeExtractor(InputStream is, String mimeType) throws IOException {
    POIFSFileSystem fs = new POIFSFileSystem(is);
    OfficeMetadata md = new OfficeMetadata();
    SummaryInformation si = null;

    if (MimeTypeConfig.MIME_MS_WORD.equals(mimeType)) {
        si = new WordExtractor(fs).getSummaryInformation();
    } else if (MimeTypeConfig.MIME_MS_EXCEL.equals(mimeType)) {
        si = new ExcelExtractor(fs).getSummaryInformation();
    } else if (MimeTypeConfig.MIME_MS_POWERPOINT.equals(mimeType)) {
        si = new PowerPointExtractor(fs).getSummaryInformation();
    }

    if (si != null) {
        md.setTitle(si.getTitle());
        md.setSubject(si.getSubject());
        md.setAuthor(si.getAuthor());
        md.setLastAuthor(si.getLastAuthor());
        md.setKeywords(si.getKeywords());
        md.setComments(si.getComments());
        md.setTemplate(si.getTemplate());
        md.setRevNumber(si.getRevNumber());
        md.setApplicationName(si.getApplicationName());
        md.setEditTime(si.getEditTime());
        md.setPageCount(si.getPageCount());
        md.setWordCount(si.getWordCount());
        md.setCharCount(si.getCharCount());
        md.setSecurity(si.getSecurity());

        Calendar createDateTime = Calendar.getInstance();
        createDateTime.setTime(si.getCreateDateTime());
        md.setCreateDateTime(createDateTime);

        Calendar lastSaveDateTime = Calendar.getInstance();
        lastSaveDateTime.setTime(si.getLastSaveDateTime());
        md.setLastSaveDateTime(lastSaveDateTime);

        Calendar lastPrinted = Calendar.getInstance();
        lastPrinted.setTime(si.getLastPrinted());
        md.setLastPrinted(lastPrinted);
    }

    log.info("officeExtractor: {}", md);
    return md;
}

From source file:com.opensearchserver.extractor.parser.Doc.java

License:Apache License

/**
 * Extracts metadata and paragraphs from a document opened with a
 * {@link WordExtractor}.
 *
 * <p>When summary information is available, title, author, subject, creation
 * date, modification date and keywords are added to {@code metas}. A new
 * parser document then receives every paragraph as content, followed by the
 * result of language detection on the content field. The extractor is closed
 * quietly in all cases.
 *
 * @param inputStream the stream the extractor is built from
 * @throws IOException declared for failures raised during extraction
 */
private void currentWordExtraction(InputStream inputStream) throws IOException {
    WordExtractor extractor = null;
    try {
        extractor = new WordExtractor(inputStream);

        final SummaryInformation summary = extractor.getSummaryInformation();
        if (summary != null) {
            metas.add(TITLE, summary.getTitle());
            metas.add(AUTHOR, summary.getAuthor());
            metas.add(SUBJECT, summary.getSubject());
            metas.add(CREATION_DATE, summary.getCreateDateTime());
            metas.add(MODIFICATION_DATE, summary.getLastSaveDateTime());
            metas.add(KEYWORDS, summary.getKeywords());
        }

        final ParserDocument parsed = getNewParserDocument();
        for (String paragraph : extractor.getParagraphText()) {
            parsed.add(CONTENT, paragraph);
        }
        parsed.add(LANG_DETECTION, languageDetection(CONTENT, 10000));
    } finally {
        IOUtils.closeQuietly(extractor);
    }
}

From source file:com.opensearchserver.extractor.parser.Doc.java

License:Apache License

/**
 * Extracts metadata and paragraphs from a document opened with a
 * {@link Word6Extractor}.
 *
 * <p>When summary information is available, title, author and subject are
 * added to {@code metas}. A new parser document then receives every paragraph
 * as content, followed by the result of language detection on the content
 * field. The extractor is closed quietly in all cases.
 *
 * @param inputStream the stream the extractor is built from
 * @throws IOException declared for failures raised during extraction
 */
private void oldWordExtraction(InputStream inputStream) throws IOException {
    Word6Extractor extractor = null;
    try {
        extractor = new Word6Extractor(inputStream);

        final SummaryInformation summary = extractor.getSummaryInformation();
        if (summary != null) {
            metas.add(TITLE, summary.getTitle());
            metas.add(AUTHOR, summary.getAuthor());
            metas.add(SUBJECT, summary.getSubject());
        }

        final ParserDocument parsed = getNewParserDocument();
        // The paragraph accessor is deprecated on this extractor; the warning
        // is suppressed on this declaration only.
        @SuppressWarnings("deprecation")
        final String[] paragraphs = extractor.getParagraphText();
        for (String paragraph : paragraphs) {
            parsed.add(CONTENT, paragraph);
        }
        parsed.add(LANG_DETECTION, languageDetection(CONTENT, 10000));
    } finally {
        IOUtils.closeQuietly(extractor);
    }
}

From source file:com.opensearchserver.extractor.parser.Publisher.java

License:Apache License

/**
 * Parses the stream with a {@link PublisherTextExtractor}.
 *
 * <p>When summary information is available, title, author, subject, creation
 * date, modification date, keywords and comments are added to {@code metas}.
 * If the extracted text is not empty, a new parser document receives it as
 * content together with the result of language detection on the content
 * field. The extractor, when it was created, is closed quietly in all cases.
 *
 * @param inputStream the stream the extractor is built from
 * @param extension   not read by this implementation
 * @param mimeType    not read by this implementation
 * @throws Exception declared for failures raised during extraction
 */
@Override
protected void parseContent(InputStream inputStream, String extension, String mimeType) throws Exception {
    PublisherTextExtractor publisher = null;
    try {
        publisher = new PublisherTextExtractor(inputStream);

        final SummaryInformation summary = publisher.getSummaryInformation();
        if (summary != null) {
            metas.add(TITLE, summary.getTitle());
            metas.add(AUTHOR, summary.getAuthor());
            metas.add(SUBJECT, summary.getSubject());
            metas.add(CREATION_DATE, summary.getCreateDateTime());
            metas.add(MODIFICATION_DATE, summary.getLastSaveDateTime());
            // NOTE(review): keywords are stored under CONTENT here; this may
            // be intended to be a dedicated keywords field - confirm.
            metas.add(CONTENT, summary.getKeywords());
            metas.add(COMMENTS, summary.getComments());
        }

        final String body = publisher.getText();
        if (!StringUtils.isEmpty(body)) {
            final ParserDocument parsed = getNewParserDocument();
            parsed.add(CONTENT, body);
            parsed.add(LANG_DETECTION, languageDetection(CONTENT, 10000));
        }
    } finally {
        if (publisher != null) {
            IOUtils.closeQuietly(publisher);
        }
    }
}