Usage examples for org.apache.poi.hpsf.SummaryInformation.getSubject()
public String getSubject()
From source file:com.duroty.lucene.parser.utils.POIFSListener.java
License:Apache License
/** * DOCUMENT ME!//from ww w .ja va2 s .c o m * * @param arg0 DOCUMENT ME! */ public void processPOIFSReaderEvent(POIFSReaderEvent readerEvent) { org.apache.poi.hpsf.PropertySet propertySet; try { propertySet = PropertySetFactory.create(readerEvent.getStream()); SummaryInformation info = (SummaryInformation) propertySet; this.author = info.getAuthor(); this.title = info.getTitle(); this.keywords = info.getKeywords(); this.subject = info.getSubject(); } catch (NoPropertySetStreamException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (MarkUnsupportedException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (UnexpectedPropertySetTypeException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } }
From source file:com.jaeksoft.searchlib.parser.DocParser.java
License:Open Source License
private void currentWordExtraction(ParserResultItem result, InputStream inputStream) throws IOException { WordExtractor word = null;//www .j av a 2s . c o m try { word = new WordExtractor(inputStream); SummaryInformation info = word.getSummaryInformation(); if (info != null) { result.addField(ParserFieldEnum.title, info.getTitle()); result.addField(ParserFieldEnum.author, info.getAuthor()); result.addField(ParserFieldEnum.subject, info.getSubject()); } String[] paragraphes = word.getParagraphText(); for (String paragraph : paragraphes) { String[] frags = paragraph.split("\\n"); for (String frag : frags) result.addField(ParserFieldEnum.content, StringUtils.replaceConsecutiveSpaces(frag, " ")); } } finally { IOUtils.close(word); } }
From source file:com.jaeksoft.searchlib.parser.DocParser.java
License:Open Source License
/**
 * Reads a document through {@code Word6Extractor} and records its title,
 * author and subject (when summary information is available) followed by
 * its text in {@code result}.
 *
 * <p>The full text is split on newline characters; every fragment is
 * passed through {@code StringUtils.replaceConsecutiveSpaces} with a
 * single-space argument and added as a separate content field.
 *
 * @param result item receiving the extracted fields
 * @param inputStream stream handed to the {@code Word6Extractor} constructor
 * @throws IOException declared for failures raised while extracting
 */
private void oldWordExtraction(ParserResultItem result, InputStream inputStream) throws IOException {
    Word6Extractor extractor = null;
    try {
        extractor = new Word6Extractor(inputStream);

        final SummaryInformation summary = extractor.getSummaryInformation();
        if (summary != null) {
            result.addField(ParserFieldEnum.title, summary.getTitle());
            result.addField(ParserFieldEnum.author, summary.getAuthor());
            result.addField(ParserFieldEnum.subject, summary.getSubject());
        }

        final String[] lines = extractor.getText().split("\\n");
        for (int i = 0; i < lines.length; i++) {
            String normalized = StringUtils.replaceConsecutiveSpaces(lines[i], " ");
            result.addField(ParserFieldEnum.content, normalized);
        }
    } finally {
        // Runs on every exit path; extractor is still null if construction failed.
        IOUtils.close(extractor);
    }
}
From source file:com.jaeksoft.searchlib.parser.PublisherParser.java
License:Open Source License
@Override protected void parseContent(StreamLimiter streamLimiter, LanguageEnum lang) throws IOException { PublisherTextExtractor extractor = null; try {//from ww w . j a va2 s . c o m extractor = new PublisherTextExtractor(streamLimiter.getNewInputStream()); SummaryInformation info = extractor.getSummaryInformation(); ParserResultItem result = getNewParserResultItem(); if (info != null) { result.addField(ParserFieldEnum.title, info.getTitle()); result.addField(ParserFieldEnum.author, info.getAuthor()); result.addField(ParserFieldEnum.subject, info.getSubject()); } result.addField(ParserFieldEnum.content, StringUtils.replaceConsecutiveSpaces(extractor.getText(), " ")); result.langDetection(10000, ParserFieldEnum.content); } finally { IOUtils.close(extractor); } }
From source file:com.jaeksoft.searchlib.parser.VisioParser.java
License:Open Source License
/**
 * Extracts summary metadata and text through {@code VisioTextExtractor}
 * into a new parser result item.
 *
 * <p>When the extractor returns no text array, only the metadata fields
 * are recorded and language detection is skipped.
 *
 * @param streamLimiter supplies the input stream for the extractor
 * @param lang not read by this implementation
 * @throws IOException declared for failures raised while extracting
 */
@Override
protected void parseContent(StreamLimiter streamLimiter, LanguageEnum lang) throws IOException {
    VisioTextExtractor visio = null;
    try {
        visio = new VisioTextExtractor(streamLimiter.getNewInputStream());
        final SummaryInformation summary = visio.getSummaryInformation();
        final ParserResultItem item = getNewParserResultItem();

        if (summary != null) {
            item.addField(ParserFieldEnum.title, summary.getTitle());
            item.addField(ParserFieldEnum.author, summary.getAuthor());
            item.addField(ParserFieldEnum.subject, summary.getSubject());
        }

        final String[] allText = visio.getAllText();
        if (allText != null) {
            for (int i = 0; i < allText.length; i++) {
                item.addField(ParserFieldEnum.content, StringUtils.replaceConsecutiveSpaces(allText[i], " "));
            }
            item.langDetection(10000, ParserFieldEnum.content);
        }
    } finally {
        // Runs on every exit path; visio is still null if construction failed.
        IOUtils.close(visio);
    }
}
From source file:com.jaeksoft.searchlib.parser.XlsParser.java
License:Open Source License
@Override protected void parseContent(StreamLimiter streamLimiter, LanguageEnum lang) throws IOException { HSSFWorkbook workbook = new HSSFWorkbook(streamLimiter.getNewInputStream()); ExcelExtractor excel = null;//from w ww .ja va2s . co m try { excel = new ExcelExtractor(workbook); ParserResultItem result = getNewParserResultItem(); SummaryInformation info = excel.getSummaryInformation(); if (info != null) { result.addField(ParserFieldEnum.title, info.getTitle()); result.addField(ParserFieldEnum.author, info.getAuthor()); result.addField(ParserFieldEnum.subject, info.getSubject()); } String content = excel.getText(); result.addField(ParserFieldEnum.content, StringUtils.replaceConsecutiveSpaces(content, " ")); result.langDetection(10000, ParserFieldEnum.content); } finally { IOUtils.close(excel); } }
From source file:com.openkm.util.metadata.MetadataExtractor.java
License:Open Source License
/** * Extract metadata from Office Word/*www . ja va 2s. c o m*/ */ public static OfficeMetadata officeExtractor(InputStream is, String mimeType) throws IOException { POIFSFileSystem fs = new POIFSFileSystem(is); OfficeMetadata md = new OfficeMetadata(); SummaryInformation si = null; if (MimeTypeConfig.MIME_MS_WORD.equals(mimeType)) { si = new WordExtractor(fs).getSummaryInformation(); } else if (MimeTypeConfig.MIME_MS_EXCEL.equals(mimeType)) { si = new ExcelExtractor(fs).getSummaryInformation(); } else if (MimeTypeConfig.MIME_MS_POWERPOINT.equals(mimeType)) { si = new PowerPointExtractor(fs).getSummaryInformation(); } if (si != null) { md.setTitle(si.getTitle()); md.setSubject(si.getSubject()); md.setAuthor(si.getAuthor()); md.setLastAuthor(si.getLastAuthor()); md.setKeywords(si.getKeywords()); md.setComments(si.getComments()); md.setTemplate(si.getTemplate()); md.setRevNumber(si.getRevNumber()); md.setApplicationName(si.getApplicationName()); md.setEditTime(si.getEditTime()); md.setPageCount(si.getPageCount()); md.setWordCount(si.getWordCount()); md.setCharCount(si.getCharCount()); md.setSecurity(si.getSecurity()); Calendar createDateTime = Calendar.getInstance(); createDateTime.setTime(si.getCreateDateTime()); md.setCreateDateTime(createDateTime); Calendar lastSaveDateTime = Calendar.getInstance(); lastSaveDateTime.setTime(si.getLastSaveDateTime()); md.setLastSaveDateTime(lastSaveDateTime); Calendar lastPrinted = Calendar.getInstance(); lastPrinted.setTime(si.getLastPrinted()); md.setLastPrinted(lastPrinted); } log.info("officeExtractor: {}", md); return md; }
From source file:com.opensearchserver.extractor.parser.Doc.java
License:Apache License
private void currentWordExtraction(InputStream inputStream) throws IOException { WordExtractor word = null;// ww w . java 2 s.co m try { word = new WordExtractor(inputStream); SummaryInformation info = word.getSummaryInformation(); if (info != null) { metas.add(TITLE, info.getTitle()); metas.add(AUTHOR, info.getAuthor()); metas.add(SUBJECT, info.getSubject()); metas.add(CREATION_DATE, info.getCreateDateTime()); metas.add(MODIFICATION_DATE, info.getLastSaveDateTime()); metas.add(KEYWORDS, info.getKeywords()); } ParserDocument document = getNewParserDocument(); String[] paragraphes = word.getParagraphText(); for (String paragraph : paragraphes) document.add(CONTENT, paragraph); document.add(LANG_DETECTION, languageDetection(CONTENT, 10000)); } finally { IOUtils.closeQuietly(word); } }
From source file:com.opensearchserver.extractor.parser.Doc.java
License:Apache License
private void oldWordExtraction(InputStream inputStream) throws IOException { Word6Extractor word6 = null;//from ww w .j a v a2s . c o m try { word6 = new Word6Extractor(inputStream); SummaryInformation si = word6.getSummaryInformation(); if (si != null) { metas.add(TITLE, si.getTitle()); metas.add(AUTHOR, si.getAuthor()); metas.add(SUBJECT, si.getSubject()); } ParserDocument document = getNewParserDocument(); @SuppressWarnings("deprecation") String[] paragraphes = word6.getParagraphText(); for (String paragraph : paragraphes) document.add(CONTENT, paragraph); document.add(LANG_DETECTION, languageDetection(CONTENT, 10000)); } finally { IOUtils.closeQuietly(word6); } }
From source file:com.opensearchserver.extractor.parser.Publisher.java
License:Apache License
@Override protected void parseContent(InputStream inputStream, String extension, String mimeType) throws Exception { PublisherTextExtractor extractor = null; try {//w ww . j a va 2 s . c o m extractor = new PublisherTextExtractor(inputStream); SummaryInformation info = extractor.getSummaryInformation(); if (info != null) { metas.add(TITLE, info.getTitle()); metas.add(AUTHOR, info.getAuthor()); metas.add(SUBJECT, info.getSubject()); metas.add(CREATION_DATE, info.getCreateDateTime()); metas.add(MODIFICATION_DATE, info.getLastSaveDateTime()); metas.add(CONTENT, info.getKeywords()); metas.add(COMMENTS, info.getComments()); } String text = extractor.getText(); if (StringUtils.isEmpty(text)) return; ParserDocument result = getNewParserDocument(); result.add(CONTENT, text); result.add(LANG_DETECTION, languageDetection(CONTENT, 10000)); } finally { if (extractor != null) IOUtils.closeQuietly(extractor); } }