List of usage examples for the org.apache.poi.hpbf.extractor.PublisherTextExtractor constructor
public PublisherTextExtractor(InputStream is) throws IOException
From source file:com.jaeksoft.searchlib.parser.PublisherParser.java
License:Open Source License
@Override protected void parseContent(StreamLimiter streamLimiter, LanguageEnum lang) throws IOException { PublisherTextExtractor extractor = null; try {//from w w w . ja v a 2 s .c om extractor = new PublisherTextExtractor(streamLimiter.getNewInputStream()); SummaryInformation info = extractor.getSummaryInformation(); ParserResultItem result = getNewParserResultItem(); if (info != null) { result.addField(ParserFieldEnum.title, info.getTitle()); result.addField(ParserFieldEnum.author, info.getAuthor()); result.addField(ParserFieldEnum.subject, info.getSubject()); } result.addField(ParserFieldEnum.content, StringUtils.replaceConsecutiveSpaces(extractor.getText(), " ")); result.langDetection(10000, ParserFieldEnum.content); } finally { IOUtils.close(extractor); } }
From source file:com.opensearchserver.extractor.parser.Publisher.java
License:Apache License
@Override protected void parseContent(InputStream inputStream, String extension, String mimeType) throws Exception { PublisherTextExtractor extractor = null; try {//from w ww.ja v a 2s . co m extractor = new PublisherTextExtractor(inputStream); SummaryInformation info = extractor.getSummaryInformation(); if (info != null) { metas.add(TITLE, info.getTitle()); metas.add(AUTHOR, info.getAuthor()); metas.add(SUBJECT, info.getSubject()); metas.add(CREATION_DATE, info.getCreateDateTime()); metas.add(MODIFICATION_DATE, info.getLastSaveDateTime()); metas.add(CONTENT, info.getKeywords()); metas.add(COMMENTS, info.getComments()); } String text = extractor.getText(); if (StringUtils.isEmpty(text)) return; ParserDocument result = getNewParserDocument(); result.add(CONTENT, text); result.add(LANG_DETECTION, languageDetection(CONTENT, 10000)); } finally { if (extractor != null) IOUtils.closeQuietly(extractor); } }
From source file:com.qwazr.library.poi.PublisherParser.java
License:Apache License
/**
 * Extracts metadata and text from a Publisher document into the result builder.
 *
 * <p>The MIME type is always recorded on the metas builder. Summary fields are
 * added when summary information is present, and a new document carrying the
 * text plus a language-detection entry is created only for non-empty text.
 * The extractor is closed by the try-with-resources statement.
 *
 * @param parameters    not read by this method
 * @param inputStream   stream passed to the {@code PublisherTextExtractor}
 * @param extension     forwarded to {@code findMimeType}
 * @param mimeType      forwarded to {@code findMimeType}
 * @param resultBuilder receives the metas and, if any text exists, the document
 * @throws Exception propagated from the extractor or the helper calls
 */
@Override
public void parseContent(final MultivaluedMap<String, String> parameters, final InputStream inputStream,
        final String extension, final String mimeType, final ParserResultBuilder resultBuilder)
        throws Exception {
    try (final PublisherTextExtractor publisherExtractor = new PublisherTextExtractor(inputStream)) {

        final ParserFieldsBuilder metaFields = resultBuilder.metas();
        metaFields.set(MIME_TYPE, findMimeType(extension, mimeType, this::findMimeTypeUsingDefault));

        final SummaryInformation summary = publisherExtractor.getSummaryInformation();
        if (summary != null) {
            metaFields.add(TITLE, summary.getTitle());
            metaFields.add(AUTHOR, summary.getAuthor());
            metaFields.add(SUBJECT, summary.getSubject());
            metaFields.add(CREATION_DATE, summary.getCreateDateTime());
            metaFields.add(MODIFICATION_DATE, summary.getLastSaveDateTime());
            metaFields.add(CONTENT, summary.getKeywords());
            metaFields.add(COMMENTS, summary.getComments());
        }

        final String body = publisherExtractor.getText();
        if (!StringUtils.isEmpty(body)) {
            final ParserFieldsBuilder document = resultBuilder.newDocument();
            document.add(CONTENT, body);
            document.add(LANG_DETECTION, languageDetection(document, CONTENT, 10000));
        }
    }
}
From source file:mj.ocraptor.extraction.tika.parser.microsoft.OfficeParser.java
License:Apache License
/**
 * Parses an OLE2 directory tree by dispatching on its detected document type.
 *
 * <p>Summary entries are parsed first so that metadata is populated before any
 * content handling. Types without a content extractor here (SolidWorks files,
 * Project) contribute metadata only. Encrypted documents are decrypted and the
 * resulting stream is delegated to an {@code OOXMLParser}.
 *
 * @param root     directory node of the document to parse
 * @param context  parse context; consulted for a {@code Locale} and a
 *                 {@code PasswordProvider}
 * @param metadata metadata object filled from the summaries and the type
 * @param xhtml    handler receiving the extracted content
 * @throws IOException   propagated from the underlying extractors
 * @throws SAXException  propagated from the content handler
 * @throws TikaException propagated from the extractors; an
 *                       {@code EncryptedDocumentException} is thrown when the
 *                       password is rejected or decryption fails
 */
protected void parse(DirectoryNode root, ParseContext context, Metadata metadata, XHTMLContentHandler xhtml)
        throws IOException, SAXException, TikaException {

    // Parse summary entries first, to make metadata available early
    new SummaryExtractor(metadata).parseSummaries(root);

    // Parse remaining document entries
    POIFSDocumentType documentType = POIFSDocumentType.detectType(root);
    if (documentType != POIFSDocumentType.UNKNOWN) {
        setType(metadata, documentType.getType());
    }

    switch (documentType) {
    case SOLIDWORKS_PART:
    case SOLIDWORKS_ASSEMBLY:
    case SOLIDWORKS_DRAWING:
    case PROJECT:
        // No content extraction for these types; only the metadata
        // gathered above is available.
        break;

    case PUBLISHER: {
        PublisherTextExtractor publisherExtractor = new PublisherTextExtractor(root);
        xhtml.element("p", publisherExtractor.getText());
        break;
    }

    case WORDDOCUMENT:
        new WordExtractor(context, metadata).parse(root, xhtml);
        break;

    case POWERPOINT:
        new HSLFExtractor(context, metadata).parse(root, xhtml);
        break;

    case WORKBOOK:
    case XLR: {
        Locale targetLocale = context.get(Locale.class, Locale.getDefault());
        new ExcelExtractor(context, metadata).parse(root, xhtml, targetLocale);
        break;
    }

    case VISIO: {
        VisioTextExtractor visioExtractor = new VisioTextExtractor(root);
        for (String paragraph : visioExtractor.getAllText()) {
            xhtml.element("p", paragraph);
        }
        break;
    }

    case OUTLOOK: {
        OutlookExtractor outlookExtractor = new OutlookExtractor(root, context);
        outlookExtractor.parse(xhtml, metadata);
        break;
    }

    case ENCRYPTED: {
        EncryptionInfo encryptionInfo = new EncryptionInfo(root);
        Decryptor decryptor = Decryptor.getInstance(encryptionInfo);
        try {
            // Use the password from a supplied PasswordProvider when it
            // yields one; otherwise stick with the default Office password.
            PasswordProvider provider = context.get(PasswordProvider.class);
            String supplied = (provider == null) ? null : provider.getPassword(metadata);
            String password = (supplied == null) ? Decryptor.DEFAULT_PASSWORD : supplied;

            // Check if we've the right password or not
            if (!decryptor.verifyPassword(password)) {
                throw new EncryptedDocumentException();
            }

            // Decrypt the OLE2 stream, and delegate the resulting OOXML
            // file to the regular OOXML parser for normal handling
            OOXMLParser ooxmlParser = new OOXMLParser();
            ooxmlParser.parse(decryptor.getDataStream(root),
                    new EmbeddedContentHandler(new BodyContentHandler(xhtml)), metadata, context);
        } catch (GeneralSecurityException ex) {
            throw new EncryptedDocumentException(ex);
        }
        break;
    }
    }
}
From source file:org.apache.tika.parser.microsoft.OfficeParser.java
License:Apache License
protected void parse(DirectoryNode root, ParseContext context, Metadata metadata, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException { // Parse summary entries first, to make metadata available early new SummaryExtractor(metadata).parseSummaries(root); // Parse remaining document entries POIFSDocumentType type = POIFSDocumentType.detectType(root); if (type != POIFSDocumentType.UNKNOWN) { setType(metadata, type.getType()); }/*from w w w. j ava 2s . co m*/ switch (type) { case SOLIDWORKS_PART: case SOLIDWORKS_ASSEMBLY: case SOLIDWORKS_DRAWING: break; case PUBLISHER: PublisherTextExtractor publisherTextExtractor = new PublisherTextExtractor(root); xhtml.element("p", publisherTextExtractor.getText()); break; case WORDDOCUMENT: new WordExtractor(context).parse(root, xhtml); break; case POWERPOINT: new HSLFExtractor(context).parse(root, xhtml); break; case WORKBOOK: case XLR: Locale locale = context.get(Locale.class, Locale.getDefault()); new ExcelExtractor(context, metadata).parse(root, xhtml, locale); break; case PROJECT: // We currently can't do anything beyond the metadata break; case VISIO: VisioTextExtractor visioTextExtractor = new VisioTextExtractor(root); for (String text : visioTextExtractor.getAllText()) { xhtml.element("p", text); } break; case OUTLOOK: OutlookExtractor extractor = new OutlookExtractor(root, context); extractor.parse(xhtml, metadata); break; case ENCRYPTED: EncryptionInfo info = new EncryptionInfo(root); Decryptor d = Decryptor.getInstance(info); try { // By default, use the default Office Password String password = Decryptor.DEFAULT_PASSWORD; // If they supplied a Password Provider, ask that for the password, // and use the provider given one if available (stick with default if not) PasswordProvider passwordProvider = context.get(PasswordProvider.class); if (passwordProvider != null) { String suppliedPassword = passwordProvider.getPassword(metadata); if (suppliedPassword != null) { password = suppliedPassword; } } // 
Check if we've the right password or not if (!d.verifyPassword(password)) { throw new EncryptedDocumentException(); } // Decrypt the OLE2 stream, and delegate the resulting OOXML // file to the regular OOXML parser for normal handling OOXMLParser parser = new OOXMLParser(); parser.parse(d.getDataStream(root), new EmbeddedContentHandler(new BodyContentHandler(xhtml)), metadata, context); } catch (GeneralSecurityException ex) { throw new EncryptedDocumentException(ex); } default: // For unsupported / unhandled types, just the metadata // is extracted, which happened above break; } }
From source file:org.codelibs.fess.crawler.extractor.impl.MsPublisherExtractor.java
License:Apache License
@Override public ExtractData getText(final InputStream in, final Map<String, String> params) { if (in == null) { throw new CrawlerSystemException("The inputstream is null."); }//from ww w . j a v a2 s.c o m try { return new ExtractData(new PublisherTextExtractor(in).getText()); } catch (final IOException e) { throw new ExtractException(e); } }
From source file:org.codelibs.robot.extractor.impl.MsPublisherExtractor.java
License:Apache License
@Override public ExtractData getText(final InputStream in, final Map<String, String> params) { if (in == null) { throw new RobotSystemException("The inputstream is null."); }// w w w. ja v a2s. c om try { return new ExtractData(new PublisherTextExtractor(in).getText()); } catch (final IOException e) { throw new ExtractException(e); } }
From source file:org.terrier.indexing.POIDocument.java
License:Mozilla Public License
protected POITextExtractor getExtractor(String filename, InputStream docStream) throws IOException { //Word .doc: if (filename.endsWith(".doc")) { return new WordExtractor(docStream); }//from w w w . j a v a2 s . c o m //Word .docx: if (filename.endsWith(".docx")) { return new XWPFWordExtractor(new XWPFDocument(docStream)); } //Powertpoint .ppt: if (filename.endsWith(".ppt")) { return new PowerPointExtractor(docStream); } //Powertpoint .pptx: if (filename.endsWith(".pptx")) { return new XSLFPowerPointExtractor(new XMLSlideShow(docStream)); } //Publisher .pub: if (filename.endsWith(".pub")) { return new PublisherTextExtractor(docStream); } //Excel: .xls: if (filename.endsWith(".xls")) { return new ExcelExtractor(new POIFSFileSystem(docStream)); } //Excel: .xlsx: if (filename.endsWith(".xlsx")) { return new org.apache.poi.xssf.extractor.XSSFExcelExtractor(new XSSFWorkbook(docStream)); } //Visio: .vsd: if (filename.endsWith(".vsd")) { return new VisioTextExtractor(docStream); } return null; }