List of usage examples for org.apache.poi.xwpf.extractor XWPFWordExtractor getText
public String getText()
From source file:com.opensearchserver.extractor.parser.Docx.java
License:Apache License
@Override protected void parseContent(InputStream inputStream, String extension, String mimeType) throws IOException { XWPFDocument document = new XWPFDocument(inputStream); XWPFWordExtractor word = null; try {//from www. j a v a 2 s .c o m word = new XWPFWordExtractor(document); CoreProperties info = word.getCoreProperties(); if (info != null) { metas.add(TITLE, info.getTitle()); metas.add(CREATOR, info.getCreator()); metas.add(CREATION_DATE, info.getCreated()); metas.add(MODIFICATION_DATE, info.getModified()); metas.add(SUBJECT, info.getSubject()); metas.add(DESCRIPTION, info.getDescription()); metas.add(KEYWORDS, info.getKeywords()); } ParserDocument parserDocument = getNewParserDocument(); parserDocument.add(CONTENT, word.getText()); parserDocument.add(LANG_DETECTION, languageDetection(CONTENT, 10000)); } finally { IOUtils.closeQuietly(word); } }
From source file:com.opensearchserver.textextractor.parser.Docx.java
License:Apache License
@Override protected void parseContent(InputStream inputStream) throws IOException { XWPFDocument document = new XWPFDocument(inputStream); XWPFWordExtractor word = null; try {//from ww w .ja va 2 s. com word = new XWPFWordExtractor(document); CoreProperties info = word.getCoreProperties(); if (info != null) { metas.add(TITLE, info.getTitle()); metas.add(CREATOR, info.getCreator()); metas.add(CREATION_DATE, info.getCreated()); metas.add(MODIFICATION_DATE, info.getModified()); metas.add(SUBJECT, info.getSubject()); metas.add(DESCRIPTION, info.getDescription()); metas.add(KEYWORDS, info.getKeywords()); } ParserDocument parserDocument = getNewParserDocument(); parserDocument.add(CONTENT, word.getText()); parserDocument.add(LANG_DETECTION, languageDetection(CONTENT, 10000)); } finally { IOUtils.closeQuietly(word); } }
From source file:com.swg.parse.docx.MSDocConvTest.java
/**
 * Extracts the text of the test document {@code SCD_2009_E-04 2009.08.21.docx}, resolved
 * against {@code path}, using Apache POI.
 *
 * @return the text content POI extracts from the .docx file
 * @throws FileNotFoundException if the document cannot be opened
 * @throws IOException if the document cannot be read or the stream cannot be closed
 */
private String getPOI() throws FileNotFoundException, IOException {
    // try-with-resources releases the file handle even when extraction fails;
    // the previous version never closed the stream.
    try (FileInputStream docxStream = new FileInputStream(path + "SCD_2009_E-04 2009.08.21.docx")) {
        XWPFDocument document = new XWPFDocument(docxStream);
        XWPFWordExtractor extractor = new XWPFWordExtractor(document);
        return extractor.getText();
    }
}
From source file:com.swg.parse.docx.MSDocConvTest2.java
/*** * @return String containing the content of the .docx file from POI apache * @throws FileNotFoundException/*from w w w.j av a 2 s . c o m*/ * @throws IOException */ private String getPOI() throws FileNotFoundException, IOException { FileInputStream inputTest = new FileInputStream(path + "CAD_2013_RE-06.docx"); XWPFDocument docxTest = new XWPFDocument(inputTest); XWPFWordExtractor ContentTest = new XWPFWordExtractor(docxTest); String contentIn = ContentTest.getText(); return contentIn; }
From source file:com.swg.parse.docx.OpenFolderAction.java
/*** * Simply grab the content of a .docx file using Apache POI and put it into a string * This String may have missing part due to POI library * @param f the .dox file/*from w ww. jav a 2 s. com*/ * @return POI content of .docx * @throws FileNotFoundException * @throws IOException */ private String getPOI(File f) throws FileNotFoundException, IOException { FileInputStream inputTest = new FileInputStream(f.getAbsolutePath()); XWPFDocument docxTest = new XWPFDocument(inputTest); XWPFWordExtractor ContentTest = new XWPFWordExtractor(docxTest); String contentIn = ContentTest.getText(); return contentIn; }
From source file:com.swg.parse.docx.OpenWord.java
/*** * Simply grab the content of a .docx file using Apache POI and put it into a string * This String may have missing part due to POI library * @param f the .dox file/* w w w.ja va2 s .c o m*/ * @return POI content of .docx * @throws FileNotFoundException * @throws IOException */ private String getPOI() throws FileNotFoundException, IOException { FileInputStream inputTest = new FileInputStream(selectedFile.getAbsolutePath()); XWPFDocument docxTest = new XWPFDocument(inputTest); XWPFWordExtractor ContentTest = new XWPFWordExtractor(docxTest); String contentIn = ContentTest.getText(); return contentIn; }
From source file:com.swg.parse.docx.V2Test.java
/**
 * Extracts the text of the test document {@code CAD_2013_RE-02.docx}, resolved against
 * {@code path}, using Apache POI.
 *
 * @return the text content POI extracts from the .docx file
 * @throws FileNotFoundException if the document cannot be opened
 * @throws IOException if the document cannot be read or the stream cannot be closed
 */
private String getPOI() throws FileNotFoundException, IOException {
    // try-with-resources releases the file handle even when extraction fails;
    // the previous version never closed the stream.
    try (FileInputStream docxStream = new FileInputStream(path + "CAD_2013_RE-02.docx")) {
        XWPFDocument document = new XWPFDocument(docxStream);
        XWPFWordExtractor extractor = new XWPFWordExtractor(document);
        return extractor.getText();
    }
}
From source file:de.catma.document.source.contenthandler.DOCXContentHandler.java
License:Open Source License
@Override public void load(InputStream is) throws IOException { XWPFDocument doc = new XWPFDocument(is); XWPFWordExtractor wordExtractor = new XWPFWordExtractor(doc); String buf = wordExtractor.getText(); //it's still microsoft after all if (FileOSType.getFileOSType(buf).equals(FileOSType.UNIX)) { buf = FileOSType.convertUnixToDos(buf); }//from w w w . java 2s . c o m setContent(buf); }
From source file:de.iisys.schub.processMining.similarity.parsing.DocxParser.java
License:Apache License
/**
 * Parses the .docx Word document held in {@code theDoc} and returns its text.
 *
 * <p>Only use this method if chapters are never needed; otherwise use the
 * {@code parseDocxAndChapters} and {@code getFullText} methods.
 *
 * <p>If {@code theDoc} is {@code null}, nothing is parsed and the current value of
 * {@code fullText} is returned unchanged.
 *
 * @return the full text (incl. tables) as a string
 */
public String parseDocxSimple() {
    if (theDoc != null) {
        XWPFWordExtractor extractor = new XWPFWordExtractor(theDoc);
        try {
            this.fullText = extractor.getText();
        } finally {
            // Close in finally so the extractor is released even if getText() throws;
            // previously a failure in getText() skipped the close() call.
            try {
                extractor.close();
            } catch (IOException e) {
                // Best effort: a close failure must not discard the extracted text.
                e.printStackTrace();
            }
        }
    }
    return this.fullText;
}
From source file:de.powerstaff.business.service.impl.reader.msword.DOCXWordDocumentReader.java
License:Open Source License
/**
 * Reads a .docx file and returns its text wrapped in a {@link ReadResult}.
 * Every {@code '|'} character in the extracted text is replaced by a space before the
 * text is passed through {@code toFlatString}.
 *
 * @param inputFile the .docx file to read
 * @return the result built from the flattened document text
 * @throws Exception if the file cannot be opened or read
 */
public ReadResult getContent(File inputFile) throws Exception {
    // try-with-resources releases the file handle even when extraction fails;
    // the previous version never closed the stream.
    try (FileInputStream theStream = new FileInputStream(inputFile)) {
        XWPFDocument theDocument = new XWPFDocument(theStream);
        XWPFWordExtractor theExtractor = new XWPFWordExtractor(theDocument);
        String theText = theExtractor.getText().replace('|', ' ');
        return new ReadResult(toFlatString(theText));
    }
}