Example usage for org.apache.poi.xwpf.extractor XWPFWordExtractor getText

List of usage examples for org.apache.poi.xwpf.extractor XWPFWordExtractor getText

Introduction

On this page you can find example usages of org.apache.poi.xwpf.extractor XWPFWordExtractor getText.

Prototype

public String getText() 

Source Link

Usage

From source file:com.opensearchserver.extractor.parser.Docx.java

License:Apache License

/**
 * Reads a .docx document from the given stream, records its core
 * properties in {@code metas}, and stores the extracted text plus the
 * language-detection result in a new parser document.
 *
 * @param inputStream stream the .docx document is read from
 * @param extension   not used by this implementation
 * @param mimeType    not used by this implementation
 * @throws IOException if the document cannot be read
 */
@Override
protected void parseContent(InputStream inputStream, String extension, String mimeType) throws IOException {

    XWPFDocument docx = new XWPFDocument(inputStream);
    XWPFWordExtractor extractor = null;
    try {
        extractor = new XWPFWordExtractor(docx);
        addCoreProperties(extractor.getCoreProperties());

        ParserDocument result = getNewParserDocument();
        result.add(CONTENT, extractor.getText());
        result.add(LANG_DETECTION, languageDetection(CONTENT, 10000));
    } finally {
        // Best-effort close; accepts null when the extractor was never built.
        IOUtils.closeQuietly(extractor);
    }
}

/**
 * Copies the document's core properties into {@code metas}.
 *
 * @param info the core properties, or {@code null} when none are available
 */
private void addCoreProperties(CoreProperties info) {
    if (info == null) {
        return;
    }
    metas.add(TITLE, info.getTitle());
    metas.add(CREATOR, info.getCreator());
    metas.add(CREATION_DATE, info.getCreated());
    metas.add(MODIFICATION_DATE, info.getModified());
    metas.add(SUBJECT, info.getSubject());
    metas.add(DESCRIPTION, info.getDescription());
    metas.add(KEYWORDS, info.getKeywords());
}

From source file:com.opensearchserver.textextractor.parser.Docx.java

License:Apache License

/**
 * Reads a .docx document from the given stream, records its core
 * properties in {@code metas}, and stores the extracted text plus the
 * language-detection result in a new parser document.
 *
 * @param inputStream stream the .docx document is read from
 * @throws IOException if the document cannot be read
 */
@Override
protected void parseContent(InputStream inputStream) throws IOException {

    XWPFDocument docx = new XWPFDocument(inputStream);
    XWPFWordExtractor extractor = null;
    try {
        extractor = new XWPFWordExtractor(docx);
        addCoreProperties(extractor.getCoreProperties());

        ParserDocument result = getNewParserDocument();
        result.add(CONTENT, extractor.getText());
        result.add(LANG_DETECTION, languageDetection(CONTENT, 10000));
    } finally {
        // Best-effort close; accepts null when the extractor was never built.
        IOUtils.closeQuietly(extractor);
    }
}

/**
 * Copies the document's core properties into {@code metas}.
 *
 * @param info the core properties, or {@code null} when none are available
 */
private void addCoreProperties(CoreProperties info) {
    if (info == null) {
        return;
    }
    metas.add(TITLE, info.getTitle());
    metas.add(CREATOR, info.getCreator());
    metas.add(CREATION_DATE, info.getCreated());
    metas.add(MODIFICATION_DATE, info.getModified());
    metas.add(SUBJECT, info.getSubject());
    metas.add(DESCRIPTION, info.getDescription());
    metas.add(KEYWORDS, info.getKeywords());
}

From source file:com.swg.parse.docx.MSDocConvTest.java

/**
 * Extracts the text of the .docx file {@code SCD_2009_E-04 2009.08.21.docx},
 * whose location is built by appending the file name to {@code path},
 * using Apache POI.
 *
 * @return String containing the content of the .docx file from POI apache
 * @throws FileNotFoundException if the file cannot be opened
 * @throws IOException if the document cannot be read or a resource fails to close
 */
private String getPOI() throws FileNotFoundException, IOException {
    // try-with-resources closes the extractor first, then the file stream,
    // on both the normal and the exceptional path.
    try (FileInputStream input = new FileInputStream(path + "SCD_2009_E-04 2009.08.21.docx");
            XWPFWordExtractor extractor = new XWPFWordExtractor(new XWPFDocument(input))) {
        return extractor.getText();
    }
}

From source file:com.swg.parse.docx.MSDocConvTest2.java

/***
 * @return String containing the content of the .docx file from POI apache
 * @throws FileNotFoundException/*from  w w w.j  av a 2 s . c o m*/
 * @throws IOException 
 */
private String getPOI() throws FileNotFoundException, IOException {

    FileInputStream inputTest = new FileInputStream(path + "CAD_2013_RE-06.docx");
    XWPFDocument docxTest = new XWPFDocument(inputTest);
    XWPFWordExtractor ContentTest = new XWPFWordExtractor(docxTest);
    String contentIn = ContentTest.getText();
    return contentIn;
}

From source file:com.swg.parse.docx.OpenFolderAction.java

/***
 * Simply grab the content of a .docx file using Apache POI and put it into a string
 * This String may have missing part due to POI library
 * @param f the .dox file/*from   w  ww.  jav a 2 s. com*/
 * @return POI content of .docx
 * @throws FileNotFoundException
 * @throws IOException 
 */
private String getPOI(File f) throws FileNotFoundException, IOException {

    FileInputStream inputTest = new FileInputStream(f.getAbsolutePath());
    XWPFDocument docxTest = new XWPFDocument(inputTest);
    XWPFWordExtractor ContentTest = new XWPFWordExtractor(docxTest);
    String contentIn = ContentTest.getText();
    return contentIn;
}

From source file:com.swg.parse.docx.OpenWord.java

/***
 * Simply grab the content of a .docx file using Apache POI and put it into a string
 * This String may have missing part due to POI library
 * @param f the .dox file/* w  w w.ja va2 s  .c o m*/
 * @return POI content of .docx
 * @throws FileNotFoundException
 * @throws IOException 
 */
private String getPOI() throws FileNotFoundException, IOException {

    FileInputStream inputTest = new FileInputStream(selectedFile.getAbsolutePath());
    XWPFDocument docxTest = new XWPFDocument(inputTest);
    XWPFWordExtractor ContentTest = new XWPFWordExtractor(docxTest);
    String contentIn = ContentTest.getText();
    return contentIn;
}

From source file:com.swg.parse.docx.V2Test.java

/**
 * Extracts the text of the .docx file {@code CAD_2013_RE-02.docx},
 * whose location is built by appending the file name to {@code path},
 * using Apache POI.
 *
 * @return String containing the content of the .docx file from POI apache
 * @throws FileNotFoundException if the file cannot be opened
 * @throws IOException if the document cannot be read or a resource fails to close
 */
private String getPOI() throws FileNotFoundException, IOException {
    // try-with-resources closes the extractor first, then the file stream,
    // on both the normal and the exceptional path.
    try (FileInputStream input = new FileInputStream(path + "CAD_2013_RE-02.docx");
            XWPFWordExtractor extractor = new XWPFWordExtractor(new XWPFDocument(input))) {
        return extractor.getText();
    }
}

From source file:de.catma.document.source.contenthandler.DOCXContentHandler.java

License:Open Source License

/**
 * Loads a .docx document from the given stream, extracts its text with
 * Apache POI, and passes the text to {@code setContent}.
 *
 * <p>The stream itself is not closed here; it remains the caller's
 * responsibility.
 *
 * @param is stream the .docx document is read from
 * @throws IOException if the document cannot be read or the extractor fails to close
 */
@Override
public void load(InputStream is) throws IOException {
    String buf;
    // Scope the extractor so it is closed once the text has been pulled out,
    // including when extraction fails.
    try (XWPFWordExtractor wordExtractor = new XWPFWordExtractor(new XWPFDocument(is))) {
        buf = wordExtractor.getText();
    }

    //it's still microsoft after all
    // Text detected as UNIX by FileOSType is converted via convertUnixToDos.
    if (FileOSType.getFileOSType(buf).equals(FileOSType.UNIX)) {
        buf = FileOSType.convertUnixToDos(buf);
    }

    setContent(buf);
}

From source file:de.iisys.schub.processMining.similarity.parsing.DocxParser.java

License:Apache License

/**
 * Only use this method if you don't want to get chapters sometimes.
 * Otherwise use 'parseDocxAndChapters' and 'getFullText' methods.
 *
 * Parses a .docx Word file and returns its text.
 * When {@code theDoc} is {@code null}, nothing is parsed and the current
 * value of {@code fullText} is returned unchanged.
 *
 * @return
 *       Returns the full text (incl. tables) as string.
 */
public String parseDocxSimple() {
    if (theDoc != null) {
        // try-with-resources guarantees the extractor is closed even if
        // getText() throws; only a failure while closing is caught here.
        try (XWPFWordExtractor extr = new XWPFWordExtractor(theDoc)) {
            this.fullText = extr.getText();
        } catch (IOException e) {
            // Closing is best-effort: the text has already been stored.
            e.printStackTrace();
        }
    }
    return this.fullText;
}

From source file:de.powerstaff.business.service.impl.reader.msword.DOCXWordDocumentReader.java

License:Open Source License

/**
 * Reads the text of a .docx file with Apache POI, replaces every
 * {@code '|'} character with a space, and wraps the result of
 * {@code toFlatString} in a {@link ReadResult}.
 *
 * @param inputFile the .docx file to read
 * @return the read result built from the flattened text
 * @throws Exception if the file cannot be opened or read, or a resource fails to close
 */
public ReadResult getContent(File inputFile) throws Exception {

    // try-with-resources closes the extractor first, then the file stream,
    // on both the normal and the exceptional path.
    try (FileInputStream input = new FileInputStream(inputFile);
            XWPFWordExtractor theExtractor = new XWPFWordExtractor(new XWPFDocument(input))) {
        String theText = theExtractor.getText().replace('|', ' ');
        return new ReadResult(toFlatString(theText));
    }
}