Example usage for org.apache.poi.hwpf.extractor WordExtractor stripFields

List of usage examples for org.apache.poi.hwpf.extractor WordExtractor stripFields

Introduction

In this page you can find the example usage for org.apache.poi.hwpf.extractor WordExtractor stripFields.

Prototype

public static String stripFields(String text) 

Source Link

Document

Removes any fields (e.g. macros, page markers, etc.) from the string.

Usage

From source file:br.com.schumaker.beta.doc.ReadDocMaster.java

/**
 * Prints the paragraphs of a hard-coded {@code .doc} file to standard output.
 *
 * <p>Each paragraph has its fields removed with
 * {@link WordExtractor#stripFields(String)}; only paragraphs whose stripped
 * text is longer than ten characters are printed (trimmed).
 *
 * @param args ignored
 */
public static void main(String[] args) {
    File file = new File(
            "/users/hudsonschumaker/downloads/Guisi01206us - Jira Guide for P3 PECB enhancement requests.doc");

    // try-with-resources: the stream is released even when parsing fails.
    try (FileInputStream fis = new FileInputStream(file.getAbsolutePath())) {
        HWPFDocument doc = new HWPFDocument(fis);
        WordExtractor extractor = new WordExtractor(doc);

        for (String rawText : extractor.getParagraphText()) {
            // stripFields is static, so it is called on the class, not the instance.
            String text = WordExtractor.stripFields(rawText);
            if (text.length() > 10) {
                System.out.println(text.trim());
            }
        }
    } catch (Exception e) {
        // Top-level boundary: report the failure instead of silently discarding it.
        System.err.println("Could not read " + file + ": " + e);
    }
}

From source file:org.crypto.sse.TextExtractPar.java

License:Open Source License

/**
 * Extracts the text of every given file, tokenizes it, and builds two lookup
 * tables: token -&gt; file names ({@code lookup1}) and file name -&gt; tokens
 * ({@code lookup2}). Each (token, file) pair is stored at most once per table.
 *
 * <p>The extraction strategy is chosen from the file-name suffix:
 * {@code .docx}, {@code .pptx}, {@code .xlsx}, {@code .doc} and {@code .pdf}
 * are parsed with their format-specific extractor; media files
 * ({@code .gif}, {@code .jpeg}, {@code .wmv}, {@code .mpeg}, {@code .mp4})
 * contribute only their file name; everything else is read as UTF-8 text
 * lines. A file that cannot be parsed is reported on standard output and
 * contributes no tokens.
 *
 * <p>NOTE(review): {@code counter} is declared outside this method; it is
 * incremented once per file here and used only for the progress message.
 *
 * @param listOfFile the files to index
 * @return a {@code TextExtractPar} wrapping the two lookup tables
 * @throws FileNotFoundException if one of the files cannot be opened
 */
private static TextExtractPar extractOneDoc(File[] listOfFile) throws FileNotFoundException {

    Multimap<String, String> lookup1 = ArrayListMultimap.create();
    Multimap<String, String> lookup2 = ArrayListMultimap.create();

    for (File file : listOfFile) {

        // Progress report: print a line when the running file counter hits
        // one of the 100 thresholds derived from the batch size.
        for (int j = 0; j < 100; j++) {
            if (counter == (int) ((j + 1) * listOfFile.length / 100)) {
                System.out.println("Number of files read equals " + j + " %");
                break;
            }
        }

        List<String> lines = new ArrayList<String>();
        counter++;
        String fileName = file.getName();
        FileInputStream fis = new FileInputStream(file);

        // The stream is opened for every file, so it must be closed for every
        // file. try/finally is used instead of try-with-resources because
        // close() may raise an IOException this method does not declare.
        try {

            ///////////////////// .docx /////////////////////////////
            if (fileName.endsWith(".docx")) {
                try {
                    XWPFDocument doc = new XWPFDocument(fis);
                    XWPFWordExtractor ex = new XWPFWordExtractor(doc);
                    lines.add(ex.getText());
                } catch (IOException e) {
                    System.out.println("File not read: " + fileName);
                }
            }

            ///////////////////// .pptx /////////////////////////////
            else if (fileName.endsWith(".pptx")) {
                try {
                    OPCPackage ppt = OPCPackage.open(fis);
                    XSLFPowerPointExtractor xw = new XSLFPowerPointExtractor(ppt);
                    lines.add(xw.getText());
                } catch (XmlException e) {
                    System.out.println("File not read: " + fileName);
                } catch (IOException e) {
                    System.out.println("File not read: " + fileName);
                } catch (OpenXML4JException e) {
                    System.out.println("File not read: " + fileName);
                }
            }

            ///////////////////// .xlsx /////////////////////////////
            else if (fileName.endsWith(".xlsx")) {
                try {
                    OPCPackage xls = OPCPackage.open(fis);
                    XSSFExcelExtractor xe = new XSSFExcelExtractor(xls);
                    lines.add(xe.getText());
                } catch (InvalidFormatException e) {
                    System.out.println("File not read: " + fileName);
                } catch (IOException e) {
                    System.out.println("File not read: " + fileName);
                } catch (XmlException e) {
                    System.out.println("File not read: " + fileName);
                } catch (OpenXML4JException e) {
                    System.out.println("File not read: " + fileName);
                }
            }

            ///////////////////// .doc /////////////////////////////
            else if (fileName.endsWith(".doc")) {
                NPOIFSFileSystem fs = null;
                try {
                    fs = new NPOIFSFileSystem(file);
                    WordExtractor extractor = new WordExtractor(fs.getRoot());
                    for (String rawText : extractor.getParagraphText()) {
                        lines.add(WordExtractor.stripFields(rawText));
                    }
                } catch (IOException e) {
                    System.out.println("File not read: " + fileName);
                } finally {
                    // The file system opens the file itself; release that handle too.
                    if (fs != null) {
                        try {
                            fs.close();
                        } catch (IOException ignored) {
                            // best effort: the text has already been extracted
                        }
                    }
                }
            }

            ///////////////////// .pdf /////////////////////////////
            else if (fileName.endsWith(".pdf")) {
                PDDocument pdf = null;
                try {
                    PDFParser parser = new PDFParser(fis);
                    parser.parse();
                    COSDocument cd = parser.getDocument();
                    PDFTextStripper stripper = new PDFTextStripper();
                    pdf = new PDDocument(cd);
                    lines.add(stripper.getText(pdf));
                } catch (IOException e) {
                    System.out.println("File not read: " + fileName);
                } finally {
                    if (pdf != null) {
                        try {
                            pdf.close();
                        } catch (IOException ignored) {
                            // best effort: the text has already been extracted
                        }
                    }
                }
            }

            ///////////////////// Media files: gif, jpeg, wmv, mpeg, mp4 /////////////////////////////
            // A name can end with only ONE of these suffixes, so they must be
            // OR-ed; the previous AND chain could never match. Only the file
            // name is indexed for media files.
            else if (fileName.endsWith(".gif") || fileName.endsWith(".jpeg")
                    || fileName.endsWith(".wmv") || fileName.endsWith(".mpeg")
                    || fileName.endsWith(".mp4")) {
                lines.add(fileName);
            }

            ///////////////////// raw text extensions /////////////////////////////
            else {
                try {
                    lines = Files.readLines(file, Charsets.UTF_8);
                } catch (IOException e) {
                    System.out.println("File not read: " + fileName);
                }
            }

        } finally {
            try {
                fis.close();
            } catch (IOException ignored) {
                // nothing useful can be done if closing a read-only stream fails
            }
        }

        ///////////////////// Begin word extraction /////////////////////////////
        for (String line : lines) {

            // A standard analyzer configured with the default English stop
            // set, so noise words such as "the" or "a" are dropped. A
            // stemming analyzer (e.g. Porter) could be used instead.
            CharArraySet noise = EnglishAnalyzer.getDefaultStopSet();
            Analyzer analyzer = new StandardAnalyzer(noise);
            List<String> tokens = Tokenizer.tokenizeString(analyzer, line);

            for (String token : tokens) {
                // Record each token only once per file.
                if (!lookup2.get(fileName).contains(token)) {
                    lookup2.put(fileName, token);
                }
                // Record each file only once per token.
                if (!lookup1.get(token).contains(fileName)) {
                    lookup1.put(token, fileName);
                }
            }
        }
    }

    return new TextExtractPar(lookup1, lookup2);
}

From source file:org.mitre.xtext.converters.MSDocConverter.java

License:Apache License

/**
 * Converts a Word {@code .doc} file to plain text: every paragraph has its
 * fields stripped and is trimmed, and the paragraphs are joined with a
 * trailing {@code '\n'} each. The text is stored as the payload of the
 * returned document.
 *
 * <p>TODO: Replace with a Tika converter?
 *
 * @param doc the file to convert
 * @return a {@code ConvertedDocument} for {@code doc} carrying the extracted text
 * @throws IOException if the file cannot be opened or parsed
 */
@Override
public ConvertedDocument convert(java.io.File doc) throws IOException {
    String[] ps;
    // try-with-resources: the stream is closed even when parsing throws.
    try (java.io.InputStream io = new FileInputStream(doc)) {
        org.apache.poi.hwpf.extractor.WordExtractor ex = new WordExtractor(io);
        ps = ex.getParagraphText();
    }

    StringBuilder sb = new StringBuilder();
    for (String paragraph : ps) {
        sb.append(WordExtractor.stripFields(paragraph).trim());
        sb.append('\n');
    }
    ConvertedDocument textdoc = new ConvertedDocument(doc);
    textdoc.setPayload(sb.toString());

    return textdoc;
}

From source file:org.opensextant.xtext.converters.MSDocConverter.java

License:Apache License

/**
 * Converts a Word {@code .doc} stream to plain text: every paragraph has its
 * fields stripped and is trimmed, and the paragraphs are joined with a
 * trailing {@code '\n'} each. The text is stored on the returned document
 * via {@code setText}.
 *
 * <p>This method closes {@code input} and the extractor before returning,
 * whether or not the conversion succeeds.
 *
 * @param input the stream holding the document; closed by this method
 * @param doc   the file the returned {@code ConvertedDocument} is created for
 * @return a {@code ConvertedDocument} for {@code doc} carrying the extracted text
 * @throws IOException if the stream cannot be parsed or closed
 */
@Override
protected ConvertedDocument conversionImplementation(InputStream input, java.io.File doc) throws IOException {
    // Previously both close() calls were skipped whenever parsing threw.
    try (InputStream in = input) {
        org.apache.poi.hwpf.extractor.WordExtractor ex = new WordExtractor(in);
        try {
            StringBuilder sb = new StringBuilder();
            for (String paragraph : ex.getParagraphText()) {
                sb.append(WordExtractor.stripFields(paragraph).trim());
                sb.append('\n');
            }
            ConvertedDocument textdoc = new ConvertedDocument(doc);
            textdoc.setText(sb.toString());
            return textdoc;
        } finally {
            ex.close();
        }
    }
}

From source file:projekt.servise.impl.ReadDataFromWordServiceImpl.java

/**
 * Reads a hard-coded Word {@code .doc} file and loads its content into the
 * {@code project} schema of a PostgreSQL database.
 *
 * <p>The document text is split into paragraphs and processed from index 8
 * onward in two passes:
 * <ol>
 *   <li>for each paragraph starting with {@code "Jrk"}, the still-unnamed
 *       groups whose code matches the header are given the group name found
 *       in the paragraphs above it;</li>
 *   <li>each remaining non-header line with four or five tokens is treated
 *       as a student and inserted (person, student and, if missing, group)
 *       unless it is already present.</li>
 * </ol>
 *
 * <p>I/O and SQL failures are logged at {@code SEVERE}; nothing is rethrown.
 *
 * <p>NOTE(review): the document path, JDBC URL and database credentials are
 * hard-coded below; they should come from configuration, not source code.
 */
@Override
public void getData() {
    String filePath = "C:/Users/Lenovo/Documents/NetBeansProjects/SoftwareArchitectureProject-master/src/main/java/projekt/nimekiri_test.doc";
    // try-with-resources: the stream and the connection are released on every path.
    try (FileInputStream fis = new FileInputStream(new File(filePath))) {
        HWPFDocument doc = new HWPFDocument(fis);
        WordExtractor extractor = new WordExtractor(doc);

        try (Connection conn = DriverManager.getConnection(
                "jdbc:postgresql://dev.vk.edu.ee:5432/GroupWork?currentSchema=project", "t131566", "t131566")) {

            String text = extractor.getText();
            String strippedText = WordExtractor.stripFields(text).replace("\r\n\r\n", "\n").replace("\t", " ")
                    .replace("\r\n", "\n");
            String[] paragraphs = strippedText.split("\n");
            List<String> groupNames = groupService.getGroupNames();

            updateGroupNames(conn, paragraphs);
            importStudents(conn, paragraphs, groupNames);

            // Connection.commit() is specified to fail in auto-commit mode,
            // so only commit when a transaction is actually open.
            if (!conn.getAutoCommit()) {
                conn.commit();
            }
        }
    } catch (IOException | SQLException e) {
        Logger.getLogger(ReadDataFromWordServiceImpl.class.getName()).log(Level.SEVERE, null, e);
    }
}

/**
 * First pass: for every "Jrk" header paragraph, names the groups that have
 * no name yet and whose group code starts with the code read from the header.
 */
private void updateGroupNames(Connection conn, String[] paragraphs) throws SQLException {
    try (PreparedStatement selectUnnamedGroups = conn.prepareStatement(
            "SELECT id FROM project.group where name is null and groupcode like ?");
            PreparedStatement setGroupName = conn
                    .prepareStatement("UPDATE project.group SET name=? where id=?")) {

        for (int i = 8; i < paragraphs.length; i++) {
            String header = paragraphs[i];
            // startsWith is safe on lines shorter than three characters,
            // where substring(0, 3) used to throw.
            if (!header.startsWith("Jrk")) {
                continue;
            }
            // NOTE(review): assumes the group code sits at columns 17-20 of
            // the header; a shorter header still throws here.
            String groupCode = header.substring(17, 21);
            String groupName = findGroupName(paragraphs, i).replace("  ", " ");

            selectUnnamedGroups.setString(1, groupCode + "%");
            try (ResultSet groups = selectUnnamedGroups.executeQuery()) {
                while (groups.next()) {
                    setGroupName.setString(1, groupName);
                    setGroupName.setInt(2, groups.getInt(1));
                    setGroupName.executeUpdate();
                }
            }
        }
    }
}

/**
 * Walks upwards from the paragraph above {@code headerIndex} and returns the
 * last paragraph seen whose fourth character is upper case, with a
 * {@code "(KAUGPE)"} marker removed; returns {@code ""} when none qualifies.
 * The paragraph directly above the header is always inspected; after that
 * the walk stops at the first blank paragraph or at the start of the array.
 */
private static String findGroupName(String[] paragraphs, int headerIndex) {
    String groupName = "";
    for (int k = headerIndex - 1; k >= 0; k--) {
        String candidate = paragraphs[k];
        boolean blank = candidate.trim().isEmpty();
        if (blank && k != headerIndex - 1) {
            break;
        }
        // The length check guards charAt(3) on short lines.
        if (!blank && candidate.length() > 3 && Character.isUpperCase(candidate.charAt(3))) {
            groupName = candidate;
            if (groupName.contains("(KAUGPE)")) {
                groupName = groupName.replace("(KAUGPE)", "").trim();
            }
        }
    }
    return groupName;
}

/**
 * Second pass: tracks the current group code from "Jrk" headers and imports
 * every student line that follows.
 */
private void importStudents(Connection conn, String[] paragraphs, List<String> groupNames)
        throws SQLException {
    String code = "";
    for (int i = 8; i < paragraphs.length; i++) {
        String line = paragraphs[i].replace("*", "").replace("OK", "").replace("TREV", "").replace("REV",
                "");

        if (paragraphs[i].startsWith("Jrk")) {
            // NOTE(review): assumes ten characters follow the first ':' of
            // the header; a shorter header still throws here.
            int colon = line.indexOf(":");
            code = line.substring(colon + 1, colon + 11);
            // NOTE(review): replace("", "") is a no-op as written; presumably
            // a control character was lost from the first literal - confirm.
            code = code.replace("", "").replace(" - ", "").replace("  ", "");
        }

        // Skip blanks, headers, group-name lines and all-upper-case lines.
        if (line.trim().isEmpty() || line.contains("KOOD") || line.contains("KAUGPE")
                || line.contains("lipilane") || groupNames.contains(line) || line.contains("Jrk")
                || isAllUpperCase(line)) {
            continue;
        }

        List<String> tokens = new ArrayList<String>();
        for (String item : line.split(" ")) {
            if (!item.isEmpty()) {
                tokens.add(item);
            }
        }

        String studentCode;
        String studentLastname;
        if (tokens.size() == 4) {
            studentCode = tokens.get(2);
            studentLastname = tokens.get(1);
        } else if (tokens.size() == 5) {
            // Two-word last name.
            studentCode = tokens.get(3);
            studentLastname = tokens.get(1) + " " + tokens.get(2);
        } else {
            if (tokens.size() > 5) {
                // Previously such a line ran the lookup with an unbound
                // parameter and aborted the whole import with an SQLException.
                Logger.getLogger(ReadDataFromWordServiceImpl.class.getName()).log(Level.WARNING,
                        "Skipping student line with unexpected token count: {0}", line);
            }
            continue;
        }

        code = importStudent(conn, tokens.get(0), studentLastname, studentCode, code);
    }
}

/**
 * Inserts one student (and the person and group rows it needs) unless a
 * student with {@code studentCode} or a person with the same name already
 * exists.
 *
 * @return the group code to carry on with; it has its spaces removed when a
 *         new group had to be created, otherwise it is {@code code} unchanged
 */
private String importStudent(Connection conn, String firstname, String lastname, String studentCode,
        String code) throws SQLException {
    if (studentCodeExists(conn, studentCode) || personExists(conn, firstname, lastname)) {
        return code;
    }

    insertPerson(conn, firstname, lastname);
    Integer personId = findPersonId(conn, firstname, lastname);
    if (personId == null) {
        return code;
    }

    System.out.println("GROUP CODE " + code);
    Group1 group = groupService.getByGroupcode(code);
    if (group != null) {
        System.out.println("GROUP ID " + group.getId());
        insertStudent(conn, personId, group.getId(), code);
        System.out.println("NEW STUDENT " + personId);
        return code;
    }

    System.out.println("GROUP WAS NULL ");
    insertGroup(conn, code);
    System.out.println("NEW GROUP " + code);
    // NOTE(review): the group is inserted with the raw code but looked up
    // with spaces removed, so the lookup misses when the code contains a
    // space - confirm which form is intended.
    String normalizedCode = code.replace(" ", "");
    System.out.println("SELECT id FROM project.group where groupcode=" + normalizedCode);
    Integer groupId = findGroupId(conn, normalizedCode);
    if (groupId != null && !studentExistsForPerson(conn, personId)) {
        insertStudent(conn, personId, groupId, normalizedCode);
        System.out.println("NEW STUDENT " + personId);
    }
    return normalizedCode;
}

/** Returns whether a student row with the given code exists. */
private static boolean studentCodeExists(Connection conn, String studentCode) throws SQLException {
    try (PreparedStatement ps = conn.prepareStatement("SELECT * FROM project.student where code=?")) {
        ps.setString(1, studentCode);
        try (ResultSet rs = ps.executeQuery()) {
            return rs.next();
        }
    }
}

/** Returns whether a person row with the given first and last name exists. */
private static boolean personExists(Connection conn, String firstname, String lastname) throws SQLException {
    try (PreparedStatement ps = conn
            .prepareStatement("SELECT * FROM project.person where firstname=? and lastname=?")) {
        ps.setString(1, firstname);
        ps.setString(2, lastname);
        try (ResultSet rs = ps.executeQuery()) {
            return rs.next();
        }
    }
}

/** Inserts a person row with role id 2. */
private static void insertPerson(Connection conn, String firstname, String lastname) throws SQLException {
    try (PreparedStatement ps = conn
            .prepareStatement("INSERT INTO project.person (firstname,lastname,roleid) VALUES (?,?,?)")) {
        ps.setString(1, firstname);
        ps.setString(2, lastname);
        ps.setInt(3, 2);
        ps.executeUpdate();
    }
}

/** Returns the id of the role-2 person with the given name, or {@code null} when there is none. */
private static Integer findPersonId(Connection conn, String firstname, String lastname) throws SQLException {
    try (PreparedStatement ps = conn.prepareStatement(
            "SELECT id FROM project.person where firstname=? and lastname=? and roleid=?")) {
        ps.setString(1, firstname);
        ps.setString(2, lastname);
        ps.setInt(3, 2);
        try (ResultSet rs = ps.executeQuery()) {
            if (rs.next()) {
                return Integer.valueOf(rs.getInt(1));
            }
            return null;
        }
    }
}

/** Inserts a group row carrying only the given group code. */
private static void insertGroup(Connection conn, String groupCode) throws SQLException {
    try (PreparedStatement ps = conn.prepareStatement("INSERT INTO project.group (groupcode) VALUES (?)")) {
        ps.setString(1, groupCode);
        ps.executeUpdate();
    }
}

/** Returns the id of the group with the given code, or {@code null} when there is none. */
private static Integer findGroupId(Connection conn, String groupCode) throws SQLException {
    try (PreparedStatement ps = conn.prepareStatement("SELECT id FROM project.group where groupcode=?")) {
        ps.setString(1, groupCode);
        try (ResultSet rs = ps.executeQuery()) {
            if (rs.next()) {
                return Integer.valueOf(rs.getInt(1));
            }
            return null;
        }
    }
}

/** Returns whether a student row already exists for the given person id. */
private static boolean studentExistsForPerson(Connection conn, int personId) throws SQLException {
    try (PreparedStatement ps = conn.prepareStatement("SELECT * FROM project.student where personid=?")) {
        ps.setInt(1, personId);
        try (ResultSet rs = ps.executeQuery()) {
            return rs.next();
        }
    }
}

/**
 * Inserts a student row linking a person to a group.
 *
 * <p>NOTE(review): the {@code code} column receives the GROUP code, while
 * the duplicate check queries it with the student's own code - confirm
 * which value the column is meant to hold.
 */
private static void insertStudent(Connection conn, int personId, int groupId, String code)
        throws SQLException {
    try (PreparedStatement ps = conn
            .prepareStatement("INSERT INTO project.student (personid,groupid,code) values(?,?,?)")) {
        ps.setInt(1, personId);
        ps.setInt(2, groupId);
        ps.setString(3, code);
        ps.executeUpdate();
    }
}