List of usage examples for the static method org.apache.poi.hwpf.extractor.WordExtractor.stripFields
public static String stripFields(String text)
From source file:br.com.schumaker.beta.doc.ReadDocMaster.java
/**
 * Prints the paragraphs of a hard-coded {@code .doc} file to standard output.
 *
 * <p>Each paragraph is passed through {@code WordExtractor.stripFields}; only results longer
 * than ten characters are printed, trimmed of surrounding whitespace.
 *
 * @param args ignored
 */
public static void main(String[] args) {
    File file = new File(
            "/users/hudsonschumaker/downloads/Guisi01206us - Jira Guide for P3 PECB enhancement requests.doc");
    // try-with-resources closes the stream on every path, including parse failures.
    try (FileInputStream fis = new FileInputStream(file.getAbsolutePath())) {
        HWPFDocument doc = new HWPFDocument(fis);
        WordExtractor extractor = new WordExtractor(doc);
        for (String rawText : extractor.getParagraphText()) {
            // stripFields is static, so it is called on the class rather than the instance.
            String text = WordExtractor.stripFields(rawText);
            if (text.length() > 10) {
                System.out.println(text.trim());
            }
        }
    } catch (Exception exep) {
        // Report the failure instead of swallowing it; a silent exit looks like an empty file.
        System.err.println("Could not extract text from " + file + ": " + exep);
    }
}
From source file:org.crypto.sse.TextExtractPar.java
License:Open Source License
private static TextExtractPar extractOneDoc(File[] listOfFile) throws FileNotFoundException { Multimap<String, String> lookup1 = ArrayListMultimap.create(); Multimap<String, String> lookup2 = ArrayListMultimap.create(); for (File file : listOfFile) { for (int j = 0; j < 100; j++) { if (counter == (int) ((j + 1) * listOfFile.length / 100)) { System.out.println("Number of files read equals " + j + " %"); break; }//from w ww .j av a2 s . c om } List<String> lines = new ArrayList<String>(); counter++; FileInputStream fis = new FileInputStream(file); // ***********************************************************************************************// ///////////////////// .docx ///////////////////////////// // ***********************************************************************************************// if (file.getName().endsWith(".docx")) { XWPFDocument doc; try { // System.out.println("File read: "+file.getName()); doc = new XWPFDocument(fis); XWPFWordExtractor ex = new XWPFWordExtractor(doc); lines.add(ex.getText()); } catch (IOException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } } // ***********************************************************************************************// ///////////////////// .pptx ///////////////////////////// // ***********************************************************************************************// else if (file.getName().endsWith(".pptx")) { OPCPackage ppt; try { // System.out.println("File read: "+file.getName()); ppt = OPCPackage.open(fis); XSLFPowerPointExtractor xw = new XSLFPowerPointExtractor(ppt); lines.add(xw.getText()); } catch (XmlException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } catch (IOException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } catch (OpenXML4JException e) { System.out.println("File not read: " + file.getName()); } } // 
***********************************************************************************************// ///////////////////// .xlsx ///////////////////////////// // ***********************************************************************************************// else if (file.getName().endsWith(".xlsx")) { OPCPackage xls; try { // System.out.println("File read: "+file.getName()); xls = OPCPackage.open(fis); XSSFExcelExtractor xe = new XSSFExcelExtractor(xls); lines.add(xe.getText()); } catch (InvalidFormatException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } catch (IOException e) { System.out.println("File not read: " + file.getName()); } catch (XmlException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } catch (OpenXML4JException e) { System.out.println("File not read: " + file.getName()); } } // ***********************************************************************************************// ///////////////////// .doc ///////////////////////////// // ***********************************************************************************************// else if (file.getName().endsWith(".doc")) { NPOIFSFileSystem fs; try { // System.out.println("File read: "+file.getName()); fs = new NPOIFSFileSystem(file); WordExtractor extractor = new WordExtractor(fs.getRoot()); for (String rawText : extractor.getParagraphText()) { lines.add(extractor.stripFields(rawText)); } } catch (IOException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } } // ***********************************************************************************************// ///////////////////// .pdf ///////////////////////////// // ***********************************************************************************************// else if (file.getName().endsWith(".pdf")) { PDFParser parser; try { // System.out.println("File read: "+file.getName()); parser = new 
PDFParser(fis); parser.parse(); COSDocument cd = parser.getDocument(); PDFTextStripper stripper = new PDFTextStripper(); lines.add(stripper.getText(new PDDocument(cd))); } catch (IOException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } } // ***********************************************************************************************// ///////////////////// Media Files such as gif, jpeg, .wmv, .mpeg, ///////////////////// .mp4 ///////////////////////////// // ***********************************************************************************************// else if (file.getName().endsWith(".gif") && file.getName().endsWith(".jpeg") && file.getName().endsWith(".wmv") && file.getName().endsWith(".mpeg") && file.getName().endsWith(".mp4")) { lines.add(file.getName()); } // ***********************************************************************************************// ///////////////////// raw text extensions ///////////////////// ///////////////////////////// // ***********************************************************************************************// else { try { // System.out.println("File read: "+file.getName()); lines = Files.readLines(file, Charsets.UTF_8); } catch (IOException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } finally { try { fis.close(); } catch (IOException ioex) { // omitted. } } } // ***********************************************************************************************// ///////////////////// Begin word extraction ///////////////////// ///////////////////////////// // ***********************************************************************************************// int temporaryCounter = 0; // Filter threshold int counterDoc = 0; for (int i = 0; i < lines.size(); i++) { CharArraySet noise = EnglishAnalyzer.getDefaultStopSet(); // We are using a standard tokenizer that eliminates the stop // words. 
We can use Stemming tokenizer such Porter // A set of English noise keywords is used that will eliminates // words such as "the, a, etc" Analyzer analyzer = new StandardAnalyzer(noise); List<String> token = Tokenizer.tokenizeString(analyzer, lines.get(i)); temporaryCounter = temporaryCounter + token.size(); for (int j = 0; j < token.size(); j++) { // Avoid counting occurrences of words in the same file if (!lookup2.get(file.getName()).contains(token.get(j))) { lookup2.put(file.getName(), token.get(j)); } // Avoid counting occurrences of words in the same file if (!lookup1.get(token.get(j)).contains(file.getName())) { lookup1.put(token.get(j), file.getName()); } } } } // System.out.println(lookup.toString()); return new TextExtractPar(lookup1, lookup2); }
From source file:org.mitre.xtext.converters.MSDocConverter.java
License:Apache License
/**
 * Converts a Word {@code .doc} file to plain text, one trimmed paragraph per line.
 *
 * <p>Every paragraph is passed through {@code WordExtractor.stripFields} before it is
 * appended to the payload.
 *
 * <p>TODO: Replace with a Tika converter?
 *
 * @param doc the Word document to convert
 * @return a {@code ConvertedDocument} for {@code doc} whose payload is the extracted text
 * @throws IOException if the file cannot be opened or read
 */
@Override
public ConvertedDocument convert(java.io.File doc) throws IOException {
    String[] ps;
    // try-with-resources closes the stream even when the extractor fails part-way.
    try (java.io.InputStream io = new FileInputStream(doc)) {
        org.apache.poi.hwpf.extractor.WordExtractor ex = new WordExtractor(io);
        ps = ex.getParagraphText();
    }

    StringBuilder sb = new StringBuilder();
    for (String paragraph : ps) {
        sb.append(WordExtractor.stripFields(paragraph).trim());
        sb.append('\n');
    }

    ConvertedDocument textdoc = new ConvertedDocument(doc);
    textdoc.setPayload(sb.toString());
    return textdoc;
}
From source file:org.opensextant.xtext.converters.MSDocConverter.java
License:Apache License
/**
 * Converts a Word {@code .doc} stream to plain text, one trimmed paragraph per line.
 *
 * <p>Every paragraph is passed through {@code WordExtractor.stripFields} before it is
 * appended. Both the extractor and {@code input} are closed before this method returns,
 * whether it completes normally or throws.
 *
 * @param input stream positioned at the start of the Word document; closed by this method
 * @param doc the file the stream was opened from, used to create the result
 * @return a {@code ConvertedDocument} for {@code doc} whose text is the extracted content
 * @throws IOException if the document cannot be read or a resource cannot be closed
 */
@Override
protected ConvertedDocument conversionImplementation(InputStream input, java.io.File doc) throws IOException {
    try {
        org.apache.poi.hwpf.extractor.WordExtractor ex = new WordExtractor(input);
        try {
            String[] ps = ex.getParagraphText();
            StringBuilder sb = new StringBuilder();
            for (String paragraph : ps) {
                sb.append(WordExtractor.stripFields(paragraph).trim());
                sb.append('\n');
            }

            ConvertedDocument textdoc = new ConvertedDocument(doc);
            textdoc.setText(sb.toString());
            return textdoc;
        } finally {
            ex.close();
        }
    } finally {
        // Runs even when the extractor could not be created, so the stream never leaks.
        input.close();
    }
}
From source file:projekt.servise.impl.ReadDataFromWordServiceImpl.java
@Override public void getData() { String FilePath = "C:/Users/Lenovo/Documents/NetBeansProjects/SoftwareArchitectureProject-master/src/main/java/projekt/nimekiri_test.doc"; FileInputStream fis;/* w w w . j av a 2 s .co m*/ try { fis = new FileInputStream(new File(FilePath)); HWPFDocument doc = new HWPFDocument(fis); WordExtractor extractor = new WordExtractor(doc); Connection conn = DriverManager.getConnection( "jdbc:postgresql://dev.vk.edu.ee:5432/GroupWork?currentSchema=project", "t131566", "t131566"); String text = extractor.getText(); String strippedText = extractor.stripFields(text).replace("\r\n\r\n", "\n").replace("\t", " ") .replace("\r\n", "\n"); String[] paragraphs = strippedText.split("\n"); String code = ""; List<String> groupNames = groupService.getGroupNames(); for (int i = 8; i < paragraphs.length; i++) { String line = paragraphs[i].replace("*", "").replace("OK", "").replace("TREV", "").replace("REV", ""); int index = 0; String jrk = ""; if (!paragraphs[i].trim().isEmpty() && paragraphs[i].substring(0, 3).contains("Jrk")) { String groupCode = paragraphs[i].substring(17, 21); jrk = paragraphs[i]; index = strippedText.indexOf(jrk); String groupName = ""; int j = 1; do { if (!paragraphs[i - j].trim().isEmpty()) { if (Character.isUpperCase(paragraphs[i - j].charAt(3))) { groupName = paragraphs[i - j]; if (groupName.contains("(KAUGPE)")) { groupName = groupName.replace("(KAUGPE)", "").trim(); } } } j++; } while (!paragraphs[i - j].trim().isEmpty()); PreparedStatement preparedStatementGetGroup = conn.prepareStatement( "SELECT id FROM project.group where name is null and groupcode like ?"); preparedStatementGetGroup.setString(1, groupCode + "%"); ResultSet resultGroup = preparedStatementGetGroup.executeQuery(); while (resultGroup.next()) { int groupId = resultGroup.getInt(1); PreparedStatement preparedStatementSetGroupName = conn .prepareStatement("UPDATE project.group SET name=? 
where id=?"); preparedStatementSetGroupName.setString(1, groupName.replace(" ", " ")); preparedStatementSetGroupName.setInt(2, groupId); preparedStatementSetGroupName.executeUpdate(); } } } for (int i = 8; i < paragraphs.length; i++) { String line = paragraphs[i].replace("*", "").replace("OK", "").replace("TREV", "").replace("REV", ""); /* int index = 0; String jrk = "";*/ if (!paragraphs[i].trim().isEmpty() && paragraphs[i].substring(0, 3).contains("Jrk")) { /* String groupCode = paragraphs[i].substring(17, 21);*/ code = line.substring(line.indexOf(":") + 1, line.indexOf(":") + 11); code = code.replace("", "").replace(" - ", "").replace(" ", ""); } if (!line.trim().isEmpty() && !line.contains("KOOD") && !line.contains("KAUGPE") && !line.contains("lipilane") && !groupNames.contains(line) && !line.contains("Jrk") && !isAllUpperCase(line)) { String[] splittedLine = line.split(" "); String studentLastname = ""; List<String> newSplittedLine = new ArrayList<String>(); for (String item : splittedLine) { if (!item.isEmpty()) { newSplittedLine.add(item); } } if (newSplittedLine.size() >= 4) { PreparedStatement preparedStatementGetStudent = conn .prepareStatement("SELECT * FROM project.student where code=?"); if (newSplittedLine.size() == 4) { preparedStatementGetStudent.setString(1, newSplittedLine.get(2)); studentLastname = newSplittedLine.get(1); } else if (newSplittedLine.size() == 5) { preparedStatementGetStudent.setString(1, newSplittedLine.get(3)); studentLastname = newSplittedLine.get(1) + " " + newSplittedLine.get(2); } ResultSet resultStudent = preparedStatementGetStudent.executeQuery(); if (!resultStudent.next()) { PreparedStatement preparedStatementGetPerson = conn.prepareStatement( "SELECT * FROM project.person where firstname=? 
and lastname=?"); preparedStatementGetPerson.setString(1, newSplittedLine.get(0)); preparedStatementGetPerson.setString(2, studentLastname); ResultSet resultPersonExists = preparedStatementGetPerson.executeQuery(); if (!resultPersonExists.next()) { PreparedStatement preparedStatementNewPerson = conn.prepareStatement( "INSERT INTO project.person (firstname,lastname,roleid) VALUES (?,?,?)"); if (newSplittedLine.size() == 4) { preparedStatementNewPerson.setString(1, newSplittedLine.get(0)); preparedStatementNewPerson.setString(2, studentLastname); preparedStatementNewPerson.setInt(3, 2); } else if (newSplittedLine.size() == 5) { preparedStatementNewPerson.setString(1, newSplittedLine.get(0)); preparedStatementNewPerson.setString(2, studentLastname); preparedStatementNewPerson.setInt(3, 2); } preparedStatementNewPerson.executeUpdate(); PreparedStatement preparedStatementLastPerson = conn.prepareStatement( "SELECT id FROM project.person where firstname=? and lastname=? and roleid=?"); preparedStatementLastPerson.setString(1, newSplittedLine.get(0)); preparedStatementLastPerson.setString(2, studentLastname); preparedStatementLastPerson.setInt(3, 2); Integer personId = 0; ResultSet resultPerson = preparedStatementLastPerson.executeQuery(); if (resultPerson.next()) { personId = resultPerson.getInt(1); System.out.println("GROUP CODE " + code); Group1 group = groupService.getByGroupcode(code); if (group != null) { System.out.println("GROUP ID " + group.getId()); PreparedStatement preparedStatementSetStudent = conn.prepareStatement( "INSERT INTO project.student (personid,groupid,code) values(?,?,?)"); preparedStatementSetStudent.setInt(1, personId); preparedStatementSetStudent.setInt(2, group.getId()); preparedStatementSetStudent.setString(3, code); preparedStatementSetStudent.executeUpdate(); System.out.println("NEW STUDENT " + personId); } else { System.out.println("GROUP WAS NULL "); PreparedStatement preparedStatementGroup = conn.prepareStatement( "INSERT INTO 
project.group (groupcode) VALUES (?)"); preparedStatementGroup.setString(1, code); preparedStatementGroup.executeUpdate(); System.out.println("NEW GROUP " + code); code = code.replace(" ", ""); PreparedStatement preparedStatementLastGroup = conn .prepareStatement("SELECT id FROM project.group where groupcode=?"); preparedStatementLastGroup.setString(1, code); System.out.println("SELECT id FROM project.group where groupcode=" + code); int groupId = 0; ResultSet resultLastGroup = preparedStatementLastGroup.executeQuery(); if (resultLastGroup.next()) { PreparedStatement preparedStatementStudentExist = conn.prepareStatement( "SELECT * FROM project.student where personid=?"); preparedStatementStudentExist.setInt(1, personId); ResultSet studentExists = preparedStatementStudentExist.executeQuery(); if (!studentExists.next()) { groupId = resultLastGroup.getInt(1); PreparedStatement preparedStatementSetStudent = conn .prepareStatement( "INSERT INTO project.student (personid,groupid,code) values(?,?,?)"); preparedStatementSetStudent.setInt(1, personId); preparedStatementSetStudent.setInt(2, groupId); preparedStatementSetStudent.setString(3, code); preparedStatementSetStudent.executeUpdate(); System.out.println("NEW STUDENT " + personId); } } } } } } } // System.out.println(); } } conn.commit(); conn.close(); } catch (IOException e) { Logger.getLogger(ReadDataFromExcelServiceImpl.class.getName()).log(Level.SEVERE, null, e); } catch (SQLException ex) { Logger.getLogger(ReadDataFromWordServiceImpl.class.getName()).log(Level.SEVERE, null, ex); } }