List of usage examples for org.apache.poi.hwpf.extractor.WordExtractor.getParagraphText()
public String[] getParagraphText()
From source file:at.tugraz.sss.serv.SSFileU.java
License:Apache License
public static void writePDFFromDoc(final String docFilePath, final String pdfFilePath) throws Exception { final Document document = new Document(); final POIFSFileSystem fs = new POIFSFileSystem(openFileForRead(docFilePath)); final HWPFDocument word = new HWPFDocument(fs); final WordExtractor we = new WordExtractor(word); final OutputStream out = openOrCreateFileWithPathForWrite(pdfFilePath); final PdfWriter writer = PdfWriter.getInstance(document, out); final Range range = word.getRange(); document.open();/*from ww w .ja va 2s . co m*/ writer.setPageEmpty(true); document.newPage(); writer.setPageEmpty(true); String[] paragraphs = we.getParagraphText(); for (int i = 0; i < paragraphs.length; i++) { org.apache.poi.hwpf.usermodel.Paragraph pr = range.getParagraph(i); // CharacterRun run = pr.getCharacterRun(i); // run.setBold(true); // run.setCapitalized(true); // run.setItalic(true); paragraphs[i] = paragraphs[i].replaceAll("\\cM?\r?\n", ""); System.out.println("Length:" + paragraphs[i].length()); System.out.println("Paragraph" + i + ": " + paragraphs[i].toString()); // add the paragraph to the document document.add(new Paragraph(paragraphs[i])); } document.close(); }
From source file:at.tugraz.sss.serv.util.SSFileU.java
License:Apache License
public static void writePDFFromDoc(final String docFilePath, final String pdfFilePath) throws SSErr { try {// w ww .j a va2 s .c om final Document document = new Document(); final POIFSFileSystem fs = new POIFSFileSystem(openFileForRead(docFilePath)); final HWPFDocument word = new HWPFDocument(fs); final WordExtractor we = new WordExtractor(word); final OutputStream out = openOrCreateFileWithPathForWrite(pdfFilePath); final PdfWriter writer = PdfWriter.getInstance(document, out); final Range range = word.getRange(); document.open(); writer.setPageEmpty(true); document.newPage(); writer.setPageEmpty(true); String[] paragraphs = we.getParagraphText(); for (int i = 0; i < paragraphs.length; i++) { org.apache.poi.hwpf.usermodel.Paragraph pr = range.getParagraph(i); // CharacterRun run = pr.getCharacterRun(i); // run.setBold(true); // run.setCapitalized(true); // run.setItalic(true); paragraphs[i] = paragraphs[i].replaceAll("\\cM?\r?\n", ""); System.out.println("Length:" + paragraphs[i].length()); System.out.println("Paragraph" + i + ": " + paragraphs[i].toString()); // add the paragraph to the document document.add(new Paragraph(paragraphs[i])); } document.close(); } catch (Exception error) { SSServErrReg.regErrThrow(error); } }
From source file:br.com.schumaker.beta.doc.ReadDocMaster.java
public static void main(String[] args) { try {//from w ww.j a v a 2 s . co m File file = new File( "/users/hudsonschumaker/downloads/Guisi01206us - Jira Guide for P3 PECB enhancement requests.doc"); FileInputStream fis = new FileInputStream(file.getAbsolutePath()); HWPFDocument doc = new HWPFDocument(fis); WordExtractor extractor = new WordExtractor(doc); for (String rawText : extractor.getParagraphText()) { String text = extractor.stripFields(rawText); if (text.length() > 10) System.out.println(text.trim()); } } catch (Exception exep) { } }
From source file:com.jaeksoft.searchlib.parser.DocParser.java
License:Open Source License
private void currentWordExtraction(ParserResultItem result, InputStream inputStream) throws IOException { WordExtractor word = null; try {//from w w w. j av a 2s. c o m word = new WordExtractor(inputStream); SummaryInformation info = word.getSummaryInformation(); if (info != null) { result.addField(ParserFieldEnum.title, info.getTitle()); result.addField(ParserFieldEnum.author, info.getAuthor()); result.addField(ParserFieldEnum.subject, info.getSubject()); } String[] paragraphes = word.getParagraphText(); for (String paragraph : paragraphes) { String[] frags = paragraph.split("\\n"); for (String frag : frags) result.addField(ParserFieldEnum.content, StringUtils.replaceConsecutiveSpaces(frag, " ")); } } finally { IOUtils.close(word); } }
From source file:com.opensearchserver.extractor.parser.Doc.java
License:Apache License
private void currentWordExtraction(InputStream inputStream) throws IOException { WordExtractor word = null; try {//from www . java 2 s .co m word = new WordExtractor(inputStream); SummaryInformation info = word.getSummaryInformation(); if (info != null) { metas.add(TITLE, info.getTitle()); metas.add(AUTHOR, info.getAuthor()); metas.add(SUBJECT, info.getSubject()); metas.add(CREATION_DATE, info.getCreateDateTime()); metas.add(MODIFICATION_DATE, info.getLastSaveDateTime()); metas.add(KEYWORDS, info.getKeywords()); } ParserDocument document = getNewParserDocument(); String[] paragraphes = word.getParagraphText(); for (String paragraph : paragraphes) document.add(CONTENT, paragraph); document.add(LANG_DETECTION, languageDetection(CONTENT, 10000)); } finally { IOUtils.closeQuietly(word); } }
From source file:com.pdf.GetPdf.java
/**
 * Downloads a Word document and appends its paragraph text to the given document.
 *
 * <p>When {@code type} is {@code "doc"} the stream is parsed as a binary Word file
 * and line terminators are stripped from each paragraph. Any other value is parsed
 * as an OOXML Word file. In both cases one {@code Paragraph} is added per Word
 * paragraph. Table elements are not converted, as in the original implementation.
 *
 * @param document target document that receives the paragraphs
 * @param url      location of the Word document
 * @param type     {@code "doc"} for binary Word files; anything else selects the OOXML parser
 * @throws IOException       if the URL cannot be opened or the Word document cannot be read
 * @throws DocumentException if a paragraph cannot be added to {@code document}
 */
public static void docConvert(Document document, String url, String type)
        throws IOException, DocumentException {

    // The remote stream was never closed before; try-with-resources releases it on every path.
    try (java.io.InputStream in = new URL(url).openStream()) {

        // Constant-first comparison: a null type falls through to the OOXML branch instead of an NPE.
        if ("doc".equals(type)) {
            WordExtractor we = new WordExtractor(new HWPFDocument(in));
            for (String paragraph : we.getParagraphText()) {
                // "\cM" is control-M (carriage return): drops CR/CRLF/LF sequences.
                document.add(new Paragraph(paragraph.replaceAll("\\cM?\r?\n", "")));
            }
        } else {
            XWPFDocument wordDoc = new XWPFDocument(in);
            for (IBodyElement content : wordDoc.getBodyElements()) {
                if (content.getElementType() == BodyElementType.PARAGRAPH) {
                    // Add this element only. Iterating content.getBody().getParagraphs()
                    // here re-emitted every paragraph once per paragraph element.
                    XWPFParagraph para = (XWPFParagraph) content;
                    document.add(new Paragraph(para.getParagraphText()));
                }
                // TABLE elements: the original walked rows and cells without emitting
                // anything, so tables remain unconverted.
            }
        }
    }
}
From source file:com.unsa.view.MainView.java
License:Creative Commons License
/**
 * Converts a binary Word file into a PDF written next to it.
 *
 * <p>The output path is the input's absolute path with its last three characters
 * replaced by {@code "pdf"}. Line terminators are stripped from each paragraph
 * before it is added. Failures are printed to standard error and not rethrown.
 *
 * @param file1 the Word file to convert
 */
private void DocConverterPDF(File file1) {
    com.lowagie.text.Document document = new com.lowagie.text.Document();
    try {
        String sourcePath = file1.getAbsolutePath();
        System.out.println(sourcePath);

        // Input stream and POI file system were never closed before.
        try (FileInputStream in = new FileInputStream(sourcePath);
             NPOIFSFileSystem fs = new NPOIFSFileSystem(in)) {

            HWPFDocument doc = new HWPFDocument(fs.getRoot());
            WordExtractor we = new WordExtractor(doc);

            // Swap the three-character extension for "pdf" (keeps the dot).
            String output = sourcePath.substring(0, sourcePath.length() - 3);

            // Output stream was never closed on failure before.
            try (OutputStream fileout = new FileOutputStream(new File(output + "pdf"))) {
                PdfWriter writer = PdfWriter.getInstance(document, fileout);
                try {
                    document.open();

                    // Page-empty flags and the explicit newPage() are kept exactly as in the original.
                    writer.setPageEmpty(true);
                    document.newPage();
                    writer.setPageEmpty(true);

                    for (String paragraph : we.getParagraphText()) {
                        // "\cM" is control-M (carriage return): drops CR/CRLF/LF sequences.
                        document.add(new Paragraph(paragraph.replaceAll("\\cM?\r?\n", "")));
                    }
                } finally {
                    // Close the document while the stream is still open so buffered
                    // PDF content can be flushed into it.
                    document.close();
                }
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
From source file:cv_extractor.DocReader.java
protected static void readDocFile(File localFile) { try {//w ww . j a va2 s . c om //Create a input stream to read file FileInputStream fis = new FileInputStream(localFile.getAbsolutePath()); //For reading docx files HWPFDocument doc = new HWPFDocument(fis); WordExtractor we = new WordExtractor(doc); String[] paragraphs = we.getParagraphText(); System.out.println("Total no of paragraph " + paragraphs.length); for (String para : paragraphs) { //Compile the regex defined above Pattern r = Pattern.compile(pattern); //Check if any string matches the compiled pattern Matcher m = r.matcher(para); if (m.find()) { //m.group() Returns the input subsequence matched by the previous match data.add(m.group()); } } fis.close(); } catch (Exception e) { e.printStackTrace(); } }
From source file:File.DOC.ReadDoc.java
public void Read(String path, String namafile) { try {// ww w .j av a 2 s. c o m File file = new File(path + namafile + ".doc"); FileInputStream fis = new FileInputStream(file.getAbsolutePath()); HWPFDocument doc = new HWPFDocument(fis); WordExtractor we = new WordExtractor(doc); String[] paragraphs = we.getParagraphText(); System.out.println("Total no of paragraph " + paragraphs.length); for (String para : paragraphs) { System.out.println(para.toString()); } fis.close(); } catch (Exception ex) { ex.printStackTrace(); } }
From source file:insight.masters.policyanalytics.services.BranchingOriginStanfordKeywords.java
public static String readfromdoc(String datsetspath, String Document) { File file = null;//from w w w. ja v a 2s .c o m WordExtractor extractor = null; String extractedtext = ""; try { file = new File(datsetspath + Document); FileInputStream fis = new FileInputStream(file.getAbsolutePath()); HWPFDocument document = new HWPFDocument(fis); extractor = new WordExtractor(document); String[] fileData = extractor.getParagraphText(); for (int i = 0; i < fileData.length; i++) { if (fileData[i] != null) // System.out.print("{\"text\":\""); System.out.print(fileData[i].replace("\n", "").replace("\r", "")); extractedtext += fileData[i].replace("\n", "").replace("\r", ""); // System.out.print("\"}"); } } catch (Exception exep) { exep.printStackTrace(); } return extractedtext; }