List of usage examples for the com.itextpdf.text.pdf.parser.TextExtractionStrategy method getResultantText()
/**
 * Returns the text held by this extraction strategy.
 *
 * NOTE(review): presumably the text accumulated from the content processed so far; confirm
 * against the iText documentation.
 *
 * @return the resultant text
 */
public String getResultantText();
From source file:br.com.smarttaco.util.HelenaBarbosa.java
/** * pdf2txt/* ww w . j a va 2s .com*/ * * @param pdf * @param paginas se for <code>null</code> realiza leitura completa. * @param txt * @throws FileNotFoundException * @throws IOException */ private static void pdf2txt(final String pdf, List<Integer> paginas, final String txt) throws FileNotFoundException, IOException { PdfReader reader = new PdfReader(pdf); //System.out.println(reader.getInfo().toString()); if (paginas != null) { reader.selectPages(paginas); } PdfReaderContentParser parser = new PdfReaderContentParser(reader); PrintWriter out = new PrintWriter(txt, "UTF-8"); TextExtractionStrategy strategy; for (int i = 1; i <= reader.getNumberOfPages(); i++) { strategy = parser.processContent(i, new SimpleTextExtractionStrategy()); out.println(strategy.getResultantText()); } out.flush(); out.close(); reader.close(); }
From source file:com.cloudhub.util.PDFToText.java
License:Apache License
/** * Parses a PDF to a plain text file./*from w w w . j ava 2 s. c o m*/ * * @param source the original PDF * @param destination the resulting text * @throws IOException */ public static void parsePdf(String source, String destination) throws IOException { PdfReader reader = new PdfReader(source); PdfReaderContentParser parser = new PdfReaderContentParser(reader); PrintWriter out = new PrintWriter(new FileOutputStream(destination)); TextExtractionStrategy strategy; for (int i = 1; i <= reader.getNumberOfPages(); i++) { strategy = parser.processContent(i, new SimpleTextExtractionStrategy()); out.println(strategy.getResultantText()); } out.flush(); out.close(); }
From source file:com.erikHolz.vertretungsplan.Converter.java
License:Open Source License
public void parsePDF() throws IOException { PdfReader reader = new PdfReader(fileDest + ".pdf"); PdfReaderContentParser parser = new PdfReaderContentParser(reader); PrintWriter out = new PrintWriter(new FileOutputStream(fileDest + "__.txt")); TextExtractionStrategy strategy; for (int intI = 1; intI <= reader.getNumberOfPages(); intI++) { strategy = parser.processContent(intI, new LocationTextExtractionStrategy()); out.println(strategy.getResultantText()); }//w w w . jav a2s . co m out.flush(); out.close(); reader.close(); // lschen der ursprnglichen pdf File f = new File(fileDest + ".pdf"); if (f.exists()) f.delete(); }
From source file:com.joanzapata.PDFViewActivity.java
License:Open Source License
public void parsePdf(String pdf2, String txt) throws IOException { // String//from ww w.j a v a 2 s. c o m // pdf1=Environment.getExternalStoragePublicDirectory(Environment.DIRECTORY_DOWNLOADS)+ // File.separator + "about.pdf"; PdfReader reader = new PdfReader(pdf2); PdfReaderContentParser parser = new PdfReaderContentParser(reader); File file = getFileStreamPath("test.txt"); if (!file.exists()) { file.createNewFile(); } FileOutputStream writer = openFileOutput(file.getName(), Context.MODE_PRIVATE); // PrintWriter out = new PrintWriter(new FileOutputStream(txt)); TextExtractionStrategy strategy; for (int i = 1; i <= pageNumber; i++) { strategy = parser.processContent(i, new SimpleTextExtractionStrategy()); writer.write(strategy.getResultantText().getBytes()); writer.flush(); } writer.close(); reader.close(); String totalString = readFromFile(); System.out.println(totalString); totalContent = totalString; String word = "Number of words " + wordCount(totalString) + "\n"; String averageWords = "Average length of words " + " " + averageWords(totalString) + "\n"; String sentenseCount = "Number of sentences " + sentenseCount(totalString) + "\n"; String averageSentenses = "Average length of sentences " + averageSentense(totalString) + "\n"; // String complexity= String readability = " Readability Index " + getReadability(totalString) + "\n"; String subject = "Subject Area " + etSubject.getText().toString() + "\n"; String article = "Type of the Article : " + etArticle.getText().toString() + "\n"; String pages = " Pages " + startPage + " to " + pageNumber + "\n"; String book = "book name " + etBookName.getText().toString() + "\n"; String readername = "reader name " + etReaderName.getText().toString() + "\n"; totalInfo = readername + book + subject + "\n" + article + pages + word + averageWords + sentenseCount + averageSentenses + readability; displayMessage(totalInfo); }
From source file:com.sustainalytics.crawlerfilter.PDFTitleGeneration.java
License:Apache License
public static String extractITextText(String pdf) { PdfReader reader = null;/*from w ww. j a v a2s. c om*/ try { reader = new PdfReader(pdf); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } PdfReaderContentParser parser = new PdfReaderContentParser(reader); TextExtractionStrategy strategy; String text = ""; for (int i = 1; i <= reader.getNumberOfPages(); i++) { try { strategy = parser.processContent(i, new SimpleTextExtractionStrategy()); text += strategy.getResultantText(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } reader.close(); return text; }
From source file:com.sustainalytics.crawlerfilter.PDFtoText.java
License:Apache License
/**
 * Extracts the text of every page of a PDF and returns it as one string.
 *
 * Extraction is best-effort: a page that fails to parse is logged and skipped, and the
 * remaining pages are still processed.
 *
 * @param pdf path of the PDF to read
 * @return the concatenated page texts; an empty string if the PDF could not be opened
 */
public static String extractITextText(String pdf) {
    PdfReader reader;
    try {
        reader = new PdfReader(pdf);
    } catch (IOException e) {
        logger.info("Error in reading file with iText parser\n");
        // Previously execution continued with a null reader and failed with a
        // NullPointerException; return the empty result instead.
        return "";
    }

    StringBuilder text = new StringBuilder();
    try {
        PdfReaderContentParser parser = new PdfReaderContentParser(reader);
        for (int i = 1; i <= reader.getNumberOfPages(); i++) {
            try {
                TextExtractionStrategy strategy =
                        parser.processContent(i, new SimpleTextExtractionStrategy());
                text.append(strategy.getResultantText());
            } catch (IOException e) {
                logger.info("Error in parsing with iText parser\n");
            }
        }
        // Logged once per document; it was previously emitted once per page.
        logger.info("PDF text extracted from " + pdf + "\n");
    } finally {
        // Release the PDF even if an unchecked exception escapes the loop.
        reader.close();
    }
    return text.toString();
}
From source file:conversorpdf.Conversor.Conversor.java
/**
 * Parses a PDF to a plain text file, writing each page's text followed by a line separator.
 *
 * @param pdf the original PDF
 * @param txt the resulting text
 * @param removerAcento if {@code true}, each page's text is passed through
 *     {@code removeAcentos} before being written
 * @return always {@code true}; failures are reported by exception
 * @throws IOException if the PDF cannot be read or parsed, or the text file cannot be opened
 */
public boolean parsePdf(String pdf, String txt, boolean removerAcento) throws IOException {
    PdfReader reader = new PdfReader(pdf);
    try {
        PdfReaderContentParser parser = new PdfReaderContentParser(reader);
        PrintWriter out = new PrintWriter(new FileOutputStream(txt));
        try {
            for (int i = 1; i <= reader.getNumberOfPages(); i++) {
                TextExtractionStrategy strategy =
                        parser.processContent(i, new SimpleTextExtractionStrategy());
                if (removerAcento) {
                    out.println(this.removeAcentos(strategy.getResultantText()));
                } else {
                    out.println(strategy.getResultantText());
                }
            }
            out.flush();
        } finally {
            // Release the output file even when a page fails to parse.
            out.close();
        }
    } finally {
        // The reader was previously never closed.
        reader.close();
    }
    return true;
}
From source file:coviam.pdf.PdfParser.java
public void getText() { String pdf = "/home/amit/NetBeansProjects/ResumeParser/data/resumes/ejd1.pdf"; String text = "/home/amit/NetBeansProjects/ResumeParser/data/resumes/edj1.txt"; StringBuffer textBuffer = new StringBuffer(); String resultText = ""; PdfReader reader;//from w ww . j a v a2 s .c o m try { reader = new PdfReader(pdf); PdfReaderContentParser contentParser = new PdfReaderContentParser(reader); PrintWriter printWriter = new PrintWriter(new FileOutputStream(text)); TextExtractionStrategy strategy; for (int i = 1; i <= reader.getNumberOfPages(); i++) { strategy = contentParser.processContent(i, new SimpleTextExtractionStrategy()); textBuffer.append(strategy.getResultantText()); } resultText = textBuffer.toString(); resultText = resultText.replaceAll("-\n", ""); System.out.println("-->" + resultText); StringTokenizer stringTokenizer = new StringTokenizer(resultText, "\n"); PrintWriter lineWriter = new PrintWriter( new FileOutputStream("/home/amit/NetBeansProjects/ResumeParser/data/resumes/edj1.txt")); while (stringTokenizer.hasMoreTokens()) { String curToken = stringTokenizer.nextToken(); lineWriter.println("line-->" + curToken); } lineWriter.flush(); lineWriter.close(); System.out.flush(); System.out.close(); } catch (IOException ioe) { } }
From source file:de.offis.health.icardea.cied.pdf.extractor.PDFiText5Extractor.java
License:GNU General Public License
public String getText(int pageNumber) throws IOException, Exception { String returnValue = null;//ww w. j a v a2 s .c o m if (pdfReader != null) { int numberOfPages = getNumberOfPages(); if (pageNumber > 0 && pageNumber <= numberOfPages) { TextExtractionStrategy strategy; PdfReaderContentParser parser = new PdfReaderContentParser(pdfReader); strategy = parser.processContent(pageNumber, new SimpleTextExtractionStrategy()); if (strategy != null && strategy.getResultantText().trim().length() > 0) { returnValue = PAGE_START_MARKER + strategy.getResultantText(); } // end if } else { // TODO: Add own exception. throw new Exception("The given page number (" + pageNumber + ") " + "is not in the range of valid pages (1.." + numberOfPages + ")."); } // end if..else } else { // TODO: Add own exception. throw new Exception("There is no open PDF to work with."); } // end if..else return returnValue; }
From source file:helper.PdfText.java
License:Apache License
/** * @param pdfFile this file will be extracted. * @return the plain text of the pdf/*from ww w. j a v a 2s .c o m*/ */ public String itext(File pdfFile) { PdfReader reader; try { reader = new PdfReader(pdfFile.getAbsolutePath()); PdfReaderContentParser parser = new PdfReaderContentParser(reader); StringBuffer buf = new StringBuffer(); TextExtractionStrategy strategy; for (int i = 1; i <= reader.getNumberOfPages(); i++) { strategy = parser.processContent(i, new SimpleTextExtractionStrategy()); buf.append(strategy.getResultantText()); } return buf.toString(); } catch (IOException e) { throw new HttpArchiveException(500, e); } }