Usage examples for com.itextpdf.text.pdf.parser.TextExtractionStrategy.getResultantText()
/**
 * Returns the text held by this extraction strategy as a single string.
 *
 * <p>NOTE(review): presumably this is the text gathered while the strategy
 * processed page content; confirm against the iText API documentation.
 *
 * @return the resultant text
 */
public String getResultantText();
From source file:integrator.Pdf.java
/**
 * Parses a PDF to a plain text file, writing the text of each page in page order.
 *
 * @param pdf the original PDF
 * @param txt the resulting text
 * @throws IOException if the PDF cannot be read or the text file cannot be opened
 */
public void parsePdf(String pdf, String txt) throws IOException {
    PdfReader reader = new PdfReader(pdf);
    try {
        PdfReaderContentParser parser = new PdfReaderContentParser(reader);
        PrintWriter out = new PrintWriter(new FileOutputStream(txt));
        try {
            // iText page numbers are 1-based.
            for (int i = 1; i <= reader.getNumberOfPages(); i++) {
                TextExtractionStrategy strategy =
                        parser.processContent(i, new SimpleTextExtractionStrategy());
                out.println(strategy.getResultantText());
            }
            out.flush();
        } finally {
            // Release the output file even when a page fails to parse.
            out.close();
        }
    } finally {
        // Release the PDF even when extraction or file creation fails.
        reader.close();
    }
}
From source file:net.sf.regain.crawler.preparator.PdfItextPreparator.java
License:Open Source License
/** * Prpariert ein Dokument fr die Indizierung. * * @param rawDocument Das zu prpariernde Dokument. * @throws net.sf.regain.RegainException Wenn die Prparation fehl schlug. *//*w w w . j a va 2 s.c o m*/ @SuppressWarnings("unchecked") public void prepare(RawDocument rawDocument) throws RegainException { String url = rawDocument.getUrl(); InputStream stream = null; PdfReader reader = null; try { // Create a InputStream that reads the content. stream = rawDocument.getContentAsStream(); // Parse the content reader = new PdfReader(stream); if (reader.isEncrypted()) { reader = new PdfReader(stream, OWNER_PASSWORD); } PdfReaderContentParser parser = new PdfReaderContentParser(reader); TextExtractionStrategy strategy; StringBuilder stringBuilder = new StringBuilder(); for (int i = 1; i <= reader.getNumberOfPages(); i++) { strategy = parser.processContent(i, new SimpleTextExtractionStrategy()); stringBuilder.append(strategy.getResultantText()); } setCleanedContent(stringBuilder.toString()); // Get metadata Map<String, String> info = reader.getInfo(); StringBuilder metaData = new StringBuilder(); metaData.append("p."); metaData.append(Integer.toString(reader.getNumberOfPages())); metaData.append(" "); // Check if fields are null String author = info.get("Author"); String creator = info.get("Creator"); String subject = info.get("Subject"); String keywords = info.get("Keywords"); String title = info.get("Title"); if (author != null) { metaData.append(author); metaData.append(" "); } if (creator != null) { metaData.append(creator); metaData.append(" "); } if (subject != null) { metaData.append(subject); metaData.append(" "); } if (keywords != null) { metaData.append(keywords); metaData.append(" "); } if (title != null) { setTitle(title); } setCleanedMetaData(metaData.toString()); if (log.isDebugEnabled()) { log.debug("Extracted meta data ::" + getCleanedMetaData() + ":: from " + rawDocument.getUrl()); } } catch (IOException exc) { throw new RegainException("Error reading 
document: " + url, exc); } catch (Exception exc) { // They didn't supply a password and the default of "" was wrong. throw new RegainException("Unknown error parsing document: " + url, exc); } finally { if (stream != null) { try { stream.close(); } catch (Exception exc) { } } if (reader != null) { try { reader.close(); } catch (Exception exc) { } } } }
From source file:org.archive.modules.extractor.ExtractorPDFContent.java
License:Apache License
/**
 * Extracts the text of a single page.
 *
 * @param documentReader reader for the PDF document
 * @param pageNum number of the page to extract, passed straight to the content parser
 * @return the page text, or an empty string when parsing the page raised an
 *         {@link IOException} (the failure is logged at WARNING level)
 */
public String extractPageText(PdfReader documentReader, int pageNum) {
    PdfReaderContentParser contentParser = new PdfReaderContentParser(documentReader);
    try {
        TextExtractionStrategy pageStrategy =
                contentParser.processContent(pageNum, new SimpleTextExtractionStrategy());
        return pageStrategy.getResultantText();
    } catch (IOException e) {
        LOGGER.log(Level.WARNING, "Failed to parse pdf text in " + Thread.currentThread().getName(), e);
        return "";
    }
}
From source file:pdfextract.ControlPDF.java
public List<String> parsePdfToArrayList(String pdfPath) throws IOException { PdfReader reader = new PdfReader(pdfPath); PdfReaderContentParser parser = new PdfReaderContentParser(reader); List<String> arrayOftext = new ArrayList<String>(); TextExtractionStrategy strategy; for (int i = 1; i <= reader.getNumberOfPages(); i++) { strategy = parser.processContent(i, new SimpleTextExtractionStrategy()); arrayOftext.add(strategy.getResultantText()); }//w w w. jav a2s . c o m reader.close(); return arrayOftext; }
From source file:pdfextract.ControlPDF.java
public String pdfToDIrAndimgToString(String pdfPath, File destnationPath) throws IOException, DocumentException, Exception { //boolean success; String kk = ""; String desPath = null;/*from w ww .jav a2 s. c o m*/ PdfReader reader = new PdfReader(pdfPath); PdfReaderContentParser parser = new PdfReaderContentParser(reader); try { desPath = destnationPath.getName(); new File(desPath.replace(".txt", "")).mkdir(); PrintWriter out = new PrintWriter( new FileOutputStream(desPath.replace(".txt", "") + "/" + destnationPath.getName())); TextExtractionStrategy strategy; for (int i = 1; i <= reader.getNumberOfPages(); i++) { strategy = parser.processContent(i, new SimpleTextExtractionStrategy()); out.println(i + " " + strategy.getResultantText()); } reader.close(); out.flush(); out.close(); extractImagesFromPdf(pdfPath, desPath.replace(".txt", "") + "/" + desPath.replace(".txt", "")); File f = new File(desPath.replace(".txt", "") + "/" + desPath.replace(".txt", "") + "-16.null"); kk = imageToBase64String(f); /*File deletedFolder = new File(destnationPath.getParent()); System.out.println("here"+destnationPath.getParent()); deletedFolder.delete(); System.out.println("is delete+ " + deletedFolder.delete());*/ // FileUtils.deleteDirectory(new File(deletedFolder.getParent())); //success = true; } catch (Exception ex) { //success = false; } return kk; }
From source file:pdfextract.ExtractInfo.java
public List<String> parsePdf(String pdfPath) throws IOException { PdfReader reader = new PdfReader(pdfPath); PdfReaderContentParser parser = new PdfReaderContentParser(reader); //String [] arrayOftext= new String[300]; List<String> arrayOftext = new ArrayList<String>(); // PrintWriter out = new PrintWriter(new FileOutputStream(destnationPath)); TextExtractionStrategy strategy; for (int i = 1; i <= reader.getNumberOfPages(); i++) { strategy = parser.processContent(i, new SimpleTextExtractionStrategy()); // out.println(i + " " + strategy.getResultantText()); arrayOftext.add(strategy.getResultantText()); }//from ww w. j a v a 2s. c o m reader.close(); return arrayOftext; }
From source file:textextractor.PDFManager.java
/** * Parses a PDF to a plain text file./*from ww w. j ava2s . c o m*/ * * @param pdf the original PDF * @throws IOException */ public ArrayList parsePdf(String pdf) throws IOException { PdfReader reader = new PdfReader(pdf); PdfReaderContentParser parser = new PdfReaderContentParser(reader); TextExtractionStrategy strategy; for (int i = 1; i <= reader.getNumberOfPages(); i++) { strategy = parser.processContent(i, new SimpleTextExtractionStrategy()); System.out.println(strategy.getResultantText()); listPdf.add(strategy.getResultantText()); } return listPdf; }
From source file:tutorial.PDFtoText.java
/**
 * Converts the PDF named in {@code txtDirektori} into a text file.
 *
 * <p>The output path is the input path with every occurrence of {@code "pdf"}
 * replaced by {@code "txt"}. The text of all pages is concatenated and written
 * one line per newline-separated token; {@link StringTokenizer} yields no empty
 * tokens, so blank lines are dropped.
 *
 * <p>An {@link IOException} raised while reading the PDF or writing the text is
 * printed to standard error rather than propagated.
 *
 * @throws IOException if the output file cannot be created
 */
public void convertPDFtoText() throws IOException {
    /* "pdf" holds the path of the PDF file to read. */
    String pdf = txtDirektori.getText();

    /* Create the ".txt" output file. */
    // NOTE(review): String.replace substitutes every "pdf" in the path, including
    // directory names, not just the extension - confirm this is intended.
    File namaFile = new File(txtDirektori.getText().replace("pdf", "txt"));
    if (namaFile.createNewFile()) {
        System.out.println("File .txt berhasil dibuat.");
    }

    try {
        /* Read the PDF with the iText library. */
        StringBuilder text = new StringBuilder();
        PdfReader reader = new PdfReader(pdf);
        try {
            PdfReaderContentParser parser = new PdfReaderContentParser(reader);
            // iText page numbers are 1-based.
            for (int i = 1; i <= reader.getNumberOfPages(); i++) {
                TextExtractionStrategy strategy =
                        parser.processContent(i, new SimpleTextExtractionStrategy());
                text.append(strategy.getResultantText());
            }
        } finally {
            // The reader was previously never closed.
            reader.close();
        }
        String resultText = text.toString();

        /* Write the extracted text to the text file. */
        StringTokenizer stringTokenizer = new StringTokenizer(resultText, "\n");
        PrintWriter lineWriter = new PrintWriter(new FileOutputStream(namaFile));
        try {
            while (stringTokenizer.hasMoreTokens()) {
                lineWriter.println(stringTokenizer.nextToken());
            }
            lineWriter.flush();
        } finally {
            lineWriter.close();
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
From source file:uk.bl.dpt.qa.flint.wrappers.iTextWrapper.java
License:Apache License
/** * Extracts text from a PDF./*from w w w . j ava 2 s . com*/ * @param pFile input file * @param pOutput output file * @param pOverwrite whether or not to overwrite an existing output file * @return true if converted ok, otherwise false */ public boolean extractTextFromPDF(File pFile, File pOutput, boolean pOverwrite) { if (pOutput.exists() & (!pOverwrite)) return false; boolean ret = true; PrintWriter pw = null; PdfReader reader = null; try { pw = new PrintWriter(new FileWriter(pOutput)); reader = new PdfReader(pFile.getAbsolutePath()); PdfReaderContentParser parser = new PdfReaderContentParser(reader); TextExtractionStrategy strategy; for (int i = 0; i < reader.getNumberOfPages(); i++) { try { //page numbers start at 1 strategy = parser.processContent((i + 1), new SimpleTextExtractionStrategy()); //write text out to file pw.println(strategy.getResultantText()); } catch (ExceptionConverter e) { e.printStackTrace(); ret = false; pw.println("iText Exception: Page " + (i + 1) + ": " + e.getClass().getName() + ": " + e.getMessage()); } } } catch (IOException e) { ret = false; // TODO Auto-generated catch block e.printStackTrace(); } finally { if (pw != null) pw.close(); if (reader != null) reader.close(); } return ret; }