Example usage for com.itextpdf.text.pdf.parser TextExtractionStrategy getResultantText

List of usage examples for com.itextpdf.text.pdf.parser TextExtractionStrategy getResultantText

Introduction

On this page you can find example usages of com.itextpdf.text.pdf.parser TextExtractionStrategy getResultantText.

Prototype

public String getResultantText();

Source Link

Document

Returns the result so far.

Usage

From source file:integrator.Pdf.java

/**
 * Parses a PDF to a plain text file, printing the extracted text of each
 * page on its own line group (one {@code println} per page).
 *
 * @param pdf the original PDF
 * @param txt the resulting text
 * @throws IOException if the PDF cannot be opened or parsed, or the output
 *         file cannot be opened
 */
public void parsePdf(String pdf, String txt) throws IOException {
    PdfReader reader = new PdfReader(pdf);
    try {
        PdfReaderContentParser parser = new PdfReaderContentParser(reader);
        PrintWriter out = new PrintWriter(new FileOutputStream(txt));
        try {
            // iText page numbers are 1-based.
            for (int i = 1; i <= reader.getNumberOfPages(); i++) {
                TextExtractionStrategy strategy =
                        parser.processContent(i, new SimpleTextExtractionStrategy());
                out.println(strategy.getResultantText());
            }
            out.flush();
        } finally {
            // Release the output file even when a page fails to parse.
            out.close();
        }
    } finally {
        // Release the PDF even when opening the output file or parsing fails.
        reader.close();
    }
}

From source file:net.sf.regain.crawler.preparator.PdfItextPreparator.java

License:Open Source License

/**
 * Prepares a document for indexing.
 * <p>
 * The text of every page is concatenated and stored via
 * {@code setCleanedContent}. A metadata string consisting of
 * {@code "p." + pageCount} followed by the Author, Creator, Subject and
 * Keywords info entries (each only when present) is stored via
 * {@code setCleanedMetaData}. The Title info entry, when present, is
 * stored via {@code setTitle}.
 *
 * @param rawDocument the document to prepare
 * @throws net.sf.regain.RegainException if the preparation failed
 */
@SuppressWarnings("unchecked")
public void prepare(RawDocument rawDocument) throws RegainException {
    String url = rawDocument.getUrl();

    InputStream stream = null;
    PdfReader reader = null;

    try {
        // Create an InputStream that reads the content.
        stream = rawDocument.getContentAsStream();

        // Parse the content
        reader = new PdfReader(stream);
        if (reader.isEncrypted()) {
            // Release the first reader before replacing it; previously it was
            // simply dropped and never closed.
            PdfReader unauthenticated = reader;
            reader = null;
            try {
                unauthenticated.close();
            } catch (Exception ignored) {
                // Best-effort cleanup; a failure here must not abort the retry.
            }
            // NOTE(review): 'stream' was already handed to the first PdfReader,
            // so it has presumably been consumed at this point - confirm that
            // re-reading it here can succeed.
            reader = new PdfReader(stream, OWNER_PASSWORD);
        }
        PdfReaderContentParser parser = new PdfReaderContentParser(reader);

        // iText page numbers are 1-based.
        StringBuilder stringBuilder = new StringBuilder();
        for (int i = 1; i <= reader.getNumberOfPages(); i++) {
            TextExtractionStrategy strategy =
                    parser.processContent(i, new SimpleTextExtractionStrategy());
            stringBuilder.append(strategy.getResultantText());
        }

        setCleanedContent(stringBuilder.toString());

        // Get metadata
        Map<String, String> info = reader.getInfo();

        StringBuilder metaData = new StringBuilder();
        metaData.append("p.");
        metaData.append(reader.getNumberOfPages());
        metaData.append(" ");

        // Append each info entry that is actually present, in a fixed order.
        String[] metaKeys = { "Author", "Creator", "Subject", "Keywords" };
        for (String key : metaKeys) {
            String value = info.get(key);
            if (value != null) {
                metaData.append(value);
                metaData.append(" ");
            }
        }

        String title = info.get("Title");
        if (title != null) {
            setTitle(title);
        }

        setCleanedMetaData(metaData.toString());
        if (log.isDebugEnabled()) {
            log.debug("Extracted meta data ::" + getCleanedMetaData() + ":: from " + rawDocument.getUrl());
        }

    } catch (IOException exc) {
        throw new RegainException("Error reading document: " + url, exc);
    } catch (Exception exc) {
        // Any non-I/O failure raised while parsing is wrapped with the
        // document URL for context.
        throw new RegainException("Unknown error parsing document: " + url, exc);

    } finally {
        if (stream != null) {
            try {
                stream.close();
            } catch (Exception ignored) {
                // Nothing useful can be done if closing the stream fails.
            }
        }
        if (reader != null) {
            try {
                reader.close();
            } catch (Exception ignored) {
                // Nothing useful can be done if closing the reader fails.
            }
        }
    }
}

From source file:org.archive.modules.extractor.ExtractorPDFContent.java

License:Apache License

/**
 * Extracts the text of a single page.
 *
 * @param documentReader reader for the PDF being processed
 * @param pageNum number of the page to extract
 * @return the page's text, or an empty string if parsing the page raised an
 *         {@link IOException} (the failure is logged at WARNING level)
 */
public String extractPageText(PdfReader documentReader, int pageNum) {
    PdfReaderContentParser contentParser = new PdfReaderContentParser(documentReader);
    try {
        TextExtractionStrategy extracted =
                contentParser.processContent(pageNum, new SimpleTextExtractionStrategy());
        return extracted.getResultantText();
    } catch (IOException e) {
        LOGGER.log(Level.WARNING, "Failed to parse pdf text in " + Thread.currentThread().getName(), e);
        return "";
    }
}

From source file:pdfextract.ControlPDF.java

/**
 * Extracts the text of every page of a PDF.
 *
 * @param pdfPath path of the PDF to read
 * @return a list holding one entry per page, in page order
 * @throws IOException if the PDF cannot be opened or parsed
 */
public List<String> parsePdfToArrayList(String pdfPath) throws IOException {
    PdfReader reader = new PdfReader(pdfPath);
    try {
        PdfReaderContentParser parser = new PdfReaderContentParser(reader);
        List<String> arrayOftext = new ArrayList<String>();

        // iText page numbers are 1-based.
        for (int i = 1; i <= reader.getNumberOfPages(); i++) {
            TextExtractionStrategy strategy =
                    parser.processContent(i, new SimpleTextExtractionStrategy());
            arrayOftext.add(strategy.getResultantText());
        }
        return arrayOftext;
    } finally {
        // Close the reader even when a page fails to parse.
        reader.close();
    }
}

From source file:pdfextract.ControlPDF.java

/**
 * Dumps a PDF's text and images into a directory and returns one extracted
 * image encoded via {@code imageToBase64String}.
 * <p>
 * The directory name is the destination file's name with {@code ".txt"}
 * removed; it is created relative to the current working directory. The
 * text file inside it contains one {@code "<page> <text>"} entry per page.
 * After the text is written, {@code extractImagesFromPdf} is invoked and the
 * file {@code <dir>/<dir>-16.null} is passed to {@code imageToBase64String}.
 * <p>
 * This is a best-effort operation: any exception raised after the PDF has
 * been opened is swallowed and an empty string is returned.
 *
 * @param pdfPath path of the PDF to read
 * @param destnationPath file whose name determines the output directory and
 *        text file name
 * @return the encoded image, or {@code ""} if any step failed
 * @throws IOException if the PDF cannot be opened
 * @throws DocumentException declared for callers; not thrown by the visible code
 * @throws Exception declared for callers; not thrown by the visible code
 */
public String pdfToDIrAndimgToString(String pdfPath, File destnationPath)
        throws IOException, DocumentException, Exception {
    String kk = "";
    PdfReader reader = new PdfReader(pdfPath);
    PdfReaderContentParser parser = new PdfReaderContentParser(reader);
    try {
        String baseName;
        try {
            String desPath = destnationPath.getName();
            baseName = desPath.replace(".txt", "");

            new File(baseName).mkdir();
            PrintWriter out = new PrintWriter(new FileOutputStream(baseName + "/" + desPath));
            try {
                // iText page numbers are 1-based.
                for (int i = 1; i <= reader.getNumberOfPages(); i++) {
                    TextExtractionStrategy strategy =
                            parser.processContent(i, new SimpleTextExtractionStrategy());
                    out.println(i + " " + strategy.getResultantText());
                }
                out.flush();
            } finally {
                // Release the text file even when a page fails to parse.
                out.close();
            }
        } finally {
            // The reader is closed before image extraction starts, and also
            // when the text step fails part-way.
            reader.close();
        }

        extractImagesFromPdf(pdfPath, baseName + "/" + baseName);
        File f = new File(baseName + "/" + baseName + "-16.null");
        kk = imageToBase64String(f);
    } catch (Exception ex) {
        // Deliberately best-effort: on any failure the empty default is returned.
    }

    return kk;
}

From source file:pdfextract.ExtractInfo.java

/**
 * Extracts the text of every page of a PDF.
 *
 * @param pdfPath path of the PDF to read
 * @return a list holding one entry per page, in page order
 * @throws IOException if the PDF cannot be opened or parsed
 */
public List<String> parsePdf(String pdfPath) throws IOException {
    PdfReader reader = new PdfReader(pdfPath);
    try {
        PdfReaderContentParser parser = new PdfReaderContentParser(reader);
        List<String> arrayOftext = new ArrayList<String>();

        // iText page numbers are 1-based.
        for (int i = 1; i <= reader.getNumberOfPages(); i++) {
            TextExtractionStrategy strategy =
                    parser.processContent(i, new SimpleTextExtractionStrategy());
            arrayOftext.add(strategy.getResultantText());
        }
        return arrayOftext;
    } finally {
        // Close the reader even when a page fails to parse.
        reader.close();
    }
}

From source file:textextractor.PDFManager.java

/**
 * Parses a PDF to a plain text file./*from ww  w.  j ava2s . c  o m*/
 *
 * @param pdf the original PDF
 * @throws IOException
 */
public ArrayList parsePdf(String pdf) throws IOException {
    PdfReader reader = new PdfReader(pdf);
    PdfReaderContentParser parser = new PdfReaderContentParser(reader);
    TextExtractionStrategy strategy;
    for (int i = 1; i <= reader.getNumberOfPages(); i++) {
        strategy = parser.processContent(i, new SimpleTextExtractionStrategy());
        System.out.println(strategy.getResultantText());
        listPdf.add(strategy.getResultantText());
    }
    return listPdf;
}

From source file:tutorial.PDFtoText.java

/**
 * Converts the PDF whose path is held by {@code txtDirektori} into a text
 * file, writing every non-empty line of the extracted text.
 * <p>
 * An {@link IOException} raised while reading the PDF or opening the text
 * file is caught and its stack trace printed; only a failure to create the
 * text file up front propagates to the caller.
 *
 * @throws IOException if the text file cannot be created
 */
public void convertPDFtoText() throws IOException {
    /* "pdf" holds the path of the PDF file to read. */
    String pdf = txtDirektori.getText();
    StringBuilder text = new StringBuilder();

    /* Create the ".txt" output file. */
    // NOTE(review): String.replace swaps every "pdf" occurrence in the path,
    // not just the extension - confirm this is intended.
    File namaFile = new File(txtDirektori.getText().replace("pdf", "txt"));
    if (namaFile.createNewFile()) {
        System.out.println("File .txt berhasil dibuat.");
    }

    try {
        /* Use the iText classes to read the PDF file. */
        PdfReader reader = new PdfReader(pdf);
        try {
            PdfReaderContentParser parser = new PdfReaderContentParser(reader);
            // iText page numbers are 1-based.
            for (int i = 1; i <= reader.getNumberOfPages(); i++) {
                TextExtractionStrategy strategy =
                        parser.processContent(i, new SimpleTextExtractionStrategy());
                text.append(strategy.getResultantText());
            }
        } finally {
            // The reader was previously never closed.
            reader.close();
        }

        /* Write the extracted text to the text file, one token per line. */
        // StringTokenizer skips empty tokens, so blank lines are not written.
        StringTokenizer stringTokenizer = new StringTokenizer(text.toString(), "\n");
        PrintWriter lineWriter = new PrintWriter(new FileOutputStream(namaFile));
        try {
            while (stringTokenizer.hasMoreTokens()) {
                lineWriter.println(stringTokenizer.nextToken());
            }
            lineWriter.flush();
        } finally {
            lineWriter.close();
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}

From source file:uk.bl.dpt.qa.flint.wrappers.iTextWrapper.java

License:Apache License

/**
 * Extracts text from a PDF./*from   w  w w  . j ava 2  s  .  com*/
 * @param pFile input file
 * @param pOutput output file
 * @param pOverwrite whether or not to overwrite an existing output file
 * @return true if converted ok, otherwise false
 */
public boolean extractTextFromPDF(File pFile, File pOutput, boolean pOverwrite) {
    if (pOutput.exists() & (!pOverwrite))
        return false;

    boolean ret = true;

    PrintWriter pw = null;
    PdfReader reader = null;

    try {
        pw = new PrintWriter(new FileWriter(pOutput));
        reader = new PdfReader(pFile.getAbsolutePath());
        PdfReaderContentParser parser = new PdfReaderContentParser(reader);
        TextExtractionStrategy strategy;
        for (int i = 0; i < reader.getNumberOfPages(); i++) {
            try {
                //page numbers start at 1
                strategy = parser.processContent((i + 1), new SimpleTextExtractionStrategy());
                //write text out to file
                pw.println(strategy.getResultantText());
            } catch (ExceptionConverter e) {
                e.printStackTrace();
                ret = false;
                pw.println("iText Exception: Page " + (i + 1) + ": " + e.getClass().getName() + ": "
                        + e.getMessage());
            }
        }
    } catch (IOException e) {
        ret = false;
        // TODO Auto-generated catch block
        e.printStackTrace();
    } finally {
        if (pw != null)
            pw.close();
        if (reader != null)
            reader.close();
    }

    return ret;
}