Example usage for com.itextpdf.text.pdf.parser PdfTextExtractor getTextFromPage

List of usage examples for com.itextpdf.text.pdf.parser PdfTextExtractor getTextFromPage

Introduction

On this page you can find example usages of com.itextpdf.text.pdf.parser PdfTextExtractor getTextFromPage.

Prototype

public static String getTextFromPage(PdfReader reader, int pageNumber) throws IOException 

Source Link

Document

Extract text from a specified page using the default strategy.

Usage

From source file:Reader.java

/**
 * Opens the given PDF and displays the text of its first page.
 *
 * <p>The name is appended to {@code bookNames}, {@code currentPageNum} is reset to 1 and the
 * extracted text is placed into {@code pageContentPane}.
 *
 * @param s file name handed to {@code PdfReader}
 * @throws IOException if the PDF cannot be opened or its text cannot be extracted
 */
public void showPdf(String s) throws IOException {
    bookNames.add(s);

    PdfReader pr = new PdfReader(s);
    String content;
    try {
        content = PdfTextExtractor.getTextFromPage(pr, 1);
    } finally {
        // The reader is not used after extraction; close it so the file handle is released.
        pr.close();
    }

    currentPageNum = 1;
    pageContentPane.setText(content);
}

From source file:Reader.java

/**
 * Advances to the next page of the PDF named by {@code fileName} and displays its text.
 *
 * <p>If {@code currentPageNum} is already at (or beyond) the page count, the page number is
 * left unchanged and the current page is displayed again.
 *
 * @throws IOException if the PDF cannot be opened or its text cannot be extracted
 */
public void nextPage() throws IOException {
    PdfReader pr = new PdfReader(fileName);
    try {
        // Only move forward when there is at least one more page to read.
        if (currentPageNum < pr.getNumberOfPages()) {
            ++currentPageNum;
        }

        // Both cases display whatever page currentPageNum now points at.
        pageContentPane.setText(PdfTextExtractor.getTextFromPage(pr, currentPageNum));
    } finally {
        // A fresh reader is opened on every call, so it must be closed on every call.
        pr.close();
    }
}

From source file:Reader.java

/**
 * Steps back to the previous page of the PDF named by {@code fileName} and displays its text.
 *
 * <p>If {@code currentPageNum} is already on the first page, or lies beyond the page count,
 * the page number is left unchanged and the current page is displayed again.
 *
 * @throws IOException if the PDF cannot be opened or its text cannot be extracted
 */
public void previousPage() throws IOException {
    PdfReader pr = new PdfReader(fileName);
    try {
        // "> 1" rather than "!= 1": a page number of 0 or less must never be decremented further.
        if (currentPageNum <= pr.getNumberOfPages() && currentPageNum > 1) {
            --currentPageNum;
        }

        // Both cases display whatever page currentPageNum now points at.
        pageContentPane.setText(PdfTextExtractor.getTextFromPage(pr, currentPageNum));
    } finally {
        // A fresh reader is opened on every call, so it must be closed on every call.
        pr.close();
    }
}

From source file:bpmlab.invioscript.ConstruirQualis.java

/**
 * Reads a hard-coded PDF file page by page and collects the text lines that pass a series of
 * format checks, plus a few hard-coded wrapped entries that are stitched back together.
 *
 * <p>Each page's text is split on {@code "\n"}; iteration starts at index 1, so the first line
 * of every page is never examined. Inside the loop a bare {@code Exception} is thrown to
 * reject a line, and the {@code catch} block then either rebuilds a known multi-line entry or
 * counts the line as invalid. Summary statistics are printed to standard output.
 *
 * <p>NOTE(review): the {@code PdfReader} is never closed.
 *
 * @return the accepted lines, or {@code null} if an {@code IOException} occurs
 */
public static List<String> primeiraValidacao() {
    try {
        PdfReader pdfReader = new PdfReader(
                "/home/bpmlab/NetBeansProjects/InvioScript/src/main/java/bpmlab/invioscript/Consulta_Webqualis.pdf");
        String[] linha;
        // Only ever used as a "grade token found" flag; its text is never added to the result.
        String novaLinha = null;
        List<String> qualis = new ArrayList<>();
        int total = 0;
        int invalidos = 0;
        for (int i = 1; i <= pdfReader.getNumberOfPages(); i++) {
            linha = PdfTextExtractor.getTextFromPage(pdfReader, i).split("\n");
            // Starts at 1: index 0 of each page is skipped.
            for (int j = 1; j < linha.length; j++) {
                total++;
                try {
                    // Reject lines containing any of these hard-coded marker strings.
                    if (linha[j].contains("Friday 06 March 2015") || linha[j].contains("TURISMO")
                            || linha[j].contains("INTERNACIONAIS") || linha[j].contains("DEMOGRAFIA")
                            || linha[j].contains("Lado C") || linha[j].contains("y TA Journal of Food C")
                            || linha[j].contains("www.siicsalud.com C NUTRIO Atualizado")
                            || linha[j].contains("ISSN T?TULO ESTRATO ?REA DE AVALIAO STATUS")) {
                        throw new Exception();
                    }

                    // Every accepted line must contain "Atualizado".
                    if (!linha[j].contains("Atualizado")) {
                        throw new Exception();
                    }

                    int indexFinal = linha[j].indexOf("Atualizado");

                    // Take the text between the first matching grade token (checked in the order
                    // A1, A2, B1..B5, C) and "Atualizado"; a line with no grade token is rejected.
                    if (linha[j].contains(" A1 ")) {
                        novaLinha = linha[j].substring(linha[j].indexOf(" A1 ") + 4, indexFinal);
                    } else if (linha[j].contains(" A2 ")) {
                        novaLinha = linha[j].substring(linha[j].indexOf(" A2 ") + 4, indexFinal);
                    } else if (linha[j].contains(" B1 ")) {
                        novaLinha = linha[j].substring(linha[j].indexOf(" B1 ") + 4, indexFinal);
                    } else if (linha[j].contains(" B2 ")) {
                        novaLinha = linha[j].substring(linha[j].indexOf(" B2 ") + 4, indexFinal);
                    } else if (linha[j].contains(" B3 ")) {
                        novaLinha = linha[j].substring(linha[j].indexOf(" B3 ") + 4, indexFinal);
                    } else if (linha[j].contains(" B4 ")) {
                        novaLinha = linha[j].substring(linha[j].indexOf(" B4 ") + 4, indexFinal);
                    } else if (linha[j].contains(" B5 ")) {
                        novaLinha = linha[j].substring(linha[j].indexOf(" B5 ") + 4, indexFinal);
                    } else if (linha[j].contains(" C ")) {
                        novaLinha = linha[j].substring(linha[j].indexOf(" C ") + 3, indexFinal);
                    } else {
                        throw new Exception();
                    }

                    // Reject when the first 9 characters are not of the form XXXX-XXXX, or when a
                    // grade token follows that prefix immediately (first 12 characters).
                    if (!linha[j].substring(0, 9).matches("\\w\\w\\w\\w-\\w\\w\\w\\w")
                            || linha[j].substring(0, 12).matches("\\w\\w\\w\\w-\\w\\w\\w\\w A1")
                            || linha[j].substring(0, 12).matches("\\w\\w\\w\\w-\\w\\w\\w\\w A2")
                            || linha[j].substring(0, 12).matches("\\w\\w\\w\\w-\\w\\w\\w\\w B1")
                            || linha[j].substring(0, 12).matches("\\w\\w\\w\\w-\\w\\w\\w\\w B2")
                            || linha[j].substring(0, 12).matches("\\w\\w\\w\\w-\\w\\w\\w\\w B3")
                            || linha[j].substring(0, 12).matches("\\w\\w\\w\\w-\\w\\w\\w\\w B4")
                            || linha[j].substring(0, 12).matches("\\w\\w\\w\\w-\\w\\w\\w\\w B5")
                            || linha[j].substring(0, 12).matches("\\w\\w\\w\\w-\\w\\w\\w\\w C ")) {
                        throw new Exception();
                    }
                    // Always true here: every branch above either assigned novaLinha or threw.
                    // Note that the whole original line is stored, not novaLinha.
                    if (novaLinha != null) {
                        qualis.add(linha[j]);
                    }
                    novaLinha = null;
                } catch (Exception e) {
                    // Reached for the deliberate rejections above and for any runtime exception
                    // raised in the try body (for example from the substring calls).
                    // NOTE(review): linha[j + 1] / linha[j + 2] below are not bounds-checked; an
                    // ArrayIndexOutOfBoundsException thrown here is not handled by the outer
                    // catch (IOException) and would propagate to the caller.
                    // NOTE(review): the neighbouring lines used to rebuild an entry are still
                    // visited by later iterations of the loop - confirm this is intended.
                    StringBuilder construirLinha;
                    switch (linha[j]) {
                    // These three cases splice the current line and the line two below into
                    // the following line, just before its "Atualizado" marker.
                    case "ADMINISTRAO, CINCIAS CONT?BEIS E":
                        construirLinha = new StringBuilder(linha[j + 1]);
                        construirLinha.insert(linha[j + 1].indexOf("Atualizado") - 1,
                                " " + linha[j] + " " + linha[j + 2]);
                        qualis.add(construirLinha.toString());
                        break;
                    case "CINCIA POL?TICA E RELAES":
                        construirLinha = new StringBuilder(linha[j + 1]);
                        construirLinha.insert(linha[j + 1].indexOf("Atualizado") - 1,
                                " " + linha[j] + " " + linha[j + 2]);
                        qualis.add(construirLinha.toString());
                        break;
                    case "PLANEJAMENTO URBANO E REGIONAL /":
                        construirLinha = new StringBuilder(linha[j + 1]);
                        construirLinha.insert(linha[j + 1].indexOf("Atualizado") - 1,
                                " " + linha[j] + " " + linha[j + 2]);
                        qualis.add(construirLinha.toString());
                        break;
                    // The remaining cases insert text at offset 9 of the following line.
                    case "American Journal of Physiology. Regulatory, Integrative and Comparative Physiology":
                        construirLinha = new StringBuilder(linha[j + 1]);
                        construirLinha.insert(9, " " + linha[j]);
                        qualis.add(construirLinha.toString());
                        break;
                    case "Proceedings of the National Academy of Sciences of the United States of America":
                        construirLinha = new StringBuilder(linha[j + 1]);
                        // Unlike the next case, no space is placed between linha[j] and linha[j + 2].
                        construirLinha.insert(9, " " + linha[j] + linha[j + 2]);
                        qualis.add(construirLinha.toString());
                        break;
                    case "Revista de Clnica e Pesquisa Odontolgica (Impresso) / Journal of Dental Clinical and":
                        construirLinha = new StringBuilder(linha[j + 1]);
                        construirLinha.insert(9, " " + linha[j] + " " + linha[j + 2]);
                        qualis.add(construirLinha.toString());
                        break;
                    default:
                        invalidos++;
                        // This condition currently has no effect: its body holds only a
                        // commented-out debug print.
                        if (!(linha[j].contains("Friday 06 March") || linha[j].contains("TURISMO")
                                || linha[j].contains("(Online)") || linha[j].contains("Research")
                                || linha[j].contains("INTERNACIONAIS") || linha[j].contains("DEMOGRAFIA"))) {
                            //                                    System.out.println(linha[j]);
                        }
                        break;
                    }
                }
            }
        }
        // Dump every accepted line, then the counters and their share of all examined lines.
        for (String q : qualis) {
            System.out.println(q);
        }
        System.out.println("TOTAL: " + total);
        System.out.println("VALIDOS: " + qualis.size() + ";" + ((float) qualis.size() * 100 / total) + "%");
        System.out.println("INVALIDOS: " + invalidos + ";" + ((float) invalidos * 100 / total) + "%");
        System.out.println(qualis.size() + invalidos);
        return qualis;
    } catch (IOException ex) {
        // NOTE(review): the I/O failure is dropped silently and reported to the caller as null.
        return null;
    }
}

From source file:com.docdoku.server.esindexer.ESTools.java

License:Open Source License

/**
 * Extracts the text of one PDF page, falling back to an empty string on any failure.
 *
 * <p>A failure is reported at INFO level with the file name and page number, and the exception
 * itself is logged at FINER level.
 *
 * @param reader     reader over the PDF being indexed
 * @param pageNumber page number passed to {@code PdfTextExtractor.getTextFromPage}
 * @param fullName   file name used only in the log message
 * @return the page text, or {@code ""} if extraction threw an exception
 */
private static String pdfPageToString(PdfReader reader, int pageNumber, String fullName) {
    String pageText;
    try {
        pageText = PdfTextExtractor.getTextFromPage(reader, pageNumber);
    } catch (Exception e) {
        final Logger indexerLog = Logger.getLogger(ESIndexer.class.getName());
        final String summary = "A problem occur in the file : " + fullName + ", indexing at page :" + pageNumber;
        indexerLog.log(Level.INFO, summary);
        indexerLog.log(Level.FINER, null, e);
        pageText = "";
    }
    return pageText;
}

From source file:com.example.pdftranslator.ScreenSlidePageFragment.java

License:Apache License

/**
 * Inflates the slide-page layout and fills its text view with the text of this fragment's page.
 *
 * <p>The text is taken from page {@code mPageNumber + 1} of {@code ActivityTextDisplayer.reader}
 * and passed through {@code textArranged} before being displayed. If extraction fails with an
 * {@code IOException}, the stack trace is printed and the view is returned without text.
 */
@Override
public View onCreateView(LayoutInflater inflater, ViewGroup container, Bundle savedInstanceState) {
    final ViewGroup pageRoot = (ViewGroup) inflater.inflate(R.layout.fragment_screen_slide_page, container, false);

    try {
        // mPageNumber is offset by one before being passed to the extractor.
        final String rawText = PdfTextExtractor.getTextFromPage(ActivityTextDisplayer.reader, mPageNumber + 1);
        final String arrangedText = textArranged(rawText);

        final TextView pageTextView = (TextView) pageRoot.findViewById(android.R.id.text1);
        pageTextView.setOnTouchListener(this);
        pageTextView.setText(arrangedText);
    } catch (IOException e1) {
        e1.printStackTrace();
    }

    return pageRoot;
}

From source file:com.github.naofum.epubconverter.ReadPdf.java

License:Open Source License

/**
 * Extracts the text of one page from the shared {@code reader}, followed by a newline.
 *
 * <p>Any {@code Exception} or {@code OutOfMemoryError} raised during extraction is reported on
 * standard error and results in an empty string.
 *
 * @param page page number passed to {@code PdfTextExtractor.getTextFromPage}
 * @return the page text plus {@code "\n"}, or {@code ""} on failure
 */
public static String extractText(int page) {
    String pageText = "";
    try {
        pageText = PdfTextExtractor.getTextFromPage(reader, page) + "\n";
    } catch (OutOfMemoryError e) {
        System.err.println("Out of memory in text extraction " + e.getMessage());
    } catch (Exception e) {
        System.err.println("Failed to extract text " + e.getMessage());
    }
    return pageText;
}

From source file:de.codecentric.robot.pdf.PDFKeywords.java

License:Apache License

/**
 * Opens the given PDF and stores the text of every page in {@code pdfData}, keyed by page number.
 *
 * <p>The reader is kept in the {@code reader} field, and progress is printed to standard output.
 *
 * @param filename file name handed to {@code PdfReader}
 * @throws IOException if the PDF cannot be opened or a page cannot be extracted
 */
@RobotKeyword
public void parsePdf(String filename) throws IOException {
    reader = new PdfReader(filename);
    System.out.println("Reading file " + filename);

    pdfData = new HashMap<Integer, String>();
    final int lastPage = reader.getNumberOfPages();
    int currentPage = 1;
    while (currentPage <= lastPage) {
        System.out.println("Reading page " + currentPage);
        pdfData.put(currentPage, PdfTextExtractor.getTextFromPage(reader, currentPage));
        currentPage++;
    }
}

From source file:de.mpg.escidoc.services.extraction.ExtractionChain.java

License:Open Source License

/**
 * Extracts the text of a PDF into a text file, trying several extractors in turn.
 *
 * <p>The extractors are attempted in this order: the external {@code pdftotext} (xPDF) program,
 * the PDFBox command-line application, iText, and finally Tika. The first extractor that
 * succeeds ends the method. When {@code verbose} is set, the produced file is read back as
 * UTF-8 and logged line by line.
 *
 * @param infileName  name of the PDF file to read
 * @param outfileName name of the text file to write
 * @return {@code ExtractionResult.OK} as soon as one extractor succeeds,
 *         {@code ExtractionResult.FAILURE} if all of them fail
 */
public ExtractionResult doExtract(String infileName, String outfileName) {
    File outfile = new File(outfileName);

    Date stepStart = new Date();
    Date current;

    logger.info("Extracting PDF content ----------------------------------------");
    logger.info("Infile: " + infileName);
    logger.info("Outfile: " + outfileName);

    logger.info(stepStart + " -- started");

    // xPDF
    try {
        logger.info("Extracting with xPDF");

        String xpdfCommand = System.getProperty("os.name").contains("Windows") ? pdftotext + " -enc UTF-8 "
                : "/usr/bin/pdftotext -enc UTF-8 ";

        if (runExternalExtractor(xpdfCommand, infileName, outfileName, "xPDF") == 0) {
            logExtractionSuccess(outfile, stepStart);
            return ExtractionResult.OK;
        }
    } catch (Exception e) {
        // Pass the throwable itself so the stack trace is logged, not an array identity.
        logger.warn("Error extracting PDF with xPDF:", e);
    }

    logExtractionFailure(stepStart);

    // PDFBox
    try {
        logger.info("Extracting with PDFBox");
        stepStart = new Date();

        String pdfboxCommand = System.getProperty("os.name").contains("Windows")
                ? "java -Dfile.encoding=UTF-8 -jar " + pdfboxAppJar + " ExtractText "
                : "/usr/bin/java -Dfile.encoding=UTF-8 -jar " + pdfboxAppJar + " ExtractText ";

        if (runExternalExtractor(pdfboxCommand, infileName, outfileName, "PDFBox") == 0) {
            logExtractionSuccess(outfile, stepStart);
            return ExtractionResult.OK;
        }
    } catch (Exception e) {
        logger.warn("Error extracting PDF with PDFBox:", e);
    }

    logExtractionFailure(stepStart);

    // iText
    try {
        logger.info("Extracting with iText");
        stepStart = new Date();

        PdfReader reader = new PdfReader(infileName);
        try {
            int numberOfPages = reader.getNumberOfPages();

            outputStreamWriter = new OutputStreamWriter(new FileOutputStream(outfile), "UTF-8");
            try {
                // iText page numbers are 1-based.
                for (int page = 1; page <= numberOfPages; page++) {
                    outputStreamWriter.write(PdfTextExtractor.getTextFromPage(reader, page));
                }
            } finally {
                // Closing flushes the buffered text; without it the output file can be
                // incomplete when it is read back or handed to the caller.
                outputStreamWriter.close();
            }
        } finally {
            reader.close();
        }

        logExtractionSuccess(outfile, stepStart);
        return ExtractionResult.OK;

    } catch (Exception e) {
        logger.warn("Error extracting PDF with iText:", e);
    }

    logExtractionFailure(stepStart);

    // tika
    try {
        logger.info("Extracting with Tika");
        stepStart = new Date();

        String content;
        InputStream stream = TikaInputStream.get(new File(infileName));
        try {
            ContentHandler handler = new BodyContentHandler(TIKA_CONTENT_SIZE);
            new AutoDetectParser().parse(stream, handler, new Metadata(), new ParseContext());
            content = handler.toString();
        } finally {
            // The stream is non-null here, so closing it cannot raise a NullPointerException.
            stream.close();
        }

        // Write UTF-8 explicitly: the file is read back as UTF-8 and the other extractors
        // produce UTF-8 as well.
        FileUtils.writeStringToFile(outfile, content, "UTF-8");

        logExtractionSuccess(outfile, stepStart);
        return ExtractionResult.OK;

    } catch (Exception e) {
        logger.warn("Error extracting Tika:", e);
    }

    current = new Date();
    logger.warn(current + " -- finished unsuccessfully");
    logger.info("Extraction attempt took " + (current.getTime() - stepStart.getTime()));

    logger.info("... giving up");

    return ExtractionResult.FAILURE;
}

/**
 * Runs an external extractor program and waits for it to finish.
 *
 * <p>The command prefix is split on whitespace, which is how {@code Runtime.exec(String)}
 * treated it before. The two file names are passed as single arguments so that names
 * containing spaces are not broken apart.
 *
 * @return the exit code of the process
 */
private int runExternalExtractor(String commandPrefix, String infileName, String outfileName,
        String gobblerName) throws IOException, InterruptedException {
    String[] prefixArgs = commandPrefix.trim().split("\\s+");
    String[] command = new String[prefixArgs.length + 2];
    System.arraycopy(prefixArgs, 0, command, 0, prefixArgs.length);
    command[prefixArgs.length] = infileName;
    command[prefixArgs.length + 1] = outfileName;

    Process proc = Runtime.getRuntime().exec(command);

    StreamGobbler inputGobbler = new StreamGobbler(proc.getInputStream(), gobblerName);
    StreamGobbler errorGobbler = new StreamGobbler(proc.getErrorStream(), gobblerName);

    inputGobbler.start();
    errorGobbler.start();

    return proc.waitFor();
}

/** Logs the outcome of a successful step, including the output file's content when verbose. */
private void logExtractionSuccess(File outfile, Date stepStart) throws IOException {
    if (verbose) {
        logOutfileContents(outfile);
    }
    Date finished = new Date();
    logger.info(finished + " -- finished successfully");
    logger.info("Extraction took " + (finished.getTime() - stepStart.getTime()));
}

/** Logs that the step started at {@code stepStart} did not produce a result. */
private void logExtractionFailure(Date stepStart) {
    Date finished = new Date();
    logger.info(finished + " -- finished unsuccessfully");
    logger.info("Extraction attempt took " + (finished.getTime() - stepStart.getTime()));
}

/** Reads the output file as UTF-8 and logs every line at INFO level. */
private void logOutfileContents(File outfile) throws IOException {
    BufferedReader bufferedReader = new BufferedReader(
            new InputStreamReader(new FileInputStream(outfile), "UTF-8"));
    try {
        String line;
        while ((line = bufferedReader.readLine()) != null) {
            logger.info(line);
        }
    } finally {
        bufferedReader.close();
    }
}

From source file:digiho.reading.java

/**
 * Prints the page count, the text of page 2 and the tampered/encrypted flags of a PDF.
 *
 * @param args optional; {@code args[0]} names the PDF to read. Without arguments the original
 *             hard-coded file {@code G:\43211688.pdf} is used.
 */
public static void main(String[] args) {
    String pdfPath = (args != null && args.length > 0) ? args[0] : "G:\\43211688.pdf";
    try {
        PdfReader reader = new PdfReader(pdfPath);
        try {
            int pageCount = reader.getNumberOfPages();
            System.out.println("This PDF has " + pageCount + " pages.");
            // Page 2 only exists in documents with at least two pages.
            if (pageCount >= 2) {
                String page = PdfTextExtractor.getTextFromPage(reader, 2);
                System.out.println("Page Content:\n\n" + page + "\n\n");
            } else {
                System.out.println("This PDF has no page 2 to display.");
            }
            System.out.println("Is this document tampered: " + reader.isTampered());
            System.out.println("Is this document encrypted: " + reader.isEncrypted());
        } finally {
            // Release the file handle held by the reader.
            reader.close();
        }
    } catch (IOException e) {
        e.printStackTrace();
    }

}