Usage examples for com.itextpdf.text.pdf.parser.PdfTextExtractor.getTextFromPage
public static String getTextFromPage(PdfReader reader, int pageNumber) throws IOException
From source file:Reader.java
public void showPdf(String s) throws IOException { bookNames.add(s);//from w w w .ja v a 2 s .c o m PdfReader pr = new PdfReader(s); String content = PdfTextExtractor.getTextFromPage(pr, 1); currentPageNum = 1; pageContentPane.setText(content); }
From source file:Reader.java
public void nextPage() throws IOException { PdfReader pr = new PdfReader(fileName); //increment current page if there is one more page to read if (currentPageNum < pr.getNumberOfPages()) { ++currentPageNum;//from w w w .j a va2 s. co m String content = PdfTextExtractor.getTextFromPage(pr, currentPageNum); //go to the next page pageContentPane.setText(content); } else { String content = PdfTextExtractor.getTextFromPage(pr, currentPageNum); //show the last page pageContentPane.setText(content); } }
From source file:Reader.java
public void previousPage() throws IOException { PdfReader pr = new PdfReader(fileName); //decrement current page if it is not the first page if (currentPageNum <= pr.getNumberOfPages() && currentPageNum != 1) { --currentPageNum;/*from w ww . j ava 2 s .co m*/ String content = PdfTextExtractor.getTextFromPage(pr, currentPageNum); //go to the previous page pageContentPane.setText(content); } else { String content = PdfTextExtractor.getTextFromPage(pr, currentPageNum); //show the last page pageContentPane.setText(content); } }
From source file:bpmlab.invioscript.ConstruirQualis.java
public static List<String> primeiraValidacao() { try {//from w w w . j a v a 2 s .c o m PdfReader pdfReader = new PdfReader( "/home/bpmlab/NetBeansProjects/InvioScript/src/main/java/bpmlab/invioscript/Consulta_Webqualis.pdf"); String[] linha; String novaLinha = null; List<String> qualis = new ArrayList<>(); int total = 0; int invalidos = 0; for (int i = 1; i <= pdfReader.getNumberOfPages(); i++) { linha = PdfTextExtractor.getTextFromPage(pdfReader, i).split("\n"); for (int j = 1; j < linha.length; j++) { total++; try { if (linha[j].contains("Friday 06 March 2015") || linha[j].contains("TURISMO") || linha[j].contains("INTERNACIONAIS") || linha[j].contains("DEMOGRAFIA") || linha[j].contains("Lado C") || linha[j].contains("y TA Journal of Food C") || linha[j].contains("www.siicsalud.com C NUTRIO Atualizado") || linha[j].contains("ISSN T?TULO ESTRATO ?REA DE AVALIAO STATUS")) { throw new Exception(); } if (!linha[j].contains("Atualizado")) { throw new Exception(); } int indexFinal = linha[j].indexOf("Atualizado"); if (linha[j].contains(" A1 ")) { novaLinha = linha[j].substring(linha[j].indexOf(" A1 ") + 4, indexFinal); } else if (linha[j].contains(" A2 ")) { novaLinha = linha[j].substring(linha[j].indexOf(" A2 ") + 4, indexFinal); } else if (linha[j].contains(" B1 ")) { novaLinha = linha[j].substring(linha[j].indexOf(" B1 ") + 4, indexFinal); } else if (linha[j].contains(" B2 ")) { novaLinha = linha[j].substring(linha[j].indexOf(" B2 ") + 4, indexFinal); } else if (linha[j].contains(" B3 ")) { novaLinha = linha[j].substring(linha[j].indexOf(" B3 ") + 4, indexFinal); } else if (linha[j].contains(" B4 ")) { novaLinha = linha[j].substring(linha[j].indexOf(" B4 ") + 4, indexFinal); } else if (linha[j].contains(" B5 ")) { novaLinha = linha[j].substring(linha[j].indexOf(" B5 ") + 4, indexFinal); } else if (linha[j].contains(" C ")) { novaLinha = linha[j].substring(linha[j].indexOf(" C ") + 3, indexFinal); } else { throw new Exception(); } if (!linha[j].substring(0, 
9).matches("\\w\\w\\w\\w-\\w\\w\\w\\w") || linha[j].substring(0, 12).matches("\\w\\w\\w\\w-\\w\\w\\w\\w A1") || linha[j].substring(0, 12).matches("\\w\\w\\w\\w-\\w\\w\\w\\w A2") || linha[j].substring(0, 12).matches("\\w\\w\\w\\w-\\w\\w\\w\\w B1") || linha[j].substring(0, 12).matches("\\w\\w\\w\\w-\\w\\w\\w\\w B2") || linha[j].substring(0, 12).matches("\\w\\w\\w\\w-\\w\\w\\w\\w B3") || linha[j].substring(0, 12).matches("\\w\\w\\w\\w-\\w\\w\\w\\w B4") || linha[j].substring(0, 12).matches("\\w\\w\\w\\w-\\w\\w\\w\\w B5") || linha[j].substring(0, 12).matches("\\w\\w\\w\\w-\\w\\w\\w\\w C ")) { throw new Exception(); } if (novaLinha != null) { qualis.add(linha[j]); } novaLinha = null; } catch (Exception e) { StringBuilder construirLinha; switch (linha[j]) { case "ADMINISTRAO, CINCIAS CONT?BEIS E": construirLinha = new StringBuilder(linha[j + 1]); construirLinha.insert(linha[j + 1].indexOf("Atualizado") - 1, " " + linha[j] + " " + linha[j + 2]); qualis.add(construirLinha.toString()); break; case "CINCIA POL?TICA E RELAES": construirLinha = new StringBuilder(linha[j + 1]); construirLinha.insert(linha[j + 1].indexOf("Atualizado") - 1, " " + linha[j] + " " + linha[j + 2]); qualis.add(construirLinha.toString()); break; case "PLANEJAMENTO URBANO E REGIONAL /": construirLinha = new StringBuilder(linha[j + 1]); construirLinha.insert(linha[j + 1].indexOf("Atualizado") - 1, " " + linha[j] + " " + linha[j + 2]); qualis.add(construirLinha.toString()); break; case "American Journal of Physiology. 
Regulatory, Integrative and Comparative Physiology": construirLinha = new StringBuilder(linha[j + 1]); construirLinha.insert(9, " " + linha[j]); qualis.add(construirLinha.toString()); break; case "Proceedings of the National Academy of Sciences of the United States of America": construirLinha = new StringBuilder(linha[j + 1]); construirLinha.insert(9, " " + linha[j] + linha[j + 2]); qualis.add(construirLinha.toString()); break; case "Revista de Clnica e Pesquisa Odontolgica (Impresso) / Journal of Dental Clinical and": construirLinha = new StringBuilder(linha[j + 1]); construirLinha.insert(9, " " + linha[j] + " " + linha[j + 2]); qualis.add(construirLinha.toString()); break; default: invalidos++; if (!(linha[j].contains("Friday 06 March") || linha[j].contains("TURISMO") || linha[j].contains("(Online)") || linha[j].contains("Research") || linha[j].contains("INTERNACIONAIS") || linha[j].contains("DEMOGRAFIA"))) { // System.out.println(linha[j]); } break; } } } } for (String q : qualis) { System.out.println(q); } System.out.println("TOTAL: " + total); System.out.println("VALIDOS: " + qualis.size() + ";" + ((float) qualis.size() * 100 / total) + "%"); System.out.println("INVALIDOS: " + invalidos + ";" + ((float) invalidos * 100 / total) + "%"); System.out.println(qualis.size() + invalidos); return qualis; } catch (IOException ex) { return null; } }
From source file:com.docdoku.server.esindexer.ESTools.java
License:Open Source License
private static String pdfPageToString(PdfReader reader, int pageNumber, String fullName) { try {// w w w . j a va 2 s .com return PdfTextExtractor.getTextFromPage(reader, pageNumber); } catch (Exception e) { Logger.getLogger(ESIndexer.class.getName()).log(Level.INFO, "A problem occur in the file : " + fullName + ", indexing at page :" + pageNumber); Logger.getLogger(ESIndexer.class.getName()).log(Level.FINER, null, e); return ""; } }
From source file:com.example.pdftranslator.ScreenSlidePageFragment.java
License:Apache License
@Override public View onCreateView(LayoutInflater inflater, ViewGroup container, Bundle savedInstanceState) { // Inflate the layout containing a title and body text. ViewGroup rootView = (ViewGroup) inflater.inflate(R.layout.fragment_screen_slide_page, container, false); String textFromPdf;// ww w .j a va 2 s. c o m TextView textViewDisplayer; try { textFromPdf = PdfTextExtractor.getTextFromPage(ActivityTextDisplayer.reader, mPageNumber + 1); textFromPdf = textArranged(textFromPdf); textViewDisplayer = (TextView) rootView.findViewById(android.R.id.text1); textViewDisplayer.setOnTouchListener(this); textViewDisplayer.setText(textFromPdf); } catch (IOException e1) { // TODO Auto-generated catch block e1.printStackTrace(); } return rootView; }
From source file:com.github.naofum.epubconverter.ReadPdf.java
License:Open Source License
/**
 * Returns the text of the given page followed by a newline.
 *
 * <p>Any exception, and also an {@link OutOfMemoryError}, raised during extraction is
 * reported on standard error and results in an empty string.
 *
 * @param page page passed to {@code PdfTextExtractor.getTextFromPage}
 * @return the page text plus {@code "\n"}, or {@code ""} on failure
 */
public static String extractText(int page) {
    String pageText = "";
    try {
        pageText = PdfTextExtractor.getTextFromPage(reader, page) + "\n";
    } catch (Exception failure) {
        System.err.println("Failed to extract text " + failure.getMessage());
    } catch (OutOfMemoryError exhausted) {
        System.err.println("Out of memory in text extraction " + exhausted.getMessage());
    }
    return pageText;
}
From source file:de.codecentric.robot.pdf.PDFKeywords.java
License:Apache License
/**
 * Opens the given PDF and caches the text of every page in {@code pdfData}, keyed by page number.
 *
 * <p>Both {@code reader} and {@code pdfData} are replaced by this call; progress is printed to
 * standard output.
 *
 * @param filename file name handed to {@link PdfReader}
 * @throws IOException if the PDF cannot be opened or a page cannot be read
 */
@RobotKeyword
public void parsePdf(String filename) throws IOException {
    reader = new PdfReader(filename);
    System.out.println("Reading file " + filename);
    pdfData = new HashMap<Integer, String>();
    final int lastPage = reader.getNumberOfPages();
    int current = 1;
    while (current <= lastPage) {
        System.out.println("Reading page " + current);
        pdfData.put(current, PdfTextExtractor.getTextFromPage(reader, current));
        current++;
    }
}
From source file:de.mpg.escidoc.services.extraction.ExtractionChain.java
License:Open Source License
public ExtractionResult doExtract(String infileName, String outfileName) { File outfile = new File(outfileName); Date stepStart = new Date(); Date current;/*from ww w. j a v a2 s.c om*/ logger.info("Extracting PDF content ----------------------------------------"); logger.info("Infile: " + infileName); logger.info("Outfile: " + outfileName); logger.info(stepStart + " -- started"); // xPDF try { logger.info("Extracting with xPDF"); StringBuffer command = new StringBuffer(2048); command.append(System.getProperty("os.name").contains("Windows") ? pdftotext + " -enc UTF-8 " : "/usr/bin/pdftotext -enc UTF-8 "); command.append(infileName); command.append(" "); command.append(outfileName); Process proc = Runtime.getRuntime().exec(command.toString()); StreamGobbler inputGobbler = new StreamGobbler(proc.getInputStream(), "xPDF"); StreamGobbler errorGobbler = new StreamGobbler(proc.getErrorStream(), "xPDF"); inputGobbler.start(); errorGobbler.start(); int exitCode = proc.waitFor(); if (proc.exitValue() == 0) { if (verbose) { BufferedReader bufferedReader = new BufferedReader( new InputStreamReader(new FileInputStream(outfile), "UTF-8")); String line; while ((line = bufferedReader.readLine()) != null) { logger.info(line); } bufferedReader.close(); } current = new Date(); logger.info(current + " -- finished successfully"); logger.info("Extraction took " + (current.getTime() - stepStart.getTime())); return ExtractionResult.OK; } } catch (Exception e) { logger.warn("Error extracting PDF with xPDF:"); logger.warn(e.getStackTrace()); } current = new Date(); logger.info(current + " -- finished unsuccessfully"); logger.info("Extraction attempt took " + (current.getTime() - stepStart.getTime())); // PDFBox try { logger.info("Extracting with PDFBox"); stepStart = new Date(); StringBuffer command = new StringBuffer(1024); command.append(System.getProperty("os.name").contains("Windows") ? 
"java -Dfile.encoding=UTF-8 -jar " + pdfboxAppJar + " ExtractText " : "/usr/bin/java -Dfile.encoding=UTF-8 -jar " + pdfboxAppJar + " ExtractText "); command.append(infileName); command.append(" "); command.append(outfileName); Process proc = Runtime.getRuntime().exec(command.toString()); StreamGobbler inputGobbler = new StreamGobbler(proc.getInputStream(), "PDFBox"); StreamGobbler errorGobbler = new StreamGobbler(proc.getErrorStream(), "PDFBox"); inputGobbler.start(); errorGobbler.start(); int exitCode = proc.waitFor(); if (exitCode == 0) { if (verbose) { BufferedReader bufferedReader = new BufferedReader( new InputStreamReader(new FileInputStream(outfile), "UTF-8")); String line; while ((line = bufferedReader.readLine()) != null) { logger.info(line); } bufferedReader.close(); } current = new Date(); logger.info(current + " -- finished successfully"); logger.info("Extraction took " + (current.getTime() - stepStart.getTime())); return ExtractionResult.OK; } } catch (Exception e) { logger.warn("Error extracting PDF with PDFBox:"); logger.warn(e.getStackTrace()); } current = new Date(); logger.info(current + " -- finished unsuccessfully"); logger.info("Extraction attempt took " + (current.getTime() - stepStart.getTime())); // iText try { logger.info("Extracting with iText"); stepStart = new Date(); PdfReader reader = new PdfReader(infileName); int numberOfPages = reader.getNumberOfPages(); outputStreamWriter = new OutputStreamWriter(new FileOutputStream(outfile), "UTF-8"); for (int i = 0; i < numberOfPages; i++) { outputStreamWriter.write(PdfTextExtractor.getTextFromPage(reader, i + 1)); } if (verbose) { BufferedReader bufferedReader = new BufferedReader( new InputStreamReader(new FileInputStream(outfile), "UTF-8")); String line; while ((line = bufferedReader.readLine()) != null) { logger.info(line); } bufferedReader.close(); } current = new Date(); logger.info(current + " -- finished successfully"); logger.info("Extraction took " + (current.getTime() - 
stepStart.getTime())); return ExtractionResult.OK; } catch (Exception e) { logger.warn("Error extracting PDF with iText:", e); } // tika InputStream stream = null; try { logger.info("Extracting with Tika"); stepStart = new Date(); stream = TikaInputStream.get(new File(infileName)); ContentHandler handler = new BodyContentHandler(TIKA_CONTENT_SIZE); new AutoDetectParser().parse(stream, handler, new Metadata(), new ParseContext()); String content = handler.toString(); FileUtils.writeStringToFile(outfile, content); stream.close(); if (verbose) { BufferedReader bufferedReader = new BufferedReader( new InputStreamReader(new FileInputStream(outfile), "UTF-8")); String line; while ((line = bufferedReader.readLine()) != null) { logger.info(line); } bufferedReader.close(); } current = new Date(); logger.info(current + " -- finished successfully"); logger.info("Extraction took " + (current.getTime() - stepStart.getTime())); return ExtractionResult.OK; } catch (Exception e) { logger.warn("Error extracting Tika:", e); try { stream.close(); } catch (IOException e1) { e1.printStackTrace(); } } current = new Date(); logger.warn(current + " -- finished unsuccessfully"); logger.info("Extraction attempt took " + (current.getTime() - stepStart.getTime())); logger.info("... giving up"); return ExtractionResult.FAILURE; }
From source file:digiho.reading.java
public static void main(String[] args) { try {//from ww w . j a v a 2 s .co m PdfReader reader = new PdfReader("G:\\43211688.pdf"); System.out.println("This PDF has " + reader.getNumberOfPages() + " pages."); String page = PdfTextExtractor.getTextFromPage(reader, 2); System.out.println("Page Content:\n\n" + page + "\n\n"); System.out.println("Is this document tampered: " + reader.isTampered()); System.out.println("Is this document encrypted: " + reader.isEncrypted()); } catch (IOException e) { e.printStackTrace(); } }