List of usage examples for org.apache.pdfbox.pdmodel PDDocument load
public static PDDocument load(byte[] input) throws IOException
From source file:com.jgaap.generics.DocumentHelper.java
License:Open Source License
/**
 * Extracts the text of a PDF that is read from an input stream rather than
 * from a file name.
 *
 * <p>The parsed document is always closed before this method returns, even
 * when text extraction fails. The supplied stream itself is not closed here;
 * it remains the caller's responsibility.
 *
 * @param filesInputStream
 *            An input stream pointing to a PDF file.
 * @return the extracted text as a character array
 * @throws IOException
 *             if the PDF cannot be loaded or its text cannot be extracted
 */
static private char[] loadPDF(InputStream filesInputStream) throws IOException {
    PDDocument doc = PDDocument.load(filesInputStream);
    try {
        PDFTextStripper pdfStripper = new PDFTextStripper();
        pdfStripper.setSortByPosition(false);
        return pdfStripper.getText(doc).toCharArray();
    } finally {
        // Release the document even if getText() throws.
        doc.close();
    }
}
From source file:com.joowon.returnA.classifier.EbsBookCrawler.java
License:Open Source License
public void run() throws IOException { File destination = new File("/Users/Joowon/Desktop"); File bookFolder = new File(getClass().getClassLoader().getResource("book").getFile()); for (File book : bookFolder.listFiles()) { String outputName = destination.getPath() + "/" + book.getName().replace(".pdf", ".txt"); PDDocument document = PDDocument.load(book); String text = ""; for (PDPage page : document.getPages()) { text += new PdfTextExtractor(page) .addRegion(0, 0, (int) page.getMediaBox().getWidth(), (int) page.getMediaBox().getHeight()) .extract();//from ww w .j av a 2 s. co m } new TxtWriter(outputName).write(text); document.close(); } }
From source file:com.joowon.returnA.classifier.export.PdfImageExport.java
License:Open Source License
/**
 * Loads one sample answer-sheet PDF and hands it to {@code export}.
 *
 * @param args
 *            ignored
 * @throws IOException
 *             if the PDF cannot be loaded, exported or closed
 */
public static void main(String[] args) throws IOException {
    String sourcePdf = "/Users/Joowon/Documents/Github/ReturnA/data/tests/solve/eng_hsj_81Np7vXe.pdf";

    // try-with-resources releases the document even when export fails.
    try (PDDocument document = PDDocument.load(new File(sourcePdf))) {
        export(document, "/Users/Joowon/Documents/Github/ReturnA", "image");
    }
}
From source file:com.joowon.returnA.classifier.extractor.PdfTextExtractor.java
License:Open Source License
public static void main(String[] args) throws IOException, PrinterException { // Target PDF Document PDDocument document = PDDocument .load(new File("/Users/Joowon/Documents/Github/ReturnA/data/tests/YAPNXRPm_eng1_mun.pdf")); List<String> pdfTextList = new ArrayList<>(); final int width = (int) document.getPage(0).getMediaBox().getWidth(); final int height = (int) document.getPage(0).getMediaBox().getHeight(); // Extract Test Information (first page's info area) pdfTextList.addAll(new PdfTextExtractor(document.getPage(0)).addRegion(0, 0, width, height / 4).extract()); for (int i = 0; i < document.getNumberOfPages(); ++i) { // Left side pdfTextList/*from w w w. j a va 2 s . co m*/ .addAll(new PdfTextExtractor(document.getPage(i)).addRegion(0, 0, width / 2, height).extract()); // Right side pdfTextList.addAll( new PdfTextExtractor(document.getPage(i)).addRegion(width / 2, 0, width / 2, height).extract()); } System.out.println(pdfTextList.toString()); }
From source file:com.joowon.returnA.classifier.ProblemClassifier.java
License:Open Source License
public static void main(String[] args) throws IOException { ClassifierCliParser cliParser = new ClassifierCliParser(args); cliParser.parse();// w w w. ja va 2s.com File sourceDirectory = new File(cliParser.getTarget()); File[] testList = sourceDirectory.listFiles(); // Parse problem datas assert testList != null; for (File pdfFile : testList) { try { File destinationDirectory = new File(pdfFile.getAbsolutePath() + "/problem"); destinationDirectory.mkdirs(); ProblemClassifier problemClassifier = new ProblemClassifier( PDDocument.load(new File(pdfFile.getAbsolutePath() + "/problem.pdf")), destinationDirectory.getAbsolutePath()); // Parse problem group String text = ProblemParser.removeExceptionalText(problemClassifier.getBodyText()); new TxtWriter(destinationDirectory.getAbsolutePath() + "/" + "test.txt").write(text); for (int i = 1; i <= 50; ++i) { String group = ProblemParser.parseProblemGroup(text, i); if (group.length() != 0) { String fileName = destinationDirectory.getAbsolutePath() + "/" + ProblemParser.parseProblemGroupName(group) + ".txt"; System.out.println(fileName); new TxtWriter(fileName).write(group); text = text.replace(group, ""); } } // Remove trash text text = ProblemParser.removeExceptionalText(text); // Parse problem for (int i = 1; i <= 50; ++i) { String problemText = ProblemParser.parseProblem(text, i); String fileName = destinationDirectory.getAbsolutePath() + "/" + ProblemParser.parseProblemName(problemText) + ".txt"; System.out.println(fileName); new TxtWriter(fileName).write(problemText); } problemClassifier.close(); } catch (Exception e) { e.printStackTrace(); } } // Put data into MongoDB new TxtToMongoTransfer(sourceDirectory.getAbsolutePath(), "localhost:" + MongoDbManager.DEFAULT_PORT) .transfer(); }
From source file:com.joowon.returnA.classifier.SolveClassifier.java
License:Open Source License
public static void main(String[] args) throws IOException { ClassifierCliParser cliParser = new ClassifierCliParser(args); cliParser.parse();/*from w w w . ja v a 2s.c o m*/ File sourceDirectory = new File(cliParser.getTarget()); File[] sourcePdfList = sourceDirectory.listFiles(); // Parse solve datas assert sourcePdfList != null; MongoDbManager mongoDbManager = MongoDbManager.getInstance("localhost:" + MongoDbManager.DEFAULT_PORT); for (File pdfFile : sourcePdfList) { try { File destinationDirectory = new File(pdfFile.getAbsolutePath() + "/answer"); destinationDirectory.mkdirs(); SolveClassifier problemClassifier = new SolveClassifier( PDDocument.load(new File(pdfFile.getAbsolutePath() + "/answer.pdf")), destinationDirectory.getAbsolutePath()); // Remove trash text String text = SolveParser.removeExceptionalText(problemClassifier.getBodyText()); String testName = destinationDirectory.getParentFile().getName(); for (Solve solve : SolveParser.parseAnswers(text)) { mongoDbManager.updateAnswer(testName, solve.problemNumber, solve.answer); } problemClassifier.close(); } catch (Exception e) { e.printStackTrace(); } } }
From source file:com.jt.tool.pdf.CreateBookmarks.java
License:Apache License
public static void createBookmark(String srcFile, String targetFile, String reg) throws Exception { PDDocument document = null;//from w w w . j a va 2 s . co m try { document = PDDocument.load(new File(srcFile)); if (document.isEncrypted()) { System.err.println("Error: Cannot add bookmarks to encrypted document."); System.exit(1); } PDDocumentOutline outline = new PDDocumentOutline(); document.getDocumentCatalog().setDocumentOutline(outline); PDOutlineItem pagesOutline = new PDOutlineItem(); pagesOutline.setTitle("All Pages"); // outline.appendChild(pagesOutline); List pages = new ArrayList(); // document.getDocumentCatalog().getAllPages(); for (int i = 12; i < pages.size(); i++) { String pageText = getPageText(document, i + 1, 0); String[] strings = matchTitle(pageText, reg); if (makeBookmark(strings)) { PDPage page = (PDPage) pages.get(i); PDPageFitWidthDestination dest = new PDPageFitWidthDestination(); dest.setPage(page); PDOutlineItem bookmark = new PDOutlineItem(); bookmark.setDestination(dest); bookmark.setTitle(strings[0]); // pagesOutline.appendChild(bookmark); System.out.println("add " + strings[0]); } } pagesOutline.openNode(); outline.openNode(); document.save(targetFile); } finally { if (document != null) { document.close(); } } }
From source file:com.jubination.backend.service.thyrocare.report.parallel.worker.PDFParserBox.java
/**
 * Downloads the PDF at the given URL and returns the text of all its pages.
 *
 * <p>The extracted text is also stored in the {@code Text} field. Both the
 * parsed document and the network stream are released before returning,
 * whether or not extraction succeeds.
 *
 * @param url
 *            location of the PDF to download
 * @return the text of pages 1 through the last page
 * @throws IOException
 *             if the URL cannot be opened or the PDF cannot be parsed
 */
public String ToText(String url) throws IOException {
    this.pdfStripper = null;
    this.pdDoc = null;
    java.io.InputStream pdfStream = null;
    try {
        pdfStream = new URL(url).openStream();
        pdDoc = PDDocument.load(pdfStream);
        pdfStripper = new PDFTextStripper();
        pdfStripper.setStartPage(1);
        pdfStripper.setEndPage(pdDoc.getNumberOfPages());
        Text = pdfStripper.getText(pdDoc);
        return Text;
    } finally {
        // Best-effort cleanup: a failure to close must not mask the result
        // or the original exception.
        try {
            if (pdDoc != null) {
                pdDoc.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        try {
            // The caller of load() owns the stream, so close it explicitly.
            if (pdfStream != null) {
                pdfStream.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
From source file:com.jubinationre.controller.PDFReportAPIController.java
public String ToText(String url) throws IOException { this.pdfStripper = null; this.pdDoc = null; pdDoc = PDDocument.load(new URL(url).openStream()); pdDoc.getClass();/*from w ww .jav a2 s . co m*/ pdfStripper = new PDFTextStripper() { @Override protected void processTextPosition(TextPosition text) { // if(text.getFont().getName().endsWith("Bold")){ super.processTextPosition(text); // } } }; // pdDoc.getNumberOfPages(); pdfStripper.setStartPage(1); //pdfStripper.setEndPage(10); // reading text from page 1 to 10 // if you want to get text from full pdf file use this code pdfStripper.setEndPage(pdDoc.getNumberOfPages()); Text = pdfStripper.getText(pdDoc); return Text; }
From source file:com.liferay.faces.bridge.test.integration.demo.JSFExportPDFPortletTester.java
License:Open Source License
private String getPDFText(InputStream inputStream) throws IOException { String text = ""; PDDocument pdDocument = null;//ww w.ja va2 s .c o m try { pdDocument = PDDocument.load(inputStream); PDFTextStripper pdfTextStripper = new PDFTextStripper(); text = pdfTextStripper.getText(pdDocument); } finally { ClosableUtil.close(pdDocument); ClosableUtil.close(inputStream); } return text; }