Example usage for org.apache.pdfbox.pdmodel PDDocument load

Introduction

This page collects usage examples for the load method of org.apache.pdfbox.pdmodel.PDDocument.

Prototype

public static PDDocument load(byte[] input) throws IOException

Source Link

Document

Parses a PDF.

Usage

From source file:com.jgaap.generics.DocumentHelper.java

License:Open Source License

/**
 * Extracts the text of a PDF read from an input stream and returns it as a
 * character array.
 *
 * <p>The parsed document is always closed before this method returns, even
 * when text extraction fails. The supplied stream is not closed here.
 *
 * @param filesInputStream
 *            An input stream pointing to a PDF file.
 * @return the text extracted from the PDF
 * @throws IOException
 *             if the PDF cannot be parsed or its text cannot be extracted
 */
static private char[] loadPDF(InputStream filesInputStream) throws IOException {
    PDDocument doc = PDDocument.load(filesInputStream);
    try {
        PDFTextStripper pdfStripper = new PDFTextStripper();
        // Keep the text in content-stream order rather than re-sorting by position.
        pdfStripper.setSortByPosition(false);
        return pdfStripper.getText(doc).toCharArray();
    } finally {
        // Release the document even if stripping threw.
        doc.close();
    }
}

From source file:com.joowon.returnA.classifier.EbsBookCrawler.java

License:Open Source License

/**
 * Converts every file in the classpath resource folder {@code book} to a text
 * file on the desktop, replacing {@code .pdf} with {@code .txt} in the file name.
 *
 * <p>For each book the full media box of every page is registered as an
 * extraction region and the extracted output of all pages is concatenated.
 * Each document is closed as soon as its text has been written, including
 * when extraction or writing fails.
 *
 * <p>NOTE(review): {@code getResource("book")} and {@code listFiles()} can
 * return {@code null} (missing resource / not a directory); that case still
 * surfaces as a NullPointerException, as before.
 *
 * @throws IOException if a PDF cannot be loaded or closed
 */
public void run() throws IOException {
    File destination = new File("/Users/Joowon/Desktop");
    File bookFolder = new File(getClass().getClassLoader().getResource("book").getFile());
    for (File book : bookFolder.listFiles()) {
        String outputName = destination.getPath() + "/" + book.getName().replace(".pdf", ".txt");
        try (PDDocument document = PDDocument.load(book)) {
            // StringBuilder avoids re-copying the accumulated text for every page.
            StringBuilder text = new StringBuilder();
            for (PDPage page : document.getPages()) {
                // Typed as Object so append(Object) yields the same String.valueOf
                // rendering that the former "+=" concatenation produced.
                Object pageText = new PdfTextExtractor(page)
                        .addRegion(0, 0, (int) page.getMediaBox().getWidth(), (int) page.getMediaBox().getHeight())
                        .extract();
                text.append(pageText);
            }

            new TxtWriter(outputName).write(text.toString());
        }
    }
}

From source file:com.joowon.returnA.classifier.export.PdfImageExport.java

License:Open Source License

/**
 * Developer entry point: loads one hard-coded sample PDF and passes it to
 * {@code export} with the repository root as target and {@code "image"} as
 * the third argument.
 *
 * <p>The document is closed when the export finishes or fails.
 *
 * @param args ignored
 * @throws IOException if the PDF cannot be loaded or the export fails
 */
public static void main(String[] args) throws IOException {
    String sourcePdf = "/Users/Joowon/Documents/Github/ReturnA/data/tests/solve/eng_hsj_81Np7vXe.pdf";
    try (PDDocument document = PDDocument.load(new File(sourcePdf))) {
        export(document, "/Users/Joowon/Documents/Github/ReturnA", "image");
    }
}

From source file:com.joowon.returnA.classifier.extractor.PdfTextExtractor.java

License:Open Source License

/**
 * Developer entry point: extracts text from a hard-coded sample PDF and
 * prints the collected list to standard output.
 *
 * <p>The first page's top quarter is extracted first, then the left half and
 * the right half of every page, in page order. Region sizes are derived from
 * the media box of the first page. The document is closed when extraction
 * finishes or fails.
 *
 * @param args ignored
 * @throws IOException if the PDF cannot be loaded or read
 * @throws PrinterException declared by the original signature
 */
public static void main(String[] args) throws IOException, PrinterException {
    // Target PDF Document
    try (PDDocument document = PDDocument
            .load(new File("/Users/Joowon/Documents/Github/ReturnA/data/tests/YAPNXRPm_eng1_mun.pdf"))) {

        List<String> pdfTextList = new ArrayList<>();
        final int width = (int) document.getPage(0).getMediaBox().getWidth();
        final int height = (int) document.getPage(0).getMediaBox().getHeight();

        // Extract test information (first page's info area: the top quarter)
        pdfTextList.addAll(new PdfTextExtractor(document.getPage(0)).addRegion(0, 0, width, height / 4).extract());

        for (int i = 0; i < document.getNumberOfPages(); ++i) {
            // Left side
            pdfTextList
                    .addAll(new PdfTextExtractor(document.getPage(i)).addRegion(0, 0, width / 2, height).extract());

            // Right side
            pdfTextList.addAll(
                    new PdfTextExtractor(document.getPage(i)).addRegion(width / 2, 0, width / 2, height).extract());
        }
        System.out.println(pdfTextList.toString());
    }
}

From source file:com.joowon.returnA.classifier.ProblemClassifier.java

License:Open Source License

/**
 * Command-line entry point: splits every test's {@code problem.pdf} into
 * per-group and per-problem text files, then transfers the result to MongoDB.
 *
 * <p>Each entry of the target directory is treated as a test directory that
 * contains {@code problem.pdf}; output goes to its {@code problem}
 * sub-directory. A failure in one test directory is printed and does not stop
 * the remaining ones. The classifier is closed for every directory, including
 * when parsing fails part-way.
 *
 * @param args command-line arguments, parsed by {@code ClassifierCliParser}
 * @throws IOException if the target directory cannot be listed, or as
 *             declared by the original signature
 */
public static void main(String[] args) throws IOException {
    ClassifierCliParser cliParser = new ClassifierCliParser(args);
    cliParser.parse();

    File sourceDirectory = new File(cliParser.getTarget());
    File[] testList = sourceDirectory.listFiles();

    // listFiles() returns null for a missing path or a non-directory; an assert
    // is skipped unless the JVM runs with -ea, so check explicitly.
    if (testList == null) {
        throw new IOException("Cannot list directory: " + sourceDirectory.getAbsolutePath());
    }

    // Parse problem datas
    for (File pdfFile : testList) {
        ProblemClassifier problemClassifier = null;
        try {
            File destinationDirectory = new File(pdfFile.getAbsolutePath() + "/problem");
            destinationDirectory.mkdirs();

            problemClassifier = new ProblemClassifier(
                    PDDocument.load(new File(pdfFile.getAbsolutePath() + "/problem.pdf")),
                    destinationDirectory.getAbsolutePath());

            // Parse problem group
            String text = ProblemParser.removeExceptionalText(problemClassifier.getBodyText());
            new TxtWriter(destinationDirectory.getAbsolutePath() + "/" + "test.txt").write(text);
            for (int i = 1; i <= 50; ++i) {
                String group = ProblemParser.parseProblemGroup(text, i);
                if (group.length() != 0) {
                    String fileName = destinationDirectory.getAbsolutePath() + "/"
                            + ProblemParser.parseProblemGroupName(group) + ".txt";
                    System.out.println(fileName);
                    new TxtWriter(fileName).write(group);
                    // Drop the group from the body so it is not parsed again as problems.
                    text = text.replace(group, "");
                }
            }

            // Remove trash text
            text = ProblemParser.removeExceptionalText(text);

            // Parse problem
            for (int i = 1; i <= 50; ++i) {
                String problemText = ProblemParser.parseProblem(text, i);
                String fileName = destinationDirectory.getAbsolutePath() + "/"
                        + ProblemParser.parseProblemName(problemText) + ".txt";
                System.out.println(fileName);
                new TxtWriter(fileName).write(problemText);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Close even when parsing failed, so the loaded PDF is not leaked.
            if (problemClassifier != null) {
                try {
                    problemClassifier.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
    }

    // Put data into MongoDB
    new TxtToMongoTransfer(sourceDirectory.getAbsolutePath(), "localhost:" + MongoDbManager.DEFAULT_PORT)
            .transfer();
}

From source file:com.joowon.returnA.classifier.SolveClassifier.java

License:Open Source License

/**
 * Command-line entry point: reads every test's {@code answer.pdf}, parses the
 * answers and stores them in MongoDB.
 *
 * <p>Each entry of the target directory is treated as a test directory that
 * contains {@code answer.pdf}; the directory name is used as the test name.
 * A failure in one test directory is printed and does not stop the remaining
 * ones. The classifier is closed for every directory, including when parsing
 * fails part-way.
 *
 * @param args command-line arguments, parsed by {@code ClassifierCliParser}
 * @throws IOException if the target directory cannot be listed, or as
 *             declared by the original signature
 */
public static void main(String[] args) throws IOException {
    ClassifierCliParser cliParser = new ClassifierCliParser(args);
    cliParser.parse();

    File sourceDirectory = new File(cliParser.getTarget());
    File[] sourcePdfList = sourceDirectory.listFiles();

    // listFiles() returns null for a missing path or a non-directory; an assert
    // is skipped unless the JVM runs with -ea, so check explicitly.
    if (sourcePdfList == null) {
        throw new IOException("Cannot list directory: " + sourceDirectory.getAbsolutePath());
    }

    // Parse solve datas
    MongoDbManager mongoDbManager = MongoDbManager.getInstance("localhost:" + MongoDbManager.DEFAULT_PORT);
    for (File pdfFile : sourcePdfList) {
        SolveClassifier problemClassifier = null;
        try {
            File destinationDirectory = new File(pdfFile.getAbsolutePath() + "/answer");
            destinationDirectory.mkdirs();

            problemClassifier = new SolveClassifier(
                    PDDocument.load(new File(pdfFile.getAbsolutePath() + "/answer.pdf")),
                    destinationDirectory.getAbsolutePath());

            // Remove trash text
            String text = SolveParser.removeExceptionalText(problemClassifier.getBodyText());
            // The test name is the name of the directory that holds answer.pdf.
            String testName = destinationDirectory.getParentFile().getName();

            for (Solve solve : SolveParser.parseAnswers(text)) {
                mongoDbManager.updateAnswer(testName, solve.problemNumber, solve.answer);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Close even when parsing failed, so the loaded PDF is not leaked.
            if (problemClassifier != null) {
                try {
                    problemClassifier.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
    }
}

From source file:com.jt.tool.pdf.CreateBookmarks.java

License:Apache License

/**
 * Adds an "All Pages" outline with one bookmark per matching page and saves
 * the result to {@code targetFile}.
 *
 * <p>Pages are scanned starting at index 12. For each page its text is
 * obtained with {@code getPageText}, matched against {@code reg} with
 * {@code matchTitle}, and a bookmark titled with the first match is added when
 * {@code makeBookmark} accepts the result. Bookmarks use a fit-width
 * destination pointing at the page.
 *
 * <p>Previously the page list was an empty placeholder and the calls that
 * attach items to the outline were commented out, so the saved file never
 * contained any bookmark. The pages are now taken from the document and the
 * items are attached with {@code addLast}.
 *
 * <p>NOTE(review): the start index 12 presumably skips front matter of one
 * specific book — confirm before reusing on other documents. An encrypted
 * source still terminates the JVM via {@code System.exit(1)}, as before.
 *
 * @param srcFile    path of the PDF to read
 * @param targetFile path the bookmarked PDF is saved to
 * @param reg        pattern passed to {@code matchTitle} to find page titles
 * @throws Exception if loading, text extraction or saving fails
 */
public static void createBookmark(String srcFile, String targetFile, String reg) throws Exception {
    PDDocument document = null;
    try {
        document = PDDocument.load(new File(srcFile));
        if (document.isEncrypted()) {
            System.err.println("Error: Cannot add bookmarks to encrypted document.");
            System.exit(1);
        }
        PDDocumentOutline outline = new PDDocumentOutline();
        document.getDocumentCatalog().setDocumentOutline(outline);
        PDOutlineItem pagesOutline = new PDOutlineItem();
        pagesOutline.setTitle("All Pages");
        outline.addLast(pagesOutline);

        int pageCount = document.getNumberOfPages();
        for (int i = 12; i < pageCount; i++) {
            // getPageText is called with i + 1 while pages are indexed from 0 here.
            String pageText = getPageText(document, i + 1, 0);
            String[] strings = matchTitle(pageText, reg);
            if (makeBookmark(strings)) {
                PDPage page = document.getPage(i);
                PDPageFitWidthDestination dest = new PDPageFitWidthDestination();
                dest.setPage(page);
                PDOutlineItem bookmark = new PDOutlineItem();
                bookmark.setDestination(dest);
                bookmark.setTitle(strings[0]);
                pagesOutline.addLast(bookmark);
                System.out.println("add " + strings[0]);
            }
        }
        pagesOutline.openNode();
        outline.openNode();
        document.save(targetFile);
    } finally {
        if (document != null) {
            document.close();
        }
    }
}

From source file:com.jubination.backend.service.thyrocare.report.parallel.worker.PDFParserBox.java

/**
 * Downloads the PDF at {@code url} and returns the text of all its pages.
 *
 * <p>The result is also stored in the {@code Text} field, and the
 * {@code pdDoc} and {@code pdfStripper} fields are reassigned on every call.
 * Both the document and the network stream are closed before returning;
 * failures while closing are printed and otherwise ignored so they do not
 * mask the extracted text or the original error.
 *
 * @param url location of the PDF to read
 * @return the extracted text
 * @throws IOException if the URL cannot be opened or the PDF cannot be parsed
 */
public String ToText(String url) throws IOException {
    this.pdfStripper = null;
    this.pdDoc = null;
    java.io.InputStream pdfStream = null;
    try {
        pdfStream = new URL(url).openStream();
        pdDoc = PDDocument.load(pdfStream);
        pdfStripper = new PDFTextStripper();
        // Extract every page, first to last.
        pdfStripper.setStartPage(1);
        pdfStripper.setEndPage(pdDoc.getNumberOfPages());
        Text = pdfStripper.getText(pdDoc);

        return Text;
    } finally {
        try {
            if (pdDoc != null) {
                pdDoc.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        // Close the stream explicitly rather than relying on the PDF loader to do it.
        try {
            if (pdfStream != null) {
                pdfStream.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

From source file:com.jubinationre.controller.PDFReportAPIController.java

/**
 * Downloads the PDF at {@code url} and returns the text of all its pages.
 *
 * <p>The result is also stored in the {@code Text} field, and the
 * {@code pdDoc} and {@code pdfStripper} fields are reassigned on every call.
 * The document and the network stream, which were previously left open, are
 * now closed before returning; failures while closing are printed and
 * otherwise ignored so they do not mask the extracted text or the original
 * error.
 *
 * @param url location of the PDF to read
 * @return the extracted text
 * @throws IOException if the URL cannot be opened or the PDF cannot be parsed
 */
public String ToText(String url) throws IOException {
    this.pdfStripper = null;
    this.pdDoc = null;
    java.io.InputStream pdfStream = null;
    try {
        pdfStream = new URL(url).openStream();
        pdDoc = PDDocument.load(pdfStream);

        pdfStripper = new PDFTextStripper();
        // Extract the full document: first page through last page.
        pdfStripper.setStartPage(1);
        pdfStripper.setEndPage(pdDoc.getNumberOfPages());

        Text = pdfStripper.getText(pdDoc);
        return Text;
    } finally {
        try {
            if (pdDoc != null) {
                pdDoc.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        // Close the stream explicitly rather than relying on the PDF loader to do it.
        try {
            if (pdfStream != null) {
                pdfStream.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

From source file:com.liferay.faces.bridge.test.integration.demo.JSFExportPDFPortletTester.java

License:Open Source License

/**
 * Reads a PDF from the given stream and returns its text content.
 *
 * <p>Both the parsed document and the supplied stream are handed to
 * {@code ClosableUtil.close} afterwards, whether or not extraction succeeded.
 *
 * @param inputStream stream positioned at the start of a PDF
 * @return the text extracted from the PDF
 * @throws IOException if the PDF cannot be parsed or its text cannot be extracted
 */
private String getPDFText(InputStream inputStream) throws IOException {

    PDDocument loadedDocument = null;

    try {
        loadedDocument = PDDocument.load(inputStream);

        return new PDFTextStripper().getText(loadedDocument);
    } finally {
        // Document first, then the stream it was read from.
        ClosableUtil.close(loadedDocument);
        ClosableUtil.close(inputStream);
    }
}