Example usage for org.apache.pdfbox.pdmodel PDDocument load

List of usage examples for org.apache.pdfbox.pdmodel PDDocument load

Introduction

On this page you can find usage examples for org.apache.pdfbox.pdmodel PDDocument load.

Prototype

public static PDDocument load(byte[] input) throws IOException 

Source Link

Document

Parses a PDF.

Usage

From source file:com.jgaap.generics.DocumentHelper.java

License:Open Source License

/**
 * Extracts the text of a PDF read from an input stream and returns it.
 * <p>
 * The parsed document is closed before this method returns, including when
 * text extraction fails. The supplied stream is not closed here.
 *
 * @param filesInputStream
 *            An input stream pointing to a PDF file.
 * @return the extracted text as a character array
 * @throws IOException
 *             if loading the PDF, extracting its text or closing the
 *             document fails
 */
static private char[] loadPDF(InputStream filesInputStream) throws IOException {
    PDDocument doc = PDDocument.load(filesInputStream);
    try {
        PDFTextStripper pdfStripper = new PDFTextStripper();
        pdfStripper.setSortByPosition(false);
        return pdfStripper.getText(doc).toCharArray();
    } finally {
        // Release the document even when the stripper throws.
        doc.close();
    }
}

From source file:com.joowon.returnA.classifier.EbsBookCrawler.java

License:Open Source License

/**
 * Extracts the text of every file in the "book" classpath resource folder
 * and writes one text file per book into the destination directory.
 * <p>
 * For each book the whole media box of every page is used as the extraction
 * region. The output file name is the book's name with ".pdf" replaced by
 * ".txt".
 *
 * @throws IOException
 *             if the book folder cannot be listed, or if loading or closing
 *             a PDF fails
 */
public void run() throws IOException {
    File destination = new File("/Users/Joowon/Desktop");
    File bookFolder = new File(getClass().getClassLoader().getResource("book").getFile());

    // listFiles() returns null when the path is not a directory or cannot be read.
    File[] books = bookFolder.listFiles();
    if (books == null) {
        throw new IOException("Cannot list book folder: " + bookFolder);
    }

    for (File book : books) {
        String outputName = destination.getPath() + "/" + book.getName().replace(".pdf", ".txt");
        PDDocument document = PDDocument.load(book);
        try {
            StringBuilder text = new StringBuilder();
            for (PDPage page : document.getPages()) {
                text.append(new PdfTextExtractor(page)
                        .addRegion(0, 0, (int) page.getMediaBox().getWidth(), (int) page.getMediaBox().getHeight())
                        .extract());
            }

            new TxtWriter(outputName).write(text.toString());
        } finally {
            // Close the document even when extraction or writing fails.
            document.close();
        }
    }
}

From source file:com.joowon.returnA.classifier.export.PdfImageExport.java

License:Open Source License

/**
 * Loads one sample PDF and passes it to {@code export} together with the
 * output directory and the "image" name argument.
 *
 * @param args
 *            unused
 * @throws IOException
 *             if loading, exporting or closing the document fails
 */
public static void main(String[] args) throws IOException {
    String sourcePdf = "/Users/Joowon/Documents/Github/ReturnA/data/tests/solve/eng_hsj_81Np7vXe.pdf";
    PDDocument document = PDDocument.load(new File(sourcePdf));
    try {
        export(document, "/Users/Joowon/Documents/Github/ReturnA", "image");
    } finally {
        // The document was previously never closed.
        document.close();
    }
}

From source file:com.joowon.returnA.classifier.extractor.PdfTextExtractor.java

License:Open Source License

/**
 * Loads a sample PDF, extracts text from several page regions and prints the
 * collected list to standard output.
 * <p>
 * The regions are, in order: the top quarter of the first page, then for
 * every page its left half followed by its right half. Region sizes are
 * derived from the media box of the first page.
 *
 * @param args
 *            unused
 * @throws IOException
 *             if loading or closing the document fails
 * @throws PrinterException
 *             declared by the original signature
 */
public static void main(String[] args) throws IOException, PrinterException {
    // Target PDF Document; try-with-resources closes it on every exit path.
    try (PDDocument document = PDDocument
            .load(new File("/Users/Joowon/Documents/Github/ReturnA/data/tests/YAPNXRPm_eng1_mun.pdf"))) {

        List<String> pdfTextList = new ArrayList<>();
        final int width = (int) document.getPage(0).getMediaBox().getWidth();
        final int height = (int) document.getPage(0).getMediaBox().getHeight();

        // Extract Test Information (first page's info area)
        pdfTextList.addAll(new PdfTextExtractor(document.getPage(0)).addRegion(0, 0, width, height / 4).extract());

        for (int i = 0; i < document.getNumberOfPages(); ++i) {
            // Left side
            pdfTextList
                    .addAll(new PdfTextExtractor(document.getPage(i)).addRegion(0, 0, width / 2, height).extract());

            // Right side
            pdfTextList.addAll(
                    new PdfTextExtractor(document.getPage(i)).addRegion(width / 2, 0, width / 2, height).extract());
        }
        System.out.println(pdfTextList.toString());
    }
}

From source file:com.joowon.returnA.classifier.ProblemClassifier.java

License:Open Source License

/**
 * Processes every entry of the target directory given on the command line.
 * <p>
 * For each entry, {@code <entry>/problem.pdf} is loaded and its body text is
 * written to {@code <entry>/problem/test.txt}. Problem groups 1..50 and then
 * problems 1..50 are parsed from that text and written to individual text
 * files in the same folder. A failure in one entry is printed and does not
 * stop the remaining entries. Finally the target directory is handed to
 * {@code TxtToMongoTransfer}.
 *
 * @param args
 *            command line arguments passed to {@code ClassifierCliParser}
 * @throws IOException
 *             if the target directory cannot be listed, or if it propagates
 *             from the final transfer step
 */
public static void main(String[] args) throws IOException {
    ClassifierCliParser cliParser = new ClassifierCliParser(args);
    cliParser.parse();

    File sourceDirectory = new File(cliParser.getTarget());
    File[] testList = sourceDirectory.listFiles();

    // Explicit check: an assert is skipped unless the JVM runs with -ea.
    if (testList == null) {
        throw new IOException("Cannot list target directory: " + sourceDirectory);
    }

    // Parse problem datas
    for (File pdfFile : testList) {
        try {
            File destinationDirectory = new File(pdfFile.getAbsolutePath() + "/problem");
            destinationDirectory.mkdirs();

            ProblemClassifier problemClassifier = new ProblemClassifier(
                    PDDocument.load(new File(pdfFile.getAbsolutePath() + "/problem.pdf")),
                    destinationDirectory.getAbsolutePath());
            try {
                // Parse problem group
                String text = ProblemParser.removeExceptionalText(problemClassifier.getBodyText());
                new TxtWriter(destinationDirectory.getAbsolutePath() + "/" + "test.txt").write(text);
                for (int i = 1; i <= 50; ++i) {
                    String group = ProblemParser.parseProblemGroup(text, i);
                    if (group.length() != 0) {
                        String fileName = destinationDirectory.getAbsolutePath() + "/"
                                + ProblemParser.parseProblemGroupName(group) + ".txt";
                        System.out.println(fileName);
                        new TxtWriter(fileName).write(group);
                        // Drop the group so it is not parsed again as individual problems.
                        text = text.replace(group, "");
                    }
                }

                // Remove trash text
                text = ProblemParser.removeExceptionalText(text);

                // Parse problem
                for (int i = 1; i <= 50; ++i) {
                    String problemText = ProblemParser.parseProblem(text, i);
                    String fileName = destinationDirectory.getAbsolutePath() + "/"
                            + ProblemParser.parseProblemName(problemText) + ".txt";
                    System.out.println(fileName);
                    new TxtWriter(fileName).write(problemText);
                }
            } finally {
                // Previously skipped whenever parsing or writing threw.
                problemClassifier.close();
            }
        } catch (Exception e) {
            // Best effort: report the failing entry and continue with the next one.
            e.printStackTrace();
        }
    }

    // Put data into MongoDB
    new TxtToMongoTransfer(sourceDirectory.getAbsolutePath(), "localhost:" + MongoDbManager.DEFAULT_PORT)
            .transfer();
}

From source file:com.joowon.returnA.classifier.SolveClassifier.java

License:Open Source License

/**
 * Processes every entry of the target directory given on the command line.
 * <p>
 * For each entry, {@code <entry>/answer.pdf} is loaded and its body text is
 * cleaned and parsed into answers. Each answer is stored through
 * {@code MongoDbManager.updateAnswer} under the entry's directory name. A
 * failure in one entry is printed and does not stop the remaining entries.
 *
 * @param args
 *            command line arguments passed to {@code ClassifierCliParser}
 * @throws IOException
 *             if the target directory cannot be listed
 */
public static void main(String[] args) throws IOException {
    ClassifierCliParser cliParser = new ClassifierCliParser(args);
    cliParser.parse();

    File sourceDirectory = new File(cliParser.getTarget());
    File[] sourcePdfList = sourceDirectory.listFiles();

    // Explicit check: an assert is skipped unless the JVM runs with -ea.
    if (sourcePdfList == null) {
        throw new IOException("Cannot list target directory: " + sourceDirectory);
    }

    // Parse solve datas
    MongoDbManager mongoDbManager = MongoDbManager.getInstance("localhost:" + MongoDbManager.DEFAULT_PORT);
    for (File pdfFile : sourcePdfList) {
        try {
            File destinationDirectory = new File(pdfFile.getAbsolutePath() + "/answer");
            destinationDirectory.mkdirs();

            SolveClassifier problemClassifier = new SolveClassifier(
                    PDDocument.load(new File(pdfFile.getAbsolutePath() + "/answer.pdf")),
                    destinationDirectory.getAbsolutePath());
            try {
                // Remove trash text
                String text = SolveParser.removeExceptionalText(problemClassifier.getBodyText());
                // The entry's own directory name identifies the test.
                String testName = destinationDirectory.getParentFile().getName();

                for (Solve solve : SolveParser.parseAnswers(text)) {
                    mongoDbManager.updateAnswer(testName, solve.problemNumber, solve.answer);
                }
            } finally {
                // Previously skipped whenever parsing or the database update threw.
                problemClassifier.close();
            }
        } catch (Exception e) {
            // Best effort: report the failing entry and continue with the next one.
            e.printStackTrace();
        }
    }
}

From source file:com.jt.tool.pdf.CreateBookmarks.java

License:Apache License

/**
 * Writes a copy of {@code srcFile} to {@code targetFile} with a new document
 * outline containing one "All Pages" node and a child bookmark for every
 * scanned page whose text yields a title.
 * <p>
 * A page gets a bookmark when {@code makeBookmark} accepts the result of
 * {@code matchTitle(pageText, reg)}. The first matched string becomes the
 * bookmark title and the bookmark uses a fit-width destination on that page.
 * <p>
 * Earlier revisions iterated over an always-empty page list and left the
 * outline calls commented out, so no bookmark was ever written. The page
 * access and outline calls below are the PDFBox 2.x API.
 *
 * @param srcFile
 *            path of the PDF to read
 * @param targetFile
 *            path the bookmarked PDF is saved to
 * @param reg
 *            pattern passed to {@code matchTitle}
 * @throws IllegalStateException
 *             if the source document is encrypted
 * @throws Exception
 *             if loading, text extraction or saving fails
 */
public static void createBookmark(String srcFile, String targetFile, String reg) throws Exception {
    PDDocument document = null;
    try {
        document = PDDocument.load(new File(srcFile));
        if (document.isEncrypted()) {
            // Fail with an exception rather than System.exit so the caller decides
            // how to react and the finally block still closes the document.
            throw new IllegalStateException("Cannot add bookmarks to encrypted document: " + srcFile);
        }
        PDDocumentOutline outline = new PDDocumentOutline();
        document.getDocumentCatalog().setDocumentOutline(outline);
        PDOutlineItem pagesOutline = new PDOutlineItem();
        pagesOutline.setTitle("All Pages");
        outline.addLast(pagesOutline);

        // NOTE(review): scanning has always started at index 12; presumably this
        // skips front matter - confirm against the intended input documents.
        int pageCount = document.getNumberOfPages();
        for (int i = 12; i < pageCount; i++) {
            // getPageText is called with i + 1, presumably a 1-based page number - confirm.
            String pageText = getPageText(document, i + 1, 0);
            String[] strings = matchTitle(pageText, reg);
            if (makeBookmark(strings)) {
                PDPageFitWidthDestination dest = new PDPageFitWidthDestination();
                dest.setPage(document.getPage(i));
                PDOutlineItem bookmark = new PDOutlineItem();
                bookmark.setDestination(dest);
                bookmark.setTitle(strings[0]);
                pagesOutline.addLast(bookmark);
                System.out.println("add " + strings[0]);
            }
        }
        pagesOutline.openNode();
        outline.openNode();
        document.save(targetFile);
    } finally {
        if (document != null) {
            document.close();
        }
    }
}

From source file:com.jubination.backend.service.thyrocare.report.parallel.worker.PDFParserBox.java

/**
 * Downloads the PDF at {@code url} and returns the text of all its pages.
 * <p>
 * The {@code pdDoc}, {@code pdfStripper} and {@code Text} fields are
 * overwritten on every call. Both the document and the URL stream are closed
 * before returning; a failure while closing is printed and otherwise ignored.
 *
 * @param url
 *            location of the PDF to read
 * @return the extracted text
 * @throws IOException
 *             if the URL cannot be opened or the PDF cannot be parsed
 */
public String ToText(String url) throws IOException {
    // Fully qualified so this block needs no additional import.
    java.io.InputStream pdfStream = null;
    try {
        this.pdfStripper = null;
        this.pdDoc = null;
        pdfStream = new URL(url).openStream();
        pdDoc = PDDocument.load(pdfStream);
        pdfStripper = new PDFTextStripper();
        pdfStripper.setStartPage(1);
        pdfStripper.setEndPage(pdDoc.getNumberOfPages());
        Text = pdfStripper.getText(pdDoc);

        return Text;
    } finally {
        try {
            if (pdDoc != null) {
                pdDoc.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        // The network stream was previously never closed.
        try {
            if (pdfStream != null) {
                pdfStream.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

From source file:com.jubinationre.controller.PDFReportAPIController.java

/**
 * Downloads the PDF at {@code url} and returns the text of all its pages.
 * <p>
 * The {@code pdDoc}, {@code pdfStripper} and {@code Text} fields are
 * overwritten on every call. The document and the URL stream are closed
 * before returning; previously neither was ever closed.
 *
 * @param url
 *            location of the PDF to read
 * @return the extracted text
 * @throws IOException
 *             if the URL cannot be opened, the PDF cannot be parsed, or a
 *             resource cannot be closed
 */
public String ToText(String url) throws IOException {
    this.pdfStripper = null;
    this.pdDoc = null;
    // Fully qualified so this block needs no additional import.
    java.io.InputStream pdfStream = new URL(url).openStream();
    try {
        pdDoc = PDDocument.load(pdfStream);

        pdfStripper = new PDFTextStripper();
        // Extract from the first page through the last page of the document.
        pdfStripper.setStartPage(1);
        pdfStripper.setEndPage(pdDoc.getNumberOfPages());

        Text = pdfStripper.getText(pdDoc);
        return Text;
    } finally {
        try {
            if (pdDoc != null) {
                pdDoc.close();
            }
        } finally {
            pdfStream.close();
        }
    }
}

From source file:com.liferay.faces.bridge.test.integration.demo.JSFExportPDFPortletTester.java

License:Open Source License

/**
 * Reads a PDF from the given stream and returns its text content.
 * <p>
 * The loaded document and then the input stream are passed to
 * {@code ClosableUtil.close} on every exit path.
 *
 * @param inputStream
 *            stream positioned at the start of a PDF
 * @return the text extracted by a default {@code PDFTextStripper}
 * @throws IOException
 *             if the PDF cannot be loaded or its text cannot be extracted
 */
private String getPDFText(InputStream inputStream) throws IOException {

    PDDocument loadedDocument = null;

    try {
        loadedDocument = PDDocument.load(inputStream);

        return new PDFTextStripper().getText(loadedDocument);
    } finally {
        ClosableUtil.close(loadedDocument);
        ClosableUtil.close(inputStream);
    }
}