Example usage for org.apache.pdfbox.pdmodel PDDocument load

Introduction

On this page you can find example usages of the load method of org.apache.pdfbox.pdmodel.PDDocument.

Prototype

public static PDDocument load(byte[] input) throws IOException

Source Link

Document

Parses a PDF.

Usage

From source file:extractor.Extractor.java

/**
 * Extracts the text of every listed PDF file and wraps each result in a {@code Document}.
 *
 * <p>A file that cannot be found is reported on standard output; any other I/O failure is
 * logged at SEVERE level. In both cases the file is skipped and processing continues with
 * the next one.
 *
 * @param pathBase prefix concatenated verbatim in front of every entry of {@code files}
 * @param files    names of the PDF files to read
 * @return one {@code Document} per file that was loaded, stripped and closed without error,
 *         in the order the files were given
 */
public static ArrayList<Document> returnDocuments(String pathBase, String[] files) {

    ArrayList<Document> documents = new ArrayList<>();

    for (String file : files) {
        String paperString;
        // try-with-resources closes the PDF even when text extraction throws,
        // which the previous explicit close() on the success path did not.
        try (PDDocument pdDocument = PDDocument.load(new File(pathBase + file))) {
            paperString = new PDFTextStripper().getText(pdDocument);
        } catch (FileNotFoundException ex) {
            System.out.println("Arquivo no encontrado! Detalhes: " + ex.getLocalizedMessage());
            continue;
        } catch (IOException ex) {
            Logger.getLogger(Classifierdoc.class.getName()).log(Level.SEVERE, null, ex);
            continue;
        }
        // Only reached when load, extraction and close all succeeded.
        documents.add(new Document(paperString));
    }

    return documents;
}

From source file:extractor.pdftotext.PdfToText.java

/**
 * Extracts the text of a PDF with PDFBox, delimiting pages with the markers
 * {@code "PAGE START"} and {@code "PAGE END"}.
 *
 * <p>Every character in the Unicode categories Cc (control), Cf (format), Co (private use)
 * and Cn (unassigned) is replaced by a newline in the returned text.
 *
 * @param file the PDF file to read
 * @return the extracted text, or an empty string when an {@link IOException} occurred
 *         (the exception is logged at SEVERE level)
 */
private String getPdfBoxRaw(File file) {
    // try-with-resources guarantees the document is closed even if getText() fails.
    try (PDDocument doc = PDDocument.load(file)) {
        PDFTextStripper stripper = new PDFTextStripper();

        stripper.setPageStart("PAGE START");
        stripper.setPageEnd("PAGE END");
        // Get the text from the document and replace unknown signs with \n.
        return stripper.getText(doc).replaceAll("[\\p{Cc}\\p{Cf}\\p{Co}\\p{Cn}]", "\n");
    } catch (IOException ex) {
        Logger.getLogger(PdfToText.class.getName()).log(Level.SEVERE, null, ex);
    }
    return "";
}

From source file:FeatureExtraction.FeatureExtractorPDFStructuralPaths.java

/**
 * Walks a PDF file starting at its trailer object and gathers structural paths.
 *
 * <p>The file is opened with {@code PDDocument.load} or {@code PDDocument.loadNonSeq},
 * depending on {@code m_parserType}. The non-sequential branch works on a scratch file
 * named {@code <filePath>.ra}, which is deleted once parsing has finished. An
 * {@link IOException} is reported through {@code Console.PrintException}; whatever the
 * map holds at that point is still returned.
 *
 * @param element the path of the PDF file; cast to {@code String}
 * @return the map handed to {@code ExtractPDFStructuralPathsRecursively}
 *         (presumably path to occurrence count; confirm against that helper)
 */
@Override
public Map ExtractFeaturesFrequencyFromSingleElement(T element) {
    final String pdfPath = (String) element;
    final File source = new File(pdfPath);
    final String rootName = "Trailer";
    final String rootParent = "";
    final Map<String, Integer> pathCounts = new HashMap<>();
    final HashSet<COSBase> seen = new HashSet<>();
    try {
        switch (m_parserType) {
        case Sequential:
            try (PDDocument document = PDDocument.load(source);
                    COSDocument cos = document.getDocument()) {
                ExtractPDFStructuralPathsRecursively(cos.getTrailer().getCOSObject(), rootName, rootParent,
                        pathCounts, seen, 1);
            }
            break;
        case NonSequential:
            File scratch = new File(pdfPath + ".ra");
            RandomAccess scratchAccess = new RandomAccessFile(scratch, "rwd");
            try (PDDocument document = PDDocument.loadNonSeq(source, scratchAccess);
                    COSDocument cos = document.getDocument()) {
                ExtractPDFStructuralPathsRecursively(cos.getTrailer().getCOSObject(), rootName, rootParent,
                        pathCounts, seen, 1);
            } finally {
                // The scratch file is only needed while the document is open.
                scratch.delete();
            }
            break;
        }
    } catch (IOException e) {
        Console.PrintException(String.format("Error parsing PDF file: '%s'", pdfPath), e);
    }
    return pathCounts;
}

From source file:FeatureExtraction.FeatureExtractorPDFStructuralPaths.java

/**
 * Reports whether PDFBox can open and close the given file without throwing.
 *
 * @param pdfFile the candidate PDF file
 * @return true if loading and closing the document raised no exception, false otherwise
 */
public boolean IsCompatiblePDF(File pdfFile) {
    boolean loadable;
    try (PDDocument document = PDDocument.load(pdfFile)) {
        loadable = true;
    } catch (Exception e) {
        // Any failure, including one raised while closing, marks the file as incompatible.
        loadable = false;
    }
    return loadable;
}

From source file:FeatureExtraction.FeatureExtractorPDFStructuralPaths.java

/**
 * Return true if the PDF is compatible, i.e. it can be loaded and its structure can be
 * walked from the trailer object without an {@link IOException}.
 *
 * <p>A failure while closing the document is reported through
 * {@code Console.PrintException} and does not change the result.
 *
 * @param filePath pdf file path
 * @return true if the PDF is compatible
 */
public boolean IsCompatiblePDF2(String filePath) {
    File pdfFile = new File(filePath);
    Map<String, Integer> structuralPaths = new HashMap<>();
    HashSet<COSBase> visitedObjects = new HashSet<>();
    boolean compatible = true;
    // Start from null instead of placeholder documents: the placeholders were overwritten
    // on a successful load and therefore never closed.
    PDDocument pdf = null;
    COSDocument pdfDocument = null;
    String parentPath = "";
    String pdfObjectName = "Trailer";
    try {
        pdf = PDDocument.load(pdfFile);
        pdfDocument = pdf.getDocument();
        ExtractPDFStructuralPathsRecursively(pdfDocument.getTrailer().getCOSObject(), pdfObjectName, parentPath,
                structuralPaths, visitedObjects, 1);
    } catch (IOException e) {
        compatible = false;
    } finally {
        try {
            // Close only what was actually opened.
            if (pdf != null) {
                pdf.close();
            }
            if (pdfDocument != null) {
                pdfDocument.close();
            }
        } catch (IOException e) {
            Console.PrintException(String.format("Error closing PDF file: '%s'", filePath), e);
        }
    }
    return compatible;
}

From source file:FeatureExtraction.FeatureExtractorPDFStructuralPathsInputStream.java

/**
 * Walks a PDF supplied as a stream, starting at its trailer object, and gathers
 * structural paths.
 *
 * <p>The stream is parsed with {@code PDDocument.load} or {@code PDDocument.loadNonSeq},
 * depending on {@code m_parserType}. Because no file path is known here, messages carry an
 * empty path and the non-sequential scratch file is simply named {@code .ra}; it is deleted
 * after parsing. Every failure is reported through {@code Console.PrintException}.
 *
 * @param element the PDF content; cast to {@code InputStream}
 * @return the map handed to {@code ExtractPDFStructuralPathsRecursively}
 *         (presumably path to occurrence count; confirm against that helper)
 */
@Override
public Map ExtractFeaturesFrequencyFromSingleElement(T element) {
    final InputStream pdfStream = (InputStream) element;
    final String pdfPath = "";
    final Map<String, Integer> pathCounts = new HashMap<>();
    final HashSet<COSBase> seen = new HashSet<>();
    try {
        switch (m_parserType) {
        case Sequential:
            try (PDDocument document = PDDocument.load(pdfStream)) {
                ExtractPDFStructuralPathsRecursively(document.getDocument().getTrailer().getCOSObject(),
                        "Trailer", "", pathCounts, seen);
            } catch (Exception e) {
                Console.PrintException(String.format("Error parsing PDF file: '%s'", pdfPath), e);
            }
            break;
        case NonSequential:
            File scratch = new File(pdfPath + ".ra");
            RandomAccess scratchAccess = new RandomAccessFile(scratch, "rwd");
            try (PDDocument document = PDDocument.loadNonSeq(pdfStream, scratchAccess)) {
                ExtractPDFStructuralPathsRecursively(document.getDocument().getTrailer().getCOSObject(),
                        "Trailer", "", pathCounts, seen);
            } catch (Exception e) {
                Console.PrintException(String.format("Error parsing PDF file: '%s'", pdfPath), e);
            } finally {
                // The scratch file is only needed while the document is open.
                scratch.delete();
            }
            break;
        }
    } catch (IOException e) {
        Console.PrintException(String.format("Error parsing PDF file: '%s'", pdfPath), e);
    }
    return pathCounts;
}

From source file:FeatureExtraction.FeatureExtractorPDFStructuralPathsNew.java

/**
 * Gathers the structural paths of the PDF file named by {@code element}, beginning at the
 * trailer object.
 *
 * <p>{@code m_parserType} selects between {@code PDDocument.load} and
 * {@code PDDocument.loadNonSeq}; the latter uses a scratch file {@code <filePath>.ra}
 * that is removed afterwards. An {@link IOException} is reported through
 * {@code Console.PrintException} and the map is returned as it stands.
 *
 * @param element the path of the PDF file; cast to {@code String}
 * @return the map handed to {@code ExtractPDFStructuralPathsRecursively}
 *         (presumably path to occurrence count; confirm against that helper)
 */
@Override
public Map ExtractFeaturesFrequencyFromSingleElement(T element) {
    final Map<String, Integer> collected = new HashMap<>();
    final HashSet<COSBase> alreadyVisited = new HashSet<>();
    final String location = (String) element;
    final File input = new File(location);
    final String startName = "Trailer";
    final String startParent = "";
    try {
        switch (m_parserType) {
        case Sequential:
            try (PDDocument loaded = PDDocument.load(input); COSDocument raw = loaded.getDocument()) {
                ExtractPDFStructuralPathsRecursively(raw.getTrailer().getCOSObject(), startName,
                        startParent, collected, alreadyVisited, 1);
            }
            break;
        case NonSequential:
            File backingFile = new File(location + ".ra");
            RandomAccess backing = new RandomAccessFile(backingFile, "rwd");
            try (PDDocument loaded = PDDocument.loadNonSeq(input, backing);
                    COSDocument raw = loaded.getDocument()) {
                ExtractPDFStructuralPathsRecursively(raw.getTrailer().getCOSObject(), startName,
                        startParent, collected, alreadyVisited, 1);
            } finally {
                // Remove the temporary backing file whether or not parsing succeeded.
                backingFile.delete();
            }
            break;
        }
    } catch (IOException e) {
        Console.PrintException(String.format("Error parsing PDF file: '%s'", location), e);
    }
    return collected;
}

From source file:FeatureExtraction.FeatureExtractorPDFStructuralPathsNew.java

/**
 * Reports whether PDFBox can open and close the file at the given path without throwing.
 *
 * @param filePath pdf file path
 * @return true if loading and closing the document raised no exception, false otherwise
 */
public boolean IsCompatiblePDF(String filePath) {
    boolean loadable;
    try (PDDocument document = PDDocument.load(new File(filePath))) {
        loadable = true;
    } catch (Exception e) {
        // Any failure, including one raised while closing, marks the file as incompatible.
        loadable = false;
    }
    return loadable;
}

From source file:FeatureExtraction.FeatureExtractorPDFStructuralPathsOld.java

/**
 * Gathers the structural paths of the PDF file named by {@code element}, beginning at the
 * trailer object.
 *
 * <p>{@code m_parserType} selects between {@code PDDocument.load} and
 * {@code PDDocument.loadNonSeq}; the latter uses a scratch file {@code <filePath>.ra}
 * that is removed afterwards. Every failure is reported through
 * {@code Console.PrintException} and the map is returned as it stands.
 *
 * @param element the path of the PDF file; cast to {@code String}
 * @return the map handed to {@code ExtractPDFStructuralPathsRecursively}
 *         (presumably path to occurrence count; confirm against that helper)
 */
@Override
public Map ExtractFeaturesFrequencyFromSingleElement(T element) {
    final String pdfPath = (String) element;
    final File source = new File(pdfPath);
    final Map<String, Integer> pathCounts = new HashMap<>();
    final HashSet<COSBase> seen = new HashSet<>();
    try {
        switch (m_parserType) {
        case Sequential:
            try (PDDocument document = PDDocument.load(source)) {
                ExtractPDFStructuralPathsRecursively(document.getDocument().getTrailer().getCOSObject(),
                        "Trailer", "", pathCounts, seen);
            } catch (Exception e) {
                Console.PrintException(String.format("Error parsing PDF file: '%s'", pdfPath), e);
            }
            break;
        case NonSequential:
            File scratch = new File(pdfPath + ".ra");
            RandomAccess scratchAccess = new RandomAccessFile(scratch, "rwd");
            try (PDDocument document = PDDocument.loadNonSeq(source, scratchAccess)) {
                ExtractPDFStructuralPathsRecursively(document.getDocument().getTrailer().getCOSObject(),
                        "Trailer", "", pathCounts, seen);
            } catch (Exception e) {
                Console.PrintException(String.format("Error parsing PDF file: '%s'", pdfPath), e);
            } finally {
                // The scratch file is only needed while the document is open.
                scratch.delete();
            }
            break;
        }
    } catch (IOException e) {
        Console.PrintException(String.format("Error parsing PDF file: '%s'", pdfPath), e);
    }
    return pathCounts;
}

From source file:FeatureExtraction.FeatureExtractorPDFStructuralPathsOld.java

/**
 * Return true if the PDF is compatible, i.e. it can be loaded and its structure can be
 * walked from the trailer object without an {@link IOException}.
 *
 * <p>A failure while closing the document is reported through
 * {@code Console.PrintException} and does not change the result.
 *
 * @param filePath pdf file path
 * @return true if the PDF is compatible
 */
public boolean IsCompatiblePDF2(String filePath) {
    File pdfFile = new File(filePath);
    Map<String, Integer> structuralPaths = new HashMap<>();
    HashSet<COSBase> visitedObjects = new HashSet<>();
    boolean compatible = true;
    // Start from null instead of placeholder documents: the placeholders were overwritten
    // on a successful load and therefore never closed.
    PDDocument pdf = null;
    COSDocument pdfDocument = null;
    try {
        pdf = PDDocument.load(pdfFile);
        pdfDocument = pdf.getDocument();
        ExtractPDFStructuralPathsRecursively(pdfDocument.getTrailer().getCOSObject(), "Trailer", "",
                structuralPaths, visitedObjects);
    } catch (IOException e) {
        compatible = false;
    } finally {
        try {
            // Close only what was actually opened.
            if (pdf != null) {
                pdf.close();
            }
            if (pdfDocument != null) {
                pdfDocument.close();
            }
        } catch (IOException e) {
            Console.PrintException(String.format("Error closing PDF file: '%s'", filePath), e);
        }
    }
    return compatible;
}