Example usage for org.apache.pdfbox.pdmodel PDDocument load

List of usage examples for org.apache.pdfbox.pdmodel PDDocument load

Introduction

On this page you can find example usages of org.apache.pdfbox.pdmodel PDDocument load.

Prototype

public static PDDocument load(byte[] input) throws IOException 

Source Link

Document

Parses a PDF.

Usage

From source file:extractor.Extractor.java

/**
 * Loads each named PDF under {@code pathBase}, extracts its text and wraps it in a
 * {@code Document}.
 *
 * <p>Files that cannot be found or read are reported and skipped; they do not abort
 * the whole batch.
 *
 * @param pathBase prefix that is concatenated directly with each file name
 * @param files    file names, relative to {@code pathBase}
 * @return the documents that were extracted successfully, in input order
 */
public static ArrayList<Document> returnDocuments(String pathBase, String[] files) {

    ArrayList<Document> documents = new ArrayList<>();

    for (String file : files) {
        try {
            String paperString;
            // try-with-resources guarantees the PDF is closed even when text extraction fails.
            try (PDDocument pdDocument = PDDocument.load(new File(pathBase + file))) {
                paperString = new PDFTextStripper().getText(pdDocument);
            }
            // Only reached once the document has been closed without error.
            documents.add(new Document(paperString));
        } catch (FileNotFoundException ex) {
            System.out.println("Arquivo no encontrado! Detalhes: " + ex.getLocalizedMessage());
        } catch (IOException ex) {
            Logger.getLogger(Classifierdoc.class.getName()).log(Level.SEVERE, null, ex);
        }
    }

    return documents;
}

From source file:extractor.pdftotext.PdfToText.java

/**
 * Extracts the raw text of a PDF, marking page boundaries with "PAGE START" /
 * "PAGE END" and replacing control, format, private-use and unassigned characters
 * with newlines.
 *
 * @param file the PDF file to read
 * @return the extracted text, or an empty string when the file cannot be read
 */
private String getPdfBoxRaw(File file) {
    // try-with-resources closes the document even when text extraction throws.
    try (PDDocument doc = PDDocument.load(file)) {
        PDFTextStripper stripper = new PDFTextStripper();

        stripper.setPageStart("PAGE START");
        stripper.setPageEnd("PAGE END");
        // Gets the text from the doc and replaces unknown signs with \n.
        return stripper.getText(doc).replaceAll("[\\p{Cc}\\p{Cf}\\p{Co}\\p{Cn}]", "\n");
    } catch (IOException ex) {
        Logger.getLogger(PdfToText.class.getName()).log(Level.SEVERE, null, ex);
    }
    return "";
}

From source file:FeatureExtraction.FeatureExtractorPDFStructuralPaths.java

/**
 * Collects structural-path frequencies for a single PDF, starting the walk at the
 * document trailer.
 *
 * @param element the PDF's file path; it is cast to {@code String}
 * @return the map that the recursive walk filled; whatever was collected so far is
 *         returned when an {@code IOException} is caught and reported
 */
@Override
public Map ExtractFeaturesFrequencyFromSingleElement(T element) {
    final Map<String, Integer> pathCounts = new HashMap<>();
    final HashSet<COSBase> seen = new HashSet<>();
    final String sourcePath = (String) element;
    final File sourceFile = new File(sourcePath);
    try {
        switch (m_parserType) {
        case Sequential: {
            try (PDDocument loaded = PDDocument.load(sourceFile);
                    COSDocument cos = loaded.getDocument()) {
                ExtractPDFStructuralPathsRecursively(cos.getTrailer().getCOSObject(), "Trailer", "",
                        pathCounts, seen, 1);
            }
            break;
        }
        case NonSequential: {
            // Scratch file placed next to the source PDF; removed again in the finally clause.
            // NOTE(review): scratchAccess is not closed explicitly here - confirm the
            // document's close() releases it.
            File scratch = new File(sourcePath + ".ra");
            RandomAccess scratchAccess = new RandomAccessFile(scratch, "rwd");
            try (PDDocument loaded = PDDocument.loadNonSeq(sourceFile, scratchAccess);
                    COSDocument cos = loaded.getDocument()) {
                ExtractPDFStructuralPathsRecursively(cos.getTrailer().getCOSObject(), "Trailer", "",
                        pathCounts, seen, 1);
            } finally {
                scratch.delete();
            }
            break;
        }
        }
    } catch (IOException e) {
        Console.PrintException(String.format("Error parsing PDF file: '%s'", sourcePath), e);
    }
    return pathCounts;
}

From source file:FeatureExtraction.FeatureExtractorPDFStructuralPaths.java

/**
 * Reports whether the given file can be loaded as a PDF.
 *
 * @param pdfFile the candidate PDF file
 * @return true when loading and closing the document raised no exception
 */
public boolean IsCompatiblePDF(File pdfFile) {
    boolean loadable;
    // The document is opened only to probe it; it is closed again immediately.
    try (PDDocument probe = PDDocument.load(pdfFile)) {
        loadable = true;
    } catch (Exception e) {
        loadable = false;
    }
    return loadable;
}

From source file:FeatureExtraction.FeatureExtractorPDFStructuralPaths.java

/**
 * Return true if the PDF is compatible, i.e. it can be loaded and its structure can
 * be walked from the trailer without an {@code IOException}.
 *
 * <p>A failure while closing the document is reported but does not change the result.
 *
 * @param filePath pdf file path
 * @return true if the PDF is compatible
 */
public boolean IsCompatiblePDF2(String filePath) {
    File pdfFile = new File(filePath);
    Map<String, Integer> structuralPaths = new HashMap<>();
    HashSet<COSBase> visitedObjects = new HashSet<>();
    boolean compatible = true;
    // Start from null instead of placeholder instances: placeholders were orphaned
    // (never closed) as soon as load() succeeded.
    PDDocument pdf = null;
    try {
        pdf = PDDocument.load(pdfFile);
        COSDocument pdfDocument = pdf.getDocument();
        ExtractPDFStructuralPathsRecursively(pdfDocument.getTrailer().getCOSObject(), "Trailer", "",
                structuralPaths, visitedObjects, 1);
    } catch (IOException e) {
        compatible = false;
    } finally {
        if (pdf != null) {
            try {
                // Closing the PDDocument also closes its underlying COSDocument.
                pdf.close();
            } catch (IOException e) {
                Console.PrintException(String.format("Error closing PDF file: '%s'", filePath), e);
            }
        }
    }
    return compatible;
}

From source file:FeatureExtraction.FeatureExtractorPDFStructuralPathsInputStream.java

/**
 * Collects structural-path frequencies for a single PDF supplied as a stream,
 * starting the walk at the document trailer.
 *
 * @param element the PDF content; it is cast to {@code InputStream}
 * @return the map that the recursive walk filled; parse errors are reported and
 *         whatever was collected so far is returned
 */
@Override
public Map ExtractFeaturesFrequencyFromSingleElement(T element) {
    final Map<String, Integer> pathCounts = new HashMap<>();
    final HashSet<COSBase> seen = new HashSet<>();
    final InputStream pdfStream = (InputStream) element;
    // No path is known for a stream, so this stays empty in messages and scratch-file names.
    final String sourcePath = "";
    try {
        switch (m_parserType) {
        case Sequential: {
            try (PDDocument loaded = PDDocument.load(pdfStream)) {
                COSDocument cos = loaded.getDocument();
                ExtractPDFStructuralPathsRecursively(cos.getTrailer().getCOSObject(), "Trailer", "",
                        pathCounts, seen);
            } catch (Exception e) {
                Console.PrintException(String.format("Error parsing PDF file: '%s'", sourcePath), e);
            }
            break;
        }
        case NonSequential: {
            // Scratch file for the non-sequential parser; removed again in the finally clause.
            File scratch = new File(sourcePath + ".ra");
            RandomAccess scratchAccess = new RandomAccessFile(scratch, "rwd");
            try (PDDocument loaded = PDDocument.loadNonSeq(pdfStream, scratchAccess)) {
                COSDocument cos = loaded.getDocument();
                ExtractPDFStructuralPathsRecursively(cos.getTrailer().getCOSObject(), "Trailer", "",
                        pathCounts, seen);
            } catch (Exception e) {
                Console.PrintException(String.format("Error parsing PDF file: '%s'", sourcePath), e);
            } finally {
                scratch.delete();
            }
            break;
        }
        }
    } catch (IOException e) {
        Console.PrintException(String.format("Error parsing PDF file: '%s'", sourcePath), e);
    }
    return pathCounts;
}

From source file:FeatureExtraction.FeatureExtractorPDFStructuralPathsNew.java

/**
 * Collects structural-path frequencies for a single PDF, starting the walk at the
 * document trailer.
 *
 * @param element the PDF's file path; it is cast to {@code String}
 * @return the map that the recursive walk filled; whatever was collected so far is
 *         returned when an {@code IOException} is caught and reported
 */
@Override
public Map ExtractFeaturesFrequencyFromSingleElement(T element) {
    final Map<String, Integer> pathCounts = new HashMap<>();
    final HashSet<COSBase> seen = new HashSet<>();
    final String sourcePath = (String) element;
    final File sourceFile = new File(sourcePath);
    try {
        switch (m_parserType) {
        case Sequential: {
            try (PDDocument loaded = PDDocument.load(sourceFile);
                    COSDocument cos = loaded.getDocument()) {
                ExtractPDFStructuralPathsRecursively(cos.getTrailer().getCOSObject(), "Trailer", "",
                        pathCounts, seen, 1);
            }
            break;
        }
        case NonSequential: {
            // Scratch file placed next to the source PDF; removed again in the finally clause.
            // NOTE(review): scratchAccess is not closed explicitly here - confirm the
            // document's close() releases it.
            File scratch = new File(sourcePath + ".ra");
            RandomAccess scratchAccess = new RandomAccessFile(scratch, "rwd");
            try (PDDocument loaded = PDDocument.loadNonSeq(sourceFile, scratchAccess);
                    COSDocument cos = loaded.getDocument()) {
                ExtractPDFStructuralPathsRecursively(cos.getTrailer().getCOSObject(), "Trailer", "",
                        pathCounts, seen, 1);
            } finally {
                scratch.delete();
            }
            break;
        }
        }
    } catch (IOException e) {
        Console.PrintException(String.format("Error parsing PDF file: '%s'", sourcePath), e);
    }
    return pathCounts;
}

From source file:FeatureExtraction.FeatureExtractorPDFStructuralPathsNew.java

/**
 * Reports whether the file at the given path can be loaded as a PDF.
 *
 * @param filePath pdf file path
 * @return true when loading and closing the document raised no exception
 */
public boolean IsCompatiblePDF(String filePath) {
    boolean loadable;
    // The document is opened only to probe it; it is closed again immediately.
    try (PDDocument probe = PDDocument.load(new File(filePath))) {
        loadable = true;
    } catch (Exception e) {
        loadable = false;
    }
    return loadable;
}

From source file:FeatureExtraction.FeatureExtractorPDFStructuralPathsOld.java

/**
 * Collects structural-path frequencies for a single PDF, starting the walk at the
 * document trailer.
 *
 * @param element the PDF's file path; it is cast to {@code String}
 * @return the map that the recursive walk filled; parse errors are reported and
 *         whatever was collected so far is returned
 */
@Override
public Map ExtractFeaturesFrequencyFromSingleElement(T element) {
    final Map<String, Integer> pathCounts = new HashMap<>();
    final HashSet<COSBase> seen = new HashSet<>();
    final String sourcePath = (String) element;
    final File sourceFile = new File(sourcePath);
    try {
        switch (m_parserType) {
        case Sequential: {
            try (PDDocument loaded = PDDocument.load(sourceFile)) {
                COSDocument cos = loaded.getDocument();
                ExtractPDFStructuralPathsRecursively(cos.getTrailer().getCOSObject(), "Trailer", "",
                        pathCounts, seen);
            } catch (Exception e) {
                Console.PrintException(String.format("Error parsing PDF file: '%s'", sourcePath), e);
            }
            break;
        }
        case NonSequential: {
            // Scratch file placed next to the source PDF; removed again in the finally clause.
            File scratch = new File(sourcePath + ".ra");
            RandomAccess scratchAccess = new RandomAccessFile(scratch, "rwd");
            try (PDDocument loaded = PDDocument.loadNonSeq(sourceFile, scratchAccess)) {
                COSDocument cos = loaded.getDocument();
                ExtractPDFStructuralPathsRecursively(cos.getTrailer().getCOSObject(), "Trailer", "",
                        pathCounts, seen);
            } catch (Exception e) {
                Console.PrintException(String.format("Error parsing PDF file: '%s'", sourcePath), e);
            } finally {
                scratch.delete();
            }
            break;
        }
        }
    } catch (IOException e) {
        Console.PrintException(String.format("Error parsing PDF file: '%s'", sourcePath), e);
    }
    return pathCounts;
}

From source file:FeatureExtraction.FeatureExtractorPDFStructuralPathsOld.java

/**
 * Return true if the PDF is compatible, i.e. it can be loaded and its structure can
 * be walked from the trailer without an {@code IOException}.
 *
 * <p>A failure while closing the document is reported but does not change the result.
 *
 * @param filePath pdf file path
 * @return true if the PDF is compatible
 */
public boolean IsCompatiblePDF2(String filePath) {
    File pdfFile = new File(filePath);
    Map<String, Integer> structuralPaths = new HashMap<>();
    HashSet<COSBase> visitedObjects = new HashSet<>();
    boolean compatible = true;
    // Start from null instead of placeholder instances: placeholders were orphaned
    // (never closed) as soon as load() succeeded.
    PDDocument pdf = null;
    try {
        pdf = PDDocument.load(pdfFile);
        COSDocument pdfDocument = pdf.getDocument();
        ExtractPDFStructuralPathsRecursively(pdfDocument.getTrailer().getCOSObject(), "Trailer", "",
                structuralPaths, visitedObjects);
    } catch (IOException e) {
        compatible = false;
    } finally {
        if (pdf != null) {
            try {
                // Closing the PDDocument also closes its underlying COSDocument.
                pdf.close();
            } catch (IOException e) {
                Console.PrintException(String.format("Error closing PDF file: '%s'", filePath), e);
            }
        }
    }
    return compatible;
}