List of usage examples for org.apache.pdfbox.pdmodel PDDocument load
public static PDDocument load(byte[] input) throws IOException
From source file:extractor.Extractor.java
/**
 * Loads each listed PDF, extracts its text with {@code PDFTextStripper}, and wraps the text
 * in a {@code Document}.
 *
 * <p>Files that cannot be read are skipped: a {@code FileNotFoundException} is reported on
 * standard output, any other {@code IOException} is logged at {@code SEVERE}.
 *
 * @param pathBase prefix concatenated (without any separator) in front of every file name
 * @param files    file names to load, each relative to {@code pathBase}
 * @return the documents built from the files that were read successfully, in input order
 */
public static ArrayList<Document> returnDocuments(String pathBase, String[] files) {
    ArrayList<Document> documents = new ArrayList<>();
    for (String file : files) {
        try {
            String paperString;
            // try-with-resources closes the PDF even when text extraction throws;
            // previously close() was only reached on the success path.
            try (PDDocument pdDocument = PDDocument.load(new File(pathBase + file))) {
                paperString = new PDFTextStripper().getText(pdDocument);
            }
            // Built only after the PDF was closed cleanly, as before.
            documents.add(new Document(paperString));
        } catch (FileNotFoundException ex) {
            System.out.println("Arquivo no encontrado! Detalhes: " + ex.getLocalizedMessage());
        } catch (IOException ex) {
            Logger.getLogger(Classifierdoc.class.getName()).log(Level.SEVERE, null, ex);
        }
    }
    return documents;
}
From source file:extractor.pdftotext.PdfToText.java
private String getPdfBoxRaw(File file) { try {/*from w w w .j a va2 s . c o m*/ PDDocument doc = PDDocument.load(file); PDFTextStripper stripper = new PDFTextStripper(); stripper.setPageStart("PAGE START"); stripper.setPageEnd("PAGE END"); //gets the text form the doc and replaces unknown signs with \n String rawText = stripper.getText(doc).replaceAll("[\\p{Cc}\\p{Cf}\\p{Co}\\p{Cn}]", "\n"); doc.close(); return rawText; } catch (IOException ex) { Logger.getLogger(PdfToText.class.getName()).log(Level.SEVERE, null, ex); } return ""; }
From source file:FeatureExtraction.FeatureExtractorPDFStructuralPaths.java
@Override public Map ExtractFeaturesFrequencyFromSingleElement(T element) { Map<String, Integer> structuralPaths = new HashMap<>(); HashSet<COSBase> visitedObjects = new HashSet<>(); String filePath = (String) element; String parentPath = ""; String pdfObjectName = "Trailer"; File pdfFile = new File(filePath); try {/*from w w w .j a va 2s .co m*/ switch (m_parserType) { case Sequential: try (PDDocument pdf = PDDocument.load(pdfFile); COSDocument pdfDocument = pdf.getDocument()) { ExtractPDFStructuralPathsRecursively(pdfDocument.getTrailer().getCOSObject(), pdfObjectName, parentPath, structuralPaths, visitedObjects, 1); } catch (Exception e) { throw e; } break; case NonSequential: File randomAccessFile = new File(filePath + ".ra"); RandomAccess randomAccess = new RandomAccessFile(randomAccessFile, "rwd"); try (PDDocument pdf = PDDocument.loadNonSeq(pdfFile, randomAccess); COSDocument pdfDocument = pdf.getDocument()) { ExtractPDFStructuralPathsRecursively(pdfDocument.getTrailer().getCOSObject(), pdfObjectName, parentPath, structuralPaths, visitedObjects, 1); } catch (Exception e) { throw e; } finally { randomAccessFile.delete(); } break; } } catch (IOException e) { //AddPDFStructuralPath("General Parsing Error", structuralPaths); Console.PrintException(String.format("Error parsing PDF file: '%s'", filePath), e); } return structuralPaths; }
From source file:FeatureExtraction.FeatureExtractorPDFStructuralPaths.java
public boolean IsCompatiblePDF(File pdfFile) { try (PDDocument pdf = PDDocument.load(pdfFile)) { return true; } catch (Exception e) { return false; }// w w w .j a v a 2s .co m }
From source file:FeatureExtraction.FeatureExtractorPDFStructuralPaths.java
/**
 * Return true if the PDF is compatible, i.e. it can be loaded and its trailer can be passed
 * through {@code ExtractPDFStructuralPathsRecursively} without an {@code IOException}.
 *
 * <p>A failure while closing the document is reported via {@code Console.PrintException}
 * and does not change the result.
 *
 * @param filePath pdf file path
 * @return true if the PDF is compatible
 */
public boolean IsCompatiblePDF2(String filePath) {
    File pdfFile = new File(filePath);
    Map<String, Integer> structuralPaths = new HashMap<>();
    HashSet<COSBase> visitedObjects = new HashSet<>();
    String parentPath = "";
    String pdfObjectName = "Trailer";
    boolean compatible = true;

    // No placeholder PDDocument/COSDocument is created up front: those instances were
    // overwritten on a successful load and never closed.
    PDDocument pdf = null;
    try {
        pdf = PDDocument.load(pdfFile);
        COSDocument pdfDocument = pdf.getDocument();
        ExtractPDFStructuralPathsRecursively(pdfDocument.getTrailer().getCOSObject(), pdfObjectName,
                parentPath, structuralPaths, visitedObjects, 1);
    } catch (IOException e) {
        compatible = false;
    } finally {
        if (pdf != null) {
            try {
                // Closing the PDDocument also closes its underlying COSDocument.
                pdf.close();
            } catch (IOException e) {
                Console.PrintException(String.format("Error closing PDF file: '%s'", filePath), e);
            }
        }
    }
    return compatible;
}
From source file:FeatureExtraction.FeatureExtractorPDFStructuralPathsInputStream.java
@Override public Map ExtractFeaturesFrequencyFromSingleElement(T element) { Map<String, Integer> structuralPaths = new HashMap<>(); HashSet<COSBase> visitedObjects = new HashSet<>(); InputStream fileInputStream = (InputStream) element; String filePath = ""; try {/*w w w . ja v a 2s . c om*/ switch (m_parserType) { case Sequential: try (PDDocument pdf = PDDocument.load(fileInputStream)) { COSDocument pdfDocument = pdf.getDocument(); ExtractPDFStructuralPathsRecursively(pdfDocument.getTrailer().getCOSObject(), "Trailer", "", structuralPaths, visitedObjects); //ExtractPDFStructuralPathsQUEUE(pdfDocument.getTrailer().getCOSObject(), structuralPaths); } catch (Exception e) { Console.PrintException(String.format("Error parsing PDF file: '%s'", filePath), e); } break; case NonSequential: File randomAccessFile = new File(filePath + ".ra"); RandomAccess randomAccess = new RandomAccessFile(randomAccessFile, "rwd"); try (PDDocument pdf = PDDocument.loadNonSeq(fileInputStream, randomAccess)) { COSDocument pdfDocument = pdf.getDocument(); ExtractPDFStructuralPathsRecursively(pdfDocument.getTrailer().getCOSObject(), "Trailer", "", structuralPaths, visitedObjects); //ExtractPDFStructuralPathsQUEUE(pdfDocument.getTrailer().getCOSObject(), structuralPaths); } catch (Exception e) { Console.PrintException(String.format("Error parsing PDF file: '%s'", filePath), e); } finally { randomAccessFile.delete(); } break; } } catch (IOException e) { Console.PrintException(String.format("Error parsing PDF file: '%s'", filePath), e); } return structuralPaths; }
From source file:FeatureExtraction.FeatureExtractorPDFStructuralPathsNew.java
@Override public Map ExtractFeaturesFrequencyFromSingleElement(T element) { Map<String, Integer> structuralPaths = new HashMap<>(); HashSet<COSBase> visitedObjects = new HashSet<>(); String filePath = (String) element; String parentPath = ""; String pdfObjectName = "Trailer"; File pdfFile = new File(filePath); try {/* ww w . ja va 2 s .co m*/ switch (m_parserType) { case Sequential: try (PDDocument pdf = PDDocument.load(pdfFile); COSDocument pdfDocument = pdf.getDocument()) { ExtractPDFStructuralPathsRecursively(pdfDocument.getTrailer().getCOSObject(), pdfObjectName, parentPath, structuralPaths, visitedObjects, 1); } catch (Exception e) { throw e; } break; case NonSequential: File randomAccessFile = new File(filePath + ".ra"); RandomAccess randomAccess = new RandomAccessFile(randomAccessFile, "rwd"); try (PDDocument pdf = PDDocument.loadNonSeq(pdfFile, randomAccess); COSDocument pdfDocument = pdf.getDocument()) { ExtractPDFStructuralPathsRecursively(pdfDocument.getTrailer().getCOSObject(), pdfObjectName, parentPath, structuralPaths, visitedObjects, 1); } catch (Exception e) { throw e; } finally { randomAccessFile.delete(); } break; } } catch (IOException e) { //AddPDFStructuralPath("General Parsing Error", structuralPaths); //Canceled Console.PrintException(String.format("Error parsing PDF file: '%s'", filePath), e); } return structuralPaths; }
From source file:FeatureExtraction.FeatureExtractorPDFStructuralPathsNew.java
/**
 * Return true if the PDF is compatible, meaning {@code PDDocument.load} can open the file
 * and the document can be closed again without any exception.
 *
 * @param filePath pdf file path
 * @return true if the PDF is compatible
 */
public boolean IsCompatiblePDF(String filePath) {
    boolean compatible = false;
    try (PDDocument probe = PDDocument.load(new File(filePath))) {
        compatible = true;
    } catch (Exception e) {
        // A failure while loading or closing means the file is not usable.
        compatible = false;
    }
    return compatible;
}
From source file:FeatureExtraction.FeatureExtractorPDFStructuralPathsOld.java
@Override public Map ExtractFeaturesFrequencyFromSingleElement(T element) { Map<String, Integer> structuralPaths = new HashMap<>(); HashSet<COSBase> visitedObjects = new HashSet<>(); String filePath = (String) element; File pdfFile = new File(filePath); try {//from w ww. j a v a 2 s. c o m switch (m_parserType) { case Sequential: try (PDDocument pdf = PDDocument.load(pdfFile)) { COSDocument pdfDocument = pdf.getDocument(); ExtractPDFStructuralPathsRecursively(pdfDocument.getTrailer().getCOSObject(), "Trailer", "", structuralPaths, visitedObjects); } catch (Exception e) { Console.PrintException(String.format("Error parsing PDF file: '%s'", filePath), e); } break; case NonSequential: File randomAccessFile = new File(filePath + ".ra"); RandomAccess randomAccess = new RandomAccessFile(randomAccessFile, "rwd"); try (PDDocument pdf = PDDocument.loadNonSeq(pdfFile, randomAccess)) { COSDocument pdfDocument = pdf.getDocument(); ExtractPDFStructuralPathsRecursively(pdfDocument.getTrailer().getCOSObject(), "Trailer", "", structuralPaths, visitedObjects); } catch (Exception e) { Console.PrintException(String.format("Error parsing PDF file: '%s'", filePath), e); } finally { randomAccessFile.delete(); } break; } } catch (IOException e) { Console.PrintException(String.format("Error parsing PDF file: '%s'", filePath), e); } return structuralPaths; }
From source file:FeatureExtraction.FeatureExtractorPDFStructuralPathsOld.java
/**
 * Return true if the PDF is compatible, i.e. it can be loaded and its trailer can be passed
 * through {@code ExtractPDFStructuralPathsRecursively} without an {@code IOException}.
 *
 * <p>A failure while closing the document is reported via {@code Console.PrintException}
 * and does not change the result.
 *
 * @param filePath pdf file path
 * @return true if the PDF is compatible
 */
public boolean IsCompatiblePDF2(String filePath) {
    File pdfFile = new File(filePath);
    Map<String, Integer> structuralPaths = new HashMap<>();
    HashSet<COSBase> visitedObjects = new HashSet<>();
    boolean compatible = true;

    // No placeholder PDDocument/COSDocument is created up front: those instances were
    // overwritten on a successful load and never closed.
    PDDocument pdf = null;
    try {
        pdf = PDDocument.load(pdfFile);
        COSDocument pdfDocument = pdf.getDocument();
        ExtractPDFStructuralPathsRecursively(pdfDocument.getTrailer().getCOSObject(), "Trailer", "",
                structuralPaths, visitedObjects);
    } catch (IOException e) {
        compatible = false;
    } finally {
        if (pdf != null) {
            try {
                // Closing the PDDocument also closes its underlying COSDocument.
                pdf.close();
            } catch (IOException e) {
                Console.PrintException(String.format("Error closing PDF file: '%s'", filePath), e);
            }
        }
    }
    return compatible;
}