List of usage examples for the org.apache.pdfbox.io.RandomAccessFile constructor RandomAccessFile(File, String)
public RandomAccessFile(File file, String mode) throws FileNotFoundException
From source file:data.PDFManager.java
/**
 * Extracts the text of every page of the PDF file located at {@code filePath}.
 *
 * <p>The file is opened read-only and parsed, and the text of pages 1 through the last
 * page is extracted. The file handle and the parsed document are closed before this
 * method returns, whether it succeeds or fails, so no file handle is leaked.
 *
 * <p>NOTE(review): the {@code pdDoc} and {@code cosDoc} fields are still assigned, but the
 * objects they reference are closed once this method returns. Confirm that no code
 * outside this method reads them afterwards.
 *
 * @return the text content of the PDF
 * @throws IOException if the file cannot be opened, parsed, or read
 */
public String ToText() throws IOException {
    this.pdfStripper = null;
    this.pdDoc = null;
    this.cosDoc = null;

    file = new File(filePath);
    try (RandomAccessFile source = new RandomAccessFile(file, "r")) {
        parser = new PDFParser(source);
        parser.parse();
        // Closing the PDDocument also closes the COSDocument it wraps; the COSDocument is
        // listed as its own resource so it is released even if wrapping it fails.
        try (COSDocument parsedDocument = parser.getDocument();
                PDDocument document = new PDDocument(parsedDocument)) {
            cosDoc = parsedDocument;
            pdDoc = document;
            pdfStripper = new PDFTextStripper();
            pdfStripper.setStartPage(1);
            pdfStripper.setEndPage(document.getNumberOfPages());
            Text = pdfStripper.getText(document);
            return Text;
        }
    }
}
From source file:edu.esprit.filereader.PdfReader.java
public String ToText() throws IOException { this.pdfStripper = null; this.pdDoc = null; this.cosDoc = null; file = new File(filePath); parser = new PDFParser(new RandomAccessFile(file, "r")); parser.parse();//from w w w . j a va2s .co m cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); pdDoc.getNumberOfPages(); pdfStripper.setStartPage(1); pdfStripper.setEndPage(10); // reading text from page 1 to 10 // if you want to get text from full pdf file use this code // pdfStripper.setEndPage(pdDoc.getNumberOfPages()); Text = pdfStripper.getText(pdDoc); return Text; }
From source file:FeatureExtraction.FeatureExtractorPDFStructuralPaths.java
@Override public Map ExtractFeaturesFrequencyFromSingleElement(T element) { Map<String, Integer> structuralPaths = new HashMap<>(); HashSet<COSBase> visitedObjects = new HashSet<>(); String filePath = (String) element; String parentPath = ""; String pdfObjectName = "Trailer"; File pdfFile = new File(filePath); try {/* w ww .j av a2 s . c o m*/ switch (m_parserType) { case Sequential: try (PDDocument pdf = PDDocument.load(pdfFile); COSDocument pdfDocument = pdf.getDocument()) { ExtractPDFStructuralPathsRecursively(pdfDocument.getTrailer().getCOSObject(), pdfObjectName, parentPath, structuralPaths, visitedObjects, 1); } catch (Exception e) { throw e; } break; case NonSequential: File randomAccessFile = new File(filePath + ".ra"); RandomAccess randomAccess = new RandomAccessFile(randomAccessFile, "rwd"); try (PDDocument pdf = PDDocument.loadNonSeq(pdfFile, randomAccess); COSDocument pdfDocument = pdf.getDocument()) { ExtractPDFStructuralPathsRecursively(pdfDocument.getTrailer().getCOSObject(), pdfObjectName, parentPath, structuralPaths, visitedObjects, 1); } catch (Exception e) { throw e; } finally { randomAccessFile.delete(); } break; } } catch (IOException e) { //AddPDFStructuralPath("General Parsing Error", structuralPaths); Console.PrintException(String.format("Error parsing PDF file: '%s'", filePath), e); } return structuralPaths; }
From source file:FeatureExtraction.FeatureExtractorPDFStructuralPathsInputStream.java
@Override public Map ExtractFeaturesFrequencyFromSingleElement(T element) { Map<String, Integer> structuralPaths = new HashMap<>(); HashSet<COSBase> visitedObjects = new HashSet<>(); InputStream fileInputStream = (InputStream) element; String filePath = ""; try {//from w ww. ja v a 2s . com switch (m_parserType) { case Sequential: try (PDDocument pdf = PDDocument.load(fileInputStream)) { COSDocument pdfDocument = pdf.getDocument(); ExtractPDFStructuralPathsRecursively(pdfDocument.getTrailer().getCOSObject(), "Trailer", "", structuralPaths, visitedObjects); //ExtractPDFStructuralPathsQUEUE(pdfDocument.getTrailer().getCOSObject(), structuralPaths); } catch (Exception e) { Console.PrintException(String.format("Error parsing PDF file: '%s'", filePath), e); } break; case NonSequential: File randomAccessFile = new File(filePath + ".ra"); RandomAccess randomAccess = new RandomAccessFile(randomAccessFile, "rwd"); try (PDDocument pdf = PDDocument.loadNonSeq(fileInputStream, randomAccess)) { COSDocument pdfDocument = pdf.getDocument(); ExtractPDFStructuralPathsRecursively(pdfDocument.getTrailer().getCOSObject(), "Trailer", "", structuralPaths, visitedObjects); //ExtractPDFStructuralPathsQUEUE(pdfDocument.getTrailer().getCOSObject(), structuralPaths); } catch (Exception e) { Console.PrintException(String.format("Error parsing PDF file: '%s'", filePath), e); } finally { randomAccessFile.delete(); } break; } } catch (IOException e) { Console.PrintException(String.format("Error parsing PDF file: '%s'", filePath), e); } return structuralPaths; }
From source file:FeatureExtraction.FeatureExtractorPDFStructuralPathsNew.java
@Override public Map ExtractFeaturesFrequencyFromSingleElement(T element) { Map<String, Integer> structuralPaths = new HashMap<>(); HashSet<COSBase> visitedObjects = new HashSet<>(); String filePath = (String) element; String parentPath = ""; String pdfObjectName = "Trailer"; File pdfFile = new File(filePath); try {/*w w w. ja v a 2s .com*/ switch (m_parserType) { case Sequential: try (PDDocument pdf = PDDocument.load(pdfFile); COSDocument pdfDocument = pdf.getDocument()) { ExtractPDFStructuralPathsRecursively(pdfDocument.getTrailer().getCOSObject(), pdfObjectName, parentPath, structuralPaths, visitedObjects, 1); } catch (Exception e) { throw e; } break; case NonSequential: File randomAccessFile = new File(filePath + ".ra"); RandomAccess randomAccess = new RandomAccessFile(randomAccessFile, "rwd"); try (PDDocument pdf = PDDocument.loadNonSeq(pdfFile, randomAccess); COSDocument pdfDocument = pdf.getDocument()) { ExtractPDFStructuralPathsRecursively(pdfDocument.getTrailer().getCOSObject(), pdfObjectName, parentPath, structuralPaths, visitedObjects, 1); } catch (Exception e) { throw e; } finally { randomAccessFile.delete(); } break; } } catch (IOException e) { //AddPDFStructuralPath("General Parsing Error", structuralPaths); //Canceled Console.PrintException(String.format("Error parsing PDF file: '%s'", filePath), e); } return structuralPaths; }
From source file:FeatureExtraction.FeatureExtractorPDFStructuralPathsOld.java
@Override public Map ExtractFeaturesFrequencyFromSingleElement(T element) { Map<String, Integer> structuralPaths = new HashMap<>(); HashSet<COSBase> visitedObjects = new HashSet<>(); String filePath = (String) element; File pdfFile = new File(filePath); try {//w ww .j a v a 2 s . co m switch (m_parserType) { case Sequential: try (PDDocument pdf = PDDocument.load(pdfFile)) { COSDocument pdfDocument = pdf.getDocument(); ExtractPDFStructuralPathsRecursively(pdfDocument.getTrailer().getCOSObject(), "Trailer", "", structuralPaths, visitedObjects); } catch (Exception e) { Console.PrintException(String.format("Error parsing PDF file: '%s'", filePath), e); } break; case NonSequential: File randomAccessFile = new File(filePath + ".ra"); RandomAccess randomAccess = new RandomAccessFile(randomAccessFile, "rwd"); try (PDDocument pdf = PDDocument.loadNonSeq(pdfFile, randomAccess)) { COSDocument pdfDocument = pdf.getDocument(); ExtractPDFStructuralPathsRecursively(pdfDocument.getTrailer().getCOSObject(), "Trailer", "", structuralPaths, visitedObjects); } catch (Exception e) { Console.PrintException(String.format("Error parsing PDF file: '%s'", filePath), e); } finally { randomAccessFile.delete(); } break; } } catch (IOException e) { Console.PrintException(String.format("Error parsing PDF file: '%s'", filePath), e); } return structuralPaths; }
From source file:javaapplication1.PDFManager.java
public Map<String, String> ToText() throws IOException { this.pdfStripper = null; this.pdDoc = null; this.cosDoc = null; file = new File(filePath); parser = new PDFParser(new RandomAccessFile(file, "r")); // update for PDFBox V 2.0 parser.parse();//from w w w.j a v a2 s . c om cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); pdDoc.getNumberOfPages(); // if you want to get text from full pdf file use this code // pdfStripper.setEndPage(pdDoc.getNumberOfPages()); // if you want specific number of pages pdfStripper.setStartPage(1); pdfStripper.setEndPage(1); Text = pdfStripper.getText(pdDoc); System.out.println(Text); // spilt String[] result = Text.split("\n"); Map<String, String> map = new HashMap<String, String>(); try { for (int j = 0; j < result.length; j++) { if (result[j].contains("Type")) { String x = result[j].substring(5); map.put("Type", x); } else if (result[j].contains("Document Number")) { String x = result[j].substring(16); map.put("Document Number", x); } else if (result[j].contains("Date of Birth")) { String x = result[j].substring(14); map.put("Date of Birth", x); } else if (result[j].contains("Date of Expiry")) { String x = result[j].substring(15); map.put("Date of Expiry", x); } else if (result[j].contains("Issuer")) { String x = result[j].substring(7); map.put("Issuer", x); } else if (result[j].contains("Nationality")) { String x = result[j].substring(12); map.put("Nationality", x); } else if (result[j].contains("First Names")) { String x = result[j].substring(12); map.put("First Names", x); } else if (result[j].contains("Last Names")) { String x = result[j].substring(11); map.put("Last Names", x); } else if (result[j].contains("Discretionary 1")) { String x = result[j].substring(16); map.put("Discretionary 1", x); } // else if (result[j].contains("Discretionary 2")) // { // // String x = result[j].substring(16); // map.put("Discretionary 2", x); // // } else if (result[j].contains("Gender")) { 
String x = result[j].substring(7); map.put("Gender", x); } } } catch (Exception e) { JOptionPane.showMessageDialog(null, "please selecet OCR PDF", "worng pass", JOptionPane.ERROR_MESSAGE); } return map; }
From source file:mj.ocraptor.extraction.tika.parser.pdf.PDFParser.java
License:Apache License
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { PDDocument pdfDocument = null;//from w ww . j a va 2 s .com TemporaryResources tmp = new TemporaryResources(); // config from context, or default if not set via context PDFParserConfig localConfig = context.get(PDFParserConfig.class, defaultConfig); try { // PDFBox can process entirely in memory, or can use a temp file // for unpacked / processed resources // Decide which to do based on if we're reading from a file or not // already TikaInputStream tstream = TikaInputStream.cast(stream); if (tstream != null && tstream.hasFile()) { // File based, take that as a cue to use a temporary file RandomAccess scratchFile = new RandomAccessFile(tmp.createTemporaryFile(), "rw"); if (localConfig.getUseNonSequentialParser() == true) { pdfDocument = PDDocument.loadNonSeq(new CloseShieldInputStream(stream), scratchFile); } else { pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), scratchFile, true); } } else { // Go for the normal, stream based in-memory parsing if (localConfig.getUseNonSequentialParser() == true) { pdfDocument = PDDocument.loadNonSeq(new CloseShieldInputStream(stream), new RandomAccessBuffer()); } else { pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), true); } } if (pdfDocument.isEncrypted()) { String password = null; // Did they supply a new style Password Provider? 
PasswordProvider passwordProvider = context.get(PasswordProvider.class); if (passwordProvider != null) { password = passwordProvider.getPassword(metadata); } // Fall back on the old style metadata if set if (password == null && metadata.get(PASSWORD) != null) { password = metadata.get(PASSWORD); } // If no password is given, use an empty string as the default if (password == null) { password = ""; } try { pdfDocument.decrypt(password); } catch (Exception e) { // Ignore } } metadata.set(Metadata.CONTENT_TYPE, "application/pdf"); extractMetadata(pdfDocument, metadata); PDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig); } catch (Exception e) { // TODO: logging e.printStackTrace(); } finally { if (pdfDocument != null) { pdfDocument.close(); } if (tmp != null) { tmp.dispose(); tmp.close(); } } handler.endDocument(); }
From source file:net.dstserbak.dataindexer.tokenizer.PDFTokenizer.java
/**
 * Loads the PDF document at the given URL and splits its text into words.
 *
 * <p>A scratch file at {@code PDF_SCRATCH_FILE} is opened in read/write mode for the
 * duration of the call; both the document and the scratch file are closed before
 * returning.
 *
 * @param url URL of the PDF document to tokenize
 * @return the tokens that belong to the PDF document
 * @throws IOException if an I/O error occurs
 */
public static TokensMap tokenizePdf(URL url) throws IOException {
    checkTempFileExistance();
    // Resources are closed in reverse order: the document first, then the scratch file.
    try (RandomAccessFile scratch = new RandomAccessFile(PDF_SCRATCH_FILE, "rw");
            PDDocument document = PDDocument.load(url, scratch)) {
        return tokenizeInput(document);
    }
}
From source file:opennlp.PDFTools.java
public String getStringFromPDF(String filePath) { String text = null;// w ww .j av a 2s . co m try { File file = new File(filePath); parser = new PDFParser(new RandomAccessFile(file, "r")); parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); pdDoc.getNumberOfPages(); pdfStripper.setStartPage(1); //pdfStripper.setEndPage(10); pdfStripper.setEndPage(pdDoc.getNumberOfPages()); text = pdfStripper.getText(pdDoc); } catch (IOException e) { logger.error("IO ERROR", e); } catch (Exception ex) { logger.error("ERROR", ex); } return text; }