Example usage of the org.apache.pdfbox.io.RandomAccessFile constructor RandomAccessFile(File, String)

List of usage examples for the org.apache.pdfbox.io.RandomAccessFile constructor

Introduction

This page collects usage examples of the RandomAccessFile(File, String) constructor of org.apache.pdfbox.io.RandomAccessFile.

Prototype

public RandomAccessFile(File file, String mode) throws FileNotFoundException 

Source Link

Document

Constructor.

Usage

From source file:data.PDFManager.java

/**
 * Extracts the text of every page of the PDF located at {@code filePath}.
 *
 * <p>The parser, document and stripper used for the extraction are stored in the
 * instance fields, and the extracted text is stored in {@code Text}. The document and
 * the file handle behind it are closed before this method returns, so the document
 * fields must not be used to read the PDF afterwards.
 *
 * @return the text content of the PDF
 * @throws IOException if opening, parsing or reading the PDF fails
 */
public String ToText() throws IOException {
    this.pdfStripper = null;
    this.pdDoc = null;
    this.cosDoc = null;

    file = new File(filePath);
    // try-with-resources releases the file handle on success and on failure.
    try (RandomAccessFile source = new RandomAccessFile(file, "r")) {
        parser = new PDFParser(source);
        parser.parse();
        cosDoc = parser.getDocument();
        try {
            pdfStripper = new PDFTextStripper();
            pdDoc = new PDDocument(cosDoc);
            pdfStripper.setStartPage(1);
            pdfStripper.setEndPage(pdDoc.getNumberOfPages());

            Text = pdfStripper.getText(pdDoc);
        } finally {
            // Close through the PDDocument when it exists; otherwise close the
            // parsed COSDocument directly so it is not left open.
            if (pdDoc != null) {
                pdDoc.close();
            } else {
                cosDoc.close();
            }
        }
    }
    return Text;
}

From source file:edu.esprit.filereader.PdfReader.java

/**
 * Extracts the text of pages 1 to 10 of the PDF located at {@code filePath}.
 *
 * <p>The parser, document and stripper used for the extraction are stored in the
 * instance fields, and the extracted text is stored in {@code Text}. The document and
 * the file handle behind it are closed before this method returns, so the document
 * fields must not be used to read the PDF afterwards.
 *
 * @return the extracted text
 * @throws IOException if opening, parsing or reading the PDF fails
 */
public String ToText() throws IOException {
    this.pdfStripper = null;
    this.pdDoc = null;
    this.cosDoc = null;

    file = new File(filePath);
    // try-with-resources releases the file handle on success and on failure.
    try (RandomAccessFile source = new RandomAccessFile(file, "r")) {
        parser = new PDFParser(source);
        parser.parse();
        cosDoc = parser.getDocument();
        try {
            pdfStripper = new PDFTextStripper();
            pdDoc = new PDDocument(cosDoc);

            // Reads text from page 1 to 10. To read the whole file instead, use
            // pdfStripper.setEndPage(pdDoc.getNumberOfPages()).
            pdfStripper.setStartPage(1);
            pdfStripper.setEndPage(10);

            Text = pdfStripper.getText(pdDoc);
        } finally {
            // Close through the PDDocument when it exists; otherwise close the
            // parsed COSDocument directly so it is not left open.
            if (pdDoc != null) {
                pdDoc.close();
            } else {
                cosDoc.close();
            }
        }
    }
    return Text;
}

From source file:FeatureExtraction.FeatureExtractorPDFStructuralPaths.java

/**
 * Collects the structural paths of the PDF file whose path is given by {@code element}.
 *
 * <p>The PDF is loaded with the sequential or the non-sequential PDFBox loader, depending
 * on {@code m_parserType}, and its trailer object is handed to
 * {@code ExtractPDFStructuralPathsRecursively}, which fills the returned map. An
 * {@link IOException} is reported through {@code Console.PrintException} and the map
 * collected so far is returned.
 *
 * @param element the path of the PDF file; cast to {@link String}
 * @return the map filled by {@code ExtractPDFStructuralPathsRecursively}
 *         (presumably structural path to occurrence count - confirm against that helper)
 */
@Override
public Map ExtractFeaturesFrequencyFromSingleElement(T element) {
    Map<String, Integer> structuralPaths = new HashMap<>();
    HashSet<COSBase> visitedObjects = new HashSet<>();
    String filePath = (String) element;
    String parentPath = "";
    String pdfObjectName = "Trailer";
    File pdfFile = new File(filePath);
    try {
        switch (m_parserType) {
        case Sequential:
            try (PDDocument pdf = PDDocument.load(pdfFile); COSDocument pdfDocument = pdf.getDocument()) {
                ExtractPDFStructuralPathsRecursively(pdfDocument.getTrailer().getCOSObject(), pdfObjectName,
                        parentPath, structuralPaths, visitedObjects, 1);
            }
            break;
        case NonSequential:
            // The non-sequential loader works on a scratch file created next to the PDF.
            File randomAccessFile = new File(filePath + ".ra");
            RandomAccess randomAccess = new RandomAccessFile(randomAccessFile, "rwd");
            try (PDDocument pdf = PDDocument.loadNonSeq(pdfFile, randomAccess);
                    COSDocument pdfDocument = pdf.getDocument()) {
                ExtractPDFStructuralPathsRecursively(pdfDocument.getTrailer().getCOSObject(), pdfObjectName,
                        parentPath, structuralPaths, visitedObjects, 1);
            } finally {
                // Close the scratch handle explicitly so it is released even when loading
                // fails, and only then remove the scratch file.
                try {
                    randomAccess.close();
                } finally {
                    randomAccessFile.delete();
                }
            }
            break;
        }
    } catch (IOException e) {
        Console.PrintException(String.format("Error parsing PDF file: '%s'", filePath), e);
    }
    return structuralPaths;
}

From source file:FeatureExtraction.FeatureExtractorPDFStructuralPathsInputStream.java

/**
 * Collects the structural paths of the PDF read from the input stream given by
 * {@code element}.
 *
 * <p>The PDF is loaded with the sequential or the non-sequential PDFBox loader, depending
 * on {@code m_parserType}, and its trailer object is handed to
 * {@code ExtractPDFStructuralPathsRecursively}, which fills the returned map. Failures are
 * reported through {@code Console.PrintException} and the map collected so far is returned.
 *
 * @param element the PDF content; cast to {@link InputStream}
 * @return the map filled by {@code ExtractPDFStructuralPathsRecursively}
 *         (presumably structural path to occurrence count - confirm against that helper)
 */
@Override
public Map ExtractFeaturesFrequencyFromSingleElement(T element) {
    Map<String, Integer> structuralPaths = new HashMap<>();
    HashSet<COSBase> visitedObjects = new HashSet<>();
    InputStream fileInputStream = (InputStream) element;
    // No file path is known for a stream, so error messages show an empty path and the
    // scratch file below is always named ".ra".
    // NOTE(review): concurrent calls would share that scratch file - confirm this is
    // only used single-threaded.
    String filePath = "";
    try {
        switch (m_parserType) {
        case Sequential:
            try (PDDocument pdf = PDDocument.load(fileInputStream)) {
                COSDocument pdfDocument = pdf.getDocument();
                ExtractPDFStructuralPathsRecursively(pdfDocument.getTrailer().getCOSObject(), "Trailer", "",
                        structuralPaths, visitedObjects);
            } catch (Exception e) {
                Console.PrintException(String.format("Error parsing PDF file: '%s'", filePath), e);
            }
            break;
        case NonSequential:
            File randomAccessFile = new File(filePath + ".ra");
            RandomAccess randomAccess = new RandomAccessFile(randomAccessFile, "rwd");
            try (PDDocument pdf = PDDocument.loadNonSeq(fileInputStream, randomAccess)) {
                COSDocument pdfDocument = pdf.getDocument();
                ExtractPDFStructuralPathsRecursively(pdfDocument.getTrailer().getCOSObject(), "Trailer", "",
                        structuralPaths, visitedObjects);
            } catch (Exception e) {
                Console.PrintException(String.format("Error parsing PDF file: '%s'", filePath), e);
            } finally {
                // Close the scratch handle explicitly so it is released even when loading
                // fails, and only then remove the scratch file.
                try {
                    randomAccess.close();
                } finally {
                    randomAccessFile.delete();
                }
            }
            break;
        }
    } catch (IOException e) {
        Console.PrintException(String.format("Error parsing PDF file: '%s'", filePath), e);
    }
    return structuralPaths;
}

From source file:FeatureExtraction.FeatureExtractorPDFStructuralPathsNew.java

/**
 * Collects the structural paths of the PDF file whose path is given by {@code element}.
 *
 * <p>The PDF is loaded with the sequential or the non-sequential PDFBox loader, depending
 * on {@code m_parserType}, and its trailer object is handed to
 * {@code ExtractPDFStructuralPathsRecursively}, which fills the returned map. An
 * {@link IOException} is reported through {@code Console.PrintException} and the map
 * collected so far is returned.
 *
 * @param element the path of the PDF file; cast to {@link String}
 * @return the map filled by {@code ExtractPDFStructuralPathsRecursively}
 *         (presumably structural path to occurrence count - confirm against that helper)
 */
@Override
public Map ExtractFeaturesFrequencyFromSingleElement(T element) {
    Map<String, Integer> structuralPaths = new HashMap<>();
    HashSet<COSBase> visitedObjects = new HashSet<>();
    String filePath = (String) element;
    String parentPath = "";
    String pdfObjectName = "Trailer";
    File pdfFile = new File(filePath);
    try {
        switch (m_parserType) {
        case Sequential:
            try (PDDocument pdf = PDDocument.load(pdfFile); COSDocument pdfDocument = pdf.getDocument()) {
                ExtractPDFStructuralPathsRecursively(pdfDocument.getTrailer().getCOSObject(), pdfObjectName,
                        parentPath, structuralPaths, visitedObjects, 1);
            }
            break;
        case NonSequential:
            // The non-sequential loader works on a scratch file created next to the PDF.
            File randomAccessFile = new File(filePath + ".ra");
            RandomAccess randomAccess = new RandomAccessFile(randomAccessFile, "rwd");
            try (PDDocument pdf = PDDocument.loadNonSeq(pdfFile, randomAccess);
                    COSDocument pdfDocument = pdf.getDocument()) {
                ExtractPDFStructuralPathsRecursively(pdfDocument.getTrailer().getCOSObject(), pdfObjectName,
                        parentPath, structuralPaths, visitedObjects, 1);
            } finally {
                // Close the scratch handle explicitly so it is released even when loading
                // fails, and only then remove the scratch file.
                try {
                    randomAccess.close();
                } finally {
                    randomAccessFile.delete();
                }
            }
            break;
        }
    } catch (IOException e) {
        Console.PrintException(String.format("Error parsing PDF file: '%s'", filePath), e);
    }
    return structuralPaths;
}

From source file:FeatureExtraction.FeatureExtractorPDFStructuralPathsOld.java

/**
 * Collects the structural paths of the PDF file whose path is given by {@code element}.
 *
 * <p>The PDF is loaded with the sequential or the non-sequential PDFBox loader, depending
 * on {@code m_parserType}, and its trailer object is handed to
 * {@code ExtractPDFStructuralPathsRecursively}, which fills the returned map. Failures are
 * reported through {@code Console.PrintException} and the map collected so far is returned.
 *
 * @param element the path of the PDF file; cast to {@link String}
 * @return the map filled by {@code ExtractPDFStructuralPathsRecursively}
 *         (presumably structural path to occurrence count - confirm against that helper)
 */
@Override
public Map ExtractFeaturesFrequencyFromSingleElement(T element) {
    Map<String, Integer> structuralPaths = new HashMap<>();
    HashSet<COSBase> visitedObjects = new HashSet<>();
    String filePath = (String) element;
    File pdfFile = new File(filePath);
    try {
        switch (m_parserType) {
        case Sequential:
            try (PDDocument pdf = PDDocument.load(pdfFile)) {
                COSDocument pdfDocument = pdf.getDocument();
                ExtractPDFStructuralPathsRecursively(pdfDocument.getTrailer().getCOSObject(), "Trailer", "",
                        structuralPaths, visitedObjects);
            } catch (Exception e) {
                Console.PrintException(String.format("Error parsing PDF file: '%s'", filePath), e);
            }
            break;
        case NonSequential:
            // The non-sequential loader works on a scratch file created next to the PDF.
            File randomAccessFile = new File(filePath + ".ra");
            RandomAccess randomAccess = new RandomAccessFile(randomAccessFile, "rwd");
            try (PDDocument pdf = PDDocument.loadNonSeq(pdfFile, randomAccess)) {
                COSDocument pdfDocument = pdf.getDocument();
                ExtractPDFStructuralPathsRecursively(pdfDocument.getTrailer().getCOSObject(), "Trailer", "",
                        structuralPaths, visitedObjects);
            } catch (Exception e) {
                Console.PrintException(String.format("Error parsing PDF file: '%s'", filePath), e);
            } finally {
                // Close the scratch handle explicitly so it is released even when loading
                // fails, and only then remove the scratch file.
                try {
                    randomAccess.close();
                } finally {
                    randomAccessFile.delete();
                }
            }
            break;
        }
    } catch (IOException e) {
        Console.PrintException(String.format("Error parsing PDF file: '%s'", filePath), e);
    }
    return structuralPaths;
}

From source file:javaapplication1.PDFManager.java

/**
 * Extracts the text of the first page of the PDF located at {@code filePath}, prints it
 * to standard output and picks labelled field values out of it.
 *
 * <p>Each line of the page text is checked against a fixed, ordered list of labels. For
 * the first label the line contains, the part of the line after its first
 * {@code label.length() + 1} characters is stored under that label; a later line with
 * the same label overwrites the earlier value. If a matching line is too short to hold
 * a value, scanning stops, an error dialog is shown and the entries collected so far
 * are returned.
 *
 * <p>The document and the file handle behind it are closed before this method returns,
 * so the document fields must not be used to read the PDF afterwards.
 *
 * @return map from label to the value found on the first page; possibly empty or partial
 * @throws IOException if opening, parsing or reading the PDF fails
 */
public Map<String, String> ToText() throws IOException {
    this.pdfStripper = null;
    this.pdDoc = null;
    this.cosDoc = null;

    file = new File(filePath);
    // try-with-resources releases the file handle on success and on failure.
    try (RandomAccessFile source = new RandomAccessFile(file, "r")) {
        parser = new PDFParser(source);
        parser.parse();
        cosDoc = parser.getDocument();
        try {
            pdfStripper = new PDFTextStripper();
            pdDoc = new PDDocument(cosDoc);

            // Only the first page is read. To read the whole file instead, use
            // pdfStripper.setEndPage(pdDoc.getNumberOfPages()).
            pdfStripper.setStartPage(1);
            pdfStripper.setEndPage(1);
            Text = pdfStripper.getText(pdDoc);
        } finally {
            // Close through the PDDocument when it exists; otherwise close the
            // parsed COSDocument directly so it is not left open.
            if (pdDoc != null) {
                pdDoc.close();
            } else {
                cosDoc.close();
            }
        }
    }
    System.out.println(Text);

    // Labels in matching order; the first label contained in a line wins.
    // "Discretionary 2" is not extracted.
    String[] labels = { "Type", "Document Number", "Date of Birth", "Date of Expiry", "Issuer",
            "Nationality", "First Names", "Last Names", "Discretionary 1", "Gender" };

    String[] result = Text.split("\n");
    Map<String, String> map = new HashMap<String, String>();

    try {
        for (String line : result) {
            for (String label : labels) {
                if (line.contains(label)) {
                    // Skip the label plus the single separator character that follows it.
                    String value = line.substring(label.length() + 1);
                    // Text with "\r\n" line ends leaves a carriage return at the end of
                    // each line after splitting on "\n"; keep it out of the value.
                    if (value.endsWith("\r")) {
                        value = value.substring(0, value.length() - 1);
                    }
                    map.put(label, value);
                    break;
                }
            }
        }
    } catch (IndexOutOfBoundsException e) {
        // A matching line was shorter than label + separator.
        JOptionPane.showMessageDialog(null, "please selecet OCR PDF", "worng pass", JOptionPane.ERROR_MESSAGE);
    }
    return map;
}

From source file:mj.ocraptor.extraction.tika.parser.pdf.PDFParser.java

License:Apache License

/**
 * Loads a PDF from {@code stream}, decrypts it when it is encrypted, then extracts its
 * metadata and hands its content to {@code PDF2XHTML}.
 *
 * <p>Any exception raised while loading or processing is caught and its stack trace is
 * printed; it is not propagated to the caller. The document and the temporary resources
 * are released afterwards, and {@code handler.endDocument()} is called last.
 *
 * @param stream   the PDF content
 * @param handler  receives the extracted content
 * @param metadata receives the content type and the extracted metadata; also consulted
 *                 for a password
 * @param context  source of the parser configuration and of an optional password provider
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {

    PDDocument pdfDocument = null;
    TemporaryResources tmp = new TemporaryResources();
    // Configuration comes from the context, falling back to the default one.
    PDFParserConfig localConfig = context.get(PDFParserConfig.class, defaultConfig);

    try {
        // PDFBox can work fully in memory or use a temporary file for unpacked /
        // processed resources; a file-backed input is the cue to use a temporary file.
        TikaInputStream tstream = TikaInputStream.cast(stream);
        boolean fileBacked = tstream != null && tstream.hasFile();
        if (fileBacked) {
            RandomAccess scratchFile = new RandomAccessFile(tmp.createTemporaryFile(), "rw");
            if (localConfig.getUseNonSequentialParser()) {
                pdfDocument = PDDocument.loadNonSeq(new CloseShieldInputStream(stream), scratchFile);
            } else {
                pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), scratchFile, true);
            }
        } else if (localConfig.getUseNonSequentialParser()) {
            // Plain stream: parse in memory.
            pdfDocument = PDDocument.loadNonSeq(new CloseShieldInputStream(stream),
                    new RandomAccessBuffer());
        } else {
            pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), true);
        }

        if (pdfDocument.isEncrypted()) {
            // Password lookup order: password provider from the context, then the
            // old-style metadata entry, then the empty string.
            PasswordProvider passwordProvider = context.get(PasswordProvider.class);
            String password = (passwordProvider == null) ? null : passwordProvider.getPassword(metadata);
            if (password == null) {
                password = (metadata.get(PASSWORD) != null) ? metadata.get(PASSWORD) : "";
            }

            try {
                pdfDocument.decrypt(password);
            } catch (Exception ignored) {
                // Ignore
            }
        }

        metadata.set(Metadata.CONTENT_TYPE, "application/pdf");
        extractMetadata(pdfDocument, metadata);
        PDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig);

    } catch (Exception e) {
        // TODO: logging
        e.printStackTrace();
    } finally {
        if (pdfDocument != null) {
            pdfDocument.close();
        }
        // tmp is assigned once above and is never null here.
        tmp.dispose();
        tmp.close();
    }
    handler.endDocument();
}

From source file:net.dstserbak.dataindexer.tokenizer.PDFTokenizer.java

/**
 * Loads the PDF found at the given URL and returns its tokens.
 *
 * <p>The document is loaded through the scratch file {@code PDF_SCRATCH_FILE}; both the
 * document and the scratch file handle are closed before this method returns.
 *
 * @param url URL of the PDF to tokenize
 * @return the tokens produced by {@code tokenizeInput} for the loaded document
 * @throws IOException if an I/O error occurs
 */
public static TokensMap tokenizePdf(URL url) throws IOException {
    checkTempFileExistance();
    // Resources are closed in reverse order: the document first, then the scratch file.
    try (RandomAccessFile scratch = new RandomAccessFile(PDF_SCRATCH_FILE, "rw");
            PDDocument document = PDDocument.load(url, scratch)) {
        return tokenizeInput(document);
    }
}

From source file:opennlp.PDFTools.java

/**
 * Extracts the text of every page of the PDF at {@code filePath}.
 *
 * <p>Failures are logged and not propagated. The parser, document and stripper used for
 * the extraction are stored in the instance fields, but the document and the file handle
 * behind it are closed before this method returns, so the document fields must not be
 * used to read the PDF afterwards.
 *
 * @param filePath path of the PDF file to read
 * @return the extracted text, or {@code null} if the text could not be extracted
 */
public String getStringFromPDF(String filePath) {

    String text = null;

    // try-with-resources releases the file handle on success and on failure.
    try (RandomAccessFile source = new RandomAccessFile(new File(filePath), "r")) {
        parser = new PDFParser(source);
        parser.parse();
        cosDoc = parser.getDocument();

        PDDocument document = null;
        try {
            pdfStripper = new PDFTextStripper();
            document = new PDDocument(cosDoc);
            pdDoc = document;
            pdfStripper.setStartPage(1);
            pdfStripper.setEndPage(document.getNumberOfPages());

            text = pdfStripper.getText(document);
        } finally {
            // Close through the PDDocument when it exists; otherwise close the
            // parsed COSDocument directly so it is not left open.
            if (document != null) {
                document.close();
            } else {
                cosDoc.close();
            }
        }

    } catch (IOException e) {
        logger.error("IO ERROR", e);
    } catch (Exception ex) {
        logger.error("ERROR", ex);
    }

    return text;
}