Example usage for org.apache.pdfbox.pdmodel PDDocument PDDocument

Introduction

On this page you can find example usages of the org.apache.pdfbox.pdmodel.PDDocument constructor PDDocument(COSDocument).

Prototype

public PDDocument(COSDocument doc)

Source Link

Document

Constructor that uses an existing document.

Usage

From source file:eu.transkribus.languageresources.extractor.pdf.PDFExtractor.java

/**
 * Extracts the text of a PDF file page by page.
 *
 * <p>Failures are logged at {@code SEVERE} level instead of being propagated; in
 * that case the returned list holds only the pages extracted before the failure
 * (possibly none).
 *
 * @param pathToFile file system path of the PDF to read
 * @return one entry per page, in page order, as produced by
 *         {@code extractTextFromPage(PDDocument, int)}; never {@code null}
 */
@Override
public List<String> extractTextFromDocumentPagewise(String pathToFile) {
    List<String> pageWiseText = new LinkedList<>();

    // try-with-resources releases the file handle even when constructing or
    // running the parser fails.
    try (FileInputStream input = new FileInputStream(new File(pathToFile))) {
        COSDocument cosDoc = null;
        PDDocument pdDoc = null;
        try {
            PDFParser parser = new PDFParser(input);
            parser.parse();
            cosDoc = parser.getDocument();
            pdDoc = new PDDocument(cosDoc);

            for (int pageId = 0; pageId < pdDoc.getNumberOfPages(); pageId++) {
                pageWiseText.add(extractTextFromPage(pdDoc, pageId));
            }
        } finally {
            // Closing the PDDocument also closes the COSDocument it wraps, so the
            // COSDocument is closed directly only when no PDDocument was created.
            try {
                if (pdDoc != null) {
                    pdDoc.close();
                } else if (cosDoc != null) {
                    cosDoc.close();
                }
            } catch (IOException ex) {
                Logger.getLogger(PDFExtractor.class.getName()).log(Level.SEVERE, null, ex);
            }
        }
    } catch (IOException ex) {
        Logger.getLogger(PDFExtractor.class.getName()).log(Level.SEVERE, null, ex);
    }

    return pageWiseText;
}

From source file:eu.transkribus.languageresources.extractor.pdf.PDFExtractor.java

/**
 * Extracts the text of a single page of a PDF file.
 *
 * <p>Failures are logged at {@code SEVERE} level instead of being propagated; in
 * that case an empty string is returned.
 *
 * @param pathToFile file system path of the PDF to read
 * @param page       page identifier, passed unchanged to
 *                   {@code extractTextFromPage(PDDocument, int)}
 * @return the extracted text, or an empty string when reading or parsing failed
 */
@Override
public String extractTextFromPage(String pathToFile, int page) {
    StringBuilder sb = new StringBuilder();

    // try-with-resources releases the file handle even when constructing or
    // running the parser fails.
    try (FileInputStream input = new FileInputStream(new File(pathToFile))) {
        COSDocument cosDoc = null;
        PDDocument pdDoc = null;
        try {
            PDFParser parser = new PDFParser(input);
            parser.parse();
            cosDoc = parser.getDocument();
            pdDoc = new PDDocument(cosDoc);
            sb.append(extractTextFromPage(pdDoc, page));
        } finally {
            // Closing the PDDocument also closes the COSDocument it wraps, so the
            // COSDocument is closed directly only when no PDDocument was created.
            try {
                if (pdDoc != null) {
                    pdDoc.close();
                } else if (cosDoc != null) {
                    cosDoc.close();
                }
            } catch (IOException ex) {
                Logger.getLogger(PDFExtractor.class.getName()).log(Level.SEVERE, null, ex);
            }
        }
    } catch (IOException ex) {
        Logger.getLogger(PDFExtractor.class.getName()).log(Level.SEVERE, null, ex);
    }

    return sb.toString();
}

From source file:FileParser.Parser.java

/**
 * Parses the file selected through {@code fileChooser} and returns the extracted content.
 *
 * <p>The parsing branch is picked from the type string at index 1 of
 * {@code fileChooser.getFileType()}:
 * <ul>
 *   <li>{@code txt} - one list entry per line read from the file;</li>
 *   <li>{@code html} - a single entry holding the text of the parsed document;</li>
 *   <li>{@code pdf} - a single entry holding the text of pages 1 to 2;</li>
 *   <li>{@code doc} - a single entry holding the text returned by {@code WordExtractor};</li>
 *   <li>{@code xml} - one entry per {@code Cube} element, built from its
 *       {@code currency} and {@code rate} attributes separated by a space.</li>
 * </ul>
 * Any other type shows an error dialog and terminates the JVM.
 *
 * @return the {@code getParsedData} list, which is re-created on every call
 * @throws IOException  not caught in the txt branch (only {@code FileNotFoundException} is)
 *                      nor in the xml branch
 * @throws SAXException not caught in the xml branch
 */
public ArrayList<String> fileParser() throws IOException, SAXException {
    // NOTE(review): `file` is dereferenced here before it is re-assigned from the
    // chooser a few lines below, and `path` is never used afterwards - confirm
    // that `file` is always set before this method is called.
    String path = file.getPath();
    String[] getArray = fileChooser.getFileType();
    String type = getArray[1]; // selects the parsing branch below
    //System.out.println("Type: "+type);
    String fileName = getArray[0]; // not used in this method
    String fileContent = "";
    file = fileChooser.getFile();
    getParsedData = new ArrayList<>();

    switch (type) {
    case "txt":

        // NOTE(review): bReader is never closed in this method - confirm whether
        // it is closed elsewhere.
        try {
            FileReader contentReader = new FileReader(file.getPath());
            bReader = new BufferedReader(contentReader);
            while ((fileContent = bReader.readLine()) != null) {
                getParsedData.add(fileContent);
            }
        }

        catch (FileNotFoundException ex) {
            Logger.getLogger(Parser.class.getName()).log(Level.SEVERE, null, ex);
        }

        break;

    case "html":
        // Sample URL for the remote branch:
        // https://scholar.google.com.tr/scholar?hl=tr&q=ecir+u%C4%9Fur+k%C3%BC%C3%A7%C3%BCksille&btnG=&lr=
        // `url` is always empty at this point, so only the local-file branch below can run.
        String url = "";
        try {
            if (url.isEmpty()) {

                Document doc = Jsoup.parse(file, null);
                fileContent = doc.text();
                getParsedData.add(fileContent);

            } else {
                Document doc = Jsoup.connect(url).get();
                Elements elements = doc.select("div.gs_r");
                for (Element div : elements) {
                    fileContent += div.text();
                }

                getParsedData.add(fileContent);

            }

        } catch (Exception e) {
            e.printStackTrace();
        }

        break;

    case "pdf":
        // NOTE(review): inputStream, cosDoc and pdDoc are not closed in this
        // method - confirm whether they are released elsewhere.
        try

        {
            inputStream = new FileInputStream(file);
            parser = new PDFParser(inputStream);
            parser.parse();
            cosDoc = parser.getDocument();
            pdfStripper = new PDFTextStripper();
            pdDoc = new PDDocument(cosDoc);
            // Only the first two pages are extracted.
            pdfStripper.setStartPage(1);
            pdfStripper.setEndPage(2);
            fileContent = pdfStripper.getText(pdDoc);
            getParsedData.add(fileContent);

        }

        catch (Exception e) {
            e.printStackTrace();
        }

        break;

    case "doc":
        // NOTE(review): `fis` is not closed in this method.
        try {

            FileInputStream fis = new FileInputStream(file.getAbsolutePath());
            HWPFDocument document = new HWPFDocument(fis);
            WordExtractor extractor = new WordExtractor(document);
            fileContent = extractor.getText();
            getParsedData.add(fileContent);

        } catch (Exception e) {
            e.printStackTrace();
        }

        break;

    case "xml":
        /*
         * Example XML file path used by the original author:
         * /home/burakcan/Desktop/eurofxref.xml
         */

        // NOTE(review): `Document` and `Element` are also used for the HTML
        // branch above with a different API (text(), select()) - confirm which
        // imports make both usages resolve.
        try {
            DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
            DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
            Document doc = dBuilder.parse(file);
            doc.getDocumentElement().normalize();
            // NOTE(review): firstCube and secondCube are never read; the second
            // lookup dereferences firstCube, which presumably fails when the
            // document has no Cube element - verify.
            Element firstCube = (Element) doc.getElementsByTagName("Cube").item(0);
            Element secondCube = (Element) firstCube.getElementsByTagName("Cube").item(0);

            // Every Cube element contributes an entry, whether or not it carries
            // the currency/rate attributes.
            NodeList nList = doc.getElementsByTagName("Cube");
            for (int i = 0; i < nList.getLength(); i++) {
                Node nNode = nList.item(i);
                Element eElement = (Element) nNode;
                getParsedData.add(eElement.getAttribute("currency") + " " + eElement.getAttribute("rate"));

            }

        }

        catch (ParserConfigurationException ex) {
            Logger.getLogger(Parser.class.getName()).log(Level.SEVERE, null, ex);
        }

        break;

    default:

        // Unsupported type: inform the user, then terminate the JVM.
        JOptionPane.showMessageDialog(null, "This program can not parse your choice!", "Program Error",
                JOptionPane.ERROR_MESSAGE);
        System.exit(0);
    }

    return getParsedData;

}

From source file:hrpod.tools.PDFTools.java

/**
 * Extracts the text of all pages of a PDF.
 *
 * <p>Errors are logged instead of being propagated. The document obtained from
 * {@code getParser(inputStream)} is closed before returning, whether or not
 * extraction succeeded.
 *
 * @param inputStream source handed to {@code getParser}
 * @return the extracted text, or {@code null} when parsing or extraction failed
 */
public String getStringFromPDF(InputStream inputStream) {

    String text = null;
    COSDocument cosDoc = null;
    PDDocument pdDoc = null;

    try {
        cosDoc = getParser(inputStream).getDocument();
        PDFTextStripper pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);
        pdfStripper.setStartPage(1);
        pdfStripper.setEndPage(pdDoc.getNumberOfPages());

        text = pdfStripper.getText(pdDoc);

    } catch (Exception ex) {
        logger.error("ERROR", ex);
    } finally {
        // Closing the PDDocument also closes the COSDocument it wraps, so the
        // COSDocument is closed directly only when no PDDocument was created.
        try {
            if (pdDoc != null) {
                pdDoc.close();
            } else if (cosDoc != null) {
                cosDoc.close();
            }
        } catch (Exception ex) {
            logger.error("ERROR", ex);
        }
    }

    return text;
}

From source file:hrpod.tools.PDFTools.java

/**
 * Extracts the text of a PDF as one string per page.
 *
 * <p>Errors are logged instead of being propagated. If extraction fails part-way
 * through, the entries of the pages not yet processed remain {@code null}. The
 * document obtained from {@code getParser(inputStream)} is closed before
 * returning.
 *
 * @param inputStream source handed to {@code getParser}
 * @return an array whose element {@code i} holds the text of page {@code i + 1},
 *         or {@code null} when the document could not be opened
 */
public String[] getPagesFromPDF(InputStream inputStream) {
    String[] pages = null;
    COSDocument cosDoc = null;
    PDDocument pdDoc = null;
    try {
        cosDoc = getParser(inputStream).getDocument();
        PDFTextStripper pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);
        int pagesCount = pdDoc.getNumberOfPages();
        pages = new String[pagesCount];

        // Stripper page numbers are 1-based and the end page is inclusive, so
        // start == end selects exactly one page and the loop must include the
        // last page.
        for (int p = 1; p <= pagesCount; p++) {
            pdfStripper.setStartPage(p);
            pdfStripper.setEndPage(p);
            pages[p - 1] = pdfStripper.getText(pdDoc);
        }

    } catch (IOException e) {
        logger.error("IO ERROR", e);
    } catch (Exception ex) {
        logger.error("ERROR", ex);
    } finally {
        // Closing the PDDocument also closes the COSDocument it wraps, so the
        // COSDocument is closed directly only when no PDDocument was created.
        try {
            if (pdDoc != null) {
                pdDoc.close();
            } else if (cosDoc != null) {
                cosDoc.close();
            }
        } catch (IOException e) {
            logger.error("IO ERROR", e);
        }
    }
    return pages;
}

From source file:indexer.Indexer.java

/**
 * Extracts the text of a single page of a PDF file.
 *
 * <p>I/O errors are printed to standard error instead of being propagated. The
 * file stream and the document are released on every path, not only on success.
 *
 * @param fileLoc    file system path of the PDF to read
 * @param pageNumber page to extract, used as both start and end page of the stripper
 * @return the extracted text, or {@code null} when reading or parsing failed
 */
public static String getPDF(String fileLoc, int pageNumber) {
    FileInputStream input = null;
    COSDocument cosDoc = null;
    PDDocument pdf = null;
    String parsedText = null;
    try {
        input = new FileInputStream(new File(fileLoc));
        PDFParser parser = new PDFParser(input);
        parser.parse();
        cosDoc = parser.getDocument();
        pdf = new PDDocument(cosDoc);
        PDFTextStripper stripper = new PDFTextStripper();
        stripper.setStartPage(pageNumber);
        stripper.setEndPage(pageNumber);
        parsedText = stripper.getText(pdf);
    } catch (IOException ex) {
        ex.printStackTrace();
    } finally {
        // Closing the PDDocument also closes the COSDocument it wraps, so the
        // COSDocument is closed directly only when no PDDocument was created.
        try {
            if (pdf != null) {
                pdf.close();
            } else if (cosDoc != null) {
                cosDoc.close();
            }
        } catch (IOException ex) {
            ex.printStackTrace();
        }
        try {
            if (input != null) {
                input.close();
            }
        } catch (IOException ex) {
            ex.printStackTrace();
        }
    }
    return parsedText;
}

From source file:insight.masters.policyanalytics.services.BranchingOriginStanfordKeywords.java

public static String readfrompdf(String datsetspath, String Document) {
    /**/*  w  w  w .j  ava  2s  .c o  m*/
     * 1 POlicy text Policy Aspects
     */

    PDFParser parser = null;
    PDDocument pdDoc = null;
    COSDocument cosDoc = null;
    PDFTextStripper pdfStripper;

    String parsedText;
    String policytext = null;
    File file = new File(datsetspath + Document);
    try {
        parser = new PDFParser(new FileInputStream(file));
        parser.parse();
        cosDoc = parser.getDocument();
        pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);
        parsedText = pdfStripper.getText(pdDoc);
        parsedText.replaceAll("[^A-Za-z0-9. ]+", "");
        policytext = parsedText;
        // System.out.println(policytext);
    } catch (Exception e) {
        e.printStackTrace();
        try {
            if (cosDoc != null)
                cosDoc.close();
            if (pdDoc != null)
                pdDoc.close();
        } catch (Exception e1) {
            e.printStackTrace();
        }

    }
    return policytext;

}

From source file:it.polito.tellmefirst.parsing.PDFparser.java

License:Open Source License

/**
 * Extracts the text of a PDF file and joins words that were hyphenated across
 * line breaks by removing every {@code "-\n"} sequence.
 *
 * <p>The file stream and the document are released on every path. A failure
 * while closing them is logged and does not discard text that was already
 * extracted.
 *
 * @param file the PDF to read
 * @return the extracted text with {@code "-\n"} sequences removed
 * @throws TMFVisibleException if {@code file} is not a regular file, or if
 *         reading or parsing it fails
 */
public String pdfToText(File file) throws TMFVisibleException {
    LOG.debug("[pdfToText] - BEGIN");
    if (!file.isFile()) {
        throw new TMFVisibleException("File in input is actually not a file.");
    }
    String result;
    FileInputStream input = null;
    COSDocument cosDoc = null;
    PDDocument pdDoc = null;
    try {
        input = new FileInputStream(file);
        PDFParser parser = new PDFParser(input);
        parser.parse();
        cosDoc = parser.getDocument();
        PDFTextStripper pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);
        // remove syllabification
        String parsedTextWithWrap = pdfStripper.getText(pdDoc);
        result = parsedTextWithWrap.replace("-\n", "");
    } catch (Exception e) {
        LOG.error("[pdfToText] - EXCEPTION: ", e);
        throw new TMFVisibleException("Problem parsing file: the PDF document you uploaded seems malformed.");
    } finally {
        // Closing the PDDocument also closes the COSDocument it wraps, so the
        // COSDocument is closed directly only when no PDDocument was created.
        try {
            if (pdDoc != null) {
                pdDoc.close();
            } else if (cosDoc != null) {
                cosDoc.close();
            }
        } catch (Exception e) {
            LOG.error("[pdfToText] - EXCEPTION: ", e);
        }
        try {
            if (input != null) {
                input.close();
            }
        } catch (Exception e) {
            LOG.error("[pdfToText] - EXCEPTION: ", e);
        }
    }
    LOG.debug("[pdfToText] - END");
    return result;
}

From source file:javaapplication1.PDFManager.java

/**
 * Reads the first page of the PDF at {@code filePath}, prints its text to
 * standard output and collects the labelled fields found in it.
 *
 * <p>Each line of the page text is matched against a fixed list of labels; for
 * the first label the line contains, everything after the first
 * {@code label.length() + 1} characters of the line is stored under that label.
 * A later line with the same label overwrites the earlier value.
 * {@code "Discretionary 2"} is not extracted.
 *
 * <p>If processing a line throws, an error dialog is shown and the fields
 * collected up to that point are returned.
 *
 * @return a map from label to the text following it
 * @throws IOException if the file cannot be opened, parsed or stripped
 */
public Map<String, String> ToText() throws IOException {
    pdfStripper = null;
    pdDoc = null;
    cosDoc = null;

    file = new File(filePath);
    parser = new PDFParser(new RandomAccessFile(file, "r")); // update for PDFBox V 2.0
    parser.parse();

    cosDoc = parser.getDocument();
    pdfStripper = new PDFTextStripper();
    pdDoc = new PDDocument(cosDoc);
    pdDoc.getNumberOfPages();

    // Restrict extraction to the first page; to read the whole file, set the
    // end page to pdDoc.getNumberOfPages() instead.
    pdfStripper.setStartPage(1);
    pdfStripper.setEndPage(1);
    Text = pdfStripper.getText(pdDoc);
    System.out.println(Text);

    // Labels in the order they are tested; only the first match per line counts.
    final String[] labels = { "Type", "Document Number", "Date of Birth", "Date of Expiry", "Issuer",
            "Nationality", "First Names", "Last Names", "Discretionary 1", "Gender" };

    String[] lines = Text.split("\n");
    Map<String, String> fields = new HashMap<String, String>();

    try {
        for (String line : lines) {
            for (String label : labels) {
                if (line.contains(label)) {
                    // The value starts one character after a label-sized prefix of the line.
                    fields.put(label, line.substring(label.length() + 1));
                    break;
                }
            }
        }
    } catch (Exception e) {
        JOptionPane.showMessageDialog(null, "please selecet OCR PDF", "worng pass", JOptionPane.ERROR_MESSAGE);
    }

    return fields;
}

From source file:javadocofflinesearch.htmlprocessing.PdfAttempter.java

/**
 * Parses a PDF from the given stream and returns its text.
 *
 * <p>Both the low-level and the high-level document are closed before the
 * method returns or throws.
 *
 * @param is    stream to read the PDF from
 * @param stats when {@code true}, the extracted text is also passed to
 *              {@code vc.addAll}
 * @return the extracted text
 * @throws IOException if parsing, extraction or closing fails
 */
public String pdftoText(InputStream is, boolean stats) throws IOException {
    COSDocument lowLevelDoc = null;
    PDDocument highLevelDoc = null;
    try {
        final PDFParser pdfParser = new PDFParser(is);
        pdfParser.parse();
        lowLevelDoc = pdfParser.getDocument();

        final PDFTextStripper stripper = new PDFTextStripper();
        highLevelDoc = new PDDocument(lowLevelDoc);

        final String extracted = stripper.getText(highLevelDoc);
        if (stats) {
            vc.addAll(extracted);
        }
        return extracted;
    } finally {
        if (lowLevelDoc != null) {
            lowLevelDoc.close();
        }
        if (highLevelDoc != null) {
            highLevelDoc.close();
        }
    }
}