Example usage for the org.apache.pdfbox.pdmodel.PDDocument constructor PDDocument(COSDocument)

Introduction

On this page you can find example usages of the org.apache.pdfbox.pdmodel.PDDocument constructor PDDocument(COSDocument).

Prototype

public PDDocument(COSDocument doc)

Source Link

Document

Constructor that uses an existing document.

Usage

From source file:jgnash.report.pdf.Report.java

License:Open Source License

/**
 * Creates a report backed by a new PDF document, loads the report fonts into it and
 * applies the stored base font size and the page format.
 */
public Report() {
    // New PDFBox document whose memory usage is configured via MemoryUsageSetting.setupMixed(MAX_MEMORY_USAGE)
    this.pdfDocument = new PDDocument(MemoryUsageSetting.setupMixed(MAX_MEMORY_USAGE));

    // Load the table (mono), header and footer (proportional) fonts into the document
    setTableFont(loadFont(ReportFactory.getMonoFont(), pdfDocument));
    setHeaderFont(loadFont(ReportFactory.getHeaderFont(), pdfDocument));
    setFooterFont(loadFont(ReportFactory.getProportionalFont(), pdfDocument));

    // restore font size; falls back to DEFAULT_BASE_FONT_SIZE when no preference is stored
    baseFontSize = getPreferences().getFloat(BASE_FONT_SIZE, DEFAULT_BASE_FONT_SIZE);

    // restore the page format
    setPageFormat(getPageFormat());
}

From source file:ocr_pdf.OCR_PDF.java

License:GNU General Public License

/**
 * Get PDFBox output of airport diagram.
 *
 * @param fileName name of airport diagram PDF file.
 * @return text representation of airport diagram, or {@code null} if the file does not
 *         exist, cannot be opened, or cannot be parsed.
 */
static String getTextPDFBox(String fileName) {
    File file = new File(fileName);
    if (!file.isFile()) {
        System.err.println("File " + fileName + " does not exist.");
        return null;
    }

    String parsedText = null;
    FileInputStream input = null;
    PDDocument pdDoc = null;
    COSDocument cosDoc = null;
    try {
        PDFParser parser;
        try {
            input = new FileInputStream(file);
            parser = new PDFParser(input);
        } catch (IOException e) {
            System.err.println("Unable to open PDF Parser. " + e.getMessage());
            return null;
        }
        try {
            parser.parse();
            cosDoc = parser.getDocument();
            PDFTextStripper pdfStripper = new PDFTextStripper();
            //true doesn't work so well.
            pdfStripper.setSortByPosition(false);
            pdDoc = new PDDocument(cosDoc);
            parsedText = pdfStripper.getText(pdDoc);
        } catch (Exception e) {
            System.err.println("An exception occured in parsing the PDF Document." + e.getMessage());
        }
    } finally {
        // Closing the PDDocument also closes the COSDocument it wraps, so the COS document
        // is only closed directly when it was never wrapped.
        try {
            if (pdDoc != null) {
                pdDoc.close();
            } else if (cosDoc != null) {
                cosDoc.close();
            }
        } catch (Exception e) {
            System.err.println("Unable to close the PDF document. " + e.getMessage());
        }
        // The stream is closed separately so that a failure above cannot leak the file handle.
        if (input != null) {
            try {
                input.close();
            } catch (IOException e) {
                System.err.println("Unable to close " + fileName + ". " + e.getMessage());
            }
        }
    }
    return parsedText;
}

From source file:opennlp.PDFTools.java

/**
 * Extracts the text of every page of a PDF file.
 *
 * <p>The parser, stripper and document are still stored in the instance fields, as before,
 * but the document and its underlying file are now closed before returning, so those
 * fields must not be used to read the PDF afterwards.
 *
 * @param filePath path of the PDF file to read
 * @return the extracted text, or {@code null} if the file could not be read or parsed
 */
public String getStringFromPDF(String filePath) {

    String text = null;
    // Locals track what THIS call opened, so a stale field from an earlier call is never closed again.
    RandomAccessFile source = null;
    COSDocument openedCos = null;
    PDDocument openedDoc = null;

    try {

        File file = new File(filePath);
        source = new RandomAccessFile(file, "r");
        parser = new PDFParser(source);

        parser.parse();
        openedCos = parser.getDocument();
        cosDoc = openedCos;
        pdfStripper = new PDFTextStripper();
        openedDoc = new PDDocument(openedCos);
        pdDoc = openedDoc;

        // Strip the whole document: first page through last page.
        pdfStripper.setStartPage(1);
        pdfStripper.setEndPage(openedDoc.getNumberOfPages());

        text = pdfStripper.getText(openedDoc);

    } catch (IOException e) {
        logger.error("IO ERROR", e);
    } catch (Exception ex) {
        logger.error("ERROR", ex);
    } finally {
        try {
            // Closing the PDDocument closes the wrapped COSDocument as well.
            if (openedDoc != null) {
                openedDoc.close();
            } else if (openedCos != null) {
                openedCos.close();
            }
        } catch (IOException e) {
            logger.error("IO ERROR", e);
        }
        if (source != null) {
            try {
                source.close();
            } catch (IOException e) {
                logger.error("IO ERROR", e);
            }
        }
    }

    return text;
}

From source file:org.crypto.sse.TextExtractPar.java

License:Open Source License

/**
 * Extracts the keywords of every given file and builds two lookups from them.
 *
 * <p>The text of each file is obtained according to its extension (.docx, .pptx, .xlsx,
 * .doc, .pdf); media files contribute only their file name; any other file is read as
 * UTF-8 text lines. The text is tokenized with a {@link StandardAnalyzer} using the default
 * English stop words. A file that cannot be read is reported on standard output and
 * contributes no keywords.
 *
 * @param listOfFile files to process
 * @return a {@code TextExtractPar} holding keyword -> file names (first lookup) and
 *         file name -> keywords (second lookup), each pair stored at most once
 * @throws FileNotFoundException if one of the files cannot be opened
 */
private static TextExtractPar extractOneDoc(File[] listOfFile) throws FileNotFoundException {

    Multimap<String, String> lookup1 = ArrayListMultimap.create();
    Multimap<String, String> lookup2 = ArrayListMultimap.create();

    for (File file : listOfFile) {

        // Progress report: print the first percentage step whose file count matches the counter.
        for (int j = 0; j < 100; j++) {

            if (counter == (int) ((j + 1) * listOfFile.length / 100)) {
                System.out.println("Number of files read equals " + j + " %");
                break;
            }
        }

        List<String> lines = new ArrayList<String>();
        counter++;
        String fileName = file.getName();
        FileInputStream fis = new FileInputStream(file);

        try {

            ///////////////////// .docx /////////////////////////////
            if (fileName.endsWith(".docx")) {
                try {
                    XWPFDocument doc = new XWPFDocument(fis);
                    XWPFWordExtractor ex = new XWPFWordExtractor(doc);
                    lines.add(ex.getText());
                } catch (IOException e) {
                    System.out.println("File not read: " + file.getName());
                }
            }

            ///////////////////// .pptx /////////////////////////////
            else if (fileName.endsWith(".pptx")) {
                try {
                    OPCPackage ppt = OPCPackage.open(fis);
                    XSLFPowerPointExtractor xw = new XSLFPowerPointExtractor(ppt);
                    lines.add(xw.getText());
                } catch (XmlException e) {
                    System.out.println("File not read: " + file.getName());
                } catch (IOException e) {
                    System.out.println("File not read: " + file.getName());
                } catch (OpenXML4JException e) {
                    System.out.println("File not read: " + file.getName());
                }
            }

            ///////////////////// .xlsx /////////////////////////////
            else if (fileName.endsWith(".xlsx")) {
                try {
                    OPCPackage xls = OPCPackage.open(fis);
                    XSSFExcelExtractor xe = new XSSFExcelExtractor(xls);
                    lines.add(xe.getText());
                } catch (InvalidFormatException e) {
                    System.out.println("File not read: " + file.getName());
                } catch (IOException e) {
                    System.out.println("File not read: " + file.getName());
                } catch (XmlException e) {
                    System.out.println("File not read: " + file.getName());
                } catch (OpenXML4JException e) {
                    System.out.println("File not read: " + file.getName());
                }
            }

            ///////////////////// .doc /////////////////////////////
            else if (fileName.endsWith(".doc")) {
                NPOIFSFileSystem fs = null;
                try {
                    fs = new NPOIFSFileSystem(file);
                    WordExtractor extractor = new WordExtractor(fs.getRoot());
                    for (String rawText : extractor.getParagraphText()) {
                        lines.add(extractor.stripFields(rawText));
                    }
                } catch (IOException e) {
                    System.out.println("File not read: " + file.getName());
                } finally {
                    // The file system opens the file itself, so it must be released here.
                    if (fs != null) {
                        try {
                            fs.close();
                        } catch (IOException ignored) {
                            // best effort: the text has already been extracted (or the failure reported)
                        }
                    }
                }
            }

            ///////////////////// .pdf /////////////////////////////
            else if (fileName.endsWith(".pdf")) {
                PDDocument pdf = null;
                try {
                    PDFParser parser = new PDFParser(fis);
                    parser.parse();
                    COSDocument cd = parser.getDocument();
                    pdf = new PDDocument(cd);
                    PDFTextStripper stripper = new PDFTextStripper();
                    lines.add(stripper.getText(pdf));
                } catch (IOException e) {
                    System.out.println("File not read: " + file.getName());
                } finally {
                    // Closing the PDDocument also releases the COSDocument it wraps.
                    if (pdf != null) {
                        try {
                            pdf.close();
                        } catch (IOException ignored) {
                            // best effort: the text has already been extracted (or the failure reported)
                        }
                    }
                }
            }

            ///////////////////// Media Files such as gif, jpeg, .wmv, .mpeg, .mp4 ///////////////
            // A name can only end with ONE of these extensions, so the checks must be OR-ed;
            // such files are indexed by their name only.
            else if (fileName.endsWith(".gif") || fileName.endsWith(".jpeg")
                    || fileName.endsWith(".wmv") || fileName.endsWith(".mpeg")
                    || fileName.endsWith(".mp4")) {

                lines.add(file.getName());
            }

            ///////////////////// raw text extensions /////////////////////////////
            else {
                try {
                    lines = Files.readLines(file, Charsets.UTF_8);
                } catch (IOException e) {
                    System.out.println("File not read: " + file.getName());
                }
            }

        } finally {
            // Release the file handle whichever branch handled the file.
            try {
                fis.close();
            } catch (IOException ioex) {
                // omitted.
            }
        }

        ///////////////////// Begin word extraction /////////////////////////////

        // A set of English noise keywords is used that eliminates words such as "the, a, etc"
        CharArraySet noise = EnglishAnalyzer.getDefaultStopSet();

        for (int i = 0; i < lines.size(); i++) {

            // We are using a standard tokenizer that eliminates the stop
            // words. We can use Stemming tokenizer such Porter
            Analyzer analyzer = new StandardAnalyzer(noise);
            List<String> token = Tokenizer.tokenizeString(analyzer, lines.get(i));
            for (int j = 0; j < token.size(); j++) {

                // Avoid counting occurrences of words in the same file
                if (!lookup2.get(file.getName()).contains(token.get(j))) {
                    lookup2.put(file.getName(), token.get(j));
                }

                // Avoid counting occurrences of words in the same file
                if (!lookup1.get(token.get(j)).contains(file.getName())) {
                    lookup1.put(token.get(j), file.getName());
                }

            }

        }

    }

    return new TextExtractPar(lookup1, lookup2);

}

From source file:org.dspace.content.packager.PDFPackager.java

License:BSD License

/**
 * Copies the Info-dictionary entries of a PDF onto the given item as Dublin Core values
 * and updates the item.
 *
 * <p>Mapping of PDF Info dictionary entries to DC (this is not a crosswalk plugin because
 * it is not useful anywhere else and the source data is not XML):
 * <pre>
 *   Title        -> title.null
 *   Author       -> contributor.author
 *   CreationDate -> date.created
 *   ModDate      -> date.created (only when there is no CreationDate)
 *   Creator      -> description.provenance (application that created orig)
 *   Producer     -> description.provenance (convertor to pdf)
 *   Subject      -> description.abstract
 *   Keywords     -> subject.other
 * </pre>
 *
 * @param context  current context (not used directly here)
 * @param item     item receiving the metadata
 * @param metadata stream containing the PDF document
 * @throws MetadataValidationException if the PDF is encrypted or has no Title entry
 */
private void crosswalkPDF(Context context, Item item, InputStream metadata)
        throws CrosswalkException, IOException, SQLException, AuthorizeException {
    COSDocument cosDocument = null;

    try {
        PDFParser pdfParser = new PDFParser(metadata);
        pdfParser.parse();
        cosDocument = pdfParser.getDocument();

        // sanity check: PDFBox breaks on encrypted documents, so give up.
        if (cosDocument.getEncryptionDictionary() != null) {
            throw new MetadataValidationException("This packager cannot accept an encrypted PDF document.");
        }

        PDDocument pdDocument = new PDDocument(cosDocument);
        PDDocumentInformation info = pdDocument.getDocumentInformation();

        // sanity check: item must have a title.
        final String title = info.getTitle();
        if (title == null) {
            throw new MetadataValidationException(
                    "This PDF file is unacceptable, it does not have a value for \"Title\" in its Info dictionary.");
        }
        if (log.isDebugEnabled()) {
            log.debug("PDF Info dict title=\"" + title + "\"");
        }
        item.addDC("title", null, "en", title);

        final String author = info.getAuthor();
        if (author != null) {
            item.addDC("contributor", "author", null, author);
            if (log.isDebugEnabled()) {
                log.debug("PDF Info dict author=\"" + author + "\"");
            }
        }

        final String creator = info.getCreator();
        if (creator != null) {
            item.addDC("description", "provenance", "en",
                    "Application that created the original document: " + creator);
        }

        final String producer = info.getProducer();
        if (producer != null) {
            item.addDC("description", "provenance", "en", "Original document converted to PDF by: " + producer);
        }

        final String subject = info.getSubject();
        if (subject != null) {
            item.addDC("description", "abstract", null, subject);
        }

        final String keywords = info.getKeywords();
        if (keywords != null) {
            item.addDC("subject", "other", null, keywords);
        }

        // Take either CreationDate or ModDate as "date.created",
        // Too bad there's no place to put "last modified" in the DC.
        final Calendar creationDate = info.getCreationDate();
        final Calendar created = (creationDate != null) ? creationDate : info.getModificationDate();
        if (created != null) {
            item.addDC("date", "created", null, (new DCDate(created.getTime())).toString());
        }

        item.update();
    } finally {
        if (cosDocument != null) {
            cosDocument.close();
        }
    }
}

From source file:org.dspace.submit.step.UploadStep.java

License:BSD License

/**
 * Process the upload of a new file!
 *
 * <p>Besides storing the uploaded file as a bitstream of the submission item, a short text
 * excerpt is extracted for pdf, txt, doc and docx uploads and stored on the item as
 * {@code dc.textpart}.
 *
 * @param context
 *            current DSpace context
 * @param request
 *            current servlet request object
 * @param response
 *            current servlet response object
 * @param subInfo
 *            submission info object
 *
 * @return Status or error flag which will be processed by
 *         UI-related code! (if STATUS_COMPLETE or 0 is returned,
 *         no errors occurred!)
 */
public int processUploadFile(Context context, HttpServletRequest request, HttpServletResponse response,
        SubmissionInfo subInfo) throws ServletException, IOException, SQLException, AuthorizeException {
    BitstreamFormat bf = null;
    Bitstream b = null;

    //NOTE: File should already be uploaded.
    //Manakin does this automatically via Cocoon.
    //For JSP-UI, the SubmissionController.uploadFiles() does the actual upload

    Enumeration<?> attNames = request.getAttributeNames();

    //loop through our request attributes
    while (attNames.hasMoreElements()) {
        String attr = (String) attNames.nextElement();

        //if this ends with "-path", this attribute
        //represents a newly uploaded file
        if (attr.endsWith("-path")) {
            //strip off the trailing -path (and only the trailing one) to get the
            //actual parameter that the file was uploaded as
            String param = attr.substring(0, attr.length() - "-path".length());

            // NOTE(review): the extension is taken from the last three characters of the
            // parameter name, not from the uploaded file's path -- confirm this is intended.
            String exten = (param.length() > 3 ? param.substring(param.length() - 3) : param).toLowerCase();

            // Load the file's path and input stream and description
            String filePath = (String) request.getAttribute(param + "-path");
            InputStream uploadedStream = (InputStream) request.getAttribute(param + "-inputstream");

            //attempt to get description from attribute first, then direct from a parameter
            String fileDescription = (String) request.getAttribute(param + "-description");
            if (fileDescription == null || fileDescription.length() == 0) {
                fileDescription = request.getParameter("description");
            }

            // if information wasn't passed by User Interface, we had a problem
            // with the upload (checked BEFORE the stream is read, so a missing
            // stream is reported instead of causing a NullPointerException)
            if (filePath == null || uploadedStream == null) {
                return STATUS_UPLOAD_ERROR;
            }

            if (subInfo == null) {
                // In any event, if we don't have the submission info, the request
                // was malformed
                return STATUS_INTEGRITY_ERROR;
            }

            // Buffer the upload so it can be read once for the bitstream and once more
            // for the text excerpt.
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            byte[] buf = new byte[1024];
            int n;
            while ((n = uploadedStream.read(buf)) >= 0) {
                baos.write(buf, 0, n);
            }
            byte[] content = baos.toByteArray();

            InputStream fileInputStream = new ByteArrayInputStream(content);

            // Create the bitstream
            Item item = subInfo.getSubmissionItem().getItem();

            // do we already have a bundle?
            Bundle[] bundles = item.getBundles("ORIGINAL");

            if (bundles.length < 1) {
                // set bundle's name to ORIGINAL
                b = item.createSingleBitstream(fileInputStream, "ORIGINAL");
            } else {
                // we have a bundle already, just add bitstream
                b = bundles[0].createBitstream(fileInputStream);
            }

            if (exten.equals("pdf")) {
                // Best effort: a PDF that cannot be parsed must not fail the upload.
                PDDocument docum = null;
                try {
                    PDFParser parser = new PDFParser(new ByteArrayInputStream(content));
                    parser.parse();
                    COSDocument cosDoc = parser.getDocument();
                    PDFTextStripper pdfStripper = new PDFTextStripper();
                    docum = new PDDocument(cosDoc);

                    String subText = middleExcerpt(pdfStripper.getText(docum));
                    // start after the first full stop so the excerpt begins on a sentence
                    // (indexOf returns -1 when there is none, which keeps the whole excerpt)
                    subText = subText.substring(subText.indexOf(".") + 1);
                    addTextPart(context, item, subText);
                } catch (Exception e) {
                    log.warn("Could not extract a text excerpt from the uploaded PDF", e);
                } finally {
                    if (docum != null) {
                        try {
                            docum.close();
                        } catch (IOException e) {
                            log.warn("Could not close the uploaded PDF after text extraction", e);
                        }
                    }
                }
            }

            if (exten.equals("txt")) {
                StringWriter writer = new StringWriter();
                IOUtils.copy(new ByteArrayInputStream(content), writer, "UTF-8");

                String theString = writer.toString();
                if (!theString.startsWith("\uFEFF")) {
                    // no UTF-8 byte order mark: decode the file as Windows-1252 instead
                    StringWriter writerAnsi = new StringWriter();
                    IOUtils.copy(new ByteArrayInputStream(content), writerAnsi, "Cp1252");
                    theString = writerAnsi.toString();
                }
                addTextPart(context, item, middleExcerpt(theString));
            }

            if (exten.equals("doc")) {
                // Best effort: a document that cannot be parsed must not fail the upload.
                try {
                    HWPFDocument document = new HWPFDocument(new ByteArrayInputStream(content));
                    WordExtractor extractor = new WordExtractor(document);
                    addTextPart(context, item, middleExcerpt(extractor.getText()));
                } catch (Exception exep) {
                    log.warn("Could not extract a text excerpt from the uploaded Word document", exep);
                }
            }

            // "ocx" is what remains of a ".docx" suffix after taking the last three characters
            if (exten.equals("ocx")) {
                XWPFDocument document = new XWPFDocument(new ByteArrayInputStream(content));
                XWPFWordExtractor extractor = new XWPFWordExtractor(document);
                addTextPart(context, item, middleExcerpt(extractor.getText()));
            }

            // Strip all but the last filename. It would be nice
            // to know which OS the file came from.
            String noPath = filePath;
            noPath = noPath.substring(noPath.lastIndexOf('/') + 1);
            noPath = noPath.substring(noPath.lastIndexOf('\\') + 1);

            b.setName(noPath);
            b.setSource(filePath);
            b.setDescription(fileDescription);

            // Identify the format
            bf = FormatIdentifier.guessFormat(context, b);
            b.setFormat(bf);

            // Update to DB
            b.update();
            item.update();

            if ((bf != null) && (bf.isInternal())) {
                log.warn("Attempt to upload file format marked as internal system use only");
                backoutBitstream(subInfo, b, item);
                return STATUS_UPLOAD_ERROR;
            }

            // Check for virus
            if (ConfigurationManager.getBooleanProperty("submission-curation", "virus-scan")) {
                Curator curator = new Curator();
                curator.addTask("vscan").curate(item);
                int status = curator.getStatus("vscan");
                if (status == Curator.CURATE_ERROR) {
                    backoutBitstream(subInfo, b, item);
                    return STATUS_VIRUS_CHECKER_UNAVAILABLE;
                } else if (status == Curator.CURATE_FAIL) {
                    backoutBitstream(subInfo, b, item);
                    return STATUS_CONTAINS_VIRUS;
                }
            }

            // If we got this far then everything is more or less ok.

            // Comment - not sure if this is the right place for a commit here
            // but I'm not brave enough to remove it - Robin.
            context.commit();

            // save this bitstream to the submission info, as the
            // bitstream we're currently working with
            subInfo.setBitstream(b);

            //if format was not identified
            if (bf == null) {
                return STATUS_UNKNOWN_FORMAT;
            }

        } //end if attribute ends with "-path"
    } //end while

    return STATUS_COMPLETE;

}

/**
 * Returns up to 500 characters of the text, starting at its middle. Never throws for
 * short or empty text.
 */
private static String middleExcerpt(String text) {
    final int maxLength = 500;
    int start = text.length() / 2;
    int end = Math.min(text.length(), start + maxLength);
    return text.substring(start, end);
}

/**
 * Stores the excerpt (followed by "...") as dc.textpart on the item and commits it.
 */
private static void addTextPart(Context context, Item item, String excerpt)
        throws SQLException, AuthorizeException {
    item.addMetadata("dc", "textpart", null, null, excerpt + "...");
    item.update();
    context.commit();
}

From source file:org.encuestame.business.search.IndexerFile.java

License:Apache License

/**
 * Parse pdf Document./*from  w  w w.  ja va 2  s  .c  o  m*/
 * @param file
 * @return
 * @throws IOException
 */
public static PDDocument parsePdfDocument(final File file) throws IOException {
    InputStream is = new FileInputStream(file);
    COSDocument cosDoc = null;
    PDDocument pdDoc = null;
    try {
        cosDoc = SearchUtils.parseDocument(is);
        pdDoc = new PDDocument(cosDoc);
    } catch (IOException e) {
        // TODO Auto-generated catch block
        log.error(e);
    } finally {
        if (pdDoc == null) {
            log.error("PdDocument is null");
        } else {
            pdDoc.close();
        }
    }

    return pdDoc;
}

From source file:org.encuestame.business.search.SearchUtils.java

License:Apache License

/**
 * Create PDF Document./*from w  w w . j a v a2 s .c o  m*/
 * @param file {@link File}
 * @param Long attachmentId.
 * @return {@link Document}
 * @throws Exception
 */
public static Document createPdfDocument(final File file) throws Exception {
    InputStream is = new FileInputStream(file);
    COSDocument cosDoc = null;
    String docText = "";
    PDDocument pdDoc = null;
    try {
        cosDoc = parseDocument(is);
        pdDoc = new PDDocument(cosDoc);
        PDFTextStripper stripper = new PDFTextStripper();
        docText = stripper.getText(pdDoc);
        log.debug("PDF Doc Text " + docText.length());
    } finally {
        if (pdDoc == null) {
            log.error("PdDocument is null");
        } else {
            pdDoc.close();
        }
    }
    final Document doc = SearchUtils.addFields(file, docText);
    return doc;
}

From source file:org.kimios.kernel.index.filters.PDFFilter.java

License:Open Source License

/**
 * Extracts the text of the PDF read from the given stream.
 *
 * @param in stream containing the PDF; it is not closed by this method itself
 * @return the text of the document
 * @throws IOException if the PDF cannot be parsed or its text cannot be extracted
 */
public String getBody(InputStream in) throws IOException {
    PDFParser parser = new PDFParser(in);
    parser.parse();
    COSDocument cosDoc = parser.getDocument();
    PDDocument pDDoc = new PDDocument(cosDoc);
    try {
        return new PDFTextStripper().getText(pDDoc);
    } finally {
        // Close the document even when text extraction fails.
        pDDoc.close();
    }
}

From source file:org.nuxeo.typeDocPkg.PdfDoc.java

License:Apache License

/**
 * Opens and parses the given PDF file and stores the file, parser, COS document, text
 * stripper and PD document in the instance fields for later use.
 *
 * @param FileName path of the PDF file to open
 * @return {@code true} if the file was parsed, {@code false} if it does not exist or
 *         could not be opened or parsed
 */
private boolean setMain(String FileName) throws Exception {
    file = new File(FileName);
    if (!file.isFile()) {
        // report the file that was actually requested
        System.err.println("File " + FileName + " does not exist.");
        return false;
    }

    FileInputStream input = null;
    try {
        input = new FileInputStream(file);
        parser = new PDFParser(input);
    } catch (IOException e) {
        log.error("Unable to open PDF Parser. ", e);
        // The parser never took over the stream, so release the file handle here.
        if (input != null) {
            try {
                input.close();
            } catch (IOException closeError) {
                log.error("Unable to open PDF Parser. ", closeError);
            }
        }
        return false;
    }

    try {
        parser.parse();
        cosDoc = parser.getDocument();
        pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);
    } catch (Exception e) {
        log.error("error in setMain method ", e);
        return false;
    }

    return true;
}