Example usage for org.apache.pdfbox.pdmodel PDDocument isEncrypted

List of usage examples for org.apache.pdfbox.pdmodel PDDocument isEncrypted

Introduction

This page collects usage examples for the isEncrypted() method of org.apache.pdfbox.pdmodel.PDDocument.

Prototype

public boolean isEncrypted() 

Source Link

Document

This will tell if this document is encrypted or not.

Usage

From source file:net.sf.jabref.imports.PdfContentImporter.java

License:Open Source License

/**
 * Imports BibTeX entries from the first page of a PDF document.
 *
 * The text of page 1 is extracted first. If a DOI is detected in that text,
 * the DOI fetcher is queried and its entries are returned when it delivers
 * any. Otherwise the text is parsed heuristically, line by line, and a
 * single entry is built from whatever fields could be recognised; the raw
 * page text is always stored in the "review" field of that entry.
 *
 * The heuristic parser communicates with its helper methods
 * (proceedToNextNonEmptyLine, fillCurStringWithNonEmptyLines, extractYear,
 * readLastBlock) through state that is not declared in this method
 * (split, i, curString, year), so concurrent calls on the same importer
 * would presumably interfere with each other -- TODO confirm.
 *
 * @param in
 *            stream containing the PDF
 * @param status
 *            receives user-visible messages and is passed on to the DOI fetcher
 * @return the imported entries; empty if the document could not be loaded
 *         or page 1 contains no non-empty line
 * @throws IOException
 *             if extracting the text or closing the document fails
 */
@Override
public List<BibtexEntry> importEntries(InputStream in, OutputPrinter status) throws IOException {
    final ArrayList<BibtexEntry> res = new ArrayList<BibtexEntry>(1);

    PDDocument document;
    try {
        document = PDDocument.load(in);
    } catch (IOException e) {
        // an unreadable document is reported as "nothing imported" instead of failing the import
        PdfContentImporter.logger.log(Level.SEVERE, "Could not load document", e);
        return res;
    }

    try {
        if (document.isEncrypted()) {
            // only logged: the import is still attempted on encrypted documents
            PdfContentImporter.logger.log(Level.INFO, Globals.lang("Encrypted documents are not supported"));
        }

        // only the first page is inspected
        PDFTextStripper stripper = new PDFTextStripper();
        stripper.setStartPage(1);
        stripper.setEndPage(1);
        stripper.setSortByPosition(true);
        stripper.setParagraphEnd(System.getProperty("line.separator"));
        StringWriter writer = new StringWriter();
        stripper.writeText(document, writer);
        String textResult = writer.toString();

        String doi = DOIUtil.getDOI(textResult);
        // NOTE(review): presumably getDOI returns its input unchanged when no DOI is present,
        // so a shorter result means a DOI was found -- confirm against DOIUtil
        if (doi.length() < textResult.length()) {
            // A DOI was found in the text
            // We do NO parsing of the text, but use the DOI fetcher

            // named "inspector" so that it does not shadow the line index "i" used below
            ImportInspector inspector = new ImportInspector() {

                @Override
                public void toFront() {
                }

                @Override
                public void setProgress(int current, int max) {
                }

                @Override
                public void addEntry(BibtexEntry entry) {
                    // add the entry to the result object
                    res.add(entry);
                }
            };
            PdfContentImporter.doiToBibTeXFetcher.processQuery(doi, inspector, status);
            if (res.size() != 0) {
                // if something has been found, return the result
                return res;
            }
            // otherwise, we just parse the PDF
        }

        String author;
        String editor = null;
        String institution = null;
        String abstractT = null;
        String keywords = null;
        String title;
        String conference = null;
        String DOI = null;
        String series = null;
        String volume = null;
        String number = null;
        String pages = null;
        // year is a class variable as the method extractYear() uses it;
        String publisher = null;
        BibtexEntryType type = BibtexEntryType.INPROCEEDINGS;

        final String lineBreak = System.getProperty("line.separator");

        split = textResult.split(lineBreak);

        // idea: split[] contains the different lines
        // blocks are separated by empty lines
        // treat each block
        //   or do special treatment at authors (which are not broken)
        //   therefore, we do a line-based and not a block-based splitting
        // i points to the current line
        // curString (mostly) contains the current block
        //   the different lines are joined into one and thereby separated by " "

        proceedToNextNonEmptyLine();
        if (i >= split.length) {
            // PDF could not be parsed or is empty
            // return empty list
            return res;
        }
        curString = split[i];
        i = i + 1;

        if (curString.length() > 4) {
            // special case: possibly conference as first line on the page
            extractYear();
            if (curString.contains("Conference")) {
                fillCurStringWithNonEmptyLines();
                conference = curString;
                curString = "";
            } else {
                // e.g. Copyright (c) 1998 by the Genetics Society of America
                // future work: get year using RegEx
                String lower = curString.toLowerCase();
                if (lower.contains("copyright")) {
                    fillCurStringWithNonEmptyLines();
                    publisher = curString;
                    curString = "";
                }
            }
        }

        // start: title
        fillCurStringWithNonEmptyLines();
        title = streamlineTitle(curString);
        curString = "";
        //i points to the next non-empty line

        // after title: authors
        author = null;
        while ((i < split.length) && (!split[i].equals(""))) {
            // author names are unlikely to be split among different lines
            // treat them line by line
            curString = streamlineNames(split[i]);
            if (author == null) {
                author = curString;
            } else if (!curString.equals("")) {
                // if split[i] is "and" then "" is returned by streamlineNames -> nothing to append
                author = author.concat(" and ").concat(curString);
            }
            i++;
        }
        curString = "";
        i++;

        // then, abstract and keywords follow
        while (i < split.length) {
            curString = split[i];
            if ((curString.length() >= "Abstract".length())
                    && (curString.substring(0, "Abstract".length()).equalsIgnoreCase("Abstract"))) {
                if (curString.length() == "Abstract".length()) {
                    // only word "abstract" found -- skip line
                    curString = "";
                } else {
                    // +1 skips the character following the keyword (e.g. ":" or " ")
                    curString = curString.substring("Abstract".length() + 1).trim().concat(lineBreak);
                }
                i++;
                // fillCurStringWithNonEmptyLines() cannot be used as that uses " " as line separator
                // whereas we need linebreak as separator
                while ((i < split.length) && (!split[i].equals(""))) {
                    curString = curString.concat(split[i]).concat(lineBreak);
                    i++;
                }
                abstractT = curString;
                i++;
            } else if ((curString.length() >= "Keywords".length())
                    && (curString.substring(0, "Keywords".length()).equalsIgnoreCase("Keywords"))) {
                if (curString.length() == "Keywords".length()) {
                    // only word "Keywords" found -- skip line
                    curString = "";
                } else {
                    // +1 skips the character following the keyword (e.g. ":" or " ")
                    curString = curString.substring("Keywords".length() + 1).trim();
                }
                i++;
                fillCurStringWithNonEmptyLines();
                keywords = removeNonLettersAtEnd(curString);
            } else {
                String lower = curString.toLowerCase();

                int pos = lower.indexOf("technical");
                if (pos >= 0) {
                    type = BibtexEntryType.TECHREPORT;
                    // NOTE(review): pos is computed on the trimmed line but applied to the
                    // untrimmed one; with leading whitespace the number may be cut wrongly
                    pos = curString.trim().lastIndexOf(' ');
                    if (pos >= 0) {
                        number = curString.substring(pos + 1);
                    }
                }

                i++;
                proceedToNextNonEmptyLine();
            }
        }

        i = split.length - 1;

        // last block: DOI, detailed information
        // sometimes, this information is in the third last block etc...
        // therefore, read until the beginning of the file

        while (i >= 0) {
            readLastBlock();
            // i now points to the block before or is -1
            // curString contains the last block, separated by " "

            extractYear();

            int pos = curString.indexOf("(Eds.)");
            if ((pos >= 0) && (publisher == null)) {
                // looks like a Springer last line
                // e.g: A. Persson and J. Stirna (Eds.): PoEM 2009, LNBIP 39, pp. 161-175, 2009.
                publisher = "Springer";
                // clamp: "(Eds.)" at the very start of the block leaves no editor names
                editor = streamlineNames(curString.substring(0, Math.max(pos - 1, 0)));
                // +2 because of ":" after (Eds.) and the subsequent space;
                // clamped so that a block ending right after "(Eds.)" does not overrun the string
                curString = curString.substring(Math.min(pos + "(Eds.)".length() + 2, curString.length()));
                String[] springerSplit = curString.split(", ");
                if (springerSplit.length >= 4) {
                    conference = springerSplit[0];

                    // "<series> <volume>", e.g. "LNBIP 39"; skipped if there is no space to split at
                    String seriesData = springerSplit[1];
                    int lastSpace = seriesData.lastIndexOf(' ');
                    if (lastSpace >= 0) {
                        series = seriesData.substring(0, lastSpace);
                        volume = seriesData.substring(lastSpace + 1);
                    }

                    // drop the leading "pp. "; skipped if the token is too short to carry that prefix
                    if (springerSplit[2].length() >= 4) {
                        pages = springerSplit[2].substring(4);
                    }

                    if (springerSplit[3].length() >= 4) {
                        year = springerSplit[3].substring(0, 4);
                    }
                }
            } else {
                if (DOI == null) {
                    pos = curString.indexOf("DOI");
                    if (pos < 0) {
                        pos = curString.indexOf("doi");
                    }
                    if (pos >= 0) {
                        pos += 3;
                        // a block that ends with "DOI"/"doi" carries no value to extract
                        if (pos < curString.length()) {
                            char delimiter = curString.charAt(pos);
                            if ((delimiter == ':') || (delimiter == ' ')) {
                                pos++;
                            }
                            int nextSpace = curString.indexOf(' ', pos);
                            if (nextSpace > 0) {
                                DOI = curString.substring(pos, nextSpace);
                            } else {
                                DOI = curString.substring(pos);
                            }
                        }
                    }
                }

                if ((publisher == null) && (curString.contains("IEEE"))) {
                    // IEEE has the conference things at the end
                    publisher = "IEEE";

                    // year is extracted by extractYear

                    if (conference == null) {
                        pos = curString.indexOf('$');
                        if (pos > 0) {
                            // we found the price
                            // before the price, the ISSN is stated
                            // skip that
                            pos -= 2;
                            while ((pos >= 0) && (curString.charAt(pos) != ' ')) {
                                pos--;
                            }
                            if (pos > 0) {
                                conference = curString.substring(0, pos);
                            }
                        }
                    }
                }
            }
        }

        BibtexEntry entry = new BibtexEntry();
        entry.setType(type);

        if (author != null) {
            entry.setField("author", author);
        }
        if (editor != null) {
            entry.setField("editor", editor);
        }
        if (institution != null) {
            entry.setField("institution", institution);
        }
        if (abstractT != null) {
            entry.setField("abstract", abstractT);
        }
        if (keywords != null) {
            entry.setField("keywords", keywords);
        }
        if (title != null) {
            entry.setField("title", title);
        }
        if (conference != null) {
            entry.setField("booktitle", conference);
        }
        if (DOI != null) {
            entry.setField("doi", DOI);
        }
        if (series != null) {
            entry.setField("series", series);
        }
        if (volume != null) {
            entry.setField("volume", volume);
        }
        if (number != null) {
            entry.setField("number", number);
        }
        if (pages != null) {
            entry.setField("pages", pages);
        }
        if (year != null) {
            entry.setField("year", year);
        }
        if (publisher != null) {
            entry.setField("publisher", publisher);
        }

        // keep the raw page text so the user can check the heuristics
        entry.setField("review", textResult);

        res.add(entry);
    } catch (NoClassDefFoundError e) {
        // constant-first comparison: the error message may be null
        if ("org/bouncycastle/jce/provider/BouncyCastleProvider".equals(e.getMessage())) {
            status.showMessage(Globals.lang(
                    "Java Bouncy Castle library not found. Please download and install it. For more information see http://www.bouncycastle.org/."));
        } else {
            PdfContentImporter.logger.log(Level.SEVERE, e.getLocalizedMessage(), e);
        }
    } finally {
        document.close();
    }

    return res;
}

From source file:net.sf.jabref.util.XMPUtil.java

License:Open Source License

/**
 * Reads BibTeX entries from the metadata of a PDF file supplied as a stream.
 *
 * The sources are consulted in this order, and a later source is only used
 * while no entry has been collected yet: BibTeX schemas in the XMP
 * metadata, Dublin Core schemas in the XMP metadata, and finally the PDF
 * document information.
 *
 * @param inputStream
 *            The inputstream containing the PDF file to read from.
 * @return The entries found, or null if none of the sources yielded one.
 * @throws IOException
 *             Throws an IOException if the file cannot be read, so the user
 *             can remove a lock or cancel the operation. Encrypted
 *             documents are rejected with an EncryptionNotSupportedException.
 */
@SuppressWarnings("unchecked")
public static List<BibtexEntry> readXMP(InputStream inputStream) throws IOException {

    List<BibtexEntry> entries = new LinkedList<BibtexEntry>();

    // loaded before the try: if loading fails there is nothing to close
    PDDocument pdf = PDDocument.load(inputStream);
    try {
        if (pdf.isEncrypted()) {
            throw new EncryptionNotSupportedException("Error: Cannot read metadata from encrypted document.");
        }

        XMPMetadata xmp = XMPUtil.getXMPMetadata(pdf);
        if (xmp != null) {
            // first choice: dedicated BibTeX schemas
            List<XMPSchema> bibtexSchemas = xmp.getSchemasByNamespaceURI(XMPSchemaBibtex.NAMESPACE);
            for (XMPSchema candidate : bibtexSchemas) {
                entries.add(((XMPSchemaBibtex) candidate).getBibtexEntry());
            }

            // second choice: Dublin Core schemas
            if (entries.isEmpty()) {
                List<XMPSchema> dublinCoreSchemas = xmp.getSchemasByNamespaceURI(XMPSchemaDublinCore.NAMESPACE);
                for (XMPSchema candidate : dublinCoreSchemas) {
                    BibtexEntry fromDublinCore = XMPUtil
                            .getBibtexEntryFromDublinCore((XMPSchemaDublinCore) candidate);
                    if (fromDublinCore != null) {
                        entries.add(fromDublinCore);
                    }
                }
            }
        }

        // last resort: the non-XMP document information
        if (entries.isEmpty()) {
            BibtexEntry fromDocumentInfo = XMPUtil
                    .getBibtexEntryFromDocumentInformation(pdf.getDocumentInformation());
            if (fromDocumentInfo != null) {
                entries.add(fromDocumentInfo);
            }
        }
    } finally {
        pdf.close();
    }

    // callers receive null, not an empty list, when no metadata was found
    return entries.isEmpty() ? null : entries;
}

From source file:net.sf.jabref.util.XMPUtil.java

License:Open Source License

/**
 * Will read the XMPMetadata from the given pdf file, closing the file
 * afterwards./* w  w w .  j  av a  2s.c  o  m*/
 * 
 * @param inputStream
 *            The inputStream representing a PDF-file to read the
 *            XMPMetadata from.
 * @return The XMPMetadata object found in the file or null if none is
 *         found.
 * @throws IOException
 */
private static XMPMetadata readRawXMP(InputStream inputStream) throws IOException {
    PDDocument document = null;

    try {
        document = PDDocument.load(inputStream);
        if (document.isEncrypted()) {
            throw new EncryptionNotSupportedException("Error: Cannot read metadata from encrypted document.");
        }

        return XMPUtil.getXMPMetadata(document);

    } finally {
        if (document != null) {
            document.close();
        }
    }
}

From source file:net.sf.jabref.util.XMPUtil.java

License:Open Source License

/**
 * Try to write the given BibTexEntry in the XMP-stream of the given
 * PDF-file.
 *
 * Throws an IOException if the file cannot be read or written, so the user
 * can remove a lock or cancel the operation.
 *
 * The method will overwrite existing BibTeX-XMP-data, but keep other
 * existing metadata.
 *
 * @param file
 *            The file to write the entries to.
 * @param bibtexEntries
 *            The entries to write to the file.
 * @param database
 *            An optional database (may be null) which the given bibtex
 *            entries belong to, which will be used to resolve strings. If
 *            the database is null the strings will not be resolved.
 * @param writePDFInfo
 *            Write information also in PDF document properties. This is
 *            only done when exactly one entry is given.
 * @throws TransformerException
 *             If the entry was malformed or unsupported, or if saving the
 *             document failed; in the latter case the underlying
 *             COSVisitorException is attached as the cause.
 * @throws IOException
 *             If the file could not be written to or could not be found.
 *             Encrypted documents are rejected with an
 *             EncryptionNotSupportedException.
 */
@SuppressWarnings("unchecked")
public static void writeXMP(File file, Collection<BibtexEntry> bibtexEntries, BibtexDatabase database,
        boolean writePDFInfo) throws IOException, TransformerException {

    if (database != null) {
        bibtexEntries = database.resolveForStrings(bibtexEntries, false);
    }

    PDDocument document = null;

    try {
        document = PDDocument.load(file.getAbsoluteFile());
        if (document.isEncrypted()) {
            throw new EncryptionNotSupportedException("Error: Cannot add metadata to encrypted document.");
        }

        // document properties and Dublin Core can only describe a single entry
        if (writePDFInfo && (bibtexEntries.size() == 1)) {
            XMPUtil.writeDocumentInformation(document, bibtexEntries.iterator().next(), null);
            XMPUtil.writeDublinCore(document, bibtexEntries, null);
        }

        PDDocumentCatalog catalog = document.getDocumentCatalog();
        PDMetadata metaRaw = catalog.getMetadata();

        // start from the existing XMP packet, if any, so that foreign metadata is kept
        XMPMetadata meta;
        if (metaRaw != null) {
            InputStream existingXmp = metaRaw.createInputStream();
            try {
                meta = new XMPMetadata(XMLUtil.parse(existingXmp));
            } finally {
                // release the metadata stream once it has been parsed
                existingXmp.close();
            }
        } else {
            meta = new XMPMetadata();
        }
        meta.addXMLNSMapping(XMPSchemaBibtex.NAMESPACE, XMPSchemaBibtex.class);

        // Remove all current Bibtex-schemas
        List<XMPSchema> schemas = meta.getSchemasByNamespaceURI(XMPSchemaBibtex.NAMESPACE);
        for (XMPSchema schema : schemas) {
            XMPSchemaBibtex bib = (XMPSchemaBibtex) schema;
            bib.getElement().getParentNode().removeChild(bib.getElement());
        }

        // one BibTeX schema per entry
        for (BibtexEntry e : bibtexEntries) {
            XMPSchemaBibtex bibtex = new XMPSchemaBibtex(meta);
            meta.addSchema(bibtex);
            bibtex.setBibtexEntry(e, null);
        }

        // Save to stream and then input that stream to the PDF
        ByteArrayOutputStream os = new ByteArrayOutputStream();
        meta.save(os);
        ByteArrayInputStream is = new ByteArrayInputStream(os.toByteArray());
        PDMetadata metadataStream = new PDMetadata(document, is, false);
        catalog.setMetadata(metadataStream);

        // Save
        try {
            document.save(file.getAbsolutePath());
        } catch (COSVisitorException e) {
            // keep the original exception as cause so the failure stays diagnosable
            throw new TransformerException("Could not write XMP-metadata: " + e.getLocalizedMessage(), e);
        }

    } finally {
        if (document != null) {
            document.close();
        }
    }
}

From source file:net.sf.jsignpdf.preview.Pdf2Image.java

License:Mozilla Public License

/**
 * Returns image (or null if failed) generated from given page in PDF using
 * PDFBox tool./*from  ww w  .j a  va  2s  . c  o m*/
 * 
 * @param aPage
 *            page in PDF (1 based)
 * @return image or null
 */
public BufferedImage getImageUsingPdfBox(final int aPage) {
    BufferedImage tmpResult = null;
    PDDocument tmpDoc = null;

    try {
        tmpDoc = PDDocument.load(options.getInFile());
        if (tmpDoc.isEncrypted()) {
            tmpDoc.decrypt(options.getPdfOwnerPwdStrX());
        }
        int resolution;
        try {
            resolution = Toolkit.getDefaultToolkit().getScreenResolution();
        } catch (HeadlessException e) {
            resolution = 96;
        }

        final PDPage page = (PDPage) tmpDoc.getDocumentCatalog().getAllPages().get(aPage - 1);
        tmpResult = page.convertToImage(BufferedImage.TYPE_INT_RGB, resolution);
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        if (tmpDoc != null) {
            try {
                tmpDoc.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
    return tmpResult;
}

From source file:net.sf.mmm.content.parser.impl.pdf.ContentParserPdf.java

License:Apache License

/**
 * {@inheritDoc}/*from  www .ja va2s  .  c  o m*/
 */
@Override
public void parse(InputStream inputStream, long filesize, ContentParserOptions options,
        MutableGenericContext context) throws Exception {

    PDFParser parser = new PDFParser(inputStream);
    parser.parse();
    PDDocument pdfDoc = parser.getPDDocument();
    try {
        if (pdfDoc.isEncrypted()) {
            // pdfDoc.decrypt("password");
            return;
        }
        PDDocumentInformation info = pdfDoc.getDocumentInformation();
        String title = info.getTitle();
        if (title != null) {
            context.setVariable(VARIABLE_NAME_TITLE, title);
        }
        String keywords = info.getKeywords();
        if (keywords != null) {
            context.setVariable(VARIABLE_NAME_KEYWORDS, keywords);
        }
        String author = info.getAuthor();
        if (author != null) {
            context.setVariable(VARIABLE_NAME_CREATOR, author);
        }

        if (filesize < options.getMaximumBufferSize()) {
            PDFTextStripper stripper = new PDFTextStripper();
            context.setVariable(VARIABLE_NAME_TEXT, stripper.getText(pdfDoc));
        }
    } finally {
        pdfDoc.close();
    }
}

From source file:net.sourceforge.docfetcher.parse.PDFParser.java

License:Open Source License

/**
 * Extracts the text of the given PDF file.
 *
 * Encrypted files are opened with an empty password.
 *
 * @param file
 *            the PDF file to read
 * @return the extracted text
 * @throws ParseException
 *             if opening an encrypted file with the empty password fails,
 *             or if the file cannot be read
 */
public String renderText(File file) throws ParseException {
    PDDocument document = null;
    try {
        document = PDDocument.load(file);

        final boolean needsDecryption = document.isEncrypted();
        if (needsDecryption) {
            try {
                // try the empty password
                document.openProtection(new StandardDecryptionMaterial(""));
            } catch (Exception e) {
                throw new ParseException(file, Msg.no_extraction_permission.value());
            }
        }

        PDFTextStripper textStripper = new PDFTextStripper();
        StringWriter extracted = new StringWriter();
        textStripper.writeText(document, extracted);
        String text = extracted.toString();
        return text;
    } catch (IOException e) {
        throw new ParseException(file, Msg.file_not_readable.value());
    } finally {
        // a failure while closing is only printed; it does not affect the result
        if (document != null) {
            try {
                document.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}

From source file:net.sourceforge.docfetcher.parse.PDFParser.java

License:Open Source License

/**
 * Parses the given PDF file into a Document made of its text and metadata.
 *
 * Encrypted files are opened with an empty password. The non-null
 * metadata fields (title, author, subject, keywords) are appended to the
 * extracted text, each preceded by a single space.
 *
 * @param file
 *            the PDF file to parse
 * @return a Document built from the file, its title, the collected text,
 *         and its author
 * @throws ParseException
 *             if opening an encrypted file with the empty password fails,
 *             or if the file cannot be read
 */
public Document parse(File file) throws ParseException {
    PDDocument document = null;
    try {
        document = PDDocument.load(file);

        // encrypted files are tried with the empty password
        final boolean needsDecryption = document.isEncrypted();
        if (needsDecryption) {
            try {
                document.openProtection(new StandardDecryptionMaterial(""));
            } catch (Exception e) {
                throw new ParseException(file, Msg.no_extraction_permission.value());
            }
        }

        // page contents
        PDFTextStripper textStripper = new PDFTextStripper();
        StringWriter contents = new StringWriter();
        textStripper.writeText(document, contents);
        DocFetcher.getInstance().setExceptionHandlerEnabled(true);

        // metadata, read in the order title, author, subject, keywords
        PDDocumentInformation info = document.getDocumentInformation();
        String title = info.getTitle();
        String author = info.getAuthor();
        String subject = info.getSubject();
        String keywords = info.getKeywords();
        String[] metaFields = { title, author, subject, keywords };
        for (int k = 0; k < metaFields.length; k++) {
            if (metaFields[k] != null) {
                contents.append(" ").append(metaFields[k]); //$NON-NLS-1$
            }
        }

        return new Document(file, title, contents.getBuffer()).addAuthor(author);
    } catch (IOException e) {
        throw new ParseException(file, Msg.file_not_readable.value());
    } finally {
        // a failure while closing is only printed; it does not affect the result
        if (document != null) {
            try {
                document.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}

From source file:net.yacy.cider.parser.idiom.pdfIdiom.java

License:Open Source License

/**
 * Parses a PDF from the given source into an RDF model.
 *
 * The model contains a single resource (named after the source URI when
 * the source has one) that receives the author, subject, title and
 * keywords found in the document information as well as the extracted
 * text, each only when non-empty. Encrypted documents are opened with an
 * empty password and must allow content extraction.
 *
 * The PDF document is closed on every path out of this method, including
 * failed decryption, missing extraction permission and failed text
 * extraction.
 *
 * @param source
 *            provides the PDF stream and, optionally, its URI
 * @return the model describing the document
 * @throws ParserException
 *             if the PDF cannot be parsed, cannot be decrypted, does not
 *             permit content extraction, or its text cannot be read
 */
@Override
public Model parse(DataSource source) throws ParserException {
    // create an empty Model
    Model model = ModelFactory.createDefaultModel();
    Resource resource = source.hasURI() ? model.createResource(source.getURI().toNormalform(true, true))
            : model.createResource();

    // open pdf document
    final PDDocument theDocument;
    final PDFParser parser;
    try {
        parser = new PDFParser(source.getStream());
        parser.parse();
        theDocument = parser.getPDDocument();
    } catch (IOException e) {
        log.error(e.getMessage(), e);
        // keep the cause, as the other ParserExceptions in this method do
        throw new ParserException(e.getMessage(), source.getURI(), e);
    }

    final ByteArrayOutputStream baos = new ByteArrayOutputStream();
    // becomes true once the regular close below has been attempted, so that the
    // finally block closes the document only on the paths that would otherwise leak it
    boolean closeAttempted = false;
    try {
        if (theDocument.isEncrypted()) {
            try {
                theDocument.openProtection(new StandardDecryptionMaterial(""));
            } catch (BadSecurityHandlerException e) {
                throw new ParserException("PDF Encrypted (BadSecurityHandlerException): " + e.getMessage(),
                        source.getURI(), e);
            } catch (IOException e) {
                throw new ParserException("PDF Encrypted (IOException): " + e.getMessage(), source.getURI(), e);
            } catch (CryptographyException e) {
                throw new ParserException("PDF Encrypted (CryptographyException): " + e.getMessage(),
                        source.getURI(), e);
            }
            final AccessPermission perm = theDocument.getCurrentAccessPermission();
            if (perm == null || !perm.canExtractContent()) {
                throw new ParserException("PDF cannot be decrypted", source.getURI());
            }
        }

        // get metadata
        final PDDocumentInformation theDocInfo = theDocument.getDocumentInformation();
        String docTitle = null, docSubject = null, docAuthor = null, docKeywordStr = null;
        if (theDocInfo != null) {
            docTitle = theDocInfo.getTitle();
            docSubject = theDocInfo.getSubject();
            docAuthor = theDocInfo.getAuthor();
            docKeywordStr = theDocInfo.getKeywords();
        }

        if (docAuthor != null && docAuthor.length() > 0) {
            resource.addProperty(VCARD.FN, docAuthor);
            resource.addProperty(DC.creator, docAuthor);
        }
        if (docSubject != null && docSubject.length() > 0) {
            resource.addProperty(DC.subject, docSubject);
        }
        if (docTitle != null && docTitle.length() > 0) {
            resource.addProperty(DC.title, docTitle);
        }
        if (docKeywordStr != null && docKeywordStr.length() > 0) {
            // keywords are separated by spaces or commas
            String[] docKeywords = docKeywordStr.split(" |,");
            resource.addProperty(DC.coverage, concat(docKeywords));
        }

        // get the content
        Writer writer;
        try {
            writer = new OutputStreamWriter(baos, "UTF-8");
        } catch (UnsupportedEncodingException e1) {
            writer = new OutputStreamWriter(baos);
        }
        try {
            final PDFTextStripper stripper = new PDFTextStripper();
            stripper.writeText(theDocument, writer);
            closeAttempted = true;
            theDocument.close();
            writer.close();
        } catch (IOException e) {
            try {
                writer.close();
            } catch (final Exception ignored) {
                // best effort: the extraction failure reported below is the relevant error
            }
            throw new ParserException("PDF content reader", source.getURI(), e);
        }
    } finally {
        if (!closeAttempted) {
            try {
                theDocument.close();
            } catch (final IOException ignored) {
                // best effort: an exception is already propagating to the caller
            }
        }
    }

    String content;
    try {
        content = new String(baos.toByteArray(), "UTF-8");
    } catch (UnsupportedEncodingException e) {
        content = new String(baos.toByteArray());
    }
    if (content.length() > 0) {
        resource.addProperty(CIDER.data_content_text, content);
    }

    return model;
}

From source file:net.yacy.document.parser.pdfParser.java

License:Open Source License

/**
 * Parses a PDF stream into one or more index documents.
 *
 * <p>The PDF is loaded with PDFBox. An encrypted document is opened with an empty password and
 * rejected when that fails or when content extraction is not permitted. Title, subject, author,
 * publisher (producer, falling back to creator), keywords and modification date are read from
 * the document information; the text is extracted with a {@code PDFTextStripper}.</p>
 *
 * <p>When {@code individualPages} is set, one {@code Document} per page is created, each
 * addressed by a virtual URL that carries the page number as a query argument. Otherwise a
 * single {@code Document} is created: the first three pages are always extracted, the remaining
 * pages are extracted in a helper thread that is given at most three seconds.</p>
 *
 * @param location       URL of the PDF; used for the created documents, for failure reports and
 *                       as fallback title (file name)
 * @param mimeType       mime type passed through to the created documents
 * @param charset        not read by this method; created documents are always declared "UTF-8"
 * @param scraper        not read by this method
 * @param timezoneOffset not read by this method
 * @param source         stream the PDF is loaded from
 * @return the created documents, or {@code null} when link/text extraction failed or the
 *         calling thread was interrupted while waiting for the extraction
 * @throws Parser.Failure if the memory request is refused, the PDF cannot be loaded, or the
 *                        document is encrypted and cannot be opened for content extraction
 * @throws InterruptedException declared by the signature; an interruption while waiting for the
 *                        text extraction is reported by re-setting the thread's interrupt flag
 */
@Override
public Document[] parse(final AnchorURL location, final String mimeType, final String charset,
        final VocabularyScraper scraper, final int timezoneOffset, final InputStream source)
        throws Parser.Failure, InterruptedException {

    // check memory for parser
    if (!MemoryControl.request(200 * 1024 * 1024, false)) {
        throw new Parser.Failure("Not enough Memory available for pdf parser: " + MemoryControl.available(),
                location);
    }

    // load the pdf with lowered priority; afterwards restore the priority the caller had
    // (not unconditionally NORM_PRIORITY, which would clobber a caller-chosen priority)
    final Thread currentThread = Thread.currentThread();
    final int previousPriority = currentThread.getPriority();
    PDDocument pdfDoc;
    try {
        currentThread.setPriority(Thread.MIN_PRIORITY); // the pdfparser is a big pain
        pdfDoc = PDDocument.load(source);
    } catch (final IOException e) {
        throw new Parser.Failure(e.getMessage(), location);
    } finally {
        currentThread.setPriority(previousPriority);
    }

    if (pdfDoc.isEncrypted()) {
        // try to open the document with an empty password; collect the reason if that fails
        String failure = null;
        try {
            pdfDoc.openProtection(new StandardDecryptionMaterial(""));
        } catch (final BadSecurityHandlerException e) {
            failure = "Document is encrypted (1): " + e.getMessage();
        } catch (final IOException e) {
            failure = "Document is encrypted (2): " + e.getMessage();
        } catch (final CryptographyException e) {
            failure = "Document is encrypted (3): " + e.getMessage();
        }
        if (failure == null) {
            final AccessPermission perm = pdfDoc.getCurrentAccessPermission();
            if (perm == null || !perm.canExtractContent()) {
                failure = "Document is encrypted and cannot be decrypted";
            }
        }
        if (failure != null) {
            // single exit for all rejection cases: release the document, then report
            try {
                pdfDoc.close();
            } catch (final IOException ignored) {
                // the document is rejected anyway; a failing close adds no information
            }
            throw new Parser.Failure(failure, location);
        }
    }

    // extracting some metadata
    PDDocumentInformation info = pdfDoc.getDocumentInformation();
    String docTitle = null, docSubject = null, docAuthor = null, docPublisher = null, docKeywordStr = null;
    Date docDate = new Date();
    if (info != null) {
        docTitle = info.getTitle();
        docSubject = info.getSubject();
        docAuthor = info.getAuthor();
        docPublisher = info.getProducer();
        if (docPublisher == null || docPublisher.isEmpty()) {
            docPublisher = info.getCreator();
        }
        docKeywordStr = info.getKeywords();
        try {
            if (info.getModificationDate() != null) {
                docDate = info.getModificationDate().getTime();
            }
        } catch (final IOException ignored) {
            // unreadable modification date: keep the current time set above
        }
    }
    info = null;

    // title fallbacks: file name of the location first, then the subject
    if (docTitle == null || docTitle.isEmpty()) {
        docTitle = MultiProtocolURL.unescape(location.getFileName());
    }
    if (docTitle == null) {
        docTitle = docSubject;
    }
    String[] docKeywords = null;
    if (docKeywordStr != null) {
        docKeywords = docKeywordStr.split(" |,");
    }

    Collection<AnchorURL>[] pdflinks = null;
    Document[] result = null;
    try {
        // get the links
        pdflinks = extractPdfLinks(pdfDoc);

        // get the fulltext (either per document or for each page)
        final PDFTextStripper stripper = new PDFTextStripper("UTF-8");

        if (individualPages) {
            // this is a hack which stores individual pages of the source pdf into individual index documents
            // the new documents will get a virtual link with a post argument page=X appended to the original url

            // collect text
            final int pagecount = pdfDoc.getNumberOfPages();
            final String[] pages = new String[pagecount];
            for (int page = 1; page <= pagecount; page++) {
                stripper.setStartPage(page);
                stripper.setEndPage(page);
                pages[page - 1] = stripper.getText(pdfDoc);
            }

            // create individual documents for each page
            assert pages.length == pdflinks.length : "pages.length = " + pages.length + ", pdflinks.length = "
                    + pdflinks.length;
            result = new Document[Math.min(pages.length, pdflinks.length)];
            final String loc = location.toNormalform(true);
            for (int page = 0; page < result.length; page++) {
                result[page] = new Document(
                        new AnchorURL(loc + (loc.indexOf('?') > 0 ? '&' : '?') + individualPagePropertyname
                                + '=' + (page + 1)), // these are virtual new pages; we cannot combine them with '#' as that would be removed when computing the urlhash
                        mimeType, "UTF-8", this, null, docKeywords, singleList(docTitle), docAuthor,
                        docPublisher, null, null, 0.0f, 0.0f,
                        // index guard uses >= so that page == pages.length can never be dereferenced
                        pages == null || page >= pages.length ? new byte[0] : UTF8.getBytes(pages[page]),
                        pdflinks == null || page >= pdflinks.length ? null : pdflinks[page], null, null, false,
                        docDate);
            }
        } else {
            // collect the whole text at once
            final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
            byte[] contentBytes = new byte[0];
            stripper.setEndPage(3); // get first 3 pages (always)
            writer.append(stripper.getText(pdfDoc));
            contentBytes = writer.getBytes(); // remember text in case of interrupting thread

            if (pdfDoc.getNumberOfPages() > 3) { // spare creating/starting thread if all pages read
                stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text)
                stripper.setEndPage(Integer.MAX_VALUE); // set to default
                // we start the pdf parsing in a separate thread to ensure that it can be terminated
                final PDDocument pdfDocC = pdfDoc;
                final Thread t = new Thread() {
                    @Override
                    public void run() {
                        Thread.currentThread().setName("pdfParser.getText:" + location);
                        try {
                            writer.append(stripper.getText(pdfDocC));
                        } catch (final Throwable e) {
                            // best effort: the text of the first pages is already in the writer
                        }
                    }
                };
                t.start();
                try {
                    t.join(3000); // pdfbox likes to forget to terminate ... (quite often)
                } finally {
                    // also reached when join() itself is interrupted, so the helper is never left running unsignalled
                    if (t.isAlive()) {
                        t.interrupt();
                    }
                }
            }
            contentBytes = writer.getBytes(); // get final text before closing writer

            final Collection<AnchorURL> pdflinksCombined = new HashSet<AnchorURL>();
            for (final Collection<AnchorURL> pdflinksx : pdflinks) {
                if (pdflinksx != null) {
                    pdflinksCombined.addAll(pdflinksx);
                }
            }
            result = new Document[] { new Document(location, mimeType, "UTF-8", this, null, docKeywords,
                    singleList(docTitle), docAuthor, docPublisher, null, null, 0.0f, 0.0f, contentBytes,
                    pdflinksCombined, null, null, false, docDate) };
        }
    } catch (final InterruptedException e) {
        // join() cleared the interrupt status when it threw; set it again so the caller can
        // still observe the interruption. The result stays as it is (null), as before.
        currentThread.interrupt();
    } catch (final Throwable e) {
        // deliberate best effort: any extraction problem leaves the result as it is (null)
    } finally {
        try {
            pdfDoc.close();
        } catch (final Throwable e) {
            // nothing sensible can be done when closing fails
        }
    }

    // clear resources in pdfbox. they say that is resolved but it's not. see:
    // https://issues.apache.org/jira/browse/PDFBOX-313
    // https://issues.apache.org/jira/browse/PDFBOX-351
    // https://issues.apache.org/jira/browse/PDFBOX-441
    // pdfbox still generates an enormous number of object allocations and does not delete them.
    // the following objects are statically stored and never flushed:
    // COSFloat, COSArray, COSInteger, COSObjectKey, COSObject, COSDictionary,
    // COSStream, COSString, COSName, COSDocument, COSInteger[], COSNull
    // the great number of these objects can easily be seen in Java Visual VM
    // we try to get them out of memory here by forced clear calls and hope for the best.
    pdfDoc = null;
    clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes();

    return result;
}