List of usage examples for org.apache.pdfbox.pdmodel.PDDocument#isEncrypted()
public boolean isEncrypted()
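All of the examples below use the PDFBox 1.x API, in which an encrypted document must be opened explicitly before its content can be read. The following minimal sketch distills the pattern the examples share: load, check isEncrypted(), try the empty owner password, and verify the extraction permission. The file path is a placeholder, and the empty-password fallback only works for documents that have no user password.

import java.io.File;
import java.io.IOException;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
import org.apache.pdfbox.util.PDFTextStripper;

public class IsEncryptedExample {

    public static void main(String[] args) throws Exception {
        // "sample.pdf" is an illustrative path, not taken from the examples below
        PDDocument document = PDDocument.load(new File("sample.pdf"));
        try {
            if (document.isEncrypted()) {
                // Most examples below try the empty owner password first;
                // this only succeeds for documents without a user password.
                document.openProtection(new StandardDecryptionMaterial(""));
                AccessPermission perm = document.getCurrentAccessPermission();
                if (perm == null || !perm.canExtractContent()) {
                    throw new IOException("Document is encrypted and does not permit text extraction");
                }
            }
            // After the check (and optional decryption), the document can be used normally
            System.out.println(new PDFTextStripper().getText(document));
        } finally {
            document.close();
        }
    }
}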
From source file:net.sf.jabref.imports.PdfContentImporter.java
License:Open Source License
@Override
public List<BibtexEntry> importEntries(InputStream in, OutputPrinter status) throws IOException {
    final ArrayList<BibtexEntry> res = new ArrayList<BibtexEntry>(1);

    PDDocument document;
    try {
        document = PDDocument.load(in);
    } catch (IOException e) {
        PdfContentImporter.logger.log(Level.SEVERE, "Could not load document", e);
        return res;
    }

    try {
        if (document.isEncrypted()) {
            PdfContentImporter.logger.log(Level.INFO, Globals.lang("Encrypted documents are not supported"));
            //return res;
        }

        PDFTextStripper stripper = new PDFTextStripper();
        stripper.setStartPage(1);
        stripper.setEndPage(1);
        stripper.setSortByPosition(true);
        stripper.setParagraphEnd(System.getProperty("line.separator"));
        StringWriter writer = new StringWriter();
        stripper.writeText(document, writer);
        String textResult = writer.toString();

        String doi = DOIUtil.getDOI(textResult);
        if (doi.length() < textResult.length()) {
            // A DOI was found in the text
            // We do NO parsing of the text, but use the DOI fetcher
            ImportInspector i = new ImportInspector() {

                @Override
                public void toFront() {
                }

                @Override
                public void setProgress(int current, int max) {
                }

                @Override
                public void addEntry(BibtexEntry entry) {
                    // add the entry to the result object
                    res.add(entry);
                }
            };
            PdfContentImporter.doiToBibTeXFetcher.processQuery(doi, i, status);
            if (res.size() != 0) {
                // if something has been found, return the result
                return res;
            } else {
                // otherwise, we just parse the PDF
            }
        }

        String author;
        String editor = null;
        String institution = null;
        String abstractT = null;
        String keywords = null;
        String title;
        String conference = null;
        String DOI = null;
        String series = null;
        String volume = null;
        String number = null;
        String pages = null;
        // year is a class variable as the method extractYear() uses it
        String publisher = null;
        BibtexEntryType type = BibtexEntryType.INPROCEEDINGS;

        final String lineBreak = System.getProperty("line.separator");
        split = textResult.split(lineBreak);

        // idea: split[] contains the different lines
        // blocks are separated by empty lines
        // treat each block
        // or do special treatment at authors (which are not broken)
        // therefore, we do a line-based and not a block-based splitting
        // i points to the current line
        // curString (mostly) contains the current block
        // the different lines are joined into one and thereby separated by " "

        proceedToNextNonEmptyLine();
        if (i >= split.length) {
            // PDF could not be parsed or is empty
            // return empty list
            return res;
        }

        curString = split[i];
        i = i + 1;

        if (curString.length() > 4) {
            // special case: possibly conference as first line on the page
            extractYear();
            if (curString.contains("Conference")) {
                fillCurStringWithNonEmptyLines();
                conference = curString;
                curString = "";
            } else {
                // e.g. Copyright (c) 1998 by the Genetics Society of America
                // future work: get year using RegEx
                String lower = curString.toLowerCase();
                if (lower.contains("copyright")) {
                    fillCurStringWithNonEmptyLines();
                    publisher = curString;
                    curString = "";
                }
            }
        }

        // start: title
        fillCurStringWithNonEmptyLines();
        title = streamlineTitle(curString);
        curString = "";
        // i points to the next non-empty line

        // after title: authors
        author = null;
        while ((i < split.length) && (!split[i].equals(""))) {
            // author names are unlikely to be split among different lines
            // treat them line by line
            curString = streamlineNames(split[i]);
            if (author == null) {
                author = curString;
            } else {
                if (curString.equals("")) {
                    // if split[i] is "and" then "" is returned by streamlineNames -> do nothing
                } else {
                    author = author.concat(" and ").concat(curString);
                }
            }
            i++;
        }
        curString = "";
        i++;

        // then, abstract and keywords follow
        while (i < split.length) {
            curString = split[i];
            if ((curString.length() >= "Abstract".length())
                    && (curString.substring(0, "Abstract".length()).equalsIgnoreCase("Abstract"))) {
                if (curString.length() == "Abstract".length()) {
                    // only word "abstract" found -- skip line
                    curString = "";
                } else {
                    curString = curString.substring("Abstract".length() + 1).trim().concat(lineBreak);
                }
                i++;
                // fillCurStringWithNonEmptyLines() cannot be used as that uses " " as line separator
                // whereas we need linebreak as separator
                while ((i < split.length) && (!split[i].equals(""))) {
                    curString = curString.concat(split[i]).concat(lineBreak);
                    i++;
                }
                abstractT = curString;
                i++;
            } else if ((curString.length() >= "Keywords".length())
                    && (curString.substring(0, "Keywords".length()).equalsIgnoreCase("Keywords"))) {
                if (curString.length() == "Keywords".length()) {
                    // only word "Keywords" found -- skip line
                    curString = "";
                } else {
                    curString = curString.substring("Keywords".length() + 1).trim();
                }
                i++;
                fillCurStringWithNonEmptyLines();
                keywords = removeNonLettersAtEnd(curString);
            } else {
                String lower = curString.toLowerCase();

                int pos = lower.indexOf("technical");
                if (pos >= 0) {
                    type = BibtexEntryType.TECHREPORT;
                    pos = curString.trim().lastIndexOf(' ');
                    if (pos >= 0) {
                        // assumption: last character of curString is NOT ' '
                        // otherwise pos+1 leads to an out-of-bounds exception
                        number = curString.substring(pos + 1);
                    }
                }

                i++;
                proceedToNextNonEmptyLine();
            }
        }

        i = split.length - 1;

        // last block: DOI, detailed information
        // sometimes, this information is in the third last block etc...
        // therefore, read until the beginning of the file
        while (i >= 0) {
            readLastBlock();
            // i now points to the block before or is -1
            // curString contains the last block, separated by " "

            extractYear();

            int pos = curString.indexOf("(Eds.)");
            if ((pos >= 0) && (publisher == null)) {
                // looks like a Springer last line
                // e.g: A. Persson and J. Stirna (Eds.): PoEM 2009, LNBIP 39, pp. 161-175, 2009.
                publisher = "Springer";
                editor = streamlineNames(curString.substring(0, pos - 1));
                // +2 because of ":" after (Eds.) and the subsequent space
                curString = curString.substring(pos + "(Eds.)".length() + 2);
                String[] springerSplit = curString.split(", ");
                if (springerSplit.length >= 4) {
                    conference = springerSplit[0];

                    String seriesData = springerSplit[1];
                    int lastSpace = seriesData.lastIndexOf(' ');
                    series = seriesData.substring(0, lastSpace);
                    volume = seriesData.substring(lastSpace + 1);

                    pages = springerSplit[2].substring(4);

                    if (springerSplit[3].length() >= 4) {
                        year = springerSplit[3].substring(0, 4);
                    }
                }
            } else {
                if (DOI == null) {
                    pos = curString.indexOf("DOI");
                    if (pos < 0) {
                        pos = curString.indexOf("doi");
                    }
                    if (pos >= 0) {
                        pos += 3;
                        char delimiter = curString.charAt(pos);
                        if ((delimiter == ':') || (delimiter == ' ')) {
                            pos++;
                        }
                        int nextSpace = curString.indexOf(' ', pos);
                        if (nextSpace > 0) {
                            DOI = curString.substring(pos, nextSpace);
                        } else {
                            DOI = curString.substring(pos);
                        }
                    }
                }

                if ((publisher == null) && (curString.contains("IEEE"))) {
                    // IEEE has the conference things at the end
                    publisher = "IEEE";

                    // year is extracted by extractYear
                    // otherwise, we could determine it as follows:
                    // String yearStr = curString.substring(curString.length()-4);
                    // if (isYear(yearStr)) {
                    //     year = yearStr;
                    // }

                    if (conference == null) {
                        pos = curString.indexOf('$');
                        if (pos > 0) {
                            // we found the price
                            // before the price, the ISSN is stated
                            // skip that
                            pos -= 2;
                            while ((pos >= 0) && (curString.charAt(pos) != ' ')) {
                                pos--;
                            }
                            if (pos > 0) {
                                conference = curString.substring(0, pos);
                            }
                        }
                    }
                }

                // String lower = curString.toLowerCase();
                // if (institution == null) {
                //
                // }
            }
        }

        BibtexEntry entry = new BibtexEntry();
        entry.setType(type);

        if (author != null) {
            entry.setField("author", author);
        }
        if (editor != null) {
            entry.setField("editor", editor);
        }
        if (institution != null) {
            entry.setField("institution", institution);
        }
        if (abstractT != null) {
            entry.setField("abstract", abstractT);
        }
        if (keywords != null) {
            entry.setField("keywords", keywords);
        }
        if (title != null) {
            entry.setField("title", title);
        }
        if (conference != null) {
            entry.setField("booktitle", conference);
        }
        if (DOI != null) {
            entry.setField("doi", DOI);
        }
        if (series != null) {
            entry.setField("series", series);
        }
        if (volume != null) {
            entry.setField("volume", volume);
        }
        if (number != null) {
            entry.setField("number", number);
        }
        if (pages != null) {
            entry.setField("pages", pages);
        }
        if (year != null) {
            entry.setField("year", year);
        }
        if (publisher != null) {
            entry.setField("publisher", publisher);
        }

        entry.setField("review", textResult);

        res.add(entry);
    } catch (NoClassDefFoundError e) {
        if (e.getMessage().equals("org/bouncycastle/jce/provider/BouncyCastleProvider")) {
            status.showMessage(Globals.lang(
                    "Java Bouncy Castle library not found. Please download and install it. For more information see http://www.bouncycastle.org/."));
        } else {
            PdfContentImporter.logger.log(Level.SEVERE, e.getLocalizedMessage(), e);
        }
    } finally {
        document.close();
    }
    return res;
}
From source file:net.sf.jabref.util.XMPUtil.java
License:Open Source License
/**
 * Try to read the given BibtexEntry from the XMP-stream of the given
 * inputstream containing a PDF-file.
 *
 * @param inputStream
 *            The inputstream to read from.
 *
 * @throws IOException
 *             Throws an IOException if the file cannot be read, so the user
 *             can remove a lock or cancel the operation.
 */
@SuppressWarnings("unchecked")
public static List<BibtexEntry> readXMP(InputStream inputStream) throws IOException {
    List<BibtexEntry> result = new LinkedList<BibtexEntry>();

    PDDocument document = null;
    try {
        document = PDDocument.load(inputStream);
        if (document.isEncrypted()) {
            throw new EncryptionNotSupportedException("Error: Cannot read metadata from encrypted document.");
        }

        XMPMetadata meta = XMPUtil.getXMPMetadata(document);

        if (meta != null) {
            List<XMPSchema> schemas = meta.getSchemasByNamespaceURI(XMPSchemaBibtex.NAMESPACE);
            for (XMPSchema schema : schemas) {
                XMPSchemaBibtex bib = (XMPSchemaBibtex) schema;
                result.add(bib.getBibtexEntry());
            }

            // If we did not find anything, check whether a Dublin Core schema exists
            if (result.isEmpty()) {
                schemas = meta.getSchemasByNamespaceURI(XMPSchemaDublinCore.NAMESPACE);
                for (XMPSchema schema : schemas) {
                    XMPSchemaDublinCore dc = (XMPSchemaDublinCore) schema;
                    BibtexEntry entry = XMPUtil.getBibtexEntryFromDublinCore(dc);
                    if (entry != null) {
                        result.add(entry);
                    }
                }
            }
        }

        // If we did not find any XMP metadata, search for non-XMP metadata
        if (result.isEmpty()) {
            BibtexEntry entry = XMPUtil
                    .getBibtexEntryFromDocumentInformation(document.getDocumentInformation());
            if (entry != null) {
                result.add(entry);
            }
        }
    } finally {
        if (document != null) {
            document.close();
        }
    }

    // return null, if no metadata was found
    if (result.isEmpty()) {
        return null;
    }

    return result;
}
From source file:net.sf.jabref.util.XMPUtil.java
License:Open Source License
/**
 * Will read the XMPMetadata from the given pdf file, closing the file
 * afterwards.
 *
 * @param inputStream
 *            The inputStream representing a PDF-file to read the
 *            XMPMetadata from.
 * @return The XMPMetadata object found in the file or null if none is
 *         found.
 * @throws IOException
 */
private static XMPMetadata readRawXMP(InputStream inputStream) throws IOException {
    PDDocument document = null;
    try {
        document = PDDocument.load(inputStream);
        if (document.isEncrypted()) {
            throw new EncryptionNotSupportedException("Error: Cannot read metadata from encrypted document.");
        }
        return XMPUtil.getXMPMetadata(document);
    } finally {
        if (document != null) {
            document.close();
        }
    }
}
From source file:net.sf.jabref.util.XMPUtil.java
License:Open Source License
/**
 * Try to write the given BibTexEntry in the XMP-stream of the given
 * PDF-file.
 *
 * Throws an IOException if the file cannot be read or written, so the user
 * can remove a lock or cancel the operation.
 *
 * The method will overwrite existing BibTeX-XMP-data, but keep other
 * existing metadata.
 *
 * @param file
 *            The file to write the entries to.
 * @param bibtexEntries
 *            The entries to write to the file.
 * @param database
 *            maybenull An optional database which the given bibtex entries
 *            belong to, which will be used to resolve strings. If the
 *            database is null the strings will not be resolved.
 * @param writePDFInfo
 *            Write information also in PDF document properties
 * @throws TransformerException
 *             If the entry was malformed or unsupported.
 * @throws IOException
 *             If the file could not be written to or could not be found.
 */
@SuppressWarnings("unchecked")
public static void writeXMP(File file, Collection<BibtexEntry> bibtexEntries, BibtexDatabase database,
        boolean writePDFInfo) throws IOException, TransformerException {
    if (database != null) {
        bibtexEntries = database.resolveForStrings(bibtexEntries, false);
    }

    PDDocument document = null;
    try {
        document = PDDocument.load(file.getAbsoluteFile());
        if (document.isEncrypted()) {
            throw new EncryptionNotSupportedException("Error: Cannot add metadata to encrypted document.");
        }

        if (writePDFInfo && (bibtexEntries.size() == 1)) {
            XMPUtil.writeDocumentInformation(document, bibtexEntries.iterator().next(), null);
            XMPUtil.writeDublinCore(document, bibtexEntries, null);
        }

        PDDocumentCatalog catalog = document.getDocumentCatalog();
        PDMetadata metaRaw = catalog.getMetadata();

        XMPMetadata meta;
        if (metaRaw != null) {
            meta = new XMPMetadata(XMLUtil.parse(metaRaw.createInputStream()));
        } else {
            meta = new XMPMetadata();
        }
        meta.addXMLNSMapping(XMPSchemaBibtex.NAMESPACE, XMPSchemaBibtex.class);

        // Remove all current Bibtex-schemas
        List<XMPSchema> schemas = meta.getSchemasByNamespaceURI(XMPSchemaBibtex.NAMESPACE);
        for (XMPSchema schema : schemas) {
            XMPSchemaBibtex bib = (XMPSchemaBibtex) schema;
            bib.getElement().getParentNode().removeChild(bib.getElement());
        }

        for (BibtexEntry e : bibtexEntries) {
            XMPSchemaBibtex bibtex = new XMPSchemaBibtex(meta);
            meta.addSchema(bibtex);
            bibtex.setBibtexEntry(e, null);
        }

        // Save to stream and then input that stream to the PDF
        ByteArrayOutputStream os = new ByteArrayOutputStream();
        meta.save(os);
        ByteArrayInputStream is = new ByteArrayInputStream(os.toByteArray());
        PDMetadata metadataStream = new PDMetadata(document, is, false);
        catalog.setMetadata(metadataStream);

        // Save
        try {
            document.save(file.getAbsolutePath());
        } catch (COSVisitorException e) {
            throw new TransformerException("Could not write XMP-metadata: " + e.getLocalizedMessage());
        }
    } finally {
        if (document != null) {
            document.close();
        }
    }
}
From source file:net.sf.jsignpdf.preview.Pdf2Image.java
License:Mozilla Public License
/**
 * Returns image (or null if failed) generated from given page in PDF using
 * PDFBox tool.
 *
 * @param aPage
 *            page in PDF (1 based)
 * @return image or null
 */
public BufferedImage getImageUsingPdfBox(final int aPage) {
    BufferedImage tmpResult = null;
    PDDocument tmpDoc = null;
    try {
        tmpDoc = PDDocument.load(options.getInFile());
        if (tmpDoc.isEncrypted()) {
            tmpDoc.decrypt(options.getPdfOwnerPwdStrX());
        }
        int resolution;
        try {
            resolution = Toolkit.getDefaultToolkit().getScreenResolution();
        } catch (HeadlessException e) {
            resolution = 96;
        }
        final PDPage page = (PDPage) tmpDoc.getDocumentCatalog().getAllPages().get(aPage - 1);
        tmpResult = page.convertToImage(BufferedImage.TYPE_INT_RGB, resolution);
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        if (tmpDoc != null) {
            try {
                tmpDoc.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
    return tmpResult;
}
From source file:net.sf.mmm.content.parser.impl.pdf.ContentParserPdf.java
License:Apache License
/**
 * {@inheritDoc}
 */
@Override
public void parse(InputStream inputStream, long filesize, ContentParserOptions options,
        MutableGenericContext context) throws Exception {
    PDFParser parser = new PDFParser(inputStream);
    parser.parse();
    PDDocument pdfDoc = parser.getPDDocument();
    try {
        if (pdfDoc.isEncrypted()) {
            // pdfDoc.decrypt("password");
            return;
        }
        PDDocumentInformation info = pdfDoc.getDocumentInformation();
        String title = info.getTitle();
        if (title != null) {
            context.setVariable(VARIABLE_NAME_TITLE, title);
        }
        String keywords = info.getKeywords();
        if (keywords != null) {
            context.setVariable(VARIABLE_NAME_KEYWORDS, keywords);
        }
        String author = info.getAuthor();
        if (author != null) {
            context.setVariable(VARIABLE_NAME_CREATOR, author);
        }
        if (filesize < options.getMaximumBufferSize()) {
            PDFTextStripper stripper = new PDFTextStripper();
            context.setVariable(VARIABLE_NAME_TEXT, stripper.getText(pdfDoc));
        }
    } finally {
        pdfDoc.close();
    }
}
From source file:net.sourceforge.docfetcher.parse.PDFParser.java
License:Open Source License
public String renderText(File file) throws ParseException {
    PDDocument pdfDoc = null;
    try {
        pdfDoc = PDDocument.load(file);
        if (pdfDoc.isEncrypted()) {
            try {
                pdfDoc.openProtection(new StandardDecryptionMaterial(""));
            } catch (Exception e) {
                throw new ParseException(file, Msg.no_extraction_permission.value());
            }
        }
        PDFTextStripper stripper = new PDFTextStripper();
        StringWriter writer = new StringWriter();
        stripper.writeText(pdfDoc, writer);
        return writer.toString();
    } catch (IOException e) {
        throw new ParseException(file, Msg.file_not_readable.value());
    } finally {
        if (pdfDoc != null) {
            try {
                pdfDoc.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
From source file:net.sourceforge.docfetcher.parse.PDFParser.java
License:Open Source License
public Document parse(File file) throws ParseException {
    PDDocument pdfDoc = null;
    try {
        // Check if PDF file is encrypted
        pdfDoc = PDDocument.load(file);
        if (pdfDoc.isEncrypted()) {
            try {
                pdfDoc.openProtection(new StandardDecryptionMaterial(""));
            } catch (Exception e) {
                throw new ParseException(file, Msg.no_extraction_permission.value());
            }
        }

        // Get tags and contents
        PDFTextStripper stripper = new PDFTextStripper();
        StringWriter writer = new StringWriter();
        stripper.writeText(pdfDoc, writer);
        DocFetcher.getInstance().setExceptionHandlerEnabled(true);
        PDDocumentInformation pdInfo = pdfDoc.getDocumentInformation();
        String[] metaData = new String[] { pdInfo.getTitle(), pdInfo.getAuthor(), pdInfo.getSubject(),
                pdInfo.getKeywords(), };
        for (String field : metaData)
            if (field != null)
                writer.append(" ").append(field); //$NON-NLS-1$
        return new Document(file, metaData[0], writer.getBuffer()).addAuthor(metaData[1]);
    } catch (IOException e) {
        throw new ParseException(file, Msg.file_not_readable.value());
    } finally {
        if (pdfDoc != null) {
            try {
                pdfDoc.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
From source file:net.yacy.cider.parser.idiom.pdfIdiom.java
License:Open Source License
@Override
public Model parse(DataSource source) throws ParserException {
    // create an empty Model
    Model model = ModelFactory.createDefaultModel();
    Resource resource = source.hasURI() ? model.createResource(source.getURI().toNormalform(true, true))
            : model.createResource();

    // open pdf document
    final PDDocument theDocument;
    final PDFParser parser;
    try {
        parser = new PDFParser(source.getStream());
        parser.parse();
        theDocument = parser.getPDDocument();
    } catch (IOException e) {
        log.error(e.getMessage(), e);
        throw new ParserException(e.getMessage(), source.getURI());
    }
    if (theDocument.isEncrypted()) {
        try {
            theDocument.openProtection(new StandardDecryptionMaterial(""));
        } catch (BadSecurityHandlerException e) {
            throw new ParserException("PDF Encrypted (BadSecurityHandlerException): " + e.getMessage(),
                    source.getURI(), e);
        } catch (IOException e) {
            throw new ParserException("PDF Encrypted (IOException): " + e.getMessage(), source.getURI(), e);
        } catch (CryptographyException e) {
            throw new ParserException("PDF Encrypted (CryptographyException): " + e.getMessage(),
                    source.getURI(), e);
        }
        final AccessPermission perm = theDocument.getCurrentAccessPermission();
        if (perm == null || !perm.canExtractContent())
            throw new ParserException("PDF cannot be decrypted", source.getURI());
    }

    // get metadata
    final PDDocumentInformation theDocInfo = theDocument.getDocumentInformation();
    String docTitle = null, docSubject = null, docAuthor = null, docKeywordStr = null;
    if (theDocInfo != null) {
        docTitle = theDocInfo.getTitle();
        docSubject = theDocInfo.getSubject();
        docAuthor = theDocInfo.getAuthor();
        docKeywordStr = theDocInfo.getKeywords();
    }

    if (docAuthor != null && docAuthor.length() > 0) {
        resource.addProperty(VCARD.FN, docAuthor);
        resource.addProperty(DC.creator, docAuthor);
    }
    if (docSubject != null && docSubject.length() > 0) {
        resource.addProperty(DC.subject, docSubject);
    }
    if (docTitle != null && docTitle.length() > 0) {
        resource.addProperty(DC.title, docTitle);
    }
    String[] docKeywords = null;
    if (docKeywordStr != null && docKeywordStr.length() > 0) {
        docKeywords = docKeywordStr.split(" |,");
        resource.addProperty(DC.coverage, concat(docKeywords));
    }

    // get the content
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    Writer writer;
    try {
        writer = new OutputStreamWriter(baos, "UTF-8");
    } catch (UnsupportedEncodingException e1) {
        writer = new OutputStreamWriter(baos);
    }
    try {
        final PDFTextStripper stripper = new PDFTextStripper();
        stripper.writeText(theDocument, writer);
        theDocument.close();
        writer.close();
    } catch (IOException e) {
        if (writer != null)
            try {
                writer.close();
            } catch (final Exception ex) {
            }
        throw new ParserException("PDF content reader", source.getURI(), e);
    }
    String content;
    try {
        content = new String(baos.toByteArray(), "UTF-8");
    } catch (UnsupportedEncodingException e) {
        content = new String(baos.toByteArray());
    }
    if (content != null && content.length() > 0) {
        resource.addProperty(CIDER.data_content_text, content);
    }
    return model;
}
From source file:net.yacy.document.parser.pdfParser.java
License:Open Source License
@Override
public Document[] parse(final AnchorURL location, final String mimeType, final String charset,
        final VocabularyScraper scraper, final int timezoneOffset, final InputStream source)
        throws Parser.Failure, InterruptedException {

    // check memory for parser
    if (!MemoryControl.request(200 * 1024 * 1024, false))
        throw new Parser.Failure("Not enough Memory available for pdf parser: " + MemoryControl.available(),
                location);

    // create a pdf parser
    PDDocument pdfDoc;
    //final PDFParser pdfParser;
    try {
        Thread.currentThread().setPriority(Thread.MIN_PRIORITY); // the pdfparser is a big pain
        pdfDoc = PDDocument.load(source);
        //PDFParser pdfParser = new PDFParser(source);
        //pdfParser.parse();
        //pdfDoc = pdfParser.getPDDocument();
    } catch (final IOException e) {
        throw new Parser.Failure(e.getMessage(), location);
    } finally {
        Thread.currentThread().setPriority(Thread.NORM_PRIORITY);
    }

    if (pdfDoc.isEncrypted()) {
        try {
            pdfDoc.openProtection(new StandardDecryptionMaterial(""));
        } catch (final BadSecurityHandlerException e) {
            try { pdfDoc.close(); } catch (final IOException ee) { }
            throw new Parser.Failure("Document is encrypted (1): " + e.getMessage(), location);
        } catch (final IOException e) {
            try { pdfDoc.close(); } catch (final IOException ee) { }
            throw new Parser.Failure("Document is encrypted (2): " + e.getMessage(), location);
        } catch (final CryptographyException e) {
            try { pdfDoc.close(); } catch (final IOException ee) { }
            throw new Parser.Failure("Document is encrypted (3): " + e.getMessage(), location);
        }
        final AccessPermission perm = pdfDoc.getCurrentAccessPermission();
        if (perm == null || !perm.canExtractContent()) {
            try { pdfDoc.close(); } catch (final IOException ee) { }
            throw new Parser.Failure("Document is encrypted and cannot be decrypted", location);
        }
    }

    // extracting some metadata
    PDDocumentInformation info = pdfDoc.getDocumentInformation();
    String docTitle = null, docSubject = null, docAuthor = null, docPublisher = null, docKeywordStr = null;
    Date docDate = new Date();
    if (info != null) {
        docTitle = info.getTitle();
        docSubject = info.getSubject();
        docAuthor = info.getAuthor();
        docPublisher = info.getProducer();
        if (docPublisher == null || docPublisher.isEmpty())
            docPublisher = info.getCreator();
        docKeywordStr = info.getKeywords();
        try {
            if (info.getModificationDate() != null)
                docDate = info.getModificationDate().getTime();
        } catch (IOException e) {
        }
        // unused:
        // info.getTrapped());
    }
    info = null;

    if (docTitle == null || docTitle.isEmpty()) {
        docTitle = MultiProtocolURL.unescape(location.getFileName());
    }
    if (docTitle == null) {
        docTitle = docSubject;
    }
    String[] docKeywords = null;
    if (docKeywordStr != null) {
        docKeywords = docKeywordStr.split(" |,");
    }

    Collection<AnchorURL>[] pdflinks = null;
    Document[] result = null;
    try {
        // get the links
        pdflinks = extractPdfLinks(pdfDoc);

        // get the fulltext (either per document or for each page)
        final PDFTextStripper stripper = new PDFTextStripper("UTF-8");

        if (individualPages) {
            // this is a hack which stores individual pages of the source pdf into individual index documents
            // the new documents will get a virtual link with a post argument page=X appended to the original url

            // collect text
            int pagecount = pdfDoc.getNumberOfPages();
            String[] pages = new String[pagecount];
            for (int page = 1; page <= pagecount; page++) {
                stripper.setStartPage(page);
                stripper.setEndPage(page);
                pages[page - 1] = stripper.getText(pdfDoc);
                //System.out.println("PAGE " + page + ": " + pages[page - 1]);
            }

            // create individual documents for each page
            assert pages.length == pdflinks.length : "pages.length = " + pages.length + ", pdflinks.length = "
                    + pdflinks.length;
            result = new Document[Math.min(pages.length, pdflinks.length)];
            String loc = location.toNormalform(true);
            for (int page = 0; page < result.length; page++) {
                result[page] = new Document(
                        new AnchorURL(loc + (loc.indexOf('?') > 0 ? '&' : '?') + individualPagePropertyname
                                + '=' + (page + 1)), // these are virtual new pages; we cannot combine them with '#' as that would be removed when computing the urlhash
                        mimeType, "UTF-8", this, null, docKeywords, singleList(docTitle), docAuthor,
                        docPublisher, null, null, 0.0f, 0.0f,
                        pages == null || page > pages.length ? new byte[0] : UTF8.getBytes(pages[page]),
                        pdflinks == null || page >= pdflinks.length ? null : pdflinks[page], null, null, false,
                        docDate);
            }
        } else {
            // collect the whole text at once
            final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
            byte[] contentBytes = new byte[0];
            stripper.setEndPage(3); // get first 3 pages (always)
            writer.append(stripper.getText(pdfDoc));
            contentBytes = writer.getBytes(); // remember text in case of interrupting thread

            if (pdfDoc.getNumberOfPages() > 3) { // spare creating/starting thread if all pages read
                stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text)
                stripper.setEndPage(Integer.MAX_VALUE); // set to default

                // we start the pdf parsing in a separate thread to ensure that it can be terminated
                final PDDocument pdfDocC = pdfDoc;
                final Thread t = new Thread() {
                    @Override
                    public void run() {
                        Thread.currentThread().setName("pdfParser.getText:" + location);
                        try {
                            writer.append(stripper.getText(pdfDocC));
                        } catch (final Throwable e) {
                        }
                    }
                };
                t.start();
                t.join(3000);
                if (t.isAlive()) t.interrupt(); // pdfbox likes to forget to terminate ... (quite often)
            }
            contentBytes = writer.getBytes(); // get final text before closing writer

            Collection<AnchorURL> pdflinksCombined = new HashSet<AnchorURL>();
            for (Collection<AnchorURL> pdflinksx : pdflinks)
                if (pdflinksx != null)
                    pdflinksCombined.addAll(pdflinksx);
            result = new Document[] { new Document(location, mimeType, "UTF-8", this, null, docKeywords,
                    singleList(docTitle), docAuthor, docPublisher, null, null, 0.0f, 0.0f, contentBytes,
                    pdflinksCombined, null, null, false, docDate) };
        }
    } catch (final Throwable e) {
        //close the writer (in finally)
        //throw new Parser.Failure(e.getMessage(), location);
    } finally {
        try {
            pdfDoc.close();
        } catch (final Throwable e) {
        }
    }

    // clear resources in pdfbox. they say that is resolved but it's not. see:
    // https://issues.apache.org/jira/browse/PDFBOX-313
    // https://issues.apache.org/jira/browse/PDFBOX-351
    // https://issues.apache.org/jira/browse/PDFBOX-441
    // pdfbox still generates an enormous number of object allocations and doesn't delete them
    // the following objects are statically stored and never flushed:
    // COSFloat, COSArray, COSInteger, COSObjectKey, COSObject, COSDictionary,
    // COSStream, COSString, COSName, COSDocument, COSInteger[], COSNull
    // the great number of these objects can easily be seen in Java Visual VM
    // we try to force this out of memory here with explicit clear calls and hope the rubbish gets out.
    pdfDoc = null;
    clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes();

    return result;
}