List of usage examples for the org.apache.pdfbox.pdmodel.PDDocument constructor PDDocument(COSDocument)
public PDDocument(COSDocument doc)
From source file:jgnash.report.pdf.Report.java
License:Open Source License
public Report() { this.pdfDocument = new PDDocument(MemoryUsageSetting.setupMixed(MAX_MEMORY_USAGE)); setTableFont(loadFont(ReportFactory.getMonoFont(), pdfDocument)); setHeaderFont(loadFont(ReportFactory.getHeaderFont(), pdfDocument)); setFooterFont(loadFont(ReportFactory.getProportionalFont(), pdfDocument)); // restore font size baseFontSize = getPreferences().getFloat(BASE_FONT_SIZE, DEFAULT_BASE_FONT_SIZE); // restore the page format setPageFormat(getPageFormat());//from w w w . ja v a 2 s . c o m }
From source file:ocr_pdf.OCR_PDF.java
License:GNU General Public License
/** * Get PDFBox output of airport diagram. * @param fileName name of airport diagram PDF file. * @return text representation of airport diagram. *///from ww w .j av a 2 s . com static String getTextPDFBox(String fileName) { PDFParser parser; String parsedText = null; PDFTextStripper pdfStripper = null; PDDocument pdDoc = null; COSDocument cosDoc = null; File file = new File(fileName); if (!file.isFile()) { System.err.println("File " + fileName + " does not exist."); return null; } try { parser = new PDFParser(new FileInputStream(file)); } catch (IOException e) { System.err.println("Unable to open PDF Parser. " + e.getMessage()); return null; } try { parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); //true doesn't work so well. pdfStripper.setSortByPosition(false); pdDoc = new PDDocument(cosDoc); parsedText = pdfStripper.getText(pdDoc); } catch (Exception e) { System.err.println("An exception occured in parsing the PDF Document." + e.getMessage()); } finally { try { if (cosDoc != null) cosDoc.close(); if (pdDoc != null) pdDoc.close(); } catch (Exception e) { e.printStackTrace(); } } return parsedText; }
From source file:opennlp.PDFTools.java
public String getStringFromPDF(String filePath) { String text = null;// w w w. j a va 2 s . co m try { File file = new File(filePath); parser = new PDFParser(new RandomAccessFile(file, "r")); parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); pdDoc.getNumberOfPages(); pdfStripper.setStartPage(1); //pdfStripper.setEndPage(10); pdfStripper.setEndPage(pdDoc.getNumberOfPages()); text = pdfStripper.getText(pdDoc); } catch (IOException e) { logger.error("IO ERROR", e); } catch (Exception ex) { logger.error("ERROR", ex); } return text; }
From source file:org.crypto.sse.TextExtractPar.java
License:Open Source License
private static TextExtractPar extractOneDoc(File[] listOfFile) throws FileNotFoundException { Multimap<String, String> lookup1 = ArrayListMultimap.create(); Multimap<String, String> lookup2 = ArrayListMultimap.create(); for (File file : listOfFile) { for (int j = 0; j < 100; j++) { if (counter == (int) ((j + 1) * listOfFile.length / 100)) { System.out.println("Number of files read equals " + j + " %"); break; }// w ww .j a va 2 s.c om } List<String> lines = new ArrayList<String>(); counter++; FileInputStream fis = new FileInputStream(file); // ***********************************************************************************************// ///////////////////// .docx ///////////////////////////// // ***********************************************************************************************// if (file.getName().endsWith(".docx")) { XWPFDocument doc; try { // System.out.println("File read: "+file.getName()); doc = new XWPFDocument(fis); XWPFWordExtractor ex = new XWPFWordExtractor(doc); lines.add(ex.getText()); } catch (IOException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } } // ***********************************************************************************************// ///////////////////// .pptx ///////////////////////////// // ***********************************************************************************************// else if (file.getName().endsWith(".pptx")) { OPCPackage ppt; try { // System.out.println("File read: "+file.getName()); ppt = OPCPackage.open(fis); XSLFPowerPointExtractor xw = new XSLFPowerPointExtractor(ppt); lines.add(xw.getText()); } catch (XmlException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } catch (IOException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } catch (OpenXML4JException e) { System.out.println("File not read: " + file.getName()); } } // 
***********************************************************************************************// ///////////////////// .xlsx ///////////////////////////// // ***********************************************************************************************// else if (file.getName().endsWith(".xlsx")) { OPCPackage xls; try { // System.out.println("File read: "+file.getName()); xls = OPCPackage.open(fis); XSSFExcelExtractor xe = new XSSFExcelExtractor(xls); lines.add(xe.getText()); } catch (InvalidFormatException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } catch (IOException e) { System.out.println("File not read: " + file.getName()); } catch (XmlException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } catch (OpenXML4JException e) { System.out.println("File not read: " + file.getName()); } } // ***********************************************************************************************// ///////////////////// .doc ///////////////////////////// // ***********************************************************************************************// else if (file.getName().endsWith(".doc")) { NPOIFSFileSystem fs; try { // System.out.println("File read: "+file.getName()); fs = new NPOIFSFileSystem(file); WordExtractor extractor = new WordExtractor(fs.getRoot()); for (String rawText : extractor.getParagraphText()) { lines.add(extractor.stripFields(rawText)); } } catch (IOException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } } // ***********************************************************************************************// ///////////////////// .pdf ///////////////////////////// // ***********************************************************************************************// else if (file.getName().endsWith(".pdf")) { PDFParser parser; try { // System.out.println("File read: "+file.getName()); parser = new 
PDFParser(fis); parser.parse(); COSDocument cd = parser.getDocument(); PDFTextStripper stripper = new PDFTextStripper(); lines.add(stripper.getText(new PDDocument(cd))); } catch (IOException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } } // ***********************************************************************************************// ///////////////////// Media Files such as gif, jpeg, .wmv, .mpeg, ///////////////////// .mp4 ///////////////////////////// // ***********************************************************************************************// else if (file.getName().endsWith(".gif") && file.getName().endsWith(".jpeg") && file.getName().endsWith(".wmv") && file.getName().endsWith(".mpeg") && file.getName().endsWith(".mp4")) { lines.add(file.getName()); } // ***********************************************************************************************// ///////////////////// raw text extensions ///////////////////// ///////////////////////////// // ***********************************************************************************************// else { try { // System.out.println("File read: "+file.getName()); lines = Files.readLines(file, Charsets.UTF_8); } catch (IOException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } finally { try { fis.close(); } catch (IOException ioex) { // omitted. } } } // ***********************************************************************************************// ///////////////////// Begin word extraction ///////////////////// ///////////////////////////// // ***********************************************************************************************// int temporaryCounter = 0; // Filter threshold int counterDoc = 0; for (int i = 0; i < lines.size(); i++) { CharArraySet noise = EnglishAnalyzer.getDefaultStopSet(); // We are using a standard tokenizer that eliminates the stop // words. 
We can use Stemming tokenizer such Porter // A set of English noise keywords is used that will eliminates // words such as "the, a, etc" Analyzer analyzer = new StandardAnalyzer(noise); List<String> token = Tokenizer.tokenizeString(analyzer, lines.get(i)); temporaryCounter = temporaryCounter + token.size(); for (int j = 0; j < token.size(); j++) { // Avoid counting occurrences of words in the same file if (!lookup2.get(file.getName()).contains(token.get(j))) { lookup2.put(file.getName(), token.get(j)); } // Avoid counting occurrences of words in the same file if (!lookup1.get(token.get(j)).contains(file.getName())) { lookup1.put(token.get(j), file.getName()); } } } } // System.out.println(lookup.toString()); return new TextExtractPar(lookup1, lookup2); }
From source file:org.dspace.content.packager.PDFPackager.java
License:BSD License
private void crosswalkPDF(Context context, Item item, InputStream metadata) throws CrosswalkException, IOException, SQLException, AuthorizeException { COSDocument cos = null;//from w w w . j a v a 2 s. c o m try { PDFParser parser = new PDFParser(metadata); parser.parse(); cos = parser.getDocument(); // sanity check: PDFBox breaks on encrypted documents, so give up. if (cos.getEncryptionDictionary() != null) { throw new MetadataValidationException("This packager cannot accept an encrypted PDF document."); } /* PDF to DC "crosswalk": * * NOTE: This is not in a crosswalk plugin because (a) it isn't * useful anywhere else, and more importantly, (b) the source * data is not XML so it doesn't fit the plugin's interface. * * pattern of crosswalk -- PDF dict entries to DC: * Title -> title.null * Author -> contributor.author * CreationDate -> date.created * ModDate -> date.created * Creator -> description.provenance (application that created orig) * Producer -> description.provenance (convertor to pdf) * Subject -> description.abstract * Keywords -> subject.other * date is java.util.Calendar */ PDDocument pd = new PDDocument(cos); PDDocumentInformation docinfo = pd.getDocumentInformation(); String title = docinfo.getTitle(); // sanity check: item must have a title. 
if (title == null) { throw new MetadataValidationException( "This PDF file is unacceptable, it does not have a value for \"Title\" in its Info dictionary."); } if (log.isDebugEnabled()) { log.debug("PDF Info dict title=\"" + title + "\""); } item.addDC("title", null, "en", title); String value = docinfo.getAuthor(); if (value != null) { item.addDC("contributor", "author", null, value); if (log.isDebugEnabled()) { log.debug("PDF Info dict author=\"" + value + "\""); } } value = docinfo.getCreator(); if (value != null) { item.addDC("description", "provenance", "en", "Application that created the original document: " + value); } value = docinfo.getProducer(); if (value != null) { item.addDC("description", "provenance", "en", "Original document converted to PDF by: " + value); } value = docinfo.getSubject(); if (value != null) { item.addDC("description", "abstract", null, value); } value = docinfo.getKeywords(); if (value != null) { item.addDC("subject", "other", null, value); } // Take either CreationDate or ModDate as "date.created", // Too bad there's no place to put "last modified" in the DC. Calendar calValue = docinfo.getCreationDate(); if (calValue == null) { calValue = docinfo.getModificationDate(); } if (calValue != null) { item.addDC("date", "created", null, (new DCDate(calValue.getTime())).toString()); } item.update(); } finally { if (cos != null) { cos.close(); } } }
From source file:org.dspace.submit.step.UploadStep.java
License:BSD License
/** * Process the upload of a new file!//w ww. ja v a 2 s. c o m * * @param context * current DSpace context * @param request * current servlet request object * @param response * current servlet response object * @param subInfo * submission info object * * @return Status or error flag which will be processed by * UI-related code! (if STATUS_COMPLETE or 0 is returned, * no errors occurred!) */ public int processUploadFile(Context context, HttpServletRequest request, HttpServletResponse response, SubmissionInfo subInfo) throws ServletException, IOException, SQLException, AuthorizeException { boolean formatKnown = true; boolean fileOK = false; BitstreamFormat bf = null; Bitstream b = null; //NOTE: File should already be uploaded. //Manakin does this automatically via Cocoon. //For JSP-UI, the SubmissionController.uploadFiles() does the actual upload Enumeration attNames = request.getAttributeNames(); //loop through our request attributes while (attNames.hasMoreElements()) { String attr = (String) attNames.nextElement(); //if this ends with "-path", this attribute //represents a newly uploaded file if (attr.endsWith("-path")) { //strip off the -path to get the actual parameter //that the file was uploaded as String param = attr.replace("-path", ""); String exten = param.substring(param.length() - 3); // Load the file's path and input stream and description String filePath = (String) request.getAttribute(param + "-path"); InputStream fileInputStreamTest = (InputStream) request.getAttribute(param + "-inputstream"); ByteArrayOutputStream baos = new ByteArrayOutputStream(); byte[] buf = new byte[1024]; int n = 0; while ((n = fileInputStreamTest.read(buf)) >= 0) baos.write(buf, 0, n); byte[] content = baos.toByteArray(); InputStream fileInputStream = new ByteArrayInputStream(content); InputStream fileInputStreamPdf = new ByteArrayInputStream(content); InputStream ifAnsi = new ByteArrayInputStream(content); //InputStream fss = fileInputStream.cl //attempt to get description 
from attribute first, then direct from a parameter String fileDescription = (String) request.getAttribute(param + "-description"); if (fileDescription == null || fileDescription.length() == 0) { fileDescription = request.getParameter("description"); } // if information wasn't passed by User Interface, we had a problem // with the upload if (filePath == null || fileInputStream == null) { return STATUS_UPLOAD_ERROR; } if (subInfo == null) { // In any event, if we don't have the submission info, the request // was malformed return STATUS_INTEGRITY_ERROR; } // Create the bitstream Item item = subInfo.getSubmissionItem().getItem(); // do we already have a bundle? Bundle[] bundles = item.getBundles("ORIGINAL"); if (bundles.length < 1) { // set bundle's name to ORIGINAL b = item.createSingleBitstream(fileInputStream, "ORIGINAL"); } else { // we have a bundle already, just add bitstream b = bundles[0].createBitstream(fileInputStream); } //fileDescription.op if (exten.toLowerCase().equals("pdf")) { try { PDFTextStripper pdfStripper = null; PDDocument docum = null; PDFParser parser = new PDFParser(fileInputStreamPdf); COSDocument cosDoc = null; parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); docum = new PDDocument(cosDoc); //pdfStripper.getText(docum); String parsedText = pdfStripper.getText(docum); Integer fifty = (Integer) Math.round(parsedText.length() / 2); if (fifty < 0) { fifty = fifty * (-1); } Integer toCut = 500; if ((parsedText.length() - fifty) < 500) { toCut = parsedText.length(); } log.info("FUCKTHISSHIT: " + fifty + " " + toCut); String subText = parsedText.substring(fifty, fifty + toCut - 1); try { subText = subText.substring(subText.indexOf(".") + 1); } catch (Exception e) { } item.addMetadata("dc", "textpart", null, null, subText + "..."); item.update(); context.commit(); log.info(parsedText); } catch (Exception e) { log.info("omgerror: " + e.toString()); } } if (exten.toLowerCase().equals("txt")) { StringWriter writer = 
new StringWriter(); IOUtils.copy(fileInputStreamPdf, writer, "UTF-8"); String theString = writer.toString(); if (theString.startsWith("\uFEFF")) { } else { StringWriter writerAnsi = new StringWriter(); IOUtils.copy(ifAnsi, writerAnsi, "Cp1252"); theString = writerAnsi.toString(); } Integer fifty = (Integer) Math.round(theString.length() * (50 / 100.0f)); Integer toCut = 500; if ((theString.length() - fifty) < 500) { toCut = theString.length(); } String subText = theString.substring(fifty, toCut - 1); item.addMetadata("dc", "textpart", null, null, subText + "..."); item.update(); context.commit(); log.info(subText); } log.info("OMGTEST: " + exten); if (exten.toLowerCase().equals("doc")) { WordExtractor extractor = null; try { HWPFDocument document = new HWPFDocument(fileInputStreamPdf); extractor = new WordExtractor(document); String fileData = extractor.getText(); Integer fifty = (Integer) Math.round(50 * 100 / fileData.length()); Integer toCut = 500; if ((fileData.length() - fifty) < 500) { toCut = fileData.length(); } String subText = fileData.substring(fifty, toCut - 1); item.addMetadata("dc", "textpart", null, null, subText + "..."); item.update(); context.commit(); } catch (Exception exep) { log.info("OMGTESTIK:" + exep); } } if ((exten.toLowerCase().equals("ocx"))) { XWPFDocument document = new XWPFDocument(fileInputStreamPdf); XWPFWordExtractor extractor = null; extractor = new XWPFWordExtractor(document); String text = extractor.getText(); Integer fifty = (Integer) Math.round(50 * 100 / text.length()); Integer toCut = 500; if ((text.length() - fifty) < 500) { toCut = text.length(); } String subText = text.substring(fifty, toCut - 1); item.addMetadata("dc", "textpart", null, null, subText + "..."); item.update(); context.commit(); } // Strip all but the last filename. It would be nice // to know which OS the file came from. 
String noPath = filePath; while (noPath.indexOf('/') > -1) { noPath = noPath.substring(noPath.indexOf('/') + 1); } while (noPath.indexOf('\\') > -1) { noPath = noPath.substring(noPath.indexOf('\\') + 1); } b.setName(noPath); b.setSource(filePath); b.setDescription(fileDescription); // Identify the format bf = FormatIdentifier.guessFormat(context, b); b.setFormat(bf); // Update to DB b.update(); item.update(); if ((bf != null) && (bf.isInternal())) { log.warn("Attempt to upload file format marked as internal system use only"); backoutBitstream(subInfo, b, item); return STATUS_UPLOAD_ERROR; } // Check for virus if (ConfigurationManager.getBooleanProperty("submission-curation", "virus-scan")) { Curator curator = new Curator(); curator.addTask("vscan").curate(item); int status = curator.getStatus("vscan"); if (status == Curator.CURATE_ERROR) { backoutBitstream(subInfo, b, item); return STATUS_VIRUS_CHECKER_UNAVAILABLE; } else if (status == Curator.CURATE_FAIL) { backoutBitstream(subInfo, b, item); return STATUS_CONTAINS_VIRUS; } } // If we got this far then everything is more or less ok. // Comment - not sure if this is the right place for a commit here // but I'm not brave enough to remove it - Robin. context.commit(); // save this bitstream to the submission info, as the // bitstream we're currently working with subInfo.setBitstream(b); //if format was not identified if (bf == null) { return STATUS_UNKNOWN_FORMAT; } } //end if attribute ends with "-path" } //end while return STATUS_COMPLETE; }
From source file:org.encuestame.business.search.IndexerFile.java
License:Apache License
/** * Parse pdf Document./*from w w w. ja va 2 s .c o m*/ * @param file * @return * @throws IOException */ public static PDDocument parsePdfDocument(final File file) throws IOException { InputStream is = new FileInputStream(file); COSDocument cosDoc = null; PDDocument pdDoc = null; try { cosDoc = SearchUtils.parseDocument(is); pdDoc = new PDDocument(cosDoc); } catch (IOException e) { // TODO Auto-generated catch block log.error(e); } finally { if (pdDoc == null) { log.error("PdDocument is null"); } else { pdDoc.close(); } } return pdDoc; }
From source file:org.encuestame.business.search.SearchUtils.java
License:Apache License
/** * Create PDF Document./*from w w w . j a v a2 s .c o m*/ * @param file {@link File} * @param Long attachmentId. * @return {@link Document} * @throws Exception */ public static Document createPdfDocument(final File file) throws Exception { InputStream is = new FileInputStream(file); COSDocument cosDoc = null; String docText = ""; PDDocument pdDoc = null; try { cosDoc = parseDocument(is); pdDoc = new PDDocument(cosDoc); PDFTextStripper stripper = new PDFTextStripper(); docText = stripper.getText(pdDoc); log.debug("PDF Doc Text " + docText.length()); } finally { if (pdDoc == null) { log.error("PdDocument is null"); } else { pdDoc.close(); } } final Document doc = SearchUtils.addFields(file, docText); return doc; }
From source file:org.kimios.kernel.index.filters.PDFFilter.java
License:Open Source License
/**
 * Extracts the text content of the PDF read from {@code in}.
 *
 * <p>The parsed document is closed even when text extraction fails. The input stream
 * itself is not closed here.
 *
 * @param in stream positioned at the start of a PDF document
 * @return the text extracted by {@code PDFTextStripper}
 * @throws IOException if the document cannot be parsed or its text cannot be extracted
 */
public String getBody(InputStream in) throws IOException {
    PDFParser parser = new PDFParser(in);
    parser.parse();
    COSDocument cosDoc = parser.getDocument();
    PDDocument pDDoc = new PDDocument(cosDoc);
    try {
        return new PDFTextStripper().getText(pDDoc);
    } finally {
        pDDoc.close();
    }
}
From source file:org.nuxeo.typeDocPkg.PdfDoc.java
License:Apache License
/**
 * Opens and parses the PDF at {@code FileName}, storing the file, parser, COS document,
 * text stripper and PD document in the instance fields for later use.
 *
 * <p>On success the parsed document is left open in the fields. On failure everything
 * opened by this call is released and {@code false} is returned.
 *
 * @param FileName path of the PDF file to load
 * @return {@code true} if the document was parsed, {@code false} if the file does not
 *         exist or could not be opened or parsed (the cause is logged)
 */
private boolean setMain(String FileName) throws Exception {
    file = new File(FileName);
    if (!file.isFile()) {
        // Report the path that was actually checked.
        System.err.println("File " + FileName + " does not exist.");
        return false;
    }

    FileInputStream input = null;
    try {
        input = new FileInputStream(file);
        parser = new PDFParser(input);
    } catch (IOException e) {
        log.error("Unable to open PDF Parser. ", e);
        if (input != null) {
            try {
                input.close();
            } catch (IOException closeError) {
                log.error("Unable to open PDF Parser. ", closeError);
            }
        }
        return false;
    }

    boolean parsed = false;
    try {
        parser.parse();
        cosDoc = parser.getDocument();
        parsed = true;
        pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);
    } catch (Exception e) {
        log.error("error in setMain method ", e);
        if (parsed) {
            // The COS document was obtained in this call but setup did not complete;
            // release it so a failed load does not leak.
            try {
                cosDoc.close();
            } catch (IOException closeError) {
                log.error("error in setMain method ", closeError);
            }
        }
        return false;
    } finally {
        // assumes the parser has read everything it needs from the stream once parse()
        // returns - TODO confirm against the PDFBox version in use
        try {
            input.close();
        } catch (IOException closeError) {
            log.error("error in setMain method ", closeError);
        }
    }
    return true;
}