List of usage examples for the org.apache.pdfbox.pdmodel.PDDocument.close() method
@Override public void close() throws IOException
From source file:name.marcelomorales.siqisiqi.pdfbox.CoordinatesGenerator.java
License:Apache License
public void generarPdf(OutputStream os, String template, Map<String, Object> m, String path, String coordenates, float fontSize, float ancho) throws IOException { long t = System.currentTimeMillis(); PDDocument doc = null; try {//from w w w . j a v a2s .c o m doc = PDDocument.load(new File(path)); List pages = doc.getDocumentCatalog().getAllPages(); PDPage sourcePage = (PDPage) pages.get(0); boolean append = sourcePage.getContents() != null; PDPageContentStream contentStream = new PDPageContentStream(doc, sourcePage, append, true); StringReader fileReader = null; try { fileReader = new StringReader(template); List<String> list = CharStreams.readLines(fileReader); boolean textHasBegun = false; float currentOffset = 0f; for (String line : list) { if (line == null) { continue; } if (line.startsWith("#")) { continue; } final Iterable<String> str = Splitter.on(',').omitEmptyStrings().trimResults().split(line); final String[] split = Iterables.toArray(str, String.class); if (split == null || split.length < 4) { continue; } if (Character.isDigit(split[0].charAt(0))) { if (textHasBegun) { contentStream.endText(); } contentStream.beginText(); textHasBegun = true; contentStream.moveTextPositionByAmount(parseFloat(split[0]), parseFloat(split[1])); } else { contentStream.moveTextPositionByAmount(currentOffset, 0); } if (!textHasBegun) { LOGGER.warn("Hay un posible mal uso de un .ree", new Throwable()); contentStream.beginText(); textHasBegun = true; } PDType1Font font; if ("b".equals(split[2])) { font = HELVETICA_BOLD; } else { font = HELVETICA; } contentStream.setFont(font, fontSize); Object text = null; if (split[3].startsWith("\"")) { // TODO: text = substring(split[3], 1, -1); } else { // TODO: text = new PropertyModel(m, split[3]).getObject(); } if (text == null) { LOGGER.warn("Propiedad {} no se encuentra", split[3]); //contentStream.drawString("ERROR: propiedad no encontrada"); contentStream.drawString(" "); } else { String string = text.toString(); currentOffset = 
font.getStringWidth(string) * ancho; contentStream.drawString(string); } } if (textHasBegun) { contentStream.endText(); } } finally { Closeables.closeQuietly(fileReader); } contentStream.close(); try { doc.save(os); } catch (COSVisitorException e) { throw new IOException("Ha ocurrido un error al escribir en el Os", e); } } finally { if (doc != null) { doc.close(); } LOGGER.info("Me ha tomado {} milisegundos hacer el pdf", System.currentTimeMillis() - t); } }
From source file:net.anthonypoon.billscrapper.JavaBillScrapper.java
/**
 * Loads {@code pdfFile}, extracts its text, splits it into non-empty lines and stores the
 * result of {@code parsePdf} in {@code billObj}.
 *
 * @param pdfFile the PDF bill to read
 * @throws IOException if the PDF cannot be loaded or its text cannot be extracted
 */
public JavaBillScrapper(File pdfFile) throws IOException {
    PDDocument doc = PDDocument.load(pdfFile);
    try {
        PDFTextStripper stripper = new PDFTextStripper();
        String rawText = stripper.getText(doc);
        String[] textArray = rawText.split("[\\r\\n]+");
        this.billObj = parsePdf(textArray);
    } finally {
        // Release the document even when extraction or parsing fails.
        doc.close();
    }
}
From source file:net.anthonypoon.billscrapper.JavaBillScrapper.java
public static void main(String[] args) { // TODO code application logic here try {// w w w . ja v a 2 s . c o m for (String arg : args) { if (!arg.startsWith("-")) { filePaths.add(arg); } else { try { options.add(Flags.fromString(arg)); } catch (IllegalArgumentException ex) { System.err.println("Illegal options: " + arg); } } } Collections.sort(filePaths); for (String filePath : filePaths) { System.out.println("Loading: " + filePath); PDDocument doc = PDDocument.load(new File(filePath)); PDFTextStripper stripper = new PDFTextStripper(); String rawText = stripper.getText(doc); String[] textArray = rawText.split("[\\r\\n]+"); Bill bill = parsePdf(textArray); if (options.contains(Flags.INSERT_INTO_DB)) { DatabaseConnector db = new DatabaseConnector(); DbWriter writer = new DbWriter(db.getConnection()); boolean isInserted = writer.insertDetail(bill.getBillSummary(), bill.getPhoneSummaryData(), bill.getPhoneDetail()); writer.commit(); doc.close(); if (!isInserted) { System.out.println(filePath + " was not inserted into database."); } } } } catch (Exception ex) { ex.printStackTrace(System.out); } }
From source file:net.awl.edoc.pdfa.PdfBoxIsartorValidate.java
License:Apache License
public static void coin(File f) { nbFile++;//w w w. j a va 2 s. c o m // PDFBox try { PDDocument document = PDDocument.load(f); COSDocument cDocument = document.getDocument(); boolean result = PDFParser.parse(new FileInputStream(f)); if (result) { nbOk++; } else { nbBad++; } ; document.close(); } catch (IOException e) { System.err.println("Failed for : " + f.getAbsolutePath()); // } catch (ParseException e) { // nbBad++; } catch (Throwable e) { nbBad++; } }
From source file:net.bookinaction.ExtractAnnotations.java
License:Apache License
public void doJob(String job, Float[] pA) throws IOException { PDDocument document = null; Stamper s = new Stamper(); // utility class final String job_file = job + ".pdf"; final String dic_file = job + "-dict.txt"; final String new_job = job + "-new.pdf"; PrintWriter writer = new PrintWriter(dic_file); ImageLocationListener imageLocationsListener = new ImageLocationListener(); AnnotationMaker annotMaker = new AnnotationMaker(); try {// w w w .j a va 2s. c o m document = PDDocument.load(new File(job_file)); int pageNum = 0; for (PDPage page : document.getPages()) { pageNum++; PDRectangle cropBox = page.getCropBox(); List<PDAnnotation> annotations = page.getAnnotations(); // extract image locations List<Rectangle2D> imageRects = new ArrayList<Rectangle2D>(); imageLocationsListener.setImageRects(imageRects); imageLocationsListener.processPage(page); int im = 0; for (Rectangle2D pdImageRect : imageRects) { s.recordImage(writer, pageNum, "[im" + im + "]", (Rectangle2D.Float) pdImageRect); annotations.add(annotMaker.squareAnnotation(Color.YELLOW, (Rectangle2D.Float) pdImageRect, "[im" + im + "]")); im++; } PDFTextStripperByArea stripper = new PDFTextStripperByArea(); int j = 0; List<PDAnnotation> viableAnnots = new ArrayList(); for (PDAnnotation annot : annotations) { if (annot instanceof PDAnnotationTextMarkup || annot instanceof PDAnnotationLink) { stripper.addRegion(Integer.toString(j++), s.getAwtRect( s.adjustedRect(annot.getRectangle(), pA[0], pA[1], pA[2], pA[3]), cropBox)); viableAnnots.add(annot); } else if (annot instanceof PDAnnotationPopup || annot instanceof PDAnnotationText) { viableAnnots.add(annot); } } stripper.extractRegions(page); List<PDRectangle> rects = new ArrayList<PDRectangle>(); List<String> comments = new ArrayList<String>(); List<String> highlightTexts = new ArrayList<String>(); j = 0; for (PDAnnotation viableAnnot : viableAnnots) { if (viableAnnot instanceof PDAnnotationTextMarkup) { String highlightText = 
stripper.getTextForRegion(Integer.toString(j++)); String withoutCR = highlightText.replace((char) 0x0A, '^'); String comment = viableAnnot.getContents(); String colorString = String.format("%06x", viableAnnot.getColor().toRGB()); PDRectangle aRect = s.adjustedRect(viableAnnot.getRectangle(), pA[4], pA[5], pA[6], pA[7]); rects.add(aRect); comments.add(comment); highlightTexts.add(highlightText); s.recordTextMarkup(writer, pageNum, comment, withoutCR, aRect, colorString); } else if (viableAnnot instanceof PDAnnotationText) { String comment = viableAnnot.getContents(); String colorString = String.format("%06x", viableAnnot.getColor().toRGB()); for (Rectangle2D pdImageRect : imageRects) { if (pdImageRect.contains(viableAnnot.getRectangle().getLowerLeftX(), viableAnnot.getRectangle().getLowerLeftY())) { s.recordTextMarkup(writer, pageNum, comment, "", (Rectangle2D.Float) pdImageRect, colorString); annotations.add(annotMaker.squareAnnotation(Color.GREEN, (Rectangle2D.Float) pdImageRect, comment)); } ; } } } PDPageContentStream canvas = new PDPageContentStream(document, page, true, true, true); int i = 0; for (PDRectangle pdRect : rects) { String comment = comments.get(i); String highlightText = highlightTexts.get(i); //annotations.add(linkAnnotation(pdRect, comment, highlightText)); //annotations.add(annotationSquareCircle(pdRect, BLUE)); s.showBox(canvas, new Rectangle2D.Float(pdRect.getLowerLeftX(), pdRect.getUpperRightY(), pdRect.getWidth(), pdRect.getHeight()), cropBox, Color.BLUE); i++; } canvas.close(); } writer.close(); document.save(new_job); } finally { if (document != null) { document.close(); } } }
From source file:net.bookinaction.TextInfoExtractor.java
License:Apache License
public void doTextPosition(String source, String coord_text, StripperParam stripperParam) throws IOException { String source_pdf = source;/*from w w w. ja v a 2 s . co m*/ String new_file = source.split("\\.")[0] + "-new.pdf"; PDDocument document = PDDocument.load(new File(source_pdf)); PrintWriter writer = new PrintWriter(new File(coord_text)); //s.recordHeader(writer, source_pdf, document.getNumberOfPages(), sParam); for (int i = 0; i < document.getNumberOfPages(); i++) { getTextPositionFromPage(document, stripperParam, i + 1, writer, true); } if (document != null) { document.save(new_file); document.close(); } if (writer != null) writer.close(); }
From source file:net.homeip.donaldm.doxmentor4j.indexers.PDFIndexer.java
License:Open Source License
@Override public Reader getText(URI uri, int page, StringBuilder title) throws FileNotFoundException, MalformedURLException, IOException //----------------------------------------------------------------------------------------- { FileWriter writer = null;/*from w ww. j ava 2 s . c o m*/ PDDocument pdf = null; PDFTextStripper stripper = null; java.io.File tmpPdf = null; try { tmpPdf = Utils.uri2File(uri); if (tmpPdf != null) pdf = PDDocument.load(tmpPdf.getAbsolutePath(), true); else pdf = PDDocument.load(uri.toURL(), true); PDDocumentInformation pdfInfo = pdf.getDocumentInformation(); String s = pdfInfo.getTitle(); if ((s == null) || (s.length() == 0)) s = uri.getPath(); if (title != null) title.append(s); stripper = new PDFTextStripper(); if (page >= 0) { stripper.setStartPage(page); stripper.setEndPage(page); } else { stripper.setStartPage(1); stripper.setEndPage(pdf.getNumberOfPages()); } java.io.File f = java.io.File.createTempFile("pdf", ".tmp"); writer = new FileWriter(f); stripper.writeText(pdf, writer); try { writer.close(); writer = null; } catch (Exception _e) { } stripper.resetEngine(); return new FileReader(f); } finally { if (stripper != null) try { stripper.resetEngine(); } catch (Exception _e) { } if (pdf != null) try { pdf.close(); } catch (Exception _e) { } if (writer != null) try { writer.close(); } catch (Exception _e) { } if ((tmpPdf != null) && (tmpPdf.getAbsolutePath().toLowerCase().indexOf(System.getProperty("java.io.tmpdir")) >= 0)) tmpPdf.delete(); } }
From source file:net.homeip.donaldm.doxmentor4j.indexers.PDFIndexer.java
License:Open Source License
@Override public long index(String href, URI uri, boolean followLinks, Object... extraParams) throws IOException //----------------------------------------------------------------------------------------------------- { if (m_indexWriter == null) { logger.error("PDFIndexer: index writer is null"); return -1; }// ww w.j a v a2 s .com PDDocument pdf = null; PDFTextStripper stripper = null; Reader reader = null; Writer writer = null; java.io.File tmpPdf = null; try { tmpPdf = Utils.uri2File(uri); if (tmpPdf != null) pdf = PDDocument.load(tmpPdf.getAbsolutePath(), true); else pdf = PDDocument.load(uri.toURL(), true); PDDocumentInformation pdfInfo = pdf.getDocumentInformation(); String title = pdfInfo.getTitle(); if ((title == null) || (title.isEmpty())) title = uri.getPath(); stripper = new PDFTextStripper(); int noPages = pdf.getNumberOfPages(); stripper.setSuppressDuplicateOverlappingText(false); if (noPages != PDDocument.UNKNOWN_NUMBER_OF_PAGES) { for (int page = 1; page <= noPages; page++) { stripper.setStartPage(page); stripper.setEndPage(page); writer = new StringWriter(); stripper.writeText(pdf, writer); reader = new StringReader(writer.toString()); Document doc = new Document(); doc.add(new Field("path", href, Field.Store.YES, Field.Index.NO)); doc.add(new Field("title", title.toString(), Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field("contents", reader, Field.TermVector.WITH_POSITIONS_OFFSETS)); doc.add(new Field("page", Integer.toString(page), Field.Store.YES, Field.Index.NO)); if (addDocument(doc)) AjaxIndexer.incrementCount(); try { writer.close(); writer = null; } catch (Exception _e) { } try { reader.close(); reader = null; } catch (Exception _e) { } if ((page % 50) == 0) { try { System.runFinalization(); System.gc(); } catch (Exception _e) { } } } } else { java.io.File f = java.io.File.createTempFile("pdf", ".tmp"); writer = new FileWriter(f); stripper.writeText(pdf, writer); try { writer.close(); writer = null; } catch (Exception _e) { } 
reader = new FileReader(f); Document doc = new Document(); doc.add(new Field("path", href, Field.Store.YES, Field.Index.NO)); doc.add(new Field("title", title.toString(), Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field("contents", reader, Field.TermVector.WITH_POSITIONS_OFFSETS)); doc.add(new Field("page", "-1", Field.Store.YES, Field.Index.NO)); if (addDocument(doc)) AjaxIndexer.incrementCount(); try { reader.close(); reader = null; } catch (Exception _e) { } try { System.runFinalization(); System.gc(); } catch (Exception _e) { } } return 1; } catch (Exception e) { logger.error("Error indexing PDF text from " + uri.toString(), e); return -1; } finally { if (stripper != null) try { stripper.resetEngine(); } catch (Exception _e) { } if (pdf != null) try { pdf.close(); } catch (Exception _e) { } if (writer != null) try { writer.close(); } catch (Exception _e) { } if (reader != null) try { reader.close(); } catch (Exception _e) { } if ((tmpPdf != null) && (tmpPdf.getAbsolutePath().toLowerCase().indexOf(System.getProperty("java.io.tmpdir")) >= 0)) tmpPdf.delete(); } }
From source file:net.ontopia.topicmaps.classify.PDFFormatModule.java
License:Apache License
/**
 * Extracts the text of the PDF held by {@code cc} and passes it to {@code handler} as a single
 * {@code "document"} region.
 *
 * @param cc      supplies the raw PDF bytes via {@code getContent()}
 * @param handler receives the region start, the extracted characters and the region end
 * @throws OntopiaRuntimeException wrapping any exception raised while reading the PDF or
 *                                 while notifying the handler
 */
public void readContent(ClassifiableContentIF cc, TextHandlerIF handler) {
    try {
        PDDocument pdoc = PDDocument.load(new BufferedInputStream(new ByteArrayInputStream(cc.getContent())));
        String s;
        try {
            PDFTextStripper stripper = new PDFTextStripper();
            s = stripper.getText(pdoc);
        } finally {
            // Release the document even when text extraction fails.
            pdoc.close();
        }
        char[] c = s.toCharArray();
        handler.startRegion("document");
        handler.text(c, 0, c.length);
        handler.endRegion();
    } catch (Exception e) {
        throw new OntopiaRuntimeException(e);
    }
}
From source file:net.sf.jabref.imports.PdfContentImporter.java
License:Open Source License
/**
 * Builds a BibTeX entry from the first page of a PDF read from {@code in}.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>Load the document; if loading fails the error is logged and an empty list is returned.</li>
 *   <li>An encrypted document is only logged; processing continues.</li>
 *   <li>Extract the text of page 1, sorted by position, with the platform line separator as
 *       paragraph end.</li>
 *   <li>Ask {@code DOIUtil.getDOI} for a DOI. A result shorter than the page text is treated as
 *       a hit and handed to {@code doiToBibTeXFetcher}; if the fetcher produced entries they are
 *       returned as is.</li>
 *   <li>Otherwise parse the text heuristically, line by line: an optional conference/copyright
 *       first line, the title, the authors (one line each), then "Abstract" / "Keywords" /
 *       "technical" report blocks, and finally — scanning backwards from the end — Springer
 *       "(Eds.)" lines, DOI markers and IEEE footer lines.</li>
 *   <li>Every field that was found is set on a new entry; the complete page text is stored in
 *       the {@code review} field.</li>
 * </ol>
 *
 * <p>The parser state ({@code split}, {@code i}, {@code curString}, {@code year}) is not declared
 * in this method, and the helpers called here ({@code proceedToNextNonEmptyLine},
 * {@code fillCurStringWithNonEmptyLines}, {@code readLastBlock}, {@code extractYear}, ...)
 * presumably operate on the same state — NOTE(review): if so, concurrent calls on one instance
 * would interfere; confirm against the enclosing class.
 *
 * <p>NOTE(review): several {@code substring}/{@code charAt} calls in the Springer and DOI
 * branches use computed offsets without a bounds check (e.g. {@code pos - 1},
 * {@code lastSpace}, {@code substring(4)}, {@code charAt(pos)} right after "DOI"); unusual
 * last lines may raise {@code StringIndexOutOfBoundsException} — verify with real input.
 *
 * @param in     stream containing the PDF
 * @param status used to show a message when the Bouncy Castle library is missing, and passed to
 *               the DOI fetcher
 * @return the entries found; empty when the document could not be loaded or has no text
 * @throws IOException if extracting the text or closing the document fails
 */
@Override public List<BibtexEntry> importEntries(InputStream in, OutputPrinter status) throws IOException { final ArrayList<BibtexEntry> res = new ArrayList<BibtexEntry>(1); PDDocument document; try { document = PDDocument.load(in); } catch (IOException e) { PdfContentImporter.logger.log(Level.SEVERE, "Could not load document", e); return res; } try { if (document.isEncrypted()) { PdfContentImporter.logger.log(Level.INFO, Globals.lang("Encrypted documents are not supported")); //return res; } PDFTextStripper stripper = new PDFTextStripper(); stripper.setStartPage(1); stripper.setEndPage(1); stripper.setSortByPosition(true); stripper.setParagraphEnd(System.getProperty("line.separator")); StringWriter writer = new StringWriter(); stripper.writeText(document, writer); String textResult = writer.toString(); String doi = DOIUtil.getDOI(textResult); if (doi.length() < textResult.length()) { // A DOI was found in the text // We do NO parsing of the text, but use the DOI fetcher ImportInspector i = new ImportInspector() { @Override public void toFront() { } @Override public void setProgress(int current, int max) { } @Override public void addEntry(BibtexEntry entry) { // add the entry to the result object res.add(entry); } }; PdfContentImporter.doiToBibTeXFetcher.processQuery(doi, i, status); if (res.size() != 0) { // if something has been found, return the result return res; } else { // otherwise, we just parse the PDF } } String author; String editor = null; String institution = null; String abstractT = null; String keywords = null; String title; String conference = null; String DOI = null; String series = null; String volume = null; String number = null; String pages = null; // year is a class variable as the method extractYear() uses it; String publisher = null; BibtexEntryType type = BibtexEntryType.INPROCEEDINGS; final String lineBreak = System.getProperty("line.separator"); split = textResult.split(lineBreak); // idea: split[] contains 
the different lines // blocks are separated by empty lines // treat each block // or do special treatment at authors (which are not broken) // therefore, we do a line-based and not a block-based splitting // i points to the current line // curString (mostly) contains the current block // the different lines are joined into one and thereby separated by " " proceedToNextNonEmptyLine(); if (i >= split.length) { // PDF could not be parsed or is empty // return empty list return res; } curString = split[i]; i = i + 1; if (curString.length() > 4) { // special case: possibly conference as first line on the page extractYear(); if (curString.contains("Conference")) { fillCurStringWithNonEmptyLines(); conference = curString; curString = ""; } else { // e.g. Copyright (c) 1998 by the Genetics Society of America // future work: get year using RegEx String lower = curString.toLowerCase(); if (lower.contains("copyright")) { fillCurStringWithNonEmptyLines(); publisher = curString; curString = ""; } } } // start: title fillCurStringWithNonEmptyLines(); title = streamlineTitle(curString); curString = ""; //i points to the next non-empty line // after title: authors author = null; while ((i < split.length) && (!split[i].equals(""))) { // author names are unlikely to be split among different lines // treat them line by line curString = streamlineNames(split[i]); if (author == null) { author = curString; } else { if (curString.equals("")) { // if split[i] is "and" then "" is returned by streamlineNames -> do nothing } else { author = author.concat(" and ").concat(curString); } } i++; } curString = ""; i++; // then, abstract and keywords follow while (i < split.length) { curString = split[i]; if ((curString.length() >= "Abstract".length()) && (curString.substring(0, "Abstract".length()).equalsIgnoreCase("Abstract"))) { if (curString.length() == "Abstract".length()) { // only word "abstract" found -- skip line curString = ""; } else { curString = curString.substring("Abstract".length() 
+ 1).trim().concat(lineBreak); } i++; // fillCurStringWithNonEmptyLines() cannot be used as that uses " " as line separator // whereas we need linebreak as separator while ((i < split.length) && (!split[i].equals(""))) { curString = curString.concat(split[i]).concat(lineBreak); i++; } abstractT = curString; i++; } else if ((curString.length() >= "Keywords".length()) && (curString.substring(0, "Keywords".length()).equalsIgnoreCase("Keywords"))) { if (curString.length() == "Keywords".length()) { // only word "Keywords" found -- skip line curString = ""; } else { curString = curString.substring("Keywords".length() + 1).trim(); } i++; fillCurStringWithNonEmptyLines(); keywords = removeNonLettersAtEnd(curString); } else { String lower = curString.toLowerCase(); int pos = lower.indexOf("technical"); if (pos >= 0) { type = BibtexEntryType.TECHREPORT; pos = curString.trim().lastIndexOf(' '); if (pos >= 0) { // assumption: last character of curString is NOT ' ' // otherwise pos+1 leads to an out-of-bounds exception number = curString.substring(pos + 1); } } i++; proceedToNextNonEmptyLine(); } } i = split.length - 1; // last block: DOI, detailed information // sometimes, this information is in the third last block etc... // therefore, read until the beginning of the file while (i >= 0) { readLastBlock(); // i now points to the block before or is -1 // curString contains the last block, separated by " " extractYear(); int pos = curString.indexOf("(Eds.)"); if ((pos >= 0) && (publisher == null)) { // looks like a Springer last line // e.g: A. Persson and J. Stirna (Eds.): PoEM 2009, LNBIP 39, pp. 161-175, 2009. publisher = "Springer"; editor = streamlineNames(curString.substring(0, pos - 1)); curString = curString.substring(pos + "(Eds.)".length() + 2); //+2 because of ":" after (Eds.) 
and the subsequent space String[] springerSplit = curString.split(", "); if (springerSplit.length >= 4) { conference = springerSplit[0]; String seriesData = springerSplit[1]; int lastSpace = seriesData.lastIndexOf(' '); series = seriesData.substring(0, lastSpace); volume = seriesData.substring(lastSpace + 1); pages = springerSplit[2].substring(4); if (springerSplit[3].length() >= 4) { year = springerSplit[3].substring(0, 4); } } } else { if (DOI == null) { pos = curString.indexOf("DOI"); if (pos < 0) { pos = curString.indexOf("doi"); } if (pos >= 0) { pos += 3; char delimiter = curString.charAt(pos); if ((delimiter == ':') || (delimiter == ' ')) { pos++; } int nextSpace = curString.indexOf(' ', pos); if (nextSpace > 0) { DOI = curString.substring(pos, nextSpace); } else { DOI = curString.substring(pos); } } } if ((publisher == null) && (curString.contains("IEEE"))) { // IEEE has the conference things at the end publisher = "IEEE"; // year is extracted by extractYear // otherwise, we could it determine as follows: // String yearStr = curString.substring(curString.length()-4); // if (isYear(yearStr)) { // year = yearStr; // } if (conference == null) { pos = curString.indexOf('$'); if (pos > 0) { // we found the price // before the price, the ISSN is stated // skip that pos -= 2; while ((pos >= 0) && (curString.charAt(pos) != ' ')) { pos--; } if (pos > 0) { conference = curString.substring(0, pos); } } } } // String lower = curString.toLowerCase(); // if (institution == null) { // // } } } BibtexEntry entry = new BibtexEntry(); entry.setType(type); if (author != null) { entry.setField("author", author); } if (editor != null) { entry.setField("editor", editor); } if (institution != null) { entry.setField("institution", institution); } if (abstractT != null) { entry.setField("abstract", abstractT); } if (keywords != null) { entry.setField("keywords", keywords); } if (title != null) { entry.setField("title", title); } if (conference != null) { entry.setField("booktitle", 
conference); } if (DOI != null) { entry.setField("doi", DOI); } if (series != null) { entry.setField("series", series); } if (volume != null) { entry.setField("volume", volume); } if (number != null) { entry.setField("number", number); } if (pages != null) { entry.setField("pages", pages); } if (year != null) { entry.setField("year", year); } if (publisher != null) { entry.setField("publisher", publisher); } entry.setField("review", textResult); res.add(entry); } catch (NoClassDefFoundError e) { if (e.getMessage().equals("org/bouncycastle/jce/provider/BouncyCastleProvider")) { status.showMessage(Globals.lang( "Java Bouncy Castle library not found. Please download and install it. For more information see http://www.bouncycastle.org/.")); } else { PdfContentImporter.logger.log(Level.SEVERE, e.getLocalizedMessage(), e); } } finally { document.close(); } return res; }