List of usage examples for org.apache.pdfbox.text.PDFTextStripper.setStartPage
public void setStartPage(int startPageValue)
From source file:com.jt.tool.pdf.CreateBookmarks.java
License:Apache License
/** * parse text/*from w ww . j a v a2 s.c om*/ */ public static String getPageText(PDDocument document, int start, int offset) throws Exception { PDFTextStripper stripper = new PDFTextStripper(); stripper.setStartPage(start); stripper.setEndPage(start + offset); return stripper.getText(document); }
From source file:com.plumblarrick.andrew.cityrecordtextextractor.IssueExtractorPositional.java
public void extractToFile(String inFileName, String outFileName) throws IOException { this.inFileName = inFileName; this.outFileName = outFileName; try {//from w ww.j a v a 2s. c o m document = PDDocument.load(new File(inFileName)); PDFTextStripper stripper = new CRTStripper(); //stripper.setSortByPosition(true); stripper.setStartPage(0); stripper.setEndPage(document.getNumberOfPages()); fileOut = (new BufferedWriter(new PrintWriter(outFileName, "UTF-8"))); fileOut.write("Source file: " + inFileName + "\n"); stripper.writeText(document, fileOut); } finally { if (document != null) { document.close(); fileOut.flush(); fileOut.close(); } } }
From source file:com.testautomationguru.utility.PDFUtil.java
License:Apache License
/** * This method returns the content of the document *///from w ww . ja va 2s . c om private String getPDFText(String file, int startPage, int endPage) throws IOException { logger.info("file : " + file); logger.info("startPage : " + startPage); logger.info("endPage : " + endPage); PDDocument doc = PDDocument.load(new File(file)); PDFTextStripper localStripper = new PDFTextStripper(); if (null != this.stripper) { localStripper = this.stripper; } this.updateStartAndEndPages(file, startPage, endPage); localStripper.setStartPage(this.startPage); localStripper.setEndPage(this.endPage); String txt = localStripper.getText(doc); logger.info("PDF Text before trimming : " + txt); if (this.bTrimWhiteSpace) { txt = txt.trim().replaceAll("\\s+", " ").trim(); logger.info("PDF Text after trimming : " + txt); } doc.close(); return txt; }
From source file:com.validation.manager.core.server.core.AttachmentServerTest.java
License:Apache License
/** * Test of addFile method, of class AttachmentServer. */// ww w . j av a2 s. c o m @Test public void testAddRetrieveTextFile() { try { System.out.println("add text File"); File f = new File("target/Test.txt"); f.deleteOnExit(); List<String> lines = Arrays.asList("The first line", "The second line"); Path file = Paths.get(f.getAbsolutePath()); Files.write(file, lines, Charset.forName("UTF-8")); AttachmentServer instance = new AttachmentServer(); instance.addFile(f, f.getName()); instance.write2DB(); //Delete the file FileUtils.delete(f.getAbsolutePath()); assertEquals(1, (int) instance.getAttachmentType().getId());//Text file System.out.println("retrieveFile"); AttachmentServer temp = new AttachmentServer(instance.getAttachmentPK()); File loadedFile = temp.getAttachedFile("target/loaded/"); BufferedReader br = new BufferedReader(new FileReader(loadedFile)); String line; int count = 0; while ((line = br.readLine()) != null) { assertEquals(lines.get(count), line); System.out.println(line); count++; } assertEquals(lines.size(), count); //Create pdf file System.out.println("add pdf File"); File pdf = Tool.convertToPDF(loadedFile, "target/Text.pdf"); pdf.deleteOnExit(); instance = new AttachmentServer(); instance.addFile(pdf, pdf.getName()); instance.write2DB(); //Delete the file FileUtils.delete(pdf.getAbsolutePath()); assertEquals(2, (int) instance.getAttachmentType().getId());//PDF file System.out.println("retrieveFile"); temp = new AttachmentServer(instance.getAttachmentPK()); loadedFile = temp.getAttachedFile("target/loaded/"); PDFTextStripper pdfStripper; PDDocument pdDoc = null; COSDocument cosDoc = null; try { PDFParser parser = new PDFParser(new RandomAccessBufferedFileInputStream(loadedFile)); parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); pdfStripper.setStartPage(1); pdfStripper.setEndPage(1); String parsedText = pdfStripper.getText(pdDoc); System.out.println(parsedText); } catch 
(IOException ex) { Exceptions.printStackTrace(ex); fail(); } finally { if (cosDoc != null) { cosDoc.close(); } if (pdDoc != null) { pdDoc.close(); } } } catch (IOException | VMException ex) { Exceptions.printStackTrace(ex); fail(); } }
From source file:cz.incad.kramerius.k5indexer.KrameriusPDFDocument.java
public String getPage(int page) throws Exception { logger.log(Level.INFO, "Getting page {0}", page); try {//from www. j a v a2s .c o m PDFTextStripper stripper = new PDFTextStripper(/*"UTF-8"*/); if (page != -1) { stripper.setStartPage(page); stripper.setEndPage(page); } return StringEscapeUtils.escapeXml(stripper.getText(pdDoc)); } catch (Exception ex) { return ""; } }
From source file:dk.defxws.fedoragsearch.server.TransformerToText.java
License:Open Source License
public static StringBuffer getTextFromPDF(PDDocument pdDoc, String pageNum) throws Exception { StringBuffer docText = new StringBuffer(); String password = ""; // extract PDF document's textual content try {//from w w w . j a v a 2 s.c o m PDFTextStripper stripper = new PDFTextStripper(/*"UTF-8"*/); int page = Integer.parseInt(pageNum); if (page != -1) { stripper.setStartPage(page); stripper.setEndPage(page); } docText = new StringBuffer(stripper.getText(pdDoc)); } catch (IOException e) { throw new Exception("Cannot parse PDF document", e); } return docText; }
From source file:dk.defxws.fedoragsearch.server.TransformerToText.java
License:Open Source License
/** * //from www. j a v a2 s . c o m * * @throws Exception. */ private StringBuffer getTextFromPDF(byte[] doc, String pageNum) throws Exception { StringBuffer docText = new StringBuffer(); PDDocument pdDoc = null; String password = ""; // extract PDF document's textual content try { PDFTextStripper stripper = new PDFTextStripper(/*"UTF-8"*/); int page = Integer.parseInt(pageNum); if (page != -1) { stripper.setStartPage(page); stripper.setEndPage(page); } //password pdDoc = PDDocument.load(new ByteArrayInputStream(doc), password); // new PDDocument(cosDoc); docText = new StringBuffer(stripper.getText(pdDoc)); } catch (IOException e) { throw new Exception("Cannot parse PDF document", e); } finally { closePDDocument(pdDoc); } return docText; }
From source file:org.quelea.services.importexport.SurvivorSongbookParser.java
License:Open Source License
/**
 * Get the text on a page in the PDF document.
 *
 * Typographic apostrophes (U+2019) and backticks in the extracted text are
 * replaced with a plain ASCII apostrophe.
 *
 * @param document the document.
 * @param stripper the PDF stripper used to get the text.
 * @param page the page number.
 * @return the text on the given page.
 * @throws IOException if something went wrong.
 */
private String getPageText(PDDocument document, PDFTextStripper stripper, int page) throws IOException {
    stripper.setStartPage(page);
    stripper.setEndPage(page);
    StringWriter textWriter = new StringWriter();
    stripper.writeText(document, textWriter);
    // The search string must not be empty: String.replace("", "'") would insert
    // an apostrophe between every character. The escape keeps the intended
    // character intact regardless of the source file's encoding.
    return textWriter.toString().replace("\u2019", "'").replace("`", "'");
}
From source file:pdf.to.info.PDF.java
/** * Creating a PDDocument object/*w ww.j a va 2s .c om*/ * * @param filePath * @return * @throws java.io.IOException */ private PDDocument ReadPDDoc(String filePath) throws IOException { File file = new File(filePath); PDFParser parser = new PDFParser(new RandomAccessFile(file, "r")); // update for PDFBox V 2.0 parser.parse(); COSDocument cosDoc = parser.getDocument(); PDFTextStripper pdfStripper = new PDFTextStripper(); PDDocument pdDoc = new PDDocument(cosDoc); pdDoc.getNumberOfPages(); pdfStripper.setStartPage(1); pdfStripper.setEndPage(1); // for reading all pages of pdf file // pdfStripper.setEndPage(pdDoc.getNumberOfPages()); return pdDoc; }
From source file:pdfconverter.converter3.java
/**
 * Converts a fixed-layout table spread over the pages of a PDF into an Excel sheet.
 *
 * The first ".pdf" file found in {@code path} is opened. For each processed
 * page, 80 rows of 11 fixed-position cells are read by area and appended to
 * the sheet; afterwards rows whose first cell holds fewer than five
 * characters are folded into the row above (column 4 is concatenated) and
 * removed. Finally the workbook is written and closed.
 *
 * @param args unused
 * @throws IOException    if no PDF is found, or the PDF or workbook cannot be read or written
 * @throws WriteException if the workbook cannot be written
 */
@SuppressWarnings("deprecation")
public static void main(String[] args) throws IOException, WriteException {
    workbook = Workbook.createWorkbook(new File(output));
    System.out.println("File created");
    WritableSheet sheet = workbook.createSheet("Page", 0);
    ExcelStart(sheet);

    File dir = new File(path);
    File[] dirList = dir.listFiles(new FilenameFilter() {
        @Override
        public boolean accept(File dir, String name) {
            return name.endsWith(".pdf");
        }
    });
    // listFiles returns null when the path is not a readable directory.
    if (dirList == null || dirList.length == 0) {
        throw new IOException("No .pdf file found in " + dir.getPath());
    }

    // Horizontal layout of the table: {x, width} of each of the 11 columns.
    final int[][] columns = {
        {0, 80}, {80, 30}, {110, 40}, {150, 120}, {270, 120}, {390, 40},
        {430, 46}, {476, 82}, {558, 65}, {623, 66}, {689, 100}
    };
    final int firstRowTop = 116; // y coordinate of the first table row on a page
    final int rowHeight = 9;
    final int rowsPerPage = 80;
    final int firstPage = 800;   // zero-based page index where extraction starts
    final int pageStop = 1491;   // exclusive upper bound of the page index
    final int maxPages = 800;

    PDFTextStripperByArea areaSearch = new PDFTextStripperByArea();
    int counter = 1;
    int sheetRowCount = 0;

    PDDocument pd = PDDocument.load(new File(dirList[0].getPath()));
    try {
        // Never ask for a page the document does not have.
        int lastPage = Math.min(pageStop, pd.getNumberOfPages());
        for (int curpage = firstPage; curpage < lastPage; curpage++) {
            if (counter > maxPages) {
                break;
            }
            PDPage page = pd.getPage(curpage);
            System.out.println("Now parsing page " + curpage);

            int curHeight = firstRowTop;
            for (int curRow = 0; curRow < rowsPerPage; curRow++) {
                for (int col = 0; col < columns.length; col++) {
                    Rectangle2D.Float cell = new Rectangle2D.Float(
                            columns[col][0], curHeight, columns[col][1], rowHeight);
                    String name = "cell-" + (col + 1) + "-" + curRow;
                    areaSearch.addRegion(name, cell);
                    areaSearch.extractRegions(page);
                    String text = areaSearch.getTextForRegion(name);
                    areaSearch.removeRegion(name);
                    // Data rows start at sheet row 1.
                    AddCell(sheet, text, col, sheetRowCount + 1);
                }
                sheetRowCount++;
                curHeight += rowHeight;
            }
            counter++;
        }
    } finally {
        // Release the PDF even when extraction fails part-way through.
        pd.close();
    }

    System.out.println("Data extracted to Excel, parsing through Excel data...");
    boolean multiline = true;
    // Repeat until a full pass finds no continuation row: removing a row
    // shifts the following ones, so a single pass can skip candidates.
    while (multiline) {
        multiline = false;
        for (int row = 0; row < sheet.getRows(); row++) {
            Cell cell = sheet.getCell(0, row);
            // Row 0 has no row above it to merge into, so it is never treated as a continuation.
            if (row > 0 && cell.getContents().length() < 5) {
                multiline = true;
                WritableCell cell2 = sheet.getWritableCell(4, row - 1);
                WritableCell cell3 = sheet.getWritableCell(4, row);
                String content = cell2.getContents() + cell3.getContents();
                content = content.replace("\n", "").replace("\r", "");
                Label l = (Label) cell2;
                l.setString(content);
                sheet.removeRow(row);
            }
        }
    }
    System.out.println("Data extraction complete");
    workbook.write();
    workbook.close();
}