List of usage examples for the org.apache.pdfbox.text.PDFTextStripper constructor PDFTextStripper()
public PDFTextStripper() throws IOException
From source file:org.knime.ext.textprocessing.nodes.source.parser.pdf.PDFDocumentParser.java
License:Open Source License
private Document parseInternal(final InputStream is) throws Exception { m_currentDoc = new DocumentBuilder(m_tokenizerName); m_currentDoc.setDocumentFile(new File(m_docPath)); m_currentDoc.setDocumentType(m_type); m_currentDoc.addDocumentCategory(m_category); m_currentDoc.addDocumentSource(m_source); if (m_charset == null) { m_charset = Charset.defaultCharset(); }/*from ww w . j a v a 2 s . c o m*/ PDDocument document = null; try { document = PDDocument.load(is); // extract text from pdf PDFTextStripper stripper = new PDFTextStripper(); stripper.setSortByPosition(true); String text = stripper.getText(document); m_currentDoc.addSection(text, SectionAnnotation.UNKNOWN); // extract meta data from pdf String title = null; String authors = null; if (m_filenameAsTitle) { title = m_docPath.toString().trim(); } PDDocumentInformation information = document.getDocumentInformation(); if (information != null) { if (!checkTitle(title)) { title = information.getTitle(); } authors = information.getAuthor(); } // if title meta data does not exist use first sentence if (!checkTitle(title)) { List<Section> sections = m_currentDoc.getSections(); if (sections.size() > 0) { try { title = sections.get(0).getParagraphs().get(0).getSentences().get(0).getText().trim(); } catch (IndexOutOfBoundsException e) { LOGGER.debug("Parsed PDF document " + m_docPath + " is empty."); title = ""; } } } // if no useful first sentence exist use filename if (!checkTitle(title)) { title = m_docPath.toString().trim(); } m_currentDoc.addTitle(title); // use author meta data if (authors != null) { Set<Author> authSet = AuthorUtil.parseAuthors(authors); for (Author a : authSet) { m_currentDoc.addAuthor(a); } } // add document to list return m_currentDoc.createDocument(); } finally { if (document != null) { document.close(); } } }
From source file:org.quelea.services.importexport.SurvivorSongbookParser.java
License:Open Source License
/** * Get all the songs in the PDF document. * @return a list of all the songs.//from ww w . ja v a 2s . co m * @throws IOException if something went wrong. */ @Override public List<SongDisplayable> getSongs(File location, StatusPanel statusPanel) throws IOException { PDDocument document = PDDocument.load(location); List<SongDisplayable> pdfSongs = new ArrayList<>(); PDFTextStripper stripper = new PDFTextStripper(); List<String> songParts = new ArrayList<>(); for (int i = 0; i < document.getNumberOfPages(); i++) { String pageText = getPageText(document, stripper, i); if (pageText.trim().isEmpty()) { continue; } songParts.add(pageText); boolean twoPart = pageText.contains("(1 of"); if (i < document.getNumberOfPages() - 1) { //This section in case the original (1 of x) is missed out String nextPageText = getPageText(document, stripper, i + 1); if (nextPageText.contains("(2 of")) { twoPart = true; } } if (!twoPart) { SongDisplayable song = processSong(songParts.toArray(new String[songParts.size()])); if (song != null) { pdfSongs.add(song); } songParts.clear(); } } document.close(); if (pdfSongs == null) { return new ArrayList<>(); } else { return pdfSongs; } }
From source file:org.titans.fyp.webcrawler.PageCollector.java
License:Open Source License
private static void pdfToText(String pdfURL) { pdfURL = "https://" + pdfURL.split("://")[1]; // System.out.println(pdfURL); try {/* w w w . j a v a 2 s . c o m*/ PDDocument pddDocument = PDDocument.load((new URL(pdfURL)).openStream()); PDFTextStripper textStripper = new PDFTextStripper(); String doc = textStripper.getText(pddDocument); pddDocument.close(); System.out.println(doc); } catch (Exception e) { e.getMessage(); } }
From source file:org.vesalainen.ham.pdf.RfaxTest.java
License:Open Source License
/**
 * Extracts the text of {@code rfax.pdf} and writes it to
 * {@code src/main/resources/rfax.txt}. Nothing is written when the document
 * is encrypted.
 *
 * @throws IOException if the PDF cannot be read or the text file cannot be written
 */
public void test() throws IOException {
    // try-with-resources: the document is closed even if extraction or writing fails.
    try (PDDocument document = PDDocument.load(new File("rfax.pdf"))) {
        if (!document.isEncrypted()) {
            PDFTextStripper stripper = new PDFTextStripper();
            String text = stripper.getText(document);
            try (BufferedWriter bw = Files.newBufferedWriter(Paths.get("src", "main", "resources", "rfax.txt"))) {
                bw.write(text);
            }
        }
    }
}
From source file:org.wildfly.camel.test.fop.FopIntegrationTest.java
License:Apache License
/**
 * Renders the text content of a PDF document into a string.
 *
 * @param document the document to read; it is not closed by this method
 * @return the extracted text with leading and trailing whitespace removed
 * @throws IOException if the stripper cannot be created or extraction fails
 */
private String extractTextFromDocument(PDDocument document) throws IOException {
    final StringWriter buffer = new StringWriter();
    final PDFTextStripper textStripper = new PDFTextStripper();
    textStripper.writeText(document, buffer);
    final String extracted = buffer.toString();
    return extracted.trim();
}
From source file:PDF.PDFTest.java
License:Apache License
public static void constrainText(String start, String end, File file) throws IOException { PDDocument doc = PDDocument.load(file); PDFTextStripper stripper = new PDFTextStripper(); String text = stripper.getText(doc); // we get the text of the entire // document into a String String[] split_on_start = text.split(start); // split on the start // parameter, take upper // bound//www . j a va 2 s.c o m String[] split_on_end = split_on_start[1].split(end); // split on end // parameter, // take lower // bound String constrained_string = start; constrained_string += split_on_end[0]; // the final string will be the // area in between start and end doc.close(); System.out.print(constrained_string); }
From source file:pdf.to.info.PDF.java
/**
 * Reads the text of a PDF file.
 *
 * <p>The document is opened through {@code ReadPDDoc}, stripped with a
 * default-configured {@code PDFTextStripper}, and closed again before this
 * method returns.
 *
 * @param filePath path of the PDF file to read
 * @return the text extracted from the document
 * @throws java.io.IOException if the file cannot be opened, parsed or read
 */
public String ReadText(String filePath) throws IOException {
    PDFTextStripper pdfStripper = new PDFTextStripper();
    // The document was previously never closed; release it on every path.
    try (PDDocument pdDoc = ReadPDDoc(filePath)) {
        return pdfStripper.getText(pdDoc);
    }
}
From source file:pdf.to.info.PDF.java
/** * Creating a PDDocument object/*from w w w . jav a 2 s . c o m*/ * * @param filePath * @return * @throws java.io.IOException */ private PDDocument ReadPDDoc(String filePath) throws IOException { File file = new File(filePath); PDFParser parser = new PDFParser(new RandomAccessFile(file, "r")); // update for PDFBox V 2.0 parser.parse(); COSDocument cosDoc = parser.getDocument(); PDFTextStripper pdfStripper = new PDFTextStripper(); PDDocument pdDoc = new PDDocument(cosDoc); pdDoc.getNumberOfPages(); pdfStripper.setStartPage(1); pdfStripper.setEndPage(1); // for reading all pages of pdf file // pdfStripper.setEndPage(pdDoc.getNumberOfPages()); return pdDoc; }
From source file:pdfconverter.converter3.java
@SuppressWarnings("deprecation") public static void main(String[] args) throws IOException, WriteException { workbook = Workbook.createWorkbook(new File(output)); System.out.println("File created"); WritableSheet sheet = workbook.createSheet("Page", 0); ExcelStart(sheet);// w w w . j av a 2 s. c o m //Scanner user_input = new Scanner( System.in ); File dir = new File(path); //System.out.println(dir.getPath()); File[] dirList = dir.listFiles(new FilenameFilter() { @Override public boolean accept(File dir, String name) { return name.endsWith(".pdf"); } }); int counter = 1; PDDocument pd; PDFTextStripper stripper = new PDFTextStripper(); PDFTextStripperByArea areaSearch = new PDFTextStripperByArea(); PDFTextStripperByArea stripper2 = new PDFTextStripperByArea(); PDFTextStripperByArea stripper3 = new PDFTextStripperByArea(); //PDRectangle rect = new PDRectangle(0, 0, 100, 100); stripper.setStartPage(1); //Start extracting from page 3 stripper.setEndPage(1); //Extract till page 5 File f = new File(dirList[0].getPath()); pd = PDDocument.load(f); //int curHeight = 136; //int rowCount = 37; int curHeight = 116; int rowCount = 39; int rowHeight = 9; int sheetRowCount = 0; int pageStop = 1491; for (int curpage = 800; curpage < pageStop; curpage++) { if (counter > 800) { break; } PDPage page = pd.getPage(curpage); System.out.println("Now parsing page " + curpage); for (int curRow = 0; curRow < 80; curRow++) { Rectangle2D.Float cell = new Rectangle2D.Float(0, curHeight, 80, rowHeight); String name = "cell-1-" + curRow; areaSearch.addRegion(name, cell); areaSearch.extractRegions(page); String text = areaSearch.getTextForRegion(name); areaSearch.removeRegion(name); AddCell(sheet, text, 0, sheetRowCount + 1); cell = new Rectangle2D.Float(80, curHeight, 30, rowHeight); name = "cell-2-" + curRow; areaSearch.addRegion(name, cell); areaSearch.extractRegions(page); text = areaSearch.getTextForRegion(name); areaSearch.removeRegion(name); AddCell(sheet, text, 1, sheetRowCount + 1); cell 
= new Rectangle2D.Float(110, curHeight, 40, rowHeight); name = "cell-3-" + curRow; areaSearch.addRegion(name, cell); areaSearch.extractRegions(page); text = areaSearch.getTextForRegion(name); areaSearch.removeRegion(name); AddCell(sheet, text, 2, sheetRowCount + 1); cell = new Rectangle2D.Float(150, curHeight, 120, rowHeight); name = "cell-4-" + curRow; areaSearch.addRegion(name, cell); areaSearch.extractRegions(page); text = areaSearch.getTextForRegion(name); areaSearch.removeRegion(name); AddCell(sheet, text, 3, sheetRowCount + 1); cell = new Rectangle2D.Float(270, curHeight, 120, rowHeight); name = "cell-5-" + curRow; areaSearch.addRegion(name, cell); areaSearch.extractRegions(page); text = areaSearch.getTextForRegion(name); areaSearch.removeRegion(name); AddCell(sheet, text, 4, sheetRowCount + 1); cell = new Rectangle2D.Float(390, curHeight, 40, rowHeight); name = "cell-6-" + curRow; areaSearch.addRegion(name, cell); areaSearch.extractRegions(page); text = areaSearch.getTextForRegion(name); areaSearch.removeRegion(name); AddCell(sheet, text, 5, sheetRowCount + 1); cell = new Rectangle2D.Float(430, curHeight, 46, rowHeight); name = "cell-7-" + curRow; areaSearch.addRegion(name, cell); areaSearch.extractRegions(page); text = areaSearch.getTextForRegion(name); areaSearch.removeRegion(name); AddCell(sheet, text, 6, sheetRowCount + 1); cell = new Rectangle2D.Float(476, curHeight, 82, rowHeight); name = "cell-8-" + curRow; areaSearch.addRegion(name, cell); areaSearch.extractRegions(page); text = areaSearch.getTextForRegion(name); areaSearch.removeRegion(name); AddCell(sheet, text, 7, sheetRowCount + 1); cell = new Rectangle2D.Float(558, curHeight, 65, rowHeight); name = "cell-9-" + curRow; areaSearch.addRegion(name, cell); areaSearch.extractRegions(page); text = areaSearch.getTextForRegion(name); areaSearch.removeRegion(name); AddCell(sheet, text, 8, sheetRowCount + 1); cell = new Rectangle2D.Float(623, curHeight, 66, rowHeight); name = "cell-10-" + curRow; 
areaSearch.addRegion(name, cell); areaSearch.extractRegions(page); text = areaSearch.getTextForRegion(name); areaSearch.removeRegion(name); AddCell(sheet, text, 9, sheetRowCount + 1); cell = new Rectangle2D.Float(689, curHeight, 100, rowHeight); name = "cell-11-" + curRow; areaSearch.addRegion(name, cell); areaSearch.extractRegions(page); text = areaSearch.getTextForRegion(name); areaSearch.removeRegion(name); AddCell(sheet, text, 10, sheetRowCount + 1); sheetRowCount++; curHeight += rowHeight; } //Rectangle2D.Float issueDate = new Rectangle2D.Float(0, 0, 80, page.getMediaBox().getHeight()); //stripper2.addRegion("issueDate", issueDate); //Rectangle2D.Float amount = new Rectangle2D.Float(80, 0, 30, page.getMediaBox().getHeight()); //stripper2.addRegion("amount", amount); //Rectangle2D.Float citation = new Rectangle2D.Float(110, 0, 40, page.getMediaBox().getHeight()); //stripper2.addRegion("citation", citation); //Rectangle2D.Float violation = new Rectangle2D.Float(150, 0, 120, page.getMediaBox().getHeight()); //stripper2.addRegion("violation", violation); //Rectangle2D.Float comment = new Rectangle2D.Float(270, 0, 120, page.getMediaBox().getHeight()); //stripper2.addRegion("comment", comment); //Rectangle2D.Float warning = new Rectangle2D.Float(390, 0, 40, page.getMediaBox().getHeight()); //stripper2.addRegion("warning", warning); //Rectangle2D.Float license = new Rectangle2D.Float(430, 0, 46, page.getMediaBox().getHeight()); //stripper2.addRegion("license", license); //Rectangle2D.Float lot = new Rectangle2D.Float(476, 0, 82, page.getMediaBox().getHeight()); //stripper2.addRegion("lot", lot); //Rectangle2D.Float make = new Rectangle2D.Float(558, 0, 65, page.getMediaBox().getHeight()); //stripper2.addRegion("make", make); //Rectangle2D.Float officer = new Rectangle2D.Float(623, 0, 66, page.getMediaBox().getHeight()); //stripper2.addRegion("officer", officer); //Rectangle2D.Float state = new Rectangle2D.Float(689, 0, 100, page.getMediaBox().getHeight()); 
//stripper2.addRegion("state", state); //stripper2.extractRegions(page); //String text = stripper2.getTextForRegion("license"); //Rectangle2D.Float row = new Rectangle2D.Float(0, 156, 80, 10); //stripper3.addRegion("row", row); //stripper3.extractRegions(page); //String text = stripper3.getTextForRegion("row"); //System.out.println(text); counter++; curHeight = 116; rowCount = 39; } //AddRow(sheet, text, counter); //counter++; pd.close(); System.out.println("Data extracted to Excel, parsing through Excel data..."); boolean multiline = true; while (multiline) { multiline = false; for (int row = 0; row < sheet.getRows(); row++) { Cell cell = sheet.getCell(0, row); if (cell.getContents().length() < 5) { multiline = true; WritableCell cell2 = sheet.getWritableCell(4, row - 1); WritableCell cell3 = sheet.getWritableCell(4, row); String content = cell2.getContents() + cell3.getContents(); content = content.replace("\n", "").replace("\r", ""); Label l = (Label) cell2; l.setString(content); sheet.removeRow(row); } } } System.out.println("Data extraction complete"); workbook.write(); workbook.close(); }
From source file:PDSL.PDFProcessor.java
/**
 * Extracts the text of every file in a directory into a text file.
 *
 * <p>Each file in {@code dirFrom} is loaded as a PDF. Its text is written to
 * {@code dirTo}, using the source file name with ".pdf" replaced by ".txt".
 *
 * @param dirFrom directory containing the PDF files
 * @param dirTo directory that receives the text files
 * @throws IOException if {@code dirFrom} cannot be listed, a file cannot be
 *         parsed as a PDF, or a text file cannot be written
 */
public void pdfToText(String dirFrom, String dirTo) throws IOException {
    File pdfFolder = new File(dirFrom);
    File[] listOfPDF = pdfFolder.listFiles();
    // listFiles() returns null when the path is not a readable directory.
    if (listOfPDF == null) {
        throw new IOException("Cannot list directory: " + dirFrom);
    }
    for (File thePDF : listOfPDF) {
        // Close each document before moving on; it was previously never closed.
        String parsedText;
        try (PDDocument pdDoc = PDDocument.load(thePDF)) {
            PDFTextStripper pdfStripper = new PDFTextStripper();
            parsedText = pdfStripper.getText(pdDoc);
        }
        try (PrintWriter out = new PrintWriter(dirTo + "/" + thePDF.getName().replace(".pdf", ".txt"))) {
            out.write(parsedText);
        }
    }
}