Example usage for org.apache.pdfbox.text PDFTextStripper setStartPage

Introduction

This page collects example usages of the setStartPage method of org.apache.pdfbox.text.PDFTextStripper.

Prototype

public void setStartPage(int startPageValue)

Source Link

Document

Sets the first page to be extracted by this class. Page numbers are 1-based, so 1 is the first page of the document.

Usage

From source file:com.jt.tool.pdf.CreateBookmarks.java

License:Apache License

/**
 * parse text/*from w  ww . j  a  v  a2  s.c  om*/
 */
public static String getPageText(PDDocument document, int start, int offset) throws Exception {
    PDFTextStripper stripper = new PDFTextStripper();
    stripper.setStartPage(start);
    stripper.setEndPage(start + offset);
    return stripper.getText(document);
}

From source file:com.plumblarrick.andrew.cityrecordtextextractor.IssueExtractorPositional.java

/**
 * Extracts the text of every page of a PDF into a UTF-8 text file.
 *
 * The output starts with a "Source file: ..." header line followed by whatever
 * the {@code CRTStripper} writes for the document.
 *
 * @param inFileName  path of the PDF to read
 * @param outFileName path of the text file to create
 * @throws IOException if the PDF cannot be loaded or the output cannot be written
 */
public void extractToFile(String inFileName, String outFileName) throws IOException {

    this.inFileName = inFileName;
    this.outFileName = outFileName;
    try {
        document = PDDocument.load(new File(inFileName));

        PDFTextStripper stripper = new CRTStripper();
        //stripper.setSortByPosition(true);
        // PDFBox page numbers are 1-based, so the first page is 1.
        stripper.setStartPage(1);
        stripper.setEndPage(document.getNumberOfPages());

        fileOut = (new BufferedWriter(new PrintWriter(outFileName, "UTF-8")));

        fileOut.write("Source file: " + inFileName + "\n");
        stripper.writeText(document, fileOut);

    } finally {
        // Close each resource independently: the writer may never have been
        // created (e.g. the output path is not writable), and a failure while
        // closing one resource must not prevent the other from being released.
        try {
            if (fileOut != null) {
                fileOut.close(); // close() flushes any buffered output first
            }
        } finally {
            if (document != null) {
                document.close();
            }
        }
    }
}

From source file:com.testautomationguru.utility.PDFUtil.java

License:Apache License

/**
 * Returns the text content of a PDF file for a page range.
 *
 * The requested range is passed to {@code updateStartAndEndPages}; the stripper
 * is then configured from {@code this.startPage} / {@code this.endPage}
 * (presumably the values that call stores; verify against its implementation).
 * When {@code bTrimWhiteSpace} is set, every run of whitespace is collapsed to a
 * single space and the result is trimmed.
 *
 * @param file      path of the PDF file to read
 * @param startPage requested first page
 * @param endPage   requested last page
 * @return the extracted (and optionally whitespace-normalised) text
 * @throws IOException if the file cannot be loaded or the text cannot be extracted
 */
private String getPDFText(String file, int startPage, int endPage) throws IOException {

    logger.info("file : " + file);
    logger.info("startPage : " + startPage);
    logger.info("endPage : " + endPage);

    PDDocument doc = PDDocument.load(new File(file));
    try {
        // Prefer the caller-configured stripper; only build a default one when needed.
        PDFTextStripper localStripper = (null != this.stripper) ? this.stripper : new PDFTextStripper();

        this.updateStartAndEndPages(file, startPage, endPage);
        localStripper.setStartPage(this.startPage);
        localStripper.setEndPage(this.endPage);

        String txt = localStripper.getText(doc);
        logger.info("PDF Text before trimming : " + txt);
        if (this.bTrimWhiteSpace) {
            txt = txt.trim().replaceAll("\\s+", " ").trim();
            logger.info("PDF Text after  trimming : " + txt);
        }
        return txt;
    } finally {
        // Release the document even when extraction fails.
        doc.close();
    }
}

From source file:com.validation.manager.core.server.core.AttachmentServerTest.java

License:Apache License

/**
 * Test of addFile method, of class AttachmentServer.
 *
 * Stores a two-line text file as an attachment, reads it back and compares it
 * line by line; then converts the retrieved file to PDF, stores and retrieves
 * that as well, and extracts the text of its first page.
 */
@Test
public void testAddRetrieveTextFile() {
    try {
        System.out.println("add text File");
        File f = new File("target/Test.txt");
        f.deleteOnExit();
        List<String> lines = Arrays.asList("The first line", "The second line");
        Path file = Paths.get(f.getAbsolutePath());
        Files.write(file, lines, Charset.forName("UTF-8"));
        AttachmentServer instance = new AttachmentServer();
        instance.addFile(f, f.getName());
        instance.write2DB();
        //Delete the file
        FileUtils.delete(f.getAbsolutePath());
        assertEquals(1, (int) instance.getAttachmentType().getId());//Text file
        System.out.println("retrieveFile");
        AttachmentServer temp = new AttachmentServer(instance.getAttachmentPK());
        File loadedFile = temp.getAttachedFile("target/loaded/");
        int count = 0;
        // try-with-resources: the reader is released even if an assertion fails,
        // and before the file is handed to the PDF conversion below.
        try (BufferedReader br = new BufferedReader(new FileReader(loadedFile))) {
            String line;
            while ((line = br.readLine()) != null) {
                assertEquals(lines.get(count), line);
                System.out.println(line);
                count++;
            }
        }
        assertEquals(lines.size(), count);
        //Create pdf file
        System.out.println("add pdf File");
        File pdf = Tool.convertToPDF(loadedFile, "target/Text.pdf");
        pdf.deleteOnExit();
        instance = new AttachmentServer();
        instance.addFile(pdf, pdf.getName());
        instance.write2DB();
        //Delete the file
        FileUtils.delete(pdf.getAbsolutePath());
        assertEquals(2, (int) instance.getAttachmentType().getId());//PDF file
        System.out.println("retrieveFile");
        temp = new AttachmentServer(instance.getAttachmentPK());
        loadedFile = temp.getAttachedFile("target/loaded/");
        PDFTextStripper pdfStripper;
        PDDocument pdDoc = null;
        COSDocument cosDoc = null;
        try {
            PDFParser parser = new PDFParser(new RandomAccessBufferedFileInputStream(loadedFile));
            parser.parse();
            cosDoc = parser.getDocument();
            pdfStripper = new PDFTextStripper();
            pdDoc = new PDDocument(cosDoc);
            // Only the first page is extracted.
            pdfStripper.setStartPage(1);
            pdfStripper.setEndPage(1);
            String parsedText = pdfStripper.getText(pdDoc);
            System.out.println(parsedText);
        } catch (IOException ex) {
            Exceptions.printStackTrace(ex);
            fail();
        } finally {
            if (cosDoc != null) {
                cosDoc.close();
            }
            if (pdDoc != null) {
                pdDoc.close();
            }
        }
    } catch (IOException | VMException ex) {
        Exceptions.printStackTrace(ex);
        fail();
    }
}

From source file:cz.incad.kramerius.k5indexer.KrameriusPDFDocument.java

/**
 * Returns the XML-escaped text of one page of {@code pdDoc}.
 *
 * @param page the page number to extract, or {@code -1} to leave the stripper's
 *             page range at its defaults
 * @return the escaped text, or an empty string when extraction fails
 * @throws Exception declared for callers; failures inside the extraction are
 *                   logged and reported as an empty string instead
 */
public String getPage(int page) throws Exception {
    logger.log(Level.INFO, "Getting page {0}", page);
    try {
        PDFTextStripper stripper = new PDFTextStripper(/*"UTF-8"*/);
        if (page != -1) {
            stripper.setStartPage(page);
            stripper.setEndPage(page);
        }

        return StringEscapeUtils.escapeXml(stripper.getText(pdDoc));
    } catch (Exception ex) {
        // Best-effort: callers still get "", but the failure is no longer invisible.
        logger.log(Level.WARNING, "Failed to extract text for page " + page, ex);
        return "";
    }
}

From source file:dk.defxws.fedoragsearch.server.TransformerToText.java

License:Open Source License

/**
 * Extracts text from an already-loaded PDF document.
 *
 * @param pdDoc   the document to read; it is not closed by this method
 * @param pageNum decimal page number to extract, or {@code "-1"} to leave the
 *                stripper's page range at its defaults
 * @return the extracted text
 * @throws Exception wrapping the {@link IOException} when the text cannot be
 *                   extracted; a non-numeric {@code pageNum} propagates as
 *                   {@link NumberFormatException}
 */
public static StringBuffer getTextFromPDF(PDDocument pdDoc, String pageNum) throws Exception {
    // extract PDF document's textual content
    try {
        PDFTextStripper stripper = new PDFTextStripper(/*"UTF-8"*/);
        int page = Integer.parseInt(pageNum);
        if (page != -1) {
            stripper.setStartPage(page);
            stripper.setEndPage(page);
        }
        return new StringBuffer(stripper.getText(pdDoc));
    } catch (IOException e) {
        throw new Exception("Cannot parse PDF document", e);
    }
}

From source file:dk.defxws.fedoragsearch.server.TransformerToText.java

License:Open Source License

/**
 * Extracts text from a PDF held in memory.
 *
 * The bytes are loaded with an empty password and the resulting document is
 * always handed to {@code closePDDocument} before this method returns.
 *
 * @param doc     the raw bytes of the PDF
 * @param pageNum decimal page number to extract, or {@code "-1"} to leave the
 *                stripper's page range at its defaults
 * @return the extracted text
 * @throws Exception wrapping the {@link IOException} when the document cannot
 *                   be loaded or its text cannot be extracted
 */
private StringBuffer getTextFromPDF(byte[] doc, String pageNum) throws Exception {
    final String password = "";
    PDDocument loadedDocument = null;

    try {
        final PDFTextStripper textStripper = new PDFTextStripper(/*"UTF-8"*/);

        final int requestedPage = Integer.parseInt(pageNum);
        final boolean singlePage = requestedPage != -1;
        if (singlePage) {
            textStripper.setStartPage(requestedPage);
            textStripper.setEndPage(requestedPage);
        }

        loadedDocument = PDDocument.load(new ByteArrayInputStream(doc), password);
        final String extracted = textStripper.getText(loadedDocument);
        return new StringBuffer(extracted);
    } catch (IOException e) {
        throw new Exception("Cannot parse PDF document", e);
    } finally {
        closePDDocument(loadedDocument);
    }
}

From source file:org.quelea.services.importexport.SurvivorSongbookParser.java

License:Open Source License

/**
 * Get the text on a page in the PDF document, with apostrophe-like characters
 * normalised to a plain ASCII apostrophe.
 *
 * @param document the document.
 * @param stripper the PDF stripper used to get the text.
 * @param page     the page number.
 * @return the text on the given page.
 * @throws IOException if something went wrong.
 */
private String getPageText(PDDocument document, PDFTextStripper stripper, int page) throws IOException {
    stripper.setStartPage(page);
    stripper.setEndPage(page);
    StringWriter textWriter = new StringWriter();
    stripper.writeText(document, textWriter);
    // The search string is spelled as a Unicode escape (U+2019, right single
    // quotation mark) so it survives source-encoding changes. An empty search
    // string would make replace() insert an apostrophe between every character.
    // NOTE(review): U+2019 is assumed to be the character originally intended here - confirm.
    return textWriter.toString().replace("\u2019", "'").replace("`", "'");
}

From source file:pdf.to.info.PDF.java

/**
 * Creating a PDDocument object/*w ww.j  a va  2s  .c om*/
 *
 * @param filePath
 * @return
 * @throws java.io.IOException
 */
private PDDocument ReadPDDoc(String filePath) throws IOException {
    File file = new File(filePath);
    PDFParser parser = new PDFParser(new RandomAccessFile(file, "r")); // update for PDFBox V 2.0
    parser.parse();
    COSDocument cosDoc = parser.getDocument();
    PDFTextStripper pdfStripper = new PDFTextStripper();
    PDDocument pdDoc = new PDDocument(cosDoc);
    pdDoc.getNumberOfPages();
    pdfStripper.setStartPage(1);
    pdfStripper.setEndPage(1);
    // for reading all pages of pdf file
    // pdfStripper.setEndPage(pdDoc.getNumberOfPages());
    return pdDoc;
}

From source file:pdfconverter.converter3.java

@SuppressWarnings("deprecation")

/**
 * Converts a tabular PDF into an Excel sheet.
 *
 * Takes the first ".pdf" file found in {@code path}, reads a fixed grid of
 * eleven columns by eighty rows from each processed page into the sheet
 * (starting at sheet row 1), then merges continuation rows into the row above
 * and writes the workbook to {@code output}.
 *
 * @param args unused
 * @throws IOException    if no PDF is found or the PDF cannot be read
 * @throws WriteException if the workbook cannot be written
 */
public static void main(String[] args) throws IOException, WriteException {
    workbook = Workbook.createWorkbook(new File(output));
    System.out.println("File created");
    WritableSheet sheet = workbook.createSheet("Page", 0);
    ExcelStart(sheet);

    File dir = new File(path);
    File[] dirList = dir.listFiles(new FilenameFilter() {
        @Override
        public boolean accept(File dir, String name) {
            return name.endsWith(".pdf");
        }
    });
    // listFiles() returns null when the path is not a readable directory.
    if (dirList == null || dirList.length == 0) {
        throw new IOException("No PDF files found in " + path);
    }

    final int firstPage = 800;    // zero-based index of the first page to process
    final int pageStop = 1491;    // exclusive upper bound on the page index
    final int maxPages = 800;     // hard cap on the number of pages processed
    final int firstRowTop = 116;  // y coordinate of the first table row on a page
    final int rowHeight = 9;      // height of one table row
    final int rowsPerPage = 80;   // rows sampled on every page

    PDFTextStripperByArea areaSearch = new PDFTextStripperByArea();
    int sheetRowCount = 0;

    File f = new File(dirList[0].getPath());
    PDDocument pd = PDDocument.load(f);
    try {
        // Never index past the end of the document.
        int lastPage = Math.min(pageStop, pd.getNumberOfPages());
        int counter = 1;
        for (int curpage = firstPage; curpage < lastPage; curpage++) {
            if (counter > maxPages) {
                break;
            }
            PDPage page = pd.getPage(curpage);

            System.out.println("Now parsing page " + curpage);
            int curHeight = firstRowTop;
            for (int curRow = 0; curRow < rowsPerPage; curRow++) {
                extractRowCells(areaSearch, page, sheet, curRow, curHeight, rowHeight, sheetRowCount + 1);
                sheetRowCount++;
                curHeight += rowHeight;
            }
            counter++;
        }
    } finally {
        // Release the document even when extraction fails part-way.
        pd.close();
    }

    System.out.println("Data extracted to Excel, parsing through Excel data...");

    mergeContinuationRows(sheet);

    System.out.println("Data extraction complete");
    workbook.write();
    workbook.close();
}

/** Left edge (x coordinate) of each of the eleven table columns. */
private static final int[] TABLE_COLUMN_LEFT = {0, 80, 110, 150, 270, 390, 430, 476, 558, 623, 689};

/** Width of each of the eleven table columns, parallel to {@code TABLE_COLUMN_LEFT}. */
private static final int[] TABLE_COLUMN_WIDTH = {80, 30, 40, 120, 120, 40, 46, 82, 65, 66, 100};

/** Reads every column of one table row from the page and stores it in the given sheet row. */
private static void extractRowCells(PDFTextStripperByArea areaSearch, PDPage page, WritableSheet sheet,
        int pageRow, int top, int height, int sheetRow) throws IOException, WriteException {
    for (int col = 0; col < TABLE_COLUMN_LEFT.length; col++) {
        Rectangle2D.Float cell = new Rectangle2D.Float(TABLE_COLUMN_LEFT[col], top, TABLE_COLUMN_WIDTH[col], height);
        String name = "cell-" + (col + 1) + "-" + pageRow;
        // The region is registered, extracted and removed again so that only
        // one region is active per extraction.
        areaSearch.addRegion(name, cell);
        areaSearch.extractRegions(page);
        String text = areaSearch.getTextForRegion(name);
        areaSearch.removeRegion(name);
        AddCell(sheet, text, col, sheetRow);
    }
}

/** Appends column 4 of every row whose first cell is shorter than five characters to the row above, then removes it. */
private static void mergeContinuationRows(WritableSheet sheet) {
    // Start at row 1: row 0 has no row above it to merge into.
    int row = 1;
    while (row < sheet.getRows()) {
        Cell first = sheet.getCell(0, row);
        if (first.getContents().length() < 5) {
            WritableCell target = sheet.getWritableCell(4, row - 1);
            WritableCell continuation = sheet.getWritableCell(4, row);
            String content = target.getContents() + continuation.getContents();
            content = content.replace("\n", "").replace("\r", "");
            Label l = (Label) target;
            l.setString(content);
            // After removal the next row slides into this index, so re-check
            // the same index instead of advancing.
            sheet.removeRow(row);
        } else {
            row++;
        }
    }
}