Example usage for org.apache.pdfbox.text PDFTextStripper PDFTextStripper

Introduction

In this page you can find the example usage for org.apache.pdfbox.text PDFTextStripper PDFTextStripper.

Prototype

public PDFTextStripper() throws IOException

Source Link

Document

Instantiate a new PDFTextStripper object.

Usage

From source file:org.knime.ext.textprocessing.nodes.source.parser.pdf.PDFDocumentParser.java

License:Open Source License

private Document parseInternal(final InputStream is) throws Exception {
    m_currentDoc = new DocumentBuilder(m_tokenizerName);
    m_currentDoc.setDocumentFile(new File(m_docPath));
    m_currentDoc.setDocumentType(m_type);
    m_currentDoc.addDocumentCategory(m_category);
    m_currentDoc.addDocumentSource(m_source);

    if (m_charset == null) {
        m_charset = Charset.defaultCharset();
    }/*from ww  w  . j  a  v a  2 s .  c  o  m*/

    PDDocument document = null;
    try {
        document = PDDocument.load(is);

        // extract text from pdf
        PDFTextStripper stripper = new PDFTextStripper();
        stripper.setSortByPosition(true);
        String text = stripper.getText(document);
        m_currentDoc.addSection(text, SectionAnnotation.UNKNOWN);

        // extract meta data from pdf
        String title = null;
        String authors = null;

        if (m_filenameAsTitle) {
            title = m_docPath.toString().trim();
        }

        PDDocumentInformation information = document.getDocumentInformation();
        if (information != null) {
            if (!checkTitle(title)) {
                title = information.getTitle();
            }
            authors = information.getAuthor();
        }

        // if title meta data does not exist use first sentence
        if (!checkTitle(title)) {
            List<Section> sections = m_currentDoc.getSections();
            if (sections.size() > 0) {
                try {
                    title = sections.get(0).getParagraphs().get(0).getSentences().get(0).getText().trim();
                } catch (IndexOutOfBoundsException e) {
                    LOGGER.debug("Parsed PDF document " + m_docPath + " is empty.");
                    title = "";
                }
            }
        }
        // if no useful first sentence exist use filename
        if (!checkTitle(title)) {
            title = m_docPath.toString().trim();
        }
        m_currentDoc.addTitle(title);

        // use author meta data
        if (authors != null) {
            Set<Author> authSet = AuthorUtil.parseAuthors(authors);
            for (Author a : authSet) {
                m_currentDoc.addAuthor(a);
            }
        }

        // add document to list
        return m_currentDoc.createDocument();
    } finally {
        if (document != null) {
            document.close();
        }
    }
}

From source file:org.quelea.services.importexport.SurvivorSongbookParser.java

License:Open Source License

/**
 * Get all the songs in the PDF document.
 * @return a list of all the songs.//from ww w .  ja  v  a  2s .  co  m
 * @throws IOException if something went wrong.
 */
@Override
public List<SongDisplayable> getSongs(File location, StatusPanel statusPanel) throws IOException {
    PDDocument document = PDDocument.load(location);
    List<SongDisplayable> pdfSongs = new ArrayList<>();
    PDFTextStripper stripper = new PDFTextStripper();
    List<String> songParts = new ArrayList<>();
    for (int i = 0; i < document.getNumberOfPages(); i++) {
        String pageText = getPageText(document, stripper, i);
        if (pageText.trim().isEmpty()) {
            continue;
        }
        songParts.add(pageText);
        boolean twoPart = pageText.contains("(1 of");
        if (i < document.getNumberOfPages() - 1) { //This section in case the original (1 of x) is missed out
            String nextPageText = getPageText(document, stripper, i + 1);
            if (nextPageText.contains("(2 of")) {
                twoPart = true;
            }
        }
        if (!twoPart) {
            SongDisplayable song = processSong(songParts.toArray(new String[songParts.size()]));
            if (song != null) {
                pdfSongs.add(song);
            }
            songParts.clear();
        }
    }
    document.close();
    if (pdfSongs == null) {
        return new ArrayList<>();
    } else {
        return pdfSongs;
    }
}

From source file:org.titans.fyp.webcrawler.PageCollector.java

License:Open Source License

private static void pdfToText(String pdfURL) {

    pdfURL = "https://" + pdfURL.split("://")[1];
    //        System.out.println(pdfURL);

    try {/* w  w  w .  j a  v a 2 s  . c o m*/

        PDDocument pddDocument = PDDocument.load((new URL(pdfURL)).openStream());
        PDFTextStripper textStripper = new PDFTextStripper();
        String doc = textStripper.getText(pddDocument);
        pddDocument.close();
        System.out.println(doc);
    } catch (Exception e) {
        e.getMessage();
    }
}

From source file:org.vesalainen.ham.pdf.RfaxTest.java

License:Open Source License

public void test() throws IOException {
    PDDocument document = PDDocument.load(new File("rfax.pdf"));
    if (!document.isEncrypted()) {
        PDFTextStripper stripper = new PDFTextStripper();
        String text = stripper.getText(document);
        try (BufferedWriter bw = Files.newBufferedWriter(Paths.get("src", "main", "resources", "rfax.txt"))) {
            bw.write(text);/*from  w  ww  . j ava  2s .  co m*/
        }
    }
    document.close();
}

From source file:org.wildfly.camel.test.fop.FopIntegrationTest.java

License:Apache License

private String extractTextFromDocument(PDDocument document) throws IOException {
    Writer output = new StringWriter();
    PDFTextStripper stripper = new PDFTextStripper();
    stripper.writeText(document, output);
    return output.toString().trim();
}

From source file:PDF.PDFTest.java

License:Apache License

public static void constrainText(String start, String end, File file) throws IOException {
    PDDocument doc = PDDocument.load(file);
    PDFTextStripper stripper = new PDFTextStripper();
    String text = stripper.getText(doc); // we get the text of the entire
    // document into a String
    String[] split_on_start = text.split(start); // split on the start
    // parameter, take upper
    // bound//www . j a va  2 s.c o  m
    String[] split_on_end = split_on_start[1].split(end); // split on end
    // parameter,
    // take lower
    // bound
    String constrained_string = start;
    constrained_string += split_on_end[0]; // the final string will be the
    // area in between start and end
    doc.close();
    System.out.print(constrained_string);
}

From source file:pdf.to.info.PDF.java

/**
 * Reading text from PDF file//from  w  ww.ja  va 2 s .  c  o  m
 *
 * @param filePath
 * @return
 * @throws java.io.IOException
 */
public String ReadText(String filePath) throws IOException {
    PDFTextStripper pdfStripper = new PDFTextStripper();
    return pdfStripper.getText(ReadPDDoc(filePath));
}

From source file:pdf.to.info.PDF.java

/**
 * Creating a PDDocument object/*from  w w  w .  jav a  2 s  .  c o  m*/
 *
 * @param filePath
 * @return
 * @throws java.io.IOException
 */
private PDDocument ReadPDDoc(String filePath) throws IOException {
    File file = new File(filePath);
    PDFParser parser = new PDFParser(new RandomAccessFile(file, "r")); // update for PDFBox V 2.0
    parser.parse();
    COSDocument cosDoc = parser.getDocument();
    PDFTextStripper pdfStripper = new PDFTextStripper();
    PDDocument pdDoc = new PDDocument(cosDoc);
    pdDoc.getNumberOfPages();
    pdfStripper.setStartPage(1);
    pdfStripper.setEndPage(1);
    // for reading all pages of pdf file
    // pdfStripper.setEndPage(pdDoc.getNumberOfPages());
    return pdDoc;
}

From source file:pdfconverter.converter3.java

@SuppressWarnings("deprecation")

public static void main(String[] args) throws IOException, WriteException {
    workbook = Workbook.createWorkbook(new File(output));
    System.out.println("File created");
    WritableSheet sheet = workbook.createSheet("Page", 0);
    ExcelStart(sheet);// w  w w .  j av  a  2 s.  c o  m

    //Scanner user_input = new Scanner( System.in );
    File dir = new File(path);
    //System.out.println(dir.getPath());
    File[] dirList = dir.listFiles(new FilenameFilter() {
        @Override
        public boolean accept(File dir, String name) {
            return name.endsWith(".pdf");
        }
    });

    int counter = 1;
    PDDocument pd;
    PDFTextStripper stripper = new PDFTextStripper();
    PDFTextStripperByArea areaSearch = new PDFTextStripperByArea();
    PDFTextStripperByArea stripper2 = new PDFTextStripperByArea();
    PDFTextStripperByArea stripper3 = new PDFTextStripperByArea();
    //PDRectangle rect = new PDRectangle(0, 0, 100, 100);
    stripper.setStartPage(1); //Start extracting from page 3
    stripper.setEndPage(1); //Extract till page 5
    File f = new File(dirList[0].getPath());

    pd = PDDocument.load(f);
    //int curHeight = 136;
    //int rowCount = 37;
    int curHeight = 116;
    int rowCount = 39;
    int rowHeight = 9;
    int sheetRowCount = 0;
    int pageStop = 1491;

    for (int curpage = 800; curpage < pageStop; curpage++) {
        if (counter > 800) {
            break;
        }
        PDPage page = pd.getPage(curpage);

        System.out.println("Now parsing page " + curpage);
        for (int curRow = 0; curRow < 80; curRow++) {
            Rectangle2D.Float cell = new Rectangle2D.Float(0, curHeight, 80, rowHeight);
            String name = "cell-1-" + curRow;
            areaSearch.addRegion(name, cell);
            areaSearch.extractRegions(page);
            String text = areaSearch.getTextForRegion(name);
            areaSearch.removeRegion(name);
            AddCell(sheet, text, 0, sheetRowCount + 1);

            cell = new Rectangle2D.Float(80, curHeight, 30, rowHeight);
            name = "cell-2-" + curRow;
            areaSearch.addRegion(name, cell);
            areaSearch.extractRegions(page);
            text = areaSearch.getTextForRegion(name);
            areaSearch.removeRegion(name);
            AddCell(sheet, text, 1, sheetRowCount + 1);

            cell = new Rectangle2D.Float(110, curHeight, 40, rowHeight);
            name = "cell-3-" + curRow;
            areaSearch.addRegion(name, cell);
            areaSearch.extractRegions(page);
            text = areaSearch.getTextForRegion(name);
            areaSearch.removeRegion(name);
            AddCell(sheet, text, 2, sheetRowCount + 1);

            cell = new Rectangle2D.Float(150, curHeight, 120, rowHeight);
            name = "cell-4-" + curRow;
            areaSearch.addRegion(name, cell);
            areaSearch.extractRegions(page);
            text = areaSearch.getTextForRegion(name);
            areaSearch.removeRegion(name);
            AddCell(sheet, text, 3, sheetRowCount + 1);

            cell = new Rectangle2D.Float(270, curHeight, 120, rowHeight);
            name = "cell-5-" + curRow;
            areaSearch.addRegion(name, cell);
            areaSearch.extractRegions(page);
            text = areaSearch.getTextForRegion(name);
            areaSearch.removeRegion(name);
            AddCell(sheet, text, 4, sheetRowCount + 1);

            cell = new Rectangle2D.Float(390, curHeight, 40, rowHeight);
            name = "cell-6-" + curRow;
            areaSearch.addRegion(name, cell);
            areaSearch.extractRegions(page);
            text = areaSearch.getTextForRegion(name);
            areaSearch.removeRegion(name);
            AddCell(sheet, text, 5, sheetRowCount + 1);

            cell = new Rectangle2D.Float(430, curHeight, 46, rowHeight);
            name = "cell-7-" + curRow;
            areaSearch.addRegion(name, cell);
            areaSearch.extractRegions(page);
            text = areaSearch.getTextForRegion(name);
            areaSearch.removeRegion(name);
            AddCell(sheet, text, 6, sheetRowCount + 1);

            cell = new Rectangle2D.Float(476, curHeight, 82, rowHeight);
            name = "cell-8-" + curRow;
            areaSearch.addRegion(name, cell);
            areaSearch.extractRegions(page);
            text = areaSearch.getTextForRegion(name);
            areaSearch.removeRegion(name);
            AddCell(sheet, text, 7, sheetRowCount + 1);

            cell = new Rectangle2D.Float(558, curHeight, 65, rowHeight);
            name = "cell-9-" + curRow;
            areaSearch.addRegion(name, cell);
            areaSearch.extractRegions(page);
            text = areaSearch.getTextForRegion(name);
            areaSearch.removeRegion(name);
            AddCell(sheet, text, 8, sheetRowCount + 1);

            cell = new Rectangle2D.Float(623, curHeight, 66, rowHeight);
            name = "cell-10-" + curRow;
            areaSearch.addRegion(name, cell);
            areaSearch.extractRegions(page);
            text = areaSearch.getTextForRegion(name);
            areaSearch.removeRegion(name);
            AddCell(sheet, text, 9, sheetRowCount + 1);

            cell = new Rectangle2D.Float(689, curHeight, 100, rowHeight);
            name = "cell-11-" + curRow;
            areaSearch.addRegion(name, cell);
            areaSearch.extractRegions(page);
            text = areaSearch.getTextForRegion(name);
            areaSearch.removeRegion(name);
            AddCell(sheet, text, 10, sheetRowCount + 1);

            sheetRowCount++;
            curHeight += rowHeight;
        }

        //Rectangle2D.Float issueDate = new Rectangle2D.Float(0, 0, 80, page.getMediaBox().getHeight());
        //stripper2.addRegion("issueDate", issueDate);
        //Rectangle2D.Float amount = new Rectangle2D.Float(80, 0, 30, page.getMediaBox().getHeight());
        //stripper2.addRegion("amount", amount);
        //Rectangle2D.Float citation = new Rectangle2D.Float(110, 0, 40, page.getMediaBox().getHeight());
        //stripper2.addRegion("citation", citation);
        //Rectangle2D.Float violation = new Rectangle2D.Float(150, 0, 120, page.getMediaBox().getHeight());
        //stripper2.addRegion("violation", violation);
        //Rectangle2D.Float comment = new Rectangle2D.Float(270, 0, 120, page.getMediaBox().getHeight());
        //stripper2.addRegion("comment", comment);
        //Rectangle2D.Float warning = new Rectangle2D.Float(390, 0, 40, page.getMediaBox().getHeight());
        //stripper2.addRegion("warning", warning);
        //Rectangle2D.Float license = new Rectangle2D.Float(430, 0, 46, page.getMediaBox().getHeight());
        //stripper2.addRegion("license", license);
        //Rectangle2D.Float lot = new Rectangle2D.Float(476, 0, 82, page.getMediaBox().getHeight());
        //stripper2.addRegion("lot", lot);
        //Rectangle2D.Float make = new Rectangle2D.Float(558, 0, 65, page.getMediaBox().getHeight());
        //stripper2.addRegion("make", make);
        //Rectangle2D.Float officer = new Rectangle2D.Float(623, 0, 66, page.getMediaBox().getHeight());
        //stripper2.addRegion("officer", officer);
        //Rectangle2D.Float state = new Rectangle2D.Float(689, 0, 100, page.getMediaBox().getHeight());
        //stripper2.addRegion("state", state);

        //stripper2.extractRegions(page);
        //String text = stripper2.getTextForRegion("license");

        //Rectangle2D.Float row = new Rectangle2D.Float(0, 156, 80, 10);
        //stripper3.addRegion("row", row);
        //stripper3.extractRegions(page);
        //String text = stripper3.getTextForRegion("row");
        //System.out.println(text);
        counter++;
        curHeight = 116;
        rowCount = 39;
    }
    //AddRow(sheet, text, counter);
    //counter++;
    pd.close();

    System.out.println("Data extracted to Excel, parsing through Excel data...");

    boolean multiline = true;
    while (multiline) {
        multiline = false;
        for (int row = 0; row < sheet.getRows(); row++) {
            Cell cell = sheet.getCell(0, row);
            if (cell.getContents().length() < 5) {
                multiline = true;
                WritableCell cell2 = sheet.getWritableCell(4, row - 1);
                WritableCell cell3 = sheet.getWritableCell(4, row);
                String content = cell2.getContents() + cell3.getContents();
                content = content.replace("\n", "").replace("\r", "");
                Label l = (Label) cell2;
                l.setString(content);
                sheet.removeRow(row);
            }
        }
    }

    System.out.println("Data extraction complete");
    workbook.write();
    workbook.close();
}

From source file:PDSL.PDFProcessor.java

public void pdfToText(String dirFrom, String dirTo) throws IOException {
    File pdfFolder = new File(dirFrom);
    File[] listOfPDF = pdfFolder.listFiles();
    for (File thePDF : listOfPDF) {
        PDDocument pdDoc = PDDocument.load(thePDF);
        PDFTextStripper pdfStripper = new PDFTextStripper();
        String parsedText = pdfStripper.getText(pdDoc);
        PrintWriter out = new PrintWriter(dirTo + "/" + thePDF.getName().replace(".pdf", ".txt"));
        out.write(parsedText);/* w  ww .j  a  v  a 2 s  .  c  o m*/
        out.close();
    }
}