net.dstserbak.dataindexer.tokenizer.PDFTokenizer.java Source code

Java tutorial

Introduction

Here is the source code for net.dstserbak.dataindexer.tokenizer.PDFTokenizer.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package net.dstserbak.dataindexer.tokenizer;

import java.io.*;

import org.apache.pdfbox.pdmodel.*;
import org.apache.pdfbox.util.*;
import org.apache.pdfbox.io.RandomAccessFile;

import java.util.StringTokenizer;

import java.net.URL;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * Utility class that provides methods to tokenize PDF documents.
 * 
 * @author Deniss Stserbak
 */
public final class PDFTokenizer {

    /**
     * Logger for this class.
     */
    private static final Logger log = Logger.getLogger(PDFTokenizer.class.getName());

    /**
     * Name prefix of the scratch file handed to PDFBox while a document is
     * being loaded. A fresh file is created for every call, so the prefix is
     * only a hint that makes leftovers recognizable in the temp directory.
     */
    private static final String SCRATCH_FILE_PREFIX = "pdfScratchFile";

    /**
     * Name suffix of the per-call scratch file.
     */
    private static final String SCRATCH_FILE_SUFFIX = ".tmp";

    /**
     * Not instantiable: this class only exposes static helpers.
     */
    private PDFTokenizer() {
    }

    /**
     * Loads the PDF document behind the given URL, splits its text into words
     * and returns them as a TokensMap object.
     * <p>
     * Every call works on its own scratch file created in the default
     * temporary-file directory, so concurrent calls do not overwrite each
     * other's scratch data. The scratch file is removed again before the
     * method returns, whether it succeeds or fails.
     *
     * @param url Input PDF URL, must not be {@code null}
     * @return Map that contains the tokens of the PDF document, or
     *         {@code null} if the document is encrypted or has no pages
     * @throws IOException If an I/O error occurs
     * @throws NullPointerException If {@code url} is {@code null}
     */
    public static TokensMap tokenizePdf(URL url) throws IOException {
        if (url == null) {
            // Fail before any file is created instead of deep inside PDFBox.
            throw new NullPointerException("url must not be null");
        }
        File scratch = File.createTempFile(SCRATCH_FILE_PREFIX, SCRATCH_FILE_SUFFIX);
        // Resources are closed in reverse order (document first, then the
        // scratch file) before the finally block runs, so the file is no
        // longer held open by this method when it gets deleted.
        try (RandomAccessFile scratchFile = new RandomAccessFile(scratch, "rw");
                PDDocument pd = PDDocument.load(url, scratchFile)) {
            return tokenizeInput(pd);
        } finally {
            deleteScratchFile(scratch);
        }
    }

    /**
     * Removes the given scratch file. A failed deletion is logged and the
     * file is scheduled for deletion on JVM exit; it never masks the result
     * or the exception of the tokenization itself.
     *
     * @param scratch Scratch file created for a single tokenizePdf call
     */
    private static void deleteScratchFile(File scratch) {
        if (scratch.exists() && !scratch.delete()) {
            log.log(Level.WARNING, "Could not delete PDF scratch file {0}", scratch);
            scratch.deleteOnExit();
        }
    }

    /**
     * Reads all pages of the PDF document and splits the extracted text into
     * words, which are returned as a TokensMap object.
     *
     * @param pd PDDocument object that has already been loaded
     * @return Map that contains the tokens of the PDF document, or
     *         {@code null} if the document is encrypted or has no pages
     * @throws IOException If an I/O error occurs
     */
    private static TokensMap tokenizeInput(PDDocument pd) throws IOException {
        int numberOfPages = pd.getNumberOfPages();
        if (pd.isEncrypted()) {
            log.log(Level.SEVERE, "PDF is ecrypted");
            return null;
        }
        if (numberOfPages < 1) {
            log.log(Level.SEVERE, "PDF number of pages is less than 1");
            return null;
        }

        // Extract the text of the whole document: pages 1..numberOfPages.
        PDFTextStripper stripper = new PDFTextStripper();
        stripper.setStartPage(1);
        stripper.setEndPage(numberOfPages);
        String text = stripper.getText(pd);

        // StringTokenizer with its default delimiters splits on whitespace.
        TokensMap tokensMap = new TokensMap();
        TokenizerUtils.addTokensToMap(tokensMap, new StringTokenizer(text));
        return tokensMap;
    }
}